1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
22
23#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
27#include "llvm/ADT/SetOperations.h"
28#include "llvm/ADT/SmallPtrSet.h"
29#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/SmallVector.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/Analysis/MemoryProfileInfo.h"
33#include "llvm/Analysis/ModuleSummaryAnalysis.h"
34#include "llvm/Analysis/OptimizationRemarkEmitter.h"
35#include "llvm/Bitcode/BitcodeReader.h"
36#include "llvm/IR/Instructions.h"
37#include "llvm/IR/Module.h"
38#include "llvm/IR/ModuleSummaryIndex.h"
39#include "llvm/Pass.h"
40#include "llvm/Support/CommandLine.h"
41#include "llvm/Support/GraphWriter.h"
42#include "llvm/Support/InterleavedRange.h"
43#include "llvm/Support/raw_ostream.h"
44#include "llvm/Transforms/IPO.h"
45#include "llvm/Transforms/Utils/CallPromotionUtils.h"
46#include "llvm/Transforms/Utils/Cloning.h"
47#include "llvm/Transforms/Utils/Instrumentation.h"
48#include <deque>
49#include <sstream>
50#include <unordered_map>
51#include <vector>
52using namespace llvm;
53using namespace llvm::memprof;
54
55#define DEBUG_TYPE "memprof-context-disambiguation"
56
57STATISTIC(FunctionClonesAnalysis,
58 "Number of function clones created during whole program analysis");
59STATISTIC(FunctionClonesThinBackend,
60 "Number of function clones created during ThinLTO backend");
61STATISTIC(FunctionsClonedThinBackend,
62 "Number of functions that had clones created during ThinLTO backend");
63STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
64 "cloned) during whole program analysis");
65STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
66 "during whole program analysis");
67STATISTIC(AllocTypeNotColdThinBackend,
68 "Number of not cold static allocations (possibly cloned) during "
69 "ThinLTO backend");
70STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
71 "(possibly cloned) during ThinLTO backend");
72STATISTIC(OrigAllocsThinBackend,
73 "Number of original (not cloned) allocations with memprof profiles "
74 "during ThinLTO backend");
75STATISTIC(
76 AllocVersionsThinBackend,
77 "Number of allocation versions (including clones) during ThinLTO backend");
78STATISTIC(MaxAllocVersionsThinBackend,
79 "Maximum number of allocation versions created for an original "
80 "allocation during ThinLTO backend");
81STATISTIC(UnclonableAllocsThinBackend,
82 "Number of unclonable ambigous allocations during ThinLTO backend");
83STATISTIC(RemovedEdgesWithMismatchedCallees,
84 "Number of edges removed due to mismatched callees (profiled vs IR)");
85STATISTIC(FoundProfiledCalleeCount,
86 "Number of profiled callees found via tail calls");
87STATISTIC(FoundProfiledCalleeDepth,
88 "Aggregate depth of profiled callees found via tail calls");
89STATISTIC(FoundProfiledCalleeMaxDepth,
90 "Maximum depth of profiled callees found via tail calls");
91STATISTIC(FoundProfiledCalleeNonUniquelyCount,
92 "Number of profiled callees found via multiple tail call chains");
93STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
94STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
95STATISTIC(NonNewMergedNodes, "Number of non new nodes used during merging");
96STATISTIC(MissingAllocForContextId,
97 "Number of missing alloc nodes for context ids");
98STATISTIC(SkippedCallsCloning,
99 "Number of calls skipped during cloning due to unexpected operand");
100
101static cl::opt<std::string> DotFilePathPrefix(
102 "memprof-dot-file-path-prefix", cl::init(Val: ""), cl::Hidden,
103 cl::value_desc("filename"),
104 cl::desc("Specify the path prefix of the MemProf dot files."));
105
106static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(Val: false),
107 cl::Hidden,
108 cl::desc("Export graph to dot files."));
109
110// How much of the graph to export to dot.
111enum DotScope {
112 All, // The full CCG graph.
113 Alloc, // Only contexts for the specified allocation.
114 Context, // Only the specified context.
115};
116
117static cl::opt<DotScope> DotGraphScope(
118 "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
119 cl::Hidden, cl::init(Val: DotScope::All),
120 cl::values(
121 clEnumValN(DotScope::All, "all", "Export full callsite graph"),
122 clEnumValN(DotScope::Alloc, "alloc",
123 "Export only nodes with contexts feeding given "
124 "-memprof-dot-alloc-id"),
125 clEnumValN(DotScope::Context, "context",
126 "Export only nodes with given -memprof-dot-context-id")));
127
128static cl::opt<unsigned>
129 AllocIdForDot("memprof-dot-alloc-id", cl::init(Val: 0), cl::Hidden,
130 cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
131 "or to highlight if -memprof-dot-scope=all"));
132
133static cl::opt<unsigned> ContextIdForDot(
134 "memprof-dot-context-id", cl::init(Val: 0), cl::Hidden,
135 cl::desc("Id of context to export if -memprof-dot-scope=context or to "
136 "highlight otherwise"));
137
138static cl::opt<bool>
139 DumpCCG("memprof-dump-ccg", cl::init(Val: false), cl::Hidden,
140 cl::desc("Dump CallingContextGraph to stdout after each stage."));
141
142static cl::opt<bool>
143 VerifyCCG("memprof-verify-ccg", cl::init(Val: false), cl::Hidden,
144 cl::desc("Perform verification checks on CallingContextGraph."));
145
146static cl::opt<bool>
147 VerifyNodes("memprof-verify-nodes", cl::init(Val: false), cl::Hidden,
148 cl::desc("Perform frequent verification checks on nodes."));
149
150static cl::opt<std::string> MemProfImportSummary(
151 "memprof-import-summary",
152 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
153 cl::Hidden);
154
155static cl::opt<unsigned>
156 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(Val: 5),
157 cl::Hidden,
158 cl::desc("Max depth to recursively search for missing "
159 "frames through tail calls."));
160
161// Optionally enable cloning of callsites involved with recursive cycles
162static cl::opt<bool> AllowRecursiveCallsites(
163 "memprof-allow-recursive-callsites", cl::init(Val: true), cl::Hidden,
164 cl::desc("Allow cloning of callsites involved in recursive cycles"));
165
166static cl::opt<bool> CloneRecursiveContexts(
167 "memprof-clone-recursive-contexts", cl::init(Val: true), cl::Hidden,
168 cl::desc("Allow cloning of contexts through recursive cycles"));
169
170// Generally this is needed for correct assignment of allocation clones to
171// function clones, however, allow it to be disabled for debugging while the
172// functionality is new and being tested more widely.
173static cl::opt<bool>
174 MergeClones("memprof-merge-clones", cl::init(Val: true), cl::Hidden,
175 cl::desc("Merge clones before assigning functions"));
176
177// When disabled, try to detect and prevent cloning of recursive contexts.
178// This is only necessary until we support cloning through recursive cycles.
179// Leave on by default for now, as disabling requires a little bit of compile
180// time overhead and doesn't affect correctness, it will just inflate the cold
181// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
182static cl::opt<bool> AllowRecursiveContexts(
183 "memprof-allow-recursive-contexts", cl::init(Val: true), cl::Hidden,
184 cl::desc("Allow cloning of contexts having recursive cycles"));
185
186// Set the minimum absolute count threshold for allowing inlining of indirect
187// calls promoted during cloning.
188static cl::opt<unsigned> MemProfICPNoInlineThreshold(
189 "memprof-icp-noinline-threshold", cl::init(Val: 2), cl::Hidden,
190 cl::desc("Minimum absolute count for promoted target to be inlinable"));
191
192namespace llvm {
193cl::opt<bool> EnableMemProfContextDisambiguation(
194 "enable-memprof-context-disambiguation", cl::init(Val: false), cl::Hidden,
195 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
196
197// Indicate we are linking with an allocator that supports hot/cold operator
198// new interfaces.
199cl::opt<bool> SupportsHotColdNew(
200 "supports-hot-cold-new", cl::init(Val: false), cl::Hidden,
201 cl::desc("Linking with hot/cold operator new interfaces"));
202
203static cl::opt<bool> MemProfRequireDefinitionForPromotion(
204 "memprof-require-definition-for-promotion", cl::init(Val: false), cl::Hidden,
205 cl::desc(
206 "Require target function definition when promoting indirect calls"));
207} // namespace llvm
208
209extern cl::opt<bool> MemProfReportHintedSizes;
210extern cl::opt<unsigned> MinClonedColdBytePercent;
211
212namespace {
213/// CRTP base for graphs built from either IR or ThinLTO summary index.
214///
215/// The graph represents the call contexts in all memprof metadata on allocation
216/// calls, with nodes for the allocations themselves, as well as for the calls
217/// in each context. The graph is initially built from the allocation memprof
218/// metadata (or summary) MIBs. It is then updated to match calls with callsite
219/// metadata onto the nodes, updating it to reflect any inlining performed on
220/// those calls.
221///
222/// Each MIB (representing an allocation's call context with allocation
223/// behavior) is assigned a unique context id during the graph build. The edges
224/// and nodes in the graph are decorated with the context ids they carry. This
225/// is used to correctly update the graph when cloning is performed so that we
226/// can uniquify the context for a single (possibly cloned) allocation.
227template <typename DerivedCCG, typename FuncTy, typename CallTy>
228class CallsiteContextGraph {
229public:
230 CallsiteContextGraph() = default;
231 CallsiteContextGraph(const CallsiteContextGraph &) = default;
232 CallsiteContextGraph(CallsiteContextGraph &&) = default;
233
234 /// Main entry point to perform analysis and transformations on graph.
235 bool process();
236
237 /// Perform cloning on the graph necessary to uniquely identify the allocation
238 /// behavior of an allocation based on its context.
239 void identifyClones();
240
241 /// Assign callsite clones to functions, cloning functions as needed to
242 /// accommodate the combinations of their callsite clones reached by callers.
243 /// For regular LTO this clones functions and callsites in the IR, but for
244 /// ThinLTO the cloning decisions are noted in the summaries and later applied
245 /// in applyImport.
246 bool assignFunctions();
247
248 void dump() const;
249 void print(raw_ostream &OS) const;
250 void printTotalSizes(raw_ostream &OS) const;
251
252 friend raw_ostream &operator<<(raw_ostream &OS,
253 const CallsiteContextGraph &CCG) {
254 CCG.print(OS);
255 return OS;
256 }
257
258 friend struct GraphTraits<
259 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
260 friend struct DOTGraphTraits<
261 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
262
263 void exportToDot(std::string Label) const;
264
  /// Represents a function clone via FuncTy pointer and clone number pair.
  /// Clone number 0 denotes the original (uncloned) function.
  struct FuncInfo final
      : public std::pair<FuncTy *, unsigned /*Clone number*/> {
    using Base = std::pair<FuncTy *, unsigned>;
    // Implicit conversion from the underlying pair type.
    FuncInfo(const Base &B) : Base(B) {}
    // Default constructed FuncInfo is a null function, clone 0.
    FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
    // True when this refers to an actual function.
    explicit operator bool() const { return this->first != nullptr; }
    FuncTy *func() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
  };
275
276 /// Represents a callsite clone via CallTy and clone number pair.
277 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
278 using Base = std::pair<CallTy, unsigned>;
279 CallInfo(const Base &B) : Base(B) {}
280 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
281 : Base(Call, CloneNo) {}
282 explicit operator bool() const { return (bool)this->first; }
283 CallTy call() const { return this->first; }
284 unsigned cloneNo() const { return this->second; }
285 void setCloneNo(unsigned N) { this->second = N; }
286 void print(raw_ostream &OS) const {
287 if (!operator bool()) {
288 assert(!cloneNo());
289 OS << "null Call";
290 return;
291 }
292 call()->print(OS);
293 OS << "\t(clone " << cloneNo() << ")";
294 }
295 void dump() const {
296 print(OS&: dbgs());
297 dbgs() << "\n";
298 }
299 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
300 Call.print(OS);
301 return OS;
302 }
303 };
304
305 struct ContextEdge;
306
307 /// Node in the Callsite Context Graph
308 struct ContextNode {
309 // Keep this for now since in the IR case where we have an Instruction* it
310 // is not as immediately discoverable. Used for printing richer information
311 // when dumping graph.
312 bool IsAllocation;
313
314 // Keeps track of when the Call was reset to null because there was
315 // recursion.
316 bool Recursive = false;
317
318 // This will be formed by ORing together the AllocationType enum values
319 // for contexts including this node.
320 uint8_t AllocTypes = 0;
321
322 // The corresponding allocation or interior call. This is the primary call
323 // for which we have created this node.
324 CallInfo Call;
325
326 // List of other calls that can be treated the same as the primary call
327 // through cloning. I.e. located in the same function and have the same
328 // (possibly pruned) stack ids. They will be updated the same way as the
329 // primary call when assigning to function clones.
330 SmallVector<CallInfo, 0> MatchingCalls;
331
332 // For alloc nodes this is a unique id assigned when constructed, and for
333 // callsite stack nodes it is the original stack id when the node is
334 // constructed from the memprof MIB metadata on the alloc nodes. Note that
335 // this is only used when matching callsite metadata onto the stack nodes
336 // created when processing the allocation memprof MIBs, and for labeling
337 // nodes in the dot graph. Therefore we don't bother to assign a value for
338 // clones.
339 uint64_t OrigStackOrAllocId = 0;
340
341 // Edges to all callees in the profiled call stacks.
342 // TODO: Should this be a map (from Callee node) for more efficient lookup?
343 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
344
345 // Edges to all callers in the profiled call stacks.
346 // TODO: Should this be a map (from Caller node) for more efficient lookup?
347 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
348
349 // Returns true if we need to look at the callee edges for determining the
350 // node context ids and allocation type.
351 bool useCallerEdgesForContextInfo() const {
352 // Typically if the callee edges are empty either the caller edges are
353 // also empty, or this is an allocation (leaf node). However, if we are
354 // allowing recursive callsites and contexts this will be violated for
355 // incompletely cloned recursive cycles.
356 assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
357 (AllowRecursiveCallsites && AllowRecursiveContexts));
358 // When cloning for a recursive context, during cloning we might be in the
359 // midst of cloning for a recurrence and have moved context ids off of a
360 // caller edge onto the clone but not yet off of the incoming caller
361 // (back) edge. If we don't look at those we miss the fact that this node
362 // still has context ids of interest.
363 return IsAllocation || CloneRecursiveContexts;
364 }
365
366 // Compute the context ids for this node from the union of its edge context
367 // ids.
368 DenseSet<uint32_t> getContextIds() const {
369 unsigned Count = 0;
370 // Compute the number of ids for reserve below. In general we only need to
371 // look at one set of edges, typically the callee edges, since other than
372 // allocations and in some cases during recursion cloning, all the context
373 // ids on the callers should also flow out via callee edges.
374 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
375 Count += Edge->getContextIds().size();
376 DenseSet<uint32_t> ContextIds;
377 ContextIds.reserve(Size: Count);
378 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
379 CalleeEdges, useCallerEdgesForContextInfo()
380 ? CallerEdges
381 : std::vector<std::shared_ptr<ContextEdge>>());
382 for (const auto &Edge : Edges)
383 ContextIds.insert_range(Edge->getContextIds());
384 return ContextIds;
385 }
386
387 // Compute the allocation type for this node from the OR of its edge
388 // allocation types.
389 uint8_t computeAllocType() const {
390 uint8_t BothTypes =
391 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
392 uint8_t AllocType = (uint8_t)AllocationType::None;
393 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
394 CalleeEdges, useCallerEdgesForContextInfo()
395 ? CallerEdges
396 : std::vector<std::shared_ptr<ContextEdge>>());
397 for (const auto &Edge : Edges) {
398 AllocType |= Edge->AllocTypes;
399 // Bail early if alloc type reached both, no further refinement.
400 if (AllocType == BothTypes)
401 return AllocType;
402 }
403 return AllocType;
404 }
405
406 // The context ids set for this node is empty if its edge context ids are
407 // also all empty.
408 bool emptyContextIds() const {
409 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
410 CalleeEdges, useCallerEdgesForContextInfo()
411 ? CallerEdges
412 : std::vector<std::shared_ptr<ContextEdge>>());
413 for (const auto &Edge : Edges) {
414 if (!Edge->getContextIds().empty())
415 return false;
416 }
417 return true;
418 }
419
420 // List of clones of this ContextNode, initially empty.
421 std::vector<ContextNode *> Clones;
422
423 // If a clone, points to the original uncloned node.
424 ContextNode *CloneOf = nullptr;
425
426 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
427
428 ContextNode(bool IsAllocation, CallInfo C)
429 : IsAllocation(IsAllocation), Call(C) {}
430
431 void addClone(ContextNode *Clone) {
432 if (CloneOf) {
433 CloneOf->Clones.push_back(Clone);
434 Clone->CloneOf = CloneOf;
435 } else {
436 Clones.push_back(Clone);
437 assert(!Clone->CloneOf);
438 Clone->CloneOf = this;
439 }
440 }
441
442 ContextNode *getOrigNode() {
443 if (!CloneOf)
444 return this;
445 return CloneOf;
446 }
447
448 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
449 unsigned int ContextId);
450
451 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
452 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
453 void eraseCalleeEdge(const ContextEdge *Edge);
454 void eraseCallerEdge(const ContextEdge *Edge);
455
456 void setCall(CallInfo C) { Call = C; }
457
458 bool hasCall() const { return (bool)Call.call(); }
459
460 void printCall(raw_ostream &OS) const { Call.print(OS); }
461
462 // True if this node was effectively removed from the graph, in which case
463 // it should have an allocation type of None and empty context ids.
464 bool isRemoved() const {
465 // Typically if the callee edges are empty either the caller edges are
466 // also empty, or this is an allocation (leaf node). However, if we are
467 // allowing recursive callsites and contexts this will be violated for
468 // incompletely cloned recursive cycles.
469 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
470 (AllocTypes == (uint8_t)AllocationType::None) ==
471 emptyContextIds());
472 return AllocTypes == (uint8_t)AllocationType::None;
473 }
474
475 void dump() const;
476 void print(raw_ostream &OS) const;
477
478 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
479 Node.print(OS);
480 return OS;
481 }
482 };
483
484 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
485 /// callee.
486 struct ContextEdge {
487 ContextNode *Callee;
488 ContextNode *Caller;
489
490 // This will be formed by ORing together the AllocationType enum values
491 // for contexts including this edge.
492 uint8_t AllocTypes = 0;
493
494 // Set just before initiating cloning when cloning of recursive contexts is
495 // enabled. Used to defer cloning of backedges until we have done cloning of
496 // the callee node for non-backedge caller edges. This exposes cloning
497 // opportunities through the backedge of the cycle.
498 // TODO: Note that this is not updated during cloning, and it is unclear
499 // whether that would be needed.
500 bool IsBackedge = false;
501
502 // The set of IDs for contexts including this edge.
503 DenseSet<uint32_t> ContextIds;
504
505 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
506 DenseSet<uint32_t> ContextIds)
507 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
508 ContextIds(std::move(ContextIds)) {}
509
510 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
511
512 // Helper to clear the fields of this edge when we are removing it from the
513 // graph.
514 inline void clear() {
515 ContextIds.clear();
516 AllocTypes = (uint8_t)AllocationType::None;
517 Caller = nullptr;
518 Callee = nullptr;
519 }
520
521 // Check if edge was removed from the graph. This is useful while iterating
522 // over a copy of edge lists when performing operations that mutate the
523 // graph in ways that might remove one of the edges.
524 inline bool isRemoved() const {
525 if (Callee || Caller)
526 return false;
527 // Any edges that have been removed from the graph but are still in a
528 // shared_ptr somewhere should have all fields null'ed out by clear()
529 // above.
530 assert(AllocTypes == (uint8_t)AllocationType::None);
531 assert(ContextIds.empty());
532 return true;
533 }
534
535 void dump() const;
536 void print(raw_ostream &OS) const;
537
538 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
539 Edge.print(OS);
540 return OS;
541 }
542 };
543
544 /// Helpers to remove edges that have allocation type None (due to not
545 /// carrying any context ids) after transformations.
546 void removeNoneTypeCalleeEdges(ContextNode *Node);
547 void removeNoneTypeCallerEdges(ContextNode *Node);
548 void
549 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
550 DenseSet<const ContextNode *> &Visited);
551
552protected:
553 /// Get a list of nodes corresponding to the stack ids in the given callsite
554 /// context.
555 template <class NodeT, class IteratorT>
556 std::vector<uint64_t>
557 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
558
559 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
560 /// metadata (or summary).
561 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
562
563 /// Adds nodes for the given MIB stack ids.
564 template <class NodeT, class IteratorT>
565 void addStackNodesForMIB(ContextNode *AllocNode,
566 CallStack<NodeT, IteratorT> &StackContext,
567 CallStack<NodeT, IteratorT> &CallsiteContext,
568 AllocationType AllocType,
569 ArrayRef<ContextTotalSize> ContextSizeInfo);
570
571 /// Matches all callsite metadata (or summary) to the nodes created for
572 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
573 /// inlining performed on those callsite instructions.
574 void updateStackNodes();
575
576 /// Update graph to conservatively handle any callsite stack nodes that target
577 /// multiple different callee target functions.
578 void handleCallsitesWithMultipleTargets();
579
580 /// Mark backedges via the standard DFS based backedge algorithm.
581 void markBackedges();
582
583 /// Merge clones generated during cloning for different allocations but that
584 /// are called by the same caller node, to ensure proper function assignment.
585 void mergeClones();
586
587 // Try to partition calls on the given node (already placed into the AllCalls
588 // array) by callee function, creating new copies of Node as needed to hold
589 // calls with different callees, and moving the callee edges appropriately.
590 // Returns true if partitioning was successful.
591 bool partitionCallsByCallee(
592 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
593 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
594
595 /// Save lists of calls with MemProf metadata in each function, for faster
596 /// iteration.
597 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
598
599 /// Map from callsite node to the enclosing caller function.
600 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
601
602 // When exporting to dot, and an allocation id is specified, contains the
603 // context ids on that allocation.
604 DenseSet<uint32_t> DotAllocContextIds;
605
606private:
607 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
608
  // Structure to keep track of information for each call as we are matching
  // non-allocation callsites onto context nodes created from the allocation
  // call metadata / summary contexts.
  struct CallContextInfo {
    // The callsite we're trying to match.
    CallTy Call;
    // The callsite's stack ids that have a context node in the graph.
    std::vector<uint64_t> StackIds;
    // The function containing this callsite.
    const FuncTy *Func;
    // Initially empty, if needed this will be updated to contain the context
    // ids for use in a new context node created for this callsite.
    DenseSet<uint32_t> ContextIds;
  };
623
624 /// Helper to remove edge from graph, updating edge iterator if it is provided
625 /// (in which case CalleeIter indicates which edge list is being iterated).
626 /// This will also perform the necessary clearing of the ContextEdge members
627 /// to enable later checking if the edge has been removed (since we may have
628 /// other copies of the shared_ptr in existence, and in fact rely on this to
629 /// enable removal while iterating over a copy of a node's edge list).
630 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
631 bool CalleeIter = true);
632
633 /// Assigns the given Node to calls at or inlined into the location with
634 /// the Node's stack id, after post order traversing and processing its
635 /// caller nodes. Uses the call information recorded in the given
636 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
637 /// as needed. Called by updateStackNodes which sets up the given
638 /// StackIdToMatchingCalls map.
639 void assignStackNodesPostOrder(
640 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
641 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
642 DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
643
644 /// Duplicates the given set of context ids, updating the provided
645 /// map from each original id with the newly generated context ids,
646 /// and returning the new duplicated id set.
647 DenseSet<uint32_t> duplicateContextIds(
648 const DenseSet<uint32_t> &StackSequenceContextIds,
649 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
650
651 /// Propagates all duplicated context ids across the graph.
652 void propagateDuplicateContextIds(
653 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
654
655 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
656 /// else to its callers. Also updates OrigNode's edges to remove any context
657 /// ids moved to the newly created edge.
658 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
659 bool TowardsCallee,
660 DenseSet<uint32_t> RemainingContextIds);
661
  /// Get the stack id corresponding to the given Id or Index (for IR this will
  /// return itself, for a summary index this will return the id recorded in the
  /// index for that stack id index value).
  uint64_t getStackId(uint64_t IdOrIndex) const {
    // CRTP dispatch to the IR or summary index specific implementation.
    return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
  }
668
669 /// Returns true if the given call targets the callee of the given edge, or if
670 /// we were able to identify the call chain through intermediate tail calls.
671 /// In the latter case new context nodes are added to the graph for the
672 /// identified tail calls, and their synthesized nodes are added to
673 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
674 /// the updated edges and to prepare it for an increment in the caller.
675 bool
676 calleesMatch(CallTy Call, EdgeIter &EI,
677 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
678
  /// Return the callee function of the given call, or nullptr if it can't be
  /// determined.
  const FuncTy *getCalleeFunc(CallTy Call) {
    // CRTP dispatch to the IR or summary index specific implementation.
    return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
  }
684
  /// Returns true if the given call targets the given function, or if we were
  /// able to identify the call chain through intermediate tail calls (in which
  /// case FoundCalleeChain will be populated).
  bool calleeMatchesFunc(
      CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
      std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
    // CRTP dispatch to the IR or summary index specific implementation.
    return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
        Call, Func, CallerFunc, FoundCalleeChain);
  }
694
  /// Returns true if both call instructions have the same callee.
  bool sameCallee(CallTy Call1, CallTy Call2) {
    // CRTP dispatch to the IR or summary index specific implementation.
    return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
  }
699
  /// Get a list of nodes corresponding to the stack ids in the given
  /// callsite's context.
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
    // CRTP dispatch to the IR or summary index specific implementation.
    return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
        Call);
  }
706
  /// Get the last stack id in the context for callsite.
  uint64_t getLastStackId(CallTy Call) {
    // CRTP dispatch to the IR or summary index specific implementation.
    return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
  }
711
712 /// Update the allocation call to record type of allocated memory.
713 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
714 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
715 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
716 }
717
  /// Get the AllocationType assigned to the given allocation instruction clone.
  AllocationType getAllocationCallType(const CallInfo &Call) const {
    // CRTP dispatch to the IR or summary index specific implementation.
    return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
  }
722
  /// Update non-allocation call to invoke (possibly cloned) function
  /// CalleeFunc.
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
    // CRTP dispatch to the IR or summary index specific implementation.
    static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
  }
728
  /// Clone the given function for the given callsite, recording mapping of all
  /// of the functions tracked calls to their new versions in the CallMap.
  /// Assigns new clones to clone number CloneNo.
  FuncInfo cloneFunctionForCallsite(
      FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
      std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
    // CRTP dispatch to the IR or summary index specific implementation.
    return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
        Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
  }
738
739 /// Gets a label to use in the dot graph for the given call clone in the given
740 /// function.
741 std::string getLabel(const FuncTy *Func, const CallTy Call,
742 unsigned CloneNo) const {
743 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
744 }
745
746 // Create and return a new ContextNode.
747 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
748 CallInfo C = CallInfo()) {
749 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
750 auto *NewNode = NodeOwner.back().get();
751 if (F)
752 NodeToCallingFunc[NewNode] = F;
753 return NewNode;
754 }
755
  /// Helpers to find the node corresponding to the given call or stackid.
  /// Each returns nullptr when no node has been recorded for the key.
  ContextNode *getNodeForInst(const CallInfo &C);
  ContextNode *getNodeForAlloc(const CallInfo &C);
  ContextNode *getNodeForStackId(uint64_t StackId);

  /// Computes the alloc type corresponding to the given context ids, by
  /// unioning their recorded alloc types. The result is a bitwise OR of
  /// AllocationType bits and may therefore contain both NotCold and Cold.
  uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets), optimized for the case
  /// when Node1Ids is smaller than Node2Ids.
  uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
                                  const DenseSet<uint32_t> &Node2Ids) const;

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets).
  uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
                              const DenseSet<uint32_t> &Node2Ids) const;
775
  /// Create a clone of Edge's callee and move Edge to that new callee node,
  /// performing the necessary context id and allocation type updates.
  /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
  /// moved to an edge to the new callee.
  ContextNode *
  moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                           DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the callee of Edge to existing callee clone NewCallee, performing
  /// the necessary context id and allocation type updates.
  /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
  /// moved to an edge to the new callee.
  void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                     ContextNode *NewCallee,
                                     bool NewClone = false,
                                     DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the caller of the edge at the given callee edge iterator to be
  /// NewCaller, performing the necessary context id and allocation type
  /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
  /// a simplified version of it as we always move the given edge and all of its
  /// context ids.
  void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                                 ContextNode *NewCaller);

  /// Recursive helper for marking backedges via DFS.
  /// NOTE(review): CurrentStack presumably holds the nodes on the active DFS
  /// path — confirm against the definition.
  void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                     DenseSet<const ContextNode *> &CurrentStack);

  /// Recursive helper for merging clones.
  void
  mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
              DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
  /// Main worker for merging callee clones for a given node.
  void mergeNodeCalleeClones(
      ContextNode *Node, DenseSet<const ContextNode *> &Visited,
      DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
  /// Helper to find other callers of the given set of callee edges that can
  /// share the same callee merge node.
  void findOtherCallersToShareMerge(
      ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
      DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
      DenseSet<ContextNode *> &OtherCallersToShareMerge);
819
  /// Recursively perform cloning on the graph for the given Node and its
  /// callers, in order to uniquely identify the allocation behavior of an
  /// allocation given its context. The context ids of the allocation being
  /// processed are given in AllocContextIds.
  void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                      const DenseSet<uint32_t> &AllocContextIds);

  /// Map from each context ID to the AllocationType assigned to that context.
  DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;

  /// Map from each contextID to the profiled full contexts and their total
  /// sizes (there may be more than one due to context trimming),
  /// optionally populated when requested (via MemProfReportHintedSizes or
  /// MinClonedColdBytePercent).
  DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;

  /// Identifies the context node created for a stack id when adding the MIB
  /// contexts to the graph. This is used to locate the context nodes when
  /// trying to assign the corresponding callsites with those stack ids to these
  /// nodes.
  DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;

  /// Maps to track the calls to their corresponding nodes in the graph.
  /// MapVector is used so iteration order is deterministic (insertion order).
  MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
  MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;

  /// Owner of all ContextNode unique_ptrs. Nodes are handed out as raw
  /// pointers (see createNewNode) and live as long as the graph.
  std::vector<std::unique_ptr<ContextNode>> NodeOwner;

  /// Perform sanity checks on graph when requested.
  void check() const;

  /// Keeps track of the last unique context id assigned.
  unsigned int LastContextId = 0;
854};
855
// Shorthand alias templates for the nested CallsiteContextGraph types, so the
// free helper functions below can be templated on the same parameters without
// spelling out the full nested type names.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
868
/// CRTP derived class for graphs built from IR (regular LTO).
class ModuleCallsiteContextGraph
    : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                  Instruction *> {
public:
  ModuleCallsiteContextGraph(
      Module &M,
      llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);

private:
  friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                              Instruction *>;

  // Implementations of the CRTP interface invoked by the base class; see the
  // corresponding base-class wrappers for documentation of each operation.
  uint64_t getStackId(uint64_t IdOrIndex) const;
  const Function *getCalleeFunc(Instruction *Call);
  bool calleeMatchesFunc(
      Instruction *Call, const Function *Func, const Function *CallerFunc,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
  bool sameCallee(Instruction *Call1, Instruction *Call2);
  bool findProfiledCalleeThroughTailCalls(
      const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(Instruction *Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  AllocationType getAllocationCallType(const CallInfo &Call) const;
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                       Instruction *>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const Function *Func, const Instruction *Call,
                       unsigned CloneNo) const;

  /// The module whose IR this graph was built from.
  const Module &Mod;
  /// Callback to obtain an OptimizationRemarkEmitter for a given function.
  llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};
909
910/// Represents a call in the summary index graph, which can either be an
911/// allocation or an interior callsite node in an allocation's context.
912/// Holds a pointer to the corresponding data structure in the index.
913struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
914 IndexCall() : PointerUnion() {}
915 IndexCall(std::nullptr_t) : IndexCall() {}
916 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
917 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
918 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
919
920 IndexCall *operator->() { return this; }
921
922 void print(raw_ostream &OS) const {
923 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
924 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Val&: Base)) {
925 OS << *AI;
926 } else {
927 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Val&: Base);
928 assert(CI);
929 OS << *CI;
930 }
931 }
932};
933} // namespace
934
935namespace llvm {
// Teach llvm's casting machinery (isa/dyn_cast/...) to see through IndexCall
// to its underlying PointerUnion, for both const and non-const values.
template <> struct simplify_type<IndexCall> {
  using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
};
template <> struct simplify_type<const IndexCall> {
  using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
};
944} // namespace llvm
945
946namespace {
947/// CRTP derived class for graphs built from summary index (ThinLTO).
948class IndexCallsiteContextGraph
949 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
950 IndexCall> {
951public:
952 IndexCallsiteContextGraph(
953 ModuleSummaryIndex &Index,
954 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
955 isPrevailing);
956
957 ~IndexCallsiteContextGraph() {
958 // Now that we are done with the graph it is safe to add the new
959 // CallsiteInfo structs to the function summary vectors. The graph nodes
960 // point into locations within these vectors, so we don't want to add them
961 // any earlier.
962 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
963 auto *FS = I.first;
964 for (auto &Callsite : I.second)
965 FS->addCallsite(Callsite&: *Callsite.second);
966 }
967 }
968
969private:
970 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
971 IndexCall>;
972
973 uint64_t getStackId(uint64_t IdOrIndex) const;
974 const FunctionSummary *getCalleeFunc(IndexCall &Call);
975 bool calleeMatchesFunc(
976 IndexCall &Call, const FunctionSummary *Func,
977 const FunctionSummary *CallerFunc,
978 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
979 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
980 bool findProfiledCalleeThroughTailCalls(
981 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
982 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
983 bool &FoundMultipleCalleeChains);
984 uint64_t getLastStackId(IndexCall &Call);
985 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
986 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
987 AllocationType getAllocationCallType(const CallInfo &Call) const;
988 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
989 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
990 IndexCall>::FuncInfo
991 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
992 std::map<CallInfo, CallInfo> &CallMap,
993 std::vector<CallInfo> &CallsWithMetadataInFunc,
994 unsigned CloneNo);
995 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
996 unsigned CloneNo) const;
997
998 // Saves mapping from function summaries containing memprof records back to
999 // its VI, for use in checking and debugging.
1000 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
1001
1002 const ModuleSummaryIndex &Index;
1003 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1004 isPrevailing;
1005
1006 // Saves/owns the callsite info structures synthesized for missing tail call
1007 // frames that we discover while building the graph.
1008 // It maps from the summary of the function making the tail call, to a map
1009 // of callee ValueInfo to corresponding synthesized callsite info.
1010 std::unordered_map<FunctionSummary *,
1011 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
1012 FunctionCalleesToSynthesizedCallsiteInfos;
1013};
1014} // namespace
1015
1016namespace llvm {
// DenseMapInfo specializations so CallInfo and IndexCall can be used as
// DenseMap/MapVector keys. Each simply delegates to the DenseMapInfo of the
// underlying representation (a std::pair or a PointerUnion).
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
    : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
    : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
struct DenseMapInfo<IndexCall>
    : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1028} // end namespace llvm
1029
1030namespace {
1031
1032// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1033// type we should actually use on the corresponding allocation.
1034// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1035// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1036// from NotCold.
1037AllocationType allocTypeToUse(uint8_t AllocTypes) {
1038 assert(AllocTypes != (uint8_t)AllocationType::None);
1039 if (AllocTypes ==
1040 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1041 return AllocationType::NotCold;
1042 else
1043 return (AllocationType)AllocTypes;
1044}
1045
1046// Helper to check if the alloc types for all edges recorded in the
1047// InAllocTypes vector match the alloc types for all edges in the Edges
1048// vector.
1049template <typename DerivedCCG, typename FuncTy, typename CallTy>
1050bool allocTypesMatch(
1051 const std::vector<uint8_t> &InAllocTypes,
1052 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1053 &Edges) {
1054 // This should be called only when the InAllocTypes vector was computed for
1055 // this set of Edges. Make sure the sizes are the same.
1056 assert(InAllocTypes.size() == Edges.size());
1057 return std::equal(
1058 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1059 [](const uint8_t &l,
1060 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1061 // Can share if one of the edges is None type - don't
1062 // care about the type along that edge as it doesn't
1063 // exist for those context ids.
1064 if (l == (uint8_t)AllocationType::None ||
1065 r->AllocTypes == (uint8_t)AllocationType::None)
1066 return true;
1067 return allocTypeToUse(AllocTypes: l) == allocTypeToUse(r->AllocTypes);
1068 });
1069}
1070
1071// Helper to check if the alloc types for all edges recorded in the
1072// InAllocTypes vector match the alloc types for callee edges in the given
1073// clone. Because the InAllocTypes were computed from the original node's callee
1074// edges, and other cloning could have happened after this clone was created, we
1075// need to find the matching clone callee edge, which may or may not exist.
1076template <typename DerivedCCG, typename FuncTy, typename CallTy>
1077bool allocTypesMatchClone(
1078 const std::vector<uint8_t> &InAllocTypes,
1079 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1080 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1081 assert(Node);
1082 // InAllocTypes should have been computed for the original node's callee
1083 // edges.
1084 assert(InAllocTypes.size() == Node->CalleeEdges.size());
1085 // First create a map of the clone callee edge callees to the edge alloc type.
1086 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
1087 EdgeCalleeMap;
1088 for (const auto &E : Clone->CalleeEdges) {
1089 assert(!EdgeCalleeMap.contains(E->Callee));
1090 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1091 }
1092 // Next, walk the original node's callees, and look for the corresponding
1093 // clone edge to that callee.
1094 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1095 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1096 // Not found is ok, we will simply add an edge if we use this clone.
1097 if (Iter == EdgeCalleeMap.end())
1098 continue;
1099 // Can share if one of the edges is None type - don't
1100 // care about the type along that edge as it doesn't
1101 // exist for those context ids.
1102 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1103 Iter->second == (uint8_t)AllocationType::None)
1104 continue;
1105 if (allocTypeToUse(Iter->second) != allocTypeToUse(AllocTypes: InAllocTypes[I]))
1106 return false;
1107 }
1108 return true;
1109}
1110
1111} // end anonymous namespace
1112
1113template <typename DerivedCCG, typename FuncTy, typename CallTy>
1114typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1115CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1116 const CallInfo &C) {
1117 ContextNode *Node = getNodeForAlloc(C);
1118 if (Node)
1119 return Node;
1120
1121 return NonAllocationCallToContextNodeMap.lookup(C);
1122}
1123
// Returns the context node recorded for the given allocation call, or nullptr
// if this call is not a known allocation (MapVector::lookup default).
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
    const CallInfo &C) {
  return AllocationCallToContextNodeMap.lookup(C);
}
1130
1131template <typename DerivedCCG, typename FuncTy, typename CallTy>
1132typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1133CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1134 uint64_t StackId) {
1135 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1136 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1137 return StackEntryNode->second;
1138 return nullptr;
1139}
1140
1141template <typename DerivedCCG, typename FuncTy, typename CallTy>
1142void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1143 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1144 unsigned int ContextId) {
1145 for (auto &Edge : CallerEdges) {
1146 if (Edge->Caller == Caller) {
1147 Edge->AllocTypes |= (uint8_t)AllocType;
1148 Edge->getContextIds().insert(ContextId);
1149 return;
1150 }
1151 }
1152 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1153 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1154 CallerEdges.push_back(Edge);
1155 Caller->CalleeEdges.push_back(Edge);
1156}
1157
// Remove Edge from the graph, unlinking it from both its caller's callee list
// and its callee's caller list. If EI is non-null it must point at Edge in
// one of those lists: when CalleeIter is true, EI iterates the caller's
// CalleeEdges; otherwise it iterates the callee's CallerEdges. In either case
// *EI is advanced past the erased element so the caller's loop stays valid.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
    ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
  assert(!EI || (*EI)->get() == Edge);
  assert(!Edge->isRemoved());
  // Save the Caller and Callee pointers so we can erase Edge from their edge
  // lists after clearing Edge below. We do the clearing first in case it is
  // destructed after removing from the edge lists (if those were the last
  // shared_ptr references to Edge).
  auto *Callee = Edge->Callee;
  auto *Caller = Edge->Caller;

  // Make sure the edge fields are cleared out so we can properly detect
  // removed edges if Edge is not destructed because there is still a shared_ptr
  // reference.
  Edge->clear();

#ifndef NDEBUG
  // Pre-removal list sizes, used only by the asserts below to verify that one
  // element was actually erased from each list.
  auto CalleeCallerCount = Callee->CallerEdges.size();
  auto CallerCalleeCount = Caller->CalleeEdges.size();
#endif
  if (!EI) {
    Callee->eraseCallerEdge(Edge);
    Caller->eraseCalleeEdge(Edge);
  } else if (CalleeIter) {
    Callee->eraseCallerEdge(Edge);
    *EI = Caller->CalleeEdges.erase(*EI);
  } else {
    Caller->eraseCalleeEdge(Edge);
    *EI = Callee->CallerEdges.erase(*EI);
  }
  assert(Callee->CallerEdges.size() < CalleeCallerCount);
  assert(Caller->CalleeEdges.size() < CallerCalleeCount);
}
1192
1193template <typename DerivedCCG, typename FuncTy, typename CallTy>
1194void CallsiteContextGraph<
1195 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1196 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1197 auto Edge = *EI;
1198 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1199 assert(Edge->ContextIds.empty());
1200 removeEdgeFromGraph(Edge: Edge.get(), EI: &EI, /*CalleeIter=*/true);
1201 } else
1202 ++EI;
1203 }
1204}
1205
1206template <typename DerivedCCG, typename FuncTy, typename CallTy>
1207void CallsiteContextGraph<
1208 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1209 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1210 auto Edge = *EI;
1211 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1212 assert(Edge->ContextIds.empty());
1213 Edge->Caller->eraseCalleeEdge(Edge.get());
1214 EI = Node->CallerEdges.erase(EI);
1215 } else
1216 ++EI;
1217 }
1218}
1219
1220template <typename DerivedCCG, typename FuncTy, typename CallTy>
1221typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1222CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1223 findEdgeFromCallee(const ContextNode *Callee) {
1224 for (const auto &Edge : CalleeEdges)
1225 if (Edge->Callee == Callee)
1226 return Edge.get();
1227 return nullptr;
1228}
1229
1230template <typename DerivedCCG, typename FuncTy, typename CallTy>
1231typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1232CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1233 findEdgeFromCaller(const ContextNode *Caller) {
1234 for (const auto &Edge : CallerEdges)
1235 if (Edge->Caller == Caller)
1236 return Edge.get();
1237 return nullptr;
1238}
1239
1240template <typename DerivedCCG, typename FuncTy, typename CallTy>
1241void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1242 eraseCalleeEdge(const ContextEdge *Edge) {
1243 auto EI = llvm::find_if(
1244 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1245 return CalleeEdge.get() == Edge;
1246 });
1247 assert(EI != CalleeEdges.end());
1248 CalleeEdges.erase(EI);
1249}
1250
1251template <typename DerivedCCG, typename FuncTy, typename CallTy>
1252void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1253 eraseCallerEdge(const ContextEdge *Edge) {
1254 auto EI = llvm::find_if(
1255 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1256 return CallerEdge.get() == Edge;
1257 });
1258 assert(EI != CallerEdges.end());
1259 CallerEdges.erase(EI);
1260}
1261
1262template <typename DerivedCCG, typename FuncTy, typename CallTy>
1263uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1264 DenseSet<uint32_t> &ContextIds) const {
1265 uint8_t BothTypes =
1266 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1267 uint8_t AllocType = (uint8_t)AllocationType::None;
1268 for (auto Id : ContextIds) {
1269 AllocType |= (uint8_t)ContextIdToAllocationType.at(Val: Id);
1270 // Bail early if alloc type reached both, no further refinement.
1271 if (AllocType == BothTypes)
1272 return AllocType;
1273 }
1274 return AllocType;
1275}
1276
1277template <typename DerivedCCG, typename FuncTy, typename CallTy>
1278uint8_t
1279CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1280 const DenseSet<uint32_t> &Node1Ids,
1281 const DenseSet<uint32_t> &Node2Ids) const {
1282 uint8_t BothTypes =
1283 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1284 uint8_t AllocType = (uint8_t)AllocationType::None;
1285 for (auto Id : Node1Ids) {
1286 if (!Node2Ids.count(V: Id))
1287 continue;
1288 AllocType |= (uint8_t)ContextIdToAllocationType.at(Val: Id);
1289 // Bail early if alloc type reached both, no further refinement.
1290 if (AllocType == BothTypes)
1291 return AllocType;
1292 }
1293 return AllocType;
1294}
1295
1296template <typename DerivedCCG, typename FuncTy, typename CallTy>
1297uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1298 const DenseSet<uint32_t> &Node1Ids,
1299 const DenseSet<uint32_t> &Node2Ids) const {
1300 if (Node1Ids.size() < Node2Ids.size())
1301 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1302 else
1303 return intersectAllocTypesImpl(Node1Ids: Node2Ids, Node2Ids: Node1Ids);
1304}
1305
1306template <typename DerivedCCG, typename FuncTy, typename CallTy>
1307typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1308CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1309 CallInfo Call, const FuncTy *F) {
1310 assert(!getNodeForAlloc(Call));
1311 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, C: Call);
1312 AllocationCallToContextNodeMap[Call] = AllocNode;
1313 // Use LastContextId as a uniq id for MIB allocation nodes.
1314 AllocNode->OrigStackOrAllocId = LastContextId;
1315 // Alloc type should be updated as we add in the MIBs. We should assert
1316 // afterwards that it is not still None.
1317 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1318
1319 return AllocNode;
1320}
1321
1322static std::string getAllocTypeString(uint8_t AllocTypes) {
1323 if (!AllocTypes)
1324 return "None";
1325 std::string Str;
1326 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1327 Str += "NotCold";
1328 if (AllocTypes & (uint8_t)AllocationType::Cold)
1329 Str += "Cold";
1330 return Str;
1331}
1332
1333template <typename DerivedCCG, typename FuncTy, typename CallTy>
1334template <class NodeT, class IteratorT>
1335void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1336 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1337 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1338 ArrayRef<ContextTotalSize> ContextSizeInfo) {
1339 // Treating the hot alloc type as NotCold before the disambiguation for "hot"
1340 // is done.
1341 if (AllocType == AllocationType::Hot)
1342 AllocType = AllocationType::NotCold;
1343
1344 ContextIdToAllocationType[++LastContextId] = AllocType;
1345
1346 if (!ContextSizeInfo.empty()) {
1347 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1348 Entry.insert(position: Entry.begin(), first: ContextSizeInfo.begin(), last: ContextSizeInfo.end());
1349 }
1350
1351 // Update alloc type and context ids for this MIB.
1352 AllocNode->AllocTypes |= (uint8_t)AllocType;
1353
1354 // Now add or update nodes for each stack id in alloc's context.
1355 // Later when processing the stack ids on non-alloc callsites we will adjust
1356 // for any inlining in the context.
1357 ContextNode *PrevNode = AllocNode;
1358 // Look for recursion (direct recursion should have been collapsed by
1359 // module summary analysis, here we should just be detecting mutual
1360 // recursion). Mark these nodes so we don't try to clone.
1361 SmallSet<uint64_t, 8> StackIdSet;
1362 // Skip any on the allocation call (inlining).
1363 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1364 ContextIter != StackContext.end(); ++ContextIter) {
1365 auto StackId = getStackId(IdOrIndex: *ContextIter);
1366 ContextNode *StackNode = getNodeForStackId(StackId);
1367 if (!StackNode) {
1368 StackNode = createNewNode(/*IsAllocation=*/false);
1369 StackEntryIdToContextNodeMap[StackId] = StackNode;
1370 StackNode->OrigStackOrAllocId = StackId;
1371 }
1372 // Marking a node recursive will prevent its cloning completely, even for
1373 // non-recursive contexts flowing through it.
1374 if (!AllowRecursiveCallsites) {
1375 auto Ins = StackIdSet.insert(StackId);
1376 if (!Ins.second)
1377 StackNode->Recursive = true;
1378 }
1379 StackNode->AllocTypes |= (uint8_t)AllocType;
1380 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1381 PrevNode = StackNode;
1382 }
1383}
1384
1385template <typename DerivedCCG, typename FuncTy, typename CallTy>
1386DenseSet<uint32_t>
1387CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1388 const DenseSet<uint32_t> &StackSequenceContextIds,
1389 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1390 DenseSet<uint32_t> NewContextIds;
1391 for (auto OldId : StackSequenceContextIds) {
1392 NewContextIds.insert(V: ++LastContextId);
1393 OldToNewContextIds[OldId].insert(V: LastContextId);
1394 assert(ContextIdToAllocationType.count(OldId));
1395 // The new context has the same allocation type as original.
1396 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1397 if (DotAllocContextIds.contains(V: OldId))
1398 DotAllocContextIds.insert(V: LastContextId);
1399 }
1400 return NewContextIds;
1401}
1402
1403template <typename DerivedCCG, typename FuncTy, typename CallTy>
1404void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1405 propagateDuplicateContextIds(
1406 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1407 // Build a set of duplicated context ids corresponding to the input id set.
1408 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1409 DenseSet<uint32_t> NewIds;
1410 for (auto Id : ContextIds)
1411 if (auto NewId = OldToNewContextIds.find(Val: Id);
1412 NewId != OldToNewContextIds.end())
1413 NewIds.insert_range(R: NewId->second);
1414 return NewIds;
1415 };
1416
1417 // Recursively update context ids sets along caller edges.
1418 auto UpdateCallers = [&](ContextNode *Node,
1419 DenseSet<const ContextEdge *> &Visited,
1420 auto &&UpdateCallers) -> void {
1421 for (const auto &Edge : Node->CallerEdges) {
1422 auto Inserted = Visited.insert(Edge.get());
1423 if (!Inserted.second)
1424 continue;
1425 ContextNode *NextNode = Edge->Caller;
1426 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1427 // Only need to recursively iterate to NextNode via this caller edge if
1428 // it resulted in any added ids to NextNode.
1429 if (!NewIdsToAdd.empty()) {
1430 Edge->getContextIds().insert_range(NewIdsToAdd);
1431 UpdateCallers(NextNode, Visited, UpdateCallers);
1432 }
1433 }
1434 };
1435
1436 DenseSet<const ContextEdge *> Visited;
1437 for (auto &Entry : AllocationCallToContextNodeMap) {
1438 auto *Node = Entry.second;
1439 UpdateCallers(Node, Visited, UpdateCallers);
1440 }
1441}
1442
1443template <typename DerivedCCG, typename FuncTy, typename CallTy>
1444void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1445 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1446 // This must be passed by value to make a copy since it will be adjusted
1447 // as ids are moved.
1448 DenseSet<uint32_t> RemainingContextIds) {
1449 auto &OrigEdges =
1450 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1451 DenseSet<uint32_t> RecursiveContextIds;
1452 DenseSet<uint32_t> AllCallerContextIds;
1453 if (AllowRecursiveCallsites) {
1454 // Identify which context ids are recursive which is needed to properly
1455 // update the RemainingContextIds set. The relevant recursive context ids
1456 // are those that are in multiple edges.
1457 for (auto &CE : OrigEdges) {
1458 AllCallerContextIds.reserve(Size: CE->getContextIds().size());
1459 for (auto Id : CE->getContextIds())
1460 if (!AllCallerContextIds.insert(Id).second)
1461 RecursiveContextIds.insert(Id);
1462 }
1463 }
1464 // Increment iterator in loop so that we can remove edges as needed.
1465 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1466 auto Edge = *EI;
1467 DenseSet<uint32_t> NewEdgeContextIds;
1468 DenseSet<uint32_t> NotFoundContextIds;
1469 // Remove any matching context ids from Edge, return set that were found and
1470 // removed, these are the new edge's context ids. Also update the remaining
1471 // (not found ids).
1472 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1473 NotFoundContextIds);
1474 // Update the remaining context ids set for the later edges. This is a
1475 // compile time optimization.
1476 if (RecursiveContextIds.empty()) {
1477 // No recursive ids, so all of the previously remaining context ids that
1478 // were not seen on this edge are the new remaining set.
1479 RemainingContextIds.swap(RHS&: NotFoundContextIds);
1480 } else {
1481 // Keep the recursive ids in the remaining set as we expect to see those
1482 // on another edge. We can remove the non-recursive remaining ids that
1483 // were seen on this edge, however. We already have the set of remaining
1484 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1485 // non-recursive and only remove those. Note that despite the higher
1486 // overhead of updating the remaining context ids set when recursion
1487 // handling is enabled, it was found to be at worst performance neutral
1488 // and in one case a clear win.
1489 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1490 set_difference(S1: NewEdgeContextIds, S2: RecursiveContextIds);
1491 set_subtract(S1&: RemainingContextIds, S2: NonRecursiveRemainingCurEdgeIds);
1492 }
1493 // If no matching context ids for this edge, skip it.
1494 if (NewEdgeContextIds.empty()) {
1495 ++EI;
1496 continue;
1497 }
1498 if (TowardsCallee) {
1499 uint8_t NewAllocType = computeAllocType(ContextIds&: NewEdgeContextIds);
1500 auto NewEdge = std::make_shared<ContextEdge>(
1501 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1502 NewNode->CalleeEdges.push_back(NewEdge);
1503 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1504 } else {
1505 uint8_t NewAllocType = computeAllocType(ContextIds&: NewEdgeContextIds);
1506 auto NewEdge = std::make_shared<ContextEdge>(
1507 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1508 NewNode->CallerEdges.push_back(NewEdge);
1509 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1510 }
1511 // Remove old edge if context ids empty.
1512 if (Edge->getContextIds().empty()) {
1513 removeEdgeFromGraph(Edge: Edge.get(), EI: &EI, CalleeIter: TowardsCallee);
1514 continue;
1515 }
1516 ++EI;
1517 }
1518}
1519
// Debug-build sanity check for a single context edge: every live edge must
// carry a concrete allocation type and at least one context id. Compiles to
// nothing in release (NDEBUG) builds since the body is assert-only.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkEdge(
    const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
  // Confirm that alloc type is not None and that we have at least one context
  // id.
  assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
  assert(!Edge->ContextIds.empty());
}
1528
// Debug-build sanity check for a context node: verifies that the node's
// context ids are consistent with the union of its caller edge ids and equal
// to the union of its callee edge ids (modulo recursion-related exceptions),
// and that there are no duplicate edges to the same callee. Optionally also
// validates each individual edge via checkEdge.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
                      bool CheckEdges = true) {
  if (Node->isRemoved())
    return;
#ifndef NDEBUG
  // Compute node's context ids once for use in asserts.
  auto NodeContextIds = Node->getContextIds();
#endif
  // Node's context ids should be the union of both its callee and caller edge
  // context ids.
  if (Node->CallerEdges.size()) {
    // Seed the union with the first edge's ids; note that the front edge only
    // seeds the union and is not itself passed to checkEdge here.
    DenseSet<uint32_t> CallerEdgeContextIds(
        Node->CallerEdges.front()->ContextIds);
    for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CallerEdgeContextIds, Edge->ContextIds);
    }
    // Node can have more context ids than callers if some contexts terminate at
    // node and some are longer. If we are allowing recursive callsites and
    // contexts this will be violated for incompletely cloned recursive cycles,
    // so skip the checking in that case.
    assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
           NodeContextIds == CallerEdgeContextIds ||
           set_is_subset(CallerEdgeContextIds, NodeContextIds));
  }
  if (Node->CalleeEdges.size()) {
    DenseSet<uint32_t> CalleeEdgeContextIds(
        Node->CalleeEdges.front()->ContextIds);
    for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CalleeEdgeContextIds, Edge->getContextIds());
    }
    // If we are allowing recursive callsites and contexts this will be violated
    // for incompletely cloned recursive cycles, so skip the checking in that
    // case.
    assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
           NodeContextIds == CalleeEdgeContextIds);
  }
  // FIXME: Since this checking is only invoked under an option, we should
  // change the error checking from using assert to something that will trigger
  // an error on a release build.
#ifndef NDEBUG
  // Make sure we don't end up with duplicate edges between the same caller and
  // callee.
  DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
  for (const auto &E : Node->CalleeEdges)
    NodeSet.insert(E->Callee);
  assert(NodeSet.size() == Node->CalleeEdges.size());
#endif
}
1582
1583template <typename DerivedCCG, typename FuncTy, typename CallTy>
1584void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1585 assignStackNodesPostOrder(
1586 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1587 DenseMap<uint64_t, std::vector<CallContextInfo>>
1588 &StackIdToMatchingCalls,
1589 DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1590 auto Inserted = Visited.insert(Node);
1591 if (!Inserted.second)
1592 return;
1593 // Post order traversal. Iterate over a copy since we may add nodes and
1594 // therefore new callers during the recursive call, invalidating any
1595 // iterator over the original edge vector. We don't need to process these
1596 // new nodes as they were already processed on creation.
1597 auto CallerEdges = Node->CallerEdges;
1598 for (auto &Edge : CallerEdges) {
1599 // Skip any that have been removed during the recursion.
1600 if (Edge->isRemoved()) {
1601 assert(!is_contained(Node->CallerEdges, Edge));
1602 continue;
1603 }
1604 assignStackNodesPostOrder(Node: Edge->Caller, Visited, StackIdToMatchingCalls,
1605 CallToMatchingCall);
1606 }
1607
1608 // If this node's stack id is in the map, update the graph to contain new
1609 // nodes representing any inlining at interior callsites. Note we move the
1610 // associated context ids over to the new nodes.
1611
1612 // Ignore this node if it is for an allocation or we didn't record any
1613 // stack id lists ending at it.
1614 if (Node->IsAllocation ||
1615 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1616 return;
1617
1618 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1619 // Handle the simple case first. A single call with a single stack id.
1620 // In this case there is no need to create any new context nodes, simply
1621 // assign the context node for stack id to this Call.
1622 if (Calls.size() == 1) {
1623 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1624 if (Ids.size() == 1) {
1625 assert(SavedContextIds.empty());
1626 // It should be this Node
1627 assert(Node == getNodeForStackId(Ids[0]));
1628 if (Node->Recursive)
1629 return;
1630 Node->setCall(Call);
1631 NonAllocationCallToContextNodeMap[Call] = Node;
1632 NodeToCallingFunc[Node] = Func;
1633 return;
1634 }
1635 }
1636
1637#ifndef NDEBUG
1638 // Find the node for the last stack id, which should be the same
1639 // across all calls recorded for this id, and is this node's id.
1640 uint64_t LastId = Node->OrigStackOrAllocId;
1641 ContextNode *LastNode = getNodeForStackId(LastId);
1642 // We should only have kept stack ids that had nodes.
1643 assert(LastNode);
1644 assert(LastNode == Node);
1645#else
1646 ContextNode *LastNode = Node;
1647#endif
1648
1649 // Compute the last node's context ids once, as it is shared by all calls in
1650 // this entry.
1651 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1652
1653 [[maybe_unused]] bool PrevIterCreatedNode = false;
1654 bool CreatedNode = false;
1655 for (unsigned I = 0; I < Calls.size();
1656 I++, PrevIterCreatedNode = CreatedNode) {
1657 CreatedNode = false;
1658 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1659 // Skip any for which we didn't assign any ids, these don't get a node in
1660 // the graph.
1661 if (SavedContextIds.empty()) {
1662 // If this call has a matching call (located in the same function and
1663 // having the same stack ids), simply add it to the context node created
1664 // for its matching call earlier. These can be treated the same through
1665 // cloning and get updated at the same time.
1666 if (!CallToMatchingCall.contains(Call))
1667 continue;
1668 auto MatchingCall = CallToMatchingCall[Call];
1669 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1670 // This should only happen if we had a prior iteration, and it didn't
1671 // create a node because of the below recomputation of context ids
1672 // finding none remaining and continuing early.
1673 assert(I > 0 && !PrevIterCreatedNode);
1674 continue;
1675 }
1676 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1677 Call);
1678 continue;
1679 }
1680
1681 assert(LastId == Ids.back());
1682
1683 // Recompute the context ids for this stack id sequence (the
1684 // intersection of the context ids of the corresponding nodes).
1685 // Start with the ids we saved in the map for this call, which could be
1686 // duplicated context ids. We have to recompute as we might have overlap
1687 // overlap between the saved context ids for different last nodes, and
1688 // removed them already during the post order traversal.
1689 set_intersect(SavedContextIds, LastNodeContextIds);
1690 ContextNode *PrevNode = LastNode;
1691 bool Skip = false;
1692 // Iterate backwards through the stack Ids, starting after the last Id
1693 // in the list, which was handled once outside for all Calls.
1694 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1695 auto Id = *IdIter;
1696 ContextNode *CurNode = getNodeForStackId(StackId: Id);
1697 // We should only have kept stack ids that had nodes and weren't
1698 // recursive.
1699 assert(CurNode);
1700 assert(!CurNode->Recursive);
1701
1702 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1703 if (!Edge) {
1704 Skip = true;
1705 break;
1706 }
1707 PrevNode = CurNode;
1708
1709 // Update the context ids, which is the intersection of the ids along
1710 // all edges in the sequence.
1711 set_intersect(SavedContextIds, Edge->getContextIds());
1712
1713 // If we now have no context ids for clone, skip this call.
1714 if (SavedContextIds.empty()) {
1715 Skip = true;
1716 break;
1717 }
1718 }
1719 if (Skip)
1720 continue;
1721
1722 // Create new context node.
1723 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, F: Func, C: Call);
1724 NonAllocationCallToContextNodeMap[Call] = NewNode;
1725 CreatedNode = true;
1726 NewNode->AllocTypes = computeAllocType(ContextIds&: SavedContextIds);
1727
1728 ContextNode *FirstNode = getNodeForStackId(StackId: Ids[0]);
1729 assert(FirstNode);
1730
1731 // Connect to callees of innermost stack frame in inlined call chain.
1732 // This updates context ids for FirstNode's callee's to reflect those
1733 // moved to NewNode.
1734 connectNewNode(NewNode, OrigNode: FirstNode, /*TowardsCallee=*/true, RemainingContextIds: SavedContextIds);
1735
1736 // Connect to callers of outermost stack frame in inlined call chain.
1737 // This updates context ids for FirstNode's caller's to reflect those
1738 // moved to NewNode.
1739 connectNewNode(NewNode, OrigNode: LastNode, /*TowardsCallee=*/false, RemainingContextIds: SavedContextIds);
1740
1741 // Now we need to remove context ids from edges/nodes between First and
1742 // Last Node.
1743 PrevNode = nullptr;
1744 for (auto Id : Ids) {
1745 ContextNode *CurNode = getNodeForStackId(StackId: Id);
1746 // We should only have kept stack ids that had nodes.
1747 assert(CurNode);
1748
1749 // Remove the context ids moved to NewNode from CurNode, and the
1750 // edge from the prior node.
1751 if (PrevNode) {
1752 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1753 // If the sequence contained recursion, we might have already removed
1754 // some edges during the connectNewNode calls above.
1755 if (!PrevEdge) {
1756 PrevNode = CurNode;
1757 continue;
1758 }
1759 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1760 if (PrevEdge->getContextIds().empty())
1761 removeEdgeFromGraph(Edge: PrevEdge);
1762 }
1763 // Since we update the edges from leaf to tail, only look at the callee
1764 // edges. This isn't an alloc node, so if there are no callee edges, the
1765 // alloc type is None.
1766 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1767 ? (uint8_t)AllocationType::None
1768 : CurNode->computeAllocType();
1769 PrevNode = CurNode;
1770 }
1771 if (VerifyNodes) {
1772 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1773 for (auto Id : Ids) {
1774 ContextNode *CurNode = getNodeForStackId(StackId: Id);
1775 // We should only have kept stack ids that had nodes.
1776 assert(CurNode);
1777 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1778 }
1779 }
1780 }
1781}
1782
1783template <typename DerivedCCG, typename FuncTy, typename CallTy>
1784void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1785 // Map of stack id to all calls with that as the last (outermost caller)
1786 // callsite id that has a context node (some might not due to pruning
1787 // performed during matching of the allocation profile contexts).
1788 // The CallContextInfo contains the Call and a list of its stack ids with
1789 // ContextNodes, the function containing Call, and the set of context ids
1790 // the analysis will eventually identify for use in any new node created
1791 // for that callsite.
1792 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1793 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1794 for (auto &Call : CallsWithMetadata) {
1795 // Ignore allocations, already handled.
1796 if (AllocationCallToContextNodeMap.count(Call))
1797 continue;
1798 auto StackIdsWithContextNodes =
1799 getStackIdsWithContextNodesForCall(Call: Call.call());
1800 // If there were no nodes created for MIBs on allocs (maybe this was in
1801 // the unambiguous part of the MIB stack that was pruned), ignore.
1802 if (StackIdsWithContextNodes.empty())
1803 continue;
1804 // Otherwise, record this Call along with the list of ids for the last
1805 // (outermost caller) stack id with a node.
1806 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1807 {Call.call(), StackIdsWithContextNodes, Func, {}});
1808 }
1809 }
1810
1811 // First make a pass through all stack ids that correspond to a call,
1812 // as identified in the above loop. Compute the context ids corresponding to
1813 // each of these calls when they correspond to multiple stack ids due to
1814 // due to inlining. Perform any duplication of context ids required when
1815 // there is more than one call with the same stack ids. Their (possibly newly
1816 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1817 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1818 // Save a map from each call to any that are found to match it. I.e. located
1819 // in the same function and have the same (possibly pruned) stack ids. We use
1820 // this to avoid creating extra graph nodes as they can be treated the same.
1821 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1822 for (auto &It : StackIdToMatchingCalls) {
1823 auto &Calls = It.getSecond();
1824 // Skip single calls with a single stack id. These don't need a new node.
1825 if (Calls.size() == 1) {
1826 auto &Ids = Calls[0].StackIds;
1827 if (Ids.size() == 1)
1828 continue;
1829 }
1830 // In order to do the best and maximal matching of inlined calls to context
1831 // node sequences we will sort the vectors of stack ids in descending order
1832 // of length, and within each length, lexicographically by stack id. The
1833 // latter is so that we can specially handle calls that have identical stack
1834 // id sequences (either due to cloning or artificially because of the MIB
1835 // context pruning). Those with the same Ids are then sorted by function to
1836 // facilitate efficiently mapping them to the same context node.
1837 // Because the functions are pointers, to ensure a stable sort first assign
1838 // each function pointer to its first index in the Calls array, and then use
1839 // that to sort by.
1840 DenseMap<const FuncTy *, unsigned> FuncToIndex;
1841 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
1842 FuncToIndex.insert({CallCtxInfo.Func, Idx});
1843 llvm::stable_sort(
1844 Calls,
1845 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
1846 return A.StackIds.size() > B.StackIds.size() ||
1847 (A.StackIds.size() == B.StackIds.size() &&
1848 (A.StackIds < B.StackIds ||
1849 (A.StackIds == B.StackIds &&
1850 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
1851 });
1852
1853 // Find the node for the last stack id, which should be the same
1854 // across all calls recorded for this id, and is the id for this
1855 // entry in the StackIdToMatchingCalls map.
1856 uint64_t LastId = It.getFirst();
1857 ContextNode *LastNode = getNodeForStackId(StackId: LastId);
1858 // We should only have kept stack ids that had nodes.
1859 assert(LastNode);
1860
1861 if (LastNode->Recursive)
1862 continue;
1863
1864 // Initialize the context ids with the last node's. We will subsequently
1865 // refine the context ids by computing the intersection along all edges.
1866 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1867 assert(!LastNodeContextIds.empty());
1868
1869#ifndef NDEBUG
1870 // Save the set of functions seen for a particular set of the same stack
1871 // ids. This is used to ensure that they have been correctly sorted to be
1872 // adjacent in the Calls list, since we rely on that to efficiently place
1873 // all such matching calls onto the same context node.
1874 DenseSet<const FuncTy *> MatchingIdsFuncSet;
1875#endif
1876
1877 for (unsigned I = 0; I < Calls.size(); I++) {
1878 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1879 assert(SavedContextIds.empty());
1880 assert(LastId == Ids.back());
1881
1882#ifndef NDEBUG
1883 // If this call has a different set of ids than the last one, clear the
1884 // set used to ensure they are sorted properly.
1885 if (I > 0 && Ids != Calls[I - 1].StackIds)
1886 MatchingIdsFuncSet.clear();
1887#endif
1888
1889 // First compute the context ids for this stack id sequence (the
1890 // intersection of the context ids of the corresponding nodes).
1891 // Start with the remaining saved ids for the last node.
1892 assert(!LastNodeContextIds.empty());
1893 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1894
1895 ContextNode *PrevNode = LastNode;
1896 ContextNode *CurNode = LastNode;
1897 bool Skip = false;
1898
1899 // Iterate backwards through the stack Ids, starting after the last Id
1900 // in the list, which was handled once outside for all Calls.
1901 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1902 auto Id = *IdIter;
1903 CurNode = getNodeForStackId(StackId: Id);
1904 // We should only have kept stack ids that had nodes.
1905 assert(CurNode);
1906
1907 if (CurNode->Recursive) {
1908 Skip = true;
1909 break;
1910 }
1911
1912 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1913 // If there is no edge then the nodes belong to different MIB contexts,
1914 // and we should skip this inlined context sequence. For example, this
1915 // particular inlined context may include stack ids A->B, and we may
1916 // indeed have nodes for both A and B, but it is possible that they were
1917 // never profiled in sequence in a single MIB for any allocation (i.e.
1918 // we might have profiled an allocation that involves the callsite A,
1919 // but through a different one of its callee callsites, and we might
1920 // have profiled an allocation that involves callsite B, but reached
1921 // from a different caller callsite).
1922 if (!Edge) {
1923 Skip = true;
1924 break;
1925 }
1926 PrevNode = CurNode;
1927
1928 // Update the context ids, which is the intersection of the ids along
1929 // all edges in the sequence.
1930 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1931
1932 // If we now have no context ids for clone, skip this call.
1933 if (StackSequenceContextIds.empty()) {
1934 Skip = true;
1935 break;
1936 }
1937 }
1938 if (Skip)
1939 continue;
1940
1941 // If some of this call's stack ids did not have corresponding nodes (due
1942 // to pruning), don't include any context ids for contexts that extend
1943 // beyond these nodes. Otherwise we would be matching part of unrelated /
1944 // not fully matching stack contexts. To do this, subtract any context ids
1945 // found in caller nodes of the last node found above.
1946 if (Ids.back() != getLastStackId(Call)) {
1947 for (const auto &PE : LastNode->CallerEdges) {
1948 set_subtract(StackSequenceContextIds, PE->getContextIds());
1949 if (StackSequenceContextIds.empty())
1950 break;
1951 }
1952 // If we now have no context ids for clone, skip this call.
1953 if (StackSequenceContextIds.empty())
1954 continue;
1955 }
1956
1957#ifndef NDEBUG
1958 // If the prior call had the same stack ids this set would not be empty.
1959 // Check if we already have a call that "matches" because it is located
1960 // in the same function. If the Calls list was sorted properly we should
1961 // not encounter this situation as all such entries should be adjacent
1962 // and processed in bulk further below.
1963 assert(!MatchingIdsFuncSet.contains(Func));
1964
1965 MatchingIdsFuncSet.insert(Func);
1966#endif
1967
1968 // Check if the next set of stack ids is the same (since the Calls vector
1969 // of tuples is sorted by the stack ids we can just look at the next one).
1970 // If so, save them in the CallToMatchingCall map so that they get
1971 // assigned to the same context node, and skip them.
1972 bool DuplicateContextIds = false;
1973 for (unsigned J = I + 1; J < Calls.size(); J++) {
1974 auto &CallCtxInfo = Calls[J];
1975 auto &NextIds = CallCtxInfo.StackIds;
1976 if (NextIds != Ids)
1977 break;
1978 auto *NextFunc = CallCtxInfo.Func;
1979 if (NextFunc != Func) {
1980 // We have another Call with the same ids but that cannot share this
1981 // node, must duplicate ids for it.
1982 DuplicateContextIds = true;
1983 break;
1984 }
1985 auto &NextCall = CallCtxInfo.Call;
1986 CallToMatchingCall[NextCall] = Call;
1987 // Update I so that it gets incremented correctly to skip this call.
1988 I = J;
1989 }
1990
1991 // If we don't have duplicate context ids, then we can assign all the
1992 // context ids computed for the original node sequence to this call.
1993 // If there are duplicate calls with the same stack ids then we synthesize
1994 // new context ids that are duplicates of the originals. These are
1995 // assigned to SavedContextIds, which is a reference into the map entry
1996 // for this call, allowing us to access these ids later on.
1997 OldToNewContextIds.reserve(NumEntries: OldToNewContextIds.size() +
1998 StackSequenceContextIds.size());
1999 SavedContextIds =
2000 DuplicateContextIds
2001 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2002 : StackSequenceContextIds;
2003 assert(!SavedContextIds.empty());
2004
2005 if (!DuplicateContextIds) {
2006 // Update saved last node's context ids to remove those that are
2007 // assigned to other calls, so that it is ready for the next call at
2008 // this stack id.
2009 set_subtract(S1&: LastNodeContextIds, S2: StackSequenceContextIds);
2010 if (LastNodeContextIds.empty())
2011 break;
2012 }
2013 }
2014 }
2015
2016 // Propagate the duplicate context ids over the graph.
2017 propagateDuplicateContextIds(OldToNewContextIds);
2018
2019 if (VerifyCCG)
2020 check();
2021
2022 // Now perform a post-order traversal over the graph, starting with the
2023 // allocation nodes, essentially processing nodes from callers to callees.
2024 // For any that contains an id in the map, update the graph to contain new
2025 // nodes representing any inlining at interior callsites. Note we move the
2026 // associated context ids over to the new nodes.
2027 DenseSet<const ContextNode *> Visited;
2028 for (auto &Entry : AllocationCallToContextNodeMap)
2029 assignStackNodesPostOrder(Node: Entry.second, Visited, StackIdToMatchingCalls,
2030 CallToMatchingCall);
2031 if (VerifyCCG)
2032 check();
2033}
2034
2035uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2036 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2037 Call->getMetadata(KindID: LLVMContext::MD_callsite));
2038 return CallsiteContext.back();
2039}
2040
2041uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2042 assert(isa<CallsiteInfo *>(Call));
2043 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2044 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Val&: Call));
2045 // Need to convert index into stack id.
2046 return Index.getStackIdAtIndex(Index: CallsiteContext.back());
2047}
2048
// Suffix appended (with the clone number) to the names of function clones
// created by this pass; also used to recognize such clones by name.
static const std::string MemProfCloneSuffix = ".memprof.";
2050
2051static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2052 // We use CloneNo == 0 to refer to the original version, which doesn't get
2053 // renamed with a suffix.
2054 if (!CloneNo)
2055 return Base.str();
2056 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2057}
2058
2059static bool isMemProfClone(const Function &F) {
2060 return F.getName().contains(Other: MemProfCloneSuffix);
2061}
2062
2063std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2064 const Instruction *Call,
2065 unsigned CloneNo) const {
2066 return (Twine(Call->getFunction()->getName()) + " -> " +
2067 cast<CallBase>(Val: Call)->getCalledFunction()->getName())
2068 .str();
2069}
2070
2071std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2072 const IndexCall &Call,
2073 unsigned CloneNo) const {
2074 auto VI = FSToVIMap.find(x: Func);
2075 assert(VI != FSToVIMap.end());
2076 if (isa<AllocInfo *>(Val: Call))
2077 return (VI->second.name() + " -> alloc").str();
2078 else {
2079 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Val: Call);
2080 return (VI->second.name() + " -> " +
2081 getMemProfFuncName(Base: Callsite->Callee.name(),
2082 CloneNo: Callsite->Clones[CloneNo]))
2083 .str();
2084 }
2085}
2086
2087std::vector<uint64_t>
2088ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2089 Instruction *Call) {
2090 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2091 Call->getMetadata(KindID: LLVMContext::MD_callsite));
2092 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2093 CallsiteContext);
2094}
2095
2096std::vector<uint64_t>
2097IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2098 assert(isa<CallsiteInfo *>(Call));
2099 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2100 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Val&: Call));
2101 return getStackIdsWithContextNodes<CallsiteInfo,
2102 SmallVector<unsigned>::const_iterator>(
2103 CallsiteContext);
2104}
2105
2106template <typename DerivedCCG, typename FuncTy, typename CallTy>
2107template <class NodeT, class IteratorT>
2108std::vector<uint64_t>
2109CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2110 CallStack<NodeT, IteratorT> &CallsiteContext) {
2111 std::vector<uint64_t> StackIds;
2112 for (auto IdOrIndex : CallsiteContext) {
2113 auto StackId = getStackId(IdOrIndex);
2114 ContextNode *Node = getNodeForStackId(StackId);
2115 if (!Node)
2116 break;
2117 StackIds.push_back(StackId);
2118 }
2119 return StackIds;
2120}
2121
2122ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2123 Module &M,
2124 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2125 : Mod(M), OREGetter(OREGetter) {
2126 for (auto &F : M) {
2127 std::vector<CallInfo> CallsWithMetadata;
2128 for (auto &BB : F) {
2129 for (auto &I : BB) {
2130 if (!isa<CallBase>(Val: I))
2131 continue;
2132 if (auto *MemProfMD = I.getMetadata(KindID: LLVMContext::MD_memprof)) {
2133 CallsWithMetadata.push_back(x: &I);
2134 auto *AllocNode = addAllocNode(Call: &I, F: &F);
2135 auto *CallsiteMD = I.getMetadata(KindID: LLVMContext::MD_callsite);
2136 assert(CallsiteMD);
2137 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2138 // Add all of the MIBs and their stack nodes.
2139 for (auto &MDOp : MemProfMD->operands()) {
2140 auto *MIBMD = cast<const MDNode>(Val: MDOp);
2141 std::vector<ContextTotalSize> ContextSizeInfo;
2142 // Collect the context size information if it exists.
2143 if (MIBMD->getNumOperands() > 2) {
2144 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2145 MDNode *ContextSizePair =
2146 dyn_cast<MDNode>(Val: MIBMD->getOperand(I));
2147 assert(ContextSizePair->getNumOperands() == 2);
2148 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2149 MD: ContextSizePair->getOperand(I: 0))
2150 ->getZExtValue();
2151 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2152 MD: ContextSizePair->getOperand(I: 1))
2153 ->getZExtValue();
2154 ContextSizeInfo.push_back(x: {.FullStackId: FullStackId, .TotalSize: TotalSize});
2155 }
2156 }
2157 MDNode *StackNode = getMIBStackNode(MIB: MIBMD);
2158 assert(StackNode);
2159 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2160 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2161 AllocNode, StackContext, CallsiteContext,
2162 AllocType: getMIBAllocType(MIB: MIBMD), ContextSizeInfo);
2163 }
2164 // If exporting the graph to dot and an allocation id of interest was
2165 // specified, record all the context ids for this allocation node.
2166 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2167 DotAllocContextIds = AllocNode->getContextIds();
2168 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2169 // Memprof and callsite metadata on memory allocations no longer
2170 // needed.
2171 I.setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
2172 I.setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
2173 }
2174 // For callsite metadata, add to list for this function for later use.
2175 else if (I.getMetadata(KindID: LLVMContext::MD_callsite)) {
2176 CallsWithMetadata.push_back(x: &I);
2177 }
2178 }
2179 }
2180 if (!CallsWithMetadata.empty())
2181 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2182 }
2183
2184 if (DumpCCG) {
2185 dbgs() << "CCG before updating call stack chains:\n";
2186 dbgs() << *this;
2187 }
2188
2189 if (ExportToDot)
2190 exportToDot(Label: "prestackupdate");
2191
2192 updateStackNodes();
2193
2194 if (ExportToDot)
2195 exportToDot(Label: "poststackupdate");
2196
2197 handleCallsitesWithMultipleTargets();
2198
2199 markBackedges();
2200
2201 // Strip off remaining callsite metadata, no longer needed.
2202 for (auto &FuncEntry : FuncToCallsWithMetadata)
2203 for (auto &Call : FuncEntry.second)
2204 Call.call()->setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
2205}
2206
// Builds the callsite context graph from a ThinLTO combined summary index.
// For each prevailing function summary, creates allocation context nodes (one
// per allocation record, with stack nodes for each MIB context) and records
// callsite summaries for later matching, then runs the common
// post-construction passes (stack chain updates, multi-target callsite
// handling, and backedge marking).
IndexCallsiteContextGraph::IndexCallsiteContextGraph(
    ModuleSummaryIndex &Index,
    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
        isPrevailing)
    : Index(Index), isPrevailing(isPrevailing) {
  for (auto &I : Index) {
    auto VI = Index.getValueInfo(R: I);
    for (auto &S : VI.getSummaryList()) {
      // We should only add the prevailing nodes. Otherwise we may try to clone
      // in a weak copy that won't be linked (and may be different than the
      // prevailing version).
      // We only keep the memprof summary on the prevailing copy now when
      // building the combined index, as a space optimization, however don't
      // rely on this optimization. The linker doesn't resolve local linkage
      // values so don't check whether those are prevailing.
      if (!GlobalValue::isLocalLinkage(Linkage: S->linkage()) &&
          !isPrevailing(VI.getGUID(), S.get()))
        continue;
      auto *FS = dyn_cast<FunctionSummary>(Val: S.get());
      if (!FS)
        continue;
      std::vector<CallInfo> CallsWithMetadata;
      if (!FS->allocs().empty()) {
        for (auto &AN : FS->mutableAllocs()) {
          // This can happen because of recursion elimination handling that
          // currently exists in ModuleSummaryAnalysis. Skip these for now.
          // We still added them to the summary because we need to be able to
          // correlate properly in applyImport in the backends.
          if (AN.MIBs.empty())
            continue;
          IndexCall AllocCall(&AN);
          CallsWithMetadata.push_back(x: AllocCall);
          auto *AllocNode = addAllocNode(Call: AllocCall, F: FS);
          // Pass an empty CallStack to the CallsiteContext (second)
          // parameter, since for ThinLTO we already collapsed out the inlined
          // stack ids on the allocation call during ModuleSummaryAnalysis.
          CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
              EmptyContext;
          // Index into AN.ContextSizeInfos, which (when non-empty) parallels
          // the AN.MIBs vector.
          unsigned I = 0;
          assert(!metadataMayIncludeContextSizeInfo() ||
                 AN.ContextSizeInfos.size() == AN.MIBs.size());
          // Now add all of the MIBs and their stack nodes.
          for (auto &MIB : AN.MIBs) {
            CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
                StackContext(&MIB);
            std::vector<ContextTotalSize> ContextSizeInfo;
            if (!AN.ContextSizeInfos.empty()) {
              for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
                ContextSizeInfo.push_back(x: {.FullStackId: FullStackId, .TotalSize: TotalSize});
            }
            addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
                AllocNode, StackContext, CallsiteContext&: EmptyContext, AllocType: MIB.AllocType,
                ContextSizeInfo);
            I++;
          }
          // If exporting the graph to dot and an allocation id of interest was
          // specified, record all the context ids for this allocation node.
          if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
            DotAllocContextIds = AllocNode->getContextIds();
          assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
          // Initialize version 0 on the summary alloc node to the current alloc
          // type, unless it has both types in which case make it default, so
          // that in the case where we aren't able to clone the original version
          // always ends up with the default allocation behavior.
          AN.Versions[0] = (uint8_t)allocTypeToUse(AllocTypes: AllocNode->AllocTypes);
        }
      }
      // For callsite metadata, add to list for this function for later use.
      if (!FS->callsites().empty())
        for (auto &SN : FS->mutableCallsites()) {
          IndexCall StackNodeCall(&SN);
          CallsWithMetadata.push_back(x: StackNodeCall);
        }

      if (!CallsWithMetadata.empty())
        FuncToCallsWithMetadata[FS] = CallsWithMetadata;

      // Record the summary-to-ValueInfo mapping for any function that
      // contributed nodes to the graph, for use when matching callees later.
      if (!FS->allocs().empty() || !FS->callsites().empty())
        FSToVIMap[FS] = VI;
    }
  }

  if (DumpCCG) {
    dbgs() << "CCG before updating call stack chains:\n";
    dbgs() << *this;
  }

  if (ExportToDot)
    exportToDot(Label: "prestackupdate");

  updateStackNodes();

  if (ExportToDot)
    exportToDot(Label: "poststackupdate");

  handleCallsitesWithMultipleTargets();

  markBackedges();
}
2306
// Finds callsite nodes whose recorded calls target multiple functions and
// works around them: first by partitioning calls by callee function (needed
// for synthesized indirect call targets in ThinLTO), then by matching the
// remaining calls against the callee nodes (looking through tail calls),
// pruning any call that cannot be matched against all callee nodes.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy,
                          CallTy>::handleCallsitesWithMultipleTargets() {
  // Look for and workaround callsites that call multiple functions.
  // This can happen for indirect calls, which needs better handling, and in
  // more rare cases (e.g. macro expansion).
  // TODO: To fix this for indirect calls we will want to perform speculative
  // devirtualization using either the normal PGO info with ICP, or using the
  // information in the profiled MemProf contexts. We can do this prior to
  // this transformation for regular LTO, and for ThinLTO we can simulate that
  // effect in the summary and perform the actual speculative devirtualization
  // while cloning in the ThinLTO backend.

  // Keep track of the new nodes synthesized for discovered tail calls missing
  // from the profiled contexts.
  MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;

  // Nodes whose primary call changed; their map entries are refreshed after
  // the loop (we must not mutate the map during iteration).
  std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
  for (auto &Entry : NonAllocationCallToContextNodeMap) {
    auto *Node = Entry.second;
    assert(Node->Clones.empty());
    // Check all node callees and see if in the same function.
    // We need to check all of the calls recorded in this Node, because in some
    // cases we may have had multiple calls with the same debug info calling
    // different callees. This can happen, for example, when an object is
    // constructed in the parameter list - the destructor call of the object has
    // the same debug info (line/col) as the call the object was passed to.
    // Here we will prune any that don't match all callee nodes.
    std::vector<CallInfo> AllCalls;
    AllCalls.reserve(Node->MatchingCalls.size() + 1);
    AllCalls.push_back(Node->Call);
    llvm::append_range(AllCalls, Node->MatchingCalls);

    // First see if we can partition the calls by callee function, creating new
    // nodes to host each set of calls calling the same callees. This is
    // necessary for support indirect calls with ThinLTO, for which we
    // synthesized CallsiteInfo records for each target. They will all have the
    // same callsite stack ids and would be sharing a context node at this
    // point. We need to perform separate cloning for each, which will be
    // applied along with speculative devirtualization in the ThinLTO backends
    // as needed. Note this does not currently support looking through tail
    // calls, it is unclear if we need that for indirect call targets.
    // First partition calls by callee func. Map indexed by func, value is
    // struct with list of matching calls, assigned node.
    if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
      continue;

    auto It = AllCalls.begin();
    // Iterate through the calls until we find the first that matches.
    for (; It != AllCalls.end(); ++It) {
      auto ThisCall = *It;
      bool Match = true;
      for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
           ++EI) {
        auto Edge = *EI;
        if (!Edge->Callee->hasCall())
          continue;
        assert(NodeToCallingFunc.count(Edge->Callee));
        // Check if the called function matches that of the callee node.
        if (!calleesMatch(Call: ThisCall.call(), EI, TailCallToContextNodeMap)) {
          Match = false;
          break;
        }
      }
      // Found a call that matches the callee nodes, we can quit now.
      if (Match) {
        // If the first match is not the primary call on the Node, update it
        // now. We will update the list of matching calls further below.
        if (Node->Call != ThisCall) {
          Node->setCall(ThisCall);
          // We need to update the NonAllocationCallToContextNodeMap, but don't
          // want to do this during iteration over that map, so save the calls
          // that need updated entries.
          NewCallToNode.push_back({ThisCall, Node});
        }
        break;
      }
    }
    // We will update this list below (or leave it cleared if there was no
    // match found above).
    Node->MatchingCalls.clear();
    // If we hit the end of the AllCalls vector, no call matching the callee
    // nodes was found, clear the call information in the node.
    if (It == AllCalls.end()) {
      RemovedEdgesWithMismatchedCallees++;
      // Work around by setting Node to have a null call, so it gets
      // skipped during cloning. Otherwise assignFunctions will assert
      // because its data structures are not designed to handle this case.
      Node->setCall(CallInfo());
      continue;
    }
    // Now add back any matching calls that call the same function as the
    // matching primary call on Node.
    for (++It; It != AllCalls.end(); ++It) {
      auto ThisCall = *It;
      if (!sameCallee(Call1: Node->Call.call(), Call2: ThisCall.call()))
        continue;
      Node->MatchingCalls.push_back(ThisCall);
    }
  }

  // Remove all mismatched nodes identified in the above loop from the node map
  // (checking whether they have a null call which is set above). For a
  // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
  // to do the removal via remove_if than by individually erasing entries above.
  // Also remove any entries if we updated the node's primary call above.
  NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
    return !it.second->hasCall() || it.second->Call != it.first;
  });

  // Add entries for any new primary calls recorded above.
  for (auto &[Call, Node] : NewCallToNode)
    NonAllocationCallToContextNodeMap[Call] = Node;

  // Add the new nodes after the above loop so that the iteration is not
  // invalidated.
  for (auto &[Call, Node] : TailCallToContextNodeMap)
    NonAllocationCallToContextNodeMap[Call] = Node;
}
2426
// Attempts to partition the calls recorded on \p Node (passed in \p AllCalls)
// by callee function, moving each group's matching callee edges onto its own
// context node (the first group reuses \p Node itself). Callee edges matching
// none of the calls are moved to a node with a null call so they are ignored
// during cloning. Returns true if partitioning was performed, or false if no
// call matched any callee node (so the caller can attempt the tail call
// handling instead). New primary call-to-node assignments are appended to
// \p NewCallToNode for the caller to record in its map after iteration.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
    ContextNode *Node, ArrayRef<CallInfo> AllCalls,
    std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
  // Struct to keep track of all the calls having the same callee function,
  // and the node we eventually assign to them. Eventually we will record the
  // context node assigned to this group of calls.
  struct CallsWithSameCallee {
    std::vector<CallInfo> Calls;
    ContextNode *Node = nullptr;
  };

  // First partition calls by callee function. Build map from each function
  // to the list of matching calls.
  DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
  for (auto ThisCall : AllCalls) {
    auto *F = getCalleeFunc(Call: ThisCall.call());
    if (F)
      CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
  }

  // Next, walk through all callee edges. For each callee node, get its
  // containing function and see if it was recorded in the above map (meaning we
  // have at least one matching call). Build another map from each callee node
  // with a matching call to the structure instance created above containing all
  // the calls.
  DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
  for (const auto &Edge : Node->CalleeEdges) {
    if (!Edge->Callee->hasCall())
      continue;
    const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
    if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
      CalleeNodeToCallInfo[Edge->Callee] =
          &CalleeFuncToCallInfo[ProfiledCalleeFunc];
  }

  // If there are no entries in the second map, then there were no matching
  // calls/callees, nothing to do here. Return so we can go to the handling that
  // looks through tail calls.
  if (CalleeNodeToCallInfo.empty())
    return false;

  // Walk through all callee edges again. Any and all callee edges that didn't
  // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
  // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
  // ignored during cloning. If it is in the map, then we use the node recorded
  // in that entry (creating it if needed), and move the callee edge to it.
  // The first callee will use the original node instead of creating a new one.
  // Note that any of the original calls on this node (in AllCalls) that didn't
  // have a callee function automatically get dropped from the node as part of
  // this process.
  ContextNode *UnmatchedCalleesNode = nullptr;
  // Track whether we already assigned original node to a callee.
  bool UsedOrigNode = false;
  assert(NodeToCallingFunc[Node]);
  // Iterate over a copy of Node's callee edges, since we may need to remove
  // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
  // makes it less error-prone.
  auto CalleeEdges = Node->CalleeEdges;
  for (auto &Edge : CalleeEdges) {
    if (!Edge->Callee->hasCall())
      continue;

    // Will be updated below to point to whatever (caller) node this callee edge
    // should be moved to.
    ContextNode *CallerNodeToUse = nullptr;

    // Handle the case where there were no matching calls first. Move this
    // callee edge to the UnmatchedCalleesNode, creating it if needed.
    if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
      if (!UnmatchedCalleesNode)
        UnmatchedCalleesNode =
            createNewNode(/*IsAllocation=*/false, F: NodeToCallingFunc[Node]);
      CallerNodeToUse = UnmatchedCalleesNode;
    } else {
      // Look up the information recorded for this callee node, and use the
      // recorded caller node (creating it if needed).
      auto *Info = CalleeNodeToCallInfo[Edge->Callee];
      if (!Info->Node) {
        // If we haven't assigned any callees to the original node use it.
        if (!UsedOrigNode) {
          Info->Node = Node;
          // Clear the set of matching calls which will be updated below.
          Node->MatchingCalls.clear();
          UsedOrigNode = true;
        } else
          Info->Node =
              createNewNode(/*IsAllocation=*/false, F: NodeToCallingFunc[Node]);
        assert(!Info->Calls.empty());
        // The first call becomes the primary call for this caller node, and the
        // rest go in the matching calls list.
        Info->Node->setCall(Info->Calls.front());
        llvm::append_range(Info->Node->MatchingCalls,
                           llvm::drop_begin(Info->Calls));
        // Save the primary call to node correspondence so that we can update
        // the NonAllocationCallToContextNodeMap, which is being iterated in the
        // caller of this function.
        NewCallToNode.push_back({Info->Node->Call, Info->Node});
      }
      CallerNodeToUse = Info->Node;
    }

    // Don't need to move edge if we are using the original node;
    if (CallerNodeToUse == Node)
      continue;

    moveCalleeEdgeToNewCaller(Edge, NewCaller: CallerNodeToUse);
  }
  // Now that we are done moving edges, clean up any caller edges that ended
  // up with no type or context ids. During moveCalleeEdgeToNewCaller all
  // caller edges from Node are replicated onto the new callers, and it
  // simplifies the handling to leave them until we have moved all
  // edges/context ids.
  for (auto &I : CalleeNodeToCallInfo)
    removeNoneTypeCallerEdges(Node: I.second->Node);
  if (UnmatchedCalleesNode)
    removeNoneTypeCallerEdges(Node: UnmatchedCalleesNode);
  removeNoneTypeCallerEdges(Node);

  return true;
}
2548
2549uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2550 // In the Module (IR) case this is already the Id.
2551 return IdOrIndex;
2552}
2553
2554uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2555 // In the Index case this is an index into the stack id list in the summary
2556 // index, convert it to an Id.
2557 return Index.getStackIdAtIndex(Index: IdOrIndex);
2558}
2559
// Checks whether \p Call targets the function of the callee node referenced
// by edge iterator \p EI. If the profiled callee is only reachable through a
// unique chain of tail calls, synthesizes context nodes for the discovered
// tail calls (recorded in \p TailCallToContextNodeMap), splices them into the
// graph in place of the original edge, and removes that edge. On return \p EI
// is positioned so that the caller's loop increment visits the next edge.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
    CallTy Call, EdgeIter &EI,
    MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
  auto Edge = *EI;
  const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
  const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
  // Will be populated in order of callee to caller if we find a chain of tail
  // calls between the profiled caller and callee.
  std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
  if (!calleeMatchesFunc(Call, Func: ProfiledCalleeFunc, CallerFunc,
                         FoundCalleeChain))
    return false;

  // The usual case where the profiled callee matches that of the IR/summary.
  if (FoundCalleeChain.empty())
    return true;

  // Adds (or merges into) an edge from Callee to Caller, taking care to keep
  // the iterator EI valid when inserting into the current caller's edge list.
  auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
    auto *CurEdge = Callee->findEdgeFromCaller(Caller);
    // If there is already an edge between these nodes, simply update it and
    // return.
    if (CurEdge) {
      CurEdge->ContextIds.insert_range(Edge->ContextIds);
      CurEdge->AllocTypes |= Edge->AllocTypes;
      return;
    }
    // Otherwise, create a new edge and insert it into the caller and callee
    // lists.
    auto NewEdge = std::make_shared<ContextEdge>(
        Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
    Callee->CallerEdges.push_back(NewEdge);
    if (Caller == Edge->Caller) {
      // If we are inserting the new edge into the current edge's caller, insert
      // the new edge before the current iterator position, and then increment
      // back to the current edge.
      EI = Caller->CalleeEdges.insert(EI, NewEdge);
      ++EI;
      assert(*EI == Edge &&
             "Iterator position not restored after insert and increment");
    } else
      Caller->CalleeEdges.push_back(NewEdge);
  };

  // Create new nodes for each found callee and connect in between the profiled
  // caller and callee.
  auto *CurCalleeNode = Edge->Callee;
  for (auto &[NewCall, Func] : FoundCalleeChain) {
    ContextNode *NewNode = nullptr;
    // First check if we have already synthesized a node for this tail call.
    if (TailCallToContextNodeMap.count(NewCall)) {
      NewNode = TailCallToContextNodeMap[NewCall];
      NewNode->AllocTypes |= Edge->AllocTypes;
    } else {
      FuncToCallsWithMetadata[Func].push_back({NewCall});
      // Create Node and record node info.
      NewNode = createNewNode(/*IsAllocation=*/false, F: Func, C: NewCall);
      TailCallToContextNodeMap[NewCall] = NewNode;
      NewNode->AllocTypes = Edge->AllocTypes;
    }

    // Hook up node to its callee node
    AddEdge(NewNode, CurCalleeNode);

    CurCalleeNode = NewNode;
  }

  // Hook up edge's original caller to new callee node.
  AddEdge(Edge->Caller, CurCalleeNode);

#ifndef NDEBUG
  // Save this because Edge's fields get cleared below when removed.
  auto *Caller = Edge->Caller;
#endif

  // Remove old edge
  removeEdgeFromGraph(Edge: Edge.get(), EI: &EI, /*CalleeIter=*/true);

  // To simplify the increment of EI in the caller, subtract one from EI.
  // In the final AddEdge call we would have either added a new callee edge,
  // to Edge->Caller, or found an existing one. Either way we are guaranteed
  // that there is at least one callee edge.
  assert(!Caller->CalleeEdges.empty());
  --EI;

  return true;
}
2647
// Recursively searches for a chain of tail calls from \p CurCallee to
// \p ProfiledCallee (up to TailCallSearchDepth deep), appending the discovered
// callsites to \p FoundCalleeChain in callee-to-caller order. Returns true
// only when exactly one such chain exists; if more than one is found,
// \p FoundMultipleCalleeChains is set and false is returned, since cloning
// through an ambiguous chain could be incorrect.
bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
    const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
    std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
    bool &FoundMultipleCalleeChains) {
  // Stop recursive search if we have already explored the maximum specified
  // depth.
  if (Depth > TailCallSearchDepth)
    return false;

  auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
    FoundCalleeChain.push_back(x: {Callsite, F});
  };

  // Resolve CurCallee to a Function, looking through a (single-level) alias
  // if necessary.
  auto *CalleeFunc = dyn_cast<Function>(Val: CurCallee);
  if (!CalleeFunc) {
    auto *Alias = dyn_cast<GlobalAlias>(Val: CurCallee);
    assert(Alias);
    CalleeFunc = dyn_cast<Function>(Val: Alias->getAliasee());
    assert(CalleeFunc);
  }

  // Look for tail calls in this function, and check if they either call the
  // profiled callee directly, or indirectly (via a recursive search).
  // Only succeed if there is a single unique tail call chain found between the
  // profiled caller and callee, otherwise we could perform incorrect cloning.
  bool FoundSingleCalleeChain = false;
  for (auto &BB : *CalleeFunc) {
    for (auto &I : BB) {
      auto *CB = dyn_cast<CallBase>(Val: &I);
      if (!CB || !CB->isTailCall())
        continue;
      auto *CalledValue = CB->getCalledOperand();
      auto *CalledFunction = CB->getCalledFunction();
      if (CalledValue && !CalledFunction) {
        CalledValue = CalledValue->stripPointerCasts();
        // Stripping pointer casts can reveal a called function.
        CalledFunction = dyn_cast<Function>(Val: CalledValue);
      }
      // Check if this is an alias to a function. If so, get the
      // called aliasee for the checks below.
      if (auto *GA = dyn_cast<GlobalAlias>(Val: CalledValue)) {
        assert(!CalledFunction &&
               "Expected null called function in callsite for alias");
        CalledFunction = dyn_cast<Function>(Val: GA->getAliaseeObject());
      }
      if (!CalledFunction)
        continue;
      if (CalledFunction == ProfiledCallee) {
        // Direct tail call of the profiled callee. Fail if we already found
        // another chain (ambiguous).
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        FoundProfiledCalleeCount++;
        FoundProfiledCalleeDepth += Depth;
        if (Depth > FoundProfiledCalleeMaxDepth)
          FoundProfiledCalleeMaxDepth = Depth;
        SaveCallsiteInfo(&I, CalleeFunc);
      } else if (findProfiledCalleeThroughTailCalls(
                     ProfiledCallee, CurCallee: CalledFunction, Depth: Depth + 1,
                     FoundCalleeChain, FoundMultipleCalleeChains)) {
        // findProfiledCalleeThroughTailCalls should not have returned
        // true if FoundMultipleCalleeChains.
        assert(!FoundMultipleCalleeChains);
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        SaveCallsiteInfo(&I, CalleeFunc);
      } else if (FoundMultipleCalleeChains)
        return false;
    }
  }

  return FoundSingleCalleeChain;
}
2725
2726const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2727 auto *CB = dyn_cast<CallBase>(Val: Call);
2728 if (!CB->getCalledOperand() || CB->isIndirectCall())
2729 return nullptr;
2730 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2731 auto *Alias = dyn_cast<GlobalAlias>(Val: CalleeVal);
2732 if (Alias)
2733 return dyn_cast<Function>(Val: Alias->getAliasee());
2734 return dyn_cast<Function>(Val: CalleeVal);
2735}
2736
2737bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2738 Instruction *Call, const Function *Func, const Function *CallerFunc,
2739 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2740 auto *CB = dyn_cast<CallBase>(Val: Call);
2741 if (!CB->getCalledOperand() || CB->isIndirectCall())
2742 return false;
2743 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2744 auto *CalleeFunc = dyn_cast<Function>(Val: CalleeVal);
2745 if (CalleeFunc == Func)
2746 return true;
2747 auto *Alias = dyn_cast<GlobalAlias>(Val: CalleeVal);
2748 if (Alias && Alias->getAliasee() == Func)
2749 return true;
2750
2751 // Recursively search for the profiled callee through tail calls starting with
2752 // the actual Callee. The discovered tail call chain is saved in
2753 // FoundCalleeChain, and we will fixup the graph to include these callsites
2754 // after returning.
2755 // FIXME: We will currently redo the same recursive walk if we find the same
2756 // mismatched callee from another callsite. We can improve this with more
2757 // bookkeeping of the created chain of new nodes for each mismatch.
2758 unsigned Depth = 1;
2759 bool FoundMultipleCalleeChains = false;
2760 if (!findProfiledCalleeThroughTailCalls(ProfiledCallee: Func, CurCallee: CalleeVal, Depth,
2761 FoundCalleeChain,
2762 FoundMultipleCalleeChains)) {
2763 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2764 << Func->getName() << " from " << CallerFunc->getName()
2765 << " that actually called " << CalleeVal->getName()
2766 << (FoundMultipleCalleeChains
2767 ? " (found multiple possible chains)"
2768 : "")
2769 << "\n");
2770 if (FoundMultipleCalleeChains)
2771 FoundProfiledCalleeNonUniquelyCount++;
2772 return false;
2773 }
2774
2775 return true;
2776}
2777
2778bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
2779 Instruction *Call2) {
2780 auto *CB1 = cast<CallBase>(Val: Call1);
2781 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
2782 return false;
2783 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
2784 auto *CalleeFunc1 = dyn_cast<Function>(Val: CalleeVal1);
2785 auto *CB2 = cast<CallBase>(Val: Call2);
2786 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
2787 return false;
2788 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
2789 auto *CalleeFunc2 = dyn_cast<Function>(Val: CalleeVal2);
2790 return CalleeFunc1 == CalleeFunc2;
2791}
2792
// Summary-index analogue of the Module version above: recursively searches the
// call edges of \p CurCallee's prevailing function summary for a chain of tail
// calls reaching \p ProfiledCallee (up to TailCallSearchDepth deep),
// synthesizing CallsiteInfo records for the discovered callsites and appending
// them to \p FoundCalleeChain. Returns true only when exactly one such chain
// exists; otherwise sets \p FoundMultipleCalleeChains and returns false.
bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
    ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
    std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
    bool &FoundMultipleCalleeChains) {
  // Stop recursive search if we have already explored the maximum specified
  // depth.
  if (Depth > TailCallSearchDepth)
    return false;

  auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
    // Make a CallsiteInfo for each discovered callee, if one hasn't already
    // been synthesized.
    if (!FunctionCalleesToSynthesizedCallsiteInfos.count(x: FS) ||
        !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(x: Callee))
      // StackIds is empty (we don't have debug info available in the index for
      // these callsites)
      FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
          std::make_unique<CallsiteInfo>(args&: Callee, args: SmallVector<unsigned>());
    CallsiteInfo *NewCallsiteInfo =
        FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
    FoundCalleeChain.push_back(x: {NewCallsiteInfo, FS});
  };

  // Look for tail calls in this function, and check if they either call the
  // profiled callee directly, or indirectly (via a recursive search).
  // Only succeed if there is a single unique tail call chain found between the
  // profiled caller and callee, otherwise we could perform incorrect cloning.
  bool FoundSingleCalleeChain = false;
  for (auto &S : CurCallee.getSummaryList()) {
    // Skip non-prevailing copies (except for local linkage values, which the
    // linker doesn't resolve).
    if (!GlobalValue::isLocalLinkage(Linkage: S->linkage()) &&
        !isPrevailing(CurCallee.getGUID(), S.get()))
      continue;
    auto *FS = dyn_cast<FunctionSummary>(Val: S->getBaseObject());
    if (!FS)
      continue;
    auto FSVI = CurCallee;
    auto *AS = dyn_cast<AliasSummary>(Val: S.get());
    if (AS)
      FSVI = AS->getAliaseeVI();
    for (auto &CallEdge : FS->calls()) {
      if (!CallEdge.second.hasTailCall())
        continue;
      if (CallEdge.first == ProfiledCallee) {
        // Direct tail call of the profiled callee. Fail if we already found
        // another chain (ambiguous).
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        FoundProfiledCalleeCount++;
        FoundProfiledCalleeDepth += Depth;
        if (Depth > FoundProfiledCalleeMaxDepth)
          FoundProfiledCalleeMaxDepth = Depth;
        CreateAndSaveCallsiteInfo(CallEdge.first, FS);
        // Add FS to FSToVIMap  in case it isn't already there.
        assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
        FSToVIMap[FS] = FSVI;
      } else if (findProfiledCalleeThroughTailCalls(
                     ProfiledCallee, CurCallee: CallEdge.first, Depth: Depth + 1,
                     FoundCalleeChain, FoundMultipleCalleeChains)) {
        // findProfiledCalleeThroughTailCalls should not have returned
        // true if FoundMultipleCalleeChains.
        assert(!FoundMultipleCalleeChains);
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        CreateAndSaveCallsiteInfo(CallEdge.first, FS);
        // Add FS to FSToVIMap  in case it isn't already there.
        assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
        FSToVIMap[FS] = FSVI;
      } else if (FoundMultipleCalleeChains)
        return false;
    }
  }

  return FoundSingleCalleeChain;
}
2871
2872const FunctionSummary *
2873IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
2874 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Val&: Call)->Callee;
2875 if (Callee.getSummaryList().empty())
2876 return nullptr;
2877 return dyn_cast<FunctionSummary>(Val: Callee.getSummaryList()[0]->getBaseObject());
2878}
2879
// Returns true if the callee ValueInfo recorded on summary callsite \p Call
// matches the profiled callee \p Func, either directly, via an alias summary,
// or through a unique chain of tail calls which is recorded in
// \p FoundCalleeChain for later graph fixup. \p CallerFunc is used only for
// debug output on a failed match.
bool IndexCallsiteContextGraph::calleeMatchesFunc(
    IndexCall &Call, const FunctionSummary *Func,
    const FunctionSummary *CallerFunc,
    std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
  ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Val&: Call)->Callee;
  // If there is no summary list then this is a call to an externally defined
  // symbol.
  AliasSummary *Alias =
      Callee.getSummaryList().empty()
          ? nullptr
          : dyn_cast<AliasSummary>(Val: Callee.getSummaryList()[0].get());
  assert(FSToVIMap.count(Func));
  auto FuncVI = FSToVIMap[Func];
  if (Callee == FuncVI ||
      // If callee is an alias, check the aliasee, since only function
      // summary base objects will contain the stack node summaries and thus
      // get a context node.
      (Alias && Alias->getAliaseeVI() == FuncVI))
    return true;

  // Recursively search for the profiled callee through tail calls starting with
  // the actual Callee. The discovered tail call chain is saved in
  // FoundCalleeChain, and we will fixup the graph to include these callsites
  // after returning.
  // FIXME: We will currently redo the same recursive walk if we find the same
  // mismatched callee from another callsite. We can improve this with more
  // bookkeeping of the created chain of new nodes for each mismatch.
  unsigned Depth = 1;
  bool FoundMultipleCalleeChains = false;
  if (!findProfiledCalleeThroughTailCalls(
          ProfiledCallee: FuncVI, CurCallee: Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
    LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
                      << " from " << FSToVIMap[CallerFunc]
                      << " that actually called " << Callee
                      << (FoundMultipleCalleeChains
                              ? " (found multiple possible chains)"
                              : "")
                      << "\n");
    if (FoundMultipleCalleeChains)
      FoundProfiledCalleeNonUniquelyCount++;
    return false;
  }

  return true;
}
2925
2926bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
2927 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Val&: Call1)->Callee;
2928 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Val&: Call2)->Callee;
2929 return Callee1 == Callee2;
2930}
2931
2932template <typename DerivedCCG, typename FuncTy, typename CallTy>
2933void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2934 const {
2935 print(OS&: dbgs());
2936 dbgs() << "\n";
2937}
2938
2939template <typename DerivedCCG, typename FuncTy, typename CallTy>
2940void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2941 raw_ostream &OS) const {
2942 OS << "Node " << this << "\n";
2943 OS << "\t";
2944 printCall(OS);
2945 if (Recursive)
2946 OS << " (recursive)";
2947 OS << "\n";
2948 if (!MatchingCalls.empty()) {
2949 OS << "\tMatchingCalls:\n";
2950 for (auto &MatchingCall : MatchingCalls) {
2951 OS << "\t";
2952 MatchingCall.print(OS);
2953 OS << "\n";
2954 }
2955 }
2956 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2957 OS << "\tContextIds:";
2958 // Make a copy of the computed context ids that we can sort for stability.
2959 auto ContextIds = getContextIds();
2960 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2961 std::sort(first: SortedIds.begin(), last: SortedIds.end());
2962 for (auto Id : SortedIds)
2963 OS << " " << Id;
2964 OS << "\n";
2965 OS << "\tCalleeEdges:\n";
2966 for (auto &Edge : CalleeEdges)
2967 OS << "\t\t" << *Edge << "\n";
2968 OS << "\tCallerEdges:\n";
2969 for (auto &Edge : CallerEdges)
2970 OS << "\t\t" << *Edge << "\n";
2971 if (!Clones.empty()) {
2972 OS << "\tClones: " << llvm::interleaved(Clones) << "\n";
2973 } else if (CloneOf) {
2974 OS << "\tClone of " << CloneOf << "\n";
2975 }
2976}
2977
2978template <typename DerivedCCG, typename FuncTy, typename CallTy>
2979void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2980 const {
2981 print(OS&: dbgs());
2982 dbgs() << "\n";
2983}
2984
2985template <typename DerivedCCG, typename FuncTy, typename CallTy>
2986void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2987 raw_ostream &OS) const {
2988 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2989 << (IsBackedge ? " (BE)" : "")
2990 << " AllocTypes: " << getAllocTypeString(AllocTypes);
2991 OS << " ContextIds:";
2992 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2993 std::sort(first: SortedIds.begin(), last: SortedIds.end());
2994 for (auto Id : SortedIds)
2995 OS << " " << Id;
2996}
2997
2998template <typename DerivedCCG, typename FuncTy, typename CallTy>
2999void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3000 print(OS&: dbgs());
3001}
3002
3003template <typename DerivedCCG, typename FuncTy, typename CallTy>
3004void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3005 raw_ostream &OS) const {
3006 OS << "Callsite Context Graph:\n";
3007 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3008 for (const auto Node : nodes<GraphType>(this)) {
3009 if (Node->isRemoved())
3010 continue;
3011 Node->print(OS);
3012 OS << "\n";
3013 }
3014}
3015
3016template <typename DerivedCCG, typename FuncTy, typename CallTy>
3017void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3018 raw_ostream &OS) const {
3019 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3020 for (const auto Node : nodes<GraphType>(this)) {
3021 if (Node->isRemoved())
3022 continue;
3023 if (!Node->IsAllocation)
3024 continue;
3025 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3026 auto AllocTypeFromCall = getAllocationCallType(Call: Node->Call);
3027 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3028 std::sort(first: SortedIds.begin(), last: SortedIds.end());
3029 for (auto Id : SortedIds) {
3030 auto TypeI = ContextIdToAllocationType.find(Val: Id);
3031 assert(TypeI != ContextIdToAllocationType.end());
3032 auto CSI = ContextIdToContextSizeInfos.find(Val: Id);
3033 if (CSI != ContextIdToContextSizeInfos.end()) {
3034 for (auto &Info : CSI->second) {
3035 OS << "MemProf hinting: "
3036 << getAllocTypeString(AllocTypes: (uint8_t)TypeI->second)
3037 << " full allocation context " << Info.FullStackId
3038 << " with total size " << Info.TotalSize << " is "
3039 << getAllocTypeString(Node->AllocTypes) << " after cloning";
3040 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3041 OS << " marked " << getAllocTypeString(AllocTypes: (uint8_t)AllocTypeFromCall)
3042 << " due to cold byte percent";
3043 // Print the internal context id to aid debugging and visualization.
3044 OS << " (context id " << Id << ")";
3045 OS << "\n";
3046 }
3047 }
3048 }
3049 }
3050}
3051
3052template <typename DerivedCCG, typename FuncTy, typename CallTy>
3053void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3054 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3055 for (const auto Node : nodes<GraphType>(this)) {
3056 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3057 for (auto &Edge : Node->CallerEdges)
3058 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3059 }
3060}
3061
// GraphTraits specialization so generic graph algorithms (and the GraphWriter
// based dot export) can traverse a const CallsiteContextGraph. Nodes are
// enumerated from the graph's NodeOwner list, and a node's children are the
// callees reached through its CalleeEdges.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;

  // Nodes are owned via unique_ptr in NodeOwner; map the owning pointer to a
  // plain NodeRef for iteration.
  using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
  static NodeRef getNode(const NodePtrTy &P) { return P.get(); }

  using nodes_iterator =
      mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
                      decltype(&getNode)>;

  static nodes_iterator nodes_begin(GraphType G) {
    return nodes_iterator(G->NodeOwner.begin(), &getNode);
  }

  static nodes_iterator nodes_end(GraphType G) {
    return nodes_iterator(G->NodeOwner.end(), &getNode);
  }

  // The first owned node serves as the entry node.
  // NOTE(review): assumes NodeOwner is non-empty -- confirm callers never
  // traverse an empty graph.
  static NodeRef getEntryNode(GraphType G) {
    return G->NodeOwner.begin()->get();
  }

  // Edges are shared_ptr owned; a node's child is the callee of the edge.
  using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
  static const ContextNode<DerivedCCG, FuncTy, CallTy> *
  GetCallee(const EdgePtrTy &P) {
    return P->Callee;
  }

  using ChildIteratorType =
      mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
                          DerivedCCG, FuncTy, CallTy>>>::const_iterator,
                      decltype(&GetCallee)>;

  static ChildIteratorType child_begin(NodeRef N) {
    return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
  }

  static ChildIteratorType child_end(NodeRef N) {
    return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
  }
};
3105
// DOTGraphTraits specialization controlling how the context graph is rendered
// when exported to dot: node labels, tooltips, colors, optional highlighting
// of a requested context/allocation, and hiding of nodes outside the
// requested scope.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
    : public DefaultDOTGraphTraits {
  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
    // If the user requested the full graph to be exported, but provided an
    // allocation id, or if the user gave a context id and requested more than
    // just a specific context to be exported, note that highlighting is
    // enabled.
    DoHighlight =
        (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
        (ContextIdForDot.getNumOccurrences() &&
         DotGraphScope != DotScope::Context);
  }

  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  using GTraits = GraphTraits<GraphType>;
  using NodeRef = typename GTraits::NodeRef;
  using ChildIteratorType = typename GTraits::ChildIteratorType;

  // The node label shows the original stack/alloc id, then either the call it
  // was matched to or a "null call" marker with the reason.
  static std::string getNodeLabel(NodeRef Node, GraphType G) {
    std::string LabelString =
        (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
         Twine(Node->OrigStackOrAllocId))
            .str();
    LabelString += "\n";
    if (Node->hasCall()) {
      auto Func = G->NodeToCallingFunc.find(Node);
      assert(Func != G->NodeToCallingFunc.end());
      LabelString +=
          G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
    } else {
      LabelString += "null call";
      if (Node->Recursive)
        LabelString += " (recursive)";
      else
        LabelString += " (external)";
    }
    return LabelString;
  }

  static std::string getNodeAttributes(NodeRef Node, GraphType G) {
    auto ContextIds = Node->getContextIds();
    // If highlighting enabled, see if this node contains any of the context ids
    // of interest. If so, it will use a different color and a larger fontsize
    // (which makes the node larger as well).
    bool Highlight = false;
    if (DoHighlight) {
      assert(ContextIdForDot.getNumOccurrences() ||
             AllocIdForDot.getNumOccurrences());
      if (ContextIdForDot.getNumOccurrences())
        Highlight = ContextIds.contains(ContextIdForDot);
      else
        Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
    }
    // The tooltip carries the node id and its context ids.
    std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
                                   getContextIds(ContextIds) + "\"")
                                      .str();
    // Default fontsize is 14
    if (Highlight)
      AttributeString += ",fontsize=\"30\"";
    AttributeString +=
        (Twine(",fillcolor=\"") + getColor(AllocTypes: Node->AllocTypes, Highlight) + "\"")
            .str();
    // Clones are distinguished with a blue dashed-bold outline.
    if (Node->CloneOf) {
      AttributeString += ",color=\"blue\"";
      AttributeString += ",style=\"filled,bold,dashed\"";
    } else
      AttributeString += ",style=\"filled\"";
    return AttributeString;
  }

  static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
                                       GraphType G) {
    auto &Edge = *(ChildIter.getCurrent());
    // If highlighting enabled, see if this edge contains any of the context ids
    // of interest. If so, it will use a different color and a heavier arrow
    // size and weight (the larger weight makes the highlighted path
    // straighter).
    bool Highlight = false;
    if (DoHighlight) {
      assert(ContextIdForDot.getNumOccurrences() ||
             AllocIdForDot.getNumOccurrences());
      if (ContextIdForDot.getNumOccurrences())
        Highlight = Edge->ContextIds.contains(ContextIdForDot);
      else
        Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
    }
    auto Color = getColor(AllocTypes: Edge->AllocTypes, Highlight);
    std::string AttributeString =
        (Twine("tooltip=\"") + getContextIds(ContextIds: Edge->ContextIds) + "\"" +
         // fillcolor is the arrow head and color is the line
         Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
         "\"")
            .str();
    // Backedges (marked during DFS) are drawn dotted.
    if (Edge->IsBackedge)
      AttributeString += ",style=\"dotted\"";
    // Default penwidth and weight are both 1.
    if (Highlight)
      AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
    return AttributeString;
  }

  // Since the NodeOwners list includes nodes that are no longer connected to
  // the graph, skip them here.
  static bool isNodeHidden(NodeRef Node, GraphType G) {
    if (Node->isRemoved())
      return true;
    // If a scope smaller than the full graph was requested, see if this node
    // contains any of the context ids of interest.
    if (DotGraphScope == DotScope::Alloc)
      return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
    if (DotGraphScope == DotScope::Context)
      return !Node->getContextIds().contains(ContextIdForDot);
    return false;
  }

private:
  // Render the context id set for a tooltip; sets of 100+ ids are summarized
  // with a count instead of listing each id.
  static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
    std::string IdString = "ContextIds:";
    if (ContextIds.size() < 100) {
      // Sort a copy of the ids for stable output.
      std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
      std::sort(first: SortedIds.begin(), last: SortedIds.end());
      for (auto Id : SortedIds)
        IdString += (" " + Twine(Id)).str();
    } else {
      IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
    }
    return IdString;
  }

  // Map an alloc type mask (and highlight state) to a dot color name.
  static std::string getColor(uint8_t AllocTypes, bool Highlight) {
    // If DoHighlight is not enabled, we want to use the highlight colors for
    // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
    // both compatible with the color scheme before highlighting was supported,
    // and for the NotCold+Cold color the non-highlight color is a bit more
    // readable.
    if (AllocTypes == (uint8_t)AllocationType::NotCold)
      // Color "brown1" actually looks like a lighter red.
      return !DoHighlight || Highlight ? "brown1" : "lightpink";
    if (AllocTypes == (uint8_t)AllocationType::Cold)
      return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
    if (AllocTypes ==
        ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
      return Highlight ? "magenta" : "mediumorchid1";
    return "gray";
  }

  // Form a unique node id from the node's address, e.g. "N0x1234abcd".
  static std::string getNodeId(NodeRef Node) {
    std::stringstream SStream;
    SStream << std::hex << "N0x" << (unsigned long long)Node;
    std::string Result = SStream.str();
    return Result;
  }

  // True if we should highlight a specific context or allocation's contexts in
  // the emitted graph.
  static bool DoHighlight;
};
3264
// Out-of-line definition/initialization of the static DoHighlight member of
// the DOTGraphTraits specialization. Defaults to false; set in the
// DOTGraphTraits constructor based on the dot-export command line options.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool DOTGraphTraits<
    const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
    false;
3269
3270template <typename DerivedCCG, typename FuncTy, typename CallTy>
3271void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3272 std::string Label) const {
3273 WriteGraph(this, "", false, Label,
3274 DotFilePathPrefix + "ccg." + Label + ".dot");
3275}
3276
3277template <typename DerivedCCG, typename FuncTy, typename CallTy>
3278typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3279CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3280 const std::shared_ptr<ContextEdge> &Edge,
3281 DenseSet<uint32_t> ContextIdsToMove) {
3282 ContextNode *Node = Edge->Callee;
3283 assert(NodeToCallingFunc.count(Node));
3284 ContextNode *Clone =
3285 createNewNode(IsAllocation: Node->IsAllocation, F: NodeToCallingFunc[Node], C: Node->Call);
3286 Node->addClone(Clone);
3287 Clone->MatchingCalls = Node->MatchingCalls;
3288 moveEdgeToExistingCalleeClone(Edge, NewCallee: Clone, /*NewClone=*/true,
3289 ContextIdsToMove);
3290 return Clone;
3291}
3292
// Moves the given subset of Edge's context ids (all of them when
// ContextIdsToMove is empty) from Edge's current callee onto NewCallee, which
// must be a clone of the same original node. Also migrates the corresponding
// context ids on the old callee's own callee edges over to matching edges
// from NewCallee, creating new edges as needed. NewClone indicates that
// NewCallee was just created by the caller and has no callee edges yet.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                  ContextNode *NewCallee, bool NewClone,
                                  DenseSet<uint32_t> ContextIdsToMove) {
  // NewCallee and Edge's current callee must be clones of the same original
  // node (Edge's current callee may be the original node too).
  assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());

  // Record before any mutation below, as Edge's endpoints may change.
  bool EdgeIsRecursive = Edge->Callee == Edge->Caller;

  ContextNode *OldCallee = Edge->Callee;

  // We might already have an edge to the new callee from earlier cloning for a
  // different allocation. If one exists we will reuse it.
  auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);

  // Callers will pass an empty ContextIdsToMove set when they want to move the
  // edge. Copy in Edge's ids for simplicity.
  if (ContextIdsToMove.empty())
    ContextIdsToMove = Edge->getContextIds();

  // If we are moving all of Edge's ids, then just move the whole Edge.
  // Otherwise only move the specified subset, to a new edge if needed.
  if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
    // First, update the alloc types on New Callee from Edge.
    // Do this before we potentially clear Edge's fields below!
    NewCallee->AllocTypes |= Edge->AllocTypes;
    // Moving the whole Edge.
    if (ExistingEdgeToNewCallee) {
      // Since we already have an edge to NewCallee, simply move the ids
      // onto it, and remove the existing Edge.
      ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
      ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
      assert(Edge->ContextIds == ContextIdsToMove);
      removeEdgeFromGraph(Edge: Edge.get());
    } else {
      // Otherwise just reconnect Edge to NewCallee.
      Edge->Callee = NewCallee;
      NewCallee->CallerEdges.push_back(Edge);
      // Remove it from callee where it was previously connected.
      OldCallee->eraseCallerEdge(Edge.get());
      // Don't need to update Edge's context ids since we are simply
      // reconnecting it.
    }
  } else {
    // Only moving a subset of Edge's ids.
    // Compute the alloc type of the subset of ids being moved.
    auto CallerEdgeAllocType = computeAllocType(ContextIds&: ContextIdsToMove);
    if (ExistingEdgeToNewCallee) {
      // Since we already have an edge to NewCallee, simply move the ids
      // onto it.
      ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
      ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
    } else {
      // Otherwise, create a new edge to NewCallee for the ids being moved.
      auto NewEdge = std::make_shared<ContextEdge>(
          NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
      Edge->Caller->CalleeEdges.push_back(NewEdge);
      NewCallee->CallerEdges.push_back(NewEdge);
    }
    // In either case, need to update the alloc types on NewCallee, and remove
    // those ids and update the alloc type on the original Edge.
    NewCallee->AllocTypes |= CallerEdgeAllocType;
    set_subtract(Edge->ContextIds, ContextIdsToMove);
    Edge->AllocTypes = computeAllocType(ContextIds&: Edge->ContextIds);
  }
  // Now walk the old callee node's callee edges and move Edge's context ids
  // over to the corresponding edge into the clone (which is created here if
  // this is a newly created clone).
  for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
    ContextNode *CalleeToUse = OldCalleeEdge->Callee;
    // If this is a direct recursion edge, use NewCallee (the clone) as the
    // callee as well, so that any edge updated/created here is also direct
    // recursive.
    if (CalleeToUse == OldCallee) {
      // If this is a recursive edge, see if we already moved a recursive edge
      // (which would have to have been this one) - if we were only moving a
      // subset of context ids it would still be on OldCallee.
      if (EdgeIsRecursive) {
        assert(OldCalleeEdge == Edge);
        continue;
      }
      CalleeToUse = NewCallee;
    }
    // The context ids moving to the new callee are the subset of this edge's
    // context ids and the context ids on the caller edge being moved.
    DenseSet<uint32_t> EdgeContextIdsToMove =
        set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
    set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
    OldCalleeEdge->AllocTypes =
        computeAllocType(ContextIds&: OldCalleeEdge->getContextIds());
    if (!NewClone) {
      // Update context ids / alloc type on corresponding edge to NewCallee.
      // There is a chance this may not exist if we are reusing an existing
      // clone, specifically during function assignment, where we would have
      // removed none type edges after creating the clone. If we can't find
      // a corresponding edge there, fall through to the cloning below.
      if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
        NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
        NewCalleeEdge->AllocTypes |= computeAllocType(ContextIds&: EdgeContextIdsToMove);
        continue;
      }
    }
    auto NewEdge = std::make_shared<ContextEdge>(
        CalleeToUse, NewCallee, computeAllocType(ContextIds&: EdgeContextIdsToMove),
        EdgeContextIdsToMove);
    NewCallee->CalleeEdges.push_back(NewEdge);
    NewEdge->Callee->CallerEdges.push_back(NewEdge);
  }
  // Recompute the node alloc type now that its callee edges have been
  // updated (since we will compute from those edges).
  OldCallee->AllocTypes = OldCallee->computeAllocType();
  // OldCallee alloc type should be None iff its context id set is now empty.
  assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
         OldCallee->emptyContextIds());
  if (VerifyCCG) {
    checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
    checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
    for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
                                            /*CheckEdges=*/false);
    for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
                                            /*CheckEdges=*/false);
  }
}
3420
// Moves the callee edge Edge from its current caller onto NewCaller, reusing
// an existing edge between NewCaller and the callee when one exists. Direct
// recursive edges are kept recursive by also retargeting the callee end to
// NewCaller. Non-recursive moves additionally migrate the corresponding
// context ids from the old caller's caller edges onto edges into NewCaller.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                              ContextNode *NewCaller) {
  auto *OldCallee = Edge->Callee;
  auto *NewCallee = OldCallee;
  // If this edge was direct recursive, make any new/updated edge also direct
  // recursive to NewCaller.
  bool Recursive = Edge->Caller == Edge->Callee;
  if (Recursive)
    NewCallee = NewCaller;

  ContextNode *OldCaller = Edge->Caller;
  OldCaller->eraseCalleeEdge(Edge.get());

  // We might already have an edge to the new caller. If one exists we will
  // reuse it.
  auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);

  if (ExistingEdgeToNewCaller) {
    // Since we already have an edge to NewCaller, simply move the ids
    // onto it, and remove the existing Edge.
    ExistingEdgeToNewCaller->getContextIds().insert_range(
        Edge->getContextIds());
    ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
    // Clear the old edge so it reads as removed before detaching it.
    Edge->ContextIds.clear();
    Edge->AllocTypes = (uint8_t)AllocationType::None;
    OldCallee->eraseCallerEdge(Edge.get());
  } else {
    // Otherwise just reconnect Edge to NewCaller.
    Edge->Caller = NewCaller;
    NewCaller->CalleeEdges.push_back(Edge);
    if (Recursive) {
      assert(NewCallee == NewCaller);
      // In the case of (direct) recursive edges, we update the callee as well
      // so that it becomes recursive on the new caller.
      Edge->Callee = NewCallee;
      NewCallee->CallerEdges.push_back(Edge);
      OldCallee->eraseCallerEdge(Edge.get());
    }
    // Don't need to update Edge's context ids since we are simply
    // reconnecting it.
  }
  // In either case, need to update the alloc types on New Caller.
  NewCaller->AllocTypes |= Edge->AllocTypes;

  // Now walk the old caller node's caller edges and move Edge's context ids
  // over to the corresponding edge into the node (which is created here if
  // this is a newly created node). We can tell whether this is a newly created
  // node by seeing if it has any caller edges yet.
#ifndef NDEBUG
  bool IsNewNode = NewCaller->CallerEdges.empty();
#endif
  // If we just moved a direct recursive edge, presumably its context ids should
  // also flow out of OldCaller via some other non-recursive callee edge. We
  // don't want to remove the recursive context ids from other caller edges yet,
  // otherwise the context ids get into an inconsistent state on OldCaller.
  // We will update these context ids on the non-recursive caller edge when and
  // if they are updated on the non-recursive callee.
  if (!Recursive) {
    for (auto &OldCallerEdge : OldCaller->CallerEdges) {
      auto OldCallerCaller = OldCallerEdge->Caller;
      // The context ids moving to the new caller are the subset of this edge's
      // context ids and the context ids on the callee edge being moved.
      DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
          OldCallerEdge->getContextIds(), Edge->getContextIds());
      if (OldCaller == OldCallerCaller) {
        OldCallerCaller = NewCaller;
        // Don't actually move this one. The caller will move it directly via a
        // call to this function with this as the Edge if it is appropriate to
        // move to a diff node that has a matching callee (itself).
        continue;
      }
      set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
      OldCallerEdge->AllocTypes =
          computeAllocType(ContextIds&: OldCallerEdge->getContextIds());
      // In this function we expect that any pre-existing node already has edges
      // from the same callers as the old node. That should be true in the
      // current use case, where we will remove None-type edges after copying
      // over all caller edges from the callee.
      auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
      // Since we would have skipped caller edges when moving a direct recursive
      // edge, this may not hold true when recursive handling enabled.
      assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
      if (ExistingCallerEdge) {
        ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
        ExistingCallerEdge->AllocTypes |=
            computeAllocType(ContextIds&: EdgeContextIdsToMove);
        continue;
      }
      auto NewEdge = std::make_shared<ContextEdge>(
          NewCaller, OldCallerCaller, computeAllocType(ContextIds&: EdgeContextIdsToMove),
          EdgeContextIdsToMove);
      NewCaller->CallerEdges.push_back(NewEdge);
      NewEdge->Caller->CalleeEdges.push_back(NewEdge);
    }
  }
  // Recompute the node alloc type now that its caller edges have been
  // updated (since we will compute from those edges).
  OldCaller->AllocTypes = OldCaller->computeAllocType();
  // OldCaller alloc type should be None iff its context id set is now empty.
  assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
         OldCaller->emptyContextIds());
  if (VerifyCCG) {
    checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
    checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
    for (const auto &OldCallerEdge : OldCaller->CallerEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
                                            /*CheckEdges=*/false);
    for (const auto &NewCallerEdge : NewCaller->CallerEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
                                            /*CheckEdges=*/false);
  }
}
3535
3536template <typename DerivedCCG, typename FuncTy, typename CallTy>
3537void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3538 recursivelyRemoveNoneTypeCalleeEdges(
3539 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3540 auto Inserted = Visited.insert(Node);
3541 if (!Inserted.second)
3542 return;
3543
3544 removeNoneTypeCalleeEdges(Node);
3545
3546 for (auto *Clone : Node->Clones)
3547 recursivelyRemoveNoneTypeCalleeEdges(Node: Clone, Visited);
3548
3549 // The recursive call may remove some of this Node's caller edges.
3550 // Iterate over a copy and skip any that were removed.
3551 auto CallerEdges = Node->CallerEdges;
3552 for (auto &Edge : CallerEdges) {
3553 // Skip any that have been removed by an earlier recursive call.
3554 if (Edge->isRemoved()) {
3555 assert(!is_contained(Node->CallerEdges, Edge));
3556 continue;
3557 }
3558 recursivelyRemoveNoneTypeCalleeEdges(Node: Edge->Caller, Visited);
3559 }
3560}
3561
3562// This is the standard DFS based backedge discovery algorithm.
3563template <typename DerivedCCG, typename FuncTy, typename CallTy>
3564void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3565 // If we are cloning recursive contexts, find and mark backedges from all root
3566 // callers, using the typical DFS based backedge analysis.
3567 if (!CloneRecursiveContexts)
3568 return;
3569 DenseSet<const ContextNode *> Visited;
3570 DenseSet<const ContextNode *> CurrentStack;
3571 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3572 auto *Node = Entry.second;
3573 if (Node->isRemoved())
3574 continue;
3575 // It is a root if it doesn't have callers.
3576 if (!Node->CallerEdges.empty())
3577 continue;
3578 markBackedges(Node, Visited, CurrentStack);
3579 assert(CurrentStack.empty());
3580 }
3581}
3582
3583// Recursive helper for above markBackedges method.
3584template <typename DerivedCCG, typename FuncTy, typename CallTy>
3585void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3586 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3587 DenseSet<const ContextNode *> &CurrentStack) {
3588 auto I = Visited.insert(Node);
3589 // We should only call this for unvisited nodes.
3590 assert(I.second);
3591 (void)I;
3592 for (auto &CalleeEdge : Node->CalleeEdges) {
3593 auto *Callee = CalleeEdge->Callee;
3594 if (Visited.count(Callee)) {
3595 // Since this was already visited we need to check if it is currently on
3596 // the recursive stack in which case it is a backedge.
3597 if (CurrentStack.count(Callee))
3598 CalleeEdge->IsBackedge = true;
3599 continue;
3600 }
3601 CurrentStack.insert(Callee);
3602 markBackedges(Callee, Visited, CurrentStack);
3603 CurrentStack.erase(Callee);
3604 }
3605}
3606
3607template <typename DerivedCCG, typename FuncTy, typename CallTy>
3608void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3609 DenseSet<const ContextNode *> Visited;
3610 for (auto &Entry : AllocationCallToContextNodeMap) {
3611 Visited.clear();
3612 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3613 }
3614 Visited.clear();
3615 for (auto &Entry : AllocationCallToContextNodeMap)
3616 recursivelyRemoveNoneTypeCalleeEdges(Node: Entry.second, Visited);
3617 if (VerifyCCG)
3618 check();
3619}
3620
3621// helper function to check an AllocType is cold or notcold or both.
3622bool checkColdOrNotCold(uint8_t AllocType) {
3623 return (AllocType == (uint8_t)AllocationType::Cold) ||
3624 (AllocType == (uint8_t)AllocationType::NotCold) ||
3625 (AllocType ==
3626 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3627}
3628
3629template <typename DerivedCCG, typename FuncTy, typename CallTy>
3630void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3631 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3632 const DenseSet<uint32_t> &AllocContextIds) {
3633 if (VerifyNodes)
3634 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3635 assert(!Node->CloneOf);
3636
  // If Node has a null call, then either it wasn't found in the module (regular
3638 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3639 // cloning (e.g. recursion, calls multiple targets, etc).
3640 // Do this here so that we don't try to recursively clone callers below, which
3641 // isn't useful at least for this node.
3642 if (!Node->hasCall())
3643 return;
3644
3645 // No need to look at any callers if allocation type already unambiguous.
3646 if (hasSingleAllocType(Node->AllocTypes))
3647 return;
3648
3649#ifndef NDEBUG
3650 auto Insert =
3651#endif
3652 Visited.insert(Node);
3653 // We should not have visited this node yet.
3654 assert(Insert.second);
3655 // The recursive call to identifyClones may delete the current edge from the
3656 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3657 // in an iterator and having recursive call erase from it. Other edges may
3658 // also get removed during the recursion, which will have null Callee and
3659 // Caller pointers (and are deleted later), so we skip those below.
3660 {
3661 auto CallerEdges = Node->CallerEdges;
3662 for (auto &Edge : CallerEdges) {
3663 // Skip any that have been removed by an earlier recursive call.
3664 if (Edge->isRemoved()) {
3665 assert(!is_contained(Node->CallerEdges, Edge));
3666 continue;
3667 }
3668 // Defer backedges. See comments further below where these edges are
3669 // handled during the cloning of this Node.
3670 if (Edge->IsBackedge) {
3671 // We should only mark these if cloning recursive contexts, where we
3672 // need to do this deferral.
3673 assert(CloneRecursiveContexts);
3674 continue;
3675 }
3676 // Ignore any caller we previously visited via another edge.
3677 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3678 identifyClones(Edge->Caller, Visited, AllocContextIds);
3679 }
3680 }
3681 }
3682
  // Check if we reached an unambiguous call or have only a single caller.
3684 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3685 return;
3686
3687 // We need to clone.
3688
3689 // Try to keep the original version as alloc type NotCold. This will make
3690 // cases with indirect calls or any other situation with an unknown call to
3691 // the original function get the default behavior. We do this by sorting the
3692 // CallerEdges of the Node we will clone by alloc type.
3693 //
3694 // Give NotCold edge the lowest sort priority so those edges are at the end of
3695 // the caller edges vector, and stay on the original version (since the below
3696 // code clones greedily until it finds all remaining edges have the same type
3697 // and leaves the remaining ones on the original Node).
3698 //
3699 // We shouldn't actually have any None type edges, so the sorting priority for
3700 // that is arbitrary, and we assert in that case below.
3701 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3702 /*Cold*/ 1,
3703 /*NotColdCold*/ 2};
3704 llvm::stable_sort(Node->CallerEdges,
3705 [&](const std::shared_ptr<ContextEdge> &A,
3706 const std::shared_ptr<ContextEdge> &B) {
3707 // Nodes with non-empty context ids should be sorted
3708 // before those with empty context ids.
3709 if (A->ContextIds.empty())
3710 // Either B ContextIds are non-empty (in which case we
3711 // should return false because B < A), or B ContextIds
3712 // are empty, in which case they are equal, and we
3713 // should maintain the original relative ordering.
3714 return false;
3715 if (B->ContextIds.empty())
3716 return true;
3717
3718 if (A->AllocTypes == B->AllocTypes)
3719 // Use the first context id for each edge as a
3720 // tie-breaker.
3721 return *A->ContextIds.begin() < *B->ContextIds.begin();
3722 return AllocTypeCloningPriority[A->AllocTypes] <
3723 AllocTypeCloningPriority[B->AllocTypes];
3724 });
3725
3726 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3727
3728 DenseSet<uint32_t> RecursiveContextIds;
3729 assert(AllowRecursiveContexts || !CloneRecursiveContexts);
3730 // If we are allowing recursive callsites, but have also disabled recursive
3731 // contexts, look for context ids that show up in multiple caller edges.
3732 if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
3733 DenseSet<uint32_t> AllCallerContextIds;
3734 for (auto &CE : Node->CallerEdges) {
3735 // Resize to the largest set of caller context ids, since we know the
3736 // final set will be at least that large.
3737 AllCallerContextIds.reserve(Size: CE->getContextIds().size());
3738 for (auto Id : CE->getContextIds())
3739 if (!AllCallerContextIds.insert(Id).second)
3740 RecursiveContextIds.insert(Id);
3741 }
3742 }
3743
3744 // Iterate until we find no more opportunities for disambiguating the alloc
3745 // types via cloning. In most cases this loop will terminate once the Node
3746 // has a single allocation type, in which case no more cloning is needed.
3747 // Iterate over a copy of Node's caller edges, since we may need to remove
3748 // edges in the moveEdgeTo* methods, and this simplifies the handling and
3749 // makes it less error-prone.
3750 auto CallerEdges = Node->CallerEdges;
3751 for (auto &CallerEdge : CallerEdges) {
3752 // Skip any that have been removed by an earlier recursive call.
3753 if (CallerEdge->isRemoved()) {
3754 assert(!is_contained(Node->CallerEdges, CallerEdge));
3755 continue;
3756 }
3757 assert(CallerEdge->Callee == Node);
3758
3759 // See if cloning the prior caller edge left this node with a single alloc
3760 // type or a single caller. In that case no more cloning of Node is needed.
3761 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3762 break;
3763
3764 // If the caller was not successfully matched to a call in the IR/summary,
3765 // there is no point in trying to clone for it as we can't update that call.
3766 if (!CallerEdge->Caller->hasCall())
3767 continue;
3768
3769 // Only need to process the ids along this edge pertaining to the given
3770 // allocation.
3771 auto CallerEdgeContextsForAlloc =
3772 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
3773 if (!RecursiveContextIds.empty())
3774 CallerEdgeContextsForAlloc =
3775 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
3776 if (CallerEdgeContextsForAlloc.empty())
3777 continue;
3778
3779 auto CallerAllocTypeForAlloc = computeAllocType(ContextIds&: CallerEdgeContextsForAlloc);
3780
3781 // Compute the node callee edge alloc types corresponding to the context ids
3782 // for this caller edge.
3783 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
3784 CalleeEdgeAllocTypesForCallerEdge.reserve(n: Node->CalleeEdges.size());
3785 for (auto &CalleeEdge : Node->CalleeEdges)
3786 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3787 Node1Ids: CalleeEdge->getContextIds(), Node2Ids: CallerEdgeContextsForAlloc));
3788
3789 // Don't clone if doing so will not disambiguate any alloc types amongst
3790 // caller edges (including the callee edges that would be cloned).
3791 // Otherwise we will simply move all edges to the clone.
3792 //
3793 // First check if by cloning we will disambiguate the caller allocation
3794 // type from node's allocation type. Query allocTypeToUse so that we don't
3795 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
3796 // neither of these should be None type.
3797 //
3798 // Then check if by cloning node at least one of the callee edges will be
3799 // disambiguated by splitting out different context ids.
3800 //
3801 // However, always do the cloning if this is a backedge, in which case we
3802 // have not yet cloned along this caller edge.
3803 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
3804 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3805 if (!CallerEdge->IsBackedge &&
3806 allocTypeToUse(CallerAllocTypeForAlloc) ==
3807 allocTypeToUse(Node->AllocTypes) &&
3808 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
3809 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
3810 continue;
3811 }
3812
3813 if (CallerEdge->IsBackedge) {
3814 // We should only mark these if cloning recursive contexts, where we
3815 // need to do this deferral.
3816 assert(CloneRecursiveContexts);
3817 DeferredBackedges++;
3818 }
3819
3820 // If this is a backedge, we now do recursive cloning starting from its
3821 // caller since we may have moved unambiguous caller contexts to a clone
3822 // of this Node in a previous iteration of the current loop, giving more
3823 // opportunity for cloning through the backedge. Because we sorted the
3824 // caller edges earlier so that cold caller edges are first, we would have
  // visited and cloned this node for any unambiguously cold non-recursive
3826 // callers before any ambiguous backedge callers. Note that we don't do this
3827 // if the caller is already cloned or visited during cloning (e.g. via a
3828 // different context path from the allocation).
3829 // TODO: Can we do better in the case where the caller was already visited?
3830 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
3831 !Visited.count(CallerEdge->Caller)) {
3832 const auto OrigIdCount = CallerEdge->getContextIds().size();
3833 // Now do the recursive cloning of this backedge's caller, which was
3834 // deferred earlier.
3835 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
3836 removeNoneTypeCalleeEdges(Node: CallerEdge->Caller);
3837 // See if the recursive call to identifyClones moved the context ids to a
3838 // new edge from this node to a clone of caller, and switch to looking at
3839 // that new edge so that we clone Node for the new caller clone.
3840 bool UpdatedEdge = false;
3841 if (OrigIdCount > CallerEdge->getContextIds().size()) {
3842 for (auto E : Node->CallerEdges) {
        // Only interested in clones of the current edge's caller.
3844 if (E->Caller->CloneOf != CallerEdge->Caller)
3845 continue;
3846 // See if this edge contains any of the context ids originally on the
3847 // current caller edge.
3848 auto CallerEdgeContextsForAllocNew =
3849 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
3850 if (CallerEdgeContextsForAllocNew.empty())
3851 continue;
3852 // Make sure we don't pick a previously existing caller edge of this
3853 // Node, which would be processed on a different iteration of the
3854 // outer loop over the saved CallerEdges.
3855 if (llvm::is_contained(CallerEdges, E))
3856 continue;
3857 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
3858 // are updated further below for all cases where we just invoked
3859 // identifyClones recursively.
3860 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
3861 CallerEdge = E;
3862 UpdatedEdge = true;
3863 break;
3864 }
3865 }
3866 // If cloning removed this edge (and we didn't update it to a new edge
3867 // above), we're done with this edge. It's possible we moved all of the
3868 // context ids to an existing clone, in which case there's no need to do
3869 // further processing for them.
3870 if (CallerEdge->isRemoved())
3871 continue;
3872
3873 // Now we need to update the information used for the cloning decisions
3874 // further below, as we may have modified edges and their context ids.
3875
3876 // Note if we changed the CallerEdge above we would have already updated
3877 // the context ids.
3878 if (!UpdatedEdge) {
3879 CallerEdgeContextsForAlloc = set_intersection(
3880 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
3881 if (CallerEdgeContextsForAlloc.empty())
3882 continue;
3883 }
3884 // Update the other information that depends on the edges and on the now
3885 // updated CallerEdgeContextsForAlloc.
3886 CallerAllocTypeForAlloc = computeAllocType(ContextIds&: CallerEdgeContextsForAlloc);
3887 CalleeEdgeAllocTypesForCallerEdge.clear();
3888 for (auto &CalleeEdge : Node->CalleeEdges) {
3889 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3890 Node1Ids: CalleeEdge->getContextIds(), Node2Ids: CallerEdgeContextsForAlloc));
3891 }
3892 }
3893
3894 // First see if we can use an existing clone. Check each clone and its
3895 // callee edges for matching alloc types.
3896 ContextNode *Clone = nullptr;
3897 for (auto *CurClone : Node->Clones) {
3898 if (allocTypeToUse(CurClone->AllocTypes) !=
3899 allocTypeToUse(CallerAllocTypeForAlloc))
3900 continue;
3901
3902 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
3903 hasSingleAllocType(CallerAllocTypeForAlloc);
3904 // The above check should mean that if both have single alloc types that
3905 // they should be equal.
3906 assert(!BothSingleAlloc ||
3907 CurClone->AllocTypes == CallerAllocTypeForAlloc);
3908
3909 // If either both have a single alloc type (which are the same), or if the
3910 // clone's callee edges have the same alloc types as those for the current
3911 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
3912 // then we can reuse this clone.
3913 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
3914 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
3915 Clone = CurClone;
3916 break;
3917 }
3918 }
3919
3920 // The edge iterator is adjusted when we move the CallerEdge to the clone.
3921 if (Clone)
3922 moveEdgeToExistingCalleeClone(Edge: CallerEdge, NewCallee: Clone, /*NewClone=*/false,
3923 ContextIdsToMove: CallerEdgeContextsForAlloc);
3924 else
3925 Clone = moveEdgeToNewCalleeClone(Edge: CallerEdge, ContextIdsToMove: CallerEdgeContextsForAlloc);
3926
3927 // Sanity check that no alloc types on clone or its edges are None.
3928 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
3929 }
3930
3931 // We should still have some context ids on the original Node.
3932 assert(!Node->emptyContextIds());
3933
3934 // Sanity check that no alloc types on node or edges are None.
3935 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3936
3937 if (VerifyNodes)
3938 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3939}
3940
3941void ModuleCallsiteContextGraph::updateAllocationCall(
3942 CallInfo &Call, AllocationType AllocType) {
3943 std::string AllocTypeString = getAllocTypeAttributeString(Type: AllocType);
3944 auto A = llvm::Attribute::get(Context&: Call.call()->getFunction()->getContext(),
3945 Kind: "memprof", Val: AllocTypeString);
3946 cast<CallBase>(Val: Call.call())->addFnAttr(Attr: A);
3947 OREGetter(Call.call()->getFunction())
3948 .emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
3949 << ore::NV("AllocationCall", Call.call()) << " in clone "
3950 << ore::NV("Caller", Call.call()->getFunction())
3951 << " marked with memprof allocation attribute "
3952 << ore::NV("Attribute", AllocTypeString));
3953}
3954
3955void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
3956 AllocationType AllocType) {
3957 auto *AI = cast<AllocInfo *>(Val: Call.call());
3958 assert(AI);
3959 assert(AI->Versions.size() > Call.cloneNo());
3960 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
3961}
3962
3963AllocationType
3964ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3965 const auto *CB = cast<CallBase>(Val: Call.call());
3966 if (!CB->getAttributes().hasFnAttr(Kind: "memprof"))
3967 return AllocationType::None;
3968 return CB->getAttributes().getFnAttr(Kind: "memprof").getValueAsString() == "cold"
3969 ? AllocationType::Cold
3970 : AllocationType::NotCold;
3971}
3972
3973AllocationType
3974IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3975 const auto *AI = cast<AllocInfo *>(Val: Call.call());
3976 assert(AI->Versions.size() > Call.cloneNo());
3977 return (AllocationType)AI->Versions[Call.cloneNo()];
3978}
3979
3980void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3981 FuncInfo CalleeFunc) {
3982 if (CalleeFunc.cloneNo() > 0)
3983 cast<CallBase>(Val: CallerCall.call())->setCalledFunction(CalleeFunc.func());
3984 OREGetter(CallerCall.call()->getFunction())
3985 .emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
3986 << ore::NV("Call", CallerCall.call()) << " in clone "
3987 << ore::NV("Caller", CallerCall.call()->getFunction())
3988 << " assigned to call function clone "
3989 << ore::NV("Callee", CalleeFunc.func()));
3990}
3991
3992void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3993 FuncInfo CalleeFunc) {
3994 auto *CI = cast<CallsiteInfo *>(Val: CallerCall.call());
3995 assert(CI &&
3996 "Caller cannot be an allocation which should not have profiled calls");
3997 assert(CI->Clones.size() > CallerCall.cloneNo());
3998 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
3999}
4000
4001CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4002 Instruction *>::FuncInfo
4003ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4004 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
4005 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4006 // Use existing LLVM facilities for cloning and obtaining Call in clone
4007 ValueToValueMapTy VMap;
4008 auto *NewFunc = CloneFunction(F: Func.func(), VMap);
4009 std::string Name = getMemProfFuncName(Base: Func.func()->getName(), CloneNo);
4010 assert(!Func.func()->getParent()->getFunction(Name));
4011 NewFunc->setName(Name);
4012 if (auto *SP = NewFunc->getSubprogram())
4013 SP->replaceLinkageName(
4014 LN: MDString::get(Context&: NewFunc->getParent()->getContext(), Str: Name));
4015 for (auto &Inst : CallsWithMetadataInFunc) {
4016 // This map always has the initial version in it.
4017 assert(Inst.cloneNo() == 0);
4018 CallMap[Inst] = {cast<Instruction>(Val&: VMap[Inst.call()]), CloneNo};
4019 }
4020 OREGetter(Func.func())
4021 .emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4022 << "created clone " << ore::NV("NewFunction", NewFunc));
4023 return {NewFunc, CloneNo};
4024}
4025
4026CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4027 IndexCall>::FuncInfo
4028IndexCallsiteContextGraph::cloneFunctionForCallsite(
4029 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
4030 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4031 // Check how many clones we have of Call (and therefore function).
4032 // The next clone number is the current size of versions array.
4033 // Confirm this matches the CloneNo provided by the caller, which is based on
4034 // the number of function clones we have.
4035 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4036 ? cast<AllocInfo *>(Call.call())->Versions.size()
4037 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4038 // Walk all the instructions in this function. Create a new version for
4039 // each (by adding an entry to the Versions/Clones summary array), and copy
4040 // over the version being called for the function clone being cloned here.
4041 // Additionally, add an entry to the CallMap for the new function clone,
4042 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4043 // to the new call clone.
4044 for (auto &Inst : CallsWithMetadataInFunc) {
4045 // This map always has the initial version in it.
4046 assert(Inst.cloneNo() == 0);
4047 if (auto *AI = dyn_cast<AllocInfo *>(Val: Inst.call())) {
4048 assert(AI->Versions.size() == CloneNo);
4049 // We assign the allocation type later (in updateAllocationCall), just add
4050 // an entry for it here.
4051 AI->Versions.push_back(Elt: 0);
4052 } else {
4053 auto *CI = cast<CallsiteInfo *>(Val: Inst.call());
4054 assert(CI && CI->Clones.size() == CloneNo);
4055 // We assign the clone number later (in updateCall), just add an entry for
4056 // it here.
4057 CI->Clones.push_back(Elt: 0);
4058 }
4059 CallMap[Inst] = {Inst.call(), CloneNo};
4060 }
4061 return {Func.func(), CloneNo};
4062}
4063
4064// We perform cloning for each allocation node separately. However, this
4065// sometimes results in a situation where the same node calls multiple
4066// clones of the same callee, created for different allocations. This
4067// causes issues when assigning functions to these clones, as each node can
4068// in reality only call a single callee clone.
4069//
4070// To address this, before assigning functions, merge callee clone nodes as
4071// needed using a post order traversal from the allocations. We attempt to
4072// use existing clones as the merge node when legal, and to share them
4073// among callers with the same properties (callers calling the same set of
4074// callee clone nodes for the same allocations).
4075//
4076// Without this fix, in some cases incorrect function assignment will lead
4077// to calling the wrong allocation clone.
4078template <typename DerivedCCG, typename FuncTy, typename CallTy>
4079void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4080 if (!MergeClones)
4081 return;
4082
4083 // Generate a map from context id to the associated allocation node for use
4084 // when merging clones.
4085 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4086 for (auto &Entry : AllocationCallToContextNodeMap) {
4087 auto *Node = Entry.second;
4088 for (auto Id : Node->getContextIds())
4089 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4090 for (auto *Clone : Node->Clones) {
4091 for (auto Id : Clone->getContextIds())
4092 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4093 }
4094 }
4095
4096 // Post order traversal starting from allocations to ensure each callsite
4097 // calls a single clone of its callee. Callee nodes that are clones of each
4098 // other are merged (via new merge nodes if needed) to achieve this.
4099 DenseSet<const ContextNode *> Visited;
4100 for (auto &Entry : AllocationCallToContextNodeMap) {
4101 auto *Node = Entry.second;
4102
4103 mergeClones(Node, Visited, ContextIdToAllocationNode);
4104
4105 // Make a copy so the recursive post order traversal that may create new
4106 // clones doesn't mess up iteration. Note that the recursive traversal
4107 // itself does not call mergeClones on any of these nodes, which are all
4108 // (clones of) allocations.
4109 auto Clones = Node->Clones;
4110 for (auto *Clone : Clones)
4111 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4112 }
4113
4114 if (DumpCCG) {
4115 dbgs() << "CCG after merging:\n";
4116 dbgs() << *this;
4117 }
4118 if (ExportToDot)
4119 exportToDot(Label: "aftermerge");
4120
4121 if (VerifyCCG) {
4122 check();
4123 }
4124}
4125
4126// Recursive helper for above mergeClones method.
4127template <typename DerivedCCG, typename FuncTy, typename CallTy>
4128void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
4129 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4130 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4131 auto Inserted = Visited.insert(Node);
4132 if (!Inserted.second)
4133 return;
4134
4135 // Make a copy since the recursive call may move a caller edge to a new
4136 // callee, messing up the iterator.
4137 auto CallerEdges = Node->CallerEdges;
4138 for (auto CallerEdge : CallerEdges) {
4139 // Skip any caller edge moved onto a different callee during recursion.
4140 if (CallerEdge->Callee != Node)
4141 continue;
4142 mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
4143 }
4144
4145 // Merge for this node after we handle its callers.
4146 mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
4147}
4148
4149template <typename DerivedCCG, typename FuncTy, typename CallTy>
4150void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4151 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4152 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4153 // Ignore Node if we moved all of its contexts to clones.
4154 if (Node->emptyContextIds())
4155 return;
4156
4157 // First identify groups of clones among Node's callee edges, by building
4158 // a map from each callee base node to the associated callee edges from Node.
4159 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4160 OrigNodeToCloneEdges;
4161 for (const auto &E : Node->CalleeEdges) {
4162 auto *Callee = E->Callee;
4163 if (!Callee->CloneOf && Callee->Clones.empty())
4164 continue;
4165 ContextNode *Base = Callee->getOrigNode();
4166 OrigNodeToCloneEdges[Base].push_back(E);
4167 }
4168
4169 // Helper for callee edge sorting below. Return true if A's callee has fewer
4170 // caller edges than B, or if A is a clone and B is not, or if A's first
4171 // context id is smaller than B's.
4172 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4173 const std::shared_ptr<ContextEdge> &B) {
4174 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4175 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4176 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4177 return true;
4178 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4179 return false;
4180 // Use the first context id for each edge as a
4181 // tie-breaker.
4182 return *A->ContextIds.begin() < *B->ContextIds.begin();
4183 };
4184
4185 // Process each set of callee clones called by Node, performing the needed
4186 // merging.
4187 for (auto Entry : OrigNodeToCloneEdges) {
4188 // CalleeEdges is the set of edges from Node reaching callees that are
4189 // mutual clones of each other.
4190 auto &CalleeEdges = Entry.second;
4191 auto NumCalleeClones = CalleeEdges.size();
4192 // A single edge means there is no merging needed.
4193 if (NumCalleeClones == 1)
4194 continue;
4195 // Sort the CalleeEdges calling this group of clones in ascending order of
4196 // their caller edge counts, putting the original non-clone node first in
4197 // cases of a tie. This simplifies finding an existing node to use as the
4198 // merge node.
4199 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4200
4201 /// Find other callers of the given set of callee edges that can
4202 /// share the same callee merge node. See the comments at this method
4203 /// definition for details.
4204 DenseSet<ContextNode *> OtherCallersToShareMerge;
4205 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4206 OtherCallersToShareMerge);
4207
4208 // Now do the actual merging. Identify existing or create a new MergeNode
4209 // during the first iteration. Move each callee over, along with edges from
4210 // other callers we've determined above can share the same merge node.
4211 ContextNode *MergeNode = nullptr;
4212 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4213 for (auto CalleeEdge : CalleeEdges) {
4214 auto *OrigCallee = CalleeEdge->Callee;
4215 // If we don't have a MergeNode yet (only happens on the first iteration,
4216 // as a new one will be created when we go to move the first callee edge
4217 // over as needed), see if we can use this callee.
4218 if (!MergeNode) {
4219 // If there are no other callers, simply use this callee.
4220 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4221 MergeNode = OrigCallee;
4222 NonNewMergedNodes++;
4223 continue;
4224 }
4225 // Otherwise, if we have identified other caller nodes that can share
4226 // the merge node with Node, see if all of OrigCallee's callers are
4227 // going to share the same merge node. In that case we can use callee
4228 // (since all of its callers would move to the new merge node).
4229 if (!OtherCallersToShareMerge.empty()) {
4230 bool MoveAllCallerEdges = true;
4231 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4232 if (CalleeCallerE == CalleeEdge)
4233 continue;
4234 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4235 MoveAllCallerEdges = false;
4236 break;
4237 }
4238 }
4239 // If we are going to move all callers over, we can use this callee as
4240 // the MergeNode.
4241 if (MoveAllCallerEdges) {
4242 MergeNode = OrigCallee;
4243 NonNewMergedNodes++;
4244 continue;
4245 }
4246 }
4247 }
4248 // Move this callee edge, creating a new merge node if necessary.
4249 if (MergeNode) {
4250 assert(MergeNode != OrigCallee);
4251 moveEdgeToExistingCalleeClone(Edge: CalleeEdge, NewCallee: MergeNode,
4252 /*NewClone*/ false);
4253 } else {
4254 MergeNode = moveEdgeToNewCalleeClone(Edge: CalleeEdge);
4255 NewMergedNodes++;
4256 }
4257 // Now move all identified edges from other callers over to the merge node
4258 // as well.
4259 if (!OtherCallersToShareMerge.empty()) {
4260 // Make and iterate over a copy of OrigCallee's caller edges because
4261 // some of these will be moved off of the OrigCallee and that would mess
4262 // up the iteration from OrigCallee.
4263 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4264 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4265 if (CalleeCallerE == CalleeEdge)
4266 continue;
4267 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4268 continue;
4269 CallerToMoveCount[CalleeCallerE->Caller]++;
4270 moveEdgeToExistingCalleeClone(Edge: CalleeCallerE, NewCallee: MergeNode,
4271 /*NewClone*/ false);
4272 }
4273 }
4274 removeNoneTypeCalleeEdges(Node: OrigCallee);
4275 removeNoneTypeCalleeEdges(Node: MergeNode);
4276 }
4277 }
4278}
4279
4280// Look for other nodes that have edges to the same set of callee
4281// clones as the current Node. Those can share the eventual merge node
4282// (reducing cloning and binary size overhead) iff:
4283// - they have edges to the same set of callee clones
4284// - each callee edge reaches a subset of the same allocations as Node's
4285// corresponding edge to the same callee clone.
4286// The second requirement is to ensure that we don't undo any of the
4287// necessary cloning to distinguish contexts with different allocation
4288// behavior.
4289// FIXME: This is somewhat conservative, as we really just need to ensure
4290// that they don't reach the same allocations as contexts on edges from Node
4291// going to any of the *other* callee clones being merged. However, that
4292// requires more tracking and checking to get right.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    findOtherCallersToShareMerge(
        ContextNode *Node,
        std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
        DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
        DenseSet<ContextNode *> &OtherCallersToShareMerge) {
  auto NumCalleeClones = CalleeEdges.size();
  // This map counts how many edges to the same callee clone exist for other
  // caller nodes of each callee clone. A caller is a sharing candidate only
  // when its count reaches NumCalleeClones (edges to every clone in the
  // group); a count of 0 doubles as the "disqualified" sentinel below.
  DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
  // Counts the number of other caller nodes that have edges to all callee
  // clones that don't violate the allocation context checking.
  unsigned PossibleOtherCallerNodes = 0;

  // We only need to look at other Caller nodes if the first callee edge has
  // multiple callers (recall they are sorted in ascending order above).
  if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
    return;

  // First pass. For each callee edge:
  // - Collect the count of other caller nodes calling the same callees.
  // - Collect the alloc nodes reached by contexts on each callee edge.
  DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
  for (auto CalleeEdge : CalleeEdges) {
    assert(CalleeEdge->Callee->CallerEdges.size() > 1);
    // For each other caller of the same callee, increment the count of
    // edges reaching the same callee clone. (Each CalleeCallerEdges is a
    // single caller edge of this callee clone.)
    for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
      if (CalleeCallerEdges->Caller == Node) {
        // Node should reach this callee clone only via CalleeEdge itself.
        assert(CalleeCallerEdges == CalleeEdge);
        continue;
      }
      OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
      // If this caller edge now reaches all of the same callee clones,
      // increment the count of candidate other caller nodes.
      if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
          NumCalleeClones)
        PossibleOtherCallerNodes++;
    }
    // Collect the alloc nodes reached by contexts on each callee edge, for
    // later analysis.
    for (auto Id : CalleeEdge->getContextIds()) {
      auto *Alloc = ContextIdToAllocationNode.lookup(Id);
      if (!Alloc) {
        // FIXME: unclear why this happens occasionally, presumably
        // imperfect graph updates possibly with recursion.
        MissingAllocForContextId++;
        continue;
      }
      CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
    }
  }

  // Second pass. Now walk the callee edges again, and make sure that for each
  // candidate caller node all of its edges to the callees reach the same
  // allocs (or a subset) as those along the corresponding callee edge from
  // Node. This prevents merging from undoing cloning that was needed to
  // separate contexts with different allocation behavior.
  for (auto CalleeEdge : CalleeEdges) {
    assert(CalleeEdge->Callee->CallerEdges.size() > 1);
    // Stop if we do not have any (more) candidate other caller nodes.
    if (!PossibleOtherCallerNodes)
      break;
    auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
    // Check each other caller of this callee clone.
    for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
      // Not interested in the callee edge from Node itself.
      if (CalleeCallerE == CalleeEdge)
        continue;
      // Skip any callers that didn't have callee edges to all the same
      // callee clones (or were already disqualified and reset to 0 below).
      if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
          NumCalleeClones)
        continue;
      // Make sure that each context along edge from candidate caller node
      // reaches an allocation also reached by this callee edge from Node.
      for (auto Id : CalleeCallerE->getContextIds()) {
        auto *Alloc = ContextIdToAllocationNode.lookup(Id);
        // Missing alloc mapping (see FIXME above); skip rather than
        // disqualify on incomplete information.
        if (!Alloc)
          continue;
        // If not, simply reset the map entry to 0 so caller is ignored, and
        // reduce the count of candidate other caller nodes.
        if (!CurCalleeAllocNodes.contains(Alloc)) {
          OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
          PossibleOtherCallerNodes--;
          break;
        }
      }
    }
  }

  if (!PossibleOtherCallerNodes)
    return;

  // Build the set of other caller nodes that can use the same callee merge
  // node: exactly those whose per-caller edge count survived at
  // NumCalleeClones through both passes.
  for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
    if (Count != NumCalleeClones)
      continue;
    OtherCallersToShareMerge.insert(OtherCaller);
  }
}
4394
4395// This method assigns cloned callsites to functions, cloning the functions as
4396// needed. The assignment is greedy and proceeds roughly as follows:
4397//
4398// For each function Func:
4399// For each call with graph Node having clones:
4400// Initialize ClonesWorklist to Node and its clones
4401// Initialize NodeCloneCount to 0
4402// While ClonesWorklist is not empty:
4403// Clone = pop front ClonesWorklist
4404// NodeCloneCount++
4405// If Func has been cloned less than NodeCloneCount times:
4406// If NodeCloneCount is 1:
4407// Assign Clone to original Func
4408// Continue
4409// Create a new function clone
4410// If other callers not assigned to call a function clone yet:
4411// Assign them to call new function clone
4412// Continue
4413// Assign any other caller calling the cloned version to new clone
4414//
4415// For each caller of Clone:
4416// If caller is assigned to call a specific function clone:
4417// If we cannot assign Clone to that function clone:
4418// Create new callsite Clone NewClone
4419// Add NewClone to ClonesWorklist
4420// Continue
4421// Assign Clone to existing caller's called function clone
4422// Else:
4423// If Clone not already assigned to a function clone:
4424// Assign to first function clone without assignment
4425// Assign caller to selected function clone
4426template <typename DerivedCCG, typename FuncTy, typename CallTy>
4427bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4428 bool Changed = false;
4429
4430 mergeClones();
4431
4432 // Keep track of the assignment of nodes (callsites) to function clones they
4433 // call.
4434 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4435
4436 // Update caller node to call function version CalleeFunc, by recording the
4437 // assignment in CallsiteToCalleeFuncCloneMap.
4438 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4439 const FuncInfo &CalleeFunc) {
4440 assert(Caller->hasCall());
4441 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4442 };
4443
4444 // Walk all functions for which we saw calls with memprof metadata, and handle
4445 // cloning for each of its calls.
4446 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4447 FuncInfo OrigFunc(Func);
4448 // Map from each clone of OrigFunc to a map of remappings of each call of
4449 // interest (from original uncloned call to the corresponding cloned call in
4450 // that function clone).
4451 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
4452 for (auto &Call : CallsWithMetadata) {
4453 ContextNode *Node = getNodeForInst(C: Call);
4454 // Skip call if we do not have a node for it (all uses of its stack ids
4455 // were either on inlined chains or pruned from the MIBs), or if we did
4456 // not create any clones for it.
4457 if (!Node || Node->Clones.empty())
4458 continue;
4459 assert(Node->hasCall() &&
4460 "Not having a call should have prevented cloning");
4461
4462 // Track the assignment of function clones to clones of the current
4463 // callsite Node being handled.
4464 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4465
4466 // Assign callsite version CallsiteClone to function version FuncClone,
4467 // and also assign (possibly cloned) Call to CallsiteClone.
4468 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4469 CallInfo &Call,
4470 ContextNode *CallsiteClone,
4471 bool IsAlloc) {
4472 // Record the clone of callsite node assigned to this function clone.
4473 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4474
4475 assert(FuncClonesToCallMap.count(FuncClone));
4476 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
4477 CallInfo CallClone(Call);
4478 if (auto It = CallMap.find(Call); It != CallMap.end())
4479 CallClone = It->second;
4480 CallsiteClone->setCall(CallClone);
4481 // Need to do the same for all matching calls.
4482 for (auto &MatchingCall : Node->MatchingCalls) {
4483 CallInfo CallClone(MatchingCall);
4484 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4485 CallClone = It->second;
4486 // Updates the call in the list.
4487 MatchingCall = CallClone;
4488 }
4489 };
4490
4491 // Keep track of the clones of callsite Node that need to be assigned to
4492 // function clones. This list may be expanded in the loop body below if we
4493 // find additional cloning is required.
4494 std::deque<ContextNode *> ClonesWorklist;
4495 // Ignore original Node if we moved all of its contexts to clones.
4496 if (!Node->emptyContextIds())
4497 ClonesWorklist.push_back(Node);
4498 llvm::append_range(ClonesWorklist, Node->Clones);
4499
4500 // Now walk through all of the clones of this callsite Node that we need,
4501 // and determine the assignment to a corresponding clone of the current
4502 // function (creating new function clones as needed).
4503 unsigned NodeCloneCount = 0;
4504 while (!ClonesWorklist.empty()) {
4505 ContextNode *Clone = ClonesWorklist.front();
4506 ClonesWorklist.pop_front();
4507 NodeCloneCount++;
4508 if (VerifyNodes)
4509 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4510
4511 // Need to create a new function clone if we have more callsite clones
4512 // than existing function clones, which would have been assigned to an
4513 // earlier clone in the list (we assign callsite clones to function
4514 // clones greedily).
4515 if (FuncClonesToCallMap.size() < NodeCloneCount) {
4516 // If this is the first callsite copy, assign to original function.
4517 if (NodeCloneCount == 1) {
4518 // Since FuncClonesToCallMap is empty in this case, no clones have
4519 // been created for this function yet, and no callers should have
4520 // been assigned a function clone for this callee node yet.
4521 assert(llvm::none_of(
4522 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4523 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4524 }));
4525 // Initialize with empty call map, assign Clone to original function
4526 // and its callers, and skip to the next clone.
4527 FuncClonesToCallMap[OrigFunc] = {};
4528 AssignCallsiteCloneToFuncClone(
4529 OrigFunc, Call, Clone,
4530 AllocationCallToContextNodeMap.count(Call));
4531 for (auto &CE : Clone->CallerEdges) {
4532 // Ignore any caller that does not have a recorded callsite Call.
4533 if (!CE->Caller->hasCall())
4534 continue;
4535 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
4536 }
4537 continue;
4538 }
4539
4540 // First locate which copy of OrigFunc to clone again. If a caller
4541 // of this callsite clone was already assigned to call a particular
4542 // function clone, we need to redirect all of those callers to the
4543 // new function clone, and update their other callees within this
4544 // function.
4545 FuncInfo PreviousAssignedFuncClone;
4546 auto EI = llvm::find_if(
4547 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4548 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4549 });
4550 bool CallerAssignedToCloneOfFunc = false;
4551 if (EI != Clone->CallerEdges.end()) {
4552 const std::shared_ptr<ContextEdge> &Edge = *EI;
4553 PreviousAssignedFuncClone =
4554 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4555 CallerAssignedToCloneOfFunc = true;
4556 }
4557
4558 // Clone function and save it along with the CallInfo map created
4559 // during cloning in the FuncClonesToCallMap.
4560 std::map<CallInfo, CallInfo> NewCallMap;
4561 unsigned CloneNo = FuncClonesToCallMap.size();
4562 assert(CloneNo > 0 && "Clone 0 is the original function, which "
4563 "should already exist in the map");
4564 FuncInfo NewFuncClone = cloneFunctionForCallsite(
4565 Func&: OrigFunc, Call, CallMap&: NewCallMap, CallsWithMetadataInFunc&: CallsWithMetadata, CloneNo);
4566 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
4567 FunctionClonesAnalysis++;
4568 Changed = true;
4569
4570 // If no caller callsites were already assigned to a clone of this
4571 // function, we can simply assign this clone to the new func clone
4572 // and update all callers to it, then skip to the next clone.
4573 if (!CallerAssignedToCloneOfFunc) {
4574 AssignCallsiteCloneToFuncClone(
4575 NewFuncClone, Call, Clone,
4576 AllocationCallToContextNodeMap.count(Call));
4577 for (auto &CE : Clone->CallerEdges) {
4578 // Ignore any caller that does not have a recorded callsite Call.
4579 if (!CE->Caller->hasCall())
4580 continue;
4581 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4582 }
4583 continue;
4584 }
4585
4586 // We may need to do additional node cloning in this case.
4587 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
4588 // that were previously assigned to call PreviousAssignedFuncClone,
4589 // to record that they now call NewFuncClone.
4590 // The none type edge removal may remove some of this Clone's caller
4591 // edges, if it is reached via another of its caller's callees.
4592 // Iterate over a copy and skip any that were removed.
4593 auto CallerEdges = Clone->CallerEdges;
4594 for (auto CE : CallerEdges) {
4595 // Skip any that have been removed on an earlier iteration.
4596 if (CE->isRemoved()) {
4597 assert(!is_contained(Clone->CallerEdges, CE));
4598 continue;
4599 }
4600 assert(CE);
4601 // Ignore any caller that does not have a recorded callsite Call.
4602 if (!CE->Caller->hasCall())
4603 continue;
4604
4605 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
4606 // We subsequently fall through to later handling that
4607 // will perform any additional cloning required for
4608 // callers that were calling other function clones.
4609 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
4610 PreviousAssignedFuncClone)
4611 continue;
4612
4613 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4614
4615 // If we are cloning a function that was already assigned to some
4616 // callers, then essentially we are creating new callsite clones
4617 // of the other callsites in that function that are reached by those
4618 // callers. Clone the other callees of the current callsite's caller
4619 // that were already assigned to PreviousAssignedFuncClone
4620 // accordingly. This is important since we subsequently update the
4621 // calls from the nodes in the graph and their assignments to callee
4622 // functions recorded in CallsiteToCalleeFuncCloneMap.
4623 // The none type edge removal may remove some of this caller's
4624 // callee edges, if it is reached via another of its callees.
4625 // Iterate over a copy and skip any that were removed.
4626 auto CalleeEdges = CE->Caller->CalleeEdges;
4627 for (auto CalleeEdge : CalleeEdges) {
4628 // Skip any that have been removed on an earlier iteration when
4629 // cleaning up newly None type callee edges.
4630 if (CalleeEdge->isRemoved()) {
4631 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
4632 continue;
4633 }
4634 assert(CalleeEdge);
4635 ContextNode *Callee = CalleeEdge->Callee;
4636 // Skip the current callsite, we are looking for other
4637 // callsites Caller calls, as well as any that does not have a
4638 // recorded callsite Call.
4639 if (Callee == Clone || !Callee->hasCall())
4640 continue;
4641 // Skip direct recursive calls. We don't need/want to clone the
4642 // caller node again, and this loop will not behave as expected if
4643 // we tried.
4644 if (Callee == CalleeEdge->Caller)
4645 continue;
4646 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge: CalleeEdge);
4647 removeNoneTypeCalleeEdges(Node: NewClone);
4648 // Moving the edge may have resulted in some none type
4649 // callee edges on the original Callee.
4650 removeNoneTypeCalleeEdges(Node: Callee);
4651 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4652 // If the Callee node was already assigned to call a specific
4653 // function version, make sure its new clone is assigned to call
4654 // that same function clone.
4655 if (CallsiteToCalleeFuncCloneMap.count(Callee))
4656 RecordCalleeFuncOfCallsite(
4657 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
4658 // Update NewClone with the new Call clone of this callsite's Call
4659 // created for the new function clone created earlier.
4660 // Recall that we have already ensured when building the graph
4661 // that each caller can only call callsites within the same
4662 // function, so we are guaranteed that Callee Call is in the
4663 // current OrigFunc.
4664 // CallMap is set up as indexed by original Call at clone 0.
4665 CallInfo OrigCall(Callee->getOrigNode()->Call);
4666 OrigCall.setCloneNo(0);
4667 std::map<CallInfo, CallInfo> &CallMap =
4668 FuncClonesToCallMap[NewFuncClone];
4669 assert(CallMap.count(OrigCall));
4670 CallInfo NewCall(CallMap[OrigCall]);
4671 assert(NewCall);
4672 NewClone->setCall(NewCall);
4673 // Need to do the same for all matching calls.
4674 for (auto &MatchingCall : NewClone->MatchingCalls) {
4675 CallInfo OrigMatchingCall(MatchingCall);
4676 OrigMatchingCall.setCloneNo(0);
4677 assert(CallMap.count(OrigMatchingCall));
4678 CallInfo NewCall(CallMap[OrigMatchingCall]);
4679 assert(NewCall);
4680 // Updates the call in the list.
4681 MatchingCall = NewCall;
4682 }
4683 }
4684 }
4685 // Fall through to handling below to perform the recording of the
4686 // function for this callsite clone. This enables handling of cases
4687 // where the callers were assigned to different clones of a function.
4688 }
4689
4690 // See if we can use existing function clone. Walk through
4691 // all caller edges to see if any have already been assigned to
4692 // a clone of this callsite's function. If we can use it, do so. If not,
4693 // because that function clone is already assigned to a different clone
4694 // of this callsite, then we need to clone again.
4695 // Basically, this checking is needed to handle the case where different
4696 // caller functions/callsites may need versions of this function
4697 // containing different mixes of callsite clones across the different
4698 // callsites within the function. If that happens, we need to create
4699 // additional function clones to handle the various combinations.
4700 //
4701 // Keep track of any new clones of this callsite created by the
4702 // following loop, as well as any existing clone that we decided to
4703 // assign this clone to.
4704 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
4705 FuncInfo FuncCloneAssignedToCurCallsiteClone;
4706 // Iterate over a copy of Clone's caller edges, since we may need to
4707 // remove edges in the moveEdgeTo* methods, and this simplifies the
4708 // handling and makes it less error-prone.
4709 auto CloneCallerEdges = Clone->CallerEdges;
4710 for (auto &Edge : CloneCallerEdges) {
4711 // Skip removed edges (due to direct recursive edges updated when
4712 // updating callee edges when moving an edge and subsequently
4713 // removed by call to removeNoneTypeCalleeEdges on the Clone).
4714 if (Edge->isRemoved())
4715 continue;
4716 // Ignore any caller that does not have a recorded callsite Call.
4717 if (!Edge->Caller->hasCall())
4718 continue;
4719 // If this caller already assigned to call a version of OrigFunc, need
4720 // to ensure we can assign this callsite clone to that function clone.
4721 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
4722 FuncInfo FuncCloneCalledByCaller =
4723 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4724 // First we need to confirm that this function clone is available
4725 // for use by this callsite node clone.
4726 //
4727 // While FuncCloneToCurNodeCloneMap is built only for this Node and
4728 // its callsite clones, one of those callsite clones X could have
4729 // been assigned to the same function clone called by Edge's caller
4730 // - if Edge's caller calls another callsite within Node's original
4731 // function, and that callsite has another caller reaching clone X.
4732 // We need to clone Node again in this case.
4733 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
4734 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
4735 Clone) ||
4736 // Detect when we have multiple callers of this callsite that
4737 // have already been assigned to specific, and different, clones
4738 // of OrigFunc (due to other unrelated callsites in Func they
4739 // reach via call contexts). Is this Clone of callsite Node
4740 // assigned to a different clone of OrigFunc? If so, clone Node
4741 // again.
4742 (FuncCloneAssignedToCurCallsiteClone &&
4743 FuncCloneAssignedToCurCallsiteClone !=
4744 FuncCloneCalledByCaller)) {
4745 // We need to use a different newly created callsite clone, in
4746 // order to assign it to another new function clone on a
4747 // subsequent iteration over the Clones array (adjusted below).
4748 // Note we specifically do not reset the
4749 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
4750 // when this new clone is processed later we know which version of
4751 // the function to copy (so that other callsite clones we have
4752 // assigned to that function clone are properly cloned over). See
4753 // comments in the function cloning handling earlier.
4754
4755 // Check if we already have cloned this callsite again while
4756 // walking through caller edges, for a caller calling the same
4757 // function clone. If so, we can move this edge to that new clone
4758 // rather than creating yet another new clone.
4759 if (FuncCloneToNewCallsiteCloneMap.count(
4760 FuncCloneCalledByCaller)) {
4761 ContextNode *NewClone =
4762 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
4763 moveEdgeToExistingCalleeClone(Edge, NewCallee: NewClone);
4764 // Cleanup any none type edges cloned over.
4765 removeNoneTypeCalleeEdges(Node: NewClone);
4766 } else {
4767 // Create a new callsite clone.
4768 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4769 removeNoneTypeCalleeEdges(Node: NewClone);
4770 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
4771 NewClone;
4772 // Add to list of clones and process later.
4773 ClonesWorklist.push_back(NewClone);
4774 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4775 }
4776 // Moving the caller edge may have resulted in some none type
4777 // callee edges.
4778 removeNoneTypeCalleeEdges(Node: Clone);
4779 // We will handle the newly created callsite clone in a subsequent
4780 // iteration over this Node's Clones.
4781 continue;
4782 }
4783
4784 // Otherwise, we can use the function clone already assigned to this
4785 // caller.
4786 if (!FuncCloneAssignedToCurCallsiteClone) {
4787 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
4788 // Assign Clone to FuncCloneCalledByCaller
4789 AssignCallsiteCloneToFuncClone(
4790 FuncCloneCalledByCaller, Call, Clone,
4791 AllocationCallToContextNodeMap.count(Call));
4792 } else
4793 // Don't need to do anything - callsite is already calling this
4794 // function clone.
4795 assert(FuncCloneAssignedToCurCallsiteClone ==
4796 FuncCloneCalledByCaller);
4797
4798 } else {
4799 // We have not already assigned this caller to a version of
4800 // OrigFunc. Do the assignment now.
4801
4802 // First check if we have already assigned this callsite clone to a
4803 // clone of OrigFunc for another caller during this iteration over
4804 // its caller edges.
4805 if (!FuncCloneAssignedToCurCallsiteClone) {
4806 // Find first function in FuncClonesToCallMap without an assigned
4807 // clone of this callsite Node. We should always have one
4808 // available at this point due to the earlier cloning when the
4809 // FuncClonesToCallMap size was smaller than the clone number.
4810 for (auto &CF : FuncClonesToCallMap) {
4811 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
4812 FuncCloneAssignedToCurCallsiteClone = CF.first;
4813 break;
4814 }
4815 }
4816 assert(FuncCloneAssignedToCurCallsiteClone);
4817 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
4818 AssignCallsiteCloneToFuncClone(
4819 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4820 AllocationCallToContextNodeMap.count(Call));
4821 } else
4822 assert(FuncCloneToCurNodeCloneMap
4823 [FuncCloneAssignedToCurCallsiteClone] == Clone);
4824 // Update callers to record function version called.
4825 RecordCalleeFuncOfCallsite(Edge->Caller,
4826 FuncCloneAssignedToCurCallsiteClone);
4827 }
4828 }
4829 }
4830 if (VerifyCCG) {
4831 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
4832 for (const auto &PE : Node->CalleeEdges)
4833 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4834 for (const auto &CE : Node->CallerEdges)
4835 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4836 for (auto *Clone : Node->Clones) {
4837 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4838 for (const auto &PE : Clone->CalleeEdges)
4839 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4840 for (const auto &CE : Clone->CallerEdges)
4841 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4842 }
4843 }
4844 }
4845 }
4846
4847 uint8_t BothTypes =
4848 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
4849
4850 auto UpdateCalls = [&](ContextNode *Node,
4851 DenseSet<const ContextNode *> &Visited,
4852 auto &&UpdateCalls) {
4853 auto Inserted = Visited.insert(Node);
4854 if (!Inserted.second)
4855 return;
4856
4857 for (auto *Clone : Node->Clones)
4858 UpdateCalls(Clone, Visited, UpdateCalls);
4859
4860 for (auto &Edge : Node->CallerEdges)
4861 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
4862
4863 // Skip if either no call to update, or if we ended up with no context ids
4864 // (we moved all edges onto other clones).
4865 if (!Node->hasCall() || Node->emptyContextIds())
4866 return;
4867
4868 if (Node->IsAllocation) {
4869 auto AT = allocTypeToUse(Node->AllocTypes);
4870 // If the allocation type is ambiguous, and more aggressive hinting
4871 // has been enabled via the MinClonedColdBytePercent flag, see if this
4872 // allocation should be hinted cold anyway because its fraction cold bytes
4873 // allocated is at least the given threshold.
4874 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
4875 !ContextIdToContextSizeInfos.empty()) {
4876 uint64_t TotalCold = 0;
4877 uint64_t Total = 0;
4878 for (auto Id : Node->getContextIds()) {
4879 auto TypeI = ContextIdToAllocationType.find(Id);
4880 assert(TypeI != ContextIdToAllocationType.end());
4881 auto CSI = ContextIdToContextSizeInfos.find(Id);
4882 if (CSI != ContextIdToContextSizeInfos.end()) {
4883 for (auto &Info : CSI->second) {
4884 Total += Info.TotalSize;
4885 if (TypeI->second == AllocationType::Cold)
4886 TotalCold += Info.TotalSize;
4887 }
4888 }
4889 }
4890 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
4891 AT = AllocationType::Cold;
4892 }
4893 updateAllocationCall(Call&: Node->Call, AllocType: AT);
4894 assert(Node->MatchingCalls.empty());
4895 return;
4896 }
4897
4898 if (!CallsiteToCalleeFuncCloneMap.count(Node))
4899 return;
4900
4901 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
4902 updateCall(CallerCall&: Node->Call, CalleeFunc);
4903 // Update all the matching calls as well.
4904 for (auto &Call : Node->MatchingCalls)
4905 updateCall(CallerCall&: Call, CalleeFunc);
4906 };
4907
4908 // Performs DFS traversal starting from allocation nodes to update calls to
4909 // reflect cloning decisions recorded earlier. For regular LTO this will
4910 // update the actual calls in the IR to call the appropriate function clone
4911 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
4912 // are recorded in the summary entries.
4913 DenseSet<const ContextNode *> Visited;
4914 for (auto &Entry : AllocationCallToContextNodeMap)
4915 UpdateCalls(Entry.second, Visited, UpdateCalls);
4916
4917 return Changed;
4918}
4919
4920static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
4921 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
4922 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
4923 &FuncToAliasMap) {
4924 // The first "clone" is the original copy, we should only call this if we
4925 // needed to create new clones.
4926 assert(NumClones > 1);
4927 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
4928 VMaps.reserve(N: NumClones - 1);
4929 FunctionsClonedThinBackend++;
4930 for (unsigned I = 1; I < NumClones; I++) {
4931 VMaps.emplace_back(Args: std::make_unique<ValueToValueMapTy>());
4932 auto *NewF = CloneFunction(F: &F, VMap&: *VMaps.back());
4933 FunctionClonesThinBackend++;
4934 // Strip memprof and callsite metadata from clone as they are no longer
4935 // needed.
4936 for (auto &BB : *NewF) {
4937 for (auto &Inst : BB) {
4938 Inst.setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
4939 Inst.setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
4940 }
4941 }
4942 std::string Name = getMemProfFuncName(Base: F.getName(), CloneNo: I);
4943 auto *PrevF = M.getFunction(Name);
4944 if (PrevF) {
4945 // We might have created this when adjusting callsite in another
4946 // function. It should be a declaration.
4947 assert(PrevF->isDeclaration());
4948 NewF->takeName(V: PrevF);
4949 PrevF->replaceAllUsesWith(V: NewF);
4950 PrevF->eraseFromParent();
4951 } else
4952 NewF->setName(Name);
4953 if (auto *SP = NewF->getSubprogram())
4954 SP->replaceLinkageName(
4955 LN: MDString::get(Context&: NewF->getParent()->getContext(), Str: Name));
4956 ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
4957 << "created clone " << ore::NV("NewFunction", NewF));
4958
4959 // Now handle aliases to this function, and clone those as well.
4960 if (!FuncToAliasMap.count(x: &F))
4961 continue;
4962 for (auto *A : FuncToAliasMap[&F]) {
4963 std::string Name = getMemProfFuncName(Base: A->getName(), CloneNo: I);
4964 auto *PrevA = M.getNamedAlias(Name);
4965 auto *NewA = GlobalAlias::create(Ty: A->getValueType(),
4966 AddressSpace: A->getType()->getPointerAddressSpace(),
4967 Linkage: A->getLinkage(), Name, Aliasee: NewF);
4968 NewA->copyAttributesFrom(Src: A);
4969 if (PrevA) {
4970 // We might have created this when adjusting callsite in another
4971 // function. It should be a declaration.
4972 assert(PrevA->isDeclaration());
4973 NewA->takeName(V: PrevA);
4974 PrevA->replaceAllUsesWith(V: NewA);
4975 PrevA->eraseFromParent();
4976 }
4977 }
4978 }
4979 return VMaps;
4980}
4981
4982// Locate the summary for F. This is complicated by the fact that it might
4983// have been internalized or promoted.
4984static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
4985 const ModuleSummaryIndex *ImportSummary,
4986 const Function *CallingFunc = nullptr) {
4987 // FIXME: Ideally we would retain the original GUID in some fashion on the
4988 // function (e.g. as metadata), but for now do our best to locate the
4989 // summary without that information.
4990 ValueInfo TheFnVI = ImportSummary->getValueInfo(GUID: F.getGUID());
4991 if (!TheFnVI)
4992 // See if theFn was internalized, by checking index directly with
4993 // original name (this avoids the name adjustment done by getGUID() for
4994 // internal symbols).
4995 TheFnVI = ImportSummary->getValueInfo(
4996 GUID: GlobalValue::getGUIDAssumingExternalLinkage(GlobalName: F.getName()));
4997 if (TheFnVI)
4998 return TheFnVI;
4999 // Now query with the original name before any promotion was performed.
5000 StringRef OrigName =
5001 ModuleSummaryIndex::getOriginalNameBeforePromote(Name: F.getName());
5002 // When this pass is enabled, we always add thinlto_src_file provenance
5003 // metadata to imported function definitions, which allows us to recreate the
5004 // original internal symbol's GUID.
5005 auto SrcFileMD = F.getMetadata(Kind: "thinlto_src_file");
5006 // If this is a call to an imported/promoted local for which we didn't import
5007 // the definition, the metadata will not exist on the declaration. However,
5008 // since we are doing this early, before any inlining in the LTO backend, we
5009 // can simply look at the metadata on the calling function which must have
5010 // been from the same module if F was an internal symbol originally.
5011 if (!SrcFileMD && F.isDeclaration()) {
5012 // We would only call this for a declaration for a direct callsite, in which
5013 // case the caller would have provided the calling function pointer.
5014 assert(CallingFunc);
5015 SrcFileMD = CallingFunc->getMetadata(Kind: "thinlto_src_file");
5016 // If this is a promoted local (OrigName != F.getName()), since this is a
5017 // declaration, it must be imported from a different module and therefore we
5018 // should always find the metadata on its calling function. Any call to a
5019 // promoted local that came from this module should still be a definition.
5020 assert(SrcFileMD || OrigName == F.getName());
5021 }
5022 StringRef SrcFile = M.getSourceFileName();
5023 if (SrcFileMD)
5024 SrcFile = dyn_cast<MDString>(Val: SrcFileMD->getOperand(I: 0))->getString();
5025 std::string OrigId = GlobalValue::getGlobalIdentifier(
5026 Name: OrigName, Linkage: GlobalValue::InternalLinkage, FileName: SrcFile);
5027 TheFnVI = ImportSummary->getValueInfo(
5028 GUID: GlobalValue::getGUIDAssumingExternalLinkage(GlobalName: OrigId));
5029 // Internal func in original module may have gotten a numbered suffix if we
5030 // imported an external function with the same name. This happens
5031 // automatically during IR linking for naming conflicts. It would have to
5032 // still be internal in that case (otherwise it would have been renamed on
5033 // promotion in which case we wouldn't have a naming conflict).
5034 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5035 F.getName().contains(C: '.')) {
5036 OrigName = F.getName().rsplit(Separator: '.').first;
5037 OrigId = GlobalValue::getGlobalIdentifier(
5038 Name: OrigName, Linkage: GlobalValue::InternalLinkage, FileName: SrcFile);
5039 TheFnVI = ImportSummary->getValueInfo(
5040 GUID: GlobalValue::getGUIDAssumingExternalLinkage(GlobalName: OrigId));
5041 }
5042 // The only way we may not have a VI is if this is a declaration created for
5043 // an imported reference. For distributed ThinLTO we may not have a VI for
5044 // such declarations in the distributed summary.
5045 assert(TheFnVI || F.isDeclaration());
5046 return TheFnVI;
5047}
5048
5049bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5050 Module &M) {
5051 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5052 Symtab = std::make_unique<InstrProfSymtab>();
5053 // Don't add canonical names, to avoid multiple functions to the symtab
5054 // when they both have the same root name with "." suffixes stripped.
5055 // If we pick the wrong one then this could lead to incorrect ICP and calling
5056 // a memprof clone that we don't actually create (resulting in linker unsats).
5057 // What this means is that the GUID of the function (or its PGOFuncName
5058 // metadata) *must* match that in the VP metadata to allow promotion.
5059 // In practice this should not be a limitation, since local functions should
5060 // have PGOFuncName metadata and global function names shouldn't need any
5061 // special handling (they should not get the ".llvm.*" suffix that the
5062 // canonicalization handling is attempting to strip).
5063 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5064 std::string SymtabFailure = toString(E: std::move(E));
5065 M.getContext().emitError(ErrorStr: "Failed to create symtab: " + SymtabFailure);
5066 return false;
5067 }
5068 return true;
5069}
5070
#ifndef NDEBUG
// Sanity check that the MIB stack ids match between the summary and
// instruction metadata. Each MIB metadata operand on the allocation is
// compared, frame by frame, against the corresponding summary MIB record's
// stack id indices (resolved through ImportSummary).
static void checkAllocContextIds(
    const AllocInfo &AllocNode, const MDNode *MemProfMD,
    const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
    const ModuleSummaryIndex *ImportSummary) {
  auto MIBIter = AllocNode.MIBs.begin();
  // The MIB metadata operands and the summary MIB records are expected to be
  // in the same order, one summary record per metadata operand.
  for (auto &MDOp : MemProfMD->operands()) {
    assert(MIBIter != AllocNode.MIBs.end());
    auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
    auto *MIBMD = cast<const MDNode>(MDOp);
    MDNode *StackMDNode = getMIBStackNode(MIBMD);
    assert(StackMDNode);
    CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
    // Start past the portion of the MIB context shared with the callsite
    // metadata context.
    auto ContextIterBegin =
        StackContext.beginAfterSharedPrefix(CallsiteContext);
    // Skip the checking on the first iteration: seed LastStackContextId with
    // a value guaranteed to differ from the first frame's id, so the
    // direct-recursion skip below never triggers on the first frame.
    uint64_t LastStackContextId =
        (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
                                                                           : 0;
    for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
         ++ContextIter) {
      // If this is a direct recursion, simply skip the duplicate
      // entries, to be consistent with how the summary ids were
      // generated during ModuleSummaryAnalysis.
      if (LastStackContextId == *ContextIter)
        continue;
      LastStackContextId = *ContextIter;
      assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
      assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
             *ContextIter);
      StackIdIndexIter++;
    }
    MIBIter++;
  }
}
#endif
5109
// Apply the cloning decisions recorded in the import summary (made during the
// thin link) to this module's IR: create function clones, redirect calls in
// each clone to the assigned callee clones, attach memprof allocation-type
// attributes, record/perform any needed indirect call promotion, and finally
// strip the memprof/callsite metadata. Returns true if the module changed.
bool MemProfContextDisambiguation::applyImport(Module &M) {
  assert(ImportSummary);
  bool Changed = false;

  // We also need to clone any aliases that reference cloned functions, because
  // the modified callsites may invoke via the alias. Keep track of the aliases
  // for each function.
  std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
      FuncToAliasMap;
  for (auto &A : M.aliases()) {
    auto *Aliasee = A.getAliaseeObject();
    if (auto *F = dyn_cast<Function>(Val: Aliasee))
      FuncToAliasMap[F].insert(Ptr: &A);
  }

  if (!initializeIndirectCallPromotionInfo(M))
    return false;

  for (auto &F : M) {
    if (F.isDeclaration() || isMemProfClone(F))
      continue;

    OptimizationRemarkEmitter ORE(&F);

    // VMaps[J-1] maps original values to their copies in clone J (clone 0 is
    // the original function and has no VMap).
    SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
    bool ClonesCreated = false;
    unsigned NumClonesCreated = 0;
    // Lazily clone F the first time a summary record requires more than one
    // version; subsequent calls just validate the clone count is consistent.
    auto CloneFuncIfNeeded = [&](unsigned NumClones) {
      // We should at least have version 0 which is the original copy.
      assert(NumClones > 0);
      // If only one copy needed use original.
      if (NumClones == 1)
        return;
      // If we already performed cloning of this function, confirm that the
      // requested number of clones matches (the thin link should ensure the
      // number of clones for each constituent callsite is consistent within
      // each function), before returning.
      if (ClonesCreated) {
        assert(NumClonesCreated == NumClones);
        return;
      }
      VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
      // The first "clone" is the original copy, which doesn't have a VMap.
      assert(VMaps.size() == NumClones - 1);
      Changed = true;
      ClonesCreated = true;
      NumClonesCreated = NumClones;
    };

    // Redirect this callsite (and its copies in each function clone) to the
    // callee clones recorded in the summary StackNode.
    auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
                             Function *CalledFunction) {
      // Perform cloning if not yet done.
      CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());

      assert(!isMemProfClone(*CalledFunction));

      // Because we update the cloned calls by calling setCalledOperand (see
      // comment below), out of an abundance of caution make sure the called
      // function was actually the called operand (or its aliasee). We also
      // strip pointer casts when looking for calls (to match behavior during
      // summary generation), however, with opaque pointers in theory this
      // should not be an issue. Note we still clone the current function
      // (containing this call) above, as that could be needed for its callers.
      auto *GA = dyn_cast_or_null<GlobalAlias>(Val: CB->getCalledOperand());
      if (CalledFunction != CB->getCalledOperand() &&
          (!GA || CalledFunction != GA->getAliaseeObject())) {
        SkippedCallsCloning++;
        return;
      }
      // Update the calls per the summary info.
      // Save orig name since it gets updated in the first iteration
      // below.
      auto CalleeOrigName = CalledFunction->getName();
      for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
        // Do nothing if this version calls the original version of its
        // callee.
        if (!StackNode.Clones[J])
          continue;
        auto NewF = M.getOrInsertFunction(
            Name: getMemProfFuncName(Base: CalleeOrigName, CloneNo: StackNode.Clones[J]),
            T: CalledFunction->getFunctionType());
        CallBase *CBClone;
        // Copy 0 is the original function.
        if (!J)
          CBClone = CB;
        else
          CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
        // Set the called operand directly instead of calling setCalledFunction,
        // as the latter mutates the function type on the call. In rare cases
        // we may have a slightly different type on a callee function
        // declaration due to it being imported from a different module with
        // incomplete types. We really just want to change the name of the
        // function to the clone, and not make any type changes.
        CBClone->setCalledOperand(NewF.getCallee());
        ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
                 << ore::NV("Call", CBClone) << " in clone "
                 << ore::NV("Caller", CBClone->getFunction())
                 << " assigned to call function clone "
                 << ore::NV("Callee", NewF.getCallee()));
      }
    };

    // Locate the summary for F.
    ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
    // If not found, this could be an imported local (see comment in
    // findValueInfoForFunc). Skip for now as it will be cloned in its original
    // module (where it would have been promoted to global scope so should
    // satisfy any reference in this module).
    if (!TheFnVI)
      continue;

    auto *GVSummary =
        ImportSummary->findSummaryInModule(VI: TheFnVI, ModuleId: M.getModuleIdentifier());
    if (!GVSummary) {
      // Must have been imported, use the summary which matches the definition
      // (might be multiple if this was a linkonce_odr).
      auto SrcModuleMD = F.getMetadata(Kind: "thinlto_src_module");
      assert(SrcModuleMD &&
             "enable-import-metadata is needed to emit thinlto_src_module");
      StringRef SrcModule =
          dyn_cast<MDString>(Val: SrcModuleMD->getOperand(I: 0))->getString();
      for (auto &GVS : TheFnVI.getSummaryList()) {
        if (GVS->modulePath() == SrcModule) {
          GVSummary = GVS.get();
          break;
        }
      }
      assert(GVSummary && GVSummary->modulePath() == SrcModule);
    }

    // If this was an imported alias skip it as we won't have the function
    // summary, and it should be cloned in the original module.
    if (isa<AliasSummary>(Val: GVSummary))
      continue;

    auto *FS = cast<FunctionSummary>(Val: GVSummary->getBaseObject());

    if (FS->allocs().empty() && FS->callsites().empty())
      continue;

    // These iterators are advanced in step with the IR walk below; the
    // instruction order is assumed to match the summary record order.
    auto SI = FS->callsites().begin();
    auto AI = FS->allocs().begin();

    // To handle callsite infos synthesized for tail calls which have missing
    // frames in the profiled context, map callee VI to the synthesized callsite
    // info.
    DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
    // Iterate the callsites for this function in reverse, since we place all
    // those synthesized for tail calls at the end.
    for (auto CallsiteIt = FS->callsites().rbegin();
         CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
      auto &Callsite = *CallsiteIt;
      // Stop as soon as we see a non-synthesized callsite info (see comment
      // above loop). All the entries added for discovered tail calls have empty
      // stack ids.
      if (!Callsite.StackIdIndices.empty())
        break;
      MapTailCallCalleeVIToCallsite.insert(KV: {Callsite.Callee, Callsite});
    }

    // Keeps track of needed ICP for the function.
    SmallVector<ICallAnalysisData> ICallAnalysisInfo;

    // Assume for now that the instructions are in the exact same order
    // as when the summary was created, but confirm this is correct by
    // matching the stack ids.
    for (auto &BB : F) {
      for (auto &I : BB) {
        auto *CB = dyn_cast<CallBase>(Val: &I);
        // Same handling as when creating module summary.
        if (!mayHaveMemprofSummary(CB))
          continue;

        auto *CalledValue = CB->getCalledOperand();
        auto *CalledFunction = CB->getCalledFunction();
        if (CalledValue && !CalledFunction) {
          CalledValue = CalledValue->stripPointerCasts();
          // Stripping pointer casts can reveal a called function.
          CalledFunction = dyn_cast<Function>(Val: CalledValue);
        }
        // Check if this is an alias to a function. If so, get the
        // called aliasee for the checks below.
        if (auto *GA = dyn_cast<GlobalAlias>(Val: CalledValue)) {
          assert(!CalledFunction &&
                 "Expected null called function in callsite for alias");
          CalledFunction = dyn_cast<Function>(Val: GA->getAliaseeObject());
        }

        CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
            I.getMetadata(KindID: LLVMContext::MD_callsite));
        auto *MemProfMD = I.getMetadata(KindID: LLVMContext::MD_memprof);

        // Include allocs that were already assigned a memprof function
        // attribute in the statistics.
        if (CB->getAttributes().hasFnAttr(Kind: "memprof")) {
          assert(!MemProfMD);
          CB->getAttributes().getFnAttr(Kind: "memprof").getValueAsString() == "cold"
              ? AllocTypeColdThinBackend++
              : AllocTypeNotColdThinBackend++;
          OrigAllocsThinBackend++;
          AllocVersionsThinBackend++;
          if (!MaxAllocVersionsThinBackend)
            MaxAllocVersionsThinBackend = 1;
          continue;
        }

        if (MemProfMD) {
          // Consult the next alloc node.
          assert(AI != FS->allocs().end());
          auto &AllocNode = *(AI++);

#ifndef NDEBUG
          checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
                               ImportSummary);
#endif

          // Perform cloning if not yet done.
          CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());

          OrigAllocsThinBackend++;
          AllocVersionsThinBackend += AllocNode.Versions.size();
          if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
            MaxAllocVersionsThinBackend = AllocNode.Versions.size();

          // If there is only one version that means we didn't end up
          // considering this function for cloning, and in that case the alloc
          // will still be none type or should have gotten the default NotCold.
          // Skip that after calling clone helper since that does some sanity
          // checks that confirm we haven't decided yet that we need cloning.
          // We might have a single version that is cold due to the
          // MinClonedColdBytePercent heuristic, make sure we don't skip in that
          // case.
          if (AllocNode.Versions.size() == 1 &&
              (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
            assert((AllocationType)AllocNode.Versions[0] ==
                       AllocationType::NotCold ||
                   (AllocationType)AllocNode.Versions[0] ==
                       AllocationType::None);
            UnclonableAllocsThinBackend++;
            continue;
          }

          // All versions should have a singular allocation type.
          assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
            return Type == ((uint8_t)AllocationType::NotCold |
                            (uint8_t)AllocationType::Cold);
          }));

          // Update the allocation types per the summary info.
          for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
            // Ignore any that didn't get an assigned allocation type.
            if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
              continue;
            AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
            AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
                                            : AllocTypeNotColdThinBackend++;
            std::string AllocTypeString = getAllocTypeAttributeString(Type: AllocTy);
            auto A = llvm::Attribute::get(Context&: F.getContext(), Kind: "memprof",
                                          Val: AllocTypeString);
            CallBase *CBClone;
            // Copy 0 is the original function.
            if (!J)
              CBClone = CB;
            else
              // Since VMaps are only created for new clones, we index with
              // clone J-1 (J==0 is the original clone and does not have a VMaps
              // entry).
              CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
            CBClone->addFnAttr(Attr: A);
            ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
                     << ore::NV("AllocationCall", CBClone) << " in clone "
                     << ore::NV("Caller", CBClone->getFunction())
                     << " marked with memprof allocation attribute "
                     << ore::NV("Attribute", AllocTypeString));
          }
        } else if (!CallsiteContext.empty()) {
          if (!CalledFunction) {
#ifndef NDEBUG
            // We should have skipped inline assembly calls.
            auto *CI = dyn_cast<CallInst>(CB);
            assert(!CI || !CI->isInlineAsm());
#endif
            // We should have skipped direct calls via a Constant.
            assert(CalledValue && !isa<Constant>(CalledValue));

            // This is an indirect call, see if we have profile information and
            // whether any clones were recorded for the profiled targets (that
            // we synthesized CallsiteInfo summary records for when building the
            // index).
            auto NumClones =
                recordICPInfo(CB, AllCallsites: FS->callsites(), SI, ICallAnalysisInfo);

            // Perform cloning if not yet done. This is done here in case
            // we don't need to do ICP, but might need to clone this
            // function as it is the target of other cloned calls.
            if (NumClones)
              CloneFuncIfNeeded(NumClones);
          }

          else {
            // Consult the next callsite node.
            assert(SI != FS->callsites().end());
            auto &StackNode = *(SI++);

#ifndef NDEBUG
            // Sanity check that the stack ids match between the summary and
            // instruction metadata.
            auto StackIdIndexIter = StackNode.StackIdIndices.begin();
            for (auto StackId : CallsiteContext) {
              assert(StackIdIndexIter != StackNode.StackIdIndices.end());
              assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
                     StackId);
              StackIdIndexIter++;
            }
#endif

            CloneCallsite(StackNode, CB, CalledFunction);
          }
        } else if (CB->isTailCall() && CalledFunction) {
          // Locate the synthesized callsite info for the callee VI, if any was
          // created, and use that for cloning.
          ValueInfo CalleeVI =
              findValueInfoForFunc(F: *CalledFunction, M, ImportSummary, CallingFunc: &F);
          if (CalleeVI && MapTailCallCalleeVIToCallsite.count(Val: CalleeVI)) {
            auto Callsite = MapTailCallCalleeVIToCallsite.find(Val: CalleeVI);
            assert(Callsite != MapTailCallCalleeVIToCallsite.end());
            CloneCallsite(Callsite->second, CB, CalledFunction);
          }
        }
      }
    }

    // Now do any promotion required for cloning.
    performICP(M, AllCallsites: FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
  }

  // We skip some of the functions and instructions above, so remove all the
  // metadata in a single sweep here.
  for (auto &F : M) {
    // We can skip memprof clones because createFunctionClones already strips
    // the metadata from the newly created clones.
    if (F.isDeclaration() || isMemProfClone(F))
      continue;
    for (auto &BB : F) {
      for (auto &I : BB) {
        if (!isa<CallBase>(Val: I))
          continue;
        I.setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
        I.setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
      }
    }
  }

  return Changed;
}
5465
// For the indirect call CB, match its profiled target candidates (from VP
// metadata) against the CallsiteInfo summary records starting at SI (one
// synthesized record per candidate), and record the information needed to
// perform ICP later if any candidate's callsite was assigned a callee clone.
// Advances SI past the consumed records. Returns the number of clones of the
// containing function per those records (0 if there is no profile data).
unsigned MemProfContextDisambiguation::recordICPInfo(
    CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
    ArrayRef<CallsiteInfo>::iterator &SI,
    SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
  // First see if we have profile information for this indirect call.
  uint32_t NumCandidates;
  uint64_t TotalCount;
  auto CandidateProfileData =
      ICallAnalysis->getPromotionCandidatesForInstruction(I: CB, TotalCount,
                                                          NumCandidates);
  if (CandidateProfileData.empty())
    return 0;

  // Iterate through all of the candidate profiled targets along with the
  // CallsiteInfo summary records synthesized for them when building the index,
  // and see if any are cloned and/or refer to clones.
  bool ICPNeeded = false;
  unsigned NumClones = 0;
  size_t CallsiteInfoStartIndex = std::distance(first: AllCallsites.begin(), last: SI);
  for (const auto &Candidate : CandidateProfileData) {
    // The declaration is only needed for the assert below, so it is guarded
    // by NDEBUG; in release builds the initializer below becomes a discarded
    // expression statement.
#ifndef NDEBUG
    auto CalleeValueInfo =
#endif
        ImportSummary->getValueInfo(GUID: Candidate.Value);
    // We might not have a ValueInfo if this is a distributed
    // ThinLTO backend and decided not to import that function.
    assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
    assert(SI != AllCallsites.end());
    auto &StackNode = *(SI++);
    // See if any of the clones of the indirect callsite for this
    // profiled target should call a cloned version of the profiled
    // target. We only need to do the ICP here if so.
    ICPNeeded |= llvm::any_of(Range: StackNode.Clones,
                              P: [](unsigned CloneNo) { return CloneNo != 0; });
    // Every callsite in the same function should have been cloned the same
    // number of times.
    assert(!NumClones || NumClones == StackNode.Clones.size());
    NumClones = StackNode.Clones.size();
  }
  if (!ICPNeeded)
    return NumClones;
  // Save information for ICP, which is performed later to avoid messing up the
  // current function traversal.
  ICallAnalysisInfo.push_back(Elt: {.CB: CB, .CandidateProfileData: CandidateProfileData.vec(), .NumCandidates: NumCandidates,
                              .TotalCount: TotalCount, .CallsiteInfoStartIndex: CallsiteInfoStartIndex});
  return NumClones;
}
5513
// Perform the deferred indirect call promotion recorded by recordICPInfo:
// for each saved callsite, speculatively devirtualize every clone of the call
// towards each viable profiled target, redirect the resulting direct call to
// the assigned target clone, and fix up the remaining value-profile metadata.
void MemProfContextDisambiguation::performICP(
    Module &M, ArrayRef<CallsiteInfo> AllCallsites,
    ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
    ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
    OptimizationRemarkEmitter &ORE) {
  // Now do any promotion required for cloning. Specifically, for each
  // recorded ICP candidate (which was only recorded because one clone of that
  // candidate should call a cloned target), we perform ICP (speculative
  // devirtualization) for each clone of the callsite, and update its callee
  // to the appropriate clone. Note that the ICP compares against the original
  // version of the target, which is what is in the vtable.
  for (auto &Info : ICallAnalysisInfo) {
    auto *CB = Info.CB;
    auto CallsiteIndex = Info.CallsiteInfoStartIndex;
    auto TotalCount = Info.TotalCount;
    unsigned NumPromoted = 0;
    unsigned NumClones = 0;

    for (auto &Candidate : Info.CandidateProfileData) {
      // Consume the next summary record for this candidate (records were laid
      // out contiguously starting at CallsiteInfoStartIndex).
      auto &StackNode = AllCallsites[CallsiteIndex++];

      // All calls in the same function must have the same number of clones.
      assert(!NumClones || NumClones == StackNode.Clones.size());
      NumClones = StackNode.Clones.size();

      // See if the target is in the module. If it wasn't imported, it is
      // possible that this profile could have been collected on a different
      // target (or version of the code), and we need to be conservative
      // (similar to what is done in the ICP pass).
      Function *TargetFunction = Symtab->getFunction(FuncMD5Hash: Candidate.Value);
      if (TargetFunction == nullptr ||
          // Any ThinLTO global dead symbol removal should have already
          // occurred, so it should be safe to promote when the target is a
          // declaration.
          // TODO: Remove internal option once more fully tested.
          (MemProfRequireDefinitionForPromotion &&
           TargetFunction->isDeclaration())) {
        ORE.emit(RemarkBuilder: [&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
                 << "Memprof cannot promote indirect call: target with md5sum "
                 << ore::NV("target md5sum", Candidate.Value) << " not found";
        });
        // FIXME: See if we can use the new declaration importing support to
        // at least get the declarations imported for this case. Hot indirect
        // targets should have been imported normally, however.
        continue;
      }

      // Check if legal to promote
      const char *Reason = nullptr;
      if (!isLegalToPromote(CB: *CB, Callee: TargetFunction, FailureReason: &Reason)) {
        ORE.emit(RemarkBuilder: [&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
                 << "Memprof cannot promote indirect call to "
                 << ore::NV("TargetFunction", TargetFunction)
                 << " with count of " << ore::NV("TotalCount", TotalCount)
                 << ": " << Reason;
        });
        continue;
      }

      assert(!isMemProfClone(*TargetFunction));

      // Handle each call clone, applying ICP so that each clone directly
      // calls the specified callee clone, guarded by the appropriate ICP
      // check.
      CallBase *CBClone = CB;
      for (unsigned J = 0; J < NumClones; J++) {
        // Copy 0 is the original function.
        if (J > 0)
          CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
        // We do the promotion using the original name, so that the comparison
        // is against the name in the vtable. Then just below, change the new
        // direct call to call the cloned function.
        auto &DirectCall =
            pgo::promoteIndirectCall(CB&: *CBClone, F: TargetFunction, Count: Candidate.Count,
                                     TotalCount, AttachProfToDirectCall: isSamplePGO, ORE: &ORE);
        auto *TargetToUse = TargetFunction;
        // Call original if this version calls the original version of its
        // callee.
        if (StackNode.Clones[J]) {
          TargetToUse =
              cast<Function>(Val: M.getOrInsertFunction(
                                    Name: getMemProfFuncName(Base: TargetFunction->getName(),
                                                       CloneNo: StackNode.Clones[J]),
                                    T: TargetFunction->getFunctionType())
                                 .getCallee());
        }
        DirectCall.setCalledFunction(TargetToUse);
        // During matching we generate synthetic VP metadata for indirect calls
        // not already having any, from the memprof profile's callee GUIDs. If
        // we subsequently promote and inline those callees, we currently lose
        // the ability to generate this synthetic VP metadata. Optionally apply
        // a noinline attribute to promoted direct calls, where the threshold is
        // set to capture synthetic VP metadata targets which get a count of 1.
        if (MemProfICPNoInlineThreshold &&
            Candidate.Count < MemProfICPNoInlineThreshold)
          DirectCall.setIsNoInline();
        ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
                 << ore::NV("Call", CBClone) << " in clone "
                 << ore::NV("Caller", CBClone->getFunction())
                 << " promoted and assigned to call function clone "
                 << ore::NV("Callee", TargetToUse));
      }

      // Update TotalCount (all clones should get same count above)
      TotalCount -= Candidate.Count;
      NumPromoted++;
    }
    // Adjust the MD.prof metadata for all clones, now that we have the new
    // TotalCount and the number promoted.
    CallBase *CBClone = CB;
    for (unsigned J = 0; J < NumClones; J++) {
      // Copy 0 is the original function.
      if (J > 0)
        CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
      // First delete the old one.
      CBClone->setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr);
      // If all promoted, we don't need the MD.prof metadata.
      // Otherwise we need update with the un-promoted records back.
      if (TotalCount != 0)
        annotateValueSite(
            M, Inst&: *CBClone, VDs: ArrayRef(Info.CandidateProfileData).slice(N: NumPromoted),
            Sum: TotalCount, ValueKind: IPVK_IndirectCallTarget, MaxMDCount: Info.NumCandidates);
    }
  }
}
5641
5642template <typename DerivedCCG, typename FuncTy, typename CallTy>
5643bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
5644 if (DumpCCG) {
5645 dbgs() << "CCG before cloning:\n";
5646 dbgs() << *this;
5647 }
5648 if (ExportToDot)
5649 exportToDot(Label: "postbuild");
5650
5651 if (VerifyCCG) {
5652 check();
5653 }
5654
5655 identifyClones();
5656
5657 if (VerifyCCG) {
5658 check();
5659 }
5660
5661 if (DumpCCG) {
5662 dbgs() << "CCG after cloning:\n";
5663 dbgs() << *this;
5664 }
5665 if (ExportToDot)
5666 exportToDot(Label: "cloned");
5667
5668 bool Changed = assignFunctions();
5669
5670 if (DumpCCG) {
5671 dbgs() << "CCG after assigning function clones:\n";
5672 dbgs() << *this;
5673 }
5674 if (ExportToDot)
5675 exportToDot(Label: "clonefuncassign");
5676
5677 if (MemProfReportHintedSizes)
5678 printTotalSizes(OS&: errs());
5679
5680 return Changed;
5681}
5682
5683bool MemProfContextDisambiguation::processModule(
5684 Module &M,
5685 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
5686
5687 // If we have an import summary, then the cloning decisions were made during
5688 // the thin link on the index. Apply them and return.
5689 if (ImportSummary)
5690 return applyImport(M);
5691
5692 // TODO: If/when other types of memprof cloning are enabled beyond just for
5693 // hot and cold, we will need to change this to individually control the
5694 // AllocationType passed to addStackNodesForMIB during CCG construction.
5695 // Note that we specifically check this after applying imports above, so that
5696 // the option isn't needed to be passed to distributed ThinLTO backend
5697 // clang processes, which won't necessarily have visibility into the linker
5698 // dependences. Instead the information is communicated from the LTO link to
5699 // the backends via the combined summary index.
5700 if (!SupportsHotColdNew)
5701 return false;
5702
5703 ModuleCallsiteContextGraph CCG(M, OREGetter);
5704 return CCG.process();
5705}
5706
5707MemProfContextDisambiguation::MemProfContextDisambiguation(
5708 const ModuleSummaryIndex *Summary, bool isSamplePGO)
5709 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
5710 // Check the dot graph printing options once here, to make sure we have valid
5711 // and expected combinations.
5712 if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
5713 llvm::report_fatal_error(
5714 reason: "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
5715 if (DotGraphScope == DotScope::Context &&
5716 !ContextIdForDot.getNumOccurrences())
5717 llvm::report_fatal_error(
5718 reason: "-memprof-dot-scope=context requires -memprof-dot-context-id");
5719 if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
5720 ContextIdForDot.getNumOccurrences())
5721 llvm::report_fatal_error(
5722 reason: "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
5723 "-memprof-dot-context-id");
5724 if (ImportSummary) {
5725 // The MemProfImportSummary should only be used for testing ThinLTO
5726 // distributed backend handling via opt, in which case we don't have a
5727 // summary from the pass pipeline.
5728 assert(MemProfImportSummary.empty());
5729 return;
5730 }
5731 if (MemProfImportSummary.empty())
5732 return;
5733
5734 auto ReadSummaryFile =
5735 errorOrToExpected(EO: MemoryBuffer::getFile(Filename: MemProfImportSummary));
5736 if (!ReadSummaryFile) {
5737 logAllUnhandledErrors(E: ReadSummaryFile.takeError(), OS&: errs(),
5738 ErrorBanner: "Error loading file '" + MemProfImportSummary +
5739 "': ");
5740 return;
5741 }
5742 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(Buffer: **ReadSummaryFile);
5743 if (!ImportSummaryForTestingOrErr) {
5744 logAllUnhandledErrors(E: ImportSummaryForTestingOrErr.takeError(), OS&: errs(),
5745 ErrorBanner: "Error parsing file '" + MemProfImportSummary +
5746 "': ");
5747 return;
5748 }
5749 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
5750 ImportSummary = ImportSummaryForTesting.get();
5751}
5752
5753PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
5754 ModuleAnalysisManager &AM) {
5755 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
5756 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
5757 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: *F);
5758 };
5759 if (!processModule(M, OREGetter))
5760 return PreservedAnalyses::all();
5761 return PreservedAnalyses::none();
5762}
5763
5764void MemProfContextDisambiguation::run(
5765 ModuleSummaryIndex &Index,
5766 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
5767 isPrevailing) {
5768 // TODO: If/when other types of memprof cloning are enabled beyond just for
5769 // hot and cold, we will need to change this to individually control the
5770 // AllocationType passed to addStackNodesForMIB during CCG construction.
5771 // The index was set from the option, so these should be in sync.
5772 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
5773 if (!SupportsHotColdNew)
5774 return;
5775
5776 IndexCallsiteContextGraph CCG(Index, isPrevailing);
5777 CCG.process();
5778}
5779

// Source: llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp