1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
22
23#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
27#include "llvm/ADT/SetOperations.h"
28#include "llvm/ADT/SmallPtrSet.h"
29#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/SmallVector.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/Analysis/MemoryProfileInfo.h"
33#include "llvm/Analysis/ModuleSummaryAnalysis.h"
34#include "llvm/Analysis/OptimizationRemarkEmitter.h"
35#include "llvm/Bitcode/BitcodeReader.h"
36#include "llvm/IR/Instructions.h"
37#include "llvm/IR/Module.h"
38#include "llvm/IR/ModuleSummaryIndex.h"
39#include "llvm/Pass.h"
40#include "llvm/Support/CommandLine.h"
41#include "llvm/Support/GraphWriter.h"
42#include "llvm/Support/InterleavedRange.h"
43#include "llvm/Support/raw_ostream.h"
44#include "llvm/Transforms/IPO.h"
45#include "llvm/Transforms/Utils/CallPromotionUtils.h"
46#include "llvm/Transforms/Utils/Cloning.h"
47#include "llvm/Transforms/Utils/Instrumentation.h"
48#include <deque>
49#include <sstream>
50#include <unordered_map>
51#include <vector>
52using namespace llvm;
53using namespace llvm::memprof;
54
55#define DEBUG_TYPE "memprof-context-disambiguation"
56
57STATISTIC(FunctionClonesAnalysis,
58 "Number of function clones created during whole program analysis");
59STATISTIC(FunctionClonesThinBackend,
60 "Number of function clones created during ThinLTO backend");
61STATISTIC(FunctionsClonedThinBackend,
62 "Number of functions that had clones created during ThinLTO backend");
63STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
64 "cloned) during whole program analysis");
65STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
66 "during whole program analysis");
67STATISTIC(AllocTypeNotColdThinBackend,
68 "Number of not cold static allocations (possibly cloned) during "
69 "ThinLTO backend");
70STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
71 "(possibly cloned) during ThinLTO backend");
72STATISTIC(OrigAllocsThinBackend,
73 "Number of original (not cloned) allocations with memprof profiles "
74 "during ThinLTO backend");
75STATISTIC(
76 AllocVersionsThinBackend,
77 "Number of allocation versions (including clones) during ThinLTO backend");
78STATISTIC(MaxAllocVersionsThinBackend,
79 "Maximum number of allocation versions created for an original "
80 "allocation during ThinLTO backend");
81STATISTIC(UnclonableAllocsThinBackend,
82 "Number of unclonable ambigous allocations during ThinLTO backend");
83STATISTIC(RemovedEdgesWithMismatchedCallees,
84 "Number of edges removed due to mismatched callees (profiled vs IR)");
85STATISTIC(FoundProfiledCalleeCount,
86 "Number of profiled callees found via tail calls");
87STATISTIC(FoundProfiledCalleeDepth,
88 "Aggregate depth of profiled callees found via tail calls");
89STATISTIC(FoundProfiledCalleeMaxDepth,
90 "Maximum depth of profiled callees found via tail calls");
91STATISTIC(FoundProfiledCalleeNonUniquelyCount,
92 "Number of profiled callees found via multiple tail call chains");
93STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
94STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
95STATISTIC(NonNewMergedNodes, "Number of non new nodes used during merging");
96STATISTIC(MissingAllocForContextId,
97 "Number of missing alloc nodes for context ids");
98STATISTIC(SkippedCallsCloning,
99 "Number of calls skipped during cloning due to unexpected operand");
100
101static cl::opt<std::string> DotFilePathPrefix(
102 "memprof-dot-file-path-prefix", cl::init(Val: ""), cl::Hidden,
103 cl::value_desc("filename"),
104 cl::desc("Specify the path prefix of the MemProf dot files."));
105
106static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(Val: false),
107 cl::Hidden,
108 cl::desc("Export graph to dot files."));
109
110// How much of the graph to export to dot.
111enum DotScope {
112 All, // The full CCG graph.
113 Alloc, // Only contexts for the specified allocation.
114 Context, // Only the specified context.
115};
116
117static cl::opt<DotScope> DotGraphScope(
118 "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
119 cl::Hidden, cl::init(Val: DotScope::All),
120 cl::values(
121 clEnumValN(DotScope::All, "all", "Export full callsite graph"),
122 clEnumValN(DotScope::Alloc, "alloc",
123 "Export only nodes with contexts feeding given "
124 "-memprof-dot-alloc-id"),
125 clEnumValN(DotScope::Context, "context",
126 "Export only nodes with given -memprof-dot-context-id")));
127
128static cl::opt<unsigned>
129 AllocIdForDot("memprof-dot-alloc-id", cl::init(Val: 0), cl::Hidden,
130 cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
131 "or to highlight if -memprof-dot-scope=all"));
132
133static cl::opt<unsigned> ContextIdForDot(
134 "memprof-dot-context-id", cl::init(Val: 0), cl::Hidden,
135 cl::desc("Id of context to export if -memprof-dot-scope=context or to "
136 "highlight otherwise"));
137
138static cl::opt<bool>
139 DumpCCG("memprof-dump-ccg", cl::init(Val: false), cl::Hidden,
140 cl::desc("Dump CallingContextGraph to stdout after each stage."));
141
142static cl::opt<bool>
143 VerifyCCG("memprof-verify-ccg", cl::init(Val: false), cl::Hidden,
144 cl::desc("Perform verification checks on CallingContextGraph."));
145
146static cl::opt<bool>
147 VerifyNodes("memprof-verify-nodes", cl::init(Val: false), cl::Hidden,
148 cl::desc("Perform frequent verification checks on nodes."));
149
150static cl::opt<std::string> MemProfImportSummary(
151 "memprof-import-summary",
152 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
153 cl::Hidden);
154
155static cl::opt<unsigned>
156 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(Val: 5),
157 cl::Hidden,
158 cl::desc("Max depth to recursively search for missing "
159 "frames through tail calls."));
160
161// Optionally enable cloning of callsites involved with recursive cycles
162static cl::opt<bool> AllowRecursiveCallsites(
163 "memprof-allow-recursive-callsites", cl::init(Val: true), cl::Hidden,
164 cl::desc("Allow cloning of callsites involved in recursive cycles"));
165
166static cl::opt<bool> CloneRecursiveContexts(
167 "memprof-clone-recursive-contexts", cl::init(Val: true), cl::Hidden,
168 cl::desc("Allow cloning of contexts through recursive cycles"));
169
170// Generally this is needed for correct assignment of allocation clones to
171// function clones, however, allow it to be disabled for debugging while the
172// functionality is new and being tested more widely.
173static cl::opt<bool>
174 MergeClones("memprof-merge-clones", cl::init(Val: true), cl::Hidden,
175 cl::desc("Merge clones before assigning functions"));
176
177// When disabled, try to detect and prevent cloning of recursive contexts.
178// This is only necessary until we support cloning through recursive cycles.
179// Leave on by default for now, as disabling requires a little bit of compile
180// time overhead and doesn't affect correctness, it will just inflate the cold
181// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
182static cl::opt<bool> AllowRecursiveContexts(
183 "memprof-allow-recursive-contexts", cl::init(Val: true), cl::Hidden,
184 cl::desc("Allow cloning of contexts having recursive cycles"));
185
186// Set the minimum absolute count threshold for allowing inlining of indirect
187// calls promoted during cloning.
188static cl::opt<unsigned> MemProfICPNoInlineThreshold(
189 "memprof-icp-noinline-threshold", cl::init(Val: 2), cl::Hidden,
190 cl::desc("Minimum absolute count for promoted target to be inlinable"));
191
192namespace llvm {
193cl::opt<bool> EnableMemProfContextDisambiguation(
194 "enable-memprof-context-disambiguation", cl::init(Val: false), cl::Hidden,
195 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
196
197// Indicate we are linking with an allocator that supports hot/cold operator
198// new interfaces.
199cl::opt<bool> SupportsHotColdNew(
200 "supports-hot-cold-new", cl::init(Val: false), cl::Hidden,
201 cl::desc("Linking with hot/cold operator new interfaces"));
202
203static cl::opt<bool> MemProfRequireDefinitionForPromotion(
204 "memprof-require-definition-for-promotion", cl::init(Val: false), cl::Hidden,
205 cl::desc(
206 "Require target function definition when promoting indirect calls"));
207} // namespace llvm
208
209extern cl::opt<bool> MemProfReportHintedSizes;
210extern cl::opt<unsigned> MinClonedColdBytePercent;
211
212namespace {
213/// CRTP base for graphs built from either IR or ThinLTO summary index.
214///
215/// The graph represents the call contexts in all memprof metadata on allocation
216/// calls, with nodes for the allocations themselves, as well as for the calls
217/// in each context. The graph is initially built from the allocation memprof
218/// metadata (or summary) MIBs. It is then updated to match calls with callsite
219/// metadata onto the nodes, updating it to reflect any inlining performed on
220/// those calls.
221///
222/// Each MIB (representing an allocation's call context with allocation
223/// behavior) is assigned a unique context id during the graph build. The edges
224/// and nodes in the graph are decorated with the context ids they carry. This
225/// is used to correctly update the graph when cloning is performed so that we
226/// can uniquify the context for a single (possibly cloned) allocation.
227template <typename DerivedCCG, typename FuncTy, typename CallTy>
228class CallsiteContextGraph {
229public:
230 CallsiteContextGraph() = default;
231 CallsiteContextGraph(const CallsiteContextGraph &) = default;
232 CallsiteContextGraph(CallsiteContextGraph &&) = default;
233
234 /// Main entry point to perform analysis and transformations on graph.
235 bool process();
236
237 /// Perform cloning on the graph necessary to uniquely identify the allocation
238 /// behavior of an allocation based on its context.
239 void identifyClones();
240
241 /// Assign callsite clones to functions, cloning functions as needed to
242 /// accommodate the combinations of their callsite clones reached by callers.
243 /// For regular LTO this clones functions and callsites in the IR, but for
244 /// ThinLTO the cloning decisions are noted in the summaries and later applied
245 /// in applyImport.
246 bool assignFunctions();
247
248 void dump() const;
249 void print(raw_ostream &OS) const;
250 void printTotalSizes(raw_ostream &OS) const;
251
252 friend raw_ostream &operator<<(raw_ostream &OS,
253 const CallsiteContextGraph &CCG) {
254 CCG.print(OS);
255 return OS;
256 }
257
258 friend struct GraphTraits<
259 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
260 friend struct DOTGraphTraits<
261 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
262
263 void exportToDot(std::string Label) const;
264
  /// Represents a function clone via FuncTy pointer and clone number pair.
  /// Clone number 0 denotes the original (uncloned) function.
  struct FuncInfo final
      : public std::pair<FuncTy *, unsigned /*Clone number*/> {
    using Base = std::pair<FuncTy *, unsigned>;
    // Implicit conversion from the underlying pair type.
    FuncInfo(const Base &B) : Base(B) {}
    // Default constructed FuncInfo is a null function, clone 0.
    FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
    // True when this refers to an actual function.
    explicit operator bool() const { return this->first != nullptr; }
    FuncTy *func() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
  };
275
276 /// Represents a callsite clone via CallTy and clone number pair.
277 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
278 using Base = std::pair<CallTy, unsigned>;
279 CallInfo(const Base &B) : Base(B) {}
280 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
281 : Base(Call, CloneNo) {}
282 explicit operator bool() const { return (bool)this->first; }
283 CallTy call() const { return this->first; }
284 unsigned cloneNo() const { return this->second; }
285 void setCloneNo(unsigned N) { this->second = N; }
286 void print(raw_ostream &OS) const {
287 if (!operator bool()) {
288 assert(!cloneNo());
289 OS << "null Call";
290 return;
291 }
292 call()->print(OS);
293 OS << "\t(clone " << cloneNo() << ")";
294 }
295 void dump() const {
296 print(OS&: dbgs());
297 dbgs() << "\n";
298 }
299 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
300 Call.print(OS);
301 return OS;
302 }
303 };
304
305 struct ContextEdge;
306
307 /// Node in the Callsite Context Graph
308 struct ContextNode {
309 // Keep this for now since in the IR case where we have an Instruction* it
310 // is not as immediately discoverable. Used for printing richer information
311 // when dumping graph.
312 bool IsAllocation;
313
314 // Keeps track of when the Call was reset to null because there was
315 // recursion.
316 bool Recursive = false;
317
318 // This will be formed by ORing together the AllocationType enum values
319 // for contexts including this node.
320 uint8_t AllocTypes = 0;
321
322 // The corresponding allocation or interior call. This is the primary call
323 // for which we have created this node.
324 CallInfo Call;
325
326 // List of other calls that can be treated the same as the primary call
327 // through cloning. I.e. located in the same function and have the same
328 // (possibly pruned) stack ids. They will be updated the same way as the
329 // primary call when assigning to function clones.
330 SmallVector<CallInfo, 0> MatchingCalls;
331
332 // For alloc nodes this is a unique id assigned when constructed, and for
333 // callsite stack nodes it is the original stack id when the node is
334 // constructed from the memprof MIB metadata on the alloc nodes. Note that
335 // this is only used when matching callsite metadata onto the stack nodes
336 // created when processing the allocation memprof MIBs, and for labeling
337 // nodes in the dot graph. Therefore we don't bother to assign a value for
338 // clones.
339 uint64_t OrigStackOrAllocId = 0;
340
341 // Edges to all callees in the profiled call stacks.
342 // TODO: Should this be a map (from Callee node) for more efficient lookup?
343 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
344
345 // Edges to all callers in the profiled call stacks.
346 // TODO: Should this be a map (from Caller node) for more efficient lookup?
347 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
348
349 // Returns true if we need to look at the callee edges for determining the
350 // node context ids and allocation type.
351 bool useCallerEdgesForContextInfo() const {
352 // Typically if the callee edges are empty either the caller edges are
353 // also empty, or this is an allocation (leaf node). However, if we are
354 // allowing recursive callsites and contexts this will be violated for
355 // incompletely cloned recursive cycles.
356 assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
357 (AllowRecursiveCallsites && AllowRecursiveContexts));
358 // When cloning for a recursive context, during cloning we might be in the
359 // midst of cloning for a recurrence and have moved context ids off of a
360 // caller edge onto the clone but not yet off of the incoming caller
361 // (back) edge. If we don't look at those we miss the fact that this node
362 // still has context ids of interest.
363 return IsAllocation || CloneRecursiveContexts;
364 }
365
366 // Compute the context ids for this node from the union of its edge context
367 // ids.
368 DenseSet<uint32_t> getContextIds() const {
369 unsigned Count = 0;
370 // Compute the number of ids for reserve below. In general we only need to
371 // look at one set of edges, typically the callee edges, since other than
372 // allocations and in some cases during recursion cloning, all the context
373 // ids on the callers should also flow out via callee edges.
374 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
375 Count += Edge->getContextIds().size();
376 DenseSet<uint32_t> ContextIds;
377 ContextIds.reserve(Size: Count);
378 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
379 CalleeEdges, useCallerEdgesForContextInfo()
380 ? CallerEdges
381 : std::vector<std::shared_ptr<ContextEdge>>());
382 for (const auto &Edge : Edges)
383 ContextIds.insert_range(Edge->getContextIds());
384 return ContextIds;
385 }
386
387 // Compute the allocation type for this node from the OR of its edge
388 // allocation types.
389 uint8_t computeAllocType() const {
390 uint8_t BothTypes =
391 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
392 uint8_t AllocType = (uint8_t)AllocationType::None;
393 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
394 CalleeEdges, useCallerEdgesForContextInfo()
395 ? CallerEdges
396 : std::vector<std::shared_ptr<ContextEdge>>());
397 for (const auto &Edge : Edges) {
398 AllocType |= Edge->AllocTypes;
399 // Bail early if alloc type reached both, no further refinement.
400 if (AllocType == BothTypes)
401 return AllocType;
402 }
403 return AllocType;
404 }
405
406 // The context ids set for this node is empty if its edge context ids are
407 // also all empty.
408 bool emptyContextIds() const {
409 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
410 CalleeEdges, useCallerEdgesForContextInfo()
411 ? CallerEdges
412 : std::vector<std::shared_ptr<ContextEdge>>());
413 for (const auto &Edge : Edges) {
414 if (!Edge->getContextIds().empty())
415 return false;
416 }
417 return true;
418 }
419
420 // List of clones of this ContextNode, initially empty.
421 std::vector<ContextNode *> Clones;
422
423 // If a clone, points to the original uncloned node.
424 ContextNode *CloneOf = nullptr;
425
426 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
427
428 ContextNode(bool IsAllocation, CallInfo C)
429 : IsAllocation(IsAllocation), Call(C) {}
430
431 void addClone(ContextNode *Clone) {
432 if (CloneOf) {
433 CloneOf->Clones.push_back(Clone);
434 Clone->CloneOf = CloneOf;
435 } else {
436 Clones.push_back(Clone);
437 assert(!Clone->CloneOf);
438 Clone->CloneOf = this;
439 }
440 }
441
442 ContextNode *getOrigNode() {
443 if (!CloneOf)
444 return this;
445 return CloneOf;
446 }
447
448 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
449 unsigned int ContextId);
450
451 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
452 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
453 void eraseCalleeEdge(const ContextEdge *Edge);
454 void eraseCallerEdge(const ContextEdge *Edge);
455
456 void setCall(CallInfo C) { Call = C; }
457
458 bool hasCall() const { return (bool)Call.call(); }
459
460 void printCall(raw_ostream &OS) const { Call.print(OS); }
461
462 // True if this node was effectively removed from the graph, in which case
463 // it should have an allocation type of None and empty context ids.
464 bool isRemoved() const {
465 // Typically if the callee edges are empty either the caller edges are
466 // also empty, or this is an allocation (leaf node). However, if we are
467 // allowing recursive callsites and contexts this will be violated for
468 // incompletely cloned recursive cycles.
469 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
470 (AllocTypes == (uint8_t)AllocationType::None) ==
471 emptyContextIds());
472 return AllocTypes == (uint8_t)AllocationType::None;
473 }
474
475 void dump() const;
476 void print(raw_ostream &OS) const;
477
478 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
479 Node.print(OS);
480 return OS;
481 }
482 };
483
484 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
485 /// callee.
486 struct ContextEdge {
487 ContextNode *Callee;
488 ContextNode *Caller;
489
490 // This will be formed by ORing together the AllocationType enum values
491 // for contexts including this edge.
492 uint8_t AllocTypes = 0;
493
494 // Set just before initiating cloning when cloning of recursive contexts is
495 // enabled. Used to defer cloning of backedges until we have done cloning of
496 // the callee node for non-backedge caller edges. This exposes cloning
497 // opportunities through the backedge of the cycle.
498 // TODO: Note that this is not updated during cloning, and it is unclear
499 // whether that would be needed.
500 bool IsBackedge = false;
501
502 // The set of IDs for contexts including this edge.
503 DenseSet<uint32_t> ContextIds;
504
505 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
506 DenseSet<uint32_t> ContextIds)
507 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
508 ContextIds(std::move(ContextIds)) {}
509
510 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
511
512 // Helper to clear the fields of this edge when we are removing it from the
513 // graph.
514 inline void clear() {
515 ContextIds.clear();
516 AllocTypes = (uint8_t)AllocationType::None;
517 Caller = nullptr;
518 Callee = nullptr;
519 }
520
521 // Check if edge was removed from the graph. This is useful while iterating
522 // over a copy of edge lists when performing operations that mutate the
523 // graph in ways that might remove one of the edges.
524 inline bool isRemoved() const {
525 if (Callee || Caller)
526 return false;
527 // Any edges that have been removed from the graph but are still in a
528 // shared_ptr somewhere should have all fields null'ed out by clear()
529 // above.
530 assert(AllocTypes == (uint8_t)AllocationType::None);
531 assert(ContextIds.empty());
532 return true;
533 }
534
535 void dump() const;
536 void print(raw_ostream &OS) const;
537
538 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
539 Edge.print(OS);
540 return OS;
541 }
542 };
543
544 /// Helpers to remove edges that have allocation type None (due to not
545 /// carrying any context ids) after transformations.
546 void removeNoneTypeCalleeEdges(ContextNode *Node);
547 void removeNoneTypeCallerEdges(ContextNode *Node);
548 void
549 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
550 DenseSet<const ContextNode *> &Visited);
551
552protected:
553 /// Get a list of nodes corresponding to the stack ids in the given callsite
554 /// context.
555 template <class NodeT, class IteratorT>
556 std::vector<uint64_t>
557 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
558
559 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
560 /// metadata (or summary).
561 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
562
563 /// Adds nodes for the given MIB stack ids.
564 template <class NodeT, class IteratorT>
565 void addStackNodesForMIB(ContextNode *AllocNode,
566 CallStack<NodeT, IteratorT> &StackContext,
567 CallStack<NodeT, IteratorT> &CallsiteContext,
568 AllocationType AllocType,
569 ArrayRef<ContextTotalSize> ContextSizeInfo);
570
571 /// Matches all callsite metadata (or summary) to the nodes created for
572 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
573 /// inlining performed on those callsite instructions.
574 void updateStackNodes();
575
576 /// Update graph to conservatively handle any callsite stack nodes that target
577 /// multiple different callee target functions.
578 void handleCallsitesWithMultipleTargets();
579
580 /// Mark backedges via the standard DFS based backedge algorithm.
581 void markBackedges();
582
583 /// Merge clones generated during cloning for different allocations but that
584 /// are called by the same caller node, to ensure proper function assignment.
585 void mergeClones();
586
587 // Try to partition calls on the given node (already placed into the AllCalls
588 // array) by callee function, creating new copies of Node as needed to hold
589 // calls with different callees, and moving the callee edges appropriately.
590 // Returns true if partitioning was successful.
591 bool partitionCallsByCallee(
592 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
593 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
594
595 /// Save lists of calls with MemProf metadata in each function, for faster
596 /// iteration.
597 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
598
599 /// Map from callsite node to the enclosing caller function.
600 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
601
602 // When exporting to dot, and an allocation id is specified, contains the
603 // context ids on that allocation.
604 DenseSet<uint32_t> DotAllocContextIds;
605
606private:
607 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
608
  // Structure to keep track of information for each call as we are matching
  // non-allocation callsites onto context nodes created from the allocation
  // call metadata / summary contexts.
  struct CallContextInfo {
    // The callsite we're trying to match.
    CallTy Call;
    // The callsite's stack ids that have a context node in the graph.
    std::vector<uint64_t> StackIds;
    // The function containing this callsite.
    const FuncTy *Func;
    // Initially empty, if needed this will be updated to contain the context
    // ids for use in a new context node created for this callsite.
    DenseSet<uint32_t> ContextIds;
  };
623
624 /// Helper to remove edge from graph, updating edge iterator if it is provided
625 /// (in which case CalleeIter indicates which edge list is being iterated).
626 /// This will also perform the necessary clearing of the ContextEdge members
627 /// to enable later checking if the edge has been removed (since we may have
628 /// other copies of the shared_ptr in existence, and in fact rely on this to
629 /// enable removal while iterating over a copy of a node's edge list).
630 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
631 bool CalleeIter = true);
632
633 /// Assigns the given Node to calls at or inlined into the location with
634 /// the Node's stack id, after post order traversing and processing its
635 /// caller nodes. Uses the call information recorded in the given
636 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
637 /// as needed. Called by updateStackNodes which sets up the given
638 /// StackIdToMatchingCalls map.
639 void assignStackNodesPostOrder(
640 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
641 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
642 DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
643
644 /// Duplicates the given set of context ids, updating the provided
645 /// map from each original id with the newly generated context ids,
646 /// and returning the new duplicated id set.
647 DenseSet<uint32_t> duplicateContextIds(
648 const DenseSet<uint32_t> &StackSequenceContextIds,
649 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
650
651 /// Propagates all duplicated context ids across the graph.
652 void propagateDuplicateContextIds(
653 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
654
655 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
656 /// else to its callers. Also updates OrigNode's edges to remove any context
657 /// ids moved to the newly created edge.
658 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
659 bool TowardsCallee,
660 DenseSet<uint32_t> RemainingContextIds);
661
  /// Get the stack id corresponding to the given Id or Index (for IR this will
  /// return itself, for a summary index this will return the id recorded in the
  /// index for that stack id index value).
  uint64_t getStackId(uint64_t IdOrIndex) const {
    // CRTP dispatch to the IR or summary index specific implementation.
    return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
  }
668
669 /// Returns true if the given call targets the callee of the given edge, or if
670 /// we were able to identify the call chain through intermediate tail calls.
671 /// In the latter case new context nodes are added to the graph for the
672 /// identified tail calls, and their synthesized nodes are added to
673 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
674 /// the updated edges and to prepare it for an increment in the caller.
675 bool
676 calleesMatch(CallTy Call, EdgeIter &EI,
677 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
678
  /// Return the callee function of the given call, or nullptr if it can't be
  /// determined.
  const FuncTy *getCalleeFunc(CallTy Call) {
    // CRTP dispatch to the IR or summary index specific implementation.
    return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
  }
684
  /// Returns true if the given call targets the given function, or if we were
  /// able to identify the call chain through intermediate tail calls (in which
  /// case FoundCalleeChain will be populated).
  bool calleeMatchesFunc(
      CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
      std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
    // CRTP dispatch to the IR or summary index specific implementation.
    return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
        Call, Func, CallerFunc, FoundCalleeChain);
  }
694
  /// Returns true if both call instructions have the same callee.
  bool sameCallee(CallTy Call1, CallTy Call2) {
    // CRTP dispatch to the IR or summary index specific implementation.
    return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
  }
699
  /// Get a list of nodes corresponding to the stack ids in the given
  /// callsite's context.
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
    // CRTP dispatch to the IR or summary index specific implementation.
    return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
        Call);
  }
706
  /// Get the last stack id in the context for callsite.
  uint64_t getLastStackId(CallTy Call) {
    // CRTP dispatch to the IR or summary index specific implementation.
    return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
  }
711
712 /// Update the allocation call to record type of allocated memory.
713 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
714 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
715 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
716 }
717
  /// Get the AllocationType assigned to the given allocation instruction clone.
  AllocationType getAllocationCallType(const CallInfo &Call) const {
    // CRTP dispatch to the IR or summary index specific implementation.
    return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
  }
722
  /// Update non-allocation call to invoke (possibly cloned) function
  /// CalleeFunc.
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
    // CRTP dispatch to the IR or summary index specific implementation.
    static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
  }
728
  /// Clone the given function for the given callsite, recording mapping of all
  /// of the functions tracked calls to their new versions in the CallMap.
  /// Assigns new clones to clone number CloneNo.
  FuncInfo cloneFunctionForCallsite(
      FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
      std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
    // CRTP dispatch to the IR or summary index specific implementation.
    return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
        Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
  }
738
739 /// Gets a label to use in the dot graph for the given call clone in the given
740 /// function.
741 std::string getLabel(const FuncTy *Func, const CallTy Call,
742 unsigned CloneNo) const {
743 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
744 }
745
746 // Create and return a new ContextNode.
747 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
748 CallInfo C = CallInfo()) {
749 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
750 auto *NewNode = NodeOwner.back().get();
751 if (F)
752 NodeToCallingFunc[NewNode] = F;
753 return NewNode;
754 }
755
  /// Helpers to find the node corresponding to the given call or stackid.
  /// Each returns nullptr when no node has been recorded for the key.
  ContextNode *getNodeForInst(const CallInfo &C);
  ContextNode *getNodeForAlloc(const CallInfo &C);
  ContextNode *getNodeForStackId(uint64_t StackId);

  /// Computes the alloc type corresponding to the given context ids, by
  /// unioning their recorded alloc types. The result is a bitwise OR of
  /// AllocationType bits and may therefore contain both NotCold and Cold.
  uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets), optimized for the case
  /// when Node1Ids is smaller than Node2Ids.
  uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
                                  const DenseSet<uint32_t> &Node2Ids) const;

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets).
  uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
                              const DenseSet<uint32_t> &Node2Ids) const;
775
  /// Create a clone of Edge's callee and move Edge to that new callee node,
  /// performing the necessary context id and allocation type updates.
  /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
  /// moved to an edge to the new callee.
  ContextNode *
  moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                           DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the callee of Edge to existing callee clone NewCallee, performing
  /// the necessary context id and allocation type updates.
  /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
  /// moved to an edge to the new callee.
  void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                     ContextNode *NewCallee,
                                     bool NewClone = false,
                                     DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the caller of the edge at the given callee edge iterator to be
  /// NewCaller, performing the necessary context id and allocation type
  /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
  /// a simplified version of it as we always move the given edge and all of its
  /// context ids.
  void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                                 ContextNode *NewCaller);

  /// Recursive helper for marking backedges via DFS.
  /// NOTE(review): CurrentStack presumably holds the nodes on the active DFS
  /// path — confirm against the definition.
  void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                     DenseSet<const ContextNode *> &CurrentStack);

  /// Recursive helper for merging clones.
  void
  mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
              DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
  /// Main worker for merging callee clones for a given node.
  void mergeNodeCalleeClones(
      ContextNode *Node, DenseSet<const ContextNode *> &Visited,
      DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
  /// Helper to find other callers of the given set of callee edges that can
  /// share the same callee merge node.
  void findOtherCallersToShareMerge(
      ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
      DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
      DenseSet<ContextNode *> &OtherCallersToShareMerge);
819
  /// Recursively perform cloning on the graph for the given Node and its
  /// callers, in order to uniquely identify the allocation behavior of an
  /// allocation given its context. The context ids of the allocation being
  /// processed are given in AllocContextIds.
  void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                      const DenseSet<uint32_t> &AllocContextIds);

  /// Map from each context ID to the AllocationType assigned to that context.
  DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;

  /// Map from each contextID to the profiled full contexts and their total
  /// sizes (there may be more than one due to context trimming),
  /// optionally populated when requested (via MemProfReportHintedSizes or
  /// MinClonedColdBytePercent).
  DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;

  /// Identifies the context node created for a stack id when adding the MIB
  /// contexts to the graph. This is used to locate the context nodes when
  /// trying to assign the corresponding callsites with those stack ids to these
  /// nodes.
  DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;

  /// Maps to track the calls to their corresponding nodes in the graph.
  /// MapVector is used so iteration order is deterministic (insertion order).
  MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
  MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;

  /// Owner of all ContextNode unique_ptrs. Nodes are handed out as raw
  /// pointers (see createNewNode) and live as long as the graph.
  std::vector<std::unique_ptr<ContextNode>> NodeOwner;

  /// Perform sanity checks on graph when requested.
  void check() const;

  /// Keeps track of the last unique context id assigned.
  unsigned int LastContextId = 0;
854};
855
// Shorthand alias templates for the nested CallsiteContextGraph types, so the
// free helper functions below can be templated on the same parameters without
// spelling out the full nested type names.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
868
/// CRTP derived class for graphs built from IR (regular LTO).
class ModuleCallsiteContextGraph
    : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                  Instruction *> {
public:
  ModuleCallsiteContextGraph(
      Module &M,
      llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);

private:
  friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                              Instruction *>;

  // Implementations of the CRTP interface invoked by the base class; see the
  // corresponding base-class wrappers for documentation of each operation.
  uint64_t getStackId(uint64_t IdOrIndex) const;
  const Function *getCalleeFunc(Instruction *Call);
  bool calleeMatchesFunc(
      Instruction *Call, const Function *Func, const Function *CallerFunc,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
  bool sameCallee(Instruction *Call1, Instruction *Call2);
  bool findProfiledCalleeThroughTailCalls(
      const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(Instruction *Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  AllocationType getAllocationCallType(const CallInfo &Call) const;
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                       Instruction *>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const Function *Func, const Instruction *Call,
                       unsigned CloneNo) const;

  /// The module whose IR this graph was built from.
  const Module &Mod;
  /// Callback to obtain an OptimizationRemarkEmitter for a given function.
  llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};
909
910/// Represents a call in the summary index graph, which can either be an
911/// allocation or an interior callsite node in an allocation's context.
912/// Holds a pointer to the corresponding data structure in the index.
913struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
914 IndexCall() : PointerUnion() {}
915 IndexCall(std::nullptr_t) : IndexCall() {}
916 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
917 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
918 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
919
920 IndexCall *operator->() { return this; }
921
922 void print(raw_ostream &OS) const {
923 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
924 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Val&: Base)) {
925 OS << *AI;
926 } else {
927 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Val&: Base);
928 assert(CI);
929 OS << *CI;
930 }
931 }
932};
933} // namespace
934
935namespace llvm {
// Teach llvm's casting machinery (isa/dyn_cast/...) to see through IndexCall
// to its underlying PointerUnion, for both const and non-const values.
template <> struct simplify_type<IndexCall> {
  using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
};
template <> struct simplify_type<const IndexCall> {
  using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
};
944} // namespace llvm
945
946namespace {
947/// CRTP derived class for graphs built from summary index (ThinLTO).
948class IndexCallsiteContextGraph
949 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
950 IndexCall> {
951public:
952 IndexCallsiteContextGraph(
953 ModuleSummaryIndex &Index,
954 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
955 isPrevailing);
956
957 ~IndexCallsiteContextGraph() {
958 // Now that we are done with the graph it is safe to add the new
959 // CallsiteInfo structs to the function summary vectors. The graph nodes
960 // point into locations within these vectors, so we don't want to add them
961 // any earlier.
962 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
963 auto *FS = I.first;
964 for (auto &Callsite : I.second)
965 FS->addCallsite(Callsite&: *Callsite.second);
966 }
967 }
968
969private:
970 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
971 IndexCall>;
972
973 uint64_t getStackId(uint64_t IdOrIndex) const;
974 const FunctionSummary *getCalleeFunc(IndexCall &Call);
975 bool calleeMatchesFunc(
976 IndexCall &Call, const FunctionSummary *Func,
977 const FunctionSummary *CallerFunc,
978 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
979 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
980 bool findProfiledCalleeThroughTailCalls(
981 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
982 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
983 bool &FoundMultipleCalleeChains);
984 uint64_t getLastStackId(IndexCall &Call);
985 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
986 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
987 AllocationType getAllocationCallType(const CallInfo &Call) const;
988 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
989 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
990 IndexCall>::FuncInfo
991 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
992 std::map<CallInfo, CallInfo> &CallMap,
993 std::vector<CallInfo> &CallsWithMetadataInFunc,
994 unsigned CloneNo);
995 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
996 unsigned CloneNo) const;
997
998 // Saves mapping from function summaries containing memprof records back to
999 // its VI, for use in checking and debugging.
1000 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
1001
1002 const ModuleSummaryIndex &Index;
1003 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1004 isPrevailing;
1005
1006 // Saves/owns the callsite info structures synthesized for missing tail call
1007 // frames that we discover while building the graph.
1008 // It maps from the summary of the function making the tail call, to a map
1009 // of callee ValueInfo to corresponding synthesized callsite info.
1010 std::unordered_map<FunctionSummary *,
1011 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
1012 FunctionCalleesToSynthesizedCallsiteInfos;
1013};
1014} // namespace
1015
1016namespace llvm {
// DenseMapInfo specializations so CallInfo and IndexCall can be used as
// DenseMap/MapVector keys. Each simply delegates to the DenseMapInfo of the
// underlying representation (a std::pair or a PointerUnion).
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
    : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
    : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
struct DenseMapInfo<IndexCall>
    : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1028} // end namespace llvm
1029
1030namespace {
1031
1032// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1033// type we should actually use on the corresponding allocation.
1034// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1035// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1036// from NotCold.
1037AllocationType allocTypeToUse(uint8_t AllocTypes) {
1038 assert(AllocTypes != (uint8_t)AllocationType::None);
1039 if (AllocTypes ==
1040 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1041 return AllocationType::NotCold;
1042 else
1043 return (AllocationType)AllocTypes;
1044}
1045
1046// Helper to check if the alloc types for all edges recorded in the
1047// InAllocTypes vector match the alloc types for all edges in the Edges
1048// vector.
1049template <typename DerivedCCG, typename FuncTy, typename CallTy>
1050bool allocTypesMatch(
1051 const std::vector<uint8_t> &InAllocTypes,
1052 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1053 &Edges) {
1054 // This should be called only when the InAllocTypes vector was computed for
1055 // this set of Edges. Make sure the sizes are the same.
1056 assert(InAllocTypes.size() == Edges.size());
1057 return std::equal(
1058 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1059 [](const uint8_t &l,
1060 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1061 // Can share if one of the edges is None type - don't
1062 // care about the type along that edge as it doesn't
1063 // exist for those context ids.
1064 if (l == (uint8_t)AllocationType::None ||
1065 r->AllocTypes == (uint8_t)AllocationType::None)
1066 return true;
1067 return allocTypeToUse(AllocTypes: l) == allocTypeToUse(r->AllocTypes);
1068 });
1069}
1070
1071// Helper to check if the alloc types for all edges recorded in the
1072// InAllocTypes vector match the alloc types for callee edges in the given
1073// clone. Because the InAllocTypes were computed from the original node's callee
1074// edges, and other cloning could have happened after this clone was created, we
1075// need to find the matching clone callee edge, which may or may not exist.
1076template <typename DerivedCCG, typename FuncTy, typename CallTy>
1077bool allocTypesMatchClone(
1078 const std::vector<uint8_t> &InAllocTypes,
1079 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1080 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1081 assert(Node);
1082 // InAllocTypes should have been computed for the original node's callee
1083 // edges.
1084 assert(InAllocTypes.size() == Node->CalleeEdges.size());
1085 // First create a map of the clone callee edge callees to the edge alloc type.
1086 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
1087 EdgeCalleeMap;
1088 for (const auto &E : Clone->CalleeEdges) {
1089 assert(!EdgeCalleeMap.contains(E->Callee));
1090 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1091 }
1092 // Next, walk the original node's callees, and look for the corresponding
1093 // clone edge to that callee.
1094 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1095 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1096 // Not found is ok, we will simply add an edge if we use this clone.
1097 if (Iter == EdgeCalleeMap.end())
1098 continue;
1099 // Can share if one of the edges is None type - don't
1100 // care about the type along that edge as it doesn't
1101 // exist for those context ids.
1102 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1103 Iter->second == (uint8_t)AllocationType::None)
1104 continue;
1105 if (allocTypeToUse(Iter->second) != allocTypeToUse(AllocTypes: InAllocTypes[I]))
1106 return false;
1107 }
1108 return true;
1109}
1110
1111} // end anonymous namespace
1112
1113template <typename DerivedCCG, typename FuncTy, typename CallTy>
1114typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1115CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1116 const CallInfo &C) {
1117 ContextNode *Node = getNodeForAlloc(C);
1118 if (Node)
1119 return Node;
1120
1121 return NonAllocationCallToContextNodeMap.lookup(C);
1122}
1123
// Returns the context node recorded for the given allocation call, or nullptr
// if this call is not a known allocation (MapVector::lookup default).
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
    const CallInfo &C) {
  return AllocationCallToContextNodeMap.lookup(C);
}
1130
1131template <typename DerivedCCG, typename FuncTy, typename CallTy>
1132typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1133CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1134 uint64_t StackId) {
1135 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1136 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1137 return StackEntryNode->second;
1138 return nullptr;
1139}
1140
1141template <typename DerivedCCG, typename FuncTy, typename CallTy>
1142void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1143 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1144 unsigned int ContextId) {
1145 for (auto &Edge : CallerEdges) {
1146 if (Edge->Caller == Caller) {
1147 Edge->AllocTypes |= (uint8_t)AllocType;
1148 Edge->getContextIds().insert(ContextId);
1149 return;
1150 }
1151 }
1152 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1153 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1154 CallerEdges.push_back(Edge);
1155 Caller->CalleeEdges.push_back(Edge);
1156}
1157
// Remove Edge from the graph, unlinking it from both its caller's callee list
// and its callee's caller list. If EI is non-null it must point at Edge in
// one of those lists: when CalleeIter is true, EI iterates the caller's
// CalleeEdges; otherwise it iterates the callee's CallerEdges. In either case
// *EI is advanced past the erased element so the caller's loop stays valid.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
    ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
  assert(!EI || (*EI)->get() == Edge);
  assert(!Edge->isRemoved());
  // Save the Caller and Callee pointers so we can erase Edge from their edge
  // lists after clearing Edge below. We do the clearing first in case it is
  // destructed after removing from the edge lists (if those were the last
  // shared_ptr references to Edge).
  auto *Callee = Edge->Callee;
  auto *Caller = Edge->Caller;

  // Make sure the edge fields are cleared out so we can properly detect
  // removed edges if Edge is not destructed because there is still a shared_ptr
  // reference.
  Edge->clear();

#ifndef NDEBUG
  // Pre-removal list sizes, used only by the asserts below to verify that one
  // element was actually erased from each list.
  auto CalleeCallerCount = Callee->CallerEdges.size();
  auto CallerCalleeCount = Caller->CalleeEdges.size();
#endif
  if (!EI) {
    Callee->eraseCallerEdge(Edge);
    Caller->eraseCalleeEdge(Edge);
  } else if (CalleeIter) {
    Callee->eraseCallerEdge(Edge);
    *EI = Caller->CalleeEdges.erase(*EI);
  } else {
    Caller->eraseCalleeEdge(Edge);
    *EI = Callee->CallerEdges.erase(*EI);
  }
  assert(Callee->CallerEdges.size() < CalleeCallerCount);
  assert(Caller->CalleeEdges.size() < CallerCalleeCount);
}
1192
1193template <typename DerivedCCG, typename FuncTy, typename CallTy>
1194void CallsiteContextGraph<
1195 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1196 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1197 auto Edge = *EI;
1198 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1199 assert(Edge->ContextIds.empty());
1200 removeEdgeFromGraph(Edge: Edge.get(), EI: &EI, /*CalleeIter=*/true);
1201 } else
1202 ++EI;
1203 }
1204}
1205
1206template <typename DerivedCCG, typename FuncTy, typename CallTy>
1207void CallsiteContextGraph<
1208 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1209 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1210 auto Edge = *EI;
1211 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1212 assert(Edge->ContextIds.empty());
1213 Edge->Caller->eraseCalleeEdge(Edge.get());
1214 EI = Node->CallerEdges.erase(EI);
1215 } else
1216 ++EI;
1217 }
1218}
1219
1220template <typename DerivedCCG, typename FuncTy, typename CallTy>
1221typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1222CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1223 findEdgeFromCallee(const ContextNode *Callee) {
1224 for (const auto &Edge : CalleeEdges)
1225 if (Edge->Callee == Callee)
1226 return Edge.get();
1227 return nullptr;
1228}
1229
1230template <typename DerivedCCG, typename FuncTy, typename CallTy>
1231typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1232CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1233 findEdgeFromCaller(const ContextNode *Caller) {
1234 for (const auto &Edge : CallerEdges)
1235 if (Edge->Caller == Caller)
1236 return Edge.get();
1237 return nullptr;
1238}
1239
1240template <typename DerivedCCG, typename FuncTy, typename CallTy>
1241void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1242 eraseCalleeEdge(const ContextEdge *Edge) {
1243 auto EI = llvm::find_if(
1244 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1245 return CalleeEdge.get() == Edge;
1246 });
1247 assert(EI != CalleeEdges.end());
1248 CalleeEdges.erase(EI);
1249}
1250
1251template <typename DerivedCCG, typename FuncTy, typename CallTy>
1252void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1253 eraseCallerEdge(const ContextEdge *Edge) {
1254 auto EI = llvm::find_if(
1255 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1256 return CallerEdge.get() == Edge;
1257 });
1258 assert(EI != CallerEdges.end());
1259 CallerEdges.erase(EI);
1260}
1261
1262template <typename DerivedCCG, typename FuncTy, typename CallTy>
1263uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1264 DenseSet<uint32_t> &ContextIds) const {
1265 uint8_t BothTypes =
1266 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1267 uint8_t AllocType = (uint8_t)AllocationType::None;
1268 for (auto Id : ContextIds) {
1269 AllocType |= (uint8_t)ContextIdToAllocationType.at(Val: Id);
1270 // Bail early if alloc type reached both, no further refinement.
1271 if (AllocType == BothTypes)
1272 return AllocType;
1273 }
1274 return AllocType;
1275}
1276
1277template <typename DerivedCCG, typename FuncTy, typename CallTy>
1278uint8_t
1279CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1280 const DenseSet<uint32_t> &Node1Ids,
1281 const DenseSet<uint32_t> &Node2Ids) const {
1282 uint8_t BothTypes =
1283 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1284 uint8_t AllocType = (uint8_t)AllocationType::None;
1285 for (auto Id : Node1Ids) {
1286 if (!Node2Ids.count(V: Id))
1287 continue;
1288 AllocType |= (uint8_t)ContextIdToAllocationType.at(Val: Id);
1289 // Bail early if alloc type reached both, no further refinement.
1290 if (AllocType == BothTypes)
1291 return AllocType;
1292 }
1293 return AllocType;
1294}
1295
1296template <typename DerivedCCG, typename FuncTy, typename CallTy>
1297uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1298 const DenseSet<uint32_t> &Node1Ids,
1299 const DenseSet<uint32_t> &Node2Ids) const {
1300 if (Node1Ids.size() < Node2Ids.size())
1301 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1302 else
1303 return intersectAllocTypesImpl(Node1Ids: Node2Ids, Node2Ids: Node1Ids);
1304}
1305
1306template <typename DerivedCCG, typename FuncTy, typename CallTy>
1307typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1308CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1309 CallInfo Call, const FuncTy *F) {
1310 assert(!getNodeForAlloc(Call));
1311 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, C: Call);
1312 AllocationCallToContextNodeMap[Call] = AllocNode;
1313 // Use LastContextId as a uniq id for MIB allocation nodes.
1314 AllocNode->OrigStackOrAllocId = LastContextId;
1315 // Alloc type should be updated as we add in the MIBs. We should assert
1316 // afterwards that it is not still None.
1317 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1318
1319 return AllocNode;
1320}
1321
1322static std::string getAllocTypeString(uint8_t AllocTypes) {
1323 if (!AllocTypes)
1324 return "None";
1325 std::string Str;
1326 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1327 Str += "NotCold";
1328 if (AllocTypes & (uint8_t)AllocationType::Cold)
1329 Str += "Cold";
1330 return Str;
1331}
1332
1333template <typename DerivedCCG, typename FuncTy, typename CallTy>
1334template <class NodeT, class IteratorT>
1335void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1336 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1337 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1338 ArrayRef<ContextTotalSize> ContextSizeInfo) {
1339 // Treating the hot alloc type as NotCold before the disambiguation for "hot"
1340 // is done.
1341 if (AllocType == AllocationType::Hot)
1342 AllocType = AllocationType::NotCold;
1343
1344 ContextIdToAllocationType[++LastContextId] = AllocType;
1345
1346 if (!ContextSizeInfo.empty()) {
1347 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1348 Entry.insert(position: Entry.begin(), first: ContextSizeInfo.begin(), last: ContextSizeInfo.end());
1349 }
1350
1351 // Update alloc type and context ids for this MIB.
1352 AllocNode->AllocTypes |= (uint8_t)AllocType;
1353
1354 // Now add or update nodes for each stack id in alloc's context.
1355 // Later when processing the stack ids on non-alloc callsites we will adjust
1356 // for any inlining in the context.
1357 ContextNode *PrevNode = AllocNode;
1358 // Look for recursion (direct recursion should have been collapsed by
1359 // module summary analysis, here we should just be detecting mutual
1360 // recursion). Mark these nodes so we don't try to clone.
1361 SmallSet<uint64_t, 8> StackIdSet;
1362 // Skip any on the allocation call (inlining).
1363 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1364 ContextIter != StackContext.end(); ++ContextIter) {
1365 auto StackId = getStackId(IdOrIndex: *ContextIter);
1366 ContextNode *StackNode = getNodeForStackId(StackId);
1367 if (!StackNode) {
1368 StackNode = createNewNode(/*IsAllocation=*/false);
1369 StackEntryIdToContextNodeMap[StackId] = StackNode;
1370 StackNode->OrigStackOrAllocId = StackId;
1371 }
1372 // Marking a node recursive will prevent its cloning completely, even for
1373 // non-recursive contexts flowing through it.
1374 if (!AllowRecursiveCallsites) {
1375 auto Ins = StackIdSet.insert(StackId);
1376 if (!Ins.second)
1377 StackNode->Recursive = true;
1378 }
1379 StackNode->AllocTypes |= (uint8_t)AllocType;
1380 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1381 PrevNode = StackNode;
1382 }
1383}
1384
1385template <typename DerivedCCG, typename FuncTy, typename CallTy>
1386DenseSet<uint32_t>
1387CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1388 const DenseSet<uint32_t> &StackSequenceContextIds,
1389 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1390 DenseSet<uint32_t> NewContextIds;
1391 for (auto OldId : StackSequenceContextIds) {
1392 NewContextIds.insert(V: ++LastContextId);
1393 OldToNewContextIds[OldId].insert(V: LastContextId);
1394 assert(ContextIdToAllocationType.count(OldId));
1395 // The new context has the same allocation type as original.
1396 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1397 if (DotAllocContextIds.contains(V: OldId))
1398 DotAllocContextIds.insert(V: LastContextId);
1399 }
1400 return NewContextIds;
1401}
1402
1403template <typename DerivedCCG, typename FuncTy, typename CallTy>
1404void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1405 propagateDuplicateContextIds(
1406 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1407 // Build a set of duplicated context ids corresponding to the input id set.
1408 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1409 DenseSet<uint32_t> NewIds;
1410 for (auto Id : ContextIds)
1411 if (auto NewId = OldToNewContextIds.find(Val: Id);
1412 NewId != OldToNewContextIds.end())
1413 NewIds.insert_range(R: NewId->second);
1414 return NewIds;
1415 };
1416
1417 // Recursively update context ids sets along caller edges.
1418 auto UpdateCallers = [&](ContextNode *Node,
1419 DenseSet<const ContextEdge *> &Visited,
1420 auto &&UpdateCallers) -> void {
1421 for (const auto &Edge : Node->CallerEdges) {
1422 auto Inserted = Visited.insert(Edge.get());
1423 if (!Inserted.second)
1424 continue;
1425 ContextNode *NextNode = Edge->Caller;
1426 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1427 // Only need to recursively iterate to NextNode via this caller edge if
1428 // it resulted in any added ids to NextNode.
1429 if (!NewIdsToAdd.empty()) {
1430 Edge->getContextIds().insert_range(NewIdsToAdd);
1431 UpdateCallers(NextNode, Visited, UpdateCallers);
1432 }
1433 }
1434 };
1435
1436 DenseSet<const ContextEdge *> Visited;
1437 for (auto &Entry : AllocationCallToContextNodeMap) {
1438 auto *Node = Entry.second;
1439 UpdateCallers(Node, Visited, UpdateCallers);
1440 }
1441}
1442
1443template <typename DerivedCCG, typename FuncTy, typename CallTy>
1444void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1445 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1446 // This must be passed by value to make a copy since it will be adjusted
1447 // as ids are moved.
1448 DenseSet<uint32_t> RemainingContextIds) {
1449 auto &OrigEdges =
1450 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1451 DenseSet<uint32_t> RecursiveContextIds;
1452 DenseSet<uint32_t> AllCallerContextIds;
1453 if (AllowRecursiveCallsites) {
1454 // Identify which context ids are recursive which is needed to properly
1455 // update the RemainingContextIds set. The relevant recursive context ids
1456 // are those that are in multiple edges.
1457 for (auto &CE : OrigEdges) {
1458 AllCallerContextIds.reserve(Size: CE->getContextIds().size());
1459 for (auto Id : CE->getContextIds())
1460 if (!AllCallerContextIds.insert(Id).second)
1461 RecursiveContextIds.insert(Id);
1462 }
1463 }
1464 // Increment iterator in loop so that we can remove edges as needed.
1465 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1466 auto Edge = *EI;
1467 DenseSet<uint32_t> NewEdgeContextIds;
1468 DenseSet<uint32_t> NotFoundContextIds;
1469 // Remove any matching context ids from Edge, return set that were found and
1470 // removed, these are the new edge's context ids. Also update the remaining
1471 // (not found ids).
1472 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1473 NotFoundContextIds);
1474 // Update the remaining context ids set for the later edges. This is a
1475 // compile time optimization.
1476 if (RecursiveContextIds.empty()) {
1477 // No recursive ids, so all of the previously remaining context ids that
1478 // were not seen on this edge are the new remaining set.
1479 RemainingContextIds.swap(RHS&: NotFoundContextIds);
1480 } else {
1481 // Keep the recursive ids in the remaining set as we expect to see those
1482 // on another edge. We can remove the non-recursive remaining ids that
1483 // were seen on this edge, however. We already have the set of remaining
1484 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1485 // non-recursive and only remove those. Note that despite the higher
1486 // overhead of updating the remaining context ids set when recursion
1487 // handling is enabled, it was found to be at worst performance neutral
1488 // and in one case a clear win.
1489 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1490 set_difference(S1: NewEdgeContextIds, S2: RecursiveContextIds);
1491 set_subtract(S1&: RemainingContextIds, S2: NonRecursiveRemainingCurEdgeIds);
1492 }
1493 // If no matching context ids for this edge, skip it.
1494 if (NewEdgeContextIds.empty()) {
1495 ++EI;
1496 continue;
1497 }
1498 if (TowardsCallee) {
1499 uint8_t NewAllocType = computeAllocType(ContextIds&: NewEdgeContextIds);
1500 auto NewEdge = std::make_shared<ContextEdge>(
1501 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1502 NewNode->CalleeEdges.push_back(NewEdge);
1503 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1504 } else {
1505 uint8_t NewAllocType = computeAllocType(ContextIds&: NewEdgeContextIds);
1506 auto NewEdge = std::make_shared<ContextEdge>(
1507 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1508 NewNode->CallerEdges.push_back(NewEdge);
1509 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1510 }
1511 // Remove old edge if context ids empty.
1512 if (Edge->getContextIds().empty()) {
1513 removeEdgeFromGraph(Edge: Edge.get(), EI: &EI, CalleeIter: TowardsCallee);
1514 continue;
1515 }
1516 ++EI;
1517 }
1518}
1519
// Debug-build sanity check for a single context edge: every live edge must
// carry a concrete allocation type and at least one context id. Compiles to
// nothing in release (NDEBUG) builds since the body is assert-only.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkEdge(
    const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
  // Confirm that alloc type is not None and that we have at least one context
  // id.
  assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
  assert(!Edge->ContextIds.empty());
}
1528
// Debug-build sanity check for a context node: verifies that the node's
// context ids are consistent with the union of its caller edge ids and equal
// to the union of its callee edge ids (modulo recursion-related exceptions),
// and that there are no duplicate edges to the same callee. Optionally also
// validates each individual edge via checkEdge.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
                      bool CheckEdges = true) {
  if (Node->isRemoved())
    return;
#ifndef NDEBUG
  // Compute node's context ids once for use in asserts.
  auto NodeContextIds = Node->getContextIds();
#endif
  // Node's context ids should be the union of both its callee and caller edge
  // context ids.
  if (Node->CallerEdges.size()) {
    // Seed the union with the first edge's ids; note that the front edge only
    // seeds the union and is not itself passed to checkEdge here.
    DenseSet<uint32_t> CallerEdgeContextIds(
        Node->CallerEdges.front()->ContextIds);
    for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CallerEdgeContextIds, Edge->ContextIds);
    }
    // Node can have more context ids than callers if some contexts terminate at
    // node and some are longer. If we are allowing recursive callsites and
    // contexts this will be violated for incompletely cloned recursive cycles,
    // so skip the checking in that case.
    assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
           NodeContextIds == CallerEdgeContextIds ||
           set_is_subset(CallerEdgeContextIds, NodeContextIds));
  }
  if (Node->CalleeEdges.size()) {
    DenseSet<uint32_t> CalleeEdgeContextIds(
        Node->CalleeEdges.front()->ContextIds);
    for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CalleeEdgeContextIds, Edge->getContextIds());
    }
    // If we are allowing recursive callsites and contexts this will be violated
    // for incompletely cloned recursive cycles, so skip the checking in that
    // case.
    assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
           NodeContextIds == CalleeEdgeContextIds);
  }
  // FIXME: Since this checking is only invoked under an option, we should
  // change the error checking from using assert to something that will trigger
  // an error on a release build.
#ifndef NDEBUG
  // Make sure we don't end up with duplicate edges between the same caller and
  // callee.
  DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
  for (const auto &E : Node->CalleeEdges)
    NodeSet.insert(E->Callee);
  assert(NodeSet.size() == Node->CalleeEdges.size());
#endif
}
1582
1583template <typename DerivedCCG, typename FuncTy, typename CallTy>
1584void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1585 assignStackNodesPostOrder(
1586 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1587 DenseMap<uint64_t, std::vector<CallContextInfo>>
1588 &StackIdToMatchingCalls,
1589 DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1590 auto Inserted = Visited.insert(Node);
1591 if (!Inserted.second)
1592 return;
1593 // Post order traversal. Iterate over a copy since we may add nodes and
1594 // therefore new callers during the recursive call, invalidating any
1595 // iterator over the original edge vector. We don't need to process these
1596 // new nodes as they were already processed on creation.
1597 auto CallerEdges = Node->CallerEdges;
1598 for (auto &Edge : CallerEdges) {
1599 // Skip any that have been removed during the recursion.
1600 if (Edge->isRemoved()) {
1601 assert(!is_contained(Node->CallerEdges, Edge));
1602 continue;
1603 }
1604 assignStackNodesPostOrder(Node: Edge->Caller, Visited, StackIdToMatchingCalls,
1605 CallToMatchingCall);
1606 }
1607
1608 // If this node's stack id is in the map, update the graph to contain new
1609 // nodes representing any inlining at interior callsites. Note we move the
1610 // associated context ids over to the new nodes.
1611
1612 // Ignore this node if it is for an allocation or we didn't record any
1613 // stack id lists ending at it.
1614 if (Node->IsAllocation ||
1615 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1616 return;
1617
1618 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1619 // Handle the simple case first. A single call with a single stack id.
1620 // In this case there is no need to create any new context nodes, simply
1621 // assign the context node for stack id to this Call.
1622 if (Calls.size() == 1) {
1623 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1624 if (Ids.size() == 1) {
1625 assert(SavedContextIds.empty());
1626 // It should be this Node
1627 assert(Node == getNodeForStackId(Ids[0]));
1628 if (Node->Recursive)
1629 return;
1630 Node->setCall(Call);
1631 NonAllocationCallToContextNodeMap[Call] = Node;
1632 NodeToCallingFunc[Node] = Func;
1633 return;
1634 }
1635 }
1636
1637#ifndef NDEBUG
1638 // Find the node for the last stack id, which should be the same
1639 // across all calls recorded for this id, and is this node's id.
1640 uint64_t LastId = Node->OrigStackOrAllocId;
1641 ContextNode *LastNode = getNodeForStackId(LastId);
1642 // We should only have kept stack ids that had nodes.
1643 assert(LastNode);
1644 assert(LastNode == Node);
1645#else
1646 ContextNode *LastNode = Node;
1647#endif
1648
1649 // Compute the last node's context ids once, as it is shared by all calls in
1650 // this entry.
1651 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1652
1653 [[maybe_unused]] bool PrevIterCreatedNode = false;
1654 bool CreatedNode = false;
1655 for (unsigned I = 0; I < Calls.size();
1656 I++, PrevIterCreatedNode = CreatedNode) {
1657 CreatedNode = false;
1658 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1659 // Skip any for which we didn't assign any ids, these don't get a node in
1660 // the graph.
1661 if (SavedContextIds.empty()) {
1662 // If this call has a matching call (located in the same function and
1663 // having the same stack ids), simply add it to the context node created
1664 // for its matching call earlier. These can be treated the same through
1665 // cloning and get updated at the same time.
1666 if (!CallToMatchingCall.contains(Call))
1667 continue;
1668 auto MatchingCall = CallToMatchingCall[Call];
1669 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1670 // This should only happen if we had a prior iteration, and it didn't
1671 // create a node because of the below recomputation of context ids
1672 // finding none remaining and continuing early.
1673 assert(I > 0 && !PrevIterCreatedNode);
1674 continue;
1675 }
1676 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1677 Call);
1678 continue;
1679 }
1680
1681 assert(LastId == Ids.back());
1682
1683 // Recompute the context ids for this stack id sequence (the
1684 // intersection of the context ids of the corresponding nodes).
1685 // Start with the ids we saved in the map for this call, which could be
1686 // duplicated context ids. We have to recompute as we might have overlap
1687 // overlap between the saved context ids for different last nodes, and
1688 // removed them already during the post order traversal.
1689 set_intersect(SavedContextIds, LastNodeContextIds);
1690 ContextNode *PrevNode = LastNode;
1691 bool Skip = false;
1692 // Iterate backwards through the stack Ids, starting after the last Id
1693 // in the list, which was handled once outside for all Calls.
1694 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1695 auto Id = *IdIter;
1696 ContextNode *CurNode = getNodeForStackId(StackId: Id);
1697 // We should only have kept stack ids that had nodes and weren't
1698 // recursive.
1699 assert(CurNode);
1700 assert(!CurNode->Recursive);
1701
1702 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1703 if (!Edge) {
1704 Skip = true;
1705 break;
1706 }
1707 PrevNode = CurNode;
1708
1709 // Update the context ids, which is the intersection of the ids along
1710 // all edges in the sequence.
1711 set_intersect(SavedContextIds, Edge->getContextIds());
1712
1713 // If we now have no context ids for clone, skip this call.
1714 if (SavedContextIds.empty()) {
1715 Skip = true;
1716 break;
1717 }
1718 }
1719 if (Skip)
1720 continue;
1721
1722 // Create new context node.
1723 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, F: Func, C: Call);
1724 NonAllocationCallToContextNodeMap[Call] = NewNode;
1725 CreatedNode = true;
1726 NewNode->AllocTypes = computeAllocType(ContextIds&: SavedContextIds);
1727
1728 ContextNode *FirstNode = getNodeForStackId(StackId: Ids[0]);
1729 assert(FirstNode);
1730
1731 // Connect to callees of innermost stack frame in inlined call chain.
1732 // This updates context ids for FirstNode's callee's to reflect those
1733 // moved to NewNode.
1734 connectNewNode(NewNode, OrigNode: FirstNode, /*TowardsCallee=*/true, RemainingContextIds: SavedContextIds);
1735
1736 // Connect to callers of outermost stack frame in inlined call chain.
1737 // This updates context ids for FirstNode's caller's to reflect those
1738 // moved to NewNode.
1739 connectNewNode(NewNode, OrigNode: LastNode, /*TowardsCallee=*/false, RemainingContextIds: SavedContextIds);
1740
1741 // Now we need to remove context ids from edges/nodes between First and
1742 // Last Node.
1743 PrevNode = nullptr;
1744 for (auto Id : Ids) {
1745 ContextNode *CurNode = getNodeForStackId(StackId: Id);
1746 // We should only have kept stack ids that had nodes.
1747 assert(CurNode);
1748
1749 // Remove the context ids moved to NewNode from CurNode, and the
1750 // edge from the prior node.
1751 if (PrevNode) {
1752 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1753 // If the sequence contained recursion, we might have already removed
1754 // some edges during the connectNewNode calls above.
1755 if (!PrevEdge) {
1756 PrevNode = CurNode;
1757 continue;
1758 }
1759 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1760 if (PrevEdge->getContextIds().empty())
1761 removeEdgeFromGraph(Edge: PrevEdge);
1762 }
1763 // Since we update the edges from leaf to tail, only look at the callee
1764 // edges. This isn't an alloc node, so if there are no callee edges, the
1765 // alloc type is None.
1766 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1767 ? (uint8_t)AllocationType::None
1768 : CurNode->computeAllocType();
1769 PrevNode = CurNode;
1770 }
1771 if (VerifyNodes) {
1772 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1773 for (auto Id : Ids) {
1774 ContextNode *CurNode = getNodeForStackId(StackId: Id);
1775 // We should only have kept stack ids that had nodes.
1776 assert(CurNode);
1777 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1778 }
1779 }
1780 }
1781}
1782
1783template <typename DerivedCCG, typename FuncTy, typename CallTy>
1784void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1785 // Map of stack id to all calls with that as the last (outermost caller)
1786 // callsite id that has a context node (some might not due to pruning
1787 // performed during matching of the allocation profile contexts).
1788 // The CallContextInfo contains the Call and a list of its stack ids with
1789 // ContextNodes, the function containing Call, and the set of context ids
1790 // the analysis will eventually identify for use in any new node created
1791 // for that callsite.
1792 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1793 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1794 for (auto &Call : CallsWithMetadata) {
1795 // Ignore allocations, already handled.
1796 if (AllocationCallToContextNodeMap.count(Call))
1797 continue;
1798 auto StackIdsWithContextNodes =
1799 getStackIdsWithContextNodesForCall(Call: Call.call());
1800 // If there were no nodes created for MIBs on allocs (maybe this was in
1801 // the unambiguous part of the MIB stack that was pruned), ignore.
1802 if (StackIdsWithContextNodes.empty())
1803 continue;
1804 // Otherwise, record this Call along with the list of ids for the last
1805 // (outermost caller) stack id with a node.
1806 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1807 {Call.call(), StackIdsWithContextNodes, Func, {}});
1808 }
1809 }
1810
1811 // First make a pass through all stack ids that correspond to a call,
1812 // as identified in the above loop. Compute the context ids corresponding to
1813 // each of these calls when they correspond to multiple stack ids due to
1814 // due to inlining. Perform any duplication of context ids required when
1815 // there is more than one call with the same stack ids. Their (possibly newly
1816 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1817 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1818 // Save a map from each call to any that are found to match it. I.e. located
1819 // in the same function and have the same (possibly pruned) stack ids. We use
1820 // this to avoid creating extra graph nodes as they can be treated the same.
1821 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1822 for (auto &It : StackIdToMatchingCalls) {
1823 auto &Calls = It.getSecond();
1824 // Skip single calls with a single stack id. These don't need a new node.
1825 if (Calls.size() == 1) {
1826 auto &Ids = Calls[0].StackIds;
1827 if (Ids.size() == 1)
1828 continue;
1829 }
1830 // In order to do the best and maximal matching of inlined calls to context
1831 // node sequences we will sort the vectors of stack ids in descending order
1832 // of length, and within each length, lexicographically by stack id. The
1833 // latter is so that we can specially handle calls that have identical stack
1834 // id sequences (either due to cloning or artificially because of the MIB
1835 // context pruning). Those with the same Ids are then sorted by function to
1836 // facilitate efficiently mapping them to the same context node.
1837 // Because the functions are pointers, to ensure a stable sort first assign
1838 // each function pointer to its first index in the Calls array, and then use
1839 // that to sort by.
1840 DenseMap<const FuncTy *, unsigned> FuncToIndex;
1841 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
1842 FuncToIndex.insert({CallCtxInfo.Func, Idx});
1843 llvm::stable_sort(
1844 Calls,
1845 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
1846 return A.StackIds.size() > B.StackIds.size() ||
1847 (A.StackIds.size() == B.StackIds.size() &&
1848 (A.StackIds < B.StackIds ||
1849 (A.StackIds == B.StackIds &&
1850 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
1851 });
1852
1853 // Find the node for the last stack id, which should be the same
1854 // across all calls recorded for this id, and is the id for this
1855 // entry in the StackIdToMatchingCalls map.
1856 uint64_t LastId = It.getFirst();
1857 ContextNode *LastNode = getNodeForStackId(StackId: LastId);
1858 // We should only have kept stack ids that had nodes.
1859 assert(LastNode);
1860
1861 if (LastNode->Recursive)
1862 continue;
1863
1864 // Initialize the context ids with the last node's. We will subsequently
1865 // refine the context ids by computing the intersection along all edges.
1866 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1867 assert(!LastNodeContextIds.empty());
1868
1869#ifndef NDEBUG
1870 // Save the set of functions seen for a particular set of the same stack
1871 // ids. This is used to ensure that they have been correctly sorted to be
1872 // adjacent in the Calls list, since we rely on that to efficiently place
1873 // all such matching calls onto the same context node.
1874 DenseSet<const FuncTy *> MatchingIdsFuncSet;
1875#endif
1876
1877 for (unsigned I = 0; I < Calls.size(); I++) {
1878 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1879 assert(SavedContextIds.empty());
1880 assert(LastId == Ids.back());
1881
1882#ifndef NDEBUG
1883 // If this call has a different set of ids than the last one, clear the
1884 // set used to ensure they are sorted properly.
1885 if (I > 0 && Ids != Calls[I - 1].StackIds)
1886 MatchingIdsFuncSet.clear();
1887#endif
1888
1889 // First compute the context ids for this stack id sequence (the
1890 // intersection of the context ids of the corresponding nodes).
1891 // Start with the remaining saved ids for the last node.
1892 assert(!LastNodeContextIds.empty());
1893 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1894
1895 ContextNode *PrevNode = LastNode;
1896 ContextNode *CurNode = LastNode;
1897 bool Skip = false;
1898
1899 // Iterate backwards through the stack Ids, starting after the last Id
1900 // in the list, which was handled once outside for all Calls.
1901 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1902 auto Id = *IdIter;
1903 CurNode = getNodeForStackId(StackId: Id);
1904 // We should only have kept stack ids that had nodes.
1905 assert(CurNode);
1906
1907 if (CurNode->Recursive) {
1908 Skip = true;
1909 break;
1910 }
1911
1912 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1913 // If there is no edge then the nodes belong to different MIB contexts,
1914 // and we should skip this inlined context sequence. For example, this
1915 // particular inlined context may include stack ids A->B, and we may
1916 // indeed have nodes for both A and B, but it is possible that they were
1917 // never profiled in sequence in a single MIB for any allocation (i.e.
1918 // we might have profiled an allocation that involves the callsite A,
1919 // but through a different one of its callee callsites, and we might
1920 // have profiled an allocation that involves callsite B, but reached
1921 // from a different caller callsite).
1922 if (!Edge) {
1923 Skip = true;
1924 break;
1925 }
1926 PrevNode = CurNode;
1927
1928 // Update the context ids, which is the intersection of the ids along
1929 // all edges in the sequence.
1930 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1931
1932 // If we now have no context ids for clone, skip this call.
1933 if (StackSequenceContextIds.empty()) {
1934 Skip = true;
1935 break;
1936 }
1937 }
1938 if (Skip)
1939 continue;
1940
1941 // If some of this call's stack ids did not have corresponding nodes (due
1942 // to pruning), don't include any context ids for contexts that extend
1943 // beyond these nodes. Otherwise we would be matching part of unrelated /
1944 // not fully matching stack contexts. To do this, subtract any context ids
1945 // found in caller nodes of the last node found above.
1946 if (Ids.back() != getLastStackId(Call)) {
1947 for (const auto &PE : LastNode->CallerEdges) {
1948 set_subtract(StackSequenceContextIds, PE->getContextIds());
1949 if (StackSequenceContextIds.empty())
1950 break;
1951 }
1952 // If we now have no context ids for clone, skip this call.
1953 if (StackSequenceContextIds.empty())
1954 continue;
1955 }
1956
1957#ifndef NDEBUG
1958 // If the prior call had the same stack ids this set would not be empty.
1959 // Check if we already have a call that "matches" because it is located
1960 // in the same function. If the Calls list was sorted properly we should
1961 // not encounter this situation as all such entries should be adjacent
1962 // and processed in bulk further below.
1963 assert(!MatchingIdsFuncSet.contains(Func));
1964
1965 MatchingIdsFuncSet.insert(Func);
1966#endif
1967
1968 // Check if the next set of stack ids is the same (since the Calls vector
1969 // of tuples is sorted by the stack ids we can just look at the next one).
1970 // If so, save them in the CallToMatchingCall map so that they get
1971 // assigned to the same context node, and skip them.
1972 bool DuplicateContextIds = false;
1973 for (unsigned J = I + 1; J < Calls.size(); J++) {
1974 auto &CallCtxInfo = Calls[J];
1975 auto &NextIds = CallCtxInfo.StackIds;
1976 if (NextIds != Ids)
1977 break;
1978 auto *NextFunc = CallCtxInfo.Func;
1979 if (NextFunc != Func) {
1980 // We have another Call with the same ids but that cannot share this
1981 // node, must duplicate ids for it.
1982 DuplicateContextIds = true;
1983 break;
1984 }
1985 auto &NextCall = CallCtxInfo.Call;
1986 CallToMatchingCall[NextCall] = Call;
1987 // Update I so that it gets incremented correctly to skip this call.
1988 I = J;
1989 }
1990
1991 // If we don't have duplicate context ids, then we can assign all the
1992 // context ids computed for the original node sequence to this call.
1993 // If there are duplicate calls with the same stack ids then we synthesize
1994 // new context ids that are duplicates of the originals. These are
1995 // assigned to SavedContextIds, which is a reference into the map entry
1996 // for this call, allowing us to access these ids later on.
1997 OldToNewContextIds.reserve(NumEntries: OldToNewContextIds.size() +
1998 StackSequenceContextIds.size());
1999 SavedContextIds =
2000 DuplicateContextIds
2001 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2002 : StackSequenceContextIds;
2003 assert(!SavedContextIds.empty());
2004
2005 if (!DuplicateContextIds) {
2006 // Update saved last node's context ids to remove those that are
2007 // assigned to other calls, so that it is ready for the next call at
2008 // this stack id.
2009 set_subtract(S1&: LastNodeContextIds, S2: StackSequenceContextIds);
2010 if (LastNodeContextIds.empty())
2011 break;
2012 }
2013 }
2014 }
2015
2016 // Propagate the duplicate context ids over the graph.
2017 propagateDuplicateContextIds(OldToNewContextIds);
2018
2019 if (VerifyCCG)
2020 check();
2021
2022 // Now perform a post-order traversal over the graph, starting with the
2023 // allocation nodes, essentially processing nodes from callers to callees.
2024 // For any that contains an id in the map, update the graph to contain new
2025 // nodes representing any inlining at interior callsites. Note we move the
2026 // associated context ids over to the new nodes.
2027 DenseSet<const ContextNode *> Visited;
2028 for (auto &Entry : AllocationCallToContextNodeMap)
2029 assignStackNodesPostOrder(Node: Entry.second, Visited, StackIdToMatchingCalls,
2030 CallToMatchingCall);
2031 if (VerifyCCG)
2032 check();
2033}
2034
2035uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2036 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2037 Call->getMetadata(KindID: LLVMContext::MD_callsite));
2038 return CallsiteContext.back();
2039}
2040
2041uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2042 assert(isa<CallsiteInfo *>(Call));
2043 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2044 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Val&: Call));
2045 // Need to convert index into stack id.
2046 return Index.getStackIdAtIndex(Index: CallsiteContext.back());
2047}
2048
// Suffix appended (with the clone number) to the names of function clones
// created by this pass; also used to recognize such clones by name.
static const std::string MemProfCloneSuffix = ".memprof.";
2050
2051static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2052 // We use CloneNo == 0 to refer to the original version, which doesn't get
2053 // renamed with a suffix.
2054 if (!CloneNo)
2055 return Base.str();
2056 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2057}
2058
2059static bool isMemProfClone(const Function &F) {
2060 return F.getName().contains(Other: MemProfCloneSuffix);
2061}
2062
2063std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2064 const Instruction *Call,
2065 unsigned CloneNo) const {
2066 return (Twine(Call->getFunction()->getName()) + " -> " +
2067 cast<CallBase>(Val: Call)->getCalledFunction()->getName())
2068 .str();
2069}
2070
2071std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2072 const IndexCall &Call,
2073 unsigned CloneNo) const {
2074 auto VI = FSToVIMap.find(x: Func);
2075 assert(VI != FSToVIMap.end());
2076 if (isa<AllocInfo *>(Val: Call))
2077 return (VI->second.name() + " -> alloc").str();
2078 else {
2079 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Val: Call);
2080 return (VI->second.name() + " -> " +
2081 getMemProfFuncName(Base: Callsite->Callee.name(),
2082 CloneNo: Callsite->Clones[CloneNo]))
2083 .str();
2084 }
2085}
2086
2087std::vector<uint64_t>
2088ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2089 Instruction *Call) {
2090 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2091 Call->getMetadata(KindID: LLVMContext::MD_callsite));
2092 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2093 CallsiteContext);
2094}
2095
2096std::vector<uint64_t>
2097IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2098 assert(isa<CallsiteInfo *>(Call));
2099 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2100 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Val&: Call));
2101 return getStackIdsWithContextNodes<CallsiteInfo,
2102 SmallVector<unsigned>::const_iterator>(
2103 CallsiteContext);
2104}
2105
2106template <typename DerivedCCG, typename FuncTy, typename CallTy>
2107template <class NodeT, class IteratorT>
2108std::vector<uint64_t>
2109CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2110 CallStack<NodeT, IteratorT> &CallsiteContext) {
2111 std::vector<uint64_t> StackIds;
2112 for (auto IdOrIndex : CallsiteContext) {
2113 auto StackId = getStackId(IdOrIndex);
2114 ContextNode *Node = getNodeForStackId(StackId);
2115 if (!Node)
2116 break;
2117 StackIds.push_back(StackId);
2118 }
2119 return StackIds;
2120}
2121
2122ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2123 Module &M,
2124 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2125 : Mod(M), OREGetter(OREGetter) {
2126 for (auto &F : M) {
2127 std::vector<CallInfo> CallsWithMetadata;
2128 for (auto &BB : F) {
2129 for (auto &I : BB) {
2130 if (!isa<CallBase>(Val: I))
2131 continue;
2132 if (auto *MemProfMD = I.getMetadata(KindID: LLVMContext::MD_memprof)) {
2133 CallsWithMetadata.push_back(x: &I);
2134 auto *AllocNode = addAllocNode(Call: &I, F: &F);
2135 auto *CallsiteMD = I.getMetadata(KindID: LLVMContext::MD_callsite);
2136 assert(CallsiteMD);
2137 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2138 // Add all of the MIBs and their stack nodes.
2139 for (auto &MDOp : MemProfMD->operands()) {
2140 auto *MIBMD = cast<const MDNode>(Val: MDOp);
2141 std::vector<ContextTotalSize> ContextSizeInfo;
2142 // Collect the context size information if it exists.
2143 if (MIBMD->getNumOperands() > 2) {
2144 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2145 MDNode *ContextSizePair =
2146 dyn_cast<MDNode>(Val: MIBMD->getOperand(I));
2147 assert(ContextSizePair->getNumOperands() == 2);
2148 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2149 MD: ContextSizePair->getOperand(I: 0))
2150 ->getZExtValue();
2151 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2152 MD: ContextSizePair->getOperand(I: 1))
2153 ->getZExtValue();
2154 ContextSizeInfo.push_back(x: {.FullStackId: FullStackId, .TotalSize: TotalSize});
2155 }
2156 }
2157 MDNode *StackNode = getMIBStackNode(MIB: MIBMD);
2158 assert(StackNode);
2159 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2160 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2161 AllocNode, StackContext, CallsiteContext,
2162 AllocType: getMIBAllocType(MIB: MIBMD), ContextSizeInfo);
2163 }
2164 // If exporting the graph to dot and an allocation id of interest was
2165 // specified, record all the context ids for this allocation node.
2166 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2167 DotAllocContextIds = AllocNode->getContextIds();
2168 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2169 // Memprof and callsite metadata on memory allocations no longer
2170 // needed.
2171 I.setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
2172 I.setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
2173 }
2174 // For callsite metadata, add to list for this function for later use.
2175 else if (I.getMetadata(KindID: LLVMContext::MD_callsite)) {
2176 CallsWithMetadata.push_back(x: &I);
2177 }
2178 }
2179 }
2180 if (!CallsWithMetadata.empty())
2181 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2182 }
2183
2184 if (DumpCCG) {
2185 dbgs() << "CCG before updating call stack chains:\n";
2186 dbgs() << *this;
2187 }
2188
2189 if (ExportToDot)
2190 exportToDot(Label: "prestackupdate");
2191
2192 updateStackNodes();
2193
2194 if (ExportToDot)
2195 exportToDot(Label: "poststackupdate");
2196
2197 handleCallsitesWithMultipleTargets();
2198
2199 markBackedges();
2200
2201 // Strip off remaining callsite metadata, no longer needed.
2202 for (auto &FuncEntry : FuncToCallsWithMetadata)
2203 for (auto &Call : FuncEntry.second)
2204 Call.call()->setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
2205}
2206
// Builds the callsite context graph from a ThinLTO combined summary index.
// For each prevailing function summary, creates allocation context nodes (one
// per allocation record, with stack nodes for each MIB context) and records
// callsite summaries for later matching, then runs the common
// post-construction passes (stack chain updates, multi-target callsite
// handling, and backedge marking).
IndexCallsiteContextGraph::IndexCallsiteContextGraph(
    ModuleSummaryIndex &Index,
    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
        isPrevailing)
    : Index(Index), isPrevailing(isPrevailing) {
  for (auto &I : Index) {
    auto VI = Index.getValueInfo(R: I);
    for (auto &S : VI.getSummaryList()) {
      // We should only add the prevailing nodes. Otherwise we may try to clone
      // in a weak copy that won't be linked (and may be different than the
      // prevailing version).
      // We only keep the memprof summary on the prevailing copy now when
      // building the combined index, as a space optimization, however don't
      // rely on this optimization. The linker doesn't resolve local linkage
      // values so don't check whether those are prevailing.
      if (!GlobalValue::isLocalLinkage(Linkage: S->linkage()) &&
          !isPrevailing(VI.getGUID(), S.get()))
        continue;
      auto *FS = dyn_cast<FunctionSummary>(Val: S.get());
      if (!FS)
        continue;
      std::vector<CallInfo> CallsWithMetadata;
      if (!FS->allocs().empty()) {
        for (auto &AN : FS->mutableAllocs()) {
          // This can happen because of recursion elimination handling that
          // currently exists in ModuleSummaryAnalysis. Skip these for now.
          // We still added them to the summary because we need to be able to
          // correlate properly in applyImport in the backends.
          if (AN.MIBs.empty())
            continue;
          IndexCall AllocCall(&AN);
          CallsWithMetadata.push_back(x: AllocCall);
          auto *AllocNode = addAllocNode(Call: AllocCall, F: FS);
          // Pass an empty CallStack to the CallsiteContext (second)
          // parameter, since for ThinLTO we already collapsed out the inlined
          // stack ids on the allocation call during ModuleSummaryAnalysis.
          CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
              EmptyContext;
          // Index into AN.ContextSizeInfos, which (when non-empty) parallels
          // the AN.MIBs vector.
          unsigned I = 0;
          assert(!metadataMayIncludeContextSizeInfo() ||
                 AN.ContextSizeInfos.size() == AN.MIBs.size());
          // Now add all of the MIBs and their stack nodes.
          for (auto &MIB : AN.MIBs) {
            CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
                StackContext(&MIB);
            std::vector<ContextTotalSize> ContextSizeInfo;
            if (!AN.ContextSizeInfos.empty()) {
              for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
                ContextSizeInfo.push_back(x: {.FullStackId: FullStackId, .TotalSize: TotalSize});
            }
            addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
                AllocNode, StackContext, CallsiteContext&: EmptyContext, AllocType: MIB.AllocType,
                ContextSizeInfo);
            I++;
          }
          // If exporting the graph to dot and an allocation id of interest was
          // specified, record all the context ids for this allocation node.
          if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
            DotAllocContextIds = AllocNode->getContextIds();
          assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
          // Initialize version 0 on the summary alloc node to the current alloc
          // type, unless it has both types in which case make it default, so
          // that in the case where we aren't able to clone the original version
          // always ends up with the default allocation behavior.
          AN.Versions[0] = (uint8_t)allocTypeToUse(AllocTypes: AllocNode->AllocTypes);
        }
      }
      // For callsite metadata, add to list for this function for later use.
      if (!FS->callsites().empty())
        for (auto &SN : FS->mutableCallsites()) {
          IndexCall StackNodeCall(&SN);
          CallsWithMetadata.push_back(x: StackNodeCall);
        }

      if (!CallsWithMetadata.empty())
        FuncToCallsWithMetadata[FS] = CallsWithMetadata;

      // Record the summary-to-ValueInfo mapping for any function that
      // contributed nodes to the graph, for use when matching callees later.
      if (!FS->allocs().empty() || !FS->callsites().empty())
        FSToVIMap[FS] = VI;
    }
  }

  if (DumpCCG) {
    dbgs() << "CCG before updating call stack chains:\n";
    dbgs() << *this;
  }

  if (ExportToDot)
    exportToDot(Label: "prestackupdate");

  updateStackNodes();

  if (ExportToDot)
    exportToDot(Label: "poststackupdate");

  handleCallsitesWithMultipleTargets();

  markBackedges();
}
2306
// Finds callsite nodes whose recorded calls target multiple functions and
// works around them: first by partitioning calls by callee function (needed
// for synthesized indirect call targets in ThinLTO), then by matching the
// remaining calls against the callee nodes (looking through tail calls),
// pruning any call that cannot be matched against all callee nodes.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy,
                          CallTy>::handleCallsitesWithMultipleTargets() {
  // Look for and workaround callsites that call multiple functions.
  // This can happen for indirect calls, which needs better handling, and in
  // more rare cases (e.g. macro expansion).
  // TODO: To fix this for indirect calls we will want to perform speculative
  // devirtualization using either the normal PGO info with ICP, or using the
  // information in the profiled MemProf contexts. We can do this prior to
  // this transformation for regular LTO, and for ThinLTO we can simulate that
  // effect in the summary and perform the actual speculative devirtualization
  // while cloning in the ThinLTO backend.

  // Keep track of the new nodes synthesized for discovered tail calls missing
  // from the profiled contexts.
  MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;

  // Nodes whose primary call changed; their map entries are refreshed after
  // the loop (we must not mutate the map during iteration).
  std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
  for (auto &Entry : NonAllocationCallToContextNodeMap) {
    auto *Node = Entry.second;
    assert(Node->Clones.empty());
    // Check all node callees and see if in the same function.
    // We need to check all of the calls recorded in this Node, because in some
    // cases we may have had multiple calls with the same debug info calling
    // different callees. This can happen, for example, when an object is
    // constructed in the parameter list - the destructor call of the object has
    // the same debug info (line/col) as the call the object was passed to.
    // Here we will prune any that don't match all callee nodes.
    std::vector<CallInfo> AllCalls;
    AllCalls.reserve(Node->MatchingCalls.size() + 1);
    AllCalls.push_back(Node->Call);
    llvm::append_range(AllCalls, Node->MatchingCalls);

    // First see if we can partition the calls by callee function, creating new
    // nodes to host each set of calls calling the same callees. This is
    // necessary for support indirect calls with ThinLTO, for which we
    // synthesized CallsiteInfo records for each target. They will all have the
    // same callsite stack ids and would be sharing a context node at this
    // point. We need to perform separate cloning for each, which will be
    // applied along with speculative devirtualization in the ThinLTO backends
    // as needed. Note this does not currently support looking through tail
    // calls, it is unclear if we need that for indirect call targets.
    // First partition calls by callee func. Map indexed by func, value is
    // struct with list of matching calls, assigned node.
    if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
      continue;

    auto It = AllCalls.begin();
    // Iterate through the calls until we find the first that matches.
    for (; It != AllCalls.end(); ++It) {
      auto ThisCall = *It;
      bool Match = true;
      for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
           ++EI) {
        auto Edge = *EI;
        if (!Edge->Callee->hasCall())
          continue;
        assert(NodeToCallingFunc.count(Edge->Callee));
        // Check if the called function matches that of the callee node.
        if (!calleesMatch(Call: ThisCall.call(), EI, TailCallToContextNodeMap)) {
          Match = false;
          break;
        }
      }
      // Found a call that matches the callee nodes, we can quit now.
      if (Match) {
        // If the first match is not the primary call on the Node, update it
        // now. We will update the list of matching calls further below.
        if (Node->Call != ThisCall) {
          Node->setCall(ThisCall);
          // We need to update the NonAllocationCallToContextNodeMap, but don't
          // want to do this during iteration over that map, so save the calls
          // that need updated entries.
          NewCallToNode.push_back({ThisCall, Node});
        }
        break;
      }
    }
    // We will update this list below (or leave it cleared if there was no
    // match found above).
    Node->MatchingCalls.clear();
    // If we hit the end of the AllCalls vector, no call matching the callee
    // nodes was found, clear the call information in the node.
    if (It == AllCalls.end()) {
      RemovedEdgesWithMismatchedCallees++;
      // Work around by setting Node to have a null call, so it gets
      // skipped during cloning. Otherwise assignFunctions will assert
      // because its data structures are not designed to handle this case.
      Node->setCall(CallInfo());
      continue;
    }
    // Now add back any matching calls that call the same function as the
    // matching primary call on Node.
    for (++It; It != AllCalls.end(); ++It) {
      auto ThisCall = *It;
      if (!sameCallee(Call1: Node->Call.call(), Call2: ThisCall.call()))
        continue;
      Node->MatchingCalls.push_back(ThisCall);
    }
  }

  // Remove all mismatched nodes identified in the above loop from the node map
  // (checking whether they have a null call which is set above). For a
  // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
  // to do the removal via remove_if than by individually erasing entries above.
  // Also remove any entries if we updated the node's primary call above.
  NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
    return !it.second->hasCall() || it.second->Call != it.first;
  });

  // Add entries for any new primary calls recorded above.
  for (auto &[Call, Node] : NewCallToNode)
    NonAllocationCallToContextNodeMap[Call] = Node;

  // Add the new nodes after the above loop so that the iteration is not
  // invalidated.
  for (auto &[Call, Node] : TailCallToContextNodeMap)
    NonAllocationCallToContextNodeMap[Call] = Node;
}
2426
// Attempts to partition the calls recorded on \p Node (passed in \p AllCalls)
// by callee function, moving each group's matching callee edges onto its own
// context node (the first group reuses \p Node itself). Callee edges matching
// none of the calls are moved to a node with a null call so they are ignored
// during cloning. Returns true if partitioning was performed, or false if no
// call matched any callee node (so the caller can attempt the tail call
// handling instead). New primary call-to-node assignments are appended to
// \p NewCallToNode for the caller to record in its map after iteration.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
    ContextNode *Node, ArrayRef<CallInfo> AllCalls,
    std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
  // Struct to keep track of all the calls having the same callee function,
  // and the node we eventually assign to them. Eventually we will record the
  // context node assigned to this group of calls.
  struct CallsWithSameCallee {
    std::vector<CallInfo> Calls;
    ContextNode *Node = nullptr;
  };

  // First partition calls by callee function. Build map from each function
  // to the list of matching calls.
  DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
  for (auto ThisCall : AllCalls) {
    auto *F = getCalleeFunc(Call: ThisCall.call());
    if (F)
      CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
  }

  // Next, walk through all callee edges. For each callee node, get its
  // containing function and see if it was recorded in the above map (meaning we
  // have at least one matching call). Build another map from each callee node
  // with a matching call to the structure instance created above containing all
  // the calls.
  DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
  for (const auto &Edge : Node->CalleeEdges) {
    if (!Edge->Callee->hasCall())
      continue;
    const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
    if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
      CalleeNodeToCallInfo[Edge->Callee] =
          &CalleeFuncToCallInfo[ProfiledCalleeFunc];
  }

  // If there are no entries in the second map, then there were no matching
  // calls/callees, nothing to do here. Return so we can go to the handling that
  // looks through tail calls.
  if (CalleeNodeToCallInfo.empty())
    return false;

  // Walk through all callee edges again. Any and all callee edges that didn't
  // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
  // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
  // ignored during cloning. If it is in the map, then we use the node recorded
  // in that entry (creating it if needed), and move the callee edge to it.
  // The first callee will use the original node instead of creating a new one.
  // Note that any of the original calls on this node (in AllCalls) that didn't
  // have a callee function automatically get dropped from the node as part of
  // this process.
  ContextNode *UnmatchedCalleesNode = nullptr;
  // Track whether we already assigned original node to a callee.
  bool UsedOrigNode = false;
  assert(NodeToCallingFunc[Node]);
  // Iterate over a copy of Node's callee edges, since we may need to remove
  // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
  // makes it less error-prone.
  auto CalleeEdges = Node->CalleeEdges;
  for (auto &Edge : CalleeEdges) {
    if (!Edge->Callee->hasCall())
      continue;

    // Will be updated below to point to whatever (caller) node this callee edge
    // should be moved to.
    ContextNode *CallerNodeToUse = nullptr;

    // Handle the case where there were no matching calls first. Move this
    // callee edge to the UnmatchedCalleesNode, creating it if needed.
    if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
      if (!UnmatchedCalleesNode)
        UnmatchedCalleesNode =
            createNewNode(/*IsAllocation=*/false, F: NodeToCallingFunc[Node]);
      CallerNodeToUse = UnmatchedCalleesNode;
    } else {
      // Look up the information recorded for this callee node, and use the
      // recorded caller node (creating it if needed).
      auto *Info = CalleeNodeToCallInfo[Edge->Callee];
      if (!Info->Node) {
        // If we haven't assigned any callees to the original node use it.
        if (!UsedOrigNode) {
          Info->Node = Node;
          // Clear the set of matching calls which will be updated below.
          Node->MatchingCalls.clear();
          UsedOrigNode = true;
        } else
          Info->Node =
              createNewNode(/*IsAllocation=*/false, F: NodeToCallingFunc[Node]);
        assert(!Info->Calls.empty());
        // The first call becomes the primary call for this caller node, and the
        // rest go in the matching calls list.
        Info->Node->setCall(Info->Calls.front());
        llvm::append_range(Info->Node->MatchingCalls,
                           llvm::drop_begin(Info->Calls));
        // Save the primary call to node correspondence so that we can update
        // the NonAllocationCallToContextNodeMap, which is being iterated in the
        // caller of this function.
        NewCallToNode.push_back({Info->Node->Call, Info->Node});
      }
      CallerNodeToUse = Info->Node;
    }

    // Don't need to move edge if we are using the original node;
    if (CallerNodeToUse == Node)
      continue;

    moveCalleeEdgeToNewCaller(Edge, NewCaller: CallerNodeToUse);
  }
  // Now that we are done moving edges, clean up any caller edges that ended
  // up with no type or context ids. During moveCalleeEdgeToNewCaller all
  // caller edges from Node are replicated onto the new callers, and it
  // simplifies the handling to leave them until we have moved all
  // edges/context ids.
  for (auto &I : CalleeNodeToCallInfo)
    removeNoneTypeCallerEdges(Node: I.second->Node);
  if (UnmatchedCalleesNode)
    removeNoneTypeCallerEdges(Node: UnmatchedCalleesNode);
  removeNoneTypeCallerEdges(Node);

  return true;
}
2548
2549uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2550 // In the Module (IR) case this is already the Id.
2551 return IdOrIndex;
2552}
2553
2554uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2555 // In the Index case this is an index into the stack id list in the summary
2556 // index, convert it to an Id.
2557 return Index.getStackIdAtIndex(Index: IdOrIndex);
2558}
2559
// Checks whether \p Call targets the function of the callee node referenced
// by edge iterator \p EI. If the profiled callee is only reachable through a
// unique chain of tail calls, synthesizes context nodes for the discovered
// tail calls (recorded in \p TailCallToContextNodeMap), splices them into the
// graph in place of the original edge, and removes that edge. On return \p EI
// is positioned so that the caller's loop increment visits the next edge.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
    CallTy Call, EdgeIter &EI,
    MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
  auto Edge = *EI;
  const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
  const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
  // Will be populated in order of callee to caller if we find a chain of tail
  // calls between the profiled caller and callee.
  std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
  if (!calleeMatchesFunc(Call, Func: ProfiledCalleeFunc, CallerFunc,
                         FoundCalleeChain))
    return false;

  // The usual case where the profiled callee matches that of the IR/summary.
  if (FoundCalleeChain.empty())
    return true;

  // Adds (or merges into) an edge from Callee to Caller, taking care to keep
  // the iterator EI valid when inserting into the current caller's edge list.
  auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
    auto *CurEdge = Callee->findEdgeFromCaller(Caller);
    // If there is already an edge between these nodes, simply update it and
    // return.
    if (CurEdge) {
      CurEdge->ContextIds.insert_range(Edge->ContextIds);
      CurEdge->AllocTypes |= Edge->AllocTypes;
      return;
    }
    // Otherwise, create a new edge and insert it into the caller and callee
    // lists.
    auto NewEdge = std::make_shared<ContextEdge>(
        Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
    Callee->CallerEdges.push_back(NewEdge);
    if (Caller == Edge->Caller) {
      // If we are inserting the new edge into the current edge's caller, insert
      // the new edge before the current iterator position, and then increment
      // back to the current edge.
      EI = Caller->CalleeEdges.insert(EI, NewEdge);
      ++EI;
      assert(*EI == Edge &&
             "Iterator position not restored after insert and increment");
    } else
      Caller->CalleeEdges.push_back(NewEdge);
  };

  // Create new nodes for each found callee and connect in between the profiled
  // caller and callee.
  auto *CurCalleeNode = Edge->Callee;
  for (auto &[NewCall, Func] : FoundCalleeChain) {
    ContextNode *NewNode = nullptr;
    // First check if we have already synthesized a node for this tail call.
    if (TailCallToContextNodeMap.count(NewCall)) {
      NewNode = TailCallToContextNodeMap[NewCall];
      NewNode->AllocTypes |= Edge->AllocTypes;
    } else {
      FuncToCallsWithMetadata[Func].push_back({NewCall});
      // Create Node and record node info.
      NewNode = createNewNode(/*IsAllocation=*/false, F: Func, C: NewCall);
      TailCallToContextNodeMap[NewCall] = NewNode;
      NewNode->AllocTypes = Edge->AllocTypes;
    }

    // Hook up node to its callee node
    AddEdge(NewNode, CurCalleeNode);

    CurCalleeNode = NewNode;
  }

  // Hook up edge's original caller to new callee node.
  AddEdge(Edge->Caller, CurCalleeNode);

#ifndef NDEBUG
  // Save this because Edge's fields get cleared below when removed.
  auto *Caller = Edge->Caller;
#endif

  // Remove old edge
  removeEdgeFromGraph(Edge: Edge.get(), EI: &EI, /*CalleeIter=*/true);

  // To simplify the increment of EI in the caller, subtract one from EI.
  // In the final AddEdge call we would have either added a new callee edge,
  // to Edge->Caller, or found an existing one. Either way we are guaranteed
  // that there is at least one callee edge.
  assert(!Caller->CalleeEdges.empty());
  --EI;

  return true;
}
2647
// Recursively searches for a chain of tail calls from \p CurCallee to
// \p ProfiledCallee (up to TailCallSearchDepth deep), appending the discovered
// callsites to \p FoundCalleeChain in callee-to-caller order. Returns true
// only when exactly one such chain exists; if more than one is found,
// \p FoundMultipleCalleeChains is set and false is returned, since cloning
// through an ambiguous chain could be incorrect.
bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
    const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
    std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
    bool &FoundMultipleCalleeChains) {
  // Stop recursive search if we have already explored the maximum specified
  // depth.
  if (Depth > TailCallSearchDepth)
    return false;

  auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
    FoundCalleeChain.push_back(x: {Callsite, F});
  };

  // Resolve CurCallee to a Function, looking through a (single-level) alias
  // if necessary.
  auto *CalleeFunc = dyn_cast<Function>(Val: CurCallee);
  if (!CalleeFunc) {
    auto *Alias = dyn_cast<GlobalAlias>(Val: CurCallee);
    assert(Alias);
    CalleeFunc = dyn_cast<Function>(Val: Alias->getAliasee());
    assert(CalleeFunc);
  }

  // Look for tail calls in this function, and check if they either call the
  // profiled callee directly, or indirectly (via a recursive search).
  // Only succeed if there is a single unique tail call chain found between the
  // profiled caller and callee, otherwise we could perform incorrect cloning.
  bool FoundSingleCalleeChain = false;
  for (auto &BB : *CalleeFunc) {
    for (auto &I : BB) {
      auto *CB = dyn_cast<CallBase>(Val: &I);
      if (!CB || !CB->isTailCall())
        continue;
      auto *CalledValue = CB->getCalledOperand();
      auto *CalledFunction = CB->getCalledFunction();
      if (CalledValue && !CalledFunction) {
        CalledValue = CalledValue->stripPointerCasts();
        // Stripping pointer casts can reveal a called function.
        CalledFunction = dyn_cast<Function>(Val: CalledValue);
      }
      // Check if this is an alias to a function. If so, get the
      // called aliasee for the checks below.
      if (auto *GA = dyn_cast<GlobalAlias>(Val: CalledValue)) {
        assert(!CalledFunction &&
               "Expected null called function in callsite for alias");
        CalledFunction = dyn_cast<Function>(Val: GA->getAliaseeObject());
      }
      if (!CalledFunction)
        continue;
      if (CalledFunction == ProfiledCallee) {
        // Direct tail call of the profiled callee. Fail if we already found
        // another chain (ambiguous).
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        FoundProfiledCalleeCount++;
        FoundProfiledCalleeDepth += Depth;
        if (Depth > FoundProfiledCalleeMaxDepth)
          FoundProfiledCalleeMaxDepth = Depth;
        SaveCallsiteInfo(&I, CalleeFunc);
      } else if (findProfiledCalleeThroughTailCalls(
                     ProfiledCallee, CurCallee: CalledFunction, Depth: Depth + 1,
                     FoundCalleeChain, FoundMultipleCalleeChains)) {
        // findProfiledCalleeThroughTailCalls should not have returned
        // true if FoundMultipleCalleeChains.
        assert(!FoundMultipleCalleeChains);
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        SaveCallsiteInfo(&I, CalleeFunc);
      } else if (FoundMultipleCalleeChains)
        return false;
    }
  }

  return FoundSingleCalleeChain;
}
2725
2726const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2727 auto *CB = dyn_cast<CallBase>(Val: Call);
2728 if (!CB->getCalledOperand() || CB->isIndirectCall())
2729 return nullptr;
2730 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2731 auto *Alias = dyn_cast<GlobalAlias>(Val: CalleeVal);
2732 if (Alias)
2733 return dyn_cast<Function>(Val: Alias->getAliasee());
2734 return dyn_cast<Function>(Val: CalleeVal);
2735}
2736
2737bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2738 Instruction *Call, const Function *Func, const Function *CallerFunc,
2739 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2740 auto *CB = dyn_cast<CallBase>(Val: Call);
2741 if (!CB->getCalledOperand() || CB->isIndirectCall())
2742 return false;
2743 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2744 auto *CalleeFunc = dyn_cast<Function>(Val: CalleeVal);
2745 if (CalleeFunc == Func)
2746 return true;
2747 auto *Alias = dyn_cast<GlobalAlias>(Val: CalleeVal);
2748 if (Alias && Alias->getAliasee() == Func)
2749 return true;
2750
2751 // Recursively search for the profiled callee through tail calls starting with
2752 // the actual Callee. The discovered tail call chain is saved in
2753 // FoundCalleeChain, and we will fixup the graph to include these callsites
2754 // after returning.
2755 // FIXME: We will currently redo the same recursive walk if we find the same
2756 // mismatched callee from another callsite. We can improve this with more
2757 // bookkeeping of the created chain of new nodes for each mismatch.
2758 unsigned Depth = 1;
2759 bool FoundMultipleCalleeChains = false;
2760 if (!findProfiledCalleeThroughTailCalls(ProfiledCallee: Func, CurCallee: CalleeVal, Depth,
2761 FoundCalleeChain,
2762 FoundMultipleCalleeChains)) {
2763 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2764 << Func->getName() << " from " << CallerFunc->getName()
2765 << " that actually called " << CalleeVal->getName()
2766 << (FoundMultipleCalleeChains
2767 ? " (found multiple possible chains)"
2768 : "")
2769 << "\n");
2770 if (FoundMultipleCalleeChains)
2771 FoundProfiledCalleeNonUniquelyCount++;
2772 return false;
2773 }
2774
2775 return true;
2776}
2777
2778bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
2779 Instruction *Call2) {
2780 auto *CB1 = cast<CallBase>(Val: Call1);
2781 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
2782 return false;
2783 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
2784 auto *CalleeFunc1 = dyn_cast<Function>(Val: CalleeVal1);
2785 auto *CB2 = cast<CallBase>(Val: Call2);
2786 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
2787 return false;
2788 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
2789 auto *CalleeFunc2 = dyn_cast<Function>(Val: CalleeVal2);
2790 return CalleeFunc1 == CalleeFunc2;
2791}
2792
// Summary-index analogue of the Module version above: recursively searches the
// call edges of \p CurCallee's prevailing function summary for a chain of tail
// calls reaching \p ProfiledCallee (up to TailCallSearchDepth deep),
// synthesizing CallsiteInfo records for the discovered callsites and appending
// them to \p FoundCalleeChain. Returns true only when exactly one such chain
// exists; otherwise sets \p FoundMultipleCalleeChains and returns false.
bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
    ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
    std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
    bool &FoundMultipleCalleeChains) {
  // Stop recursive search if we have already explored the maximum specified
  // depth.
  if (Depth > TailCallSearchDepth)
    return false;

  auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
    // Make a CallsiteInfo for each discovered callee, if one hasn't already
    // been synthesized.
    if (!FunctionCalleesToSynthesizedCallsiteInfos.count(x: FS) ||
        !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(x: Callee))
      // StackIds is empty (we don't have debug info available in the index for
      // these callsites)
      FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
          std::make_unique<CallsiteInfo>(args&: Callee, args: SmallVector<unsigned>());
    CallsiteInfo *NewCallsiteInfo =
        FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
    FoundCalleeChain.push_back(x: {NewCallsiteInfo, FS});
  };

  // Look for tail calls in this function, and check if they either call the
  // profiled callee directly, or indirectly (via a recursive search).
  // Only succeed if there is a single unique tail call chain found between the
  // profiled caller and callee, otherwise we could perform incorrect cloning.
  bool FoundSingleCalleeChain = false;
  for (auto &S : CurCallee.getSummaryList()) {
    // Skip non-prevailing copies (except for local linkage values, which the
    // linker doesn't resolve).
    if (!GlobalValue::isLocalLinkage(Linkage: S->linkage()) &&
        !isPrevailing(CurCallee.getGUID(), S.get()))
      continue;
    auto *FS = dyn_cast<FunctionSummary>(Val: S->getBaseObject());
    if (!FS)
      continue;
    auto FSVI = CurCallee;
    auto *AS = dyn_cast<AliasSummary>(Val: S.get());
    if (AS)
      FSVI = AS->getAliaseeVI();
    for (auto &CallEdge : FS->calls()) {
      if (!CallEdge.second.hasTailCall())
        continue;
      if (CallEdge.first == ProfiledCallee) {
        // Direct tail call of the profiled callee. Fail if we already found
        // another chain (ambiguous).
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        FoundProfiledCalleeCount++;
        FoundProfiledCalleeDepth += Depth;
        if (Depth > FoundProfiledCalleeMaxDepth)
          FoundProfiledCalleeMaxDepth = Depth;
        CreateAndSaveCallsiteInfo(CallEdge.first, FS);
        // Add FS to FSToVIMap  in case it isn't already there.
        assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
        FSToVIMap[FS] = FSVI;
      } else if (findProfiledCalleeThroughTailCalls(
                     ProfiledCallee, CurCallee: CallEdge.first, Depth: Depth + 1,
                     FoundCalleeChain, FoundMultipleCalleeChains)) {
        // findProfiledCalleeThroughTailCalls should not have returned
        // true if FoundMultipleCalleeChains.
        assert(!FoundMultipleCalleeChains);
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        CreateAndSaveCallsiteInfo(CallEdge.first, FS);
        // Add FS to FSToVIMap  in case it isn't already there.
        assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
        FSToVIMap[FS] = FSVI;
      } else if (FoundMultipleCalleeChains)
        return false;
    }
  }

  return FoundSingleCalleeChain;
}
2871
2872const FunctionSummary *
2873IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
2874 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Val&: Call)->Callee;
2875 if (Callee.getSummaryList().empty())
2876 return nullptr;
2877 return dyn_cast<FunctionSummary>(Val: Callee.getSummaryList()[0]->getBaseObject());
2878}
2879
// Returns true if the callee ValueInfo recorded on summary callsite \p Call
// matches the profiled callee \p Func, either directly, via an alias summary,
// or through a unique chain of tail calls which is recorded in
// \p FoundCalleeChain for later graph fixup. \p CallerFunc is used only for
// debug output on a failed match.
bool IndexCallsiteContextGraph::calleeMatchesFunc(
    IndexCall &Call, const FunctionSummary *Func,
    const FunctionSummary *CallerFunc,
    std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
  ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Val&: Call)->Callee;
  // If there is no summary list then this is a call to an externally defined
  // symbol.
  AliasSummary *Alias =
      Callee.getSummaryList().empty()
          ? nullptr
          : dyn_cast<AliasSummary>(Val: Callee.getSummaryList()[0].get());
  assert(FSToVIMap.count(Func));
  auto FuncVI = FSToVIMap[Func];
  if (Callee == FuncVI ||
      // If callee is an alias, check the aliasee, since only function
      // summary base objects will contain the stack node summaries and thus
      // get a context node.
      (Alias && Alias->getAliaseeVI() == FuncVI))
    return true;

  // Recursively search for the profiled callee through tail calls starting with
  // the actual Callee. The discovered tail call chain is saved in
  // FoundCalleeChain, and we will fixup the graph to include these callsites
  // after returning.
  // FIXME: We will currently redo the same recursive walk if we find the same
  // mismatched callee from another callsite. We can improve this with more
  // bookkeeping of the created chain of new nodes for each mismatch.
  unsigned Depth = 1;
  bool FoundMultipleCalleeChains = false;
  if (!findProfiledCalleeThroughTailCalls(
          ProfiledCallee: FuncVI, CurCallee: Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
    LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
                      << " from " << FSToVIMap[CallerFunc]
                      << " that actually called " << Callee
                      << (FoundMultipleCalleeChains
                              ? " (found multiple possible chains)"
                              : "")
                      << "\n");
    if (FoundMultipleCalleeChains)
      FoundProfiledCalleeNonUniquelyCount++;
    return false;
  }

  return true;
}
2925
2926bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
2927 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Val&: Call1)->Callee;
2928 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Val&: Call2)->Callee;
2929 return Callee1 == Callee2;
2930}
2931
2932template <typename DerivedCCG, typename FuncTy, typename CallTy>
2933void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2934 const {
2935 print(OS&: dbgs());
2936 dbgs() << "\n";
2937}
2938
2939template <typename DerivedCCG, typename FuncTy, typename CallTy>
2940void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2941 raw_ostream &OS) const {
2942 OS << "Node " << this << "\n";
2943 OS << "\t";
2944 printCall(OS);
2945 if (Recursive)
2946 OS << " (recursive)";
2947 OS << "\n";
2948 if (!MatchingCalls.empty()) {
2949 OS << "\tMatchingCalls:\n";
2950 for (auto &MatchingCall : MatchingCalls) {
2951 OS << "\t";
2952 MatchingCall.print(OS);
2953 OS << "\n";
2954 }
2955 }
2956 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2957 OS << "\tContextIds:";
2958 // Make a copy of the computed context ids that we can sort for stability.
2959 auto ContextIds = getContextIds();
2960 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2961 std::sort(first: SortedIds.begin(), last: SortedIds.end());
2962 for (auto Id : SortedIds)
2963 OS << " " << Id;
2964 OS << "\n";
2965 OS << "\tCalleeEdges:\n";
2966 for (auto &Edge : CalleeEdges)
2967 OS << "\t\t" << *Edge << "\n";
2968 OS << "\tCallerEdges:\n";
2969 for (auto &Edge : CallerEdges)
2970 OS << "\t\t" << *Edge << "\n";
2971 if (!Clones.empty()) {
2972 OS << "\tClones: " << llvm::interleaved(Clones) << "\n";
2973 } else if (CloneOf) {
2974 OS << "\tClone of " << CloneOf << "\n";
2975 }
2976}
2977
2978template <typename DerivedCCG, typename FuncTy, typename CallTy>
2979void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2980 const {
2981 print(OS&: dbgs());
2982 dbgs() << "\n";
2983}
2984
2985template <typename DerivedCCG, typename FuncTy, typename CallTy>
2986void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2987 raw_ostream &OS) const {
2988 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2989 << (IsBackedge ? " (BE)" : "")
2990 << " AllocTypes: " << getAllocTypeString(AllocTypes);
2991 OS << " ContextIds:";
2992 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2993 std::sort(first: SortedIds.begin(), last: SortedIds.end());
2994 for (auto Id : SortedIds)
2995 OS << " " << Id;
2996}
2997
2998template <typename DerivedCCG, typename FuncTy, typename CallTy>
2999void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3000 print(OS&: dbgs());
3001}
3002
3003template <typename DerivedCCG, typename FuncTy, typename CallTy>
3004void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3005 raw_ostream &OS) const {
3006 OS << "Callsite Context Graph:\n";
3007 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3008 for (const auto Node : nodes<GraphType>(this)) {
3009 if (Node->isRemoved())
3010 continue;
3011 Node->print(OS);
3012 OS << "\n";
3013 }
3014}
3015
3016template <typename DerivedCCG, typename FuncTy, typename CallTy>
3017void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3018 raw_ostream &OS) const {
3019 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3020 for (const auto Node : nodes<GraphType>(this)) {
3021 if (Node->isRemoved())
3022 continue;
3023 if (!Node->IsAllocation)
3024 continue;
3025 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3026 auto AllocTypeFromCall = getAllocationCallType(Call: Node->Call);
3027 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3028 std::sort(first: SortedIds.begin(), last: SortedIds.end());
3029 for (auto Id : SortedIds) {
3030 auto TypeI = ContextIdToAllocationType.find(Val: Id);
3031 assert(TypeI != ContextIdToAllocationType.end());
3032 auto CSI = ContextIdToContextSizeInfos.find(Val: Id);
3033 if (CSI != ContextIdToContextSizeInfos.end()) {
3034 for (auto &Info : CSI->second) {
3035 OS << "MemProf hinting: "
3036 << getAllocTypeString(AllocTypes: (uint8_t)TypeI->second)
3037 << " full allocation context " << Info.FullStackId
3038 << " with total size " << Info.TotalSize << " is "
3039 << getAllocTypeString(Node->AllocTypes) << " after cloning";
3040 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3041 OS << " marked " << getAllocTypeString(AllocTypes: (uint8_t)AllocTypeFromCall)
3042 << " due to cold byte percent";
3043 // Print the internal context id to aid debugging and visualization.
3044 OS << " (context id " << Id << ")";
3045 OS << "\n";
3046 }
3047 }
3048 }
3049 }
3050}
3051
3052template <typename DerivedCCG, typename FuncTy, typename CallTy>
3053void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3054 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3055 for (const auto Node : nodes<GraphType>(this)) {
3056 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3057 for (auto &Edge : Node->CallerEdges)
3058 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3059 }
3060}
3061
// GraphTraits specialization so generic graph algorithms (and the GraphWriter
// based dot export) can traverse a const CallsiteContextGraph. Nodes are
// enumerated from the graph's NodeOwner list, and a node's children are the
// callees reached through its CalleeEdges.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;

  // Nodes are owned via unique_ptr in NodeOwner; map the owning pointer to a
  // plain NodeRef for iteration.
  using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
  static NodeRef getNode(const NodePtrTy &P) { return P.get(); }

  using nodes_iterator =
      mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
                      decltype(&getNode)>;

  static nodes_iterator nodes_begin(GraphType G) {
    return nodes_iterator(G->NodeOwner.begin(), &getNode);
  }

  static nodes_iterator nodes_end(GraphType G) {
    return nodes_iterator(G->NodeOwner.end(), &getNode);
  }

  // The first owned node serves as the entry node.
  // NOTE(review): assumes NodeOwner is non-empty -- confirm callers never
  // traverse an empty graph.
  static NodeRef getEntryNode(GraphType G) {
    return G->NodeOwner.begin()->get();
  }

  // Edges are shared_ptr owned; a node's child is the callee of the edge.
  using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
  static const ContextNode<DerivedCCG, FuncTy, CallTy> *
  GetCallee(const EdgePtrTy &P) {
    return P->Callee;
  }

  using ChildIteratorType =
      mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
                          DerivedCCG, FuncTy, CallTy>>>::const_iterator,
                      decltype(&GetCallee)>;

  static ChildIteratorType child_begin(NodeRef N) {
    return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
  }

  static ChildIteratorType child_end(NodeRef N) {
    return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
  }
};
3105
// DOTGraphTraits specialization controlling how the context graph is rendered
// when exported to dot: node labels, tooltips, colors, optional highlighting
// of a requested context/allocation, and hiding of nodes outside the
// requested scope.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
    : public DefaultDOTGraphTraits {
  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
    // If the user requested the full graph to be exported, but provided an
    // allocation id, or if the user gave a context id and requested more than
    // just a specific context to be exported, note that highlighting is
    // enabled.
    DoHighlight =
        (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
        (ContextIdForDot.getNumOccurrences() &&
         DotGraphScope != DotScope::Context);
  }

  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  using GTraits = GraphTraits<GraphType>;
  using NodeRef = typename GTraits::NodeRef;
  using ChildIteratorType = typename GTraits::ChildIteratorType;

  // The node label shows the original stack/alloc id, then either the call it
  // was matched to or a "null call" marker with the reason.
  static std::string getNodeLabel(NodeRef Node, GraphType G) {
    std::string LabelString =
        (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
         Twine(Node->OrigStackOrAllocId))
            .str();
    LabelString += "\n";
    if (Node->hasCall()) {
      auto Func = G->NodeToCallingFunc.find(Node);
      assert(Func != G->NodeToCallingFunc.end());
      LabelString +=
          G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
    } else {
      LabelString += "null call";
      if (Node->Recursive)
        LabelString += " (recursive)";
      else
        LabelString += " (external)";
    }
    return LabelString;
  }

  static std::string getNodeAttributes(NodeRef Node, GraphType G) {
    auto ContextIds = Node->getContextIds();
    // If highlighting enabled, see if this node contains any of the context ids
    // of interest. If so, it will use a different color and a larger fontsize
    // (which makes the node larger as well).
    bool Highlight = false;
    if (DoHighlight) {
      assert(ContextIdForDot.getNumOccurrences() ||
             AllocIdForDot.getNumOccurrences());
      if (ContextIdForDot.getNumOccurrences())
        Highlight = ContextIds.contains(ContextIdForDot);
      else
        Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
    }
    // The tooltip carries the node id and its context ids.
    std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
                                   getContextIds(ContextIds) + "\"")
                                      .str();
    // Default fontsize is 14
    if (Highlight)
      AttributeString += ",fontsize=\"30\"";
    AttributeString +=
        (Twine(",fillcolor=\"") + getColor(AllocTypes: Node->AllocTypes, Highlight) + "\"")
            .str();
    // Clones are distinguished with a blue dashed-bold outline.
    if (Node->CloneOf) {
      AttributeString += ",color=\"blue\"";
      AttributeString += ",style=\"filled,bold,dashed\"";
    } else
      AttributeString += ",style=\"filled\"";
    return AttributeString;
  }

  static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
                                       GraphType G) {
    auto &Edge = *(ChildIter.getCurrent());
    // If highlighting enabled, see if this edge contains any of the context ids
    // of interest. If so, it will use a different color and a heavier arrow
    // size and weight (the larger weight makes the highlighted path
    // straighter).
    bool Highlight = false;
    if (DoHighlight) {
      assert(ContextIdForDot.getNumOccurrences() ||
             AllocIdForDot.getNumOccurrences());
      if (ContextIdForDot.getNumOccurrences())
        Highlight = Edge->ContextIds.contains(ContextIdForDot);
      else
        Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
    }
    auto Color = getColor(AllocTypes: Edge->AllocTypes, Highlight);
    std::string AttributeString =
        (Twine("tooltip=\"") + getContextIds(ContextIds: Edge->ContextIds) + "\"" +
         // fillcolor is the arrow head and color is the line
         Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
         "\"")
            .str();
    // Backedges (marked during DFS) are drawn dotted.
    if (Edge->IsBackedge)
      AttributeString += ",style=\"dotted\"";
    // Default penwidth and weight are both 1.
    if (Highlight)
      AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
    return AttributeString;
  }

  // Since the NodeOwners list includes nodes that are no longer connected to
  // the graph, skip them here.
  static bool isNodeHidden(NodeRef Node, GraphType G) {
    if (Node->isRemoved())
      return true;
    // If a scope smaller than the full graph was requested, see if this node
    // contains any of the context ids of interest.
    if (DotGraphScope == DotScope::Alloc)
      return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
    if (DotGraphScope == DotScope::Context)
      return !Node->getContextIds().contains(ContextIdForDot);
    return false;
  }

private:
  // Render the context id set for a tooltip; sets of 100+ ids are summarized
  // with a count instead of listing each id.
  static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
    std::string IdString = "ContextIds:";
    if (ContextIds.size() < 100) {
      // Sort a copy of the ids for stable output.
      std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
      std::sort(first: SortedIds.begin(), last: SortedIds.end());
      for (auto Id : SortedIds)
        IdString += (" " + Twine(Id)).str();
    } else {
      IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
    }
    return IdString;
  }

  // Map an alloc type mask (and highlight state) to a dot color name.
  static std::string getColor(uint8_t AllocTypes, bool Highlight) {
    // If DoHighlight is not enabled, we want to use the highlight colors for
    // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
    // both compatible with the color scheme before highlighting was supported,
    // and for the NotCold+Cold color the non-highlight color is a bit more
    // readable.
    if (AllocTypes == (uint8_t)AllocationType::NotCold)
      // Color "brown1" actually looks like a lighter red.
      return !DoHighlight || Highlight ? "brown1" : "lightpink";
    if (AllocTypes == (uint8_t)AllocationType::Cold)
      return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
    if (AllocTypes ==
        ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
      return Highlight ? "magenta" : "mediumorchid1";
    return "gray";
  }

  // Form a unique node id from the node's address, e.g. "N0x1234abcd".
  static std::string getNodeId(NodeRef Node) {
    std::stringstream SStream;
    SStream << std::hex << "N0x" << (unsigned long long)Node;
    std::string Result = SStream.str();
    return Result;
  }

  // True if we should highlight a specific context or allocation's contexts in
  // the emitted graph.
  static bool DoHighlight;
};
3264
// Out-of-line definition/initialization of the static DoHighlight member of
// the DOTGraphTraits specialization. Defaults to false; set in the
// DOTGraphTraits constructor based on the dot-export command line options.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool DOTGraphTraits<
    const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
    false;
3269
3270template <typename DerivedCCG, typename FuncTy, typename CallTy>
3271void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3272 std::string Label) const {
3273 WriteGraph(this, "", false, Label,
3274 DotFilePathPrefix + "ccg." + Label + ".dot");
3275}
3276
3277template <typename DerivedCCG, typename FuncTy, typename CallTy>
3278typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3279CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3280 const std::shared_ptr<ContextEdge> &Edge,
3281 DenseSet<uint32_t> ContextIdsToMove) {
3282 ContextNode *Node = Edge->Callee;
3283 assert(NodeToCallingFunc.count(Node));
3284 ContextNode *Clone =
3285 createNewNode(IsAllocation: Node->IsAllocation, F: NodeToCallingFunc[Node], C: Node->Call);
3286 Node->addClone(Clone);
3287 Clone->MatchingCalls = Node->MatchingCalls;
3288 moveEdgeToExistingCalleeClone(Edge, NewCallee: Clone, /*NewClone=*/true,
3289 ContextIdsToMove);
3290 return Clone;
3291}
3292
// Moves the given subset of Edge's context ids (all of them when
// ContextIdsToMove is empty) from Edge's current callee onto NewCallee, which
// must be a clone of the same original node. Also migrates the corresponding
// context ids on the old callee's own callee edges over to matching edges
// from NewCallee, creating new edges as needed. NewClone indicates that
// NewCallee was just created by the caller and has no callee edges yet.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                  ContextNode *NewCallee, bool NewClone,
                                  DenseSet<uint32_t> ContextIdsToMove) {
  // NewCallee and Edge's current callee must be clones of the same original
  // node (Edge's current callee may be the original node too).
  assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());

  // Record before any mutation below, as Edge's endpoints may change.
  bool EdgeIsRecursive = Edge->Callee == Edge->Caller;

  ContextNode *OldCallee = Edge->Callee;

  // We might already have an edge to the new callee from earlier cloning for a
  // different allocation. If one exists we will reuse it.
  auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);

  // Callers will pass an empty ContextIdsToMove set when they want to move the
  // edge. Copy in Edge's ids for simplicity.
  if (ContextIdsToMove.empty())
    ContextIdsToMove = Edge->getContextIds();

  // If we are moving all of Edge's ids, then just move the whole Edge.
  // Otherwise only move the specified subset, to a new edge if needed.
  if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
    // First, update the alloc types on New Callee from Edge.
    // Do this before we potentially clear Edge's fields below!
    NewCallee->AllocTypes |= Edge->AllocTypes;
    // Moving the whole Edge.
    if (ExistingEdgeToNewCallee) {
      // Since we already have an edge to NewCallee, simply move the ids
      // onto it, and remove the existing Edge.
      ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
      ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
      assert(Edge->ContextIds == ContextIdsToMove);
      removeEdgeFromGraph(Edge: Edge.get());
    } else {
      // Otherwise just reconnect Edge to NewCallee.
      Edge->Callee = NewCallee;
      NewCallee->CallerEdges.push_back(Edge);
      // Remove it from callee where it was previously connected.
      OldCallee->eraseCallerEdge(Edge.get());
      // Don't need to update Edge's context ids since we are simply
      // reconnecting it.
    }
  } else {
    // Only moving a subset of Edge's ids.
    // Compute the alloc type of the subset of ids being moved.
    auto CallerEdgeAllocType = computeAllocType(ContextIds&: ContextIdsToMove);
    if (ExistingEdgeToNewCallee) {
      // Since we already have an edge to NewCallee, simply move the ids
      // onto it.
      ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
      ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
    } else {
      // Otherwise, create a new edge to NewCallee for the ids being moved.
      auto NewEdge = std::make_shared<ContextEdge>(
          NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
      Edge->Caller->CalleeEdges.push_back(NewEdge);
      NewCallee->CallerEdges.push_back(NewEdge);
    }
    // In either case, need to update the alloc types on NewCallee, and remove
    // those ids and update the alloc type on the original Edge.
    NewCallee->AllocTypes |= CallerEdgeAllocType;
    set_subtract(Edge->ContextIds, ContextIdsToMove);
    Edge->AllocTypes = computeAllocType(ContextIds&: Edge->ContextIds);
  }
  // Now walk the old callee node's callee edges and move Edge's context ids
  // over to the corresponding edge into the clone (which is created here if
  // this is a newly created clone).
  for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
    ContextNode *CalleeToUse = OldCalleeEdge->Callee;
    // If this is a direct recursion edge, use NewCallee (the clone) as the
    // callee as well, so that any edge updated/created here is also direct
    // recursive.
    if (CalleeToUse == OldCallee) {
      // If this is a recursive edge, see if we already moved a recursive edge
      // (which would have to have been this one) - if we were only moving a
      // subset of context ids it would still be on OldCallee.
      if (EdgeIsRecursive) {
        assert(OldCalleeEdge == Edge);
        continue;
      }
      CalleeToUse = NewCallee;
    }
    // The context ids moving to the new callee are the subset of this edge's
    // context ids and the context ids on the caller edge being moved.
    DenseSet<uint32_t> EdgeContextIdsToMove =
        set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
    set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
    OldCalleeEdge->AllocTypes =
        computeAllocType(ContextIds&: OldCalleeEdge->getContextIds());
    if (!NewClone) {
      // Update context ids / alloc type on corresponding edge to NewCallee.
      // There is a chance this may not exist if we are reusing an existing
      // clone, specifically during function assignment, where we would have
      // removed none type edges after creating the clone. If we can't find
      // a corresponding edge there, fall through to the cloning below.
      if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
        NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
        NewCalleeEdge->AllocTypes |= computeAllocType(ContextIds&: EdgeContextIdsToMove);
        continue;
      }
    }
    auto NewEdge = std::make_shared<ContextEdge>(
        CalleeToUse, NewCallee, computeAllocType(ContextIds&: EdgeContextIdsToMove),
        EdgeContextIdsToMove);
    NewCallee->CalleeEdges.push_back(NewEdge);
    NewEdge->Callee->CallerEdges.push_back(NewEdge);
  }
  // Recompute the node alloc type now that its callee edges have been
  // updated (since we will compute from those edges).
  OldCallee->AllocTypes = OldCallee->computeAllocType();
  // OldCallee alloc type should be None iff its context id set is now empty.
  assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
         OldCallee->emptyContextIds());
  if (VerifyCCG) {
    checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
    checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
    for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
                                            /*CheckEdges=*/false);
    for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
                                            /*CheckEdges=*/false);
  }
}
3420
// Moves the callee edge Edge from its current caller onto NewCaller, reusing
// an existing edge between NewCaller and the callee when one exists. Direct
// recursive edges are kept recursive by also retargeting the callee end to
// NewCaller. Non-recursive moves additionally migrate the corresponding
// context ids from the old caller's caller edges onto edges into NewCaller.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                              ContextNode *NewCaller) {
  auto *OldCallee = Edge->Callee;
  auto *NewCallee = OldCallee;
  // If this edge was direct recursive, make any new/updated edge also direct
  // recursive to NewCaller.
  bool Recursive = Edge->Caller == Edge->Callee;
  if (Recursive)
    NewCallee = NewCaller;

  ContextNode *OldCaller = Edge->Caller;
  OldCaller->eraseCalleeEdge(Edge.get());

  // We might already have an edge to the new caller. If one exists we will
  // reuse it.
  auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);

  if (ExistingEdgeToNewCaller) {
    // Since we already have an edge to NewCaller, simply move the ids
    // onto it, and remove the existing Edge.
    ExistingEdgeToNewCaller->getContextIds().insert_range(
        Edge->getContextIds());
    ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
    // Clear the old edge so it reads as removed before detaching it.
    Edge->ContextIds.clear();
    Edge->AllocTypes = (uint8_t)AllocationType::None;
    OldCallee->eraseCallerEdge(Edge.get());
  } else {
    // Otherwise just reconnect Edge to NewCaller.
    Edge->Caller = NewCaller;
    NewCaller->CalleeEdges.push_back(Edge);
    if (Recursive) {
      assert(NewCallee == NewCaller);
      // In the case of (direct) recursive edges, we update the callee as well
      // so that it becomes recursive on the new caller.
      Edge->Callee = NewCallee;
      NewCallee->CallerEdges.push_back(Edge);
      OldCallee->eraseCallerEdge(Edge.get());
    }
    // Don't need to update Edge's context ids since we are simply
    // reconnecting it.
  }
  // In either case, need to update the alloc types on New Caller.
  NewCaller->AllocTypes |= Edge->AllocTypes;

  // Now walk the old caller node's caller edges and move Edge's context ids
  // over to the corresponding edge into the node (which is created here if
  // this is a newly created node). We can tell whether this is a newly created
  // node by seeing if it has any caller edges yet.
#ifndef NDEBUG
  bool IsNewNode = NewCaller->CallerEdges.empty();
#endif
  // If we just moved a direct recursive edge, presumably its context ids should
  // also flow out of OldCaller via some other non-recursive callee edge. We
  // don't want to remove the recursive context ids from other caller edges yet,
  // otherwise the context ids get into an inconsistent state on OldCaller.
  // We will update these context ids on the non-recursive caller edge when and
  // if they are updated on the non-recursive callee.
  if (!Recursive) {
    for (auto &OldCallerEdge : OldCaller->CallerEdges) {
      auto OldCallerCaller = OldCallerEdge->Caller;
      // The context ids moving to the new caller are the subset of this edge's
      // context ids and the context ids on the callee edge being moved.
      DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
          OldCallerEdge->getContextIds(), Edge->getContextIds());
      if (OldCaller == OldCallerCaller) {
        OldCallerCaller = NewCaller;
        // Don't actually move this one. The caller will move it directly via a
        // call to this function with this as the Edge if it is appropriate to
        // move to a diff node that has a matching callee (itself).
        continue;
      }
      set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
      OldCallerEdge->AllocTypes =
          computeAllocType(ContextIds&: OldCallerEdge->getContextIds());
      // In this function we expect that any pre-existing node already has edges
      // from the same callers as the old node. That should be true in the
      // current use case, where we will remove None-type edges after copying
      // over all caller edges from the callee.
      auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
      // Since we would have skipped caller edges when moving a direct recursive
      // edge, this may not hold true when recursive handling enabled.
      assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
      if (ExistingCallerEdge) {
        ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
        ExistingCallerEdge->AllocTypes |=
            computeAllocType(ContextIds&: EdgeContextIdsToMove);
        continue;
      }
      auto NewEdge = std::make_shared<ContextEdge>(
          NewCaller, OldCallerCaller, computeAllocType(ContextIds&: EdgeContextIdsToMove),
          EdgeContextIdsToMove);
      NewCaller->CallerEdges.push_back(NewEdge);
      NewEdge->Caller->CalleeEdges.push_back(NewEdge);
    }
  }
  // Recompute the node alloc type now that its caller edges have been
  // updated (since we will compute from those edges).
  OldCaller->AllocTypes = OldCaller->computeAllocType();
  // OldCaller alloc type should be None iff its context id set is now empty.
  assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
         OldCaller->emptyContextIds());
  if (VerifyCCG) {
    checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
    checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
    for (const auto &OldCallerEdge : OldCaller->CallerEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
                                            /*CheckEdges=*/false);
    for (const auto &NewCallerEdge : NewCaller->CallerEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
                                            /*CheckEdges=*/false);
  }
}
3535
3536template <typename DerivedCCG, typename FuncTy, typename CallTy>
3537void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3538 recursivelyRemoveNoneTypeCalleeEdges(
3539 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3540 auto Inserted = Visited.insert(Node);
3541 if (!Inserted.second)
3542 return;
3543
3544 removeNoneTypeCalleeEdges(Node);
3545
3546 for (auto *Clone : Node->Clones)
3547 recursivelyRemoveNoneTypeCalleeEdges(Node: Clone, Visited);
3548
3549 // The recursive call may remove some of this Node's caller edges.
3550 // Iterate over a copy and skip any that were removed.
3551 auto CallerEdges = Node->CallerEdges;
3552 for (auto &Edge : CallerEdges) {
3553 // Skip any that have been removed by an earlier recursive call.
3554 if (Edge->isRemoved()) {
3555 assert(!is_contained(Node->CallerEdges, Edge));
3556 continue;
3557 }
3558 recursivelyRemoveNoneTypeCalleeEdges(Node: Edge->Caller, Visited);
3559 }
3560}
3561
3562// This is the standard DFS based backedge discovery algorithm.
3563template <typename DerivedCCG, typename FuncTy, typename CallTy>
3564void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3565 // If we are cloning recursive contexts, find and mark backedges from all root
3566 // callers, using the typical DFS based backedge analysis.
3567 if (!CloneRecursiveContexts)
3568 return;
3569 DenseSet<const ContextNode *> Visited;
3570 DenseSet<const ContextNode *> CurrentStack;
3571 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3572 auto *Node = Entry.second;
3573 if (Node->isRemoved())
3574 continue;
3575 // It is a root if it doesn't have callers.
3576 if (!Node->CallerEdges.empty())
3577 continue;
3578 markBackedges(Node, Visited, CurrentStack);
3579 assert(CurrentStack.empty());
3580 }
3581}
3582
3583// Recursive helper for above markBackedges method.
3584template <typename DerivedCCG, typename FuncTy, typename CallTy>
3585void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3586 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3587 DenseSet<const ContextNode *> &CurrentStack) {
3588 auto I = Visited.insert(Node);
3589 // We should only call this for unvisited nodes.
3590 assert(I.second);
3591 (void)I;
3592 for (auto &CalleeEdge : Node->CalleeEdges) {
3593 auto *Callee = CalleeEdge->Callee;
3594 if (Visited.count(Callee)) {
3595 // Since this was already visited we need to check if it is currently on
3596 // the recursive stack in which case it is a backedge.
3597 if (CurrentStack.count(Callee))
3598 CalleeEdge->IsBackedge = true;
3599 continue;
3600 }
3601 CurrentStack.insert(Callee);
3602 markBackedges(Callee, Visited, CurrentStack);
3603 CurrentStack.erase(Callee);
3604 }
3605}
3606
3607template <typename DerivedCCG, typename FuncTy, typename CallTy>
3608void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3609 DenseSet<const ContextNode *> Visited;
3610 for (auto &Entry : AllocationCallToContextNodeMap) {
3611 Visited.clear();
3612 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3613 }
3614 Visited.clear();
3615 for (auto &Entry : AllocationCallToContextNodeMap)
3616 recursivelyRemoveNoneTypeCalleeEdges(Node: Entry.second, Visited);
3617 if (VerifyCCG)
3618 check();
3619}
3620
3621// helper function to check an AllocType is cold or notcold or both.
3622bool checkColdOrNotCold(uint8_t AllocType) {
3623 return (AllocType == (uint8_t)AllocationType::Cold) ||
3624 (AllocType == (uint8_t)AllocationType::NotCold) ||
3625 (AllocType ==
3626 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3627}
3628
3629template <typename DerivedCCG, typename FuncTy, typename CallTy>
3630void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3631 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3632 const DenseSet<uint32_t> &AllocContextIds) {
3633 if (VerifyNodes)
3634 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3635 assert(!Node->CloneOf);
3636
  // If Node has a null call, then either it wasn't found in the module (regular
3638 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3639 // cloning (e.g. recursion, calls multiple targets, etc).
3640 // Do this here so that we don't try to recursively clone callers below, which
3641 // isn't useful at least for this node.
3642 if (!Node->hasCall())
3643 return;
3644
3645 // No need to look at any callers if allocation type already unambiguous.
3646 if (hasSingleAllocType(Node->AllocTypes))
3647 return;
3648
3649#ifndef NDEBUG
3650 auto Insert =
3651#endif
3652 Visited.insert(Node);
3653 // We should not have visited this node yet.
3654 assert(Insert.second);
3655 // The recursive call to identifyClones may delete the current edge from the
3656 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3657 // in an iterator and having recursive call erase from it. Other edges may
3658 // also get removed during the recursion, which will have null Callee and
3659 // Caller pointers (and are deleted later), so we skip those below.
3660 {
3661 auto CallerEdges = Node->CallerEdges;
3662 for (auto &Edge : CallerEdges) {
3663 // Skip any that have been removed by an earlier recursive call.
3664 if (Edge->isRemoved()) {
3665 assert(!is_contained(Node->CallerEdges, Edge));
3666 continue;
3667 }
3668 // Defer backedges. See comments further below where these edges are
3669 // handled during the cloning of this Node.
3670 if (Edge->IsBackedge) {
3671 // We should only mark these if cloning recursive contexts, where we
3672 // need to do this deferral.
3673 assert(CloneRecursiveContexts);
3674 continue;
3675 }
3676 // Ignore any caller we previously visited via another edge.
3677 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3678 identifyClones(Edge->Caller, Visited, AllocContextIds);
3679 }
3680 }
3681 }
3682
  // Check if we reached an unambiguous call or have only a single caller.
3684 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3685 return;
3686
3687 // We need to clone.
3688
3689 // Try to keep the original version as alloc type NotCold. This will make
3690 // cases with indirect calls or any other situation with an unknown call to
3691 // the original function get the default behavior. We do this by sorting the
3692 // CallerEdges of the Node we will clone by alloc type.
3693 //
3694 // Give NotCold edge the lowest sort priority so those edges are at the end of
3695 // the caller edges vector, and stay on the original version (since the below
3696 // code clones greedily until it finds all remaining edges have the same type
3697 // and leaves the remaining ones on the original Node).
3698 //
3699 // We shouldn't actually have any None type edges, so the sorting priority for
3700 // that is arbitrary, and we assert in that case below.
3701 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3702 /*Cold*/ 1,
3703 /*NotColdCold*/ 2};
3704 llvm::stable_sort(Node->CallerEdges,
3705 [&](const std::shared_ptr<ContextEdge> &A,
3706 const std::shared_ptr<ContextEdge> &B) {
3707 // Nodes with non-empty context ids should be sorted
3708 // before those with empty context ids.
3709 if (A->ContextIds.empty())
3710 // Either B ContextIds are non-empty (in which case we
3711 // should return false because B < A), or B ContextIds
3712 // are empty, in which case they are equal, and we
3713 // should maintain the original relative ordering.
3714 return false;
3715 if (B->ContextIds.empty())
3716 return true;
3717
3718 if (A->AllocTypes == B->AllocTypes)
3719 // Use the first context id for each edge as a
3720 // tie-breaker.
3721 return *A->ContextIds.begin() < *B->ContextIds.begin();
3722 return AllocTypeCloningPriority[A->AllocTypes] <
3723 AllocTypeCloningPriority[B->AllocTypes];
3724 });
3725
3726 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3727
3728 DenseSet<uint32_t> RecursiveContextIds;
3729 assert(AllowRecursiveContexts || !CloneRecursiveContexts);
3730 // If we are allowing recursive callsites, but have also disabled recursive
3731 // contexts, look for context ids that show up in multiple caller edges.
3732 if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
3733 DenseSet<uint32_t> AllCallerContextIds;
3734 for (auto &CE : Node->CallerEdges) {
3735 // Resize to the largest set of caller context ids, since we know the
3736 // final set will be at least that large.
3737 AllCallerContextIds.reserve(Size: CE->getContextIds().size());
3738 for (auto Id : CE->getContextIds())
3739 if (!AllCallerContextIds.insert(Id).second)
3740 RecursiveContextIds.insert(Id);
3741 }
3742 }
3743
3744 // Iterate until we find no more opportunities for disambiguating the alloc
3745 // types via cloning. In most cases this loop will terminate once the Node
3746 // has a single allocation type, in which case no more cloning is needed.
3747 // Iterate over a copy of Node's caller edges, since we may need to remove
3748 // edges in the moveEdgeTo* methods, and this simplifies the handling and
3749 // makes it less error-prone.
3750 auto CallerEdges = Node->CallerEdges;
3751 for (auto &CallerEdge : CallerEdges) {
3752 // Skip any that have been removed by an earlier recursive call.
3753 if (CallerEdge->isRemoved()) {
3754 assert(!is_contained(Node->CallerEdges, CallerEdge));
3755 continue;
3756 }
3757 assert(CallerEdge->Callee == Node);
3758
3759 // See if cloning the prior caller edge left this node with a single alloc
3760 // type or a single caller. In that case no more cloning of Node is needed.
3761 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3762 break;
3763
3764 // If the caller was not successfully matched to a call in the IR/summary,
3765 // there is no point in trying to clone for it as we can't update that call.
3766 if (!CallerEdge->Caller->hasCall())
3767 continue;
3768
3769 // Only need to process the ids along this edge pertaining to the given
3770 // allocation.
3771 auto CallerEdgeContextsForAlloc =
3772 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
3773 if (!RecursiveContextIds.empty())
3774 CallerEdgeContextsForAlloc =
3775 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
3776 if (CallerEdgeContextsForAlloc.empty())
3777 continue;
3778
3779 auto CallerAllocTypeForAlloc = computeAllocType(ContextIds&: CallerEdgeContextsForAlloc);
3780
3781 // Compute the node callee edge alloc types corresponding to the context ids
3782 // for this caller edge.
3783 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
3784 CalleeEdgeAllocTypesForCallerEdge.reserve(n: Node->CalleeEdges.size());
3785 for (auto &CalleeEdge : Node->CalleeEdges)
3786 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3787 Node1Ids: CalleeEdge->getContextIds(), Node2Ids: CallerEdgeContextsForAlloc));
3788
3789 // Don't clone if doing so will not disambiguate any alloc types amongst
3790 // caller edges (including the callee edges that would be cloned).
3791 // Otherwise we will simply move all edges to the clone.
3792 //
3793 // First check if by cloning we will disambiguate the caller allocation
3794 // type from node's allocation type. Query allocTypeToUse so that we don't
3795 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
3796 // neither of these should be None type.
3797 //
3798 // Then check if by cloning node at least one of the callee edges will be
3799 // disambiguated by splitting out different context ids.
3800 //
3801 // However, always do the cloning if this is a backedge, in which case we
3802 // have not yet cloned along this caller edge.
3803 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
3804 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3805 if (!CallerEdge->IsBackedge &&
3806 allocTypeToUse(CallerAllocTypeForAlloc) ==
3807 allocTypeToUse(Node->AllocTypes) &&
3808 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
3809 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
3810 continue;
3811 }
3812
3813 if (CallerEdge->IsBackedge) {
3814 // We should only mark these if cloning recursive contexts, where we
3815 // need to do this deferral.
3816 assert(CloneRecursiveContexts);
3817 DeferredBackedges++;
3818 }
3819
3820 // If this is a backedge, we now do recursive cloning starting from its
3821 // caller since we may have moved unambiguous caller contexts to a clone
3822 // of this Node in a previous iteration of the current loop, giving more
3823 // opportunity for cloning through the backedge. Because we sorted the
3824 // caller edges earlier so that cold caller edges are first, we would have
  // visited and cloned this node for any unambiguously cold non-recursive
3826 // callers before any ambiguous backedge callers. Note that we don't do this
3827 // if the caller is already cloned or visited during cloning (e.g. via a
3828 // different context path from the allocation).
3829 // TODO: Can we do better in the case where the caller was already visited?
3830 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
3831 !Visited.count(CallerEdge->Caller)) {
3832 const auto OrigIdCount = CallerEdge->getContextIds().size();
3833 // Now do the recursive cloning of this backedge's caller, which was
3834 // deferred earlier.
3835 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
3836 removeNoneTypeCalleeEdges(Node: CallerEdge->Caller);
3837 // See if the recursive call to identifyClones moved the context ids to a
3838 // new edge from this node to a clone of caller, and switch to looking at
3839 // that new edge so that we clone Node for the new caller clone.
3840 bool UpdatedEdge = false;
3841 if (OrigIdCount > CallerEdge->getContextIds().size()) {
3842 for (auto E : Node->CallerEdges) {
        // Only interested in clones of the current edge's caller.
3844 if (E->Caller->CloneOf != CallerEdge->Caller)
3845 continue;
3846 // See if this edge contains any of the context ids originally on the
3847 // current caller edge.
3848 auto CallerEdgeContextsForAllocNew =
3849 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
3850 if (CallerEdgeContextsForAllocNew.empty())
3851 continue;
3852 // Make sure we don't pick a previously existing caller edge of this
3853 // Node, which would be processed on a different iteration of the
3854 // outer loop over the saved CallerEdges.
3855 if (llvm::is_contained(CallerEdges, E))
3856 continue;
3857 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
3858 // are updated further below for all cases where we just invoked
3859 // identifyClones recursively.
3860 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
3861 CallerEdge = E;
3862 UpdatedEdge = true;
3863 break;
3864 }
3865 }
3866 // If cloning removed this edge (and we didn't update it to a new edge
3867 // above), we're done with this edge. It's possible we moved all of the
3868 // context ids to an existing clone, in which case there's no need to do
3869 // further processing for them.
3870 if (CallerEdge->isRemoved())
3871 continue;
3872
3873 // Now we need to update the information used for the cloning decisions
3874 // further below, as we may have modified edges and their context ids.
3875
3876 // Note if we changed the CallerEdge above we would have already updated
3877 // the context ids.
3878 if (!UpdatedEdge) {
3879 CallerEdgeContextsForAlloc = set_intersection(
3880 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
3881 if (CallerEdgeContextsForAlloc.empty())
3882 continue;
3883 }
3884 // Update the other information that depends on the edges and on the now
3885 // updated CallerEdgeContextsForAlloc.
3886 CallerAllocTypeForAlloc = computeAllocType(ContextIds&: CallerEdgeContextsForAlloc);
3887 CalleeEdgeAllocTypesForCallerEdge.clear();
3888 for (auto &CalleeEdge : Node->CalleeEdges) {
3889 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3890 Node1Ids: CalleeEdge->getContextIds(), Node2Ids: CallerEdgeContextsForAlloc));
3891 }
3892 }
3893
3894 // First see if we can use an existing clone. Check each clone and its
3895 // callee edges for matching alloc types.
3896 ContextNode *Clone = nullptr;
3897 for (auto *CurClone : Node->Clones) {
3898 if (allocTypeToUse(CurClone->AllocTypes) !=
3899 allocTypeToUse(CallerAllocTypeForAlloc))
3900 continue;
3901
3902 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
3903 hasSingleAllocType(CallerAllocTypeForAlloc);
3904 // The above check should mean that if both have single alloc types that
3905 // they should be equal.
3906 assert(!BothSingleAlloc ||
3907 CurClone->AllocTypes == CallerAllocTypeForAlloc);
3908
3909 // If either both have a single alloc type (which are the same), or if the
3910 // clone's callee edges have the same alloc types as those for the current
3911 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
3912 // then we can reuse this clone.
3913 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
3914 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
3915 Clone = CurClone;
3916 break;
3917 }
3918 }
3919
3920 // The edge iterator is adjusted when we move the CallerEdge to the clone.
3921 if (Clone)
3922 moveEdgeToExistingCalleeClone(Edge: CallerEdge, NewCallee: Clone, /*NewClone=*/false,
3923 ContextIdsToMove: CallerEdgeContextsForAlloc);
3924 else
3925 Clone = moveEdgeToNewCalleeClone(Edge: CallerEdge, ContextIdsToMove: CallerEdgeContextsForAlloc);
3926
3927 // Sanity check that no alloc types on clone or its edges are None.
3928 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
3929 }
3930
3931 // We should still have some context ids on the original Node.
3932 assert(!Node->emptyContextIds());
3933
3934 // Sanity check that no alloc types on node or edges are None.
3935 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3936
3937 if (VerifyNodes)
3938 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3939}
3940
3941void ModuleCallsiteContextGraph::updateAllocationCall(
3942 CallInfo &Call, AllocationType AllocType) {
3943 std::string AllocTypeString = getAllocTypeAttributeString(Type: AllocType);
3944 auto A = llvm::Attribute::get(Context&: Call.call()->getFunction()->getContext(),
3945 Kind: "memprof", Val: AllocTypeString);
3946 cast<CallBase>(Val: Call.call())->addFnAttr(Attr: A);
3947 OREGetter(Call.call()->getFunction())
3948 .emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
3949 << ore::NV("AllocationCall", Call.call()) << " in clone "
3950 << ore::NV("Caller", Call.call()->getFunction())
3951 << " marked with memprof allocation attribute "
3952 << ore::NV("Attribute", AllocTypeString));
3953}
3954
3955void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
3956 AllocationType AllocType) {
3957 auto *AI = cast<AllocInfo *>(Val: Call.call());
3958 assert(AI);
3959 assert(AI->Versions.size() > Call.cloneNo());
3960 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
3961}
3962
3963AllocationType
3964ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3965 const auto *CB = cast<CallBase>(Val: Call.call());
3966 if (!CB->getAttributes().hasFnAttr(Kind: "memprof"))
3967 return AllocationType::None;
3968 return CB->getAttributes().getFnAttr(Kind: "memprof").getValueAsString() == "cold"
3969 ? AllocationType::Cold
3970 : AllocationType::NotCold;
3971}
3972
3973AllocationType
3974IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3975 const auto *AI = cast<AllocInfo *>(Val: Call.call());
3976 assert(AI->Versions.size() > Call.cloneNo());
3977 return (AllocationType)AI->Versions[Call.cloneNo()];
3978}
3979
3980void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3981 FuncInfo CalleeFunc) {
3982 if (CalleeFunc.cloneNo() > 0)
3983 cast<CallBase>(Val: CallerCall.call())->setCalledFunction(CalleeFunc.func());
3984 OREGetter(CallerCall.call()->getFunction())
3985 .emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
3986 << ore::NV("Call", CallerCall.call()) << " in clone "
3987 << ore::NV("Caller", CallerCall.call()->getFunction())
3988 << " assigned to call function clone "
3989 << ore::NV("Callee", CalleeFunc.func()));
3990}
3991
3992void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3993 FuncInfo CalleeFunc) {
3994 auto *CI = cast<CallsiteInfo *>(Val: CallerCall.call());
3995 assert(CI &&
3996 "Caller cannot be an allocation which should not have profiled calls");
3997 assert(CI->Clones.size() > CallerCall.cloneNo());
3998 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
3999}
4000
4001CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4002 Instruction *>::FuncInfo
4003ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4004 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
4005 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4006 // Use existing LLVM facilities for cloning and obtaining Call in clone
4007 ValueToValueMapTy VMap;
4008 auto *NewFunc = CloneFunction(F: Func.func(), VMap);
4009 std::string Name = getMemProfFuncName(Base: Func.func()->getName(), CloneNo);
4010 assert(!Func.func()->getParent()->getFunction(Name));
4011 NewFunc->setName(Name);
4012 if (auto *SP = NewFunc->getSubprogram())
4013 SP->replaceLinkageName(
4014 LN: MDString::get(Context&: NewFunc->getParent()->getContext(), Str: Name));
4015 for (auto &Inst : CallsWithMetadataInFunc) {
4016 // This map always has the initial version in it.
4017 assert(Inst.cloneNo() == 0);
4018 CallMap[Inst] = {cast<Instruction>(Val&: VMap[Inst.call()]), CloneNo};
4019 }
4020 OREGetter(Func.func())
4021 .emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4022 << "created clone " << ore::NV("NewFunction", NewFunc));
4023 return {NewFunc, CloneNo};
4024}
4025
4026CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4027 IndexCall>::FuncInfo
4028IndexCallsiteContextGraph::cloneFunctionForCallsite(
4029 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
4030 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4031 // Check how many clones we have of Call (and therefore function).
4032 // The next clone number is the current size of versions array.
4033 // Confirm this matches the CloneNo provided by the caller, which is based on
4034 // the number of function clones we have.
4035 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4036 ? cast<AllocInfo *>(Call.call())->Versions.size()
4037 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4038 // Walk all the instructions in this function. Create a new version for
4039 // each (by adding an entry to the Versions/Clones summary array), and copy
4040 // over the version being called for the function clone being cloned here.
4041 // Additionally, add an entry to the CallMap for the new function clone,
4042 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4043 // to the new call clone.
4044 for (auto &Inst : CallsWithMetadataInFunc) {
4045 // This map always has the initial version in it.
4046 assert(Inst.cloneNo() == 0);
4047 if (auto *AI = dyn_cast<AllocInfo *>(Val: Inst.call())) {
4048 assert(AI->Versions.size() == CloneNo);
4049 // We assign the allocation type later (in updateAllocationCall), just add
4050 // an entry for it here.
4051 AI->Versions.push_back(Elt: 0);
4052 } else {
4053 auto *CI = cast<CallsiteInfo *>(Val: Inst.call());
4054 assert(CI && CI->Clones.size() == CloneNo);
4055 // We assign the clone number later (in updateCall), just add an entry for
4056 // it here.
4057 CI->Clones.push_back(Elt: 0);
4058 }
4059 CallMap[Inst] = {Inst.call(), CloneNo};
4060 }
4061 return {Func.func(), CloneNo};
4062}
4063
4064// We perform cloning for each allocation node separately. However, this
4065// sometimes results in a situation where the same node calls multiple
4066// clones of the same callee, created for different allocations. This
4067// causes issues when assigning functions to these clones, as each node can
4068// in reality only call a single callee clone.
4069//
4070// To address this, before assigning functions, merge callee clone nodes as
4071// needed using a post order traversal from the allocations. We attempt to
4072// use existing clones as the merge node when legal, and to share them
4073// among callers with the same properties (callers calling the same set of
4074// callee clone nodes for the same allocations).
4075//
4076// Without this fix, in some cases incorrect function assignment will lead
4077// to calling the wrong allocation clone.
4078template <typename DerivedCCG, typename FuncTy, typename CallTy>
4079void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4080 if (!MergeClones)
4081 return;
4082
4083 // Generate a map from context id to the associated allocation node for use
4084 // when merging clones.
4085 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4086 for (auto &Entry : AllocationCallToContextNodeMap) {
4087 auto *Node = Entry.second;
4088 for (auto Id : Node->getContextIds())
4089 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4090 for (auto *Clone : Node->Clones) {
4091 for (auto Id : Clone->getContextIds())
4092 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4093 }
4094 }
4095
4096 // Post order traversal starting from allocations to ensure each callsite
4097 // calls a single clone of its callee. Callee nodes that are clones of each
4098 // other are merged (via new merge nodes if needed) to achieve this.
4099 DenseSet<const ContextNode *> Visited;
4100 for (auto &Entry : AllocationCallToContextNodeMap) {
4101 auto *Node = Entry.second;
4102
4103 mergeClones(Node, Visited, ContextIdToAllocationNode);
4104
4105 // Make a copy so the recursive post order traversal that may create new
4106 // clones doesn't mess up iteration. Note that the recursive traversal
4107 // itself does not call mergeClones on any of these nodes, which are all
4108 // (clones of) allocations.
4109 auto Clones = Node->Clones;
4110 for (auto *Clone : Clones)
4111 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4112 }
4113
4114 if (DumpCCG) {
4115 dbgs() << "CCG after merging:\n";
4116 dbgs() << *this;
4117 }
4118 if (ExportToDot)
4119 exportToDot(Label: "aftermerge");
4120
4121 if (VerifyCCG) {
4122 check();
4123 }
4124}
4125
4126// Recursive helper for above mergeClones method.
4127template <typename DerivedCCG, typename FuncTy, typename CallTy>
4128void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
4129 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4130 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4131 auto Inserted = Visited.insert(Node);
4132 if (!Inserted.second)
4133 return;
4134
4135 // Make a copy since the recursive call may move a caller edge to a new
4136 // callee, messing up the iterator.
4137 auto CallerEdges = Node->CallerEdges;
4138 for (auto CallerEdge : CallerEdges) {
4139 // Skip any caller edge moved onto a different callee during recursion.
4140 if (CallerEdge->Callee != Node)
4141 continue;
4142 mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
4143 }
4144
4145 // Merge for this node after we handle its callers.
4146 mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
4147}
4148
4149template <typename DerivedCCG, typename FuncTy, typename CallTy>
4150void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4151 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4152 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4153 // Ignore Node if we moved all of its contexts to clones.
4154 if (Node->emptyContextIds())
4155 return;
4156
4157 // First identify groups of clones among Node's callee edges, by building
4158 // a map from each callee base node to the associated callee edges from Node.
4159 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4160 OrigNodeToCloneEdges;
4161 for (const auto &E : Node->CalleeEdges) {
4162 auto *Callee = E->Callee;
4163 if (!Callee->CloneOf && Callee->Clones.empty())
4164 continue;
4165 ContextNode *Base = Callee->getOrigNode();
4166 OrigNodeToCloneEdges[Base].push_back(E);
4167 }
4168
4169 // Helper for callee edge sorting below. Return true if A's callee has fewer
4170 // caller edges than B, or if A is a clone and B is not, or if A's first
4171 // context id is smaller than B's.
4172 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4173 const std::shared_ptr<ContextEdge> &B) {
4174 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4175 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4176 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4177 return true;
4178 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4179 return false;
4180 // Use the first context id for each edge as a
4181 // tie-breaker.
4182 return *A->ContextIds.begin() < *B->ContextIds.begin();
4183 };
4184
4185 // Process each set of callee clones called by Node, performing the needed
4186 // merging.
4187 for (auto Entry : OrigNodeToCloneEdges) {
4188 // CalleeEdges is the set of edges from Node reaching callees that are
4189 // mutual clones of each other.
4190 auto &CalleeEdges = Entry.second;
4191 auto NumCalleeClones = CalleeEdges.size();
4192 // A single edge means there is no merging needed.
4193 if (NumCalleeClones == 1)
4194 continue;
4195 // Sort the CalleeEdges calling this group of clones in ascending order of
4196 // their caller edge counts, putting the original non-clone node first in
4197 // cases of a tie. This simplifies finding an existing node to use as the
4198 // merge node.
4199 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4200
4201 /// Find other callers of the given set of callee edges that can
4202 /// share the same callee merge node. See the comments at this method
4203 /// definition for details.
4204 DenseSet<ContextNode *> OtherCallersToShareMerge;
4205 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4206 OtherCallersToShareMerge);
4207
4208 // Now do the actual merging. Identify existing or create a new MergeNode
4209 // during the first iteration. Move each callee over, along with edges from
4210 // other callers we've determined above can share the same merge node.
4211 ContextNode *MergeNode = nullptr;
4212 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4213 for (auto CalleeEdge : CalleeEdges) {
4214 auto *OrigCallee = CalleeEdge->Callee;
4215 // If we don't have a MergeNode yet (only happens on the first iteration,
4216 // as a new one will be created when we go to move the first callee edge
4217 // over as needed), see if we can use this callee.
4218 if (!MergeNode) {
4219 // If there are no other callers, simply use this callee.
4220 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4221 MergeNode = OrigCallee;
4222 NonNewMergedNodes++;
4223 continue;
4224 }
4225 // Otherwise, if we have identified other caller nodes that can share
4226 // the merge node with Node, see if all of OrigCallee's callers are
4227 // going to share the same merge node. In that case we can use callee
4228 // (since all of its callers would move to the new merge node).
4229 if (!OtherCallersToShareMerge.empty()) {
4230 bool MoveAllCallerEdges = true;
4231 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4232 if (CalleeCallerE == CalleeEdge)
4233 continue;
4234 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4235 MoveAllCallerEdges = false;
4236 break;
4237 }
4238 }
4239 // If we are going to move all callers over, we can use this callee as
4240 // the MergeNode.
4241 if (MoveAllCallerEdges) {
4242 MergeNode = OrigCallee;
4243 NonNewMergedNodes++;
4244 continue;
4245 }
4246 }
4247 }
4248 // Move this callee edge, creating a new merge node if necessary.
4249 if (MergeNode) {
4250 assert(MergeNode != OrigCallee);
4251 moveEdgeToExistingCalleeClone(Edge: CalleeEdge, NewCallee: MergeNode,
4252 /*NewClone*/ false);
4253 } else {
4254 MergeNode = moveEdgeToNewCalleeClone(Edge: CalleeEdge);
4255 NewMergedNodes++;
4256 }
4257 // Now move all identified edges from other callers over to the merge node
4258 // as well.
4259 if (!OtherCallersToShareMerge.empty()) {
4260 // Make and iterate over a copy of OrigCallee's caller edges because
4261 // some of these will be moved off of the OrigCallee and that would mess
4262 // up the iteration from OrigCallee.
4263 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4264 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4265 if (CalleeCallerE == CalleeEdge)
4266 continue;
4267 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4268 continue;
4269 CallerToMoveCount[CalleeCallerE->Caller]++;
4270 moveEdgeToExistingCalleeClone(Edge: CalleeCallerE, NewCallee: MergeNode,
4271 /*NewClone*/ false);
4272 }
4273 }
4274 removeNoneTypeCalleeEdges(Node: OrigCallee);
4275 removeNoneTypeCalleeEdges(Node: MergeNode);
4276 }
4277 }
4278}
4279
4280// Look for other nodes that have edges to the same set of callee
4281// clones as the current Node. Those can share the eventual merge node
4282// (reducing cloning and binary size overhead) iff:
4283// - they have edges to the same set of callee clones
4284// - each callee edge reaches a subset of the same allocations as Node's
4285// corresponding edge to the same callee clone.
4286// The second requirement is to ensure that we don't undo any of the
4287// necessary cloning to distinguish contexts with different allocation
4288// behavior.
4289// FIXME: This is somewhat conservative, as we really just need to ensure
4290// that they don't reach the same allocations as contexts on edges from Node
4291// going to any of the *other* callee clones being merged. However, that
4292// requires more tracking and checking to get right.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    findOtherCallersToShareMerge(
        ContextNode *Node,
        std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
        DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
        DenseSet<ContextNode *> &OtherCallersToShareMerge) {
  auto NumCalleeClones = CalleeEdges.size();
  // This map counts how many edges to the same callee clone exist for other
  // caller nodes of each callee clone. A caller is a sharing candidate only
  // when its count reaches NumCalleeClones (edges to every clone in the
  // group); a count of 0 doubles as the "disqualified" sentinel below.
  DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
  // Counts the number of other caller nodes that have edges to all callee
  // clones that don't violate the allocation context checking.
  unsigned PossibleOtherCallerNodes = 0;

  // We only need to look at other Caller nodes if the first callee edge has
  // multiple callers (recall they are sorted in ascending order above).
  if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
    return;

  // First pass. For each callee edge:
  // - Collect the count of other caller nodes calling the same callees.
  // - Collect the alloc nodes reached by contexts on each callee edge.
  DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
  for (auto CalleeEdge : CalleeEdges) {
    assert(CalleeEdge->Callee->CallerEdges.size() > 1);
    // For each other caller of the same callee, increment the count of
    // edges reaching the same callee clone. (Each CalleeCallerEdges is a
    // single caller edge of this callee clone.)
    for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
      if (CalleeCallerEdges->Caller == Node) {
        // Node should reach this callee clone only via CalleeEdge itself.
        assert(CalleeCallerEdges == CalleeEdge);
        continue;
      }
      OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
      // If this caller edge now reaches all of the same callee clones,
      // increment the count of candidate other caller nodes.
      if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
          NumCalleeClones)
        PossibleOtherCallerNodes++;
    }
    // Collect the alloc nodes reached by contexts on each callee edge, for
    // later analysis.
    for (auto Id : CalleeEdge->getContextIds()) {
      auto *Alloc = ContextIdToAllocationNode.lookup(Id);
      if (!Alloc) {
        // FIXME: unclear why this happens occasionally, presumably
        // imperfect graph updates possibly with recursion.
        MissingAllocForContextId++;
        continue;
      }
      CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
    }
  }

  // Second pass. Now walk the callee edges again, and make sure that for each
  // candidate caller node all of its edges to the callees reach the same
  // allocs (or a subset) as those along the corresponding callee edge from
  // Node. This prevents merging from undoing cloning that was needed to
  // separate contexts with different allocation behavior.
  for (auto CalleeEdge : CalleeEdges) {
    assert(CalleeEdge->Callee->CallerEdges.size() > 1);
    // Stop if we do not have any (more) candidate other caller nodes.
    if (!PossibleOtherCallerNodes)
      break;
    auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
    // Check each other caller of this callee clone.
    for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
      // Not interested in the callee edge from Node itself.
      if (CalleeCallerE == CalleeEdge)
        continue;
      // Skip any callers that didn't have callee edges to all the same
      // callee clones (or were already disqualified and reset to 0 below).
      if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
          NumCalleeClones)
        continue;
      // Make sure that each context along edge from candidate caller node
      // reaches an allocation also reached by this callee edge from Node.
      for (auto Id : CalleeCallerE->getContextIds()) {
        auto *Alloc = ContextIdToAllocationNode.lookup(Id);
        // Missing alloc mapping (see FIXME above); skip rather than
        // disqualify on incomplete information.
        if (!Alloc)
          continue;
        // If not, simply reset the map entry to 0 so caller is ignored, and
        // reduce the count of candidate other caller nodes.
        if (!CurCalleeAllocNodes.contains(Alloc)) {
          OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
          PossibleOtherCallerNodes--;
          break;
        }
      }
    }
  }

  if (!PossibleOtherCallerNodes)
    return;

  // Build the set of other caller nodes that can use the same callee merge
  // node: exactly those whose per-caller edge count survived at
  // NumCalleeClones through both passes.
  for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
    if (Count != NumCalleeClones)
      continue;
    OtherCallersToShareMerge.insert(OtherCaller);
  }
}
4394
4395// This method assigns cloned callsites to functions, cloning the functions as
4396// needed. The assignment is greedy and proceeds roughly as follows:
4397//
4398// For each function Func:
4399// For each call with graph Node having clones:
4400// Initialize ClonesWorklist to Node and its clones
4401// Initialize NodeCloneCount to 0
4402// While ClonesWorklist is not empty:
4403// Clone = pop front ClonesWorklist
4404// NodeCloneCount++
4405// If Func has been cloned less than NodeCloneCount times:
4406// If NodeCloneCount is 1:
4407// Assign Clone to original Func
4408// Continue
4409// Create a new function clone
4410// If other callers not assigned to call a function clone yet:
4411// Assign them to call new function clone
4412// Continue
4413// Assign any other caller calling the cloned version to new clone
4414//
4415// For each caller of Clone:
4416// If caller is assigned to call a specific function clone:
4417// If we cannot assign Clone to that function clone:
4418// Create new callsite Clone NewClone
4419// Add NewClone to ClonesWorklist
4420// Continue
4421// Assign Clone to existing caller's called function clone
4422// Else:
4423// If Clone not already assigned to a function clone:
4424// Assign to first function clone without assignment
4425// Assign caller to selected function clone
4426template <typename DerivedCCG, typename FuncTy, typename CallTy>
4427bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4428 bool Changed = false;
4429
4430 mergeClones();
4431
4432 // Keep track of the assignment of nodes (callsites) to function clones they
4433 // call.
4434 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4435
4436 // Update caller node to call function version CalleeFunc, by recording the
4437 // assignment in CallsiteToCalleeFuncCloneMap.
4438 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4439 const FuncInfo &CalleeFunc) {
4440 assert(Caller->hasCall());
4441 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4442 };
4443
4444 // Walk all functions for which we saw calls with memprof metadata, and handle
4445 // cloning for each of its calls.
4446 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4447 FuncInfo OrigFunc(Func);
4448 // Map from each clone of OrigFunc to a map of remappings of each call of
4449 // interest (from original uncloned call to the corresponding cloned call in
4450 // that function clone).
4451 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
4452 for (auto &Call : CallsWithMetadata) {
4453 ContextNode *Node = getNodeForInst(C: Call);
4454 // Skip call if we do not have a node for it (all uses of its stack ids
4455 // were either on inlined chains or pruned from the MIBs), or if we did
4456 // not create any clones for it.
4457 if (!Node || Node->Clones.empty())
4458 continue;
4459 assert(Node->hasCall() &&
4460 "Not having a call should have prevented cloning");
4461
4462 // Track the assignment of function clones to clones of the current
4463 // callsite Node being handled.
4464 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4465
4466 // Assign callsite version CallsiteClone to function version FuncClone,
4467 // and also assign (possibly cloned) Call to CallsiteClone.
4468 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4469 CallInfo &Call,
4470 ContextNode *CallsiteClone,
4471 bool IsAlloc) {
4472 // Record the clone of callsite node assigned to this function clone.
4473 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4474
4475 assert(FuncClonesToCallMap.count(FuncClone));
4476 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
4477 CallInfo CallClone(Call);
4478 if (auto It = CallMap.find(Call); It != CallMap.end())
4479 CallClone = It->second;
4480 CallsiteClone->setCall(CallClone);
4481 // Need to do the same for all matching calls.
4482 for (auto &MatchingCall : Node->MatchingCalls) {
4483 CallInfo CallClone(MatchingCall);
4484 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4485 CallClone = It->second;
4486 // Updates the call in the list.
4487 MatchingCall = CallClone;
4488 }
4489 };
4490
4491 // Keep track of the clones of callsite Node that need to be assigned to
4492 // function clones. This list may be expanded in the loop body below if we
4493 // find additional cloning is required.
4494 std::deque<ContextNode *> ClonesWorklist;
4495 // Ignore original Node if we moved all of its contexts to clones.
4496 if (!Node->emptyContextIds())
4497 ClonesWorklist.push_back(Node);
4498 llvm::append_range(ClonesWorklist, Node->Clones);
4499
4500 // Now walk through all of the clones of this callsite Node that we need,
4501 // and determine the assignment to a corresponding clone of the current
4502 // function (creating new function clones as needed).
4503 unsigned NodeCloneCount = 0;
4504 while (!ClonesWorklist.empty()) {
4505 ContextNode *Clone = ClonesWorklist.front();
4506 ClonesWorklist.pop_front();
4507 NodeCloneCount++;
4508 if (VerifyNodes)
4509 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4510
4511 // Need to create a new function clone if we have more callsite clones
4512 // than existing function clones, which would have been assigned to an
4513 // earlier clone in the list (we assign callsite clones to function
4514 // clones greedily).
4515 if (FuncClonesToCallMap.size() < NodeCloneCount) {
4516 // If this is the first callsite copy, assign to original function.
4517 if (NodeCloneCount == 1) {
4518 // Since FuncClonesToCallMap is empty in this case, no clones have
4519 // been created for this function yet, and no callers should have
4520 // been assigned a function clone for this callee node yet.
4521 assert(llvm::none_of(
4522 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4523 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4524 }));
4525 // Initialize with empty call map, assign Clone to original function
4526 // and its callers, and skip to the next clone.
4527 FuncClonesToCallMap[OrigFunc] = {};
4528 AssignCallsiteCloneToFuncClone(
4529 OrigFunc, Call, Clone,
4530 AllocationCallToContextNodeMap.count(Call));
4531 for (auto &CE : Clone->CallerEdges) {
4532 // Ignore any caller that does not have a recorded callsite Call.
4533 if (!CE->Caller->hasCall())
4534 continue;
4535 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
4536 }
4537 continue;
4538 }
4539
4540 // First locate which copy of OrigFunc to clone again. If a caller
4541 // of this callsite clone was already assigned to call a particular
4542 // function clone, we need to redirect all of those callers to the
4543 // new function clone, and update their other callees within this
4544 // function.
4545 FuncInfo PreviousAssignedFuncClone;
4546 auto EI = llvm::find_if(
4547 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4548 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4549 });
4550 bool CallerAssignedToCloneOfFunc = false;
4551 if (EI != Clone->CallerEdges.end()) {
4552 const std::shared_ptr<ContextEdge> &Edge = *EI;
4553 PreviousAssignedFuncClone =
4554 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4555 CallerAssignedToCloneOfFunc = true;
4556 }
4557
4558 // Clone function and save it along with the CallInfo map created
4559 // during cloning in the FuncClonesToCallMap.
4560 std::map<CallInfo, CallInfo> NewCallMap;
4561 unsigned CloneNo = FuncClonesToCallMap.size();
4562 assert(CloneNo > 0 && "Clone 0 is the original function, which "
4563 "should already exist in the map");
4564 FuncInfo NewFuncClone = cloneFunctionForCallsite(
4565 Func&: OrigFunc, Call, CallMap&: NewCallMap, CallsWithMetadataInFunc&: CallsWithMetadata, CloneNo);
4566 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
4567 FunctionClonesAnalysis++;
4568 Changed = true;
4569
4570 // If no caller callsites were already assigned to a clone of this
4571 // function, we can simply assign this clone to the new func clone
4572 // and update all callers to it, then skip to the next clone.
4573 if (!CallerAssignedToCloneOfFunc) {
4574 AssignCallsiteCloneToFuncClone(
4575 NewFuncClone, Call, Clone,
4576 AllocationCallToContextNodeMap.count(Call));
4577 for (auto &CE : Clone->CallerEdges) {
4578 // Ignore any caller that does not have a recorded callsite Call.
4579 if (!CE->Caller->hasCall())
4580 continue;
4581 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4582 }
4583 continue;
4584 }
4585
4586 // We may need to do additional node cloning in this case.
4587 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
4588 // that were previously assigned to call PreviousAssignedFuncClone,
4589 // to record that they now call NewFuncClone.
4590 // The none type edge removal may remove some of this Clone's caller
4591 // edges, if it is reached via another of its caller's callees.
4592 // Iterate over a copy and skip any that were removed.
4593 auto CallerEdges = Clone->CallerEdges;
4594 for (auto CE : CallerEdges) {
4595 // Skip any that have been removed on an earlier iteration.
4596 if (CE->isRemoved()) {
4597 assert(!is_contained(Clone->CallerEdges, CE));
4598 continue;
4599 }
4600 assert(CE);
4601 // Ignore any caller that does not have a recorded callsite Call.
4602 if (!CE->Caller->hasCall())
4603 continue;
4604
4605 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
4606 // We subsequently fall through to later handling that
4607 // will perform any additional cloning required for
4608 // callers that were calling other function clones.
4609 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
4610 PreviousAssignedFuncClone)
4611 continue;
4612
4613 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4614
4615 // If we are cloning a function that was already assigned to some
4616 // callers, then essentially we are creating new callsite clones
4617 // of the other callsites in that function that are reached by those
4618 // callers. Clone the other callees of the current callsite's caller
4619 // that were already assigned to PreviousAssignedFuncClone
4620 // accordingly. This is important since we subsequently update the
4621 // calls from the nodes in the graph and their assignments to callee
4622 // functions recorded in CallsiteToCalleeFuncCloneMap.
4623 // The none type edge removal may remove some of this caller's
4624 // callee edges, if it is reached via another of its callees.
4625 // Iterate over a copy and skip any that were removed.
4626 auto CalleeEdges = CE->Caller->CalleeEdges;
4627 for (auto CalleeEdge : CalleeEdges) {
4628 // Skip any that have been removed on an earlier iteration when
4629 // cleaning up newly None type callee edges.
4630 if (CalleeEdge->isRemoved()) {
4631 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
4632 continue;
4633 }
4634 assert(CalleeEdge);
4635 ContextNode *Callee = CalleeEdge->Callee;
4636 // Skip the current callsite, we are looking for other
4637 // callsites Caller calls, as well as any that does not have a
4638 // recorded callsite Call.
4639 if (Callee == Clone || !Callee->hasCall())
4640 continue;
4641 // Skip direct recursive calls. We don't need/want to clone the
4642 // caller node again, and this loop will not behave as expected if
4643 // we tried.
4644 if (Callee == CalleeEdge->Caller)
4645 continue;
4646 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge: CalleeEdge);
4647 removeNoneTypeCalleeEdges(Node: NewClone);
4648 // Moving the edge may have resulted in some none type
4649 // callee edges on the original Callee.
4650 removeNoneTypeCalleeEdges(Node: Callee);
4651 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4652 // If the Callee node was already assigned to call a specific
4653 // function version, make sure its new clone is assigned to call
4654 // that same function clone.
4655 if (CallsiteToCalleeFuncCloneMap.count(Callee))
4656 RecordCalleeFuncOfCallsite(
4657 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
4658 // Update NewClone with the new Call clone of this callsite's Call
4659 // created for the new function clone created earlier.
4660 // Recall that we have already ensured when building the graph
4661 // that each caller can only call callsites within the same
4662 // function, so we are guaranteed that Callee Call is in the
4663 // current OrigFunc.
4664 // CallMap is set up as indexed by original Call at clone 0.
4665 CallInfo OrigCall(Callee->getOrigNode()->Call);
4666 OrigCall.setCloneNo(0);
4667 std::map<CallInfo, CallInfo> &CallMap =
4668 FuncClonesToCallMap[NewFuncClone];
4669 assert(CallMap.count(OrigCall));
4670 CallInfo NewCall(CallMap[OrigCall]);
4671 assert(NewCall);
4672 NewClone->setCall(NewCall);
4673 // Need to do the same for all matching calls.
4674 for (auto &MatchingCall : NewClone->MatchingCalls) {
4675 CallInfo OrigMatchingCall(MatchingCall);
4676 OrigMatchingCall.setCloneNo(0);
4677 assert(CallMap.count(OrigMatchingCall));
4678 CallInfo NewCall(CallMap[OrigMatchingCall]);
4679 assert(NewCall);
4680 // Updates the call in the list.
4681 MatchingCall = NewCall;
4682 }
4683 }
4684 }
4685 // Fall through to handling below to perform the recording of the
4686 // function for this callsite clone. This enables handling of cases
4687 // where the callers were assigned to different clones of a function.
4688 }
4689
4690 // See if we can use existing function clone. Walk through
4691 // all caller edges to see if any have already been assigned to
4692 // a clone of this callsite's function. If we can use it, do so. If not,
4693 // because that function clone is already assigned to a different clone
4694 // of this callsite, then we need to clone again.
4695 // Basically, this checking is needed to handle the case where different
4696 // caller functions/callsites may need versions of this function
4697 // containing different mixes of callsite clones across the different
4698 // callsites within the function. If that happens, we need to create
4699 // additional function clones to handle the various combinations.
4700 //
4701 // Keep track of any new clones of this callsite created by the
4702 // following loop, as well as any existing clone that we decided to
4703 // assign this clone to.
4704 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
4705 FuncInfo FuncCloneAssignedToCurCallsiteClone;
4706 // Iterate over a copy of Clone's caller edges, since we may need to
4707 // remove edges in the moveEdgeTo* methods, and this simplifies the
4708 // handling and makes it less error-prone.
4709 auto CloneCallerEdges = Clone->CallerEdges;
4710 for (auto &Edge : CloneCallerEdges) {
4711 // Skip removed edges (due to direct recursive edges updated when
4712 // updating callee edges when moving an edge and subsequently
4713 // removed by call to removeNoneTypeCalleeEdges on the Clone).
4714 if (Edge->isRemoved())
4715 continue;
4716 // Ignore any caller that does not have a recorded callsite Call.
4717 if (!Edge->Caller->hasCall())
4718 continue;
4719 // If this caller already assigned to call a version of OrigFunc, need
4720 // to ensure we can assign this callsite clone to that function clone.
4721 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
4722 FuncInfo FuncCloneCalledByCaller =
4723 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4724 // First we need to confirm that this function clone is available
4725 // for use by this callsite node clone.
4726 //
4727 // While FuncCloneToCurNodeCloneMap is built only for this Node and
4728 // its callsite clones, one of those callsite clones X could have
4729 // been assigned to the same function clone called by Edge's caller
4730 // - if Edge's caller calls another callsite within Node's original
4731 // function, and that callsite has another caller reaching clone X.
4732 // We need to clone Node again in this case.
4733 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
4734 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
4735 Clone) ||
4736 // Detect when we have multiple callers of this callsite that
4737 // have already been assigned to specific, and different, clones
4738 // of OrigFunc (due to other unrelated callsites in Func they
4739 // reach via call contexts). Is this Clone of callsite Node
4740 // assigned to a different clone of OrigFunc? If so, clone Node
4741 // again.
4742 (FuncCloneAssignedToCurCallsiteClone &&
4743 FuncCloneAssignedToCurCallsiteClone !=
4744 FuncCloneCalledByCaller)) {
4745 // We need to use a different newly created callsite clone, in
4746 // order to assign it to another new function clone on a
4747 // subsequent iteration over the Clones array (adjusted below).
4748 // Note we specifically do not reset the
4749 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
4750 // when this new clone is processed later we know which version of
4751 // the function to copy (so that other callsite clones we have
4752 // assigned to that function clone are properly cloned over). See
4753 // comments in the function cloning handling earlier.
4754
4755 // Check if we already have cloned this callsite again while
4756 // walking through caller edges, for a caller calling the same
4757 // function clone. If so, we can move this edge to that new clone
4758 // rather than creating yet another new clone.
4759 if (FuncCloneToNewCallsiteCloneMap.count(
4760 FuncCloneCalledByCaller)) {
4761 ContextNode *NewClone =
4762 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
4763 moveEdgeToExistingCalleeClone(Edge, NewCallee: NewClone);
4764 // Cleanup any none type edges cloned over.
4765 removeNoneTypeCalleeEdges(Node: NewClone);
4766 } else {
4767 // Create a new callsite clone.
4768 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4769 removeNoneTypeCalleeEdges(Node: NewClone);
4770 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
4771 NewClone;
4772 // Add to list of clones and process later.
4773 ClonesWorklist.push_back(NewClone);
4774 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4775 }
4776 // Moving the caller edge may have resulted in some none type
4777 // callee edges.
4778 removeNoneTypeCalleeEdges(Node: Clone);
4779 // We will handle the newly created callsite clone in a subsequent
4780 // iteration over this Node's Clones.
4781 continue;
4782 }
4783
4784 // Otherwise, we can use the function clone already assigned to this
4785 // caller.
4786 if (!FuncCloneAssignedToCurCallsiteClone) {
4787 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
4788 // Assign Clone to FuncCloneCalledByCaller
4789 AssignCallsiteCloneToFuncClone(
4790 FuncCloneCalledByCaller, Call, Clone,
4791 AllocationCallToContextNodeMap.count(Call));
4792 } else
4793 // Don't need to do anything - callsite is already calling this
4794 // function clone.
4795 assert(FuncCloneAssignedToCurCallsiteClone ==
4796 FuncCloneCalledByCaller);
4797
4798 } else {
4799 // We have not already assigned this caller to a version of
4800 // OrigFunc. Do the assignment now.
4801
4802 // First check if we have already assigned this callsite clone to a
4803 // clone of OrigFunc for another caller during this iteration over
4804 // its caller edges.
4805 if (!FuncCloneAssignedToCurCallsiteClone) {
4806 // Find first function in FuncClonesToCallMap without an assigned
4807 // clone of this callsite Node. We should always have one
4808 // available at this point due to the earlier cloning when the
4809 // FuncClonesToCallMap size was smaller than the clone number.
4810 for (auto &CF : FuncClonesToCallMap) {
4811 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
4812 FuncCloneAssignedToCurCallsiteClone = CF.first;
4813 break;
4814 }
4815 }
4816 assert(FuncCloneAssignedToCurCallsiteClone);
4817 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
4818 AssignCallsiteCloneToFuncClone(
4819 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4820 AllocationCallToContextNodeMap.count(Call));
4821 } else
4822 assert(FuncCloneToCurNodeCloneMap
4823 [FuncCloneAssignedToCurCallsiteClone] == Clone);
4824 // Update callers to record function version called.
4825 RecordCalleeFuncOfCallsite(Edge->Caller,
4826 FuncCloneAssignedToCurCallsiteClone);
4827 }
4828 }
4829 }
4830 if (VerifyCCG) {
4831 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
4832 for (const auto &PE : Node->CalleeEdges)
4833 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4834 for (const auto &CE : Node->CallerEdges)
4835 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4836 for (auto *Clone : Node->Clones) {
4837 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4838 for (const auto &PE : Clone->CalleeEdges)
4839 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4840 for (const auto &CE : Clone->CallerEdges)
4841 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4842 }
4843 }
4844 }
4845 }
4846
4847 uint8_t BothTypes =
4848 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
4849
4850 auto UpdateCalls = [&](ContextNode *Node,
4851 DenseSet<const ContextNode *> &Visited,
4852 auto &&UpdateCalls) {
4853 auto Inserted = Visited.insert(Node);
4854 if (!Inserted.second)
4855 return;
4856
4857 for (auto *Clone : Node->Clones)
4858 UpdateCalls(Clone, Visited, UpdateCalls);
4859
4860 for (auto &Edge : Node->CallerEdges)
4861 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
4862
4863 // Skip if either no call to update, or if we ended up with no context ids
4864 // (we moved all edges onto other clones).
4865 if (!Node->hasCall() || Node->emptyContextIds())
4866 return;
4867
4868 if (Node->IsAllocation) {
4869 auto AT = allocTypeToUse(Node->AllocTypes);
4870 // If the allocation type is ambiguous, and more aggressive hinting
4871 // has been enabled via the MinClonedColdBytePercent flag, see if this
4872 // allocation should be hinted cold anyway because its fraction cold bytes
4873 // allocated is at least the given threshold.
4874 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
4875 !ContextIdToContextSizeInfos.empty()) {
4876 uint64_t TotalCold = 0;
4877 uint64_t Total = 0;
4878 for (auto Id : Node->getContextIds()) {
4879 auto TypeI = ContextIdToAllocationType.find(Id);
4880 assert(TypeI != ContextIdToAllocationType.end());
4881 auto CSI = ContextIdToContextSizeInfos.find(Id);
4882 if (CSI != ContextIdToContextSizeInfos.end()) {
4883 for (auto &Info : CSI->second) {
4884 Total += Info.TotalSize;
4885 if (TypeI->second == AllocationType::Cold)
4886 TotalCold += Info.TotalSize;
4887 }
4888 }
4889 }
4890 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
4891 AT = AllocationType::Cold;
4892 }
4893 updateAllocationCall(Call&: Node->Call, AllocType: AT);
4894 assert(Node->MatchingCalls.empty());
4895 return;
4896 }
4897
4898 if (!CallsiteToCalleeFuncCloneMap.count(Node))
4899 return;
4900
4901 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
4902 updateCall(CallerCall&: Node->Call, CalleeFunc);
4903 // Update all the matching calls as well.
4904 for (auto &Call : Node->MatchingCalls)
4905 updateCall(CallerCall&: Call, CalleeFunc);
4906 };
4907
4908 // Performs DFS traversal starting from allocation nodes to update calls to
4909 // reflect cloning decisions recorded earlier. For regular LTO this will
4910 // update the actual calls in the IR to call the appropriate function clone
4911 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
4912 // are recorded in the summary entries.
4913 DenseSet<const ContextNode *> Visited;
4914 for (auto &Entry : AllocationCallToContextNodeMap)
4915 UpdateCalls(Entry.second, Visited, UpdateCalls);
4916
4917 return Changed;
4918}
4919
4920static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
4921 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
4922 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
4923 &FuncToAliasMap) {
4924 // The first "clone" is the original copy, we should only call this if we
4925 // needed to create new clones.
4926 assert(NumClones > 1);
4927 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
4928 VMaps.reserve(N: NumClones - 1);
4929 FunctionsClonedThinBackend++;
4930 for (unsigned I = 1; I < NumClones; I++) {
4931 VMaps.emplace_back(Args: std::make_unique<ValueToValueMapTy>());
4932 auto *NewF = CloneFunction(F: &F, VMap&: *VMaps.back());
4933 FunctionClonesThinBackend++;
4934 // Strip memprof and callsite metadata from clone as they are no longer
4935 // needed.
4936 for (auto &BB : *NewF) {
4937 for (auto &Inst : BB) {
4938 Inst.setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
4939 Inst.setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
4940 }
4941 }
4942 std::string Name = getMemProfFuncName(Base: F.getName(), CloneNo: I);
4943 auto *PrevF = M.getFunction(Name);
4944 if (PrevF) {
4945 // We might have created this when adjusting callsite in another
4946 // function. It should be a declaration.
4947 assert(PrevF->isDeclaration());
4948 NewF->takeName(V: PrevF);
4949 PrevF->replaceAllUsesWith(V: NewF);
4950 PrevF->eraseFromParent();
4951 } else
4952 NewF->setName(Name);
4953 if (auto *SP = NewF->getSubprogram())
4954 SP->replaceLinkageName(
4955 LN: MDString::get(Context&: NewF->getParent()->getContext(), Str: Name));
4956 ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
4957 << "created clone " << ore::NV("NewFunction", NewF));
4958
4959 // Now handle aliases to this function, and clone those as well.
4960 if (!FuncToAliasMap.count(x: &F))
4961 continue;
4962 for (auto *A : FuncToAliasMap[&F]) {
4963 std::string Name = getMemProfFuncName(Base: A->getName(), CloneNo: I);
4964 auto *PrevA = M.getNamedAlias(Name);
4965 auto *NewA = GlobalAlias::create(Ty: A->getValueType(),
4966 AddressSpace: A->getType()->getPointerAddressSpace(),
4967 Linkage: A->getLinkage(), Name, Aliasee: NewF);
4968 NewA->copyAttributesFrom(Src: A);
4969 if (PrevA) {
4970 // We might have created this when adjusting callsite in another
4971 // function. It should be a declaration.
4972 assert(PrevA->isDeclaration());
4973 NewA->takeName(V: PrevA);
4974 PrevA->replaceAllUsesWith(V: NewA);
4975 PrevA->eraseFromParent();
4976 }
4977 }
4978 }
4979 return VMaps;
4980}
4981
4982// Locate the summary for F. This is complicated by the fact that it might
4983// have been internalized or promoted.
4984static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
4985 const ModuleSummaryIndex *ImportSummary,
4986 const Function *CallingFunc = nullptr) {
4987 // FIXME: Ideally we would retain the original GUID in some fashion on the
4988 // function (e.g. as metadata), but for now do our best to locate the
4989 // summary without that information.
4990 ValueInfo TheFnVI = ImportSummary->getValueInfo(GUID: F.getGUID());
4991 if (!TheFnVI)
4992 // See if theFn was internalized, by checking index directly with
4993 // original name (this avoids the name adjustment done by getGUID() for
4994 // internal symbols).
4995 TheFnVI = ImportSummary->getValueInfo(
4996 GUID: GlobalValue::getGUIDAssumingExternalLinkage(GlobalName: F.getName()));
4997 if (TheFnVI)
4998 return TheFnVI;
4999 // Now query with the original name before any promotion was performed.
5000 StringRef OrigName =
5001 ModuleSummaryIndex::getOriginalNameBeforePromote(Name: F.getName());
5002 // When this pass is enabled, we always add thinlto_src_file provenance
5003 // metadata to imported function definitions, which allows us to recreate the
5004 // original internal symbol's GUID.
5005 auto SrcFileMD = F.getMetadata(Kind: "thinlto_src_file");
5006 // If this is a call to an imported/promoted local for which we didn't import
5007 // the definition, the metadata will not exist on the declaration. However,
5008 // since we are doing this early, before any inlining in the LTO backend, we
5009 // can simply look at the metadata on the calling function which must have
5010 // been from the same module if F was an internal symbol originally.
5011 if (!SrcFileMD && F.isDeclaration()) {
5012 // We would only call this for a declaration for a direct callsite, in which
5013 // case the caller would have provided the calling function pointer.
5014 assert(CallingFunc);
5015 SrcFileMD = CallingFunc->getMetadata(Kind: "thinlto_src_file");
5016 // If this is a promoted local (OrigName != F.getName()), since this is a
5017 // declaration, it must be imported from a different module and therefore we
5018 // should always find the metadata on its calling function. Any call to a
5019 // promoted local that came from this module should still be a definition.
5020 assert(SrcFileMD || OrigName == F.getName());
5021 }
5022 StringRef SrcFile = M.getSourceFileName();
5023 if (SrcFileMD)
5024 SrcFile = dyn_cast<MDString>(Val: SrcFileMD->getOperand(I: 0))->getString();
5025 std::string OrigId = GlobalValue::getGlobalIdentifier(
5026 Name: OrigName, Linkage: GlobalValue::InternalLinkage, FileName: SrcFile);
5027 TheFnVI = ImportSummary->getValueInfo(
5028 GUID: GlobalValue::getGUIDAssumingExternalLinkage(GlobalName: OrigId));
5029 // Internal func in original module may have gotten a numbered suffix if we
5030 // imported an external function with the same name. This happens
5031 // automatically during IR linking for naming conflicts. It would have to
5032 // still be internal in that case (otherwise it would have been renamed on
5033 // promotion in which case we wouldn't have a naming conflict).
5034 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5035 F.getName().contains(C: '.')) {
5036 OrigName = F.getName().rsplit(Separator: '.').first;
5037 OrigId = GlobalValue::getGlobalIdentifier(
5038 Name: OrigName, Linkage: GlobalValue::InternalLinkage, FileName: SrcFile);
5039 TheFnVI = ImportSummary->getValueInfo(
5040 GUID: GlobalValue::getGUIDAssumingExternalLinkage(GlobalName: OrigId));
5041 }
5042 // The only way we may not have a VI is if this is a declaration created for
5043 // an imported reference. For distributed ThinLTO we may not have a VI for
5044 // such declarations in the distributed summary.
5045 assert(TheFnVI || F.isDeclaration());
5046 return TheFnVI;
5047}
5048
5049bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5050 Module &M) {
5051 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5052 Symtab = std::make_unique<InstrProfSymtab>();
5053 // Don't add canonical names, to avoid multiple functions to the symtab
5054 // when they both have the same root name with "." suffixes stripped.
5055 // If we pick the wrong one then this could lead to incorrect ICP and calling
5056 // a memprof clone that we don't actually create (resulting in linker unsats).
5057 // What this means is that the GUID of the function (or its PGOFuncName
5058 // metadata) *must* match that in the VP metadata to allow promotion.
5059 // In practice this should not be a limitation, since local functions should
5060 // have PGOFuncName metadata and global function names shouldn't need any
5061 // special handling (they should not get the ".llvm.*" suffix that the
5062 // canonicalization handling is attempting to strip).
5063 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5064 std::string SymtabFailure = toString(E: std::move(E));
5065 M.getContext().emitError(ErrorStr: "Failed to create symtab: " + SymtabFailure);
5066 return false;
5067 }
5068 return true;
5069}
5070
#ifndef NDEBUG
// Sanity check that the MIB stack ids match between the summary and
// instruction metadata. Each MIB metadata operand on the allocation is
// compared, frame by frame, against the corresponding summary MIB record's
// stack id indices (resolved through ImportSummary).
static void checkAllocContextIds(
    const AllocInfo &AllocNode, const MDNode *MemProfMD,
    const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
    const ModuleSummaryIndex *ImportSummary) {
  auto MIBIter = AllocNode.MIBs.begin();
  // The MIB metadata operands and the summary MIB records are expected to be
  // in the same order, one summary record per metadata operand.
  for (auto &MDOp : MemProfMD->operands()) {
    assert(MIBIter != AllocNode.MIBs.end());
    auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
    auto *MIBMD = cast<const MDNode>(MDOp);
    MDNode *StackMDNode = getMIBStackNode(MIBMD);
    assert(StackMDNode);
    CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
    // Start past the portion of the MIB context shared with the callsite
    // metadata context.
    auto ContextIterBegin =
        StackContext.beginAfterSharedPrefix(CallsiteContext);
    // Skip the checking on the first iteration: seed LastStackContextId with
    // a value guaranteed to differ from the first frame's id, so the
    // direct-recursion skip below never triggers on the first frame.
    uint64_t LastStackContextId =
        (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
                                                                           : 0;
    for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
         ++ContextIter) {
      // If this is a direct recursion, simply skip the duplicate
      // entries, to be consistent with how the summary ids were
      // generated during ModuleSummaryAnalysis.
      if (LastStackContextId == *ContextIter)
        continue;
      LastStackContextId = *ContextIter;
      assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
      assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
             *ContextIter);
      StackIdIndexIter++;
    }
    MIBIter++;
  }
}
#endif
5109
// Apply the cloning decisions recorded in the import summary (made during the
// thin link) to this module's IR: create function clones, redirect calls in
// each clone to the assigned callee clones, attach memprof allocation-type
// attributes, record/perform any needed indirect call promotion, and finally
// strip the memprof/callsite metadata. Returns true if the module changed.
bool MemProfContextDisambiguation::applyImport(Module &M) {
  assert(ImportSummary);
  bool Changed = false;

  // We also need to clone any aliases that reference cloned functions, because
  // the modified callsites may invoke via the alias. Keep track of the aliases
  // for each function.
  std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
      FuncToAliasMap;
  for (auto &A : M.aliases()) {
    auto *Aliasee = A.getAliaseeObject();
    if (auto *F = dyn_cast<Function>(Val: Aliasee))
      FuncToAliasMap[F].insert(Ptr: &A);
  }

  if (!initializeIndirectCallPromotionInfo(M))
    return false;

  for (auto &F : M) {
    if (F.isDeclaration() || isMemProfClone(F))
      continue;

    OptimizationRemarkEmitter ORE(&F);

    // VMaps[J-1] maps original values to their copies in clone J (clone 0 is
    // the original function and has no VMap).
    SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
    bool ClonesCreated = false;
    unsigned NumClonesCreated = 0;
    // Lazily clone F the first time a summary record requires more than one
    // version; subsequent calls just validate the clone count is consistent.
    auto CloneFuncIfNeeded = [&](unsigned NumClones) {
      // We should at least have version 0 which is the original copy.
      assert(NumClones > 0);
      // If only one copy needed use original.
      if (NumClones == 1)
        return;
      // If we already performed cloning of this function, confirm that the
      // requested number of clones matches (the thin link should ensure the
      // number of clones for each constituent callsite is consistent within
      // each function), before returning.
      if (ClonesCreated) {
        assert(NumClonesCreated == NumClones);
        return;
      }
      VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
      // The first "clone" is the original copy, which doesn't have a VMap.
      assert(VMaps.size() == NumClones - 1);
      Changed = true;
      ClonesCreated = true;
      NumClonesCreated = NumClones;
    };

    // Redirect this callsite (and its copies in each function clone) to the
    // callee clones recorded in the summary StackNode.
    auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
                             Function *CalledFunction) {
      // Perform cloning if not yet done.
      CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());

      assert(!isMemProfClone(*CalledFunction));

      // Because we update the cloned calls by calling setCalledOperand (see
      // comment below), out of an abundance of caution make sure the called
      // function was actually the called operand (or its aliasee). We also
      // strip pointer casts when looking for calls (to match behavior during
      // summary generation), however, with opaque pointers in theory this
      // should not be an issue. Note we still clone the current function
      // (containing this call) above, as that could be needed for its callers.
      auto *GA = dyn_cast_or_null<GlobalAlias>(Val: CB->getCalledOperand());
      if (CalledFunction != CB->getCalledOperand() &&
          (!GA || CalledFunction != GA->getAliaseeObject())) {
        SkippedCallsCloning++;
        return;
      }
      // Update the calls per the summary info.
      // Save orig name since it gets updated in the first iteration
      // below.
      auto CalleeOrigName = CalledFunction->getName();
      for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
        // Do nothing if this version calls the original version of its
        // callee.
        if (!StackNode.Clones[J])
          continue;
        auto NewF = M.getOrInsertFunction(
            Name: getMemProfFuncName(Base: CalleeOrigName, CloneNo: StackNode.Clones[J]),
            T: CalledFunction->getFunctionType());
        CallBase *CBClone;
        // Copy 0 is the original function.
        if (!J)
          CBClone = CB;
        else
          CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
        // Set the called operand directly instead of calling setCalledFunction,
        // as the latter mutates the function type on the call. In rare cases
        // we may have a slightly different type on a callee function
        // declaration due to it being imported from a different module with
        // incomplete types. We really just want to change the name of the
        // function to the clone, and not make any type changes.
        CBClone->setCalledOperand(NewF.getCallee());
        ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
                 << ore::NV("Call", CBClone) << " in clone "
                 << ore::NV("Caller", CBClone->getFunction())
                 << " assigned to call function clone "
                 << ore::NV("Callee", NewF.getCallee()));
      }
    };

    // Locate the summary for F.
    ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
    // If not found, this could be an imported local (see comment in
    // findValueInfoForFunc). Skip for now as it will be cloned in its original
    // module (where it would have been promoted to global scope so should
    // satisfy any reference in this module).
    if (!TheFnVI)
      continue;

    auto *GVSummary =
        ImportSummary->findSummaryInModule(VI: TheFnVI, ModuleId: M.getModuleIdentifier());
    if (!GVSummary) {
      // Must have been imported, use the summary which matches the definition
      // (might be multiple if this was a linkonce_odr).
      auto SrcModuleMD = F.getMetadata(Kind: "thinlto_src_module");
      assert(SrcModuleMD &&
             "enable-import-metadata is needed to emit thinlto_src_module");
      StringRef SrcModule =
          dyn_cast<MDString>(Val: SrcModuleMD->getOperand(I: 0))->getString();
      for (auto &GVS : TheFnVI.getSummaryList()) {
        if (GVS->modulePath() == SrcModule) {
          GVSummary = GVS.get();
          break;
        }
      }
      assert(GVSummary && GVSummary->modulePath() == SrcModule);
    }

    // If this was an imported alias skip it as we won't have the function
    // summary, and it should be cloned in the original module.
    if (isa<AliasSummary>(Val: GVSummary))
      continue;

    auto *FS = cast<FunctionSummary>(Val: GVSummary->getBaseObject());

    if (FS->allocs().empty() && FS->callsites().empty())
      continue;

    // These iterators are advanced in step with the IR walk below; the
    // instruction order is assumed to match the summary record order.
    auto SI = FS->callsites().begin();
    auto AI = FS->allocs().begin();

    // To handle callsite infos synthesized for tail calls which have missing
    // frames in the profiled context, map callee VI to the synthesized callsite
    // info.
    DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
    // Iterate the callsites for this function in reverse, since we place all
    // those synthesized for tail calls at the end.
    for (auto CallsiteIt = FS->callsites().rbegin();
         CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
      auto &Callsite = *CallsiteIt;
      // Stop as soon as we see a non-synthesized callsite info (see comment
      // above loop). All the entries added for discovered tail calls have empty
      // stack ids.
      if (!Callsite.StackIdIndices.empty())
        break;
      MapTailCallCalleeVIToCallsite.insert(KV: {Callsite.Callee, Callsite});
    }

    // Keeps track of needed ICP for the function.
    SmallVector<ICallAnalysisData> ICallAnalysisInfo;

    // Assume for now that the instructions are in the exact same order
    // as when the summary was created, but confirm this is correct by
    // matching the stack ids.
    for (auto &BB : F) {
      for (auto &I : BB) {
        auto *CB = dyn_cast<CallBase>(Val: &I);
        // Same handling as when creating module summary.
        if (!mayHaveMemprofSummary(CB))
          continue;

        auto *CalledValue = CB->getCalledOperand();
        auto *CalledFunction = CB->getCalledFunction();
        if (CalledValue && !CalledFunction) {
          CalledValue = CalledValue->stripPointerCasts();
          // Stripping pointer casts can reveal a called function.
          CalledFunction = dyn_cast<Function>(Val: CalledValue);
        }
        // Check if this is an alias to a function. If so, get the
        // called aliasee for the checks below.
        if (auto *GA = dyn_cast<GlobalAlias>(Val: CalledValue)) {
          assert(!CalledFunction &&
                 "Expected null called function in callsite for alias");
          CalledFunction = dyn_cast<Function>(Val: GA->getAliaseeObject());
        }

        CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
            I.getMetadata(KindID: LLVMContext::MD_callsite));
        auto *MemProfMD = I.getMetadata(KindID: LLVMContext::MD_memprof);

        // Include allocs that were already assigned a memprof function
        // attribute in the statistics.
        if (CB->getAttributes().hasFnAttr(Kind: "memprof")) {
          assert(!MemProfMD);
          CB->getAttributes().getFnAttr(Kind: "memprof").getValueAsString() == "cold"
              ? AllocTypeColdThinBackend++
              : AllocTypeNotColdThinBackend++;
          OrigAllocsThinBackend++;
          AllocVersionsThinBackend++;
          if (!MaxAllocVersionsThinBackend)
            MaxAllocVersionsThinBackend = 1;
          continue;
        }

        if (MemProfMD) {
          // Consult the next alloc node.
          assert(AI != FS->allocs().end());
          auto &AllocNode = *(AI++);

#ifndef NDEBUG
          checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
                               ImportSummary);
#endif

          // Perform cloning if not yet done.
          CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());

          OrigAllocsThinBackend++;
          AllocVersionsThinBackend += AllocNode.Versions.size();
          if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
            MaxAllocVersionsThinBackend = AllocNode.Versions.size();

          // If there is only one version that means we didn't end up
          // considering this function for cloning, and in that case the alloc
          // will still be none type or should have gotten the default NotCold.
          // Skip that after calling clone helper since that does some sanity
          // checks that confirm we haven't decided yet that we need cloning.
          // We might have a single version that is cold due to the
          // MinClonedColdBytePercent heuristic, make sure we don't skip in that
          // case.
          if (AllocNode.Versions.size() == 1 &&
              (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
            assert((AllocationType)AllocNode.Versions[0] ==
                       AllocationType::NotCold ||
                   (AllocationType)AllocNode.Versions[0] ==
                       AllocationType::None);
            UnclonableAllocsThinBackend++;
            continue;
          }

          // All versions should have a singular allocation type.
          assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
            return Type == ((uint8_t)AllocationType::NotCold |
                            (uint8_t)AllocationType::Cold);
          }));

          // Update the allocation types per the summary info.
          for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
            // Ignore any that didn't get an assigned allocation type.
            if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
              continue;
            AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
            AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
                                            : AllocTypeNotColdThinBackend++;
            std::string AllocTypeString = getAllocTypeAttributeString(Type: AllocTy);
            auto A = llvm::Attribute::get(Context&: F.getContext(), Kind: "memprof",
                                          Val: AllocTypeString);
            CallBase *CBClone;
            // Copy 0 is the original function.
            if (!J)
              CBClone = CB;
            else
              // Since VMaps are only created for new clones, we index with
              // clone J-1 (J==0 is the original clone and does not have a VMaps
              // entry).
              CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
            CBClone->addFnAttr(Attr: A);
            ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
                     << ore::NV("AllocationCall", CBClone) << " in clone "
                     << ore::NV("Caller", CBClone->getFunction())
                     << " marked with memprof allocation attribute "
                     << ore::NV("Attribute", AllocTypeString));
          }
        } else if (!CallsiteContext.empty()) {
          if (!CalledFunction) {
#ifndef NDEBUG
            // We should have skipped inline assembly calls.
            auto *CI = dyn_cast<CallInst>(CB);
            assert(!CI || !CI->isInlineAsm());
#endif
            // We should have skipped direct calls via a Constant.
            assert(CalledValue && !isa<Constant>(CalledValue));

            // This is an indirect call, see if we have profile information and
            // whether any clones were recorded for the profiled targets (that
            // we synthesized CallsiteInfo summary records for when building the
            // index).
            auto NumClones =
                recordICPInfo(CB, AllCallsites: FS->callsites(), SI, ICallAnalysisInfo);

            // Perform cloning if not yet done. This is done here in case
            // we don't need to do ICP, but might need to clone this
            // function as it is the target of other cloned calls.
            if (NumClones)
              CloneFuncIfNeeded(NumClones);
          }

          else {
            // Consult the next callsite node.
            assert(SI != FS->callsites().end());
            auto &StackNode = *(SI++);

#ifndef NDEBUG
            // Sanity check that the stack ids match between the summary and
            // instruction metadata.
            auto StackIdIndexIter = StackNode.StackIdIndices.begin();
            for (auto StackId : CallsiteContext) {
              assert(StackIdIndexIter != StackNode.StackIdIndices.end());
              assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
                     StackId);
              StackIdIndexIter++;
            }
#endif

            CloneCallsite(StackNode, CB, CalledFunction);
          }
        } else if (CB->isTailCall() && CalledFunction) {
          // Locate the synthesized callsite info for the callee VI, if any was
          // created, and use that for cloning.
          ValueInfo CalleeVI =
              findValueInfoForFunc(F: *CalledFunction, M, ImportSummary, CallingFunc: &F);
          if (CalleeVI && MapTailCallCalleeVIToCallsite.count(Val: CalleeVI)) {
            auto Callsite = MapTailCallCalleeVIToCallsite.find(Val: CalleeVI);
            assert(Callsite != MapTailCallCalleeVIToCallsite.end());
            CloneCallsite(Callsite->second, CB, CalledFunction);
          }
        }
      }
    }

    // Now do any promotion required for cloning.
    performICP(M, AllCallsites: FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
  }

  // We skip some of the functions and instructions above, so remove all the
  // metadata in a single sweep here.
  for (auto &F : M) {
    // We can skip memprof clones because createFunctionClones already strips
    // the metadata from the newly created clones.
    if (F.isDeclaration() || isMemProfClone(F))
      continue;
    for (auto &BB : F) {
      for (auto &I : BB) {
        if (!isa<CallBase>(Val: I))
          continue;
        I.setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
        I.setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
      }
    }
  }

  return Changed;
}
5465
// For the indirect call CB, match its profiled target candidates (from VP
// metadata) against the CallsiteInfo summary records starting at SI (one
// synthesized record per candidate), and record the information needed to
// perform ICP later if any candidate's callsite was assigned a callee clone.
// Advances SI past the consumed records. Returns the number of clones of the
// containing function per those records (0 if there is no profile data).
unsigned MemProfContextDisambiguation::recordICPInfo(
    CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
    ArrayRef<CallsiteInfo>::iterator &SI,
    SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
  // First see if we have profile information for this indirect call.
  uint32_t NumCandidates;
  uint64_t TotalCount;
  auto CandidateProfileData =
      ICallAnalysis->getPromotionCandidatesForInstruction(I: CB, TotalCount,
                                                          NumCandidates);
  if (CandidateProfileData.empty())
    return 0;

  // Iterate through all of the candidate profiled targets along with the
  // CallsiteInfo summary records synthesized for them when building the index,
  // and see if any are cloned and/or refer to clones.
  bool ICPNeeded = false;
  unsigned NumClones = 0;
  size_t CallsiteInfoStartIndex = std::distance(first: AllCallsites.begin(), last: SI);
  for (const auto &Candidate : CandidateProfileData) {
    // The declaration is only needed for the assert below, so it is guarded
    // by NDEBUG; in release builds the initializer below becomes a discarded
    // expression statement.
#ifndef NDEBUG
    auto CalleeValueInfo =
#endif
        ImportSummary->getValueInfo(GUID: Candidate.Value);
    // We might not have a ValueInfo if this is a distributed
    // ThinLTO backend and decided not to import that function.
    assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
    assert(SI != AllCallsites.end());
    auto &StackNode = *(SI++);
    // See if any of the clones of the indirect callsite for this
    // profiled target should call a cloned version of the profiled
    // target. We only need to do the ICP here if so.
    ICPNeeded |= llvm::any_of(Range: StackNode.Clones,
                              P: [](unsigned CloneNo) { return CloneNo != 0; });
    // Every callsite in the same function should have been cloned the same
    // number of times.
    assert(!NumClones || NumClones == StackNode.Clones.size());
    NumClones = StackNode.Clones.size();
  }
  if (!ICPNeeded)
    return NumClones;
  // Save information for ICP, which is performed later to avoid messing up the
  // current function traversal.
  ICallAnalysisInfo.push_back(Elt: {.CB: CB, .CandidateProfileData: CandidateProfileData.vec(), .NumCandidates: NumCandidates,
                              .TotalCount: TotalCount, .CallsiteInfoStartIndex: CallsiteInfoStartIndex});
  return NumClones;
}
5513
// Perform the deferred indirect call promotion recorded by recordICPInfo:
// for each saved callsite, speculatively devirtualize every clone of the call
// towards each viable profiled target, redirect the resulting direct call to
// the assigned target clone, and fix up the remaining value-profile metadata.
void MemProfContextDisambiguation::performICP(
    Module &M, ArrayRef<CallsiteInfo> AllCallsites,
    ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
    ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
    OptimizationRemarkEmitter &ORE) {
  // Now do any promotion required for cloning. Specifically, for each
  // recorded ICP candidate (which was only recorded because one clone of that
  // candidate should call a cloned target), we perform ICP (speculative
  // devirtualization) for each clone of the callsite, and update its callee
  // to the appropriate clone. Note that the ICP compares against the original
  // version of the target, which is what is in the vtable.
  for (auto &Info : ICallAnalysisInfo) {
    auto *CB = Info.CB;
    auto CallsiteIndex = Info.CallsiteInfoStartIndex;
    auto TotalCount = Info.TotalCount;
    unsigned NumPromoted = 0;
    unsigned NumClones = 0;

    for (auto &Candidate : Info.CandidateProfileData) {
      // Consume the next summary record for this candidate (records were laid
      // out contiguously starting at CallsiteInfoStartIndex).
      auto &StackNode = AllCallsites[CallsiteIndex++];

      // All calls in the same function must have the same number of clones.
      assert(!NumClones || NumClones == StackNode.Clones.size());
      NumClones = StackNode.Clones.size();

      // See if the target is in the module. If it wasn't imported, it is
      // possible that this profile could have been collected on a different
      // target (or version of the code), and we need to be conservative
      // (similar to what is done in the ICP pass).
      Function *TargetFunction = Symtab->getFunction(FuncMD5Hash: Candidate.Value);
      if (TargetFunction == nullptr ||
          // Any ThinLTO global dead symbol removal should have already
          // occurred, so it should be safe to promote when the target is a
          // declaration.
          // TODO: Remove internal option once more fully tested.
          (MemProfRequireDefinitionForPromotion &&
           TargetFunction->isDeclaration())) {
        ORE.emit(RemarkBuilder: [&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
                 << "Memprof cannot promote indirect call: target with md5sum "
                 << ore::NV("target md5sum", Candidate.Value) << " not found";
        });
        // FIXME: See if we can use the new declaration importing support to
        // at least get the declarations imported for this case. Hot indirect
        // targets should have been imported normally, however.
        continue;
      }

      // Check if legal to promote
      const char *Reason = nullptr;
      if (!isLegalToPromote(CB: *CB, Callee: TargetFunction, FailureReason: &Reason)) {
        ORE.emit(RemarkBuilder: [&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
                 << "Memprof cannot promote indirect call to "
                 << ore::NV("TargetFunction", TargetFunction)
                 << " with count of " << ore::NV("TotalCount", TotalCount)
                 << ": " << Reason;
        });
        continue;
      }

      assert(!isMemProfClone(*TargetFunction));

      // Handle each call clone, applying ICP so that each clone directly
      // calls the specified callee clone, guarded by the appropriate ICP
      // check.
      CallBase *CBClone = CB;
      for (unsigned J = 0; J < NumClones; J++) {
        // Copy 0 is the original function.
        if (J > 0)
          CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
        // We do the promotion using the original name, so that the comparison
        // is against the name in the vtable. Then just below, change the new
        // direct call to call the cloned function.
        auto &DirectCall =
            pgo::promoteIndirectCall(CB&: *CBClone, F: TargetFunction, Count: Candidate.Count,
                                     TotalCount, AttachProfToDirectCall: isSamplePGO, ORE: &ORE);
        auto *TargetToUse = TargetFunction;
        // Call original if this version calls the original version of its
        // callee.
        if (StackNode.Clones[J]) {
          TargetToUse =
              cast<Function>(Val: M.getOrInsertFunction(
                                    Name: getMemProfFuncName(Base: TargetFunction->getName(),
                                                       CloneNo: StackNode.Clones[J]),
                                    T: TargetFunction->getFunctionType())
                                 .getCallee());
        }
        DirectCall.setCalledFunction(TargetToUse);
        // During matching we generate synthetic VP metadata for indirect calls
        // not already having any, from the memprof profile's callee GUIDs. If
        // we subsequently promote and inline those callees, we currently lose
        // the ability to generate this synthetic VP metadata. Optionally apply
        // a noinline attribute to promoted direct calls, where the threshold is
        // set to capture synthetic VP metadata targets which get a count of 1.
        if (MemProfICPNoInlineThreshold &&
            Candidate.Count < MemProfICPNoInlineThreshold)
          DirectCall.setIsNoInline();
        ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
                 << ore::NV("Call", CBClone) << " in clone "
                 << ore::NV("Caller", CBClone->getFunction())
                 << " promoted and assigned to call function clone "
                 << ore::NV("Callee", TargetToUse));
      }

      // Update TotalCount (all clones should get same count above)
      TotalCount -= Candidate.Count;
      NumPromoted++;
    }
    // Adjust the MD.prof metadata for all clones, now that we have the new
    // TotalCount and the number promoted.
    CallBase *CBClone = CB;
    for (unsigned J = 0; J < NumClones; J++) {
      // Copy 0 is the original function.
      if (J > 0)
        CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
      // First delete the old one.
      CBClone->setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr);
      // If all promoted, we don't need the MD.prof metadata.
      // Otherwise we need update with the un-promoted records back.
      if (TotalCount != 0)
        annotateValueSite(
            M, Inst&: *CBClone, VDs: ArrayRef(Info.CandidateProfileData).slice(N: NumPromoted),
            Sum: TotalCount, ValueKind: IPVK_IndirectCallTarget, MaxMDCount: Info.NumCandidates);
    }
  }
}
5641
5642template <typename DerivedCCG, typename FuncTy, typename CallTy>
5643bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
5644 if (DumpCCG) {
5645 dbgs() << "CCG before cloning:\n";
5646 dbgs() << *this;
5647 }
5648 if (ExportToDot)
5649 exportToDot(Label: "postbuild");
5650
5651 if (VerifyCCG) {
5652 check();
5653 }
5654
5655 identifyClones();
5656
5657 if (VerifyCCG) {
5658 check();
5659 }
5660
5661 if (DumpCCG) {
5662 dbgs() << "CCG after cloning:\n";
5663 dbgs() << *this;
5664 }
5665 if (ExportToDot)
5666 exportToDot(Label: "cloned");
5667
5668 bool Changed = assignFunctions();
5669
5670 if (DumpCCG) {
5671 dbgs() << "CCG after assigning function clones:\n";
5672 dbgs() << *this;
5673 }
5674 if (ExportToDot)
5675 exportToDot(Label: "clonefuncassign");
5676
5677 if (MemProfReportHintedSizes)
5678 printTotalSizes(OS&: errs());
5679
5680 return Changed;
5681}
5682
5683bool MemProfContextDisambiguation::processModule(
5684 Module &M,
5685 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
5686
5687 // If we have an import summary, then the cloning decisions were made during
5688 // the thin link on the index. Apply them and return.
5689 if (ImportSummary)
5690 return applyImport(M);
5691
5692 // TODO: If/when other types of memprof cloning are enabled beyond just for
5693 // hot and cold, we will need to change this to individually control the
5694 // AllocationType passed to addStackNodesForMIB during CCG construction.
5695 // Note that we specifically check this after applying imports above, so that
5696 // the option isn't needed to be passed to distributed ThinLTO backend
5697 // clang processes, which won't necessarily have visibility into the linker
5698 // dependences. Instead the information is communicated from the LTO link to
5699 // the backends via the combined summary index.
5700 if (!SupportsHotColdNew)
5701 return false;
5702
5703 ModuleCallsiteContextGraph CCG(M, OREGetter);
5704 return CCG.process();
5705}
5706
5707MemProfContextDisambiguation::MemProfContextDisambiguation(
5708 const ModuleSummaryIndex *Summary, bool isSamplePGO)
5709 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
5710 // Check the dot graph printing options once here, to make sure we have valid
5711 // and expected combinations.
5712 if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
5713 llvm::report_fatal_error(
5714 reason: "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
5715 if (DotGraphScope == DotScope::Context &&
5716 !ContextIdForDot.getNumOccurrences())
5717 llvm::report_fatal_error(
5718 reason: "-memprof-dot-scope=context requires -memprof-dot-context-id");
5719 if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
5720 ContextIdForDot.getNumOccurrences())
5721 llvm::report_fatal_error(
5722 reason: "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
5723 "-memprof-dot-context-id");
5724 if (ImportSummary) {
5725 // The MemProfImportSummary should only be used for testing ThinLTO
5726 // distributed backend handling via opt, in which case we don't have a
5727 // summary from the pass pipeline.
5728 assert(MemProfImportSummary.empty());
5729 return;
5730 }
5731 if (MemProfImportSummary.empty())
5732 return;
5733
5734 auto ReadSummaryFile =
5735 errorOrToExpected(EO: MemoryBuffer::getFile(Filename: MemProfImportSummary));
5736 if (!ReadSummaryFile) {
5737 logAllUnhandledErrors(E: ReadSummaryFile.takeError(), OS&: errs(),
5738 ErrorBanner: "Error loading file '" + MemProfImportSummary +
5739 "': ");
5740 return;
5741 }
5742 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(Buffer: **ReadSummaryFile);
5743 if (!ImportSummaryForTestingOrErr) {
5744 logAllUnhandledErrors(E: ImportSummaryForTestingOrErr.takeError(), OS&: errs(),
5745 ErrorBanner: "Error parsing file '" + MemProfImportSummary +
5746 "': ");
5747 return;
5748 }
5749 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
5750 ImportSummary = ImportSummaryForTesting.get();
5751}
5752
5753PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
5754 ModuleAnalysisManager &AM) {
5755 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
5756 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
5757 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: *F);
5758 };
5759 if (!processModule(M, OREGetter))
5760 return PreservedAnalyses::all();
5761 return PreservedAnalyses::none();
5762}
5763
5764void MemProfContextDisambiguation::run(
5765 ModuleSummaryIndex &Index,
5766 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
5767 isPrevailing) {
5768 // TODO: If/when other types of memprof cloning are enabled beyond just for
5769 // hot and cold, we will need to change this to individually control the
5770 // AllocationType passed to addStackNodesForMIB during CCG construction.
5771 // The index was set from the option, so these should be in sync.
5772 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
5773 if (!SupportsHotColdNew)
5774 return;
5775
5776 IndexCallsiteContextGraph CCG(Index, isPrevailing);
5777 CCG.process();
5778}
5779

// Source: llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp