#include "llvm/ProfileData/MemProf.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/Function.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/SampleProf.h" #include "llvm/Support/BLAKE3.h" #include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/HashBuilder.h" namespace llvm { namespace memprof { MemProfSchema getFullSchema() { MemProfSchema List; #define MIBEntryDef(NameTag, Name, Type) List.push_back(Meta::Name); #include "llvm/ProfileData/MIBEntryDef.inc" #undef MIBEntryDef return List; } MemProfSchema getHotColdSchema() { return {Meta::AllocCount, Meta::TotalSize, Meta::TotalLifetime, Meta::TotalLifetimeAccessDensity}; } static size_t serializedSizeV0(const IndexedAllocationInfo &IAI, const MemProfSchema &Schema) { size_t Size = 0; // The number of frames to serialize. Size += sizeof(uint64_t); // The callstack frame ids. Size += sizeof(FrameId) * IAI.CallStack.size(); // The size of the payload. Size += PortableMemInfoBlock::serializedSize(Schema); return Size; } static size_t serializedSizeV2(const IndexedAllocationInfo &IAI, const MemProfSchema &Schema) { size_t Size = 0; // The CallStackId Size += sizeof(CallStackId); // The size of the payload. Size += PortableMemInfoBlock::serializedSize(Schema); return Size; } static size_t serializedSizeV3(const IndexedAllocationInfo &IAI, const MemProfSchema &Schema) { size_t Size = 0; // The linear call stack ID. Size += sizeof(LinearCallStackId); // The size of the payload. Size += PortableMemInfoBlock::serializedSize(Schema); return Size; } size_t IndexedAllocationInfo::serializedSize(const MemProfSchema &Schema, IndexedVersion Version) const { switch (Version) { case Version0: case Version1: return serializedSizeV0(*this, Schema); case Version2: return serializedSizeV2(*this, Schema); case Version3: return serializedSizeV3(*this, Schema); } llvm_unreachable("unsupported MemProf version"); } static size_t serializedSizeV0(const IndexedMemProfRecord &Record, const MemProfSchema &Schema) { // The number of alloc sites to serialize. size_t Result = sizeof(uint64_t); for (const IndexedAllocationInfo &N : Record.AllocSites) Result += N.serializedSize(Schema, Version0); // The number of callsites we have information for. Result += sizeof(uint64_t); for (const auto &Frames : Record.CallSites) { // The number of frame ids to serialize. Result += sizeof(uint64_t); Result += Frames.size() * sizeof(FrameId); } return Result; } static size_t serializedSizeV2(const IndexedMemProfRecord &Record, const MemProfSchema &Schema) { // The number of alloc sites to serialize. size_t Result = sizeof(uint64_t); for (const IndexedAllocationInfo &N : Record.AllocSites) Result += N.serializedSize(Schema, Version2); // The number of callsites we have information for. Result += sizeof(uint64_t); // The CallStackId Result += Record.CallSiteIds.size() * sizeof(CallStackId); return Result; } static size_t serializedSizeV3(const IndexedMemProfRecord &Record, const MemProfSchema &Schema) { // The number of alloc sites to serialize. size_t Result = sizeof(uint64_t); for (const IndexedAllocationInfo &N : Record.AllocSites) Result += N.serializedSize(Schema, Version3); // The number of callsites we have information for. Result += sizeof(uint64_t); // The linear call stack ID. 
  Result += Record.CallSiteIds.size() * sizeof(LinearCallStackId);
  return Result;
}

size_t IndexedMemProfRecord::serializedSize(const MemProfSchema &Schema,
                                            IndexedVersion Version) const {
  switch (Version) {
  case Version0:
  case Version1:
    return serializedSizeV0(*this, Schema);
  case Version2:
    return serializedSizeV2(*this, Schema);
  case Version3:
    return serializedSizeV3(*this, Schema);
  }
  llvm_unreachable("unsupported MemProf version");
}

static void serializeV0(const IndexedMemProfRecord &Record,
                        const MemProfSchema &Schema, raw_ostream &OS) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    LE.write<uint64_t>(N.CallStack.size());
    for (const FrameId &Id : N.CallStack)
      LE.write<FrameId>(Id);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSites.size());
  for (const auto &Frames : Record.CallSites) {
    LE.write<uint64_t>(Frames.size());
    for (const FrameId &Id : Frames)
      LE.write<FrameId>(Id);
  }
}

static void serializeV2(const IndexedMemProfRecord &Record,
                        const MemProfSchema &Schema, raw_ostream &OS) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    LE.write<CallStackId>(N.CSId);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSiteIds.size());
  for (const auto &CSId : Record.CallSiteIds)
    LE.write<CallStackId>(CSId);
}

static void serializeV3(
    const IndexedMemProfRecord &Record, const MemProfSchema &Schema,
    raw_ostream &OS,
    llvm::DenseMap<CallStackId, LinearCallStackId> &MemProfCallStackIndexes) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    assert(MemProfCallStackIndexes.contains(N.CSId));
    LE.write<LinearCallStackId>(MemProfCallStackIndexes[N.CSId]);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSiteIds.size());
  for (const auto &CSId : Record.CallSiteIds) {
    assert(MemProfCallStackIndexes.contains(CSId));
    LE.write<LinearCallStackId>(MemProfCallStackIndexes[CSId]);
  }
}

void IndexedMemProfRecord::serialize(
    const MemProfSchema &Schema, raw_ostream &OS, IndexedVersion Version,
    llvm::DenseMap<CallStackId, LinearCallStackId> *MemProfCallStackIndexes)
    const {
  switch (Version) {
  case Version0:
  case Version1:
    serializeV0(*this, Schema, OS);
    return;
  case Version2:
    serializeV2(*this, Schema, OS);
    return;
  case Version3:
    serializeV3(*this, Schema, OS, *MemProfCallStackIndexes);
    return;
  }
  llvm_unreachable("unsupported MemProf version");
}

static IndexedMemProfRecord deserializeV0(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    const uint64_t NumFrames =
        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
    for (uint64_t J = 0; J < NumFrames; J++) {
      const FrameId Id =
          endian::readNext<FrameId, llvm::endianness::little>(Ptr);
      Node.CallStack.push_back(Id);
    }
    Node.CSId = hashCallStack(Node.CallStack);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
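  // Wire layout consumed below (the mirror image of serializeV0 above), all
  // fields little-endian:
  //
  //   uint64_t NumCtxs
  //   NumCtxs times:
  //     uint64_t NumFrames
  //     FrameId  Frames[NumFrames]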
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    const uint64_t NumFrames =
        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
    llvm::SmallVector<FrameId> Frames;
    Frames.reserve(NumFrames);
    for (uint64_t K = 0; K < NumFrames; K++) {
      const FrameId Id =
          endian::readNext<FrameId, llvm::endianness::little>(Ptr);
      Frames.push_back(Id);
    }
    Record.CallSites.push_back(Frames);
    Record.CallSiteIds.push_back(hashCallStack(Frames));
  }

  return Record;
}

static IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.AllocSites.reserve(NumNodes);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    Node.CSId = endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.CallSiteIds.reserve(NumCtxs);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    CallStackId CSId =
        endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
    Record.CallSiteIds.push_back(CSId);
  }

  return Record;
}

static IndexedMemProfRecord deserializeV3(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.AllocSites.reserve(NumNodes);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    Node.CSId =
        endian::readNext<LinearCallStackId, llvm::endianness::little>(Ptr);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.CallSiteIds.reserve(NumCtxs);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    // We are storing LinearCallStackId in CallSiteIds, which is a vector of
    // CallStackId.  Assert that CallStackId is no smaller than
    // LinearCallStackId.
    static_assert(sizeof(LinearCallStackId) <= sizeof(CallStackId));
    LinearCallStackId CSId =
        endian::readNext<LinearCallStackId, llvm::endianness::little>(Ptr);
    Record.CallSiteIds.push_back(CSId);
  }

  return Record;
}

IndexedMemProfRecord
IndexedMemProfRecord::deserialize(const MemProfSchema &Schema,
                                  const unsigned char *Ptr,
                                  IndexedVersion Version) {
  switch (Version) {
  case Version0:
  case Version1:
    return deserializeV0(Schema, Ptr);
  case Version2:
    return deserializeV2(Schema, Ptr);
  case Version3:
    return deserializeV3(Schema, Ptr);
  }
  llvm_unreachable("unsupported MemProf version");
}

MemProfRecord IndexedMemProfRecord::toMemProfRecord(
    llvm::function_ref<std::vector<Frame>(const CallStackId)> Callback) const {
  MemProfRecord Record;

  Record.AllocSites.reserve(AllocSites.size());
  for (const IndexedAllocationInfo &IndexedAI : AllocSites) {
    AllocationInfo AI;
    AI.Info = IndexedAI.Info;
    AI.CallStack = Callback(IndexedAI.CSId);
    Record.AllocSites.push_back(std::move(AI));
  }

  Record.CallSites.reserve(CallSiteIds.size());
  for (CallStackId CSId : CallSiteIds)
    Record.CallSites.push_back(Callback(CSId));

  return Record;
}

GlobalValue::GUID IndexedMemProfRecord::getGUID(const StringRef FunctionName) {
  // Canonicalize the function name to drop suffixes such as ".llvm.".  Note
  // we do not drop any ".__uniq." suffixes, as getCanonicalFnName does not
  // drop those by default.  This is by design to differentiate internal
  // linkage functions during matching.  By dropping the other suffixes we can
  // then match functions in the profile use phase prior to their addition.
  // Note that this applies to both instrumented and sampled function names.
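  // For example (illustrative names): "_Z3foov.llvm.123" canonicalizes to
  // "_Z3foov", while a name carrying ".__uniq." such as "_Z3foov.__uniq.456"
  // is left untouched, per the behavior described above.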
  StringRef CanonicalName =
      sampleprof::FunctionSamples::getCanonicalFnName(FunctionName);

  // We use the function guid which we expect to be a uint64_t. At
  // this time, it is the lower 64 bits of the md5 of the canonical
  // function name.
  return Function::getGUID(CanonicalName);
}

Expected<MemProfSchema> readMemProfSchema(const unsigned char *&Buffer) {
  using namespace support;

  const unsigned char *Ptr = Buffer;
  const uint64_t NumSchemaIds =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  if (NumSchemaIds > static_cast<uint64_t>(Meta::Size)) {
    return make_error<InstrProfError>(instrprof_error::malformed,
                                      "memprof schema invalid");
  }

  MemProfSchema Result;
  for (size_t I = 0; I < NumSchemaIds; I++) {
    const uint64_t Tag =
        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
    if (Tag >= static_cast<uint64_t>(Meta::Size)) {
      return make_error<InstrProfError>(instrprof_error::malformed,
                                        "memprof schema invalid");
    }
    Result.push_back(static_cast<Meta>(Tag));
  }
  // Advance the buffer to one past the schema if we succeeded.
  Buffer = Ptr;
  return Result;
}

CallStackId hashCallStack(ArrayRef<FrameId> CS) {
  llvm::HashBuilder<llvm::TruncatedBLAKE3<8>, llvm::endianness::little>
      HashBuilder;
  for (FrameId F : CS)
    HashBuilder.add(F);
  llvm::BLAKE3Result<8> Hash = HashBuilder.final();
  CallStackId CSId;
  std::memcpy(&CSId, Hash.data(), sizeof(Hash));
  return CSId;
}

// Encode a call stack into RadixArray.  Return the starting index within
// RadixArray.  For each call stack we encode, we emit two or three components
// into RadixArray.  If a given call stack doesn't have a common prefix relative
// to the previous one, we emit:
//
// - the frames in the given call stack in the root-to-leaf order
//
// - the length of the given call stack
//
// If a given call stack has a non-empty common prefix relative to the previous
// one, we emit:
//
// - the relative location of the common prefix, encoded as a negative number.
//
// - a portion of the given call stack that's beyond the common prefix
//
// - the length of the given call stack, including the length of the common
//   prefix.
//
// The resulting RadixArray requires a somewhat unintuitive backward traversal
// to reconstruct a call stack -- read the call stack length and scan backward
// while collecting frames in the leaf-to-root order.  build, the caller of
// this function, reverses RadixArray in place so that we can reconstruct a
// call stack as if we were deserializing an array in a typical way -- the call
// stack length followed by the frames in the leaf-to-root order except that we
// need to handle pointers to parents along the way.
//
// To quickly determine the location of the common prefix within RadixArray,
// Indexes caches the indexes of the previous call stack's frames within
// RadixArray.
LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack(
    const llvm::SmallVector<FrameId> *CallStack,
    const llvm::SmallVector<FrameId> *Prev,
    const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes) {
  // Compute the length of the common root prefix between Prev and CallStack.
  uint32_t CommonLen = 0;
  if (Prev) {
    auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(),
                             CallStack->rend());
    CommonLen = std::distance(CallStack->rbegin(), Pos.second);
  }

  // Drop the portion beyond CommonLen.
  assert(CommonLen <= Indexes.size());
  Indexes.resize(CommonLen);

  // Append a pointer to the parent.
  if (CommonLen) {
    uint32_t CurrentIndex = RadixArray.size();
    uint32_t ParentIndex = Indexes.back();
    // The offset to the parent must be negative because we are pointing to an
    // element we've already added to RadixArray.
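    // Worked example (illustrative): if the parent frame sits at index 3 and
    // RadixArray currently holds 7 elements, we push back 3 - 7 = -4, so a
    // reader at this slot can hop back four elements to reach the parent.
    // With an unsigned element type the value wraps around, and the decoder
    // (outside this file) is assumed to undo it with the same modular
    // arithmetic.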
    assert(ParentIndex < CurrentIndex);
    RadixArray.push_back(ParentIndex - CurrentIndex);
  }

  // Copy the part of the call stack beyond the common prefix to RadixArray.
  assert(CommonLen <= CallStack->size());
  for (FrameId F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {
    // Remember the index of F in RadixArray.
    Indexes.push_back(RadixArray.size());
    RadixArray.push_back(MemProfFrameIndexes.find(F)->second);
  }
  assert(CallStack->size() == Indexes.size());

  // End with the call stack length.
  RadixArray.push_back(CallStack->size());

  // Return the index within RadixArray where we can start reconstructing a
  // given call stack from.
  return RadixArray.size() - 1;
}

void CallStackRadixTreeBuilder::build(
    llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
        &&MemProfCallStackData,
    const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes,
    llvm::DenseMap<FrameId, FrameStat> &FrameHistogram) {
  // Take the vector portion of MemProfCallStackData.  The vector is exactly
  // what we need to sort.  Also, we no longer need its lookup capability.
  llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();

  // Return early if we have no work to do.
  if (CallStacks.empty()) {
    RadixArray.clear();
    CallStackPos.clear();
    return;
  }

  // Sorting the list of call stacks in the dictionary order is sufficient to
  // maximize the length of the common prefix between two adjacent call stacks
  // and thus minimize the length of RadixArray.  However, we go one step
  // further and try to reduce the number of times we follow pointers to
  // parents during deserialization.  Consider a poorly encoded radix tree:
  //
  // CallStackId 1:  f1 -> f2 -> f3
  //                  |
  // CallStackId 2:   +--- f4 -> f5
  //                        |
  // CallStackId 3:         +--> f6
  //
  // Here, f2 and f4 appear once and twice, respectively, in the call stacks.
  // Once we encode CallStackId 1 into RadixArray, every other call stack with
  // common prefix f1 ends up pointing to CallStackId 1.  Since CallStackId 3
  // shares "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers
  // to parents twice.
  //
  // We try to alleviate the situation by sorting the list of call stacks by
  // comparing the popularity of frames rather than the integer values of
  // FrameIds.  In the example above, f4 is more popular than f2, so we sort
  // the call stacks and encode them as:
  //
  // CallStackId 2:  f1 -> f4 -> f5
  //                  |     |
  // CallStackId 3:   |     +--> f6
  //                  |
  // CallStackId 1:   +--> f2 -> f3
  //
  // Notice that CallStackId 3 follows a pointer to a parent only once.
  //
  // All this is a quick-n-dirty trick to reduce the number of jumps.  The
  // proper way would be to compute the weight of each radix tree node -- how
  // many call stacks use a given radix tree node, and encode a radix tree
  // from the heaviest node first.  We do not do so because that's a lot of
  // work.
  llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {
    // Call stacks are stored from leaf to root.  Perform comparisons from the
    // root.
    return std::lexicographical_compare(
        L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
        [&](FrameId F1, FrameId F2) {
          uint64_t H1 = FrameHistogram[F1].Count;
          uint64_t H2 = FrameHistogram[F2].Count;
          // Popular frames should come later because we encode call stacks
          // from the last one in the list.
          if (H1 != H2)
            return H1 < H2;
          // For sort stability.
          return F1 < F2;
        });
  });

  // Reserve some reasonable amount of storage.
  RadixArray.clear();
  RadixArray.reserve(CallStacks.size() * 8);

  // Indexes will grow as long as the longest call stack.
  Indexes.clear();
  Indexes.reserve(512);

  // CallStackPos will grow to exactly CallStacks.size() entries.
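  // CallStackPos maps each CallStackId to the RadixArray position from which
  // its call stack can be reconstructed; it is the same kind of
  // CallStackId -> LinearCallStackId mapping that serializeV3 above consumes
  // as MemProfCallStackIndexes (how a caller plumbs it there is outside this
  // function).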
  CallStackPos.clear();
  CallStackPos.reserve(CallStacks.size());

  // Compute the radix array.  We encode one call stack at a time, computing
  // the longest prefix that's shared with the previous call stack we encode.
  // For each call stack we encode, we remember a mapping from CallStackId to
  // its position within RadixArray.
  //
  // As an optimization, we encode from the last call stack in CallStacks to
  // reduce the number of times we follow pointers to the parents.  Consider
  // the list of call stacks that has been sorted in the dictionary order:
  //
  // Call Stack 1: F1
  // Call Stack 2: F1 -> F2
  // Call Stack 3: F1 -> F2 -> F3
  //
  // If we traversed CallStacks in the forward order, we would end up with a
  // radix tree like:
  //
  // Call Stack 1:  F1
  //                |
  // Call Stack 2:  +---> F2
  //                      |
  // Call Stack 3:        +---> F3
  //
  // Notice that each call stack jumps to the previous one.  However, if we
  // traverse CallStacks in the reverse order, then Call Stack 3 has the
  // complete call stack encoded without any pointers.  Call Stack 1 and 2
  // point to appropriate prefixes of Call Stack 3.
  const llvm::SmallVector<FrameId> *Prev = nullptr;
  for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) {
    LinearCallStackId Pos =
        encodeCallStack(&CallStack, Prev, MemProfFrameIndexes);
    CallStackPos.insert({CSId, Pos});
    Prev = &CallStack;
  }

  // "RadixArray.size() - 1" below is problematic if RadixArray is empty.
  assert(!RadixArray.empty());

  // Reverse the radix array in place.  We do so mostly for intuitive
  // deserialization where we would read the length field and then the call
  // stack frames proper just like any other array deserialization, except
  // that we have occasional jumps to take advantage of prefixes.
  for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J)
    std::swap(RadixArray[I], RadixArray[J]);

  // "Reverse" the indexes stored in CallStackPos.
  for (auto &[K, V] : CallStackPos)
    V = RadixArray.size() - 1 - V;
}

llvm::DenseMap<FrameId, FrameStat>
computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
                          &MemProfCallStackData) {
  llvm::DenseMap<FrameId, FrameStat> Histogram;

  for (const auto &KV : MemProfCallStackData) {
    const auto &CS = KV.second;
    for (unsigned I = 0, E = CS.size(); I != E; ++I) {
      auto &S = Histogram[CS[I]];
      ++S.Count;
      S.PositionSum += I;
    }
  }
  return Histogram;
}

void verifyIndexedMemProfRecord(const IndexedMemProfRecord &Record) {
  for (const auto &AS : Record.AllocSites) {
    assert(AS.CSId == hashCallStack(AS.CallStack));
    (void)AS;
  }
}

void verifyFunctionProfileData(
    const llvm::MapVector<GlobalValue::GUID, IndexedMemProfRecord>
        &FunctionProfileData) {
  for (const auto &[GUID, Record] : FunctionProfileData) {
    (void)GUID;
    verifyIndexedMemProfRecord(Record);
  }
}
} // namespace memprof
} // namespace llvm
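
// Illustrative end-to-end sketch of how the radix-tree pieces in this file fit
// together when emitting Version3 data.  The caller-side variable names
// (CallStackData, FrameIndexes, CallStackIndexes) and the way the builder's
// results are retrieved are assumptions, not APIs defined here; only
// computeFrameHistogram(), CallStackRadixTreeBuilder::build(), and
// IndexedMemProfRecord::serialize() come from this file.
//
//   auto Histogram = llvm::memprof::computeFrameHistogram(CallStackData);
//   llvm::memprof::CallStackRadixTreeBuilder Builder;
//   Builder.build(std::move(CallStackData), FrameIndexes, Histogram);
//   // ... obtain a CallStackId -> LinearCallStackId map from the builder ...
//   Record.serialize(Schema, OS, llvm::memprof::Version3, &CallStackIndexes);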