//===-- xray_profile_collector.cpp -----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This implements the interface for the profileCollectorService.
//
//===----------------------------------------------------------------------===//
#include "xray_profile_collector.h"
#include "sanitizer_common/sanitizer_common.h"
#include "xray_allocator.h"
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
#include <memory>
#include <pthread.h>
#include <utility>

namespace __xray {
namespace profileCollectorService {

namespace {

SpinMutex GlobalMutex;
struct ThreadTrie {
  tid_t TId;
  alignas(FunctionCallTrie) std::byte TrieStorage[sizeof(FunctionCallTrie)];
};

struct ProfileBuffer {
  void *Data;
  size_t Size;
};

// Current version of the profile format.
constexpr u64 XRayProfilingVersion = 0x20180424;

// Identifier for XRay profiling files 'xrayprof' in hex.
constexpr u64 XRayMagicBytes = 0x7872617970726f66;

struct XRayProfilingFileHeader {
  const u64 MagicBytes = XRayMagicBytes;
  const u64 Version = XRayProfilingVersion;
  u64 Timestamp = 0; // System time in nanoseconds.
  u64 PID = 0;       // Process ID.
};

struct BlockHeader {
  u32 BlockSize;
  u32 BlockNum;
  u64 ThreadId;
};

struct ThreadData {
  BufferQueue *BQ;
  FunctionCallTrie::Allocators::Buffers Buffers;
  FunctionCallTrie::Allocators Allocators;
  FunctionCallTrie FCT;
  tid_t TId;
};

using ThreadDataArray = Array<ThreadData>;
using ThreadDataAllocator = ThreadDataArray::AllocatorType;

// We use a separate buffer queue for the backing store for the allocator used
// by the ThreadData array. This lets us host the buffers, allocators, and
// tries associated with a thread by moving the data into the array instead of
// attempting to copy the data to a separately backed set of tries.
alignas(BufferQueue) static std::byte BufferQueueStorage[sizeof(BufferQueue)];
static BufferQueue *BQ = nullptr;
static BufferQueue::Buffer Buffer;
alignas(ThreadDataAllocator) static std::byte
    ThreadDataAllocatorStorage[sizeof(ThreadDataAllocator)];
alignas(ThreadDataArray) static std::byte
    ThreadDataArrayStorage[sizeof(ThreadDataArray)];

static ThreadDataAllocator *TDAllocator = nullptr;
static ThreadDataArray *TDArray = nullptr;

using ProfileBufferArray = Array<ProfileBuffer>;
using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;

// These need to be global aligned storage to avoid dynamic initialization. We
// need these to be aligned to allow us to placement new objects into the
// storage, and have pointers to those objects be appropriately aligned.
alignas(ProfileBufferArray) static std::byte
    ProfileBuffersStorage[sizeof(ProfileBufferArray)];
alignas(ProfileBufferArrayAllocator) static std::byte
    ProfileBufferArrayAllocatorStorage[sizeof(ProfileBufferArrayAllocator)];
static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
static ProfileBufferArray *ProfileBuffers = nullptr;

// Use a global flag to determine whether the collector implementation has been
// initialized.
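// Readers (post() and serialize()) check this flag with an acquire load before
// touching any of the placement-new'd globals above; reset() publishes the
// reconstructed globals with a release store once they are ready.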
static atomic_uint8_t CollectorInitialized{0};

} // namespace

void post(BufferQueue *Q, FunctionCallTrie &&T,
          FunctionCallTrie::Allocators &&A,
          FunctionCallTrie::Allocators::Buffers &&B,
          tid_t TId) XRAY_NEVER_INSTRUMENT {
  DCHECK_NE(Q, nullptr);

  // Bail out early if the collector has not been initialized.
  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
    T.~FunctionCallTrie();
    A.~Allocators();
    Q->releaseBuffer(B.NodeBuffer);
    Q->releaseBuffer(B.RootsBuffer);
    Q->releaseBuffer(B.ShadowStackBuffer);
    Q->releaseBuffer(B.NodeIdPairBuffer);
    B.~Buffers();
    return;
  }

  {
    SpinMutexLock Lock(&GlobalMutex);
    DCHECK_NE(TDAllocator, nullptr);
    DCHECK_NE(TDArray, nullptr);

    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
                               TId) == nullptr) {
      // If we fail to add the data to the array, we should destroy the objects
      // handed us.
      T.~FunctionCallTrie();
      A.~Allocators();
      Q->releaseBuffer(B.NodeBuffer);
      Q->releaseBuffer(B.RootsBuffer);
      Q->releaseBuffer(B.ShadowStackBuffer);
      Q->releaseBuffer(B.NodeIdPairBuffer);
      B.~Buffers();
    }
  }
}

// A PathArray represents the function id's representing a stack trace. In this
// context a path is almost always represented from the leaf function in a call
// stack to a root of the call trie.
using PathArray = Array<int32_t>;

struct ProfileRecord {
  using PathAllocator = typename PathArray::AllocatorType;

  // The Path in this record is the function id's from the leaf to the root of
  // the function call stack as represented from a FunctionCallTrie.
  PathArray Path;
  const FunctionCallTrie::Node *Node;
};

namespace {

using ProfileRecordArray = Array<ProfileRecord>;

// Walk a depth-first traversal of each root of the FunctionCallTrie to
// generate the path(s) and the data associated with the path.
static void populateRecords(ProfileRecordArray &PRs,
                            ProfileRecord::PathAllocator &PA,
                            const FunctionCallTrie &Trie)
    XRAY_NEVER_INSTRUMENT {
  using StackArray = Array<const FunctionCallTrie::Node *>;
  using StackAllocator = typename StackArray::AllocatorType;
  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
  StackArray DFSStack(StackAlloc);
  for (const auto *R : Trie.getRoots()) {
    DFSStack.Append(R);
    while (!DFSStack.empty()) {
      auto *Node = DFSStack.back();
      DFSStack.trim(1);
      if (Node == nullptr)
        continue;
      auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
      if (Record == nullptr)
        return;
      DCHECK_NE(Record, nullptr);

      // Traverse the Node's parents and as we're doing so, get the FIds in
      // the order they appear.
      for (auto N = Node; N != nullptr; N = N->Parent)
        Record->Path.Append(N->FId);
      DCHECK(!Record->Path.empty());

      for (const auto C : Node->Callees)
        DFSStack.Append(C.NodePtr);
    }
  }
}

static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                             const ProfileRecordArray &ProfileRecords)
    XRAY_NEVER_INSTRUMENT {
  auto NextPtr = static_cast<uint8_t *>(
                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
                 sizeof(Header);
  for (const auto &Record : ProfileRecords) {
    // List of IDs follow:
    for (const auto FId : Record.Path)
      NextPtr =
          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
          sizeof(FId);

    // Add the sentinel here.
    constexpr int32_t SentinelFId = 0;
    NextPtr = static_cast<uint8_t *>(
                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
              sizeof(SentinelFId);

    // Add the node data here.
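    // The node data is the call count followed by the cumulative local time,
    // each copied as an 8-byte value immediately after the path's sentinel.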
    NextPtr = static_cast<uint8_t *>(
                  internal_memcpy(NextPtr, &Record.Node->CallCount,
                                  sizeof(Record.Node->CallCount))) +
              sizeof(Record.Node->CallCount);
    NextPtr = static_cast<uint8_t *>(
                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
                                  sizeof(Record.Node->CumulativeLocalTime))) +
              sizeof(Record.Node->CumulativeLocalTime);
  }

  DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
}

} // namespace

void serialize() XRAY_NEVER_INSTRUMENT {
  if (!atomic_load(&CollectorInitialized, memory_order_acquire))
    return;

  SpinMutexLock Lock(&GlobalMutex);

  // Clear out the global ProfileBuffers, if it's not empty.
  for (auto &B : *ProfileBuffers)
    deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
  ProfileBuffers->trim(ProfileBuffers->size());

  DCHECK_NE(TDArray, nullptr);
  if (TDArray->empty())
    return;

  // Then repopulate the global ProfileBuffers.
  u32 I = 0;
  auto MaxSize = profilingFlags()->global_allocator_max;
  auto ProfileArena = allocateBuffer(MaxSize);
  if (ProfileArena == nullptr)
    return;

  auto ProfileArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });

  auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
  if (PathArena == nullptr)
    return;

  auto PathArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });

  for (const auto &ThreadTrie : *TDArray) {
    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
    ProfileRecordAllocator PRAlloc(ProfileArena,
                                   profilingFlags()->global_allocator_max);
    ProfileRecord::PathAllocator PathAlloc(
        PathArena, profilingFlags()->global_allocator_max);
    ProfileRecordArray ProfileRecords(PRAlloc);

    // First, we want to compute the amount of space we're going to need. We'll
    // use a local allocator and an __xray::Array<...> to store the
    // intermediary data, then compute the size as we're going along. Then
    // we'll allocate the contiguous space to contain the thread buffer data.
    if (ThreadTrie.FCT.getRoots().empty())
      continue;

    populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
    DCHECK(!ThreadTrie.FCT.getRoots().empty());
    DCHECK(!ProfileRecords.empty());

    // Go through each record, to compute the sizes.
    //
    // header size = block size (4 bytes)
    //             + block number (4 bytes)
    //             + thread id (8 bytes)
    // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
    //             + call count (8 bytes)
    //             + local time (8 bytes)
    //             + end of record (8 bytes)
    u32 CumulativeSizes = 0;
    for (const auto &Record : ProfileRecords)
      CumulativeSizes += 20 + (4 * Record.Path.size());

    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
    auto B = ProfileBuffers->Append({});
    B->Size = sizeof(Header) + CumulativeSizes;
    B->Data = allocateBuffer(B->Size);
    DCHECK_NE(B->Data, nullptr);
    serializeRecords(B, Header, ProfileRecords);
  }
}

void reset() XRAY_NEVER_INSTRUMENT {
  atomic_store(&CollectorInitialized, 0, memory_order_release);
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers != nullptr) {
    // Clear out the profile buffers that have been serialized.
    for (auto &B : *ProfileBuffers)
      deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
    ProfileBuffers->trim(ProfileBuffers->size());
    ProfileBuffers = nullptr;
  }

  if (TDArray != nullptr) {
    // Release the resources as required.
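    // Each ThreadData remembers the BufferQueue its trie buffers came from,
    // so hand those buffers back before dropping the array.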
    for (auto &TD : *TDArray) {
      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
    }

    // We don't bother destroying the array here because we've already
    // potentially freed the backing store for the array. Instead we're going
    // to reset the pointer to nullptr, and re-use the storage later instead
    // (placement-new'ing into the storage as-is).
    TDArray = nullptr;
  }

  if (TDAllocator != nullptr) {
    TDAllocator->~Allocator();
    TDAllocator = nullptr;
  }

  if (Buffer.Data != nullptr) {
    BQ->releaseBuffer(Buffer);
  }

  if (BQ == nullptr) {
    bool Success = false;
    new (&BufferQueueStorage)
        BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
    if (!Success)
      return;
    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
  } else {
    BQ->finalize();

    if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
        BufferQueue::ErrorCode::Ok)
      return;
  }

  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
    return;

  new (&ProfileBufferArrayAllocatorStorage)
      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
      &ProfileBufferArrayAllocatorStorage);

  new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
  ProfileBuffers =
      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);

  new (&ThreadDataAllocatorStorage)
      ThreadDataAllocator(Buffer.Data, Buffer.Size);
  TDAllocator =
      reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);

  new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
  TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);

  atomic_store(&CollectorInitialized, 1, memory_order_release);
}

XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
    return {nullptr, 0};

  static pthread_once_t Once = PTHREAD_ONCE_INIT;
  alignas(XRayProfilingFileHeader) static std::byte
      FileHeaderStorage[sizeof(XRayProfilingFileHeader)];
  pthread_once(
      &Once, +[]() XRAY_NEVER_INSTRUMENT {
        new (&FileHeaderStorage) XRayProfilingFileHeader{};
      });

  if (UNLIKELY(B.Data == nullptr)) {
    // The first buffer should always contain the file header information.
    auto &FileHeader =
        *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
    FileHeader.Timestamp = NanoTime();
    FileHeader.PID = internal_getpid();
    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
  }

  if (UNLIKELY(B.Data == &FileHeaderStorage))
    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};

  BlockHeader Header;
  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
  auto NextBlock = Header.BlockNum + 1;
  if (NextBlock < ProfileBuffers->size())
    return {(*ProfileBuffers)[NextBlock].Data,
            (*ProfileBuffers)[NextBlock].Size};
  return {nullptr, 0};
}

} // namespace profileCollectorService
} // namespace __xray
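
// A sketch (illustrative only, not part of the runtime) of how a consumer
// might drive nextBuffer() above to stream out a serialized profile. The
// `consume` callback is hypothetical; in the real runtime the iterator is
// whatever component the XRay profiling mode registers for buffer processing.
//
//   XRayBuffer B{nullptr, 0};
//   while ((B = __xray::profileCollectorService::nextBuffer(B)).Data !=
//          nullptr) {
//     // The first buffer is the XRayProfilingFileHeader; each subsequent
//     // buffer is one thread's block: a BlockHeader followed by its records.
//     consume(B.Data, B.Size);
//   }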