diff --git a/HeterogeneousCore/AlpakaInterface/interface/CachingAllocator.h b/HeterogeneousCore/AlpakaInterface/interface/CachingAllocator.h index e1543d14a23c8..f7b5ef5136a6c 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/CachingAllocator.h +++ b/HeterogeneousCore/AlpakaInterface/interface/CachingAllocator.h @@ -18,6 +18,7 @@ #include "HeterogeneousCore/AlpakaInterface/interface/devices.h" #include "HeterogeneousCore/AlpakaInterface/interface/AllocatorConfig.h" #include "HeterogeneousCore/AlpakaInterface/interface/AlpakaServiceFwd.h" +#include "HeterogeneousCore/AlpakaInterface/interface/CachingAllocatorMonitor.h" // Inspired by cub::CachingDeviceAllocator @@ -202,7 +203,8 @@ namespace cms::alpakatools { std::tie(block.bin, block.bytes) = findBin(bytes); // try to re-use a cached block, or allocate a new buffer - if (tryReuseCachedBlock(block)) { + bool const cacheHit = tryReuseCachedBlock(block); + if (cacheHit) { // fill the re-used memory block with a pattern if (fillReallocations_) { immediateOrAsyncMemset(*block.queue, *block.buffer, fillReallocationValue_); @@ -217,6 +219,12 @@ namespace cms::alpakatools { } } + if (auto* monitor = cachingAllocatorMonitor()) [[unlikely]] { + monitor->onAllocate( + monitorDevice(), block.buffer->data(), block.bytes, block.requested, cacheHit, monitorQueue(block)); + monitor->onUsage(monitorDevice(), cachedBytes_.live, cachedBytes_.free, cachedBytes_.requested); + } + return block.buffer->data(); } @@ -236,6 +244,11 @@ namespace cms::alpakatools { cachedBytes_.live -= block.bytes; cachedBytes_.requested -= block.requested; + if (auto* monitor = cachingAllocatorMonitor()) [[unlikely]] { + monitor->onFree(monitorDevice(), ptr, block.bytes, monitorQueue(block)); + monitor->onUsage(monitorDevice(), cachedBytes_.live, cachedBytes_.free, cachedBytes_.requested); + } + bool recache = (cachedBytes_.free + block.bytes <= maxCachedBytes_); if (recache) { // If enqueuing the event fails, very likely an error has @@ -333,6 +346,24 @@ namespace cms::alpakatools { }; private: + // identifiers passed to a registered CachingAllocatorMonitor + int monitorDevice() const noexcept { + if constexpr (std::is_same_v) { + return 0; // the host platform has exactly one device + } else if constexpr (requires(Device const& d) { alpaka::getNativeHandle(d); }) { + return static_cast(alpaka::getNativeHandle(device_)); + } else { + return -1; // a backend without a native device handle + } + } + + unsigned long long monitorQueue(BlockDescriptor const& block) const noexcept { + if (not block.queue) { + return 0; + } + return reinterpret_cast(block.queue->m_spQueueImpl.get()); + } + // return the maximum amount of memory that should be cached on this device size_t cacheSize(size_t maxCachedBytes, double maxCachedFraction) const { // note that getMemBytes() returns 0 if the platform does not support querying the device memory diff --git a/HeterogeneousCore/AlpakaInterface/interface/CachingAllocatorMonitor.h b/HeterogeneousCore/AlpakaInterface/interface/CachingAllocatorMonitor.h new file mode 100644 index 0000000000000..7f42f9c1648df --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/CachingAllocatorMonitor.h @@ -0,0 +1,60 @@ +// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026 +#ifndef HeterogeneousCore_AlpakaInterface_interface_CachingAllocatorMonitor_h +#define HeterogeneousCore_AlpakaInterface_interface_CachingAllocatorMonitor_h + +#include +#include + +namespace cms::alpakatools { + + // Optional, process-wide hook to observe caching-allocator transactions. + // + // Kept free of any alpaka or perfetto dependency so that the allocator (which + // calls it on the hot path) stays cheap -- when no monitor is registered the + // cost is a single load of an atomic pointer (a read-shared global, hence a + // plain MOV on x86) plus a predicted-not-taken branch -- and so that an external + // profiler can implement it without pulling alpaka headers. A registered monitor must + // outlive all allocator use and its callbacks must be thread-safe: they are + // invoked from whatever TBB worker (or async/GPU callback thread) runs the + // owning module, while the allocator's internal mutex is held. + class CachingAllocatorMonitor { + public: + virtual ~CachingAllocatorMonitor() = default; + + // A block was handed out. |bytes| is the bin-rounded size, |requested| the + // user size, |cacheHit| is true when a cached block was reused (no new + // device allocation), |queue| identifies the associated backend queue/stream + // and |device| is the native device handle (e.g. CUDA ordinal). + virtual void onAllocate(int device, + const void* ptr, + std::size_t bytes, + std::size_t requested, + bool cacheHit, + unsigned long long queue) noexcept {} + + // A block was returned to the allocator. Note the memory may still be in use + // by asynchronous device work; the allocator records an event on |queue| and + // only re-hands the block once that event completes (possibly to another + // thread/stream) -- this is the "asynchronous transaction". + virtual void onFree(int device, const void* ptr, std::size_t bytes, unsigned long long queue) noexcept {} + + // Running byte totals for |device| right after a transaction. + virtual void onUsage(int device, std::size_t live, std::size_t cached, std::size_t requested) noexcept {} + }; + + inline std::atomic& cachingAllocatorMonitorRef() noexcept { + static std::atomic instance{nullptr}; + return instance; + } + + inline void setCachingAllocatorMonitor(CachingAllocatorMonitor* monitor) noexcept { + cachingAllocatorMonitorRef().store(monitor, std::memory_order_release); + } + + inline CachingAllocatorMonitor* cachingAllocatorMonitor() noexcept { + return cachingAllocatorMonitorRef().load(std::memory_order_acquire); + } + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_CachingAllocatorMonitor_h diff --git a/PerfTools/Perfetto/BuildFile.xml b/PerfTools/Perfetto/BuildFile.xml new file mode 100644 index 0000000000000..5da06a8087a94 --- /dev/null +++ b/PerfTools/Perfetto/BuildFile.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/PerfTools/Perfetto/README.md b/PerfTools/Perfetto/README.md new file mode 100644 index 0000000000000..0919beba6d126 --- /dev/null +++ b/PerfTools/Perfetto/README.md @@ -0,0 +1,251 @@ +# PerfTools/Perfetto + +In-process [Perfetto](https://perfetto.dev) tracing for `cmsRun`. The +`PerfettoTraceService` records a `.pftrace` file that can be opened directly at +, giving a per-stream timeline of the framework phases +(source read, event lifetime, module execution, `acquire`, EventSetup, cleanup). + +## Contents + +The Perfetto SDK comes from the `perfetto` CMSSW external (``, +`#include `), not vendored here. + +- `interface/CMSSWPerfettoCategories.h` — the `cmssw.*` track-event categories. +- `interface/CMSSWPerfettoModuleContext.{h,cc}` — thread-local "current module", + set by the service so the allocator/GPU layers can attribute their work. +- `interface/CMSSWPerfettoTrace.h` — `CMS_PERFETTO_FUNC()` / `CMS_PERFETTO_SCOPE()` + scoped-slice macros for optional intra-module instrumentation. +- `interface/PerfettoAllocatorMonitor.h` — caching-allocator → Perfetto bridge. +- `plugins/PerfettoCuptiProfiler.h` — CUPTI → Perfetto GPU kernel tracing. +- `plugins/PerfettoPowerSampler.h` — CPU (RAPL) + GPU (NVML) power sampling. +- `plugins/PerfettoTraceService.cc` — the EDM service. +- `python/customisePerfetto.py` — `cmsDriver.py --customise` helper. +- `scripts/perfettoKernelResources.py` — static (ptxas/cuobjdump) kernel resource dump. + +## Track layout + +CMSSW runs one global TBB arena; within a *single* stream and a *single* event, +independent modules execute concurrently on different threads, and an ExternalWork +module's `acquire()`/`produce()` run on different threads. A single per-stream +timeline therefore cannot hold module slices without overlaps. So each slice goes +on a **per-(stream, thread) lane** that hangs under the stream: + +``` +process "cmsRun" + └─ edm::stream (one per stream; events are serialized here) + ├─ "Event" slice + run/lumi/event counters + └─ thread (lanes: one per thread that worked on the stream) + └─ module / acquire / eventsetup / source / cleanup slices + (+ alloc/free instants and CMS_PERFETTO_FUNC slices nested under them) +``` + +- **Module / acquire / EventSetup / source / cleanup** slices are emitted on a lane + keyed by `(stream, executing thread)`, parented to the stream track. Because every + lane is fed by exactly one thread, its begin/end events arrive in order and nest + correctly; modules of one stream running concurrently on different threads simply + show up as parallel lanes under that stream. The primary thread carries most of a + stream's work; brief lanes are tasks TBB ran on other threads (work-stealing). +- The **per-stream `Event` track** itself holds the serialized event lifetime + (`preEvent`..`postClearEvent`) plus the run/lumi/event counters. + +The service also publishes a thread-local `cms::perfetto::ModuleContext` around +every module call, so the caching-allocator monitor and the GPU/CUPTI layer can +attribute their work to the responsible module. + +## Usage + +With `cmsDriver.py`: + +```bash +cmsDriver.py step3 -s RAW2DIGI,RECO ... \ + --customise PerfTools/Perfetto/customisePerfetto.customise +``` + +Or directly in a config: + +```python +from PerfTools.Perfetto.customisePerfetto import customisePerfetto +customisePerfetto(process, fileName="reco.pftrace") +``` + +Or the bare service: + +```python +process.add_(cms.Service("PerfettoTraceService", + fileName=cms.untracked.string("reco.pftrace"))) +``` + +### Parameters (all untracked) + +| parameter | default | meaning | +|------------------|------------------|----------------------------------------------------------------| +| `enabled` | `True` | master switch | +| `fileName` | `cmsrun.pftrace` | output trace file | +| `bufferSizeKB` | `262144` | in-process trace buffer size (KB) | +| `maxEvents` | `200` | stop opening new event slices after N events (`0` = unlimited) | +| `traceFunctions` | `False` | enable tier-B per-function slices | +| `traceAllocations` | `False` | trace Alpaka caching-allocator alloc/free + device-memory counters | +| `traceGpuKernels` | `False` | trace CUDA kernels (real device timing + registers/occupancy) via CUPTI | +| `tracePower` | `False` | sample CPU (RAPL) + GPU (NVML) power as counter tracks | +| `powerPeriodMs` | `1000` | power sampling period in ms (when `tracePower`) | +| `traceModules` | `[]` | if non-empty, only trace these module labels (focused, low overhead) | + +A global **`Throughput (events/s)`** counter is always emitted -- a sliding-window +event rate (over the last 16 completed events) that shows the job ramping up to +steady state. With `tracePower=True`, a background thread samples **`CPU pkg +power (W)`** (Intel RAPL, `/sys/class/powercap`) and **`GPU power (W)`** (NVML, +loaded via `dlopen`) every `powerPeriodMs` (default 1000 ms -- NVML power queries +are not free, so polling too fast can perturb the GPU). Both are no-ops where the +source is unavailable. + +### Per-function (tier-B) tracing + +Annotate hot code with `CMS_PERFETTO_FUNC()` (uses `__func__`) or +`CMS_PERFETTO_SCOPE("name")`. The slices are emitted on the calling thread's +track, nesting under the enclosing module slice, while tracing is active; +otherwise they are no-ops. + +```cpp +#include "PerfTools/Perfetto/interface/CMSSWPerfettoTrace.h" + +void MyProducer::produce(edm::Event& e, edm::EventSetup const&) { + CMS_PERFETTO_FUNC(); + ... +} +``` + +### Caching-allocator tracing (`traceAllocations=True`) + +The Alpaka `CachingAllocator` calls an optional `cms::alpakatools::CachingAllocatorMonitor` +(a process-wide hook, free of any perfetto dependency) on every alloc/free. The +service registers `PerfettoAllocatorMonitor`, which emits each transaction as an +INSTANT on the calling thread's track — so it sits under the module slice that +triggered it and is annotated with that module (from `ModuleContext`), the byte +size, cache-hit/miss, device and queue — plus per-device `live`/`cached`/`requested` +byte counters for a device-memory-pressure timeline. + +Because a freed block may still be in use by asynchronous device work, the allocator +only re-hands it once a recorded event completes (possibly to another thread/stream): +this asynchronous reuse is visible as a later `alloc` with `cache_hit=true` on a +different thread, against the same device/queue. + +### GPU kernel tracing (`traceGpuKernels=True`) + +Uses [CUPTI](https://docs.nvidia.com/cupti/) to stream CUDA kernel activity into +the session. CUPTI is a driver-level profiler, so it captures kernels from the +already-built Alpaka/CUDA plugins without rebuilding them, at low overhead and +without serializing kernels. Each kernel is a slice on a per-(device, CUDA stream) +track, placed at its **device-side** start/end — i.e. the *real* GPU execution +time, not the host enqueue/wait — annotated with `registers_per_thread`, +`static_smem_B`, `dynamic_smem_B`, `local_per_thread_B` / `local_total_B` (spills, +per thread and for the whole launch), `grid`, `block`, +an estimated `occupancy_est`, the full kernel name, and the CUPTI `correlation_id` +that links it back to the host module that launched it. GPU timestamps are +converted to `CLOCK_BOOTTIME` so they line up with the host timeline. + +The compile-time counterpart is `scripts/perfettoKernelResources.py`, which runs +`cuobjdump --dump-resource-usage` on the built `*PortableCudaAsync.so` libraries to +print the per-kernel registers / shared / stack / spill / constant usage (the +ptxas numbers), e.g.: + +```bash +perfettoKernelResources.py --filter Phase2 \ + $CMSSW_RELEASE_BASE/lib/$SCRAM_ARCH/pluginRecoLocalTrackerSiPixelClusterizerPluginsPortableCudaAsync.so +``` + +## Threading model and nested parallelism + +Slices are emitted on the **thread** that does the work, because within one stream +and one event CMSSW runs independent modules concurrently on different threads (and +an ExternalWork module's `acquire()`/`produce()` run on different threads). A few +consequences worth knowing when reading a trace: + +- A module's slice spans its `produce()` **wall-clock on that thread**, including + any blocking `tbb::parallel_for` it calls. The parallel iterations that run on + *other* threads show up on *those* threads' tracks, not nested under the module — + so you can see the fan-out, but it is not visually grouped under the module. +- **`tbb::parallel_for` inside `produce()`**: while the calling thread is blocked in + the `parallel_for`, TBB may run an unrelated module's task on it (work-stealing), + which then appears *nested inside* your module's slice on that thread. This is a + faithful picture of what the thread did, but it means a slice's duration can + include stolen, unrelated work. The thread-local "current module" used for + allocator/GPU attribution is a **stack**, so it is restored correctly when the + stolen work returns. +- The helper threads running the `parallel_for` body do **not** automatically carry + the module context, so allocations / `CMS_PERFETTO_FUNC` slices made there are not + attributed to the module. Wrap the body to propagate it: + + ```cpp + #include "PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h" + tbb::parallel_for(range, cms::perfetto::withModuleContext([&](auto const& r) { + CMS_PERFETTO_FUNC(); // now attributed to the enclosing module + ... + })); + ``` + +## A profiling guide for reconstruction developers + +Goal: understand and speed up *one* algorithm (a module and the GPU work it drives). + +1. **Focus the trace on your module** (much lower overhead, far less noise): + + ```bash + cmsRun step3.py # add via --customise_commands or customisePerfetto(...) + ``` + ```python + from PerfTools.Perfetto.customisePerfetto import customisePerfetto + customisePerfetto(process, + fileName="myalgo.pftrace", + traceModules=["myProducerAlpaka"], # only this module + traceGpuKernels=True, # its CUDA kernels + traceAllocations=True) # its device memory + ``` + +2. **Open `myalgo.pftrace` at ** and read it top-down: + - **Per-thread tracks** — find your module's slice. Its width is the host-side + time (enqueue for an async GPU module; the actual compute for a CPU module). + If you added `CMS_PERFETTO_FUNC()`/`CMS_PERFETTO_SCOPE()`, the sub-steps nest + underneath. + - **`GPU stream ` tracks** — the real kernel execution. Click a kernel to + see `registers_per_thread`, `occupancy_est`, `grid`/`block`, shared/local + memory. Use the `correlation_id` to match a kernel to the host launch. + - **`Event` tracks** — per-stream event boundaries + run/lumi/event counters. + - **`dev live/cached/requested (B)` counters** — the device-memory timeline. + +3. **What to look for:** + - *Kernel time vs host time*: an async GPU module's host slice is tiny while the + real cost is on the GPU track — optimize the kernel, not the host call. + - *Occupancy*: a low `occupancy_est` driven by a high `registers_per_thread` (or + large shared memory) means the kernel is occupancy-limited; cross-check the + static numbers with `scripts/perfettoKernelResources.py` (it also shows + register *spills*, `local_spill > 0`). + - *Memory churn*: `alloc` events with `cache_hit=false` are real `cudaMalloc`s + (expensive); a sawtooth in the `live` counter means repeated alloc/free that + the cache is not absorbing — consider reusing buffers or sizing the cache. A + low cache-hit rate early in a job is normal (cold cache); it should rise as + blocks are reused. + - *Allocation overhead*: the gap between the `live` and `requested` counters is + the power-of-two bin rounding (often tens of %); a few oversized buffers from + one module usually dominate the peak — group `alloc` events by `module` to + find them. + - *Serialization*: the caching allocator takes one global lock per device; if + `alloc`/`free` instants from many threads line up back-to-back, that lock may + be a contention point. + - *Gaps*: an `acquire` slice on one thread, a long gap (GPU + the `edm async pool` + wait), then `produce` on another thread is the normal ExternalWork pattern; the + gap is the GPU doing work, visible on the GPU track. + +## Overhead + +The cost is opt-in and proportional to what you enable: + +- Disabled categories cost a predicated load; the `enabled`/`IsEnabled` guard makes + every callback an early return. +- `traceModules=[...]` is the main overhead lever: only selected modules emit + slices, so a focused run on one algorithm avoids the hundreds of per-module + events of a full event. The module-context stack push/pop is allocation-free. +- `traceAllocations`: when off, the allocator hook is a single relaxed atomic load + per alloc/free; when on, one instant + a few counters per transaction. +- `traceGpuKernels`: CUPTI activity tracing is asynchronous and does not serialize + kernels; it is the cheapest of the three relative to the information it adds. +- `maxEvents` caps the trace size (and thus buffer pressure) regardless of job length. diff --git a/PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h b/PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h new file mode 100644 index 0000000000000..087cf89bffa9d --- /dev/null +++ b/PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h @@ -0,0 +1,17 @@ +// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026 +#ifndef CMSSW_PERFETTO_CATEGORIES_H +#define CMSSW_PERFETTO_CATEGORIES_H +#include + +PERFETTO_DEFINE_CATEGORIES(perfetto::Category("cmssw.event"), + perfetto::Category("cmssw.source"), + perfetto::Category("cmssw.module"), + perfetto::Category("cmssw.acquire"), + perfetto::Category("cmssw.cleanup"), + perfetto::Category("cmssw.es"), + perfetto::Category("cmssw.func"), + perfetto::Category("cmssw.alloc"), + perfetto::Category("cmssw.gpu"), + perfetto::Category("cmssw.power")); + +#endif diff --git a/PerfTools/Perfetto/interface/CMSSWPerfettoLanes.h b/PerfTools/Perfetto/interface/CMSSWPerfettoLanes.h new file mode 100644 index 0000000000000..a87c16a281831 --- /dev/null +++ b/PerfTools/Perfetto/interface/CMSSWPerfettoLanes.h @@ -0,0 +1,59 @@ +// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026 +#ifndef PerfTools_Perfetto_interface_CMSSWPerfettoLanes_h +#define PerfTools_Perfetto_interface_CMSSWPerfettoLanes_h + +#include "PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h" +#include + +#include +#include +#include +#include + +// Per-stream "lane" tracks. +// +// CMSSW serializes events within a stream but runs the modules of one event as +// concurrent TBB tasks on different threads (and an ExternalWork module's +// acquire()/produce() run on different threads). To show that work *under* its +// edm::stream -- instead of scattered across process-wide thread tracks -- each +// slice is emitted on a lane keyed by (stream, executing thread), parented to the +// stream track. Because every lane is therefore fed by exactly one thread, the +// begin/end events on it arrive in order and nest correctly; concurrent work in +// the same stream simply lands on separate lanes shown side by side. +namespace cms::perfetto { + + inline constexpr uint64_t kStreamBase = 0x5354524D00000000ull; // "STRM...." + inline constexpr uint64_t kLaneBase = 0x4C414E4500000000ull; // "LANE...." + + inline uint64_t streamUuid(unsigned sid) noexcept { return kStreamBase | (uint64_t{sid} << 16); } + + inline ::perfetto::Track streamTrack(unsigned sid) { + return ::perfetto::Track(streamUuid(sid), ::perfetto::ProcessTrack::Current()); + } + + // A small, stable ordinal for the calling OS/TBB thread (assigned on first use). + inline unsigned threadOrdinal() noexcept { + static std::atomic next{0}; + static thread_local unsigned ord = next.fetch_add(1, std::memory_order_relaxed); + return ord; + } + + // The lane for the current thread within stream |sid|, as a child of the stream + // track. The lane uuid embeds the thread ordinal, so it is owned by a single + // thread -- the descriptor can be written from a thread-local guard, no lock. + inline ::perfetto::Track laneTrack(unsigned sid) { + unsigned const ord = threadOrdinal(); + uint64_t const uuid = kLaneBase | (uint64_t{sid} << 24) | ord; + static thread_local std::set described; // streams this thread has named a lane for + if (described.insert(sid).second) { + ::perfetto::Track t(uuid, streamTrack(sid)); + auto d = t.Serialize(); + d.set_name("thread " + std::to_string(ord)); + ::perfetto::TrackEvent::SetTrackDescriptor(t, d); + } + return ::perfetto::Track(uuid, streamTrack(sid)); + } + +} // namespace cms::perfetto + +#endif // PerfTools_Perfetto_interface_CMSSWPerfettoLanes_h diff --git a/PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h b/PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h new file mode 100644 index 0000000000000..231ed5080fed4 --- /dev/null +++ b/PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h @@ -0,0 +1,70 @@ +// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026 +#ifndef PerfTools_Perfetto_interface_CMSSWPerfettoModuleContext_h +#define PerfTools_Perfetto_interface_CMSSWPerfettoModuleContext_h + +#include +#include + +namespace cms::perfetto { + + // Thread-local record of which CMSSW module (if any) is currently executing on + // this thread. PerfettoTraceService pushes it around every module/acquire call, + // so lower-level instrumentation that runs *inside* module code -- the + // caching-allocator monitor and the GPU/CUPTI layer -- can attribute its work + // to the responsible module without being passed any context. + // + // It is a *stack*, not a single slot, on purpose: TBB work-stealing can run + // another module's produce() on this same thread while the current one is + // blocked in a tbb::parallel_for (or similar), nested inside it. A stack + // restores the enclosing module's context when that nested work returns. + // + // The char pointers reference the module's ModuleDescription, which outlives + // the call, so they stay valid while the context is on the stack. + struct ModuleContext { + const char* label = nullptr; + const char* type = nullptr; + unsigned moduleId = 0; + unsigned streamId = 0xffffffffu; + unsigned long long eventId = 0; + bool active = false; + }; + + void pushModuleContext(ModuleContext const& ctx) noexcept; + void popModuleContext() noexcept; + void resetModuleContext() noexcept; // empty the stack (defensive, at boundaries) + ModuleContext const& currentModuleContext() noexcept; + + // ---- Propagating the context across nested parallelism -------------------- + // + // The context is thread-local and is pushed only on the thread that runs a + // module's produce()/acquire(). If that code spawns work on *other* threads + // (a tbb::parallel_for, edm::Async, ...), those helper threads do NOT see the + // context, so allocations and tier-B slices they make are not attributed to + // the module. Capture the context and re-apply it on the helper thread. + + // RAII: push a captured context for a scope, pop it on exit. Nests correctly. + class ModuleContextGuard { + public: + explicit ModuleContextGuard(ModuleContext const& ctx) noexcept { pushModuleContext(ctx); } + ~ModuleContextGuard() noexcept { popModuleContext(); } + ModuleContextGuard(ModuleContextGuard const&) = delete; + ModuleContextGuard& operator=(ModuleContextGuard const&) = delete; + }; + + // Wrap a callable so it runs with the *current* module context applied. Use it + // as the body of a tbb::parallel_for/parallel_reduce (or anything that runs on + // borrowed threads) so allocations and CMS_PERFETTO_FUNC slices made inside are + // still attributed to the enclosing module: + // + // tbb::parallel_for(range, cms::perfetto::withModuleContext([&](auto const& r){ ... })); + template + auto withModuleContext(F&& f) { + return [ctx = currentModuleContext(), fn = std::decay_t(std::forward(f))](auto&&... args) mutable { + ModuleContextGuard guard(ctx); + return fn(std::forward(args)...); + }; + } + +} // namespace cms::perfetto + +#endif // PerfTools_Perfetto_interface_CMSSWPerfettoModuleContext_h diff --git a/PerfTools/Perfetto/interface/CMSSWPerfettoTrace.h b/PerfTools/Perfetto/interface/CMSSWPerfettoTrace.h new file mode 100644 index 0000000000000..02face0425592 --- /dev/null +++ b/PerfTools/Perfetto/interface/CMSSWPerfettoTrace.h @@ -0,0 +1,58 @@ +// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026 +#ifndef PerfTools_Perfetto_interface_CMSSWPerfettoTrace_h +#define PerfTools_Perfetto_interface_CMSSWPerfettoTrace_h + +#include "PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h" +#include "PerfTools/Perfetto/interface/CMSSWPerfettoLanes.h" +#include "PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h" +#include + +#include + +// Tier-B, per-function instrumentation. The slice is emitted on the lane of the +// module currently executing on this thread (from cms::perfetto::ModuleContext), +// so it nests under that module's slice; outside any module it falls back to the +// calling thread's own track. Because a SliceScope lives entirely on one thread, +// its captured begin/end track always match. +namespace cms::perfetto_trace { + + struct SliceScope { + explicit SliceScope(const char* name) noexcept { + if (!::perfetto::TrackEvent::IsEnabled()) + return; + auto const& m = cms::perfetto::currentModuleContext(); + if (m.active && m.streamId != 0xffffffffu) { + track_.emplace(cms::perfetto::laneTrack(m.streamId)); + TRACE_EVENT_BEGIN("cmssw.func", ::perfetto::DynamicString(name), *track_); + } else { + TRACE_EVENT_BEGIN("cmssw.func", ::perfetto::DynamicString(name)); + } + active_ = true; + } + + ~SliceScope() noexcept { + if (!active_) + return; + if (track_) + TRACE_EVENT_END("cmssw.func", *track_); + else + TRACE_EVENT_END("cmssw.func"); + } + + SliceScope(SliceScope const&) = delete; + SliceScope& operator=(SliceScope const&) = delete; + + private: + std::optional<::perfetto::Track> track_; + bool active_ = false; + }; + +} // namespace cms::perfetto_trace + +#define CMS_PERFETTO_FUNC() \ + cms::perfetto_trace::SliceScope PERFETTO_UID(_cms_perfetto_func_) { __func__ } + +#define CMS_PERFETTO_SCOPE(name_literal) \ + cms::perfetto_trace::SliceScope PERFETTO_UID(_cms_perfetto_scope_) { name_literal } + +#endif // PerfTools_Perfetto_interface_CMSSWPerfettoTrace_h diff --git a/PerfTools/Perfetto/interface/PerfettoAllocatorMonitor.h b/PerfTools/Perfetto/interface/PerfettoAllocatorMonitor.h new file mode 100644 index 0000000000000..8a16e9ecc70e0 --- /dev/null +++ b/PerfTools/Perfetto/interface/PerfettoAllocatorMonitor.h @@ -0,0 +1,104 @@ +// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026 +#ifndef PerfTools_Perfetto_interface_PerfettoAllocatorMonitor_h +#define PerfTools_Perfetto_interface_PerfettoAllocatorMonitor_h + +#include "PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h" +#include "PerfTools/Perfetto/interface/CMSSWPerfettoLanes.h" +#include "PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h" +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/CachingAllocatorMonitor.h" + +#include +#include +#include + +namespace cms::perfetto { + + // Turns Alpaka caching-allocator transactions into Perfetto events. Each + // alloc/free is emitted as an INSTANT on the calling thread's track -- so it + // sits, visually and by annotation, under the module slice that triggered it -- + // and the live / cached / requested byte totals are emitted as per-device + // counters, giving a device-memory-pressure timeline. Attribution to the + // responsible module comes from the thread-local cms::perfetto::ModuleContext. + class PerfettoAllocatorMonitor : public cms::alpakatools::CachingAllocatorMonitor { + public: + PerfettoAllocatorMonitor() { + for (int d = 0; d <= kMaxDevice; ++d) { + std::string const tag = (d == kMaxDevice) ? std::string("host") : ("dev" + std::to_string(d)); + names_[d][0] = tag + " live (B)"; + names_[d][1] = tag + " cached (B)"; + names_[d][2] = tag + " requested (B)"; + } + } + + void onAllocate(int device, + const void* /*ptr*/, + std::size_t bytes, + std::size_t requested, + bool cacheHit, + unsigned long long queue) noexcept override { + if (!::perfetto::TrackEvent::IsEnabled()) + return; + auto const& m = cms::perfetto::currentModuleContext(); + TRACE_EVENT_INSTANT("cmssw.alloc", + "alloc", + trackFor(m), + "module", + ::perfetto::DynamicString(m.active && m.label ? m.label : "(none)"), + "bytes", + static_cast(bytes), + "requested", + static_cast(requested), + "cache_hit", + cacheHit, + "device", + device, + "queue", + queue); + } + + void onFree(int device, const void* /*ptr*/, std::size_t bytes, unsigned long long queue) noexcept override { + if (!::perfetto::TrackEvent::IsEnabled()) + return; + auto const& m = cms::perfetto::currentModuleContext(); + TRACE_EVENT_INSTANT("cmssw.alloc", + "free", + trackFor(m), + "module", + ::perfetto::DynamicString(m.active && m.label ? m.label : "(none)"), + "bytes", + static_cast(bytes), + "device", + device, + "queue", + queue); + } + + void onUsage(int device, std::size_t live, std::size_t cached, std::size_t requested) noexcept override { + if (!::perfetto::TrackEvent::IsEnabled()) + return; + int const d = (device < 0 || device >= kMaxDevice) ? kMaxDevice : device; + TRACE_COUNTER("cmssw.alloc", ::perfetto::CounterTrack(::perfetto::DynamicString{names_[d][0].c_str()}), live); + TRACE_COUNTER("cmssw.alloc", ::perfetto::CounterTrack(::perfetto::DynamicString{names_[d][1].c_str()}), cached); + TRACE_COUNTER( + "cmssw.alloc", ::perfetto::CounterTrack(::perfetto::DynamicString{names_[d][2].c_str()}), requested); + } + + private: + // The alloc/free instant goes on the lane of the module that triggered it (so it + // sits under that module's slice). Outside any traced module it falls back to the + // calling thread's own track. + static ::perfetto::Track trackFor(cms::perfetto::ModuleContext const& m) { + if (m.active && m.streamId != 0xffffffffu) + return cms::perfetto::laneTrack(m.streamId); + return ::perfetto::ThreadTrack::Current(); + } + + static constexpr int kMaxDevice = 16; // index kMaxDevice == host + std::array, kMaxDevice + 1> names_; + }; + +} // namespace cms::perfetto + +#endif // PerfTools_Perfetto_interface_PerfettoAllocatorMonitor_h diff --git a/PerfTools/Perfetto/plugins/BuildFile.xml b/PerfTools/Perfetto/plugins/BuildFile.xml new file mode 100644 index 0000000000000..dbaa5faf578f4 --- /dev/null +++ b/PerfTools/Perfetto/plugins/BuildFile.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/PerfTools/Perfetto/plugins/PerfettoCuptiProfiler.h b/PerfTools/Perfetto/plugins/PerfettoCuptiProfiler.h new file mode 100644 index 0000000000000..9c0a3c9233523 --- /dev/null +++ b/PerfTools/Perfetto/plugins/PerfettoCuptiProfiler.h @@ -0,0 +1,233 @@ +// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026 +#ifndef PerfTools_Perfetto_plugins_PerfettoCuptiProfiler_h +#define PerfTools_Perfetto_plugins_PerfettoCuptiProfiler_h + +#include "PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h" +#include + +// CUDA/CUPTI exist only on some architectures, so the plugin BuildFile pulls in +// cuda/cupti and defines PERFETTO_HAS_CUPTI only inside . +// Build the real profiler when it is set; otherwise provide a no-op stub below so +// PerfettoTraceService compiles unchanged (traceGpuKernels then does nothing). +#ifdef PERFETTO_HAS_CUPTI + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +// Streams CUDA kernel activity into the running Perfetto session using CUPTI. +// +// Each kernel is emitted as a slice on a per-(device, CUDA stream) track, placed +// at the *device-side* start/end timestamps reported by CUPTI -- i.e. the real +// GPU execution time, not the host enqueue/wait time. The slice is annotated with +// the per-launch static resource usage CUPTI reports (registers/thread, static & +// dynamic shared memory, local memory/thread), the launch configuration +// (grid/block), an estimated theoretical occupancy, and the CUPTI correlation id +// (which links it to the host module that launched it). +// +// CUPTI is a driver-level profiler, so this works on the already-compiled release +// Alpaka/CUDA plugins without rebuilding them. The activity-tracing path is low +// overhead and does not serialize kernels. +namespace cms::perfetto { + + class PerfettoCuptiProfiler { + public: + // Returns true if CUPTI kernel tracing was activated (a CUDA device is present + // and CUPTI accepted the configuration). + bool start() { + int count = 0; + if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) + return false; + + // Cache per-device properties for the occupancy estimate. + props_.resize(count); + for (int d = 0; d < count; ++d) + cudaGetDeviceProperties(&props_[d], d); + + // Correlate the CUPTI clock to CLOCK_BOOTTIME (Perfetto's default trace + // clock), so GPU slices line up with the host timeline. + uint64_t cuptiNow = 0; + cuptiGetTimestamp(&cuptiNow); + timespec ts{}; + clock_gettime(CLOCK_BOOTTIME, &ts); + int64_t bootNow = int64_t(ts.tv_sec) * 1000000000LL + ts.tv_nsec; + offsetNs_ = bootNow - int64_t(cuptiNow); + + s_instance = this; + if (cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted) != CUPTI_SUCCESS) + return false; + if (cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) != CUPTI_SUCCESS) + return false; + active_ = true; + return true; + } + + // Drain remaining activity records into the (still open) Perfetto session. + void flush() { + if (active_) + cuptiActivityFlushAll(1); + } + + void stop() { + if (active_) { + cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); + active_ = false; + } + s_instance = nullptr; + } + + private: + static constexpr uint64_t kGpuTrackBase = 0x4750550000000000ull; // "GPU....." + + static void bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) { + constexpr size_t kSize = 8 * 1024 * 1024; + *buffer = static_cast(std::aligned_alloc(8, kSize)); + *size = kSize; + *maxNumRecords = 0; + } + + static void bufferCompleted(CUcontext, uint32_t, uint8_t* buffer, size_t, size_t validSize) { + if (s_instance && validSize > 0) { + CUpti_Activity* record = nullptr; + while (cuptiActivityGetNextRecord(buffer, validSize, &record) == CUPTI_SUCCESS) { + if (record->kind == CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) + s_instance->handleKernel(reinterpret_cast(record)); + } + } + std::free(buffer); + } + + ::perfetto::Track gpuTrack(uint32_t device, uint32_t stream) { + uint64_t const uuid = kGpuTrackBase | (uint64_t{device} << 24) | stream; + { + std::scoped_lock lock(mutex_); + if (seen_.insert(uuid).second) { + ::perfetto::Track t(uuid, ::perfetto::ProcessTrack::Current()); + auto d = t.Serialize(); + d.set_name("GPU" + std::to_string(device) + " stream " + std::to_string(stream)); + ::perfetto::TrackEvent::SetTrackDescriptor(t, d); + } + } + return ::perfetto::Track(uuid, ::perfetto::ProcessTrack::Current()); + } + + static std::string shorten(const char* demangled) { + std::string s = demangled ? demangled : ""; + auto p = s.find("gpuKernel<"); + if (p == std::string::npos) + return s.size() > 96 ? s.substr(0, 96) : s; + p += 10; + int depth = 0; + size_t i = p; + for (; i < s.size(); ++i) { + char c = s[i]; + if (c == '<') + ++depth; + else if (c == '>') { + if (depth == 0) + break; + --depth; + } else if (c == ',' && depth == 0) + break; + } + return s.substr(p, i - p); + } + + double occupancy(int device, uint32_t regsPerThread, uint32_t smem, uint32_t blockThreads) const { + if (device < 0 || device >= int(props_.size()) || blockThreads == 0) + return 0.; + auto const& p = props_[device]; + int blocksThreads = p.maxThreadsPerMultiProcessor / int(blockThreads); + int blocksReg = regsPerThread > 0 ? p.regsPerMultiprocessor / int(regsPerThread * blockThreads) : blocksThreads; + int blocksSmem = smem > 0 ? int(p.sharedMemPerMultiprocessor / smem) : blocksThreads; + int blocks = std::max(0, std::min({blocksThreads, blocksReg, blocksSmem})); + return double(blocks * int(blockThreads)) / double(p.maxThreadsPerMultiProcessor); + } + + void handleKernel(CUpti_ActivityKernel9 const* k) { + char* demangled = abi::__cxa_demangle(k->name, nullptr, nullptr, nullptr); + std::string const name = shorten(demangled ? demangled : k->name); + std::string const full = demangled ? demangled : (k->name ? k->name : ""); + std::free(demangled); + + uint32_t const blockThreads = k->blockX * k->blockY * k->blockZ; + uint32_t const smem = k->staticSharedMemory + k->dynamicSharedMemory; + double const occ = occupancy(int(k->deviceId), k->registersPerThread, smem, blockThreads); + std::string const grid = + std::to_string(k->gridX) + "x" + std::to_string(k->gridY) + "x" + std::to_string(k->gridZ); + std::string const block = + std::to_string(k->blockX) + "x" + std::to_string(k->blockY) + "x" + std::to_string(k->blockZ); + + auto track = gpuTrack(k->deviceId, k->streamId); + ::perfetto::TraceTimestamp const tsBegin{6, uint64_t(int64_t(k->start) + offsetNs_)}; // 6 == BOOTTIME + ::perfetto::TraceTimestamp const tsEnd{6, uint64_t(int64_t(k->end) + offsetNs_)}; + + TRACE_EVENT_BEGIN("cmssw.gpu", + ::perfetto::DynamicString{name.c_str()}, + track, + tsBegin, + "registers_per_thread", + k->registersPerThread, + "static_smem_B", + k->staticSharedMemory, + "dynamic_smem_B", + k->dynamicSharedMemory, + "local_per_thread_B", + k->localMemoryPerThread, + "local_total_B", + k->localMemoryTotal, + "grid", + ::perfetto::DynamicString{grid.c_str()}, + "block", + ::perfetto::DynamicString{block.c_str()}, + "occupancy_est", + occ, + "correlation_id", + k->correlationId, + "kernel", + ::perfetto::DynamicString{full.c_str()}); + TRACE_EVENT_END("cmssw.gpu", track, tsEnd); + } + + bool active_ = false; + int64_t offsetNs_ = 0; + std::vector props_; + std::mutex mutex_; + std::set seen_; + + static PerfettoCuptiProfiler* s_instance; + }; + + inline PerfettoCuptiProfiler* PerfettoCuptiProfiler::s_instance = nullptr; + +} // namespace cms::perfetto + +#else // PERFETTO_HAS_CUPTI + +namespace cms::perfetto { + + // No CUDA/CUPTI on this architecture: a no-op stub so that the rest of the + // service (lanes, counters, allocator, power) builds and runs normally. + class PerfettoCuptiProfiler { + public: + bool start() { return false; } + void flush() {} + void stop() {} + }; + +} // namespace cms::perfetto + +#endif // PERFETTO_HAS_CUPTI + +#endif // PerfTools_Perfetto_plugins_PerfettoCuptiProfiler_h diff --git a/PerfTools/Perfetto/plugins/PerfettoPowerSampler.h b/PerfTools/Perfetto/plugins/PerfettoPowerSampler.h new file mode 100644 index 0000000000000..f8ce9b75217c7 --- /dev/null +++ b/PerfTools/Perfetto/plugins/PerfettoPowerSampler.h @@ -0,0 +1,168 @@ +// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026 +#ifndef PerfTools_Perfetto_plugins_PerfettoPowerSampler_h +#define PerfTools_Perfetto_plugins_PerfettoPowerSampler_h + +#include "PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h" +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// Samples CPU and GPU power on a background thread and emits them as Perfetto +// counter tracks: "CPU pkg power (W)" from the Intel RAPL energy counters +// (/sys/class/powercap/intel-rapl:*), and "GPU power (W)" from NVML +// (nvmlDeviceGetPowerUsage). NVML is loaded with dlopen so there is no build-time +// dependency on it; if neither source is available the sampler is a no-op. +namespace cms::perfetto { + + class PerfettoPowerSampler { + public: + // Returns true if at least one power source (RAPL or NVML) was found and the + // sampling thread was started. periodMs is kept modest by default because NVML + // power queries are not free and can perturb the GPU if polled too often. + bool start(unsigned periodMs = 1000) { + periodMs_ = periodMs; + openGpu(); + openCpu(); + if (gpus_.empty() && cpus_.empty()) + return false; + running_ = true; + thread_ = std::thread([this] { loop(); }); + return true; + } + + void stop() { + if (running_.exchange(false)) { + cv_.notify_all(); + if (thread_.joinable()) + thread_.join(); + } + if (nvml_) { + if (nvmlShutdown_) + nvmlShutdown_(); + ::dlclose(nvml_); + nvml_ = nullptr; + } + } + + ~PerfettoPowerSampler() { stop(); } + + private: + using FnInt = int (*)(); + using FnCount = int (*)(unsigned*); + using FnHandle = int (*)(unsigned, void**); + using FnPower = int (*)(void*, unsigned*); + + static int64_t nowNs() { + timespec ts{}; + clock_gettime(CLOCK_BOOTTIME, &ts); + return int64_t(ts.tv_sec) * 1000000000LL + ts.tv_nsec; + } + + static bool readU64(std::string const& path, uint64_t& out) { + std::FILE* f = std::fopen(path.c_str(), "re"); + if (!f) + return false; + bool const ok = std::fscanf(f, "%lu", &out) == 1; + std::fclose(f); + return ok; + } + + void openGpu() { + nvml_ = ::dlopen("libnvidia-ml.so.1", RTLD_NOW | RTLD_LOCAL); + if (!nvml_) + return; + nvmlShutdown_ = reinterpret_cast(::dlsym(nvml_, "nvmlShutdown")); + auto init = reinterpret_cast(::dlsym(nvml_, "nvmlInit_v2")); + auto count = reinterpret_cast(::dlsym(nvml_, "nvmlDeviceGetCount_v2")); + auto handle = reinterpret_cast(::dlsym(nvml_, "nvmlDeviceGetHandleByIndex_v2")); + nvmlPower_ = reinterpret_cast(::dlsym(nvml_, "nvmlDeviceGetPowerUsage")); + if (!init || !count || !handle || !nvmlPower_ || init() != 0) + return; + unsigned n = 0; + if (count(&n) != 0) + return; + for (unsigned i = 0; i < n; ++i) { + void* h = nullptr; + if (handle(i, &h) == 0) { + gpus_.push_back(h); + gpuNames_.push_back("GPU" + std::to_string(i) + " power (W)"); + } + } + } + + void openCpu() { + for (int pkg = 0;; ++pkg) { + std::string const base = "/sys/class/powercap/intel-rapl:" + std::to_string(pkg); + uint64_t energy = 0; + if (!readU64(base + "/energy_uj", energy)) + break; + uint64_t range = 0; + readU64(base + "/max_energy_range_uj", range); + cpus_.push_back({base + "/energy_uj", energy, range ? range : ~uint64_t{0}, nowNs()}); + cpuNames_.push_back("CPU pkg" + std::to_string(pkg) + " power (W)"); + } + } + + void loop() { + while (running_.load()) { + for (std::size_t i = 0; i < gpus_.size(); ++i) { + unsigned mw = 0; + if (nvmlPower_(gpus_[i], &mw) == 0) + TRACE_COUNTER( + "cmssw.power", ::perfetto::CounterTrack(::perfetto::DynamicString{gpuNames_[i].c_str()}), mw / 1000.0); + } + for (std::size_t i = 0; i < cpus_.size(); ++i) { + uint64_t energy = 0; + if (!readU64(cpus_[i].path, energy)) + continue; + int64_t const now = nowNs(); + uint64_t const dE = (energy >= cpus_[i].lastEnergy) ? (energy - cpus_[i].lastEnergy) + : (cpus_[i].range - cpus_[i].lastEnergy + energy); + double const dt = double(now - cpus_[i].lastTime) / 1e9; + if (dt > 0.) + TRACE_COUNTER("cmssw.power", + ::perfetto::CounterTrack(::perfetto::DynamicString{cpuNames_[i].c_str()}), + double(dE) / 1e6 / dt); + cpus_[i].lastEnergy = energy; + cpus_[i].lastTime = now; + } + std::unique_lock lock(mutex_); + cv_.wait_for(lock, std::chrono::milliseconds(periodMs_), [this] { return !running_.load(); }); + } + } + + struct Rapl { + std::string path; + uint64_t lastEnergy; + uint64_t range; + int64_t lastTime; + }; + + unsigned periodMs_ = 50; + void* nvml_ = nullptr; + FnInt nvmlShutdown_ = nullptr; + FnPower nvmlPower_ = nullptr; + std::vector gpus_; + std::vector cpus_; + std::vector gpuNames_; + std::vector cpuNames_; + + std::atomic running_{false}; + std::mutex mutex_; + std::condition_variable cv_; + std::thread thread_; + }; + +} // namespace cms::perfetto + +#endif // PerfTools_Perfetto_plugins_PerfettoPowerSampler_h diff --git a/PerfTools/Perfetto/plugins/PerfettoTraceService.cc b/PerfTools/Perfetto/plugins/PerfettoTraceService.cc new file mode 100644 index 0000000000000..cdc84dd0fe9de --- /dev/null +++ b/PerfTools/Perfetto/plugins/PerfettoTraceService.cc @@ -0,0 +1,469 @@ +// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026 +#include "PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h" +#include "PerfTools/Perfetto/interface/CMSSWPerfettoLanes.h" +#include "PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h" +#include "PerfTools/Perfetto/interface/PerfettoAllocatorMonitor.h" +#include +#include "PerfTools/Perfetto/plugins/PerfettoCuptiProfiler.h" +#include "PerfTools/Perfetto/plugins/PerfettoPowerSampler.h" + +#include "HeterogeneousCore/AlpakaInterface/interface/CachingAllocatorMonitor.h" + +#include "DataFormats/Provenance/interface/ModuleDescription.h" +#include "FWCore/Framework/interface/ComponentDescription.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h" +#include "FWCore/ServiceRegistry/interface/ESModuleCallingContext.h" +#include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h" +#include "FWCore/ServiceRegistry/interface/ParentContext.h" +#include "FWCore/ServiceRegistry/interface/PathContext.h" +#include "FWCore/ServiceRegistry/interface/PlaceInPathContext.h" +#include "FWCore/ServiceRegistry/interface/ServiceMaker.h" +#include "FWCore/ServiceRegistry/interface/StreamContext.h" +#include "FWCore/ServiceRegistry/interface/SystemBounds.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// PerfettoTraceService records a Perfetto trace (.pftrace) of a cmsRun job using +// the in-process Perfetto SDK. +// +// Threading model (this is the important part). CMSSW runs one global TBB arena; +// within a *single* stream and a *single* event, independent modules execute as +// concurrent tasks on different threads, and an ExternalWork module's acquire() +// and produce() run on different threads with an asynchronous gap in between. A +// single per-stream timeline therefore cannot represent module execution without +// overlapping (mis-paired) slices. So: +// +// * module / acquire / EventSetup / source / cleanup slices are emitted on a +// per-(stream, thread) lane that hangs under the stream track (see +// CMSSWPerfettoLanes.h). Each lane is fed by exactly one thread, so its slices +// nest correctly; concurrent work in a stream lands on separate lanes. +// * the per-stream "Event" lifetime (preEvent..postClearEvent) IS serialized +// per stream, so it sits on the stream track itself, together with the +// run/lumi/event counters. +// +// Around every module call the service also publishes a thread-local +// cms::perfetto::ModuleContext, so the caching-allocator monitor and the CUPTI +// GPU layer can attribute their work to the responsible module. + +class PerfettoTraceService { +public: + PerfettoTraceService(edm::ParameterSet const& pset, edm::ActivityRegistry& ar) + : enabled_(pset.getUntrackedParameter("enabled")), + fileName_(pset.getUntrackedParameter("fileName")), + bufferSizeKB_(pset.getUntrackedParameter("bufferSizeKB")), + shmemSizeKB_(pset.getUntrackedParameter("shmemSizeKB")), + maxEvents_(pset.getUntrackedParameter("maxEvents")), + traceFunctions_(pset.getUntrackedParameter("traceFunctions")), + traceAllocations_(pset.getUntrackedParameter("traceAllocations")), + traceGpuKernels_(pset.getUntrackedParameter("traceGpuKernels")), + tracePower_(pset.getUntrackedParameter("tracePower")), + powerPeriodMs_(pset.getUntrackedParameter("powerPeriodMs")), + traceModules_(pset.getUntrackedParameter>("traceModules")) { + std::sort(traceModules_.begin(), traceModules_.end()); + if (!enabled_) + return; + + ::perfetto::TracingInitArgs args; + args.backends = ::perfetto::kInProcessBackend; + // Size the producer shared-memory buffer (SMB) for the high, bursty slice rate + // of many concurrent edm::streams. With the SDK default (~4 MB) the SMB + // saturates and TrackEvent (kDrop policy) silently discards slices -- dropping + // whole events from the trace. 32 KB chunks (the SDK maximum) reduce per-chunk + // contention under heavy multithreading. kDrop is kept (not kStall) so tracing + // never blocks the reconstruction threads. + args.shmem_size_hint_kb = shmemSizeKB_; + args.shmem_page_size_hint_kb = 32; + ::perfetto::Tracing::Initialize(args); + ::perfetto::TrackEvent::Register(); + + ::perfetto::TraceConfig cfg; + cfg.add_buffers()->set_size_kb(bufferSizeKB_); + + auto* ds = cfg.add_data_sources(); + auto* ds_cfg = ds->mutable_config(); + ds_cfg->set_name("track_event"); + + ::perfetto::protos::gen::TrackEventConfig te; + te.add_enabled_categories("cmssw.event"); + te.add_enabled_categories("cmssw.source"); + te.add_enabled_categories("cmssw.module"); + te.add_enabled_categories("cmssw.acquire"); + te.add_enabled_categories("cmssw.es"); + te.add_enabled_categories("cmssw.cleanup"); + te.add_enabled_categories("cmssw.func"); + te.add_enabled_categories("cmssw.alloc"); + te.add_enabled_categories("cmssw.gpu"); + te.add_enabled_categories("cmssw.power"); + ds_cfg->set_track_event_config_raw(te.SerializeAsString()); + + // In-memory session: the whole trace is held in the buffer and written once at + // the end. This keeps the GPU/CUPTI events -- emitted at end-of-job with their + // real, earlier device timestamps -- correctly ordered, and avoids perfetto's + // write_into_file "no flush" warning. The trace is bounded by bufferSizeKB; + // raise it for very long jobs. + session_ = ::perfetto::Tracing::NewTrace(); + session_->Setup(cfg); + session_->StartBlocking(); + + { + auto proc = ::perfetto::ProcessTrack::Current(); + auto desc = proc.Serialize(); + desc.mutable_process()->set_process_name("cmsRun"); + ::perfetto::TrackEvent::SetTrackDescriptor(proc, desc); + } + + ar.watchPreallocate(this, &PerfettoTraceService::preallocate); + + ar.watchPreSourceEvent(this, &PerfettoTraceService::preSourceEvent); + ar.watchPostSourceEvent(this, &PerfettoTraceService::postSourceEvent); + + ar.watchPreEvent(this, &PerfettoTraceService::preEvent); + ar.watchPreClearEvent(this, &PerfettoTraceService::preClearEvent); + ar.watchPostClearEvent(this, &PerfettoTraceService::postClearEvent); + + ar.watchPreModuleEvent(this, &PerfettoTraceService::preModuleEvent); + ar.watchPostModuleEvent(this, &PerfettoTraceService::postModuleEvent); + ar.watchPreModuleEventAcquire(this, &PerfettoTraceService::preModuleEventAcquire); + ar.watchPostModuleEventAcquire(this, &PerfettoTraceService::postModuleEventAcquire); + + ar.watchPreESModule(this, &PerfettoTraceService::preESModule); + ar.watchPostESModule(this, &PerfettoTraceService::postESModule); + + ar.watchPostEndJob(this, &PerfettoTraceService::postEndJob); + + // Observe caching-allocator transactions (device/host memory). Registered + // last so it is active for the whole job. + if (traceAllocations_) + cms::alpakatools::setCachingAllocatorMonitor(&allocatorMonitor_); + + // Stream CUDA kernel activity (real device-side timing + register/occupancy + // info) into the session via CUPTI; a no-op when no GPU is present. + if (traceGpuKernels_) + cuptiProfiler_.start(); + + // Sample CPU (RAPL) and GPU (NVML) power on a background thread; a no-op when + // neither source is available. + if (tracePower_) + powerSampler_.start(powerPeriodMs_); + } + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.addUntracked("enabled", true)->setComment("Master switch; when false the service does nothing."); + desc.addUntracked("fileName", "cmsrun.pftrace")->setComment("Output Perfetto trace file."); + desc.addUntracked("bufferSizeKB", 256 * 1024)->setComment("In-process trace buffer size in KB."); + desc.addUntracked("shmemSizeKB", 64 * 1024) + ->setComment( + "Producer shared-memory buffer (SMB) size in KB; if too small the trace silently drops " + "slices (whole events) under high-rate multi-stream tracing."); + desc.addUntracked("maxEvents", 0) + ->setComment( + "Stop opening new event slices after this many events (0 = unlimited, the default; the " + "trace is then bounded by bufferSizeKB)."); + desc.addUntracked("traceFunctions", false) + ->setComment("Enable tier-B per-function slices (CMS_PERFETTO_FUNC/SCOPE)."); + desc.addUntracked("traceAllocations", false) + ->setComment("Trace Alpaka caching-allocator alloc/free and device-memory counters."); + desc.addUntracked("traceGpuKernels", false) + ->setComment("Trace CUDA kernels (real device timing + registers/occupancy) via CUPTI."); + desc.addUntracked("tracePower", false) + ->setComment("Sample CPU (RAPL) and GPU (NVML) power as counter tracks."); + desc.addUntracked("powerPeriodMs", 1000) + ->setComment("Power sampling period in milliseconds (when tracePower is true)."); + desc.addUntracked>("traceModules", {}) + ->setComment("If non-empty, only trace these module labels (lower overhead, focused trace)."); + descriptions.add("PerfettoTraceService", desc); + } + +private: + struct PerStream { + unsigned sid = 0; + bool in_event = false; + bool source_open = false; + unsigned long long eventId = 0; + }; + + bool tracing() const noexcept { return enabled_ && ::perfetto::TrackEvent::IsEnabled(); } + + // When traceModules is non-empty, only those module labels are traced (and only + // their allocations/kernels are attributed). Empty = trace every module. + bool selected(edm::ModuleDescription const& md) const { + return traceModules_.empty() || std::binary_search(traceModules_.begin(), traceModules_.end(), md.moduleLabel()); + } + + bool withinBudget() noexcept { return maxEvents_ == 0 || seenEvents_.fetch_add(1) < maxEvents_; } + + // Publish (or clear) the thread-local "current module" used by the allocator + // and GPU layers to attribute their work. + // Push a module context on this thread's stack. |active| (== this module is + // selected for tracing) controls whether work done inside it is attributed. + static void pushContext(edm::StreamContext const& sc, edm::ModuleDescription const& md, bool active) noexcept { + cms::perfetto::ModuleContext ctx; + ctx.label = md.moduleLabel().c_str(); + ctx.type = md.moduleName().c_str(); + ctx.moduleId = md.id(); + ctx.streamId = sc.streamID().value(); + ctx.eventId = sc.eventID().event(); + ctx.active = active; + cms::perfetto::pushModuleContext(ctx); + } + + void preallocate(edm::service::SystemBounds const& bounds) { + states_.assign(bounds.maxNumberOfStreams(), PerStream{}); + for (unsigned i = 0; i < states_.size(); ++i) { + states_[i].sid = i; + auto t = cms::perfetto::streamTrack(i); + auto d = t.Serialize(); + d.set_name("edm::stream " + std::to_string(i)); + ::perfetto::TrackEvent::SetTrackDescriptor(t, d); + } + } + + // ----- Source (on the stream's lane for the reading thread) ----- + void preSourceEvent(edm::StreamID sid) { + if (!tracing()) + return; + auto& st = states_[sid.value()]; + if (!st.source_open) { + st.source_open = true; + TRACE_EVENT_BEGIN("cmssw.source", "Source", cms::perfetto::laneTrack(sid.value()), "stream", sid.value()); + } + } + + void postSourceEvent(edm::StreamID sid) { + if (!tracing()) + return; + auto& st = states_[sid.value()]; + if (st.source_open) { + TRACE_EVENT_END("cmssw.source", cms::perfetto::laneTrack(sid.value())); + st.source_open = false; + } + } + + // ----- Per-stream event lifetime (serialized -> per-stream track) ----- + void preEvent(edm::StreamContext const& sc) { + if (!tracing()) + return; + auto& st = states_[sc.streamID().value()]; + if (st.in_event) + return; + if (!withinBudget()) { + cms::perfetto::resetModuleContext(); + return; + } + st.in_event = true; + auto const& id = sc.eventID(); + st.eventId = id.event(); + + auto track = cms::perfetto::streamTrack(st.sid); + TRACE_EVENT_BEGIN( + "cmssw.event", "Event", track, "run", id.run(), "lumi", id.luminosityBlock(), "event", id.event()); + TRACE_COUNTER("cmssw.event", ::perfetto::CounterTrack("run", "id", track), static_cast(id.run())); + TRACE_COUNTER( + "cmssw.event", ::perfetto::CounterTrack("lumi", "id", track), static_cast(id.luminosityBlock())); + TRACE_COUNTER("cmssw.event", ::perfetto::CounterTrack("event", "id", track), static_cast(id.event())); + } + + void preClearEvent(edm::StreamContext const& sc) { + if (!tracing()) + return; + if (!states_[sc.streamID().value()].in_event) + return; + TRACE_EVENT_BEGIN( + "cmssw.cleanup", "Cleanup", cms::perfetto::laneTrack(sc.streamID().value()), "stream", sc.streamID().value()); + } + + void postClearEvent(edm::StreamContext const& sc) { + if (!tracing()) + return; + auto& st = states_[sc.streamID().value()]; + if (!st.in_event) + return; + TRACE_EVENT_END("cmssw.cleanup", cms::perfetto::laneTrack(sc.streamID().value())); + TRACE_EVENT_END("cmssw.event", cms::perfetto::streamTrack(st.sid)); + st.in_event = false; + emitThroughput(); // global events/s counter, from event-completion times + cms::perfetto::resetModuleContext(); + } + + // Global "Throughput (events/s)" counter: a sliding-window rate over the last + // kThroughputWindow event completions (across all streams), emitted on a single + // process-level CounterTrack -- so any Perfetto UI shows the job's event rate + // ramping up and reaching steady state, no UI plugin required. + void emitThroughput() { + timespec ts{}; + clock_gettime(CLOCK_BOOTTIME, &ts); + int64_t const now = int64_t(ts.tv_sec) * 1000000000LL + ts.tv_nsec; + double rate = 0.; + { + std::scoped_lock lock(throughputMutex_); + completions_.push_back(now); + while (completions_.size() > kThroughputWindow) + completions_.pop_front(); + if (completions_.size() >= 2) { + double const span_s = double(now - completions_.front()) / 1e9; + if (span_s > 0.) + rate = double(completions_.size() - 1) / span_s; + } + } + TRACE_COUNTER("cmssw.event", ::perfetto::CounterTrack("Throughput (events/s)"), rate); + } + + // ----- Modules (per thread) ----- + void preModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc) { + if (!tracing()) + return; + if (!states_[sc.streamID().value()].in_event) + return; + auto const& md = *mcc.moduleDescription(); + bool const sel = selected(md); + pushContext(sc, md, sel); // pushed for every module so the stack stays balanced + if (!sel) + return; + TRACE_EVENT_BEGIN("cmssw.module", + ::perfetto::DynamicString(md.moduleLabel()), + cms::perfetto::laneTrack(sc.streamID().value()), + "event", + sc.eventID().event(), + "module_id", + md.id(), + "cpp_type", + ::perfetto::DynamicString(md.moduleName())); + } + + void postModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc) { + if (!tracing()) + return; + if (!states_[sc.streamID().value()].in_event) + return; + if (selected(*mcc.moduleDescription())) + TRACE_EVENT_END("cmssw.module", cms::perfetto::laneTrack(sc.streamID().value())); + cms::perfetto::popModuleContext(); + } + + void preModuleEventAcquire(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc) { + if (!tracing()) + return; + if (!states_[sc.streamID().value()].in_event) + return; + auto const& md = *mcc.moduleDescription(); + bool const sel = selected(md); + pushContext(sc, md, sel); + if (!sel) + return; + TRACE_EVENT_BEGIN("cmssw.acquire", + ::perfetto::DynamicString(md.moduleLabel()), + cms::perfetto::laneTrack(sc.streamID().value()), + "event", + sc.eventID().event(), + "cpp_type", + ::perfetto::DynamicString(md.moduleName())); + } + + void postModuleEventAcquire(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc) { + if (!tracing()) + return; + if (!states_[sc.streamID().value()].in_event) + return; + if (selected(*mcc.moduleDescription())) + TRACE_EVENT_END("cmssw.acquire", cms::perfetto::laneTrack(sc.streamID().value())); + cms::perfetto::popModuleContext(); + } + + // ----- EventSetup modules (per thread; stream attribution is best-effort) ----- + static edm::StreamContext const* streamOf(edm::ESModuleCallingContext const& cc) { + auto top = cc.getTopModuleCallingContext(); + if (!top || top->type() != edm::ParentContext::Type::kPlaceInPath) + return nullptr; + auto const* pip = top->parent().placeInPathContext(); + auto const* pc = pip ? pip->pathContext() : nullptr; + return pc ? pc->streamContext() : nullptr; + } + + void preESModule(edm::eventsetup::EventSetupRecordKey const&, edm::ESModuleCallingContext const& cc) { + if (!tracing()) + return; + auto const* sc = streamOf(cc); + if (!sc || !states_[sc->streamID().value()].in_event) + return; + auto const* cd = cc.componentDescription(); + const char* name = (cd && !cd->label_.empty()) ? cd->label_.c_str() : (cd ? cd->type_.c_str() : "ESModule"); + TRACE_EVENT_BEGIN("cmssw.es", + ::perfetto::DynamicString(name), + cms::perfetto::laneTrack(sc->streamID().value()), + "stream", + sc->streamID().value()); + } + + void postESModule(edm::eventsetup::EventSetupRecordKey const&, edm::ESModuleCallingContext const& cc) { + if (!tracing()) + return; + auto const* sc = streamOf(cc); + if (!sc || !states_[sc->streamID().value()].in_event) + return; + TRACE_EVENT_END("cmssw.es", cms::perfetto::laneTrack(sc->streamID().value())); + } + + void postEndJob() { + if (!enabled_ || !session_) + return; + // Stop observing allocations before the session is torn down: late frees + // (e.g. from the AlpakaService destructor) must not touch a stopped session. + if (traceAllocations_) + cms::alpakatools::setCachingAllocatorMonitor(nullptr); + // Stop the power sampler before the session is torn down (its thread emits). + if (tracePower_) + powerSampler_.stop(); + // Drain CUDA kernel activity into the still-open session, then stop it. + if (traceGpuKernels_) { + cuptiProfiler_.flush(); + cuptiProfiler_.stop(); + } + ::perfetto::TrackEvent::Flush(); + session_->StopBlocking(); + auto trace_data = session_->ReadTraceBlocking(); + int fd = ::open(fileName_.c_str(), O_CREAT | O_TRUNC | O_WRONLY | O_CLOEXEC, 0644); + if (fd >= 0) { + [[maybe_unused]] auto n = ::write(fd, trace_data.data(), trace_data.size()); + ::close(fd); + } + } + + const bool enabled_; + const std::string fileName_; + const unsigned bufferSizeKB_; + const unsigned shmemSizeKB_; + const unsigned maxEvents_; + const bool traceFunctions_; + const bool traceAllocations_; + const bool traceGpuKernels_; + const bool tracePower_; + const unsigned powerPeriodMs_; + std::vector traceModules_; + + std::unique_ptr<::perfetto::TracingSession> session_; + + std::vector states_; + std::atomic seenEvents_{0}; + cms::perfetto::PerfettoAllocatorMonitor allocatorMonitor_; + cms::perfetto::PerfettoCuptiProfiler cuptiProfiler_; + cms::perfetto::PerfettoPowerSampler powerSampler_; + + static constexpr std::size_t kThroughputWindow = 16; // events in the rate window + std::mutex throughputMutex_; + std::deque completions_; // boottime ns of recent event completions +}; + +DEFINE_FWK_SERVICE(PerfettoTraceService); diff --git a/PerfTools/Perfetto/python/customisePerfetto.py b/PerfTools/Perfetto/python/customisePerfetto.py new file mode 100644 index 0000000000000..c5f480024e621 --- /dev/null +++ b/PerfTools/Perfetto/python/customisePerfetto.py @@ -0,0 +1,42 @@ +# Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026 +import FWCore.ParameterSet.Config as cms + + +def customisePerfetto(process, + fileName="cmsrun.pftrace", + bufferSizeKB=256 * 1024, + maxEvents=200, + traceFunctions=False, + traceAllocations=False, + traceGpuKernels=False, + traceModules=None): + """Add the PerfettoTraceService to process. + + The service writes an in-process Perfetto trace (open it at https://ui.perfetto.dev). + Module / acquire / EventSetup slices land on per-thread tracks (so concurrent + work nests correctly); each stream gets an "Event" track with run/lumi/event + counters. + + Arguments: + fileName output .pftrace file + bufferSizeKB in-process trace buffer size (KB) + maxEvents stop opening new event slices after this many events (0 = unlimited) + traceFunctions enable tier-B per-function slices (CMS_PERFETTO_FUNC/SCOPE) + traceAllocations trace the Alpaka caching allocator (alloc/free + device-memory counters) + traceGpuKernels trace CUDA kernels via CUPTI (real device timing + registers/occupancy) + traceModules if a non-empty list, only trace these module labels (focused, low overhead) + """ + process.add_(cms.Service("PerfettoTraceService", + fileName=cms.untracked.string(fileName), + bufferSizeKB=cms.untracked.uint32(bufferSizeKB), + maxEvents=cms.untracked.uint32(maxEvents), + traceFunctions=cms.untracked.bool(traceFunctions), + traceAllocations=cms.untracked.bool(traceAllocations), + traceGpuKernels=cms.untracked.bool(traceGpuKernels), + traceModules=cms.untracked.vstring(traceModules or []))) + return process + + +def customise(process): + """Default entry point for `cmsDriver.py --customise PerfTools/Perfetto/customisePerfetto.customise`.""" + return customisePerfetto(process) diff --git a/PerfTools/Perfetto/scripts/perfettoKernelResources.py b/PerfTools/Perfetto/scripts/perfettoKernelResources.py new file mode 100755 index 0000000000000..ca75871f3f4a4 --- /dev/null +++ b/PerfTools/Perfetto/scripts/perfettoKernelResources.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +# Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026 +"""Dump the static per-kernel resource usage (ptxas info) of CUDA/Alpaka libraries. + +Runs `cuobjdump --dump-resource-usage` on each given shared object / cubin and +reports, per kernel, the registers/thread, stack frame, static shared memory, +local memory (>0 means register spills) and constant memory -- the same numbers +ptxas prints with `-Xptxas -v`. Names are demangled with `c++filt`/`cu++filt`. + +This is the compile-time counterpart of the runtime CUPTI layer in +PerfettoTraceService (traceGpuKernels=True): CUPTI reports registers and memory +per launch, while this works offline on the built libraries and also exposes +spills/stack, which are not in the CUPTI kernel records. + +Usage: + perfettoKernelResources.py [--json] [--filter SUBSTR] [more ...] +""" +import argparse +import json +import re +import shutil +import subprocess +import sys + +_FUNC = re.compile(r"^\s*Function (.+):\s*$") +_RES = re.compile(r"REG:(\d+).*?STACK:(\d+).*?SHARED:(\d+).*?LOCAL:(\d+)") +_CONST = re.compile(r"CONSTANT\[0\]:(\d+)") + + +def demangle(name, tool): + if not tool: + return name + out = subprocess.run([tool, name], capture_output=True, text=True).stdout.strip() + return out or name + + +def shorten(name): + """Reduce an Alpaka gpuKernel<...> wrapper to the user functor.""" + i = name.find("gpuKernel<") + if i < 0: + return name + i += len("gpuKernel<") + depth = 0 + for j in range(i, len(name)): + c = name[j] + if c == "<": + depth += 1 + elif c == ">": + if depth == 0: + return name[i:j] + depth -= 1 + elif c == "," and depth == 0: + return name[i:j] + return name[i:] + + +def dump(path, demangler): + out = subprocess.run(["cuobjdump", "--dump-resource-usage", path], capture_output=True, text=True).stdout + kernels, cur = [], None + for line in out.splitlines(): + m = _FUNC.match(line) + if m: + cur = m.group(1) + continue + if cur: + r = _RES.search(line) + if r: + c = _CONST.search(line) + full = demangle(cur, demangler) + kernels.append({ + "kernel": shorten(full), + "registers": int(r.group(1)), + "stack": int(r.group(2)), + "shared": int(r.group(3)), + "local_spill": int(r.group(4)), + "const": int(c.group(1)) if c else 0, + "mangled": cur, + }) + cur = None + return kernels + + +def main(): + ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument("libs", nargs="+", help="shared objects or cubins to inspect") + ap.add_argument("--json", action="store_true", help="emit JSON instead of a table") + ap.add_argument("--filter", default="", help="only keep kernels whose name contains SUBSTR") + args = ap.parse_args() + + if not shutil.which("cuobjdump"): + sys.exit("cuobjdump not found; run inside a CUDA environment (cmsenv)") + demangler = shutil.which("cu++filt") or shutil.which("c++filt") + + kernels = [] + for lib in args.libs: + for k in dump(lib, demangler): + if args.filter in k["kernel"]: + kernels.append(k) + kernels.sort(key=lambda k: -k["registers"]) + + if args.json: + print(json.dumps(kernels, indent=2)) + return + print(f"{'REG':>4} {'STACK':>6} {'SHARED':>7} {'SPILL':>6} {'CONST':>6} kernel") + for k in kernels: + print(f"{k['registers']:>4} {k['stack']:>6} {k['shared']:>7} {k['local_spill']:>6} {k['const']:>6} " + f"{k['kernel'][:90]}") + + +if __name__ == "__main__": + main() diff --git a/PerfTools/Perfetto/src/CMSSWPerfettoCategories.cc b/PerfTools/Perfetto/src/CMSSWPerfettoCategories.cc new file mode 100644 index 0000000000000..1be791a93d606 --- /dev/null +++ b/PerfTools/Perfetto/src/CMSSWPerfettoCategories.cc @@ -0,0 +1,3 @@ +// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026 +#include "PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h" +PERFETTO_TRACK_EVENT_STATIC_STORAGE(); diff --git a/PerfTools/Perfetto/src/CMSSWPerfettoModuleContext.cc b/PerfTools/Perfetto/src/CMSSWPerfettoModuleContext.cc new file mode 100644 index 0000000000000..b91fe93be31bf --- /dev/null +++ b/PerfTools/Perfetto/src/CMSSWPerfettoModuleContext.cc @@ -0,0 +1,33 @@ +// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026 +#include "PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h" + +namespace cms::perfetto { + namespace { + // A small fixed-size, allocation-free per-thread stack. Module re-entrancy + // from work-stealing is shallow in practice; beyond the cap we keep counting + // depth (so push/pop stay balanced) but stop recording, and report no module. + constexpr int kMaxDepth = 64; + thread_local ModuleContext g_stack[kMaxDepth]; + thread_local int g_depth = 0; + const ModuleContext g_none{}; + } // namespace + + void pushModuleContext(ModuleContext const& ctx) noexcept { + if (g_depth >= 0 && g_depth < kMaxDepth) + g_stack[g_depth] = ctx; + ++g_depth; + } + + void popModuleContext() noexcept { + if (g_depth > 0) + --g_depth; + } + + void resetModuleContext() noexcept { g_depth = 0; } + + ModuleContext const& currentModuleContext() noexcept { + if (g_depth > 0 && g_depth <= kMaxDepth) + return g_stack[g_depth - 1]; + return g_none; + } +} // namespace cms::perfetto diff --git a/PerfTools/Perfetto/test/BuildFile.xml b/PerfTools/Perfetto/test/BuildFile.xml new file mode 100644 index 0000000000000..792f805472a60 --- /dev/null +++ b/PerfTools/Perfetto/test/BuildFile.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/PerfTools/Perfetto/test/alpaka/testPerfettoAllocator.dev.cc b/PerfTools/Perfetto/test/alpaka/testPerfettoAllocator.dev.cc new file mode 100644 index 0000000000000..e4156bb2a550c --- /dev/null +++ b/PerfTools/Perfetto/test/alpaka/testPerfettoAllocator.dev.cc @@ -0,0 +1,131 @@ +// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026 +#include +#include + +#include + +#define CATCH_CONFIG_MAIN +#include + +#include "FWCore/Utilities/interface/stringize.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/devices.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/CachingAllocatorMonitor.h" + +#include "PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h" +#include "PerfTools/Perfetto/interface/PerfettoAllocatorMonitor.h" +#include + +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +namespace { + constexpr size_t SIZE = 1024; + + // Counts transactions and captures the module attribution seen via the + // thread-local cms::perfetto::ModuleContext. + struct MockMonitor : public cms::alpakatools::CachingAllocatorMonitor { + std::atomic allocs{0}; + std::atomic frees{0}; + std::string allocModule; + std::string freeModule; + + void onAllocate(int, const void*, std::size_t, std::size_t, bool, unsigned long long) noexcept override { + ++allocs; + auto const& m = cms::perfetto::currentModuleContext(); + if (m.active && m.label) + allocModule = m.label; + } + void onFree(int, const void*, std::size_t, unsigned long long) noexcept override { + ++frees; + auto const& m = cms::perfetto::currentModuleContext(); + if (m.active && m.label) + freeModule = m.label; + } + }; +} // namespace + +TEST_CASE("Caching-allocator monitor hook attributes transactions to the current module (" EDM_STRINGIZE( + ALPAKA_ACCELERATOR_NAMESPACE) ")", + "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") { + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + INFO("no devices available for this backend, skipping"); + return; + } + + SECTION("mock monitor sees alloc/free with the module attribution") { + MockMonitor mon; + cms::alpakatools::setCachingAllocatorMonitor(&mon); + + // pretend we are inside module "testAllocModule" running on stream 0 + cms::perfetto::ModuleContext ctx; + ctx.label = "testAllocModule"; + ctx.streamId = 0; + ctx.active = true; + cms::perfetto::pushModuleContext(ctx); + + { + auto queue = Queue(devices[0]); + auto buf_h = cms::alpakatools::make_host_buffer(queue, SIZE); + auto buf_d = cms::alpakatools::make_device_buffer(queue, SIZE); + alpaka::memset(queue, buf_d, 0); + alpaka::wait(queue); + } // buffers freed here -> onFree + + cms::perfetto::popModuleContext(); + cms::alpakatools::setCachingAllocatorMonitor(nullptr); + + // Not every backend routes buffers through the caching allocator (e.g. the + // serial-CPU backend allocates directly). Where it does, the monitor must + // have been called and the transactions attributed to the current module. + if (mon.allocs.load() == 0) { + WARN("this backend does not use the caching allocator; monitor not exercised"); + } else { + REQUIRE(mon.frees.load() >= 1); + REQUIRE(mon.allocModule == "testAllocModule"); + REQUIRE(mon.freeModule == "testAllocModule"); + } + } + + SECTION("PerfettoAllocatorMonitor emits a non-empty trace") { + ::perfetto::TracingInitArgs args; + args.backends = ::perfetto::kInProcessBackend; + ::perfetto::Tracing::Initialize(args); + ::perfetto::TrackEvent::Register(); + + ::perfetto::TraceConfig cfg; + cfg.add_buffers()->set_size_kb(4096); + auto* ds = cfg.add_data_sources()->mutable_config(); + ds->set_name("track_event"); + ::perfetto::protos::gen::TrackEventConfig te; + te.add_enabled_categories("cmssw.alloc"); + ds->set_track_event_config_raw(te.SerializeAsString()); + + auto session = ::perfetto::Tracing::NewTrace(); + session->Setup(cfg); + session->StartBlocking(); + + cms::perfetto::PerfettoAllocatorMonitor monitor; + cms::alpakatools::setCachingAllocatorMonitor(&monitor); + cms::perfetto::ModuleContext ctx; + ctx.label = "testAllocModule"; + ctx.active = true; + cms::perfetto::pushModuleContext(ctx); + + { + auto queue = Queue(devices[0]); + auto buf_d = cms::alpakatools::make_device_buffer(queue, SIZE); + alpaka::memset(queue, buf_d, 0); + alpaka::wait(queue); + } + + cms::perfetto::popModuleContext(); + cms::alpakatools::setCachingAllocatorMonitor(nullptr); + + ::perfetto::TrackEvent::Flush(); + session->StopBlocking(); + std::vector data = session->ReadTraceBlocking(); + REQUIRE(not data.empty()); + } +} diff --git a/PerfTools/Perfetto/test/testPerfettoTrace.cpp b/PerfTools/Perfetto/test/testPerfettoTrace.cpp new file mode 100644 index 0000000000000..1f026e4a636cb --- /dev/null +++ b/PerfTools/Perfetto/test/testPerfettoTrace.cpp @@ -0,0 +1,222 @@ +// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026 +// +// Regression test for the Perfetto tracing building blocks. It records a small +// trace in-process the same way PerfettoTraceService does -- process track, +// per-stream tracks, per-(stream,thread) lanes, module slices, run/lumi/event +// and Throughput counters -- then parses the result back with perfetto's own +// (protozero) decoders and asserts the structure. It is meant to break loudly if +// a future perfetto SDK update (or a change to our helpers) silently drops a +// feature: wrong track parenting, missing counters, unbalanced slices, etc. No GPU. + +#include +#include +#include +#include +#include +#include +#include + +#define CATCH_CONFIG_MAIN +#include + +#include "PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h" +#include "PerfTools/Perfetto/interface/CMSSWPerfettoLanes.h" +#include "PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h" +#include + +namespace { + // Perfetto may be initialized only once per process. + void initPerfettoOnce() { + [[maybe_unused]] static const bool done = [] { + ::perfetto::TracingInitArgs args; + args.backends = ::perfetto::kInProcessBackend; + ::perfetto::Tracing::Initialize(args); + ::perfetto::TrackEvent::Register(); + return true; + }(); + } + + // Run |emit| inside a fresh in-process session and return the serialized trace. + std::vector recordTrace(const std::function& emit) { + initPerfettoOnce(); + ::perfetto::TraceConfig cfg; + cfg.add_buffers()->set_size_kb(8192); + auto* ds = cfg.add_data_sources()->mutable_config(); + ds->set_name("track_event"); + ::perfetto::protos::gen::TrackEventConfig te; + for (const char* c : {"cmssw.event", + "cmssw.source", + "cmssw.module", + "cmssw.acquire", + "cmssw.cleanup", + "cmssw.es", + "cmssw.func", + "cmssw.alloc", + "cmssw.gpu", + "cmssw.power"}) + te.add_enabled_categories(c); + ds->set_track_event_config_raw(te.SerializeAsString()); + + auto session = ::perfetto::Tracing::NewTrace(); + session->Setup(cfg); + session->StartBlocking(); + emit(); + ::perfetto::TrackEvent::Flush(); + session->StopBlocking(); + return session->ReadTraceBlocking(); + } +} // namespace + +TEST_CASE("Perfetto trace has the expected track/counter/slice structure", "[perfetto]") { + using namespace cms::perfetto; + auto bytes = recordTrace([] { + auto proc = ::perfetto::ProcessTrack::Current(); + { + auto d = proc.Serialize(); + d.mutable_process()->set_process_name("cmsRun"); + ::perfetto::TrackEvent::SetTrackDescriptor(proc, d); + } + for (unsigned s = 0; s < 2; ++s) { + auto st = streamTrack(s); + { + auto d = st.Serialize(); + d.set_name("edm::stream " + std::to_string(s)); + ::perfetto::TrackEvent::SetTrackDescriptor(st, d); + } + TRACE_EVENT_BEGIN("cmssw.event", "Event", st, "run", 1, "lumi", 1, "event", static_cast(s + 1)); + TRACE_COUNTER("cmssw.event", ::perfetto::CounterTrack("run", "id", st), 1.0); + auto lane = laneTrack(s); + TRACE_EVENT_BEGIN("cmssw.module", "outer", lane); + TRACE_EVENT_BEGIN("cmssw.module", "inner", lane); // nests inside outer on the same lane + TRACE_EVENT_END("cmssw.module", lane); + TRACE_EVENT_END("cmssw.module", lane); + TRACE_EVENT_END("cmssw.event", st); + TRACE_COUNTER("cmssw.event", ::perfetto::CounterTrack("Throughput (events/s)"), 100.0 + s); + } + }); + + REQUIRE(!bytes.empty()); + + namespace pb = ::perfetto::protos::pbzero; + pb::Trace::Decoder trace(reinterpret_cast(bytes.data()), bytes.size()); + + // The Throughput counter is global (no parent track); its name lands in the + // trace but not in a place as convenient as the parented "run" counter, so + // check it by byte presence -- it disappears if the counter stops being emitted. + bool const sawThroughput = + std::string_view(bytes.data(), bytes.size()).find("Throughput (events/s)") != std::string_view::npos; + + bool sawProcess = false; + int laneTracks = 0, packets = 0; + int begins = 0, ends = 0, counters = 0; + std::set streamUuids; + + // First pass: process/stream/counter descriptors and slice/counter events. + for (auto it = trace.packet(); it; ++it) { + ++packets; + pb::TracePacket::Decoder packet(*it); + if (packet.has_track_descriptor()) { + pb::TrackDescriptor::Decoder td(packet.track_descriptor()); + std::string const name = td.name().ToStdString(); + if (td.has_process()) { + pb::ProcessDescriptor::Decoder p(td.process()); + if (p.process_name().ToStdString() == "cmsRun") + sawProcess = true; + } + if (name.rfind("edm::stream", 0) == 0) + streamUuids.insert(td.uuid()); + } + if (packet.has_track_event()) { + pb::TrackEvent::Decoder te(packet.track_event()); + switch (te.type()) { + case pb::TrackEvent::TYPE_SLICE_BEGIN: + ++begins; + break; + case pb::TrackEvent::TYPE_SLICE_END: + ++ends; + break; + case pb::TrackEvent::TYPE_COUNTER: + ++counters; + break; + default: + break; + } + } + } + + // Second pass: lanes are descriptors parented to a stream track -- exactly the + // property the per-(stream,thread) lane scheme must preserve. + pb::Trace::Decoder trace2(reinterpret_cast(bytes.data()), bytes.size()); + for (auto it = trace2.packet(); it; ++it) { + pb::TracePacket::Decoder packet(*it); + if (packet.has_track_descriptor()) { + pb::TrackDescriptor::Decoder td(packet.track_descriptor()); + if (streamUuids.count(td.parent_uuid())) + ++laneTracks; + } + } + + REQUIRE(packets > 0); + REQUIRE(sawProcess); + REQUIRE(streamUuids.size() == 2u); + REQUIRE(laneTracks >= 1); + REQUIRE(sawThroughput); + REQUIRE(begins == ends); // every slice is closed + REQUIRE(begins >= 6); // 2 streams x (Event + outer + inner) + REQUIRE(counters >= 4); // 2 streams x (run counter + Throughput counter) +} + +TEST_CASE("lane and stream track uuids are distinct and correctly parented", "[perfetto]") { + using namespace cms::perfetto; + initPerfettoOnce(); + + auto s0 = streamTrack(0); + auto s1 = streamTrack(1); + REQUIRE(s0.uuid != 0); + REQUIRE(s0.uuid != s1.uuid); // one track per stream + + auto l0 = laneTrack(0); + auto l1 = laneTrack(1); + REQUIRE(l0.uuid != 0); // a lane must not collapse onto the root (the old xor-cancellation bug) + REQUIRE(l0.uuid != s0.uuid); // lane is its own track, not the stream track + REQUIRE(l0.uuid != l1.uuid); // different streams -> different lanes + REQUIRE(l0.parent_uuid == s0.uuid); // the lane hangs under its stream + REQUIRE(laneTrack(0).uuid == l0.uuid); // stable for the same (stream, thread) +} + +TEST_CASE("module context stack nests and propagates across threads", "[perfetto]") { + using namespace cms::perfetto; + resetModuleContext(); + REQUIRE_FALSE(currentModuleContext().active); + + ModuleContext a; + a.label = "A"; + a.active = true; + a.streamId = 3; + pushModuleContext(a); + REQUIRE(currentModuleContext().active); + REQUIRE(std::string(currentModuleContext().label) == "A"); + REQUIRE(currentModuleContext().streamId == 3u); + + { + ModuleContext b; + b.label = "B"; + b.active = true; + ModuleContextGuard guard(b); + REQUIRE(std::string(currentModuleContext().label) == "B"); + } + REQUIRE(std::string(currentModuleContext().label) == "A"); // restored after the guard + + // withModuleContext must carry the context onto a helper thread (parallel_for). + std::string seenOnThread; + auto body = withModuleContext([&seenOnThread] { + auto const& m = currentModuleContext(); + seenOnThread = (m.active && m.label) ? m.label : ""; + }); + std::thread th([&body] { body(); }); + th.join(); + REQUIRE(seenOnThread == "A"); + + popModuleContext(); + REQUIRE_FALSE(currentModuleContext().active); +}