diff --git a/HeterogeneousCore/AlpakaInterface/interface/CachingAllocator.h b/HeterogeneousCore/AlpakaInterface/interface/CachingAllocator.h
index e1543d14a23c8..f7b5ef5136a6c 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/CachingAllocator.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/CachingAllocator.h
@@ -18,6 +18,7 @@
 #include "HeterogeneousCore/AlpakaInterface/interface/devices.h"
 #include "HeterogeneousCore/AlpakaInterface/interface/AllocatorConfig.h"
 #include "HeterogeneousCore/AlpakaInterface/interface/AlpakaServiceFwd.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/CachingAllocatorMonitor.h"
 
 // Inspired by cub::CachingDeviceAllocator
 
@@ -202,7 +203,8 @@ namespace cms::alpakatools {
       std::tie(block.bin, block.bytes) = findBin(bytes);
 
       // try to re-use a cached block, or allocate a new buffer
-      if (tryReuseCachedBlock(block)) {
+      bool const cacheHit = tryReuseCachedBlock(block);
+      if (cacheHit) {
         // fill the re-used memory block with a pattern
         if (fillReallocations_) {
           immediateOrAsyncMemset(*block.queue, *block.buffer, fillReallocationValue_);
@@ -217,6 +219,12 @@ namespace cms::alpakatools {
         }
       }
 
+      if (auto* monitor = cachingAllocatorMonitor()) [[unlikely]] {
+        monitor->onAllocate(
+            monitorDevice(), block.buffer->data(), block.bytes, block.requested, cacheHit, monitorQueue(block));
+        monitor->onUsage(monitorDevice(), cachedBytes_.live, cachedBytes_.free, cachedBytes_.requested);
+      }
+
       return block.buffer->data();
     }
 
@@ -236,6 +244,11 @@ namespace cms::alpakatools {
       cachedBytes_.live -= block.bytes;
       cachedBytes_.requested -= block.requested;
 
+      if (auto* monitor = cachingAllocatorMonitor()) [[unlikely]] {
+        monitor->onFree(monitorDevice(), ptr, block.bytes, monitorQueue(block));
+        monitor->onUsage(monitorDevice(), cachedBytes_.live, cachedBytes_.free, cachedBytes_.requested);
+      }
+
       bool recache = (cachedBytes_.free + block.bytes <= maxCachedBytes_);
       if (recache) {
         // If enqueuing the event fails, very likely an error has
@@ -333,6 +346,24 @@ namespace cms::alpakatools {
     };
 
   private:
+    // identifiers passed to a registered CachingAllocatorMonitor
+    int monitorDevice() const noexcept {
+      if constexpr (std::is_same_v<Device, alpaka::DevCpu>) {
+        return 0;  // the host platform has exactly one device
+      } else if constexpr (requires(Device const& d) { alpaka::getNativeHandle(d); }) {
+        return static_cast<int>(alpaka::getNativeHandle(device_));
+      } else {
+        return -1;  // a backend without a native device handle
+      }
+    }
+
+    unsigned long long monitorQueue(BlockDescriptor const& block) const noexcept {
+      if (not block.queue) {
+        return 0;
+      }
+      return reinterpret_cast<unsigned long long>(block.queue->m_spQueueImpl.get());
+    }
+
     // return the maximum amount of memory that should be cached on this device
     size_t cacheSize(size_t maxCachedBytes, double maxCachedFraction) const {
       // note that getMemBytes() returns 0 if the platform does not support querying the device memory
diff --git a/HeterogeneousCore/AlpakaInterface/interface/CachingAllocatorMonitor.h b/HeterogeneousCore/AlpakaInterface/interface/CachingAllocatorMonitor.h
new file mode 100644
index 0000000000000..7f42f9c1648df
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/interface/CachingAllocatorMonitor.h
@@ -0,0 +1,60 @@
+// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026
+#ifndef HeterogeneousCore_AlpakaInterface_interface_CachingAllocatorMonitor_h
+#define HeterogeneousCore_AlpakaInterface_interface_CachingAllocatorMonitor_h
+
+#include <atomic>
+#include <cstddef>
+
+namespace cms::alpakatools {
+
+  // Optional, process-wide hook to observe caching-allocator transactions.
+  //
+  // Kept free of any alpaka or perfetto dependency so that the allocator (which
+  // calls it on the hot path) stays cheap -- when no monitor is registered the
+  // cost is a single load of an atomic pointer (a read-shared global, hence a
+  // plain MOV on x86) plus a predicted-not-taken branch -- and so that an external
+  // profiler can implement it without pulling alpaka headers. A registered monitor must
+  // outlive all allocator use and its callbacks must be thread-safe: they are
+  // invoked from whatever TBB worker (or async/GPU callback thread) runs the
+  // owning module, while the allocator's internal mutex is held.
+  class CachingAllocatorMonitor {
+  public:
+    virtual ~CachingAllocatorMonitor() = default;
+
+    // A block was handed out. |bytes| is the bin-rounded size, |requested| the
+    // user size, |cacheHit| is true when a cached block was reused (no new
+    // device allocation), |queue| identifies the associated backend queue/stream
+    // and |device| is the native device handle (e.g. CUDA ordinal).
+    virtual void onAllocate(int device,
+                            const void* ptr,
+                            std::size_t bytes,
+                            std::size_t requested,
+                            bool cacheHit,
+                            unsigned long long queue) noexcept {}
+
+    // A block was returned to the allocator. Note the memory may still be in use
+    // by asynchronous device work; the allocator records an event on |queue| and
+    // only re-hands the block once that event completes (possibly to another
+    // thread/stream) -- this is the "asynchronous transaction".
+    virtual void onFree(int device, const void* ptr, std::size_t bytes, unsigned long long queue) noexcept {}
+
+    // Running byte totals for |device| right after a transaction.
+    virtual void onUsage(int device, std::size_t live, std::size_t cached, std::size_t requested) noexcept {}
+  };
+
+  inline std::atomic<CachingAllocatorMonitor*>& cachingAllocatorMonitorRef() noexcept {
+    static std::atomic<CachingAllocatorMonitor*> instance{nullptr};
+    return instance;
+  }
+
+  inline void setCachingAllocatorMonitor(CachingAllocatorMonitor* monitor) noexcept {
+    cachingAllocatorMonitorRef().store(monitor, std::memory_order_release);
+  }
+
+  inline CachingAllocatorMonitor* cachingAllocatorMonitor() noexcept {
+    return cachingAllocatorMonitorRef().load(std::memory_order_acquire);
+  }
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_CachingAllocatorMonitor_h
diff --git a/PerfTools/Perfetto/BuildFile.xml b/PerfTools/Perfetto/BuildFile.xml
new file mode 100644
index 0000000000000..5da06a8087a94
--- /dev/null
+++ b/PerfTools/Perfetto/BuildFile.xml
@@ -0,0 +1,6 @@
+<use name="FWCore/Utilities"/>
+<use name="perfetto"/>
+
+<export>
+  <lib name="1"/>
+</export>
diff --git a/PerfTools/Perfetto/README.md b/PerfTools/Perfetto/README.md
new file mode 100644
index 0000000000000..0919beba6d126
--- /dev/null
+++ b/PerfTools/Perfetto/README.md
@@ -0,0 +1,251 @@
+# PerfTools/Perfetto
+
+In-process [Perfetto](https://perfetto.dev) tracing for `cmsRun`. The
+`PerfettoTraceService` records a `.pftrace` file that can be opened directly at
+<https://ui.perfetto.dev>, giving a per-stream timeline of the framework phases
+(source read, event lifetime, module execution, `acquire`, EventSetup, cleanup).
+
+## Contents
+
+The Perfetto SDK comes from the `perfetto` CMSSW external (`<use name="perfetto"/>`,
+`#include <perfetto.h>`), not vendored here.
+
+- `interface/CMSSWPerfettoCategories.h` — the `cmssw.*` track-event categories.
+- `interface/CMSSWPerfettoModuleContext.{h,cc}` — thread-local "current module",
+  set by the service so the allocator/GPU layers can attribute their work.
+- `interface/CMSSWPerfettoTrace.h` — `CMS_PERFETTO_FUNC()` / `CMS_PERFETTO_SCOPE()`
+  scoped-slice macros for optional intra-module instrumentation.
+- `interface/PerfettoAllocatorMonitor.h` — caching-allocator → Perfetto bridge.
+- `plugins/PerfettoCuptiProfiler.h` — CUPTI → Perfetto GPU kernel tracing.
+- `plugins/PerfettoPowerSampler.h` — CPU (RAPL) + GPU (NVML) power sampling.
+- `plugins/PerfettoTraceService.cc` — the EDM service.
+- `python/customisePerfetto.py` — `cmsDriver.py --customise` helper.
+- `scripts/perfettoKernelResources.py` — static (ptxas/cuobjdump) kernel resource dump.
+
+## Track layout
+
+CMSSW runs one global TBB arena; within a *single* stream and a *single* event,
+independent modules execute concurrently on different threads, and an ExternalWork
+module's `acquire()`/`produce()` run on different threads. A single per-stream
+timeline therefore cannot hold module slices without overlaps. So each slice goes
+on a **per-(stream, thread) lane** that hangs under the stream:
+
+```
+process "cmsRun"
+  └─ edm::stream <sid>                  (one per stream; events are serialized here)
+        ├─ "Event" slice + run/lumi/event counters
+        └─ thread <n>                   (lanes: one per thread that worked on the stream)
+              └─ module / acquire / eventsetup / source / cleanup slices
+                 (+ alloc/free instants and CMS_PERFETTO_FUNC slices nested under them)
+```
+
+- **Module / acquire / EventSetup / source / cleanup** slices are emitted on a lane
+  keyed by `(stream, executing thread)`, parented to the stream track. Because every
+  lane is fed by exactly one thread, its begin/end events arrive in order and nest
+  correctly; modules of one stream running concurrently on different threads simply
+  show up as parallel lanes under that stream. The primary thread carries most of a
+  stream's work; brief lanes are tasks TBB ran on other threads (work-stealing).
+- The **per-stream `Event` track** itself holds the serialized event lifetime
+  (`preEvent`..`postClearEvent`) plus the run/lumi/event counters.
+
+The service also publishes a thread-local `cms::perfetto::ModuleContext` around
+every module call, so the caching-allocator monitor and the GPU/CUPTI layer can
+attribute their work to the responsible module.
+
+## Usage
+
+With `cmsDriver.py`:
+
+```bash
+cmsDriver.py step3 -s RAW2DIGI,RECO ... \
+  --customise PerfTools/Perfetto/customisePerfetto.customise
+```
+
+Or directly in a config:
+
+```python
+from PerfTools.Perfetto.customisePerfetto import customisePerfetto
+customisePerfetto(process, fileName="reco.pftrace")
+```
+
+Or the bare service:
+
+```python
+process.add_(cms.Service("PerfettoTraceService",
+                         fileName=cms.untracked.string("reco.pftrace")))
+```
+
+### Parameters (all untracked)
+
+| parameter        | default          | meaning                                                        |
+|------------------|------------------|----------------------------------------------------------------|
+| `enabled`        | `True`           | master switch                                                  |
+| `fileName`       | `cmsrun.pftrace` | output trace file                                              |
+| `bufferSizeKB`   | `262144`         | in-process trace buffer size (KB)                              |
+| `maxEvents`      | `200`            | stop opening new event slices after N events (`0` = unlimited) |
+| `traceFunctions` | `False`          | enable tier-B per-function slices                              |
+| `traceAllocations` | `False`        | trace Alpaka caching-allocator alloc/free + device-memory counters |
+| `traceGpuKernels` | `False`         | trace CUDA kernels (real device timing + registers/occupancy) via CUPTI |
+| `tracePower`     | `False`          | sample CPU (RAPL) + GPU (NVML) power as counter tracks |
+| `powerPeriodMs`  | `1000`           | power sampling period in ms (when `tracePower`) |
+| `traceModules`   | `[]`             | if non-empty, only trace these module labels (focused, low overhead) |
+
+A global **`Throughput (events/s)`** counter is always emitted -- a sliding-window
+event rate (over the last 16 completed events) that shows the job ramping up to
+steady state. With `tracePower=True`, a background thread samples **`CPU pkg<n>
+power (W)`** (Intel RAPL, `/sys/class/powercap`) and **`GPU<d> power (W)`** (NVML,
+loaded via `dlopen`) every `powerPeriodMs` (default 1000 ms -- NVML power queries
+are not free, so polling too fast can perturb the GPU). Both are no-ops where the
+source is unavailable.
+
+### Per-function (tier-B) tracing
+
+Annotate hot code with `CMS_PERFETTO_FUNC()` (uses `__func__`) or
+`CMS_PERFETTO_SCOPE("name")`. The slices are emitted on the calling thread's
+track, nesting under the enclosing module slice, while tracing is active;
+otherwise they are no-ops.
+
+```cpp
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoTrace.h"
+
+void MyProducer::produce(edm::Event& e, edm::EventSetup const&) {
+  CMS_PERFETTO_FUNC();
+  ...
+}
+```
+
+### Caching-allocator tracing (`traceAllocations=True`)
+
+The Alpaka `CachingAllocator` calls an optional `cms::alpakatools::CachingAllocatorMonitor`
+(a process-wide hook, free of any perfetto dependency) on every alloc/free. The
+service registers `PerfettoAllocatorMonitor`, which emits each transaction as an
+INSTANT on the calling thread's track — so it sits under the module slice that
+triggered it and is annotated with that module (from `ModuleContext`), the byte
+size, cache-hit/miss, device and queue — plus per-device `live`/`cached`/`requested`
+byte counters for a device-memory-pressure timeline.
+
+Because a freed block may still be in use by asynchronous device work, the allocator
+only re-hands it once a recorded event completes (possibly to another thread/stream):
+this asynchronous reuse is visible as a later `alloc` with `cache_hit=true` on a
+different thread, against the same device/queue.
+
+### GPU kernel tracing (`traceGpuKernels=True`)
+
+Uses [CUPTI](https://docs.nvidia.com/cupti/) to stream CUDA kernel activity into
+the session. CUPTI is a driver-level profiler, so it captures kernels from the
+already-built Alpaka/CUDA plugins without rebuilding them, at low overhead and
+without serializing kernels. Each kernel is a slice on a per-(device, CUDA stream)
+track, placed at its **device-side** start/end — i.e. the *real* GPU execution
+time, not the host enqueue/wait — annotated with `registers_per_thread`,
+`static_smem_B`, `dynamic_smem_B`, `local_per_thread_B` / `local_total_B` (spills,
+per thread and for the whole launch), `grid`, `block`,
+an estimated `occupancy_est`, the full kernel name, and the CUPTI `correlation_id`
+that links it back to the host module that launched it. GPU timestamps are
+converted to `CLOCK_BOOTTIME` so they line up with the host timeline.
+
+The compile-time counterpart is `scripts/perfettoKernelResources.py`, which runs
+`cuobjdump --dump-resource-usage` on the built `*PortableCudaAsync.so` libraries to
+print the per-kernel registers / shared / stack / spill / constant usage (the
+ptxas numbers), e.g.:
+
+```bash
+perfettoKernelResources.py --filter Phase2 \
+  $CMSSW_RELEASE_BASE/lib/$SCRAM_ARCH/pluginRecoLocalTrackerSiPixelClusterizerPluginsPortableCudaAsync.so
+```
+
+## Threading model and nested parallelism
+
+Slices are emitted on the **thread** that does the work, because within one stream
+and one event CMSSW runs independent modules concurrently on different threads (and
+an ExternalWork module's `acquire()`/`produce()` run on different threads). A few
+consequences worth knowing when reading a trace:
+
+- A module's slice spans its `produce()` **wall-clock on that thread**, including
+  any blocking `tbb::parallel_for` it calls. The parallel iterations that run on
+  *other* threads show up on *those* threads' tracks, not nested under the module —
+  so you can see the fan-out, but it is not visually grouped under the module.
+- **`tbb::parallel_for` inside `produce()`**: while the calling thread is blocked in
+  the `parallel_for`, TBB may run an unrelated module's task on it (work-stealing),
+  which then appears *nested inside* your module's slice on that thread. This is a
+  faithful picture of what the thread did, but it means a slice's duration can
+  include stolen, unrelated work. The thread-local "current module" used for
+  allocator/GPU attribution is a **stack**, so it is restored correctly when the
+  stolen work returns.
+- The helper threads running the `parallel_for` body do **not** automatically carry
+  the module context, so allocations / `CMS_PERFETTO_FUNC` slices made there are not
+  attributed to the module. Wrap the body to propagate it:
+
+  ```cpp
+  #include "PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h"
+  tbb::parallel_for(range, cms::perfetto::withModuleContext([&](auto const& r) {
+    CMS_PERFETTO_FUNC();           // now attributed to the enclosing module
+    ...
+  }));
+  ```
+
+## A profiling guide for reconstruction developers
+
+Goal: understand and speed up *one* algorithm (a module and the GPU work it drives).
+
+1. **Focus the trace on your module** (much lower overhead, far less noise):
+
+   ```bash
+   cmsRun step3.py   # add via --customise_commands or customisePerfetto(...)
+   ```
+   ```python
+   from PerfTools.Perfetto.customisePerfetto import customisePerfetto
+   customisePerfetto(process,
+                     fileName="myalgo.pftrace",
+                     traceModules=["myProducerAlpaka"],   # only this module
+                     traceGpuKernels=True,                # its CUDA kernels
+                     traceAllocations=True)               # its device memory
+   ```
+
+2. **Open `myalgo.pftrace` at <https://ui.perfetto.dev>** and read it top-down:
+   - **Per-thread tracks** — find your module's slice. Its width is the host-side
+     time (enqueue for an async GPU module; the actual compute for a CPU module).
+     If you added `CMS_PERFETTO_FUNC()`/`CMS_PERFETTO_SCOPE()`, the sub-steps nest
+     underneath.
+   - **`GPU<d> stream <s>` tracks** — the real kernel execution. Click a kernel to
+     see `registers_per_thread`, `occupancy_est`, `grid`/`block`, shared/local
+     memory. Use the `correlation_id` to match a kernel to the host launch.
+   - **`Event` tracks** — per-stream event boundaries + run/lumi/event counters.
+   - **`dev<d> live/cached/requested (B)` counters** — the device-memory timeline.
+
+3. **What to look for:**
+   - *Kernel time vs host time*: an async GPU module's host slice is tiny while the
+     real cost is on the GPU track — optimize the kernel, not the host call.
+   - *Occupancy*: a low `occupancy_est` driven by a high `registers_per_thread` (or
+     large shared memory) means the kernel is occupancy-limited; cross-check the
+     static numbers with `scripts/perfettoKernelResources.py` (it also shows
+     register *spills*, `local_spill > 0`).
+   - *Memory churn*: `alloc` events with `cache_hit=false` are real `cudaMalloc`s
+     (expensive); a sawtooth in the `live` counter means repeated alloc/free that
+     the cache is not absorbing — consider reusing buffers or sizing the cache. A
+     low cache-hit rate early in a job is normal (cold cache); it should rise as
+     blocks are reused.
+   - *Allocation overhead*: the gap between the `live` and `requested` counters is
+     the power-of-two bin rounding (often tens of %); a few oversized buffers from
+     one module usually dominate the peak — group `alloc` events by `module` to
+     find them.
+   - *Serialization*: the caching allocator takes one global lock per device; if
+     `alloc`/`free` instants from many threads line up back-to-back, that lock may
+     be a contention point.
+   - *Gaps*: an `acquire` slice on one thread, a long gap (GPU + the `edm async pool`
+     wait), then `produce` on another thread is the normal ExternalWork pattern; the
+     gap is the GPU doing work, visible on the GPU track.
+
+## Overhead
+
+The cost is opt-in and proportional to what you enable:
+
+- Disabled categories cost a predicated load; the `enabled`/`IsEnabled` guard makes
+  every callback an early return.
+- `traceModules=[...]` is the main overhead lever: only selected modules emit
+  slices, so a focused run on one algorithm avoids the hundreds of per-module
+  events of a full event. The module-context stack push/pop is allocation-free.
+- `traceAllocations`: when off, the allocator hook is a single relaxed atomic load
+  per alloc/free; when on, one instant + a few counters per transaction.
+- `traceGpuKernels`: CUPTI activity tracing is asynchronous and does not serialize
+  kernels; it is the cheapest of the three relative to the information it adds.
+- `maxEvents` caps the trace size (and thus buffer pressure) regardless of job length.
diff --git a/PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h b/PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h
new file mode 100644
index 0000000000000..087cf89bffa9d
--- /dev/null
+++ b/PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h
@@ -0,0 +1,17 @@
+// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026
+#ifndef CMSSW_PERFETTO_CATEGORIES_H
+#define CMSSW_PERFETTO_CATEGORIES_H
+#include <perfetto.h>
+
+PERFETTO_DEFINE_CATEGORIES(perfetto::Category("cmssw.event"),
+                           perfetto::Category("cmssw.source"),
+                           perfetto::Category("cmssw.module"),
+                           perfetto::Category("cmssw.acquire"),
+                           perfetto::Category("cmssw.cleanup"),
+                           perfetto::Category("cmssw.es"),
+                           perfetto::Category("cmssw.func"),
+                           perfetto::Category("cmssw.alloc"),
+                           perfetto::Category("cmssw.gpu"),
+                           perfetto::Category("cmssw.power"));
+
+#endif
diff --git a/PerfTools/Perfetto/interface/CMSSWPerfettoLanes.h b/PerfTools/Perfetto/interface/CMSSWPerfettoLanes.h
new file mode 100644
index 0000000000000..a87c16a281831
--- /dev/null
+++ b/PerfTools/Perfetto/interface/CMSSWPerfettoLanes.h
@@ -0,0 +1,59 @@
+// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026
+#ifndef PerfTools_Perfetto_interface_CMSSWPerfettoLanes_h
+#define PerfTools_Perfetto_interface_CMSSWPerfettoLanes_h
+
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h"
+#include <perfetto.h>
+
+#include <atomic>
+#include <cstdint>
+#include <set>
+#include <string>
+
+// Per-stream "lane" tracks.
+//
+// CMSSW serializes events within a stream but runs the modules of one event as
+// concurrent TBB tasks on different threads (and an ExternalWork module's
+// acquire()/produce() run on different threads). To show that work *under* its
+// edm::stream -- instead of scattered across process-wide thread tracks -- each
+// slice is emitted on a lane keyed by (stream, executing thread), parented to the
+// stream track. Because every lane is therefore fed by exactly one thread, the
+// begin/end events on it arrive in order and nest correctly; concurrent work in
+// the same stream simply lands on separate lanes shown side by side.
+namespace cms::perfetto {
+
+  inline constexpr uint64_t kStreamBase = 0x5354524D00000000ull;  // "STRM...."
+  inline constexpr uint64_t kLaneBase = 0x4C414E4500000000ull;    // "LANE...."
+
+  inline uint64_t streamUuid(unsigned sid) noexcept { return kStreamBase | (uint64_t{sid} << 16); }
+
+  inline ::perfetto::Track streamTrack(unsigned sid) {
+    return ::perfetto::Track(streamUuid(sid), ::perfetto::ProcessTrack::Current());
+  }
+
+  // A small, stable ordinal for the calling OS/TBB thread (assigned on first use).
+  inline unsigned threadOrdinal() noexcept {
+    static std::atomic<unsigned> next{0};
+    static thread_local unsigned ord = next.fetch_add(1, std::memory_order_relaxed);
+    return ord;
+  }
+
+  // The lane for the current thread within stream |sid|, as a child of the stream
+  // track. The lane uuid embeds the thread ordinal, so it is owned by a single
+  // thread -- the descriptor can be written from a thread-local guard, no lock.
+  inline ::perfetto::Track laneTrack(unsigned sid) {
+    unsigned const ord = threadOrdinal();
+    uint64_t const uuid = kLaneBase | (uint64_t{sid} << 24) | ord;
+    static thread_local std::set<unsigned> described;  // streams this thread has named a lane for
+    if (described.insert(sid).second) {
+      ::perfetto::Track t(uuid, streamTrack(sid));
+      auto d = t.Serialize();
+      d.set_name("thread " + std::to_string(ord));
+      ::perfetto::TrackEvent::SetTrackDescriptor(t, d);
+    }
+    return ::perfetto::Track(uuid, streamTrack(sid));
+  }
+
+}  // namespace cms::perfetto
+
+#endif  // PerfTools_Perfetto_interface_CMSSWPerfettoLanes_h
diff --git a/PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h b/PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h
new file mode 100644
index 0000000000000..231ed5080fed4
--- /dev/null
+++ b/PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h
@@ -0,0 +1,70 @@
+// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026
+#ifndef PerfTools_Perfetto_interface_CMSSWPerfettoModuleContext_h
+#define PerfTools_Perfetto_interface_CMSSWPerfettoModuleContext_h
+
+#include <type_traits>
+#include <utility>
+
+namespace cms::perfetto {
+
+  // Thread-local record of which CMSSW module (if any) is currently executing on
+  // this thread. PerfettoTraceService pushes it around every module/acquire call,
+  // so lower-level instrumentation that runs *inside* module code -- the
+  // caching-allocator monitor and the GPU/CUPTI layer -- can attribute its work
+  // to the responsible module without being passed any context.
+  //
+  // It is a *stack*, not a single slot, on purpose: TBB work-stealing can run
+  // another module's produce() on this same thread while the current one is
+  // blocked in a tbb::parallel_for (or similar), nested inside it. A stack
+  // restores the enclosing module's context when that nested work returns.
+  //
+  // The char pointers reference the module's ModuleDescription, which outlives
+  // the call, so they stay valid while the context is on the stack.
+  struct ModuleContext {
+    const char* label = nullptr;
+    const char* type = nullptr;
+    unsigned moduleId = 0;
+    unsigned streamId = 0xffffffffu;
+    unsigned long long eventId = 0;
+    bool active = false;
+  };
+
+  void pushModuleContext(ModuleContext const& ctx) noexcept;
+  void popModuleContext() noexcept;
+  void resetModuleContext() noexcept;  // empty the stack (defensive, at boundaries)
+  ModuleContext const& currentModuleContext() noexcept;
+
+  // ---- Propagating the context across nested parallelism --------------------
+  //
+  // The context is thread-local and is pushed only on the thread that runs a
+  // module's produce()/acquire(). If that code spawns work on *other* threads
+  // (a tbb::parallel_for, edm::Async, ...), those helper threads do NOT see the
+  // context, so allocations and tier-B slices they make are not attributed to
+  // the module. Capture the context and re-apply it on the helper thread.
+
+  // RAII: push a captured context for a scope, pop it on exit. Nests correctly.
+  class ModuleContextGuard {
+  public:
+    explicit ModuleContextGuard(ModuleContext const& ctx) noexcept { pushModuleContext(ctx); }
+    ~ModuleContextGuard() noexcept { popModuleContext(); }
+    ModuleContextGuard(ModuleContextGuard const&) = delete;
+    ModuleContextGuard& operator=(ModuleContextGuard const&) = delete;
+  };
+
+  // Wrap a callable so it runs with the *current* module context applied. Use it
+  // as the body of a tbb::parallel_for/parallel_reduce (or anything that runs on
+  // borrowed threads) so allocations and CMS_PERFETTO_FUNC slices made inside are
+  // still attributed to the enclosing module:
+  //
+  //   tbb::parallel_for(range, cms::perfetto::withModuleContext([&](auto const& r){ ... }));
+  template <class F>
+  auto withModuleContext(F&& f) {
+    return [ctx = currentModuleContext(), fn = std::decay_t<F>(std::forward<F>(f))](auto&&... args) mutable {
+      ModuleContextGuard guard(ctx);
+      return fn(std::forward<decltype(args)>(args)...);
+    };
+  }
+
+}  // namespace cms::perfetto
+
+#endif  // PerfTools_Perfetto_interface_CMSSWPerfettoModuleContext_h
diff --git a/PerfTools/Perfetto/interface/CMSSWPerfettoTrace.h b/PerfTools/Perfetto/interface/CMSSWPerfettoTrace.h
new file mode 100644
index 0000000000000..02face0425592
--- /dev/null
+++ b/PerfTools/Perfetto/interface/CMSSWPerfettoTrace.h
@@ -0,0 +1,58 @@
+// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026
+#ifndef PerfTools_Perfetto_interface_CMSSWPerfettoTrace_h
+#define PerfTools_Perfetto_interface_CMSSWPerfettoTrace_h
+
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h"
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoLanes.h"
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h"
+#include <perfetto.h>
+
+#include <optional>
+
+// Tier-B, per-function instrumentation. The slice is emitted on the lane of the
+// module currently executing on this thread (from cms::perfetto::ModuleContext),
+// so it nests under that module's slice; outside any module it falls back to the
+// calling thread's own track. Because a SliceScope lives entirely on one thread,
+// its captured begin/end track always match.
+namespace cms::perfetto_trace {
+
+  struct SliceScope {
+    explicit SliceScope(const char* name) noexcept {
+      if (!::perfetto::TrackEvent::IsEnabled())
+        return;
+      auto const& m = cms::perfetto::currentModuleContext();
+      if (m.active && m.streamId != 0xffffffffu) {
+        track_.emplace(cms::perfetto::laneTrack(m.streamId));
+        TRACE_EVENT_BEGIN("cmssw.func", ::perfetto::DynamicString(name), *track_);
+      } else {
+        TRACE_EVENT_BEGIN("cmssw.func", ::perfetto::DynamicString(name));
+      }
+      active_ = true;
+    }
+
+    ~SliceScope() noexcept {
+      if (!active_)
+        return;
+      if (track_)
+        TRACE_EVENT_END("cmssw.func", *track_);
+      else
+        TRACE_EVENT_END("cmssw.func");
+    }
+
+    SliceScope(SliceScope const&) = delete;
+    SliceScope& operator=(SliceScope const&) = delete;
+
+  private:
+    std::optional<::perfetto::Track> track_;
+    bool active_ = false;
+  };
+
+}  // namespace cms::perfetto_trace
+
+#define CMS_PERFETTO_FUNC() \
+  cms::perfetto_trace::SliceScope PERFETTO_UID(_cms_perfetto_func_) { __func__ }
+
+#define CMS_PERFETTO_SCOPE(name_literal) \
+  cms::perfetto_trace::SliceScope PERFETTO_UID(_cms_perfetto_scope_) { name_literal }
+
+#endif  // PerfTools_Perfetto_interface_CMSSWPerfettoTrace_h
diff --git a/PerfTools/Perfetto/interface/PerfettoAllocatorMonitor.h b/PerfTools/Perfetto/interface/PerfettoAllocatorMonitor.h
new file mode 100644
index 0000000000000..8a16e9ecc70e0
--- /dev/null
+++ b/PerfTools/Perfetto/interface/PerfettoAllocatorMonitor.h
@@ -0,0 +1,104 @@
+// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026
+#ifndef PerfTools_Perfetto_interface_PerfettoAllocatorMonitor_h
+#define PerfTools_Perfetto_interface_PerfettoAllocatorMonitor_h
+
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h"
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoLanes.h"
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h"
+#include <perfetto.h>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/CachingAllocatorMonitor.h"
+
+#include <array>
+#include <cstdint>
+#include <string>
+
+namespace cms::perfetto {
+
+  // Turns Alpaka caching-allocator transactions into Perfetto events. Each
+  // alloc/free is emitted as an INSTANT on the calling thread's track -- so it
+  // sits, visually and by annotation, under the module slice that triggered it --
+  // and the live / cached / requested byte totals are emitted as per-device
+  // counters, giving a device-memory-pressure timeline. Attribution to the
+  // responsible module comes from the thread-local cms::perfetto::ModuleContext.
+  class PerfettoAllocatorMonitor : public cms::alpakatools::CachingAllocatorMonitor {
+  public:
+    PerfettoAllocatorMonitor() {
+      for (int d = 0; d <= kMaxDevice; ++d) {
+        std::string const tag = (d == kMaxDevice) ? std::string("host") : ("dev" + std::to_string(d));
+        names_[d][0] = tag + " live (B)";
+        names_[d][1] = tag + " cached (B)";
+        names_[d][2] = tag + " requested (B)";
+      }
+    }
+
+    void onAllocate(int device,
+                    const void* /*ptr*/,
+                    std::size_t bytes,
+                    std::size_t requested,
+                    bool cacheHit,
+                    unsigned long long queue) noexcept override {
+      if (!::perfetto::TrackEvent::IsEnabled())
+        return;
+      auto const& m = cms::perfetto::currentModuleContext();
+      TRACE_EVENT_INSTANT("cmssw.alloc",
+                          "alloc",
+                          trackFor(m),
+                          "module",
+                          ::perfetto::DynamicString(m.active && m.label ? m.label : "(none)"),
+                          "bytes",
+                          static_cast<uint64_t>(bytes),
+                          "requested",
+                          static_cast<uint64_t>(requested),
+                          "cache_hit",
+                          cacheHit,
+                          "device",
+                          device,
+                          "queue",
+                          queue);
+    }
+
+    void onFree(int device, const void* /*ptr*/, std::size_t bytes, unsigned long long queue) noexcept override {
+      if (!::perfetto::TrackEvent::IsEnabled())
+        return;
+      auto const& m = cms::perfetto::currentModuleContext();
+      TRACE_EVENT_INSTANT("cmssw.alloc",
+                          "free",
+                          trackFor(m),
+                          "module",
+                          ::perfetto::DynamicString(m.active && m.label ? m.label : "(none)"),
+                          "bytes",
+                          static_cast<uint64_t>(bytes),
+                          "device",
+                          device,
+                          "queue",
+                          queue);
+    }
+
+    void onUsage(int device, std::size_t live, std::size_t cached, std::size_t requested) noexcept override {
+      if (!::perfetto::TrackEvent::IsEnabled())
+        return;
+      int const d = (device < 0 || device >= kMaxDevice) ? kMaxDevice : device;
+      TRACE_COUNTER("cmssw.alloc", ::perfetto::CounterTrack(::perfetto::DynamicString{names_[d][0].c_str()}), live);
+      TRACE_COUNTER("cmssw.alloc", ::perfetto::CounterTrack(::perfetto::DynamicString{names_[d][1].c_str()}), cached);
+      TRACE_COUNTER(
+          "cmssw.alloc", ::perfetto::CounterTrack(::perfetto::DynamicString{names_[d][2].c_str()}), requested);
+    }
+
+  private:
+    // The alloc/free instant goes on the lane of the module that triggered it (so it
+    // sits under that module's slice). Outside any traced module it falls back to the
+    // calling thread's own track.
+    static ::perfetto::Track trackFor(cms::perfetto::ModuleContext const& m) {
+      if (m.active && m.streamId != 0xffffffffu)
+        return cms::perfetto::laneTrack(m.streamId);
+      return ::perfetto::ThreadTrack::Current();
+    }
+
+    static constexpr int kMaxDevice = 16;  // index kMaxDevice == host
+    std::array<std::array<std::string, 3>, kMaxDevice + 1> names_;
+  };
+
+}  // namespace cms::perfetto
+
+#endif  // PerfTools_Perfetto_interface_PerfettoAllocatorMonitor_h
diff --git a/PerfTools/Perfetto/plugins/BuildFile.xml b/PerfTools/Perfetto/plugins/BuildFile.xml
new file mode 100644
index 0000000000000..dbaa5faf578f4
--- /dev/null
+++ b/PerfTools/Perfetto/plugins/BuildFile.xml
@@ -0,0 +1,13 @@
+<use name="PerfTools/Perfetto"/>
+<use name="FWCore/Framework"/>
+<use name="FWCore/ServiceRegistry"/>
+<use name="FWCore/ParameterSet"/>
+<use name="FWCore/Utilities"/>
+<use name="HeterogeneousCore/AlpakaInterface"/>
+<use name="sockets"/>
+<iftool name="cuda">
+  <use name="cuda"/>
+  <use name="cupti"/>
+  <flags CPPDEFINES="PERFETTO_HAS_CUPTI"/>
+</iftool>
+<flags EDM_PLUGIN="1"/>
diff --git a/PerfTools/Perfetto/plugins/PerfettoCuptiProfiler.h b/PerfTools/Perfetto/plugins/PerfettoCuptiProfiler.h
new file mode 100644
index 0000000000000..9c0a3c9233523
--- /dev/null
+++ b/PerfTools/Perfetto/plugins/PerfettoCuptiProfiler.h
@@ -0,0 +1,233 @@
+// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026
+#ifndef PerfTools_Perfetto_plugins_PerfettoCuptiProfiler_h
+#define PerfTools_Perfetto_plugins_PerfettoCuptiProfiler_h
+
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h"
+#include <perfetto.h>
+
+// CUDA/CUPTI exist only on some architectures, so the plugin BuildFile pulls in
+// cuda/cupti and defines PERFETTO_HAS_CUPTI only inside <iftool name="cuda">.
+// Build the real profiler when it is set; otherwise provide a no-op stub below so
+// PerfettoTraceService compiles unchanged (traceGpuKernels then does nothing).
+#ifdef PERFETTO_HAS_CUPTI
+
+#include <cuda_runtime.h>
+#include <cupti.h>
+
+#include <cxxabi.h>
+#include <time.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <mutex>
+#include <set>
+#include <string>
+#include <vector>
+
+// Streams CUDA kernel activity into the running Perfetto session using CUPTI.
+//
+// Each kernel is emitted as a slice on a per-(device, CUDA stream) track, placed
+// at the *device-side* start/end timestamps reported by CUPTI -- i.e. the real
+// GPU execution time, not the host enqueue/wait time. The slice is annotated with
+// the per-launch static resource usage CUPTI reports (registers/thread, static &
+// dynamic shared memory, local memory/thread), the launch configuration
+// (grid/block), an estimated theoretical occupancy, and the CUPTI correlation id
+// (which links it to the host module that launched it).
+//
+// CUPTI is a driver-level profiler, so this works on the already-compiled release
+// Alpaka/CUDA plugins without rebuilding them. The activity-tracing path is low
+// overhead and does not serialize kernels.
+namespace cms::perfetto {
+
+  class PerfettoCuptiProfiler {
+  public:
+    // Returns true if CUPTI kernel tracing was activated (a CUDA device is present
+    // and CUPTI accepted the configuration).
+    bool start() {
+      int count = 0;
+      if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0)
+        return false;
+
+      // Cache per-device properties for the occupancy estimate.
+      props_.resize(count);
+      for (int d = 0; d < count; ++d)
+        cudaGetDeviceProperties(&props_[d], d);
+
+      // Correlate the CUPTI clock to CLOCK_BOOTTIME (Perfetto's default trace
+      // clock), so GPU slices line up with the host timeline.
+      uint64_t cuptiNow = 0;
+      cuptiGetTimestamp(&cuptiNow);
+      timespec ts{};
+      clock_gettime(CLOCK_BOOTTIME, &ts);
+      int64_t bootNow = int64_t(ts.tv_sec) * 1000000000LL + ts.tv_nsec;
+      offsetNs_ = bootNow - int64_t(cuptiNow);
+
+      s_instance = this;
+      if (cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted) != CUPTI_SUCCESS)
+        return false;
+      if (cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) != CUPTI_SUCCESS)
+        return false;
+      active_ = true;
+      return true;
+    }
+
+    // Drain remaining activity records into the (still open) Perfetto session.
+    void flush() {
+      if (active_)
+        cuptiActivityFlushAll(1);
+    }
+
+    void stop() {
+      if (active_) {
+        cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL);
+        active_ = false;
+      }
+      s_instance = nullptr;
+    }
+
+  private:
+    static constexpr uint64_t kGpuTrackBase = 0x4750550000000000ull;  // "GPU....."
+
+    static void bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) {
+      constexpr size_t kSize = 8 * 1024 * 1024;
+      *buffer = static_cast<uint8_t*>(std::aligned_alloc(8, kSize));
+      *size = kSize;
+      *maxNumRecords = 0;
+    }
+
+    static void bufferCompleted(CUcontext, uint32_t, uint8_t* buffer, size_t, size_t validSize) {
+      if (s_instance && validSize > 0) {
+        CUpti_Activity* record = nullptr;
+        while (cuptiActivityGetNextRecord(buffer, validSize, &record) == CUPTI_SUCCESS) {
+          if (record->kind == CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)
+            s_instance->handleKernel(reinterpret_cast<CUpti_ActivityKernel9*>(record));
+        }
+      }
+      std::free(buffer);
+    }
+
+    ::perfetto::Track gpuTrack(uint32_t device, uint32_t stream) {
+      uint64_t const uuid = kGpuTrackBase | (uint64_t{device} << 24) | stream;
+      {
+        std::scoped_lock lock(mutex_);
+        if (seen_.insert(uuid).second) {
+          ::perfetto::Track t(uuid, ::perfetto::ProcessTrack::Current());
+          auto d = t.Serialize();
+          d.set_name("GPU" + std::to_string(device) + " stream " + std::to_string(stream));
+          ::perfetto::TrackEvent::SetTrackDescriptor(t, d);
+        }
+      }
+      return ::perfetto::Track(uuid, ::perfetto::ProcessTrack::Current());
+    }
+
+    static std::string shorten(const char* demangled) {
+      std::string s = demangled ? demangled : "";
+      auto p = s.find("gpuKernel<");
+      if (p == std::string::npos)
+        return s.size() > 96 ? s.substr(0, 96) : s;
+      p += 10;
+      int depth = 0;
+      size_t i = p;
+      for (; i < s.size(); ++i) {
+        char c = s[i];
+        if (c == '<')
+          ++depth;
+        else if (c == '>') {
+          if (depth == 0)
+            break;
+          --depth;
+        } else if (c == ',' && depth == 0)
+          break;
+      }
+      return s.substr(p, i - p);
+    }
+
+    double occupancy(int device, uint32_t regsPerThread, uint32_t smem, uint32_t blockThreads) const {
+      if (device < 0 || device >= int(props_.size()) || blockThreads == 0)
+        return 0.;
+      auto const& p = props_[device];
+      int blocksThreads = p.maxThreadsPerMultiProcessor / int(blockThreads);
+      int blocksReg = regsPerThread > 0 ? p.regsPerMultiprocessor / int(regsPerThread * blockThreads) : blocksThreads;
+      int blocksSmem = smem > 0 ? int(p.sharedMemPerMultiprocessor / smem) : blocksThreads;
+      int blocks = std::max(0, std::min({blocksThreads, blocksReg, blocksSmem}));
+      return double(blocks * int(blockThreads)) / double(p.maxThreadsPerMultiProcessor);
+    }
+
+    void handleKernel(CUpti_ActivityKernel9 const* k) {
+      char* demangled = abi::__cxa_demangle(k->name, nullptr, nullptr, nullptr);
+      std::string const name = shorten(demangled ? demangled : k->name);
+      std::string const full = demangled ? demangled : (k->name ? k->name : "");
+      std::free(demangled);
+
+      uint32_t const blockThreads = k->blockX * k->blockY * k->blockZ;
+      uint32_t const smem = k->staticSharedMemory + k->dynamicSharedMemory;
+      double const occ = occupancy(int(k->deviceId), k->registersPerThread, smem, blockThreads);
+      std::string const grid =
+          std::to_string(k->gridX) + "x" + std::to_string(k->gridY) + "x" + std::to_string(k->gridZ);
+      std::string const block =
+          std::to_string(k->blockX) + "x" + std::to_string(k->blockY) + "x" + std::to_string(k->blockZ);
+
+      auto track = gpuTrack(k->deviceId, k->streamId);
+      ::perfetto::TraceTimestamp const tsBegin{6, uint64_t(int64_t(k->start) + offsetNs_)};  // 6 == BOOTTIME
+      ::perfetto::TraceTimestamp const tsEnd{6, uint64_t(int64_t(k->end) + offsetNs_)};
+
+      TRACE_EVENT_BEGIN("cmssw.gpu",
+                        ::perfetto::DynamicString{name.c_str()},
+                        track,
+                        tsBegin,
+                        "registers_per_thread",
+                        k->registersPerThread,
+                        "static_smem_B",
+                        k->staticSharedMemory,
+                        "dynamic_smem_B",
+                        k->dynamicSharedMemory,
+                        "local_per_thread_B",
+                        k->localMemoryPerThread,
+                        "local_total_B",
+                        k->localMemoryTotal,
+                        "grid",
+                        ::perfetto::DynamicString{grid.c_str()},
+                        "block",
+                        ::perfetto::DynamicString{block.c_str()},
+                        "occupancy_est",
+                        occ,
+                        "correlation_id",
+                        k->correlationId,
+                        "kernel",
+                        ::perfetto::DynamicString{full.c_str()});
+      TRACE_EVENT_END("cmssw.gpu", track, tsEnd);
+    }
+
+    bool active_ = false;
+    int64_t offsetNs_ = 0;
+    std::vector<cudaDeviceProp> props_;
+    std::mutex mutex_;
+    std::set<uint64_t> seen_;
+
+    static PerfettoCuptiProfiler* s_instance;
+  };
+
+  inline PerfettoCuptiProfiler* PerfettoCuptiProfiler::s_instance = nullptr;
+
+}  // namespace cms::perfetto
+
+#else  // PERFETTO_HAS_CUPTI
+
+namespace cms::perfetto {
+
+  // No CUDA/CUPTI on this architecture: a no-op stub so that the rest of the
+  // service (lanes, counters, allocator, power) builds and runs normally.
+  class PerfettoCuptiProfiler {
+  public:
+    bool start() { return false; }
+    void flush() {}
+    void stop() {}
+  };
+
+}  // namespace cms::perfetto
+
+#endif  // PERFETTO_HAS_CUPTI
+
+#endif  // PerfTools_Perfetto_plugins_PerfettoCuptiProfiler_h
diff --git a/PerfTools/Perfetto/plugins/PerfettoPowerSampler.h b/PerfTools/Perfetto/plugins/PerfettoPowerSampler.h
new file mode 100644
index 0000000000000..f8ce9b75217c7
--- /dev/null
+++ b/PerfTools/Perfetto/plugins/PerfettoPowerSampler.h
@@ -0,0 +1,168 @@
+// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026
+#ifndef PerfTools_Perfetto_plugins_PerfettoPowerSampler_h
+#define PerfTools_Perfetto_plugins_PerfettoPowerSampler_h
+
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h"
+#include <perfetto.h>
+
+#include <atomic>
+#include <condition_variable>
+#include <cstdint>
+#include <cstdio>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include <dlfcn.h>
+#include <time.h>
+
+// Samples CPU and GPU power on a background thread and emits them as Perfetto
+// counter tracks: "CPU pkg<n> power (W)" from the Intel RAPL energy counters
+// (/sys/class/powercap/intel-rapl:*), and "GPU<d> power (W)" from NVML
+// (nvmlDeviceGetPowerUsage). NVML is loaded with dlopen so there is no build-time
+// dependency on it; if neither source is available the sampler is a no-op.
+namespace cms::perfetto {
+
+  class PerfettoPowerSampler {
+  public:
+    // Returns true if at least one power source (RAPL or NVML) was found and the
+    // sampling thread was started. periodMs is kept modest by default because NVML
+    // power queries are not free and can perturb the GPU if polled too often.
+    bool start(unsigned periodMs = 1000) {
+      periodMs_ = periodMs;
+      openGpu();
+      openCpu();
+      if (gpus_.empty() && cpus_.empty())
+        return false;
+      running_ = true;
+      thread_ = std::thread([this] { loop(); });
+      return true;
+    }
+
+    void stop() {
+      if (running_.exchange(false)) {
+        cv_.notify_all();
+        if (thread_.joinable())
+          thread_.join();
+      }
+      if (nvml_) {
+        if (nvmlShutdown_)
+          nvmlShutdown_();
+        ::dlclose(nvml_);
+        nvml_ = nullptr;
+      }
+    }
+
+    ~PerfettoPowerSampler() { stop(); }
+
+  private:
+    using FnInt = int (*)();
+    using FnCount = int (*)(unsigned*);
+    using FnHandle = int (*)(unsigned, void**);
+    using FnPower = int (*)(void*, unsigned*);
+
+    static int64_t nowNs() {
+      timespec ts{};
+      clock_gettime(CLOCK_BOOTTIME, &ts);
+      return int64_t(ts.tv_sec) * 1000000000LL + ts.tv_nsec;
+    }
+
+    static bool readU64(std::string const& path, uint64_t& out) {
+      std::FILE* f = std::fopen(path.c_str(), "re");
+      if (!f)
+        return false;
+      bool const ok = std::fscanf(f, "%lu", &out) == 1;
+      std::fclose(f);
+      return ok;
+    }
+
+    void openGpu() {
+      nvml_ = ::dlopen("libnvidia-ml.so.1", RTLD_NOW | RTLD_LOCAL);
+      if (!nvml_)
+        return;
+      nvmlShutdown_ = reinterpret_cast<FnInt>(::dlsym(nvml_, "nvmlShutdown"));
+      auto init = reinterpret_cast<FnInt>(::dlsym(nvml_, "nvmlInit_v2"));
+      auto count = reinterpret_cast<FnCount>(::dlsym(nvml_, "nvmlDeviceGetCount_v2"));
+      auto handle = reinterpret_cast<FnHandle>(::dlsym(nvml_, "nvmlDeviceGetHandleByIndex_v2"));
+      nvmlPower_ = reinterpret_cast<FnPower>(::dlsym(nvml_, "nvmlDeviceGetPowerUsage"));
+      if (!init || !count || !handle || !nvmlPower_ || init() != 0)
+        return;
+      unsigned n = 0;
+      if (count(&n) != 0)
+        return;
+      for (unsigned i = 0; i < n; ++i) {
+        void* h = nullptr;
+        if (handle(i, &h) == 0) {
+          gpus_.push_back(h);
+          gpuNames_.push_back("GPU" + std::to_string(i) + " power (W)");
+        }
+      }
+    }
+
+    void openCpu() {
+      for (int pkg = 0;; ++pkg) {
+        std::string const base = "/sys/class/powercap/intel-rapl:" + std::to_string(pkg);
+        uint64_t energy = 0;
+        if (!readU64(base + "/energy_uj", energy))
+          break;
+        uint64_t range = 0;
+        readU64(base + "/max_energy_range_uj", range);
+        cpus_.push_back({base + "/energy_uj", energy, range ? range : ~uint64_t{0}, nowNs()});
+        cpuNames_.push_back("CPU pkg" + std::to_string(pkg) + " power (W)");
+      }
+    }
+
+    void loop() {
+      while (running_.load()) {
+        for (std::size_t i = 0; i < gpus_.size(); ++i) {
+          unsigned mw = 0;
+          if (nvmlPower_(gpus_[i], &mw) == 0)
+            TRACE_COUNTER(
+                "cmssw.power", ::perfetto::CounterTrack(::perfetto::DynamicString{gpuNames_[i].c_str()}), mw / 1000.0);
+        }
+        for (std::size_t i = 0; i < cpus_.size(); ++i) {
+          uint64_t energy = 0;
+          if (!readU64(cpus_[i].path, energy))
+            continue;
+          int64_t const now = nowNs();
+          uint64_t const dE = (energy >= cpus_[i].lastEnergy) ? (energy - cpus_[i].lastEnergy)
+                                                              : (cpus_[i].range - cpus_[i].lastEnergy + energy);
+          double const dt = double(now - cpus_[i].lastTime) / 1e9;
+          if (dt > 0.)
+            TRACE_COUNTER("cmssw.power",
+                          ::perfetto::CounterTrack(::perfetto::DynamicString{cpuNames_[i].c_str()}),
+                          double(dE) / 1e6 / dt);
+          cpus_[i].lastEnergy = energy;
+          cpus_[i].lastTime = now;
+        }
+        std::unique_lock<std::mutex> lock(mutex_);
+        cv_.wait_for(lock, std::chrono::milliseconds(periodMs_), [this] { return !running_.load(); });
+      }
+    }
+
+    struct Rapl {
+      std::string path;
+      uint64_t lastEnergy;
+      uint64_t range;
+      int64_t lastTime;
+    };
+
+    unsigned periodMs_ = 50;
+    void* nvml_ = nullptr;
+    FnInt nvmlShutdown_ = nullptr;
+    FnPower nvmlPower_ = nullptr;
+    std::vector<void*> gpus_;
+    std::vector<Rapl> cpus_;
+    std::vector<std::string> gpuNames_;
+    std::vector<std::string> cpuNames_;
+
+    std::atomic<bool> running_{false};
+    std::mutex mutex_;
+    std::condition_variable cv_;
+    std::thread thread_;
+  };
+
+}  // namespace cms::perfetto
+
+#endif  // PerfTools_Perfetto_plugins_PerfettoPowerSampler_h
diff --git a/PerfTools/Perfetto/plugins/PerfettoTraceService.cc b/PerfTools/Perfetto/plugins/PerfettoTraceService.cc
new file mode 100644
index 0000000000000..cdc84dd0fe9de
--- /dev/null
+++ b/PerfTools/Perfetto/plugins/PerfettoTraceService.cc
@@ -0,0 +1,469 @@
+// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h"
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoLanes.h"
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h"
+#include "PerfTools/Perfetto/interface/PerfettoAllocatorMonitor.h"
+#include <perfetto.h>
+#include "PerfTools/Perfetto/plugins/PerfettoCuptiProfiler.h"
+#include "PerfTools/Perfetto/plugins/PerfettoPowerSampler.h"
+
+#include "HeterogeneousCore/AlpakaInterface/interface/CachingAllocatorMonitor.h"
+
+#include "DataFormats/Provenance/interface/ModuleDescription.h"
+#include "FWCore/Framework/interface/ComponentDescription.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
+#include "FWCore/ServiceRegistry/interface/ESModuleCallingContext.h"
+#include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h"
+#include "FWCore/ServiceRegistry/interface/ParentContext.h"
+#include "FWCore/ServiceRegistry/interface/PathContext.h"
+#include "FWCore/ServiceRegistry/interface/PlaceInPathContext.h"
+#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
+#include "FWCore/ServiceRegistry/interface/StreamContext.h"
+#include "FWCore/ServiceRegistry/interface/SystemBounds.h"
+
+#include <algorithm>
+#include <atomic>
+#include <ctime>
+#include <deque>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include <fcntl.h>
+#include <unistd.h>
+
+// PerfettoTraceService records a Perfetto trace (.pftrace) of a cmsRun job using
+// the in-process Perfetto SDK.
+//
+// Threading model (this is the important part). CMSSW runs one global TBB arena;
+// within a *single* stream and a *single* event, independent modules execute as
+// concurrent tasks on different threads, and an ExternalWork module's acquire()
+// and produce() run on different threads with an asynchronous gap in between. A
+// single per-stream timeline therefore cannot represent module execution without
+// overlapping (mis-paired) slices. So:
+//
+//   * module / acquire / EventSetup / source / cleanup slices are emitted on a
+//     per-(stream, thread) lane that hangs under the stream track (see
+//     CMSSWPerfettoLanes.h). Each lane is fed by exactly one thread, so its slices
+//     nest correctly; concurrent work in a stream lands on separate lanes.
+//   * the per-stream "Event" lifetime (preEvent..postClearEvent) IS serialized
+//     per stream, so it sits on the stream track itself, together with the
+//     run/lumi/event counters.
+//
+// Around every module call the service also publishes a thread-local
+// cms::perfetto::ModuleContext, so the caching-allocator monitor and the CUPTI
+// GPU layer can attribute their work to the responsible module.
+
+class PerfettoTraceService {
+public:
+  PerfettoTraceService(edm::ParameterSet const& pset, edm::ActivityRegistry& ar)
+      : enabled_(pset.getUntrackedParameter<bool>("enabled")),
+        fileName_(pset.getUntrackedParameter<std::string>("fileName")),
+        bufferSizeKB_(pset.getUntrackedParameter<unsigned>("bufferSizeKB")),
+        shmemSizeKB_(pset.getUntrackedParameter<unsigned>("shmemSizeKB")),
+        maxEvents_(pset.getUntrackedParameter<unsigned>("maxEvents")),
+        traceFunctions_(pset.getUntrackedParameter<bool>("traceFunctions")),
+        traceAllocations_(pset.getUntrackedParameter<bool>("traceAllocations")),
+        traceGpuKernels_(pset.getUntrackedParameter<bool>("traceGpuKernels")),
+        tracePower_(pset.getUntrackedParameter<bool>("tracePower")),
+        powerPeriodMs_(pset.getUntrackedParameter<unsigned>("powerPeriodMs")),
+        traceModules_(pset.getUntrackedParameter<std::vector<std::string>>("traceModules")) {
+    std::sort(traceModules_.begin(), traceModules_.end());
+    if (!enabled_)
+      return;
+
+    ::perfetto::TracingInitArgs args;
+    args.backends = ::perfetto::kInProcessBackend;
+    // Size the producer shared-memory buffer (SMB) for the high, bursty slice rate
+    // of many concurrent edm::streams. With the SDK default (~4 MB) the SMB
+    // saturates and TrackEvent (kDrop policy) silently discards slices -- dropping
+    // whole events from the trace. 32 KB chunks (the SDK maximum) reduce per-chunk
+    // contention under heavy multithreading. kDrop is kept (not kStall) so tracing
+    // never blocks the reconstruction threads.
+    args.shmem_size_hint_kb = shmemSizeKB_;
+    args.shmem_page_size_hint_kb = 32;
+    ::perfetto::Tracing::Initialize(args);
+    ::perfetto::TrackEvent::Register();
+
+    ::perfetto::TraceConfig cfg;
+    cfg.add_buffers()->set_size_kb(bufferSizeKB_);
+
+    auto* ds = cfg.add_data_sources();
+    auto* ds_cfg = ds->mutable_config();
+    ds_cfg->set_name("track_event");
+
+    ::perfetto::protos::gen::TrackEventConfig te;
+    te.add_enabled_categories("cmssw.event");
+    te.add_enabled_categories("cmssw.source");
+    te.add_enabled_categories("cmssw.module");
+    te.add_enabled_categories("cmssw.acquire");
+    te.add_enabled_categories("cmssw.es");
+    te.add_enabled_categories("cmssw.cleanup");
+    te.add_enabled_categories("cmssw.func");
+    te.add_enabled_categories("cmssw.alloc");
+    te.add_enabled_categories("cmssw.gpu");
+    te.add_enabled_categories("cmssw.power");
+    ds_cfg->set_track_event_config_raw(te.SerializeAsString());
+
+    // In-memory session: the whole trace is held in the buffer and written once at
+    // the end. This keeps the GPU/CUPTI events -- emitted at end-of-job with their
+    // real, earlier device timestamps -- correctly ordered, and avoids perfetto's
+    // write_into_file "no flush" warning. The trace is bounded by bufferSizeKB;
+    // raise it for very long jobs.
+    session_ = ::perfetto::Tracing::NewTrace();
+    session_->Setup(cfg);
+    session_->StartBlocking();
+
+    {
+      auto proc = ::perfetto::ProcessTrack::Current();
+      auto desc = proc.Serialize();
+      desc.mutable_process()->set_process_name("cmsRun");
+      ::perfetto::TrackEvent::SetTrackDescriptor(proc, desc);
+    }
+
+    ar.watchPreallocate(this, &PerfettoTraceService::preallocate);
+
+    ar.watchPreSourceEvent(this, &PerfettoTraceService::preSourceEvent);
+    ar.watchPostSourceEvent(this, &PerfettoTraceService::postSourceEvent);
+
+    ar.watchPreEvent(this, &PerfettoTraceService::preEvent);
+    ar.watchPreClearEvent(this, &PerfettoTraceService::preClearEvent);
+    ar.watchPostClearEvent(this, &PerfettoTraceService::postClearEvent);
+
+    ar.watchPreModuleEvent(this, &PerfettoTraceService::preModuleEvent);
+    ar.watchPostModuleEvent(this, &PerfettoTraceService::postModuleEvent);
+    ar.watchPreModuleEventAcquire(this, &PerfettoTraceService::preModuleEventAcquire);
+    ar.watchPostModuleEventAcquire(this, &PerfettoTraceService::postModuleEventAcquire);
+
+    ar.watchPreESModule(this, &PerfettoTraceService::preESModule);
+    ar.watchPostESModule(this, &PerfettoTraceService::postESModule);
+
+    ar.watchPostEndJob(this, &PerfettoTraceService::postEndJob);
+
+    // Observe caching-allocator transactions (device/host memory). Registered
+    // last so it is active for the whole job.
+    if (traceAllocations_)
+      cms::alpakatools::setCachingAllocatorMonitor(&allocatorMonitor_);
+
+    // Stream CUDA kernel activity (real device-side timing + register/occupancy
+    // info) into the session via CUPTI; a no-op when no GPU is present.
+    if (traceGpuKernels_)
+      cuptiProfiler_.start();
+
+    // Sample CPU (RAPL) and GPU (NVML) power on a background thread; a no-op when
+    // neither source is available.
+    if (tracePower_)
+      powerSampler_.start(powerPeriodMs_);
+  }
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+    edm::ParameterSetDescription desc;
+    desc.addUntracked<bool>("enabled", true)->setComment("Master switch; when false the service does nothing.");
+    desc.addUntracked<std::string>("fileName", "cmsrun.pftrace")->setComment("Output Perfetto trace file.");
+    desc.addUntracked<unsigned>("bufferSizeKB", 256 * 1024)->setComment("In-process trace buffer size in KB.");
+    desc.addUntracked<unsigned>("shmemSizeKB", 64 * 1024)
+        ->setComment(
+            "Producer shared-memory buffer (SMB) size in KB; if too small the trace silently drops "
+            "slices (whole events) under high-rate multi-stream tracing.");
+    desc.addUntracked<unsigned>("maxEvents", 0)
+        ->setComment(
+            "Stop opening new event slices after this many events (0 = unlimited, the default; the "
+            "trace is then bounded by bufferSizeKB).");
+    desc.addUntracked<bool>("traceFunctions", false)
+        ->setComment("Enable tier-B per-function slices (CMS_PERFETTO_FUNC/SCOPE).");
+    desc.addUntracked<bool>("traceAllocations", false)
+        ->setComment("Trace Alpaka caching-allocator alloc/free and device-memory counters.");
+    desc.addUntracked<bool>("traceGpuKernels", false)
+        ->setComment("Trace CUDA kernels (real device timing + registers/occupancy) via CUPTI.");
+    desc.addUntracked<bool>("tracePower", false)
+        ->setComment("Sample CPU (RAPL) and GPU (NVML) power as counter tracks.");
+    desc.addUntracked<unsigned>("powerPeriodMs", 1000)
+        ->setComment("Power sampling period in milliseconds (when tracePower is true).");
+    desc.addUntracked<std::vector<std::string>>("traceModules", {})
+        ->setComment("If non-empty, only trace these module labels (lower overhead, focused trace).");
+    descriptions.add("PerfettoTraceService", desc);
+  }
+
+private:
+  struct PerStream {
+    unsigned sid = 0;
+    bool in_event = false;
+    bool source_open = false;
+    unsigned long long eventId = 0;
+  };
+
+  bool tracing() const noexcept { return enabled_ && ::perfetto::TrackEvent::IsEnabled(); }
+
+  // When traceModules is non-empty, only those module labels are traced (and only
+  // their allocations/kernels are attributed). Empty = trace every module.
+  bool selected(edm::ModuleDescription const& md) const {
+    return traceModules_.empty() || std::binary_search(traceModules_.begin(), traceModules_.end(), md.moduleLabel());
+  }
+
+  bool withinBudget() noexcept { return maxEvents_ == 0 || seenEvents_.fetch_add(1) < maxEvents_; }
+
+  // Publish (or clear) the thread-local "current module" used by the allocator
+  // and GPU layers to attribute their work.
+  // Push a module context on this thread's stack. |active| (== this module is
+  // selected for tracing) controls whether work done inside it is attributed.
+  static void pushContext(edm::StreamContext const& sc, edm::ModuleDescription const& md, bool active) noexcept {
+    cms::perfetto::ModuleContext ctx;
+    ctx.label = md.moduleLabel().c_str();
+    ctx.type = md.moduleName().c_str();
+    ctx.moduleId = md.id();
+    ctx.streamId = sc.streamID().value();
+    ctx.eventId = sc.eventID().event();
+    ctx.active = active;
+    cms::perfetto::pushModuleContext(ctx);
+  }
+
+  void preallocate(edm::service::SystemBounds const& bounds) {
+    states_.assign(bounds.maxNumberOfStreams(), PerStream{});
+    for (unsigned i = 0; i < states_.size(); ++i) {
+      states_[i].sid = i;
+      auto t = cms::perfetto::streamTrack(i);
+      auto d = t.Serialize();
+      d.set_name("edm::stream " + std::to_string(i));
+      ::perfetto::TrackEvent::SetTrackDescriptor(t, d);
+    }
+  }
+
+  // ----- Source (on the stream's lane for the reading thread) -----
+  void preSourceEvent(edm::StreamID sid) {
+    if (!tracing())
+      return;
+    auto& st = states_[sid.value()];
+    if (!st.source_open) {
+      st.source_open = true;
+      TRACE_EVENT_BEGIN("cmssw.source", "Source", cms::perfetto::laneTrack(sid.value()), "stream", sid.value());
+    }
+  }
+
+  void postSourceEvent(edm::StreamID sid) {
+    if (!tracing())
+      return;
+    auto& st = states_[sid.value()];
+    if (st.source_open) {
+      TRACE_EVENT_END("cmssw.source", cms::perfetto::laneTrack(sid.value()));
+      st.source_open = false;
+    }
+  }
+
+  // ----- Per-stream event lifetime (serialized -> per-stream track) -----
+  void preEvent(edm::StreamContext const& sc) {
+    if (!tracing())
+      return;
+    auto& st = states_[sc.streamID().value()];
+    if (st.in_event)
+      return;
+    if (!withinBudget()) {
+      cms::perfetto::resetModuleContext();
+      return;
+    }
+    st.in_event = true;
+    auto const& id = sc.eventID();
+    st.eventId = id.event();
+
+    auto track = cms::perfetto::streamTrack(st.sid);
+    TRACE_EVENT_BEGIN(
+        "cmssw.event", "Event", track, "run", id.run(), "lumi", id.luminosityBlock(), "event", id.event());
+    TRACE_COUNTER("cmssw.event", ::perfetto::CounterTrack("run", "id", track), static_cast<double>(id.run()));
+    TRACE_COUNTER(
+        "cmssw.event", ::perfetto::CounterTrack("lumi", "id", track), static_cast<double>(id.luminosityBlock()));
+    TRACE_COUNTER("cmssw.event", ::perfetto::CounterTrack("event", "id", track), static_cast<double>(id.event()));
+  }
+
+  void preClearEvent(edm::StreamContext const& sc) {
+    if (!tracing())
+      return;
+    if (!states_[sc.streamID().value()].in_event)
+      return;
+    TRACE_EVENT_BEGIN(
+        "cmssw.cleanup", "Cleanup", cms::perfetto::laneTrack(sc.streamID().value()), "stream", sc.streamID().value());
+  }
+
+  void postClearEvent(edm::StreamContext const& sc) {
+    if (!tracing())
+      return;
+    auto& st = states_[sc.streamID().value()];
+    if (!st.in_event)
+      return;
+    TRACE_EVENT_END("cmssw.cleanup", cms::perfetto::laneTrack(sc.streamID().value()));
+    TRACE_EVENT_END("cmssw.event", cms::perfetto::streamTrack(st.sid));
+    st.in_event = false;
+    emitThroughput();  // global events/s counter, from event-completion times
+    cms::perfetto::resetModuleContext();
+  }
+
+  // Global "Throughput (events/s)" counter: a sliding-window rate over the last
+  // kThroughputWindow event completions (across all streams), emitted on a single
+  // process-level CounterTrack -- so any Perfetto UI shows the job's event rate
+  // ramping up and reaching steady state, no UI plugin required.
+  void emitThroughput() {
+    timespec ts{};
+    clock_gettime(CLOCK_BOOTTIME, &ts);
+    int64_t const now = int64_t(ts.tv_sec) * 1000000000LL + ts.tv_nsec;
+    double rate = 0.;
+    {
+      std::scoped_lock lock(throughputMutex_);
+      completions_.push_back(now);
+      while (completions_.size() > kThroughputWindow)
+        completions_.pop_front();
+      if (completions_.size() >= 2) {
+        double const span_s = double(now - completions_.front()) / 1e9;
+        if (span_s > 0.)
+          rate = double(completions_.size() - 1) / span_s;
+      }
+    }
+    TRACE_COUNTER("cmssw.event", ::perfetto::CounterTrack("Throughput (events/s)"), rate);
+  }
+
+  // ----- Modules (per thread) -----
+  void preModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc) {
+    if (!tracing())
+      return;
+    if (!states_[sc.streamID().value()].in_event)
+      return;
+    auto const& md = *mcc.moduleDescription();
+    bool const sel = selected(md);
+    pushContext(sc, md, sel);  // pushed for every module so the stack stays balanced
+    if (!sel)
+      return;
+    TRACE_EVENT_BEGIN("cmssw.module",
+                      ::perfetto::DynamicString(md.moduleLabel()),
+                      cms::perfetto::laneTrack(sc.streamID().value()),
+                      "event",
+                      sc.eventID().event(),
+                      "module_id",
+                      md.id(),
+                      "cpp_type",
+                      ::perfetto::DynamicString(md.moduleName()));
+  }
+
+  void postModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc) {
+    if (!tracing())
+      return;
+    if (!states_[sc.streamID().value()].in_event)
+      return;
+    if (selected(*mcc.moduleDescription()))
+      TRACE_EVENT_END("cmssw.module", cms::perfetto::laneTrack(sc.streamID().value()));
+    cms::perfetto::popModuleContext();
+  }
+
+  void preModuleEventAcquire(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc) {
+    if (!tracing())
+      return;
+    if (!states_[sc.streamID().value()].in_event)
+      return;
+    auto const& md = *mcc.moduleDescription();
+    bool const sel = selected(md);
+    pushContext(sc, md, sel);
+    if (!sel)
+      return;
+    TRACE_EVENT_BEGIN("cmssw.acquire",
+                      ::perfetto::DynamicString(md.moduleLabel()),
+                      cms::perfetto::laneTrack(sc.streamID().value()),
+                      "event",
+                      sc.eventID().event(),
+                      "cpp_type",
+                      ::perfetto::DynamicString(md.moduleName()));
+  }
+
+  void postModuleEventAcquire(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc) {
+    if (!tracing())
+      return;
+    if (!states_[sc.streamID().value()].in_event)
+      return;
+    if (selected(*mcc.moduleDescription()))
+      TRACE_EVENT_END("cmssw.acquire", cms::perfetto::laneTrack(sc.streamID().value()));
+    cms::perfetto::popModuleContext();
+  }
+
+  // ----- EventSetup modules (per thread; stream attribution is best-effort) -----
+  static edm::StreamContext const* streamOf(edm::ESModuleCallingContext const& cc) {
+    auto top = cc.getTopModuleCallingContext();
+    if (!top || top->type() != edm::ParentContext::Type::kPlaceInPath)
+      return nullptr;
+    auto const* pip = top->parent().placeInPathContext();
+    auto const* pc = pip ? pip->pathContext() : nullptr;
+    return pc ? pc->streamContext() : nullptr;
+  }
+
+  void preESModule(edm::eventsetup::EventSetupRecordKey const&, edm::ESModuleCallingContext const& cc) {
+    if (!tracing())
+      return;
+    auto const* sc = streamOf(cc);
+    if (!sc || !states_[sc->streamID().value()].in_event)
+      return;
+    auto const* cd = cc.componentDescription();
+    const char* name = (cd && !cd->label_.empty()) ? cd->label_.c_str() : (cd ? cd->type_.c_str() : "ESModule");
+    TRACE_EVENT_BEGIN("cmssw.es",
+                      ::perfetto::DynamicString(name),
+                      cms::perfetto::laneTrack(sc->streamID().value()),
+                      "stream",
+                      sc->streamID().value());
+  }
+
+  void postESModule(edm::eventsetup::EventSetupRecordKey const&, edm::ESModuleCallingContext const& cc) {
+    if (!tracing())
+      return;
+    auto const* sc = streamOf(cc);
+    if (!sc || !states_[sc->streamID().value()].in_event)
+      return;
+    TRACE_EVENT_END("cmssw.es", cms::perfetto::laneTrack(sc->streamID().value()));
+  }
+
+  void postEndJob() {
+    if (!enabled_ || !session_)
+      return;
+    // Stop observing allocations before the session is torn down: late frees
+    // (e.g. from the AlpakaService destructor) must not touch a stopped session.
+    if (traceAllocations_)
+      cms::alpakatools::setCachingAllocatorMonitor(nullptr);
+    // Stop the power sampler before the session is torn down (its thread emits).
+    if (tracePower_)
+      powerSampler_.stop();
+    // Drain CUDA kernel activity into the still-open session, then stop it.
+    if (traceGpuKernels_) {
+      cuptiProfiler_.flush();
+      cuptiProfiler_.stop();
+    }
+    ::perfetto::TrackEvent::Flush();
+    session_->StopBlocking();
+    auto trace_data = session_->ReadTraceBlocking();
+    int fd = ::open(fileName_.c_str(), O_CREAT | O_TRUNC | O_WRONLY | O_CLOEXEC, 0644);
+    if (fd >= 0) {
+      [[maybe_unused]] auto n = ::write(fd, trace_data.data(), trace_data.size());
+      ::close(fd);
+    }
+  }
+
+  const bool enabled_;
+  const std::string fileName_;
+  const unsigned bufferSizeKB_;
+  const unsigned shmemSizeKB_;
+  const unsigned maxEvents_;
+  const bool traceFunctions_;
+  const bool traceAllocations_;
+  const bool traceGpuKernels_;
+  const bool tracePower_;
+  const unsigned powerPeriodMs_;
+  std::vector<std::string> traceModules_;
+
+  std::unique_ptr<::perfetto::TracingSession> session_;
+
+  std::vector<PerStream> states_;
+  std::atomic<unsigned> seenEvents_{0};
+  cms::perfetto::PerfettoAllocatorMonitor allocatorMonitor_;
+  cms::perfetto::PerfettoCuptiProfiler cuptiProfiler_;
+  cms::perfetto::PerfettoPowerSampler powerSampler_;
+
+  static constexpr std::size_t kThroughputWindow = 16;  // events in the rate window
+  std::mutex throughputMutex_;
+  std::deque<int64_t> completions_;  // boottime ns of recent event completions
+};
+
+DEFINE_FWK_SERVICE(PerfettoTraceService);
diff --git a/PerfTools/Perfetto/python/customisePerfetto.py b/PerfTools/Perfetto/python/customisePerfetto.py
new file mode 100644
index 0000000000000..c5f480024e621
--- /dev/null
+++ b/PerfTools/Perfetto/python/customisePerfetto.py
@@ -0,0 +1,42 @@
+# Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026
+import FWCore.ParameterSet.Config as cms
+
+
+def customisePerfetto(process,
+                      fileName="cmsrun.pftrace",
+                      bufferSizeKB=256 * 1024,
+                      maxEvents=200,
+                      traceFunctions=False,
+                      traceAllocations=False,
+                      traceGpuKernels=False,
+                      traceModules=None):
+    """Add the PerfettoTraceService to process.
+
+    The service writes an in-process Perfetto trace (open it at https://ui.perfetto.dev).
+    Module / acquire / EventSetup slices land on per-thread tracks (so concurrent
+    work nests correctly); each stream gets an "Event" track with run/lumi/event
+    counters.
+
+    Arguments:
+      fileName         output .pftrace file
+      bufferSizeKB     in-process trace buffer size (KB)
+      maxEvents        stop opening new event slices after this many events (0 = unlimited)
+      traceFunctions   enable tier-B per-function slices (CMS_PERFETTO_FUNC/SCOPE)
+      traceAllocations trace the Alpaka caching allocator (alloc/free + device-memory counters)
+      traceGpuKernels  trace CUDA kernels via CUPTI (real device timing + registers/occupancy)
+      traceModules     if a non-empty list, only trace these module labels (focused, low overhead)
+    """
+    process.add_(cms.Service("PerfettoTraceService",
+                             fileName=cms.untracked.string(fileName),
+                             bufferSizeKB=cms.untracked.uint32(bufferSizeKB),
+                             maxEvents=cms.untracked.uint32(maxEvents),
+                             traceFunctions=cms.untracked.bool(traceFunctions),
+                             traceAllocations=cms.untracked.bool(traceAllocations),
+                             traceGpuKernels=cms.untracked.bool(traceGpuKernels),
+                             traceModules=cms.untracked.vstring(traceModules or [])))
+    return process
+
+
+def customise(process):
+    """Default entry point for `cmsDriver.py --customise PerfTools/Perfetto/customisePerfetto.customise`."""
+    return customisePerfetto(process)
diff --git a/PerfTools/Perfetto/scripts/perfettoKernelResources.py b/PerfTools/Perfetto/scripts/perfettoKernelResources.py
new file mode 100755
index 0000000000000..ca75871f3f4a4
--- /dev/null
+++ b/PerfTools/Perfetto/scripts/perfettoKernelResources.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+# Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026
+"""Dump the static per-kernel resource usage (ptxas info) of CUDA/Alpaka libraries.
+
+Runs `cuobjdump --dump-resource-usage` on each given shared object / cubin and
+reports, per kernel, the registers/thread, stack frame, static shared memory,
+local memory (>0 means register spills) and constant memory -- the same numbers
+ptxas prints with `-Xptxas -v`. Names are demangled with `c++filt`/`cu++filt`.
+
+This is the compile-time counterpart of the runtime CUPTI layer in
+PerfettoTraceService (traceGpuKernels=True): CUPTI reports registers and memory
+per launch, while this works offline on the built libraries and also exposes
+spills/stack, which are not in the CUPTI kernel records.
+
+Usage:
+  perfettoKernelResources.py [--json] [--filter SUBSTR] <lib.so|cubin> [more ...]
+"""
+import argparse
+import json
+import re
+import shutil
+import subprocess
+import sys
+
+_FUNC = re.compile(r"^\s*Function (.+):\s*$")
+_RES = re.compile(r"REG:(\d+).*?STACK:(\d+).*?SHARED:(\d+).*?LOCAL:(\d+)")
+_CONST = re.compile(r"CONSTANT\[0\]:(\d+)")
+
+
+def demangle(name, tool):
+    if not tool:
+        return name
+    out = subprocess.run([tool, name], capture_output=True, text=True).stdout.strip()
+    return out or name
+
+
+def shorten(name):
+    """Reduce an Alpaka gpuKernel<...> wrapper to the user functor."""
+    i = name.find("gpuKernel<")
+    if i < 0:
+        return name
+    i += len("gpuKernel<")
+    depth = 0
+    for j in range(i, len(name)):
+        c = name[j]
+        if c == "<":
+            depth += 1
+        elif c == ">":
+            if depth == 0:
+                return name[i:j]
+            depth -= 1
+        elif c == "," and depth == 0:
+            return name[i:j]
+    return name[i:]
+
+
+def dump(path, demangler):
+    out = subprocess.run(["cuobjdump", "--dump-resource-usage", path], capture_output=True, text=True).stdout
+    kernels, cur = [], None
+    for line in out.splitlines():
+        m = _FUNC.match(line)
+        if m:
+            cur = m.group(1)
+            continue
+        if cur:
+            r = _RES.search(line)
+            if r:
+                c = _CONST.search(line)
+                full = demangle(cur, demangler)
+                kernels.append({
+                    "kernel": shorten(full),
+                    "registers": int(r.group(1)),
+                    "stack": int(r.group(2)),
+                    "shared": int(r.group(3)),
+                    "local_spill": int(r.group(4)),
+                    "const": int(c.group(1)) if c else 0,
+                    "mangled": cur,
+                })
+                cur = None
+    return kernels
+
+
+def main():
+    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument("libs", nargs="+", help="shared objects or cubins to inspect")
+    ap.add_argument("--json", action="store_true", help="emit JSON instead of a table")
+    ap.add_argument("--filter", default="", help="only keep kernels whose name contains SUBSTR")
+    args = ap.parse_args()
+
+    if not shutil.which("cuobjdump"):
+        sys.exit("cuobjdump not found; run inside a CUDA environment (cmsenv)")
+    demangler = shutil.which("cu++filt") or shutil.which("c++filt")
+
+    kernels = []
+    for lib in args.libs:
+        for k in dump(lib, demangler):
+            if args.filter in k["kernel"]:
+                kernels.append(k)
+    kernels.sort(key=lambda k: -k["registers"])
+
+    if args.json:
+        print(json.dumps(kernels, indent=2))
+        return
+    print(f"{'REG':>4} {'STACK':>6} {'SHARED':>7} {'SPILL':>6} {'CONST':>6}  kernel")
+    for k in kernels:
+        print(f"{k['registers']:>4} {k['stack']:>6} {k['shared']:>7} {k['local_spill']:>6} {k['const']:>6}  "
+              f"{k['kernel'][:90]}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/PerfTools/Perfetto/src/CMSSWPerfettoCategories.cc b/PerfTools/Perfetto/src/CMSSWPerfettoCategories.cc
new file mode 100644
index 0000000000000..1be791a93d606
--- /dev/null
+++ b/PerfTools/Perfetto/src/CMSSWPerfettoCategories.cc
@@ -0,0 +1,3 @@
+// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h"
+PERFETTO_TRACK_EVENT_STATIC_STORAGE();
diff --git a/PerfTools/Perfetto/src/CMSSWPerfettoModuleContext.cc b/PerfTools/Perfetto/src/CMSSWPerfettoModuleContext.cc
new file mode 100644
index 0000000000000..b91fe93be31bf
--- /dev/null
+++ b/PerfTools/Perfetto/src/CMSSWPerfettoModuleContext.cc
@@ -0,0 +1,33 @@
+// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h"
+
+namespace cms::perfetto {
+  namespace {
+    // A small fixed-size, allocation-free per-thread stack. Module re-entrancy
+    // from work-stealing is shallow in practice; beyond the cap we keep counting
+    // depth (so push/pop stay balanced) but stop recording, and report no module.
+    constexpr int kMaxDepth = 64;
+    thread_local ModuleContext g_stack[kMaxDepth];
+    thread_local int g_depth = 0;
+    const ModuleContext g_none{};
+  }  // namespace
+
+  void pushModuleContext(ModuleContext const& ctx) noexcept {
+    if (g_depth >= 0 && g_depth < kMaxDepth)
+      g_stack[g_depth] = ctx;
+    ++g_depth;
+  }
+
+  void popModuleContext() noexcept {
+    if (g_depth > 0)
+      --g_depth;
+  }
+
+  void resetModuleContext() noexcept { g_depth = 0; }
+
+  ModuleContext const& currentModuleContext() noexcept {
+    if (g_depth > 0 && g_depth <= kMaxDepth)
+      return g_stack[g_depth - 1];
+    return g_none;
+  }
+}  // namespace cms::perfetto
diff --git a/PerfTools/Perfetto/test/BuildFile.xml b/PerfTools/Perfetto/test/BuildFile.xml
new file mode 100644
index 0000000000000..792f805472a60
--- /dev/null
+++ b/PerfTools/Perfetto/test/BuildFile.xml
@@ -0,0 +1,13 @@
+<bin name="testPerfettoAllocator" file="alpaka/testPerfettoAllocator.dev.cc">
+  <use name="alpaka"/>
+  <use name="catch2"/>
+  <use name="FWCore/Utilities"/>
+  <use name="HeterogeneousCore/AlpakaInterface"/>
+  <use name="PerfTools/Perfetto"/>
+  <flags ALPAKA_BACKENDS="1"/>
+</bin>
+
+<bin name="testPerfettoTrace" file="testPerfettoTrace.cpp">
+  <use name="catch2"/>
+  <use name="PerfTools/Perfetto"/>
+</bin>
diff --git a/PerfTools/Perfetto/test/alpaka/testPerfettoAllocator.dev.cc b/PerfTools/Perfetto/test/alpaka/testPerfettoAllocator.dev.cc
new file mode 100644
index 0000000000000..e4156bb2a550c
--- /dev/null
+++ b/PerfTools/Perfetto/test/alpaka/testPerfettoAllocator.dev.cc
@@ -0,0 +1,131 @@
+// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026
+#include <atomic>
+#include <string>
+
+#include <alpaka/alpaka.hpp>
+
+#define CATCH_CONFIG_MAIN
+#include <catch2/catch_all.hpp>
+
+#include "FWCore/Utilities/interface/stringize.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/devices.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/CachingAllocatorMonitor.h"
+
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h"
+#include "PerfTools/Perfetto/interface/PerfettoAllocatorMonitor.h"
+#include <perfetto.h>
+
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+namespace {
+  constexpr size_t SIZE = 1024;
+
+  // Counts transactions and captures the module attribution seen via the
+  // thread-local cms::perfetto::ModuleContext.
+  struct MockMonitor : public cms::alpakatools::CachingAllocatorMonitor {
+    std::atomic<int> allocs{0};
+    std::atomic<int> frees{0};
+    std::string allocModule;
+    std::string freeModule;
+
+    void onAllocate(int, const void*, std::size_t, std::size_t, bool, unsigned long long) noexcept override {
+      ++allocs;
+      auto const& m = cms::perfetto::currentModuleContext();
+      if (m.active && m.label)
+        allocModule = m.label;
+    }
+    void onFree(int, const void*, std::size_t, unsigned long long) noexcept override {
+      ++frees;
+      auto const& m = cms::perfetto::currentModuleContext();
+      if (m.active && m.label)
+        freeModule = m.label;
+    }
+  };
+}  // namespace
+
+TEST_CASE("Caching-allocator monitor hook attributes transactions to the current module (" EDM_STRINGIZE(
+              ALPAKA_ACCELERATOR_NAMESPACE) ")",
+          "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") {
+  auto const& devices = cms::alpakatools::devices<Platform>();
+  if (devices.empty()) {
+    INFO("no devices available for this backend, skipping");
+    return;
+  }
+
+  SECTION("mock monitor sees alloc/free with the module attribution") {
+    MockMonitor mon;
+    cms::alpakatools::setCachingAllocatorMonitor(&mon);
+
+    // pretend we are inside module "testAllocModule" running on stream 0
+    cms::perfetto::ModuleContext ctx;
+    ctx.label = "testAllocModule";
+    ctx.streamId = 0;
+    ctx.active = true;
+    cms::perfetto::pushModuleContext(ctx);
+
+    {
+      auto queue = Queue(devices[0]);
+      auto buf_h = cms::alpakatools::make_host_buffer<float[]>(queue, SIZE);
+      auto buf_d = cms::alpakatools::make_device_buffer<float[]>(queue, SIZE);
+      alpaka::memset(queue, buf_d, 0);
+      alpaka::wait(queue);
+    }  // buffers freed here -> onFree
+
+    cms::perfetto::popModuleContext();
+    cms::alpakatools::setCachingAllocatorMonitor(nullptr);
+
+    // Not every backend routes buffers through the caching allocator (e.g. the
+    // serial-CPU backend allocates directly). Where it does, the monitor must
+    // have been called and the transactions attributed to the current module.
+    if (mon.allocs.load() == 0) {
+      WARN("this backend does not use the caching allocator; monitor not exercised");
+    } else {
+      REQUIRE(mon.frees.load() >= 1);
+      REQUIRE(mon.allocModule == "testAllocModule");
+      REQUIRE(mon.freeModule == "testAllocModule");
+    }
+  }
+
+  SECTION("PerfettoAllocatorMonitor emits a non-empty trace") {
+    ::perfetto::TracingInitArgs args;
+    args.backends = ::perfetto::kInProcessBackend;
+    ::perfetto::Tracing::Initialize(args);
+    ::perfetto::TrackEvent::Register();
+
+    ::perfetto::TraceConfig cfg;
+    cfg.add_buffers()->set_size_kb(4096);
+    auto* ds = cfg.add_data_sources()->mutable_config();
+    ds->set_name("track_event");
+    ::perfetto::protos::gen::TrackEventConfig te;
+    te.add_enabled_categories("cmssw.alloc");
+    ds->set_track_event_config_raw(te.SerializeAsString());
+
+    auto session = ::perfetto::Tracing::NewTrace();
+    session->Setup(cfg);
+    session->StartBlocking();
+
+    cms::perfetto::PerfettoAllocatorMonitor monitor;
+    cms::alpakatools::setCachingAllocatorMonitor(&monitor);
+    cms::perfetto::ModuleContext ctx;
+    ctx.label = "testAllocModule";
+    ctx.active = true;
+    cms::perfetto::pushModuleContext(ctx);
+
+    {
+      auto queue = Queue(devices[0]);
+      auto buf_d = cms::alpakatools::make_device_buffer<float[]>(queue, SIZE);
+      alpaka::memset(queue, buf_d, 0);
+      alpaka::wait(queue);
+    }
+
+    cms::perfetto::popModuleContext();
+    cms::alpakatools::setCachingAllocatorMonitor(nullptr);
+
+    ::perfetto::TrackEvent::Flush();
+    session->StopBlocking();
+    std::vector<char> data = session->ReadTraceBlocking();
+    REQUIRE(not data.empty());
+  }
+}
diff --git a/PerfTools/Perfetto/test/testPerfettoTrace.cpp b/PerfTools/Perfetto/test/testPerfettoTrace.cpp
new file mode 100644
index 0000000000000..1f026e4a636cb
--- /dev/null
+++ b/PerfTools/Perfetto/test/testPerfettoTrace.cpp
@@ -0,0 +1,222 @@
+// Original author: Felice Pantaleo, felice.pantaleo@cern.ch, 02/2026
+//
+// Regression test for the Perfetto tracing building blocks. It records a small
+// trace in-process the same way PerfettoTraceService does -- process track,
+// per-stream tracks, per-(stream,thread) lanes, module slices, run/lumi/event
+// and Throughput counters -- then parses the result back with perfetto's own
+// (protozero) decoders and asserts the structure. It is meant to break loudly if
+// a future perfetto SDK update (or a change to our helpers) silently drops a
+// feature: wrong track parenting, missing counters, unbalanced slices, etc. No GPU.
+
+#include <cstdint>
+#include <functional>
+#include <set>
+#include <string>
+#include <string_view>
+#include <thread>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch2/catch_all.hpp>
+
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoCategories.h"
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoLanes.h"
+#include "PerfTools/Perfetto/interface/CMSSWPerfettoModuleContext.h"
+#include <perfetto.h>
+
+namespace {
+  // Perfetto may be initialized only once per process.
+  void initPerfettoOnce() {
+    [[maybe_unused]] static const bool done = [] {
+      ::perfetto::TracingInitArgs args;
+      args.backends = ::perfetto::kInProcessBackend;
+      ::perfetto::Tracing::Initialize(args);
+      ::perfetto::TrackEvent::Register();
+      return true;
+    }();
+  }
+
+  // Run |emit| inside a fresh in-process session and return the serialized trace.
+  std::vector<char> recordTrace(const std::function<void()>& emit) {
+    initPerfettoOnce();
+    ::perfetto::TraceConfig cfg;
+    cfg.add_buffers()->set_size_kb(8192);
+    auto* ds = cfg.add_data_sources()->mutable_config();
+    ds->set_name("track_event");
+    ::perfetto::protos::gen::TrackEventConfig te;
+    for (const char* c : {"cmssw.event",
+                          "cmssw.source",
+                          "cmssw.module",
+                          "cmssw.acquire",
+                          "cmssw.cleanup",
+                          "cmssw.es",
+                          "cmssw.func",
+                          "cmssw.alloc",
+                          "cmssw.gpu",
+                          "cmssw.power"})
+      te.add_enabled_categories(c);
+    ds->set_track_event_config_raw(te.SerializeAsString());
+
+    auto session = ::perfetto::Tracing::NewTrace();
+    session->Setup(cfg);
+    session->StartBlocking();
+    emit();
+    ::perfetto::TrackEvent::Flush();
+    session->StopBlocking();
+    return session->ReadTraceBlocking();
+  }
+}  // namespace
+
+TEST_CASE("Perfetto trace has the expected track/counter/slice structure", "[perfetto]") {
+  using namespace cms::perfetto;
+  auto bytes = recordTrace([] {
+    auto proc = ::perfetto::ProcessTrack::Current();
+    {
+      auto d = proc.Serialize();
+      d.mutable_process()->set_process_name("cmsRun");
+      ::perfetto::TrackEvent::SetTrackDescriptor(proc, d);
+    }
+    for (unsigned s = 0; s < 2; ++s) {
+      auto st = streamTrack(s);
+      {
+        auto d = st.Serialize();
+        d.set_name("edm::stream " + std::to_string(s));
+        ::perfetto::TrackEvent::SetTrackDescriptor(st, d);
+      }
+      TRACE_EVENT_BEGIN("cmssw.event", "Event", st, "run", 1, "lumi", 1, "event", static_cast<int>(s + 1));
+      TRACE_COUNTER("cmssw.event", ::perfetto::CounterTrack("run", "id", st), 1.0);
+      auto lane = laneTrack(s);
+      TRACE_EVENT_BEGIN("cmssw.module", "outer", lane);
+      TRACE_EVENT_BEGIN("cmssw.module", "inner", lane);  // nests inside outer on the same lane
+      TRACE_EVENT_END("cmssw.module", lane);
+      TRACE_EVENT_END("cmssw.module", lane);
+      TRACE_EVENT_END("cmssw.event", st);
+      TRACE_COUNTER("cmssw.event", ::perfetto::CounterTrack("Throughput (events/s)"), 100.0 + s);
+    }
+  });
+
+  REQUIRE(!bytes.empty());
+
+  namespace pb = ::perfetto::protos::pbzero;
+  pb::Trace::Decoder trace(reinterpret_cast<const uint8_t*>(bytes.data()), bytes.size());
+
+  // The Throughput counter is global (no parent track); its name lands in the
+  // trace but not in a place as convenient as the parented "run" counter, so
+  // check it by byte presence -- it disappears if the counter stops being emitted.
+  bool const sawThroughput =
+      std::string_view(bytes.data(), bytes.size()).find("Throughput (events/s)") != std::string_view::npos;
+
+  bool sawProcess = false;
+  int laneTracks = 0, packets = 0;
+  int begins = 0, ends = 0, counters = 0;
+  std::set<uint64_t> streamUuids;
+
+  // First pass: process/stream/counter descriptors and slice/counter events.
+  for (auto it = trace.packet(); it; ++it) {
+    ++packets;
+    pb::TracePacket::Decoder packet(*it);
+    if (packet.has_track_descriptor()) {
+      pb::TrackDescriptor::Decoder td(packet.track_descriptor());
+      std::string const name = td.name().ToStdString();
+      if (td.has_process()) {
+        pb::ProcessDescriptor::Decoder p(td.process());
+        if (p.process_name().ToStdString() == "cmsRun")
+          sawProcess = true;
+      }
+      if (name.rfind("edm::stream", 0) == 0)
+        streamUuids.insert(td.uuid());
+    }
+    if (packet.has_track_event()) {
+      pb::TrackEvent::Decoder te(packet.track_event());
+      switch (te.type()) {
+        case pb::TrackEvent::TYPE_SLICE_BEGIN:
+          ++begins;
+          break;
+        case pb::TrackEvent::TYPE_SLICE_END:
+          ++ends;
+          break;
+        case pb::TrackEvent::TYPE_COUNTER:
+          ++counters;
+          break;
+        default:
+          break;
+      }
+    }
+  }
+
+  // Second pass: lanes are descriptors parented to a stream track -- exactly the
+  // property the per-(stream,thread) lane scheme must preserve.
+  pb::Trace::Decoder trace2(reinterpret_cast<const uint8_t*>(bytes.data()), bytes.size());
+  for (auto it = trace2.packet(); it; ++it) {
+    pb::TracePacket::Decoder packet(*it);
+    if (packet.has_track_descriptor()) {
+      pb::TrackDescriptor::Decoder td(packet.track_descriptor());
+      if (streamUuids.count(td.parent_uuid()))
+        ++laneTracks;
+    }
+  }
+
+  REQUIRE(packets > 0);
+  REQUIRE(sawProcess);
+  REQUIRE(streamUuids.size() == 2u);
+  REQUIRE(laneTracks >= 1);
+  REQUIRE(sawThroughput);
+  REQUIRE(begins == ends);  // every slice is closed
+  REQUIRE(begins >= 6);     // 2 streams x (Event + outer + inner)
+  REQUIRE(counters >= 4);   // 2 streams x (run counter + Throughput counter)
+}
+
+TEST_CASE("lane and stream track uuids are distinct and correctly parented", "[perfetto]") {
+  using namespace cms::perfetto;
+  initPerfettoOnce();
+
+  auto s0 = streamTrack(0);
+  auto s1 = streamTrack(1);
+  REQUIRE(s0.uuid != 0);
+  REQUIRE(s0.uuid != s1.uuid);  // one track per stream
+
+  auto l0 = laneTrack(0);
+  auto l1 = laneTrack(1);
+  REQUIRE(l0.uuid != 0);                  // a lane must not collapse onto the root (the old xor-cancellation bug)
+  REQUIRE(l0.uuid != s0.uuid);            // lane is its own track, not the stream track
+  REQUIRE(l0.uuid != l1.uuid);            // different streams -> different lanes
+  REQUIRE(l0.parent_uuid == s0.uuid);     // the lane hangs under its stream
+  REQUIRE(laneTrack(0).uuid == l0.uuid);  // stable for the same (stream, thread)
+}
+
+TEST_CASE("module context stack nests and propagates across threads", "[perfetto]") {
+  using namespace cms::perfetto;
+  resetModuleContext();
+  REQUIRE_FALSE(currentModuleContext().active);
+
+  ModuleContext a;
+  a.label = "A";
+  a.active = true;
+  a.streamId = 3;
+  pushModuleContext(a);
+  REQUIRE(currentModuleContext().active);
+  REQUIRE(std::string(currentModuleContext().label) == "A");
+  REQUIRE(currentModuleContext().streamId == 3u);
+
+  {
+    ModuleContext b;
+    b.label = "B";
+    b.active = true;
+    ModuleContextGuard guard(b);
+    REQUIRE(std::string(currentModuleContext().label) == "B");
+  }
+  REQUIRE(std::string(currentModuleContext().label) == "A");  // restored after the guard
+
+  // withModuleContext must carry the context onto a helper thread (parallel_for).
+  std::string seenOnThread;
+  auto body = withModuleContext([&seenOnThread] {
+    auto const& m = currentModuleContext();
+    seenOnThread = (m.active && m.label) ? m.label : "";
+  });
+  std::thread th([&body] { body(); });
+  th.join();
+  REQUIRE(seenOnThread == "A");
+
+  popModuleContext();
+  REQUIRE_FALSE(currentModuleContext().active);
+}