diff --git a/src/metrix/README.md b/src/metrix/README.md index aa79044c..a3e6ed9f 100644 --- a/src/metrix/README.md +++ b/src/metrix/README.md @@ -17,6 +17,7 @@ Existing GPU profilers are **trash**: - **Human-readable metrics** instead of raw counters - **Unit tested** and reliable - **12 Memory Metrics**: Bandwidth, cache, coalescing, LDS, atomic latency +- **5 Compute Metrics**: FLOPS, arithmetic intensity (HBM/L2/L1), compute throughput - **Multi-Run Profiling**: Automatic aggregation with min/max/avg statistics - **Kernel Filtering**: Efficient regex filtering at rocprofv3 level - **Multiple Output Formats**: Text, JSON, CSV @@ -68,6 +69,8 @@ for kernel in results.kernels: - `memory.hbm_write_bandwidth` - HBM write bandwidth (GB/s) - `memory.hbm_bandwidth_utilization` - % of peak HBM bandwidth - `memory.bytes_transferred_hbm` - Total bytes through HBM +- `memory.bytes_transferred_l2` - Total bytes through L2 cache +- `memory.bytes_transferred_l1` - Total bytes through L1 cache ### Cache Performance - `memory.l1_hit_rate` - L1 cache hit rate (%) @@ -85,13 +88,20 @@ for kernel in results.kernels: ### Atomic Operations - `memory.atomic_latency` - Atomic operation latency (cycles) +### Compute Metrics +- `compute.total_flops` - Total floating-point operations performed +- `compute.hbm_gflops` - Compute throughput (GFLOPS) +- `compute.hbm_arithmetic_intensity` - Ratio of FLOPs to HBM bytes (FLOP/byte) +- `compute.l2_arithmetic_intensity` - Ratio of FLOPs to L2 bytes (FLOP/byte) +- `compute.l1_arithmetic_intensity` - Ratio of FLOPs to L1 bytes (FLOP/byte) + ## CLI Options ``` metrix [options] Options: - --profile, -p Use pre-defined profile (quick, memory) + --profile, -p Use pre-defined profile (quick, memory, compute) --metrics, -m Comma-separated list of metrics --time-only Only collect timing --kernel, -k Filter by kernel name substring diff --git a/src/metrix/src/metrix/backends/base.py b/src/metrix/src/metrix/backends/base.py index a935595c..e9495fa4 100644 --- 
a/src/metrix/src/metrix/backends/base.py +++ b/src/metrix/src/metrix/backends/base.py @@ -162,6 +162,10 @@ def _split_counters_into_passes(self, counters: List[str]) -> List[List[str]]: Returns: List of counter lists, one per profiling pass """ + # Handle empty counters (timing-only mode) - return single pass with no counters + if not counters: + return [[]] + counter_groups = self._get_counter_groups() max_per_pass = 14 # Conservative limit for most AMD GPUs diff --git a/src/metrix/src/metrix/backends/gfx1201.py b/src/metrix/src/metrix/backends/gfx1201.py index 18a4ceee..fae8bb7d 100644 --- a/src/metrix/src/metrix/backends/gfx1201.py +++ b/src/metrix/src/metrix/backends/gfx1201.py @@ -83,6 +83,24 @@ def _bytes_transferred_hbm(self, GRBM_GUI_ACTIVE): """ return 0.0 + @metric("memory.bytes_transferred_l2") + def _bytes_transferred_l2(self): + """ + Total bytes transferred through L2 cache + + Formula: TCC_REQ_sum * 128 (L2 cache line size is 128 bytes) + """ + return 0.0 + + @metric("memory.bytes_transferred_l1") + def _bytes_transferred_l1(self): + """ + Total bytes transferred through L1 cache + + Formula: TCP_TOTAL_CACHE_ACCESSES_sum * cache_line_size (architecture-dependent) + """ + return 0.0 + # Cache metrics @metric("memory.l2_hit_rate") @@ -173,3 +191,49 @@ def _atomic_latency(self): return 0.0 + # Compute metrics + + @metric("compute.total_flops") + def _total_flops(self): + """ + Total floating-point operations performed by the kernel + + Formula: 64 * (FP16 + FP32 + FP64) + 512 * MFMA + """ + return 0.0 + + @metric("compute.hbm_gflops") + def _hbm_gflops(self): + """ + Compute throughput (GFLOPS) normalized by kernel execution time + + Formula: (total_flops / 1e9) / time_seconds + """ + return 0.0 + + @metric("compute.hbm_arithmetic_intensity") + def _hbm_arithmetic_intensity(self): + """ + HBM Arithmetic Intensity: ratio of floating-point operations to HBM bytes transferred (FLOP/byte) + + Formula: total_flops / hbm_bytes + """ + return 0.0 + + 
@metric("compute.l2_arithmetic_intensity") + def _l2_arithmetic_intensity(self): + """ + L2 Arithmetic Intensity: ratio of floating-point operations to L2 cache bytes accessed (FLOP/byte) + + Formula: total_flops / l2_bytes + """ + return 0.0 + + @metric("compute.l1_arithmetic_intensity") + def _l1_arithmetic_intensity(self): + """ + L1 Arithmetic Intensity: ratio of floating-point operations to L1 cache bytes accessed (FLOP/byte) + + Formula: total_flops / l1_bytes + """ + return 0.0 diff --git a/src/metrix/src/metrix/backends/gfx942.py b/src/metrix/src/metrix/backends/gfx942.py index f1e63d3b..9791a707 100644 --- a/src/metrix/src/metrix/backends/gfx942.py +++ b/src/metrix/src/metrix/backends/gfx942.py @@ -58,6 +58,7 @@ def _get_counter_groups(self) -> List[List[str]]: [ "SQ_LDS_BANK_CONFLICT", "TCC_EA0_WRREQ_sum", + "TCC_EA0_WRREQ_64B_sum", "TCC_EA0_ATOMIC_LEVEL_sum", "TCC_EA0_ATOMIC_sum", "GRBM_GUI_ACTIVE", @@ -68,6 +69,7 @@ def _get_counter_groups(self) -> List[List[str]]: "TCP_TOTAL_ACCESSES_sum", "TCC_HIT_sum", "TCC_MISS_sum", + "TCC_REQ_sum", ], # Group 3: Memory instructions and read requests (from SQ_INSTS_VMEM) # Note: RDREQ can only be collected here, not with atomics! 
@@ -76,9 +78,33 @@ def _get_counter_groups(self) -> List[List[str]]: "TCP_TCC_WRITE_REQ_sum", "TCP_TOTAL_CACHE_ACCESSES_sum", "TCC_EA0_RDREQ_sum", + "TCC_EA0_RDREQ_32B_sum", + "TCC_BUBBLE_sum", "SQ_INSTS_VMEM_RD", "SQ_INSTS_VMEM_WR", ], + # Group 4: FP16 and FP32 VALU instructions (for FLOPS calculations) + [ + "SQ_INSTS_VALU_ADD_F16", + "SQ_INSTS_VALU_MUL_F16", + "SQ_INSTS_VALU_TRANS_F16", + "SQ_INSTS_VALU_FMA_F16", + "SQ_INSTS_VALU_ADD_F32", + "SQ_INSTS_VALU_MUL_F32", + "SQ_INSTS_VALU_TRANS_F32", + "SQ_INSTS_VALU_FMA_F32", + ], + # Group 5: FP64 VALU instructions and MFMA instructions + [ + "SQ_INSTS_VALU_ADD_F64", + "SQ_INSTS_VALU_MUL_F64", + "SQ_INSTS_VALU_TRANS_F64", + "SQ_INSTS_VALU_FMA_F64", + "SQ_INSTS_VALU_MFMA_MOPS_F16", + "SQ_INSTS_VALU_MFMA_MOPS_BF16", + "SQ_INSTS_VALU_MFMA_MOPS_F32", + "SQ_INSTS_VALU_MFMA_MOPS_F64", + ], ] def _run_rocprof(self, command: str, counters: List[str], @@ -90,15 +116,20 @@ def _run_rocprof(self, command: str, counters: List[str], # Memory bandwidth metrics @metric("memory.hbm_read_bandwidth") - def _hbm_read_bandwidth(self, TCC_EA0_RDREQ_sum, GRBM_GUI_ACTIVE): + def _hbm_read_bandwidth(self, TCC_EA0_RDREQ_sum, TCC_EA0_RDREQ_32B_sum, TCC_BUBBLE_sum, GRBM_GUI_ACTIVE): """ HBM read bandwidth in GB/s - Formula: (read_requests * 64 bytes) / (active_cycles / clock_freq) + Formula: (128B_requests * 128 + 64B_requests * 64 + 32B_requests * 32) / (active_cycles / clock_freq) Note: TCC_EA0_RDREQ_sum aggregates across all memory controllers on MI300 + TCC_BUBBLE_sum counts 128B read requests """ - bytes_read = TCC_EA0_RDREQ_sum * 64 # Each request is 64 bytes + # Calculate bytes with 32B/64B/128B distinction + bytes_read_128B = TCC_BUBBLE_sum * 128 + bytes_read_64B = (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64 + bytes_read_32B = TCC_EA0_RDREQ_32B_sum * 32 + bytes_read = bytes_read_128B + bytes_read_64B + bytes_read_32B if GRBM_GUI_ACTIVE == 0: return 0.0 @@ -107,15 +138,18 @@ def _hbm_read_bandwidth(self, 
TCC_EA0_RDREQ_sum, GRBM_GUI_ACTIVE): return (bytes_read / 1e9) / time_seconds if time_seconds > 0 else 0.0 @metric("memory.hbm_write_bandwidth") - def _hbm_write_bandwidth(self, TCC_EA0_WRREQ_sum, GRBM_GUI_ACTIVE): + def _hbm_write_bandwidth(self, TCC_EA0_WRREQ_sum, TCC_EA0_WRREQ_64B_sum, GRBM_GUI_ACTIVE): """ - HBM write bandwidth in GB/s + HBM write bandwidth in GB/s (with 32B/64B request granularity) - Formula: (write_requests * 64 bytes) / (active_cycles / clock_freq) + Formula: (64B_requests * 64 + 32B_requests * 32) / (active_cycles / clock_freq) Note: TCC_EA0_WRREQ_sum aggregates across all memory controllers on MI300 """ - bytes_written = TCC_EA0_WRREQ_sum * 64 # Each request is 64 bytes + # Calculate bytes with 32B/64B distinction + bytes_written_64B = TCC_EA0_WRREQ_64B_sum * 64 + bytes_written_32B = (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32 + bytes_written = bytes_written_64B + bytes_written_32B if GRBM_GUI_ACTIVE == 0: return 0.0 @@ -124,15 +158,22 @@ def _hbm_write_bandwidth(self, TCC_EA0_WRREQ_sum, GRBM_GUI_ACTIVE): return (bytes_written / 1e9) / time_seconds if time_seconds > 0 else 0.0 @metric("memory.hbm_bandwidth_utilization") - def _hbm_bandwidth_utilization(self, TCC_EA0_RDREQ_sum, TCC_EA0_WRREQ_sum, GRBM_GUI_ACTIVE): + def _hbm_bandwidth_utilization(self, TCC_EA0_RDREQ_sum, TCC_EA0_RDREQ_32B_sum, TCC_BUBBLE_sum, + TCC_EA0_WRREQ_sum, TCC_EA0_WRREQ_64B_sum, GRBM_GUI_ACTIVE): """ HBM bandwidth utilization as percentage of peak Formula: (actual_bandwidth / peak_bandwidth) * 100 Note: TCC_EA0_* counters aggregate across all memory controllers on MI300 + TCC_BUBBLE_sum counts 128B read requests """ - total_bytes = (TCC_EA0_RDREQ_sum + TCC_EA0_WRREQ_sum) * 64 + # Calculate bytes with 32B/64B/128B distinction + bytes_read = (TCC_BUBBLE_sum * 128 + + (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64 + + TCC_EA0_RDREQ_32B_sum * 32) + bytes_written = TCC_EA0_WRREQ_64B_sum * 64 + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32 + 
total_bytes = bytes_read + bytes_written if GRBM_GUI_ACTIVE == 0: return 0.0 @@ -143,15 +184,40 @@ def _hbm_bandwidth_utilization(self, TCC_EA0_RDREQ_sum, TCC_EA0_WRREQ_sum, GRBM_ return (actual_bw_gbs / self.device_specs.hbm_bandwidth_gbs) * 100 @metric("memory.bytes_transferred_hbm") - def _bytes_transferred_hbm(self, TCC_EA0_RDREQ_sum, TCC_EA0_WRREQ_sum): + def _bytes_transferred_hbm(self, TCC_EA0_RDREQ_sum, TCC_EA0_RDREQ_32B_sum, TCC_BUBBLE_sum, + TCC_EA0_WRREQ_sum, TCC_EA0_WRREQ_64B_sum): """ Total bytes transferred through HBM - Formula: (read_requests + write_requests) * 64 bytes + Formula: (128B_read_requests * 128 + 64B_read_requests * 64 + 32B_read_requests * 32 + + 64B_write_requests * 64 + 32B_write_requests * 32) Note: TCC_EA0_* counters aggregate across all memory controllers on MI300 + TCC_BUBBLE_sum counts 128B read requests + """ + bytes_read = (TCC_BUBBLE_sum * 128 + + (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64 + + TCC_EA0_RDREQ_32B_sum * 32) + bytes_written = TCC_EA0_WRREQ_64B_sum * 64 + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32 + return bytes_read + bytes_written + + @metric("memory.bytes_transferred_l2") + def _bytes_transferred_l2(self, TCC_REQ_sum): + """ + Total bytes transferred through L2 cache + + Formula: TCC_REQ_sum * 128 (L2 cache line size is 128 bytes) + """ + return TCC_REQ_sum * 128 + + @metric("memory.bytes_transferred_l1") + def _bytes_transferred_l1(self, TCP_TOTAL_CACHE_ACCESSES_sum): """ - return (TCC_EA0_RDREQ_sum + TCC_EA0_WRREQ_sum) * 64 + Total bytes transferred through L1 cache + + Formula: TCP_TOTAL_CACHE_ACCESSES_sum * 128 (L1 cache line size is 128 bytes) + """ + return TCP_TOTAL_CACHE_ACCESSES_sum * 128 # Cache metrics @@ -278,3 +344,244 @@ def _atomic_latency(self, TCC_EA0_ATOMIC_LEVEL_sum, TCC_EA0_ATOMIC_sum): return TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum + # Compute metrics + + @metric("compute.total_flops") + def _total_flops(self, + SQ_INSTS_VALU_ADD_F16, 
SQ_INSTS_VALU_MUL_F16, SQ_INSTS_VALU_TRANS_F16, SQ_INSTS_VALU_FMA_F16, + SQ_INSTS_VALU_ADD_F32, SQ_INSTS_VALU_MUL_F32, SQ_INSTS_VALU_TRANS_F32, SQ_INSTS_VALU_FMA_F32, + SQ_INSTS_VALU_ADD_F64, SQ_INSTS_VALU_MUL_F64, SQ_INSTS_VALU_TRANS_F64, SQ_INSTS_VALU_FMA_F64, + SQ_INSTS_VALU_MFMA_MOPS_F16, SQ_INSTS_VALU_MFMA_MOPS_BF16, + SQ_INSTS_VALU_MFMA_MOPS_F32, SQ_INSTS_VALU_MFMA_MOPS_F64): + """ + Total floating-point operations performed by the kernel + + Formula: 64 * (FP16 + FP32 + FP64) + 512 * MFMA + - 64 operations per wave (wavefront size = 64) + - FMA counts as 2 operations (multiply + add) + - MFMA instructions produce 512 operations per instruction + """ + fops = 64 * ( + ( + SQ_INSTS_VALU_ADD_F16 + + SQ_INSTS_VALU_MUL_F16 + + SQ_INSTS_VALU_TRANS_F16 + + SQ_INSTS_VALU_FMA_F16 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32 + + SQ_INSTS_VALU_TRANS_F32 + + SQ_INSTS_VALU_FMA_F32 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64 + + SQ_INSTS_VALU_TRANS_F64 + + SQ_INSTS_VALU_FMA_F64 * 2 + ) + ) + 512 * ( + SQ_INSTS_VALU_MFMA_MOPS_F16 + + SQ_INSTS_VALU_MFMA_MOPS_BF16 + + SQ_INSTS_VALU_MFMA_MOPS_F32 + + SQ_INSTS_VALU_MFMA_MOPS_F64 + ) + + return fops + + @metric("compute.hbm_gflops") + def _hbm_gflops(self, + SQ_INSTS_VALU_ADD_F16, SQ_INSTS_VALU_MUL_F16, SQ_INSTS_VALU_TRANS_F16, SQ_INSTS_VALU_FMA_F16, + SQ_INSTS_VALU_ADD_F32, SQ_INSTS_VALU_MUL_F32, SQ_INSTS_VALU_TRANS_F32, SQ_INSTS_VALU_FMA_F32, + SQ_INSTS_VALU_ADD_F64, SQ_INSTS_VALU_MUL_F64, SQ_INSTS_VALU_TRANS_F64, SQ_INSTS_VALU_FMA_F64, + SQ_INSTS_VALU_MFMA_MOPS_F16, SQ_INSTS_VALU_MFMA_MOPS_BF16, + SQ_INSTS_VALU_MFMA_MOPS_F32, SQ_INSTS_VALU_MFMA_MOPS_F64, + GRBM_GUI_ACTIVE): + """ + Compute throughput (GFLOPS) normalized by kernel execution time + + Formula: (total_flops / 1e9) / time_seconds + """ + # Calculate total FLOPS (same as compute.total_flops) + fops = 64 * ( + ( + SQ_INSTS_VALU_ADD_F16 + + SQ_INSTS_VALU_MUL_F16 + + SQ_INSTS_VALU_TRANS_F16 + + SQ_INSTS_VALU_FMA_F16 * 2 + ) + + 
( + SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32 + + SQ_INSTS_VALU_TRANS_F32 + + SQ_INSTS_VALU_FMA_F32 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64 + + SQ_INSTS_VALU_TRANS_F64 + + SQ_INSTS_VALU_FMA_F64 * 2 + ) + ) + 512 * ( + SQ_INSTS_VALU_MFMA_MOPS_F16 + + SQ_INSTS_VALU_MFMA_MOPS_BF16 + + SQ_INSTS_VALU_MFMA_MOPS_F32 + + SQ_INSTS_VALU_MFMA_MOPS_F64 + ) + + if GRBM_GUI_ACTIVE == 0: + return 0.0 + + time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6) + gflops = (fops / 1e9) / time_seconds if time_seconds > 0 else 0.0 + + return gflops + + @metric("compute.hbm_arithmetic_intensity") + def _hbm_arithmetic_intensity(self, + SQ_INSTS_VALU_ADD_F16, SQ_INSTS_VALU_MUL_F16, SQ_INSTS_VALU_TRANS_F16, SQ_INSTS_VALU_FMA_F16, + SQ_INSTS_VALU_ADD_F32, SQ_INSTS_VALU_MUL_F32, SQ_INSTS_VALU_TRANS_F32, SQ_INSTS_VALU_FMA_F32, + SQ_INSTS_VALU_ADD_F64, SQ_INSTS_VALU_MUL_F64, SQ_INSTS_VALU_TRANS_F64, SQ_INSTS_VALU_FMA_F64, + SQ_INSTS_VALU_MFMA_MOPS_F16, SQ_INSTS_VALU_MFMA_MOPS_BF16, + SQ_INSTS_VALU_MFMA_MOPS_F32, SQ_INSTS_VALU_MFMA_MOPS_F64, + TCC_EA0_RDREQ_sum, TCC_EA0_RDREQ_32B_sum, TCC_BUBBLE_sum, + TCC_EA0_WRREQ_sum, TCC_EA0_WRREQ_64B_sum): + """ + HBM Arithmetic Intensity: ratio of floating-point operations to HBM bytes transferred (FLOP/byte) + + Formula: total_flops / hbm_bytes + """ + # Calculate total FLOPS + fops = 64 * ( + ( + SQ_INSTS_VALU_ADD_F16 + + SQ_INSTS_VALU_MUL_F16 + + SQ_INSTS_VALU_TRANS_F16 + + SQ_INSTS_VALU_FMA_F16 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32 + + SQ_INSTS_VALU_TRANS_F32 + + SQ_INSTS_VALU_FMA_F32 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64 + + SQ_INSTS_VALU_TRANS_F64 + + SQ_INSTS_VALU_FMA_F64 * 2 + ) + ) + 512 * ( + SQ_INSTS_VALU_MFMA_MOPS_F16 + + SQ_INSTS_VALU_MFMA_MOPS_BF16 + + SQ_INSTS_VALU_MFMA_MOPS_F32 + + SQ_INSTS_VALU_MFMA_MOPS_F64 + ) + + # Calculate HBM bytes (with 32B/64B/128B distinction) + hbm_rd = (TCC_BUBBLE_sum * 128 + + (TCC_EA0_RDREQ_sum - 
TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64 + + TCC_EA0_RDREQ_32B_sum * 32) + hbm_wr = TCC_EA0_WRREQ_64B_sum * 64 + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32 + hbm_bytes = hbm_rd + hbm_wr + + # Arithmetic intensity = FLOP / byte + ai_hbm = fops / hbm_bytes if hbm_bytes > 0 else 0.0 + + return ai_hbm + + @metric("compute.l2_arithmetic_intensity") + def _l2_arithmetic_intensity(self, + SQ_INSTS_VALU_ADD_F16, SQ_INSTS_VALU_MUL_F16, SQ_INSTS_VALU_TRANS_F16, SQ_INSTS_VALU_FMA_F16, + SQ_INSTS_VALU_ADD_F32, SQ_INSTS_VALU_MUL_F32, SQ_INSTS_VALU_TRANS_F32, SQ_INSTS_VALU_FMA_F32, + SQ_INSTS_VALU_ADD_F64, SQ_INSTS_VALU_MUL_F64, SQ_INSTS_VALU_TRANS_F64, SQ_INSTS_VALU_FMA_F64, + SQ_INSTS_VALU_MFMA_MOPS_F16, SQ_INSTS_VALU_MFMA_MOPS_BF16, + SQ_INSTS_VALU_MFMA_MOPS_F32, SQ_INSTS_VALU_MFMA_MOPS_F64, + TCC_REQ_sum): + """ + L2 Arithmetic Intensity: ratio of floating-point operations to L2 cache bytes accessed (FLOP/byte) + + Formula: total_flops / l2_bytes + """ + # Calculate total FLOPS + fops = 64 * ( + ( + SQ_INSTS_VALU_ADD_F16 + + SQ_INSTS_VALU_MUL_F16 + + SQ_INSTS_VALU_TRANS_F16 + + SQ_INSTS_VALU_FMA_F16 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32 + + SQ_INSTS_VALU_TRANS_F32 + + SQ_INSTS_VALU_FMA_F32 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64 + + SQ_INSTS_VALU_TRANS_F64 + + SQ_INSTS_VALU_FMA_F64 * 2 + ) + ) + 512 * ( + SQ_INSTS_VALU_MFMA_MOPS_F16 + + SQ_INSTS_VALU_MFMA_MOPS_BF16 + + SQ_INSTS_VALU_MFMA_MOPS_F32 + + SQ_INSTS_VALU_MFMA_MOPS_F64 + ) + + # Calculate L2 bytes (L2 cache line is 128 bytes) + l2_bytes = TCC_REQ_sum * 128 + + # Arithmetic intensity = FLOP / byte + ai_l2 = fops / l2_bytes if l2_bytes > 0 else 0.0 + + return ai_l2 + + @metric("compute.l1_arithmetic_intensity") + def _l1_arithmetic_intensity(self, + SQ_INSTS_VALU_ADD_F16, SQ_INSTS_VALU_MUL_F16, SQ_INSTS_VALU_TRANS_F16, SQ_INSTS_VALU_FMA_F16, + SQ_INSTS_VALU_ADD_F32, SQ_INSTS_VALU_MUL_F32, SQ_INSTS_VALU_TRANS_F32, SQ_INSTS_VALU_FMA_F32, + 
SQ_INSTS_VALU_ADD_F64, SQ_INSTS_VALU_MUL_F64, SQ_INSTS_VALU_TRANS_F64, SQ_INSTS_VALU_FMA_F64, + SQ_INSTS_VALU_MFMA_MOPS_F16, SQ_INSTS_VALU_MFMA_MOPS_BF16, + SQ_INSTS_VALU_MFMA_MOPS_F32, SQ_INSTS_VALU_MFMA_MOPS_F64, + TCP_TOTAL_CACHE_ACCESSES_sum): + """ + L1 Arithmetic Intensity: ratio of floating-point operations to L1 cache bytes accessed (FLOP/byte) + + Formula: total_flops / l1_bytes + """ + # Calculate total FLOPS + fops = 64 * ( + ( + SQ_INSTS_VALU_ADD_F16 + + SQ_INSTS_VALU_MUL_F16 + + SQ_INSTS_VALU_TRANS_F16 + + SQ_INSTS_VALU_FMA_F16 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32 + + SQ_INSTS_VALU_TRANS_F32 + + SQ_INSTS_VALU_FMA_F32 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64 + + SQ_INSTS_VALU_TRANS_F64 + + SQ_INSTS_VALU_FMA_F64 * 2 + ) + ) + 512 * ( + SQ_INSTS_VALU_MFMA_MOPS_F16 + + SQ_INSTS_VALU_MFMA_MOPS_BF16 + + SQ_INSTS_VALU_MFMA_MOPS_F32 + + SQ_INSTS_VALU_MFMA_MOPS_F64 + ) + + # Calculate L1 bytes (L1 cache line is 128 bytes on gfx942) + l1_bytes = TCP_TOTAL_CACHE_ACCESSES_sum * 128 + + # Arithmetic intensity = FLOP / byte + ai_l1 = fops / l1_bytes if l1_bytes > 0 else 0.0 + + return ai_l1 \ No newline at end of file diff --git a/src/metrix/src/metrix/cli/info_cmd.py b/src/metrix/src/metrix/cli/info_cmd.py index 7c1b5944..55627a96 100644 --- a/src/metrix/src/metrix/cli/info_cmd.py +++ b/src/metrix/src/metrix/cli/info_cmd.py @@ -3,13 +3,16 @@ """ from ..metrics import METRIC_CATALOG, METRIC_PROFILES +from ..backends import get_backend, detect_or_default def info_command(args): """Execute info command""" if args.info_type == "metric": - show_metric_info(args.name) + # Get architecture from args if available, otherwise auto-detect + arch = getattr(args, 'arch', None) or detect_or_default() + show_metric_info(args.name, arch) elif args.info_type == "profile": show_profile_info(args.name) elif args.info_type == "counter": @@ -18,7 +21,7 @@ def info_command(args): return 0 -def show_metric_info(metric_name): +def 
show_metric_info(metric_name, arch="gfx942"): """Show detailed metric information""" if metric_name not in METRIC_CATALOG: @@ -37,9 +40,21 @@ def show_metric_info(metric_name): print(f"Unit: {metric_def['unit']}") print(f"Category: {metric_def['category'].value}") - print(f"\nRequired Hardware Counters:") - for counter in metric_def['derived_from']: - print(f" • {counter}") + # Show actual hardware counters from the backend (architecture-specific) + print(f"\nRequired Hardware Counters ({arch}):") + try: + backend = get_backend(arch) + actual_counters = backend.get_metric_counters(metric_name) + for counter in actual_counters: + print(f" • {counter}") + except ValueError as e: + # Metric not implemented in this backend + print(f" ⚠️ Metric not implemented for {arch}") + # Fall back to catalog's derived_from as documentation + if 'derived_from' in metric_def: + print(f"\n Conceptual counters (from catalog):") + for counter in metric_def['derived_from']: + print(f" • {counter}") if 'interpretation' in metric_def: print(f"\nInterpretation Guide:") diff --git a/src/metrix/src/metrix/metrics/catalog.py b/src/metrix/src/metrix/metrics/catalog.py index 61dcb0c7..ed0ec5d6 100644 --- a/src/metrix/src/metrix/metrics/catalog.py +++ b/src/metrix/src/metrix/metrics/catalog.py @@ -3,6 +3,7 @@ """ from .memory_metrics import MEMORY_METRICS +from .compute_metrics import COMPUTE_METRICS # ═══════════════════════════════════════════════════════════════════ # COMPLETE METRIC CATALOG @@ -10,7 +11,8 @@ METRIC_CATALOG = { **MEMORY_METRICS, - # Will add compute, occupancy, bottleneck metrics later + **COMPUTE_METRICS, + # Will add occupancy, bottleneck metrics later } # ═══════════════════════════════════════════════════════════════════ @@ -80,6 +82,23 @@ "memory.coalescing_efficiency", ], "estimated_passes": 1 + }, + + "compute": { + "description": "Compute and arithmetic intensity analysis", + "metrics": [ + "compute.total_flops", + "compute.hbm_gflops", + 
"compute.hbm_arithmetic_intensity", + "compute.l2_arithmetic_intensity", + "compute.l1_arithmetic_intensity", + ], + "estimated_passes": 3, + "focus": "compute_performance", + "typical_bottlenecks": [ + "low_arithmetic_intensity", + "memory_bound_kernel" + ] } } diff --git a/src/metrix/src/metrix/metrics/compute_metrics.py b/src/metrix/src/metrix/metrics/compute_metrics.py new file mode 100644 index 00000000..26942ec1 --- /dev/null +++ b/src/metrix/src/metrix/metrics/compute_metrics.py @@ -0,0 +1,320 @@ +""" +Compute-focused metric definitions (FLOPS, Arithmetic Intensity) +Based on Omnipilot's calculate_hbm_arithmetic_intensity() implementation + +NOTE: The `derived_from` field contains CONCEPTUAL counter names for documentation. +Actual hardware counter names vary by architecture (e.g., TCC_EA_* vs TCC_EA0_*). +For architecture-specific counter names, see the backend implementations in +metrix/backends/gfx942.py, gfx1201.py, etc. +""" + +from .categories import MetricCategory + +# ═══════════════════════════════════════════════════════════════════ +# COMPUTE THROUGHPUT METRICS +# ═══════════════════════════════════════════════════════════════════ + +COMPUTE_THROUGHPUT_METRICS = { + "compute.total_flops": { + "name": "Total FLOPS", + "description": "Total floating-point operations performed by the kernel", + "unit": "FLOPS", + "category": MetricCategory.COMPUTE, + # NOTE: HBM counters are architecture-specific: + # - MI300 (gfx942): TCC_EA0_RDREQ_sum, TCC_EA0_WRREQ_sum, etc. + # - MI200 (gfx90a): TCC_EA_RDREQ_sum, TCC_EA_WRREQ_sum, etc. 
+ "derived_from": [ + # FP16 instructions + "SQ_INSTS_VALU_ADD_F16", + "SQ_INSTS_VALU_MUL_F16", + "SQ_INSTS_VALU_TRANS_F16", + "SQ_INSTS_VALU_FMA_F16", + # FP32 instructions + "SQ_INSTS_VALU_ADD_F32", + "SQ_INSTS_VALU_MUL_F32", + "SQ_INSTS_VALU_TRANS_F32", + "SQ_INSTS_VALU_FMA_F32", + # FP64 instructions + "SQ_INSTS_VALU_ADD_F64", + "SQ_INSTS_VALU_MUL_F64", + "SQ_INSTS_VALU_TRANS_F64", + "SQ_INSTS_VALU_FMA_F64", + # MFMA instructions (Matrix FMA) + "SQ_INSTS_VALU_MFMA_MOPS_F16", + "SQ_INSTS_VALU_MFMA_MOPS_BF16", + "SQ_INSTS_VALU_MFMA_MOPS_F32", + "SQ_INSTS_VALU_MFMA_MOPS_F64", + ], + "formula": """ + # 64 operations per wave (wavefront size = 64) + # FMA counts as 2 operations (multiply + add) + # MFMA instructions produce 512 operations per instruction + + fops = 64 * ( + ( + SQ_INSTS_VALU_ADD_F16 + + SQ_INSTS_VALU_MUL_F16 + + SQ_INSTS_VALU_TRANS_F16 + + SQ_INSTS_VALU_FMA_F16 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32 + + SQ_INSTS_VALU_TRANS_F32 + + SQ_INSTS_VALU_FMA_F32 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64 + + SQ_INSTS_VALU_TRANS_F64 + + SQ_INSTS_VALU_FMA_F64 * 2 + ) + ) + 512 * ( + SQ_INSTS_VALU_MFMA_MOPS_F16 + + SQ_INSTS_VALU_MFMA_MOPS_BF16 + + SQ_INSTS_VALU_MFMA_MOPS_F32 + + SQ_INSTS_VALU_MFMA_MOPS_F64 + ) + + return fops + """ + }, + + "compute.hbm_gflops": { + "name": "HBM Compute Throughput", + "description": "Compute throughput (GFLOPS) normalized by kernel execution time", + "unit": "GFLOPS", + "category": MetricCategory.COMPUTE, + "derived_from": [ + # All FLOPS counters + "SQ_INSTS_VALU_ADD_F16", "SQ_INSTS_VALU_MUL_F16", "SQ_INSTS_VALU_TRANS_F16", "SQ_INSTS_VALU_FMA_F16", + "SQ_INSTS_VALU_ADD_F32", "SQ_INSTS_VALU_MUL_F32", "SQ_INSTS_VALU_TRANS_F32", "SQ_INSTS_VALU_FMA_F32", + "SQ_INSTS_VALU_ADD_F64", "SQ_INSTS_VALU_MUL_F64", "SQ_INSTS_VALU_TRANS_F64", "SQ_INSTS_VALU_FMA_F64", + "SQ_INSTS_VALU_MFMA_MOPS_F16", "SQ_INSTS_VALU_MFMA_MOPS_BF16", + "SQ_INSTS_VALU_MFMA_MOPS_F32", "SQ_INSTS_VALU_MFMA_MOPS_F64", + 
"GRBM_GUI_ACTIVE" + ], + "formula": """ + # Calculate total FLOPS (same as compute.total_flops) + fops = 64 * ( + ( + SQ_INSTS_VALU_ADD_F16 + + SQ_INSTS_VALU_MUL_F16 + + SQ_INSTS_VALU_TRANS_F16 + + SQ_INSTS_VALU_FMA_F16 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32 + + SQ_INSTS_VALU_TRANS_F32 + + SQ_INSTS_VALU_FMA_F32 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64 + + SQ_INSTS_VALU_TRANS_F64 + + SQ_INSTS_VALU_FMA_F64 * 2 + ) + ) + 512 * ( + SQ_INSTS_VALU_MFMA_MOPS_F16 + + SQ_INSTS_VALU_MFMA_MOPS_BF16 + + SQ_INSTS_VALU_MFMA_MOPS_F32 + + SQ_INSTS_VALU_MFMA_MOPS_F64 + ) + + # Convert to GFLOPS + time_seconds = GRBM_GUI_ACTIVE / (gpu_freq_mhz * 1e6) + gflops = (fops / 1e9) / time_seconds if time_seconds > 0 else 0 + + return gflops + """, + "device_specific": True + } +} + +# ═══════════════════════════════════════════════════════════════════ +# ARITHMETIC INTENSITY METRICS +# ═══════════════════════════════════════════════════════════════════ + +ARITHMETIC_INTENSITY_METRICS = { + "compute.hbm_arithmetic_intensity": { + "name": "HBM Arithmetic Intensity", + "description": "Ratio of floating-point operations to HBM bytes transferred (FLOP/byte)", + "unit": "FLOP/byte", + "category": MetricCategory.COMPUTE, + "derived_from": [ + # FLOPS counters (same across architectures) + "SQ_INSTS_VALU_ADD_F16", "SQ_INSTS_VALU_MUL_F16", "SQ_INSTS_VALU_TRANS_F16", "SQ_INSTS_VALU_FMA_F16", + "SQ_INSTS_VALU_ADD_F32", "SQ_INSTS_VALU_MUL_F32", "SQ_INSTS_VALU_TRANS_F32", "SQ_INSTS_VALU_FMA_F32", + "SQ_INSTS_VALU_ADD_F64", "SQ_INSTS_VALU_MUL_F64", "SQ_INSTS_VALU_TRANS_F64", "SQ_INSTS_VALU_FMA_F64", + "SQ_INSTS_VALU_MFMA_MOPS_F16", "SQ_INSTS_VALU_MFMA_MOPS_BF16", + "SQ_INSTS_VALU_MFMA_MOPS_F32", "SQ_INSTS_VALU_MFMA_MOPS_F64", + # HBM bandwidth counters - conceptual names (actual names vary by arch) + "TCC_EA_RDREQ_32B_sum", "TCC_EA_RDREQ_sum", "TCC_BUBBLE_sum", + "TCC_EA_WRREQ_64B_sum", "TCC_EA_WRREQ_sum", + ], + "formula": """ + # Calculate total FLOPS + 
fops = 64 * ( + ( + SQ_INSTS_VALU_ADD_F16 + + SQ_INSTS_VALU_MUL_F16 + + SQ_INSTS_VALU_TRANS_F16 + + SQ_INSTS_VALU_FMA_F16 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32 + + SQ_INSTS_VALU_TRANS_F32 + + SQ_INSTS_VALU_FMA_F32 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64 + + SQ_INSTS_VALU_TRANS_F64 + + SQ_INSTS_VALU_FMA_F64 * 2 + ) + ) + 512 * ( + SQ_INSTS_VALU_MFMA_MOPS_F16 + + SQ_INSTS_VALU_MFMA_MOPS_BF16 + + SQ_INSTS_VALU_MFMA_MOPS_F32 + + SQ_INSTS_VALU_MFMA_MOPS_F64 + ) + + # Calculate HBM bytes (with 32B/64B/128B distinction) + # Note: TCC_BUBBLE_sum counts 128B read requests on MI300 + hbm_rd = (TCC_BUBBLE_sum * 128 + + (TCC_EA_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA_RDREQ_32B_sum) * 64 + + TCC_EA_RDREQ_32B_sum * 32) + hbm_wr = (TCC_EA_WRREQ_64B_sum * 64 + + (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32) + hbm_bytes = hbm_rd + hbm_wr + + # Arithmetic intensity = FLOP / byte + ai_hbm = fops / hbm_bytes if hbm_bytes > 0 else 0 + + return ai_hbm + """, + "interpretation": { + "excellent": (10, float('inf'), "Compute bound - excellent FLOP/byte ratio"), + "good": (5, 10, "Good balance between compute and memory"), + "fair": (1, 5, "Memory bound - moderate FLOP/byte ratio"), + "poor": (0, 1, "Heavily memory bound - low FLOP/byte ratio") + } + }, + + "compute.l2_arithmetic_intensity": { + "name": "L2 Arithmetic Intensity", + "description": "Ratio of floating-point operations to L2 cache bytes accessed (FLOP/byte)", + "unit": "FLOP/byte", + "category": MetricCategory.COMPUTE, + "derived_from": [ + # FLOPS counters + "SQ_INSTS_VALU_ADD_F16", "SQ_INSTS_VALU_MUL_F16", "SQ_INSTS_VALU_TRANS_F16", "SQ_INSTS_VALU_FMA_F16", + "SQ_INSTS_VALU_ADD_F32", "SQ_INSTS_VALU_MUL_F32", "SQ_INSTS_VALU_TRANS_F32", "SQ_INSTS_VALU_FMA_F32", + "SQ_INSTS_VALU_ADD_F64", "SQ_INSTS_VALU_MUL_F64", "SQ_INSTS_VALU_TRANS_F64", "SQ_INSTS_VALU_FMA_F64", + "SQ_INSTS_VALU_MFMA_MOPS_F16", "SQ_INSTS_VALU_MFMA_MOPS_BF16", + "SQ_INSTS_VALU_MFMA_MOPS_F32", 
"SQ_INSTS_VALU_MFMA_MOPS_F64", + # L2 cache counters + "TCC_REQ_sum", + ], + "formula": """ + # Calculate total FLOPS + fops = 64 * ( + ( + SQ_INSTS_VALU_ADD_F16 + + SQ_INSTS_VALU_MUL_F16 + + SQ_INSTS_VALU_TRANS_F16 + + SQ_INSTS_VALU_FMA_F16 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32 + + SQ_INSTS_VALU_TRANS_F32 + + SQ_INSTS_VALU_FMA_F32 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64 + + SQ_INSTS_VALU_TRANS_F64 + + SQ_INSTS_VALU_FMA_F64 * 2 + ) + ) + 512 * ( + SQ_INSTS_VALU_MFMA_MOPS_F16 + + SQ_INSTS_VALU_MFMA_MOPS_BF16 + + SQ_INSTS_VALU_MFMA_MOPS_F32 + + SQ_INSTS_VALU_MFMA_MOPS_F64 + ) + + # Calculate L2 bytes (L2 cache line is 128 bytes) + l2_bytes = TCC_REQ_sum * 128 + + # Arithmetic intensity = FLOP / byte + ai_l2 = fops / l2_bytes if l2_bytes > 0 else 0 + + return ai_l2 + """ + }, + + "compute.l1_arithmetic_intensity": { + "name": "L1 Arithmetic Intensity", + "description": "Ratio of floating-point operations to L1 cache bytes accessed (FLOP/byte)", + "unit": "FLOP/byte", + "category": MetricCategory.COMPUTE, + "derived_from": [ + # FLOPS counters + "SQ_INSTS_VALU_ADD_F16", "SQ_INSTS_VALU_MUL_F16", "SQ_INSTS_VALU_TRANS_F16", "SQ_INSTS_VALU_FMA_F16", + "SQ_INSTS_VALU_ADD_F32", "SQ_INSTS_VALU_MUL_F32", "SQ_INSTS_VALU_TRANS_F32", "SQ_INSTS_VALU_FMA_F32", + "SQ_INSTS_VALU_ADD_F64", "SQ_INSTS_VALU_MUL_F64", "SQ_INSTS_VALU_TRANS_F64", "SQ_INSTS_VALU_FMA_F64", + "SQ_INSTS_VALU_MFMA_MOPS_F16", "SQ_INSTS_VALU_MFMA_MOPS_BF16", + "SQ_INSTS_VALU_MFMA_MOPS_F32", "SQ_INSTS_VALU_MFMA_MOPS_F64", + # L1 cache counters + "TCP_TOTAL_CACHE_ACCESSES_sum", + ], + "formula": """ + # Calculate total FLOPS + fops = 64 * ( + ( + SQ_INSTS_VALU_ADD_F16 + + SQ_INSTS_VALU_MUL_F16 + + SQ_INSTS_VALU_TRANS_F16 + + SQ_INSTS_VALU_FMA_F16 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32 + + SQ_INSTS_VALU_TRANS_F32 + + SQ_INSTS_VALU_FMA_F32 * 2 + ) + + ( + SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64 + + SQ_INSTS_VALU_TRANS_F64 + + 
SQ_INSTS_VALU_FMA_F64 * 2 + ) + ) + 512 * ( + SQ_INSTS_VALU_MFMA_MOPS_F16 + + SQ_INSTS_VALU_MFMA_MOPS_BF16 + + SQ_INSTS_VALU_MFMA_MOPS_F32 + + SQ_INSTS_VALU_MFMA_MOPS_F64 + ) + + # Calculate L1 bytes (L1 cache line is 128 bytes on gfx942) + l1_bytes = TCP_TOTAL_CACHE_ACCESSES_sum * 128 + + # Arithmetic intensity = FLOP / byte + ai_l1 = fops / l1_bytes if l1_bytes > 0 else 0 + + return ai_l1 + """ + } +} + +# ═══════════════════════════════════════════════════════════════════ +# COMBINED COMPUTE METRIC CATALOG +# ═══════════════════════════════════════════════════════════════════ + +COMPUTE_METRICS = { + **COMPUTE_THROUGHPUT_METRICS, + **ARITHMETIC_INTENSITY_METRICS +} diff --git a/src/metrix/src/metrix/metrics/memory_metrics.py b/src/metrix/src/metrix/metrics/memory_metrics.py index 0b3178cc..890d9008 100644 --- a/src/metrix/src/metrix/metrics/memory_metrics.py +++ b/src/metrix/src/metrix/metrics/memory_metrics.py @@ -1,6 +1,11 @@ """ Memory-focused metric definitions Top-down approach: Define what we want to know, not how to measure it + +NOTE: The `derived_from` field contains CONCEPTUAL counter names for documentation. +Actual hardware counter names vary by architecture (e.g., TCC_EA_* vs TCC_EA0_*). +For architecture-specific counter names, see the backend implementations in +metrix/backends/gfx942.py, gfx1201.py, etc. """ from .categories import MetricCategory @@ -15,6 +20,9 @@ "description": "Percentage of peak HBM (High Bandwidth Memory) bandwidth utilized", "unit": "percent", "category": MetricCategory.MEMORY_BANDWIDTH, + # NOTE: These are conceptual counter names. 
Actual names vary by architecture: + # - MI300 (gfx942): TCC_EA0_RDREQ_sum, TCC_EA0_WRREQ_sum + # - MI200 (gfx90a): TCC_EA_RDREQ_sum, TCC_EA_WRREQ_sum "derived_from": [ "TCC_EA_RDREQ_sum", # Read requests to memory controller "TCC_EA_WRREQ_sum", # Write requests to memory controller @@ -84,6 +92,30 @@ "formula": """ return (TCC_EA_RDREQ_sum + TCC_EA_WRREQ_sum) * 64 """ + }, + + "memory.bytes_transferred_l2": { + "name": "Total L2 Bytes Transferred", + "description": "Total bytes accessed through L2 cache", + "unit": "bytes", + "category": MetricCategory.MEMORY_BANDWIDTH, + "derived_from": ["TCC_REQ_sum"], + "formula": """ + # L2 cache line is 128 bytes + return TCC_REQ_sum * 128 + """ + }, + + "memory.bytes_transferred_l1": { + "name": "Total L1 Bytes Transferred", + "description": "Total bytes accessed through L1 cache", + "unit": "bytes", + "category": MetricCategory.MEMORY_BANDWIDTH, + "derived_from": ["TCP_TOTAL_CACHE_ACCESSES_sum"], + "formula": """ + # L1 cache line is 128 bytes on gfx942 + return TCP_TOTAL_CACHE_ACCESSES_sum * 128 + """ } } diff --git a/src/metrix/tests/integration/test_all_metrics_displayed.py b/src/metrix/tests/integration/test_all_metrics_displayed.py index 31f06d7d..a080b22b 100644 --- a/src/metrix/tests/integration/test_all_metrics_displayed.py +++ b/src/metrix/tests/integration/test_all_metrics_displayed.py @@ -1,8 +1,8 @@ """ -Integration test to verify all 11 metrics are displayed +Integration test to verify all metrics are displayed -This test specifically validates the bug fix where only 7 out of 11 metrics -were being displayed due to MetricComputer receiving wrong parameter type. +This test validates that all memory and compute metrics are properly +computed and displayed by the metrix profiler. 
""" import pytest @@ -35,10 +35,10 @@ def vector_add_binary(tmp_path): @pytest.mark.integration @pytest.mark.timeout(60) -def test_all_11_metrics_are_displayed(vector_add_binary): - """Verify that all 11 metrics are computed and displayed""" +def test_all_memory_metrics_are_displayed(vector_add_binary): + """Verify that all 12 memory metrics are computed and displayed""" result = subprocess.run( - ["metrix", "-n", "1", "--aggregate", str(vector_add_binary)], + ["metrix", "-n", "1", "--aggregate", "--profile", "memory", str(vector_add_binary)], capture_output=True, text=True, timeout=60, @@ -47,8 +47,8 @@ def test_all_11_metrics_are_displayed(vector_add_binary): assert result.returncode == 0, f"stderr: {result.stderr}" output = result.stdout - # List of all 11 expected metrics (friendly names as displayed) - expected_metrics = [ + # List of all expected memory metrics (friendly names as displayed) + expected_memory_metrics = [ # Memory Bandwidth (5 metrics) "HBM Read Bandwidth", "HBM Write Bandwidth", @@ -67,15 +67,15 @@ def test_all_11_metrics_are_displayed(vector_add_binary): ] missing_metrics = [] - for metric in expected_metrics: + for metric in expected_memory_metrics: if metric not in output: missing_metrics.append(metric) assert ( len(missing_metrics) == 0 - ), f"Missing metrics: {missing_metrics}\n\nOutput:\n{output}" + ), f"Missing memory metrics: {missing_metrics}\n\nOutput:\n{output}" - print(f"✓ All {len(expected_metrics)} metrics displayed successfully") + print(f"✓ All {len(expected_memory_metrics)} memory metrics displayed successfully") @pytest.mark.integration @@ -83,7 +83,7 @@ def test_all_11_metrics_are_displayed(vector_add_binary): def test_bandwidth_metrics_have_values(vector_add_binary): """Verify bandwidth metrics compute to non-zero values""" result = subprocess.run( - ["metrix", "-n", "1", "--aggregate", "--verbose", str(vector_add_binary)], + ["metrix", "-n", "1", "--aggregate", str(vector_add_binary)], capture_output=True, text=True, 
timeout=60, @@ -109,10 +109,45 @@ def test_bandwidth_metrics_have_values(vector_add_binary): assert "0.00 percent" not in line, "HBM Bandwidth Utilization is zero!" +@pytest.mark.integration +@pytest.mark.timeout(120) +def test_all_compute_metrics_are_displayed(vector_add_binary): + """Verify that all compute metrics are computed and displayed""" + result = subprocess.run( + ["metrix", "-n", "1", "--aggregate", "--profile", "compute", str(vector_add_binary)], + capture_output=True, + text=True, + timeout=120, + ) + + assert result.returncode == 0, f"stderr: {result.stderr}" + output = result.stdout + + # List of all expected compute metrics (friendly names as displayed) + expected_compute_metrics = [ + "Total FLOPS", + "HBM Compute Throughput", + "HBM Arithmetic Intensity", + "L2 Arithmetic Intensity", + "L1 Arithmetic Intensity", + ] + + missing_metrics = [] + for metric in expected_compute_metrics: + if metric not in output: + missing_metrics.append(metric) + + assert ( + len(missing_metrics) == 0 + ), f"Missing compute metrics: {missing_metrics}\n\nOutput:\n{output}" + + print(f"✓ All {len(expected_compute_metrics)} compute metrics displayed successfully") + + @pytest.mark.integration @pytest.mark.timeout(60) -def test_json_output_has_all_metrics(vector_add_binary, tmp_path): - """Verify JSON output contains all 11 metrics""" +def test_json_output_has_memory_metrics(vector_add_binary, tmp_path): + """Verify JSON output contains all memory metrics""" output_file = tmp_path / "results.json" result = subprocess.run( @@ -121,6 +156,8 @@ def test_json_output_has_all_metrics(vector_add_binary, tmp_path): "-n", "1", "--aggregate", + "--profile", + "memory", "-o", str(output_file), str(vector_add_binary), @@ -148,14 +185,57 @@ def test_json_output_has_all_metrics(vector_add_binary, tmp_path): assert "duration_us" in kernel_data assert "metrics" in kernel_data - # Count metrics - num_metrics = len(kernel_data["metrics"]) - assert ( - num_metrics == 12 - ), f"Expected 12 
metrics, got {num_metrics}: {list(kernel_data['metrics'].keys())}" - - # Verify the 4 bandwidth metrics that were previously failing + # Verify key memory bandwidth metrics assert "memory.hbm_bandwidth_utilization" in kernel_data["metrics"] assert "memory.hbm_read_bandwidth" in kernel_data["metrics"] assert "memory.hbm_write_bandwidth" in kernel_data["metrics"] assert "memory.l2_bandwidth" in kernel_data["metrics"] + + +@pytest.mark.integration +@pytest.mark.timeout(120) +def test_json_output_has_compute_metrics(vector_add_binary, tmp_path): + """Verify JSON output contains all compute metrics""" + output_file = tmp_path / "results.json" + + result = subprocess.run( + [ + "metrix", + "-n", + "1", + "--aggregate", + "--profile", + "compute", + "-o", + str(output_file), + str(vector_add_binary), + ], + capture_output=True, + text=True, + timeout=120, + ) + + assert result.returncode == 0, f"stderr: {result.stderr}" + assert output_file.exists() + + import json + + with open(output_file) as f: + data = json.load(f) + + # Check structure + assert len(data) > 0, "No kernels in JSON output" + + # Get first kernel/dispatch + first_key = list(data.keys())[0] + kernel_data = data[first_key] + + assert "duration_us" in kernel_data + assert "metrics" in kernel_data + + # Verify compute metrics are present + assert "compute.total_flops" in kernel_data["metrics"] + assert "compute.hbm_gflops" in kernel_data["metrics"] + assert "compute.hbm_arithmetic_intensity" in kernel_data["metrics"] + assert "compute.l2_arithmetic_intensity" in kernel_data["metrics"] + assert "compute.l1_arithmetic_intensity" in kernel_data["metrics"] \ No newline at end of file diff --git a/src/metrix/tests/integration/test_cli_integration.py b/src/metrix/tests/integration/test_cli_integration.py index 4c3b173b..330a6fbb 100644 --- a/src/metrix/tests/integration/test_cli_integration.py +++ b/src/metrix/tests/integration/test_cli_integration.py @@ -36,7 +36,7 @@ def test_cli_time_only_aggregated(): 
"metrix", "profile", "--time-only", - "--runs", + "--num-replays", "3", "--aggregate", str(VECTOR_ADD), @@ -103,3 +103,73 @@ def test_cli_list_metrics(): assert result.returncode == 0 assert "memory.l2_hit_rate" in result.stdout + + +def test_cli_list_metrics_includes_compute(): + """Test that metrix list metrics includes compute metrics""" + result = subprocess.run( + ["metrix", "list", "metrics"], capture_output=True, text=True, timeout=5 + ) + + assert result.returncode == 0 + assert "compute.total_flops" in result.stdout + assert "compute.hbm_arithmetic_intensity" in result.stdout + + +def test_cli_list_profiles_includes_compute(): + """Test that metrix list profiles includes compute profile""" + result = subprocess.run( + ["metrix", "list", "profiles"], capture_output=True, text=True, timeout=5 + ) + + assert result.returncode == 0 + assert "COMPUTE" in result.stdout + + +@pytest.mark.timeout(120) +@pytest.mark.skipif(not VECTOR_ADD.exists(), reason="vector_add not compiled") +def test_cli_compute_profile(): + """Test metrix profile --profile compute""" + result = subprocess.run( + [ + "metrix", + "profile", + "--profile", + "compute", + "-n", + "1", + "--aggregate", + str(VECTOR_ADD), + ], + capture_output=True, + text=True, + timeout=120, + ) + + assert result.returncode == 0, f"Command failed: {result.stderr}" + assert "vector_add" in result.stdout + # Compute profile should show compute metrics + assert "COMPUTE" in result.stdout or "Total FLOPS" in result.stdout or "Arithmetic Intensity" in result.stdout + + +@pytest.mark.timeout(120) +@pytest.mark.skipif(not VECTOR_ADD.exists(), reason="vector_add not compiled") +def test_cli_compute_metric_directly(): + """Test metrix --metrics compute.total_flops""" + result = subprocess.run( + [ + "metrix", + "--metrics", + "compute.total_flops", + "-n", + "1", + str(VECTOR_ADD), + ], + capture_output=True, + text=True, + timeout=120, + ) + + assert result.returncode == 0, f"Command failed: {result.stderr}" + assert 
"vector_add" in result.stdout + assert "Total FLOPS" in result.stdout or "FLOPS" in result.stdout \ No newline at end of file diff --git a/src/metrix/tests/unit/backends/test_gfx942_metrics.py b/src/metrix/tests/unit/backends/test_gfx942_metrics.py index 44098f1f..7cbbe5de 100644 --- a/src/metrix/tests/unit/backends/test_gfx942_metrics.py +++ b/src/metrix/tests/unit/backends/test_gfx942_metrics.py @@ -146,40 +146,73 @@ def test_no_lds_instructions(self): class TestBandwidthMetrics: - """Test HBM bandwidth computations""" + """Test HBM bandwidth computations with 32B/64B/128B request granularity""" - def test_hbm_read_bandwidth(self): - """Test read bandwidth calculation""" + def test_hbm_read_bandwidth_64b_only(self): + """Test read bandwidth with only 64B requests""" backend = GFX942Backend() backend._raw_data = { - 'TCC_EA0_RDREQ_sum': 1000, - 'TCC_EA1_RDREQ_sum': 1000, - 'GRBM_GUI_ACTIVE': 2100000 # 1 ms at 2.1 GHz + 'TCC_EA0_RDREQ_sum': 1000, # Total read requests + 'TCC_EA0_RDREQ_32B_sum': 0, # No 32B requests + 'TCC_BUBBLE_sum': 0, # No 128B requests + 'GRBM_GUI_ACTIVE': 2100000 # 1 ms at 2.1 GHz + } + + result = backend._hbm_read_bandwidth() + # (1000 * 64 bytes) / 0.001 seconds = 64 MB/s = 0.064 GB/s + assert 0.06 < result < 0.07 + + def test_hbm_read_bandwidth_mixed_sizes(self): + """Test read bandwidth with mixed request sizes""" + backend = GFX942Backend() + backend._raw_data = { + 'TCC_EA0_RDREQ_sum': 1000, # Total requests + 'TCC_EA0_RDREQ_32B_sum': 200, # 200 × 32B = 6400 bytes + 'TCC_BUBBLE_sum': 300, # 300 × 128B = 38400 bytes + # Remaining: 1000 - 200 - 300 = 500 × 64B = 32000 bytes + # Total: 6400 + 38400 + 32000 = 76800 bytes + 'GRBM_GUI_ACTIVE': 2100000 # 1 ms at 2.1 GHz } result = backend._hbm_read_bandwidth() - # (2000 requests * 64 bytes) / 0.001 seconds = 128 MB/s = 0.128 GB/s - assert 0.1 < result < 0.2 + # 76800 / 1e9 / 0.001 = 0.0768 GB/s + assert 0.07 < result < 0.08 + + def test_hbm_write_bandwidth_64b_only(self): + """Test write 
bandwidth with only 64B requests""" + backend = GFX942Backend() + backend._raw_data = { + 'TCC_EA0_WRREQ_sum': 1000, # Total write requests + 'TCC_EA0_WRREQ_64B_sum': 1000, # All are 64B + 'GRBM_GUI_ACTIVE': 2100000 # 1 ms at 2.1 GHz + } + + result = backend._hbm_write_bandwidth() + # (1000 * 64 bytes) / 0.001 seconds = 64 MB/s = 0.064 GB/s + assert 0.06 < result < 0.07 - def test_hbm_write_bandwidth(self): - """Test write bandwidth calculation""" + def test_hbm_write_bandwidth_mixed_sizes(self): + """Test write bandwidth with mixed 32B and 64B requests""" backend = GFX942Backend() backend._raw_data = { - 'TCC_EA0_WRREQ_sum': 500, - 'TCC_EA1_WRREQ_sum': 500, - 'GRBM_GUI_ACTIVE': 2100000 # 1 ms at 2.1 GHz + 'TCC_EA0_WRREQ_sum': 1000, # Total write requests + 'TCC_EA0_WRREQ_64B_sum': 600, # 600 × 64B = 38400 bytes + # Remaining: 1000 - 600 = 400 × 32B = 12800 bytes + # Total: 38400 + 12800 = 51200 bytes + 'GRBM_GUI_ACTIVE': 2100000 # 1 ms at 2.1 GHz } result = backend._hbm_write_bandwidth() - # (1000 requests * 64 bytes) / 0.001 seconds = 64 MB/s = 0.064 GB/s - assert 0.05 < result < 0.1 + # 51200 / 1e9 / 0.001 = 0.0512 GB/s + assert 0.05 < result < 0.06 def test_zero_active_cycles(self): """Handle zero active cycles""" backend = GFX942Backend() backend._raw_data = { 'TCC_EA0_RDREQ_sum': 1000, - 'TCC_EA1_RDREQ_sum': 1000, + 'TCC_EA0_RDREQ_32B_sum': 0, + 'TCC_BUBBLE_sum': 0, 'GRBM_GUI_ACTIVE': 0 } @@ -188,36 +221,38 @@ def test_zero_active_cycles(self): class TestAtomicLatency: - """Test atomic operation latency computation""" + """Test L2 cache atomic operation latency computation""" def test_low_latency(self): """10 cycles per atomic operation""" backend = GFX942Backend() backend._raw_data = { - 'SQ_INSTS_GDS': 1000, - 'GDS_BUSY': 10000 + 'TCC_EA0_ATOMIC_sum': 1000, # 1000 atomic operations + 'TCC_EA0_ATOMIC_LEVEL_sum': 10000 # 10000 total cycles } result = backend._atomic_latency() + # 10000 / 1000 = 10 cycles per atomic assert result == 10.0 def 
test_high_latency(self): """1000 cycles per atomic (contention)""" backend = GFX942Backend() backend._raw_data = { - 'SQ_INSTS_GDS': 100, - 'GDS_BUSY': 100000 + 'TCC_EA0_ATOMIC_sum': 100, # 100 atomic operations + 'TCC_EA0_ATOMIC_LEVEL_sum': 100000 # 100000 total cycles } result = backend._atomic_latency() + # 100000 / 100 = 1000 cycles per atomic assert result == 1000.0 def test_no_atomics(self): """Handle zero atomic instructions""" backend = GFX942Backend() backend._raw_data = { - 'SQ_INSTS_GDS': 0, - 'GDS_BUSY': 5000 + 'TCC_EA0_ATOMIC_sum': 0, + 'TCC_EA0_ATOMIC_LEVEL_sum': 5000 } result = backend._atomic_latency() @@ -251,3 +286,205 @@ def test_get_required_counters(self): assert "TCC_MISS_sum" in counters assert len(counters) == 2 + def test_discovers_compute_metrics(self): + """Backend should discover all compute metrics""" + backend = GFX942Backend() + + metrics = backend.get_available_metrics() + + assert "compute.total_flops" in metrics + assert "compute.hbm_gflops" in metrics + assert "compute.hbm_arithmetic_intensity" in metrics + assert "compute.l2_arithmetic_intensity" in metrics + assert "compute.l1_arithmetic_intensity" in metrics + + +class TestComputeMetrics: + """Test compute metric computations (FLOPS, arithmetic intensity)""" + + def _get_zero_flops_counters(self): + """Helper: return counter dict with all FLOPS counters set to 0""" + return { + 'SQ_INSTS_VALU_ADD_F16': 0, 'SQ_INSTS_VALU_MUL_F16': 0, + 'SQ_INSTS_VALU_TRANS_F16': 0, 'SQ_INSTS_VALU_FMA_F16': 0, + 'SQ_INSTS_VALU_ADD_F32': 0, 'SQ_INSTS_VALU_MUL_F32': 0, + 'SQ_INSTS_VALU_TRANS_F32': 0, 'SQ_INSTS_VALU_FMA_F32': 0, + 'SQ_INSTS_VALU_ADD_F64': 0, 'SQ_INSTS_VALU_MUL_F64': 0, + 'SQ_INSTS_VALU_TRANS_F64': 0, 'SQ_INSTS_VALU_FMA_F64': 0, + 'SQ_INSTS_VALU_MFMA_MOPS_F16': 0, 'SQ_INSTS_VALU_MFMA_MOPS_BF16': 0, + 'SQ_INSTS_VALU_MFMA_MOPS_F32': 0, 'SQ_INSTS_VALU_MFMA_MOPS_F64': 0, + } + + def test_total_flops_fp32_add(self): + """Test FLOPS calculation with FP32 add instructions""" + backend = 
GFX942Backend() + backend._raw_data = self._get_zero_flops_counters() + backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 100 + + result = backend._total_flops() + # 64 threads per wave * 100 instructions = 6400 FLOPS + assert result == 6400 + + def test_total_flops_fma_counts_double(self): + """Test that FMA instructions count as 2 operations""" + backend = GFX942Backend() + backend._raw_data = self._get_zero_flops_counters() + backend._raw_data['SQ_INSTS_VALU_FMA_F32'] = 100 + + result = backend._total_flops() + # 64 threads * 100 FMA * 2 ops = 12800 FLOPS + assert result == 12800 + + def test_total_flops_mfma_high_throughput(self): + """Test MFMA instructions produce 512 operations each""" + backend = GFX942Backend() + backend._raw_data = self._get_zero_flops_counters() + backend._raw_data['SQ_INSTS_VALU_MFMA_MOPS_F32'] = 10 + + result = backend._total_flops() + # 512 ops * 10 instructions = 5120 FLOPS + assert result == 5120 + + def test_total_flops_mixed_precision(self): + """Test FLOPS with mixed precision operations""" + backend = GFX942Backend() + backend._raw_data = self._get_zero_flops_counters() + backend._raw_data['SQ_INSTS_VALU_ADD_F16'] = 100 # 6400 FLOPS + backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 50 # 3200 FLOPS + backend._raw_data['SQ_INSTS_VALU_ADD_F64'] = 25 # 1600 FLOPS + + result = backend._total_flops() + assert result == 6400 + 3200 + 1600 + + def test_total_flops_zero(self): + """Handle zero FLOPS gracefully""" + backend = GFX942Backend() + backend._raw_data = self._get_zero_flops_counters() + + result = backend._total_flops() + assert result == 0 + + def test_hbm_gflops_calculation(self): + """Test GFLOPS calculation with timing""" + backend = GFX942Backend() + backend._raw_data = self._get_zero_flops_counters() + backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 1000000 # 64M FLOPS + backend._raw_data['GRBM_GUI_ACTIVE'] = 2100000 # 1 ms at 2.1 GHz + + result = backend._hbm_gflops() + # 64M FLOPS / 0.001 seconds = 64 GFLOPS + assert 60 < result < 
70 + + def test_hbm_gflops_zero_time(self): + """Handle zero active cycles""" + backend = GFX942Backend() + backend._raw_data = self._get_zero_flops_counters() + backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 1000 + backend._raw_data['GRBM_GUI_ACTIVE'] = 0 + + result = backend._hbm_gflops() + assert result == 0.0 + + def test_hbm_arithmetic_intensity(self): + """Test HBM arithmetic intensity calculation""" + backend = GFX942Backend() + backend._raw_data = self._get_zero_flops_counters() + backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 1000 # 64000 FLOPS + # HBM counters: simple case with only 64B reads + backend._raw_data['TCC_EA0_RDREQ_sum'] = 1000 + backend._raw_data['TCC_EA0_RDREQ_32B_sum'] = 0 + backend._raw_data['TCC_BUBBLE_sum'] = 0 + backend._raw_data['TCC_EA0_WRREQ_sum'] = 0 + backend._raw_data['TCC_EA0_WRREQ_64B_sum'] = 0 + + result = backend._hbm_arithmetic_intensity() + # 64000 FLOPS / (1000 * 64 bytes) = 64000 / 64000 = 1.0 FLOP/byte + assert result == 1.0 + + def test_hbm_arithmetic_intensity_zero_bytes(self): + """Handle zero HBM bytes transferred""" + backend = GFX942Backend() + backend._raw_data = self._get_zero_flops_counters() + backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 1000 + backend._raw_data['TCC_EA0_RDREQ_sum'] = 0 + backend._raw_data['TCC_EA0_RDREQ_32B_sum'] = 0 + backend._raw_data['TCC_BUBBLE_sum'] = 0 + backend._raw_data['TCC_EA0_WRREQ_sum'] = 0 + backend._raw_data['TCC_EA0_WRREQ_64B_sum'] = 0 + + result = backend._hbm_arithmetic_intensity() + assert result == 0.0 + + def test_l2_arithmetic_intensity(self): + """Test L2 arithmetic intensity calculation""" + backend = GFX942Backend() + backend._raw_data = self._get_zero_flops_counters() + backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 1000 # 64000 FLOPS + backend._raw_data['TCC_REQ_sum'] = 500 # 500 * 128 = 64000 bytes + + result = backend._l2_arithmetic_intensity() + # 64000 FLOPS / 64000 bytes = 1.0 FLOP/byte + assert result == 1.0 + + def test_l2_arithmetic_intensity_zero_bytes(self): + 
"""Handle zero L2 bytes""" + backend = GFX942Backend() + backend._raw_data = self._get_zero_flops_counters() + backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 1000 + backend._raw_data['TCC_REQ_sum'] = 0 + + result = backend._l2_arithmetic_intensity() + assert result == 0.0 + + def test_l1_arithmetic_intensity(self): + """Test L1 arithmetic intensity calculation""" + backend = GFX942Backend() + backend._raw_data = self._get_zero_flops_counters() + backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 1000 # 64000 FLOPS + backend._raw_data['TCP_TOTAL_CACHE_ACCESSES_sum'] = 500 # 500 * 128 = 64000 bytes + + result = backend._l1_arithmetic_intensity() + # 64000 FLOPS / 64000 bytes = 1.0 FLOP/byte + assert result == 1.0 + + def test_l1_arithmetic_intensity_zero_bytes(self): + """Handle zero L1 bytes""" + backend = GFX942Backend() + backend._raw_data = self._get_zero_flops_counters() + backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 1000 + backend._raw_data['TCP_TOTAL_CACHE_ACCESSES_sum'] = 0 + + result = backend._l1_arithmetic_intensity() + assert result == 0.0 + + def test_high_arithmetic_intensity_compute_bound(self): + """Test high AI indicates compute-bound kernel""" + backend = GFX942Backend() + backend._raw_data = self._get_zero_flops_counters() + # Lots of compute, little memory + backend._raw_data['SQ_INSTS_VALU_MFMA_MOPS_F32'] = 1000 # 512000 FLOPS + backend._raw_data['TCC_EA0_RDREQ_sum'] = 100 # 6400 bytes + backend._raw_data['TCC_EA0_RDREQ_32B_sum'] = 0 + backend._raw_data['TCC_BUBBLE_sum'] = 0 + backend._raw_data['TCC_EA0_WRREQ_sum'] = 0 + backend._raw_data['TCC_EA0_WRREQ_64B_sum'] = 0 + + result = backend._hbm_arithmetic_intensity() + # 512000 / 6400 = 80 FLOP/byte (very compute-bound) + assert result == 80.0 + + def test_low_arithmetic_intensity_memory_bound(self): + """Test low AI indicates memory-bound kernel""" + backend = GFX942Backend() + backend._raw_data = self._get_zero_flops_counters() + # Little compute, lots of memory + 
backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 100 # 6400 FLOPS + backend._raw_data['TCC_EA0_RDREQ_sum'] = 10000 # 640000 bytes + backend._raw_data['TCC_EA0_RDREQ_32B_sum'] = 0 + backend._raw_data['TCC_BUBBLE_sum'] = 0 + backend._raw_data['TCC_EA0_WRREQ_sum'] = 0 + backend._raw_data['TCC_EA0_WRREQ_64B_sum'] = 0 + + result = backend._hbm_arithmetic_intensity() + # 6400 / 640000 = 0.01 FLOP/byte (very memory-bound) + assert result == 0.01 diff --git a/src/metrix/tests/unit/test_api.py b/src/metrix/tests/unit/test_api.py index 1448f887..8eed2427 100644 --- a/src/metrix/tests/unit/test_api.py +++ b/src/metrix/tests/unit/test_api.py @@ -14,14 +14,13 @@ def test_init_default(self): """Test default initialization""" profiler = Metrix() assert profiler.arch == "gfx942" - assert profiler.verbose == False assert profiler.backend is not None def test_init_custom_arch(self): """Test custom architecture""" - profiler = Metrix(arch="gfx942", verbose=True) + profiler = Metrix(arch="gfx942") assert profiler.arch == "gfx942" - assert profiler.verbose == True + assert profiler.backend is not None class TestMetrixMetricListing: @@ -35,6 +34,16 @@ def test_list_metrics(self): assert "memory.l2_hit_rate" in metrics assert "memory.hbm_bandwidth_utilization" in metrics + def test_list_metrics_includes_compute(self): + """Test that compute metrics are included in list""" + profiler = Metrix() + metrics = profiler.list_metrics() + assert "compute.total_flops" in metrics + assert "compute.hbm_gflops" in metrics + assert "compute.hbm_arithmetic_intensity" in metrics + assert "compute.l2_arithmetic_intensity" in metrics + assert "compute.l1_arithmetic_intensity" in metrics + def test_list_profiles(self): """Test listing profiles""" profiler = Metrix() @@ -42,6 +51,12 @@ def test_list_profiles(self): assert "quick" in profiles assert "memory" in profiles + def test_list_profiles_includes_compute(self): + """Test that compute profile is included""" + profiler = Metrix() + profiles = 
profiler.list_profiles() + assert "compute" in profiles + def test_get_metric_info(self): """Test getting metric information""" profiler = Metrix() @@ -49,6 +64,20 @@ def test_get_metric_info(self): assert info['name'] == "L2 Cache Hit Rate" assert info['unit'] == "percent" + def test_get_compute_metric_info(self): + """Test getting compute metric information""" + profiler = Metrix() + info = profiler.get_metric_info("compute.total_flops") + assert info['name'] == "Total FLOPS" + assert info['unit'] == "FLOPS" + + def test_get_arithmetic_intensity_info(self): + """Test getting arithmetic intensity metric information""" + profiler = Metrix() + info = profiler.get_metric_info("compute.hbm_arithmetic_intensity") + assert info['name'] == "HBM Arithmetic Intensity" + assert info['unit'] == "FLOP/byte" + def test_get_unknown_metric_raises(self): """Test getting info for unknown metric raises error""" profiler = Metrix()