diff --git a/src/metrix/README.md b/src/metrix/README.md
index aa79044c..a3e6ed9f 100644
--- a/src/metrix/README.md
+++ b/src/metrix/README.md
@@ -17,6 +17,7 @@ Existing GPU profilers are **trash**:
 - **Human-readable metrics** instead of raw counters
 - **Unit tested** and reliable
 - **12 Memory Metrics**: Bandwidth, cache, coalescing, LDS, atomic latency
+- **5 Compute Metrics**: FLOPS, arithmetic intensity (HBM/L2/L1), compute throughput
 - **Multi-Run Profiling**: Automatic aggregation with min/max/avg statistics
 - **Kernel Filtering**: Efficient regex filtering at rocprofv3 level
 - **Multiple Output Formats**: Text, JSON, CSV
@@ -68,6 +69,8 @@ for kernel in results.kernels:
 - `memory.hbm_write_bandwidth` - HBM write bandwidth (GB/s)
 - `memory.hbm_bandwidth_utilization` - % of peak HBM bandwidth
 - `memory.bytes_transferred_hbm` - Total bytes through HBM
+- `memory.bytes_transferred_l2` - Total bytes through L2 cache
+- `memory.bytes_transferred_l1` - Total bytes through L1 cache
 
 ### Cache Performance
 - `memory.l1_hit_rate` - L1 cache hit rate (%)
@@ -85,13 +88,20 @@ for kernel in results.kernels:
 ### Atomic Operations
 - `memory.atomic_latency` - Atomic operation latency (cycles)
 
+### Compute Metrics
+- `compute.total_flops` - Total floating-point operations performed
+- `compute.hbm_gflops` - Compute throughput: total FLOPs divided by kernel execution time (GFLOPS)
+- `compute.hbm_arithmetic_intensity` - Ratio of FLOPs to HBM bytes (FLOP/byte)
+- `compute.l2_arithmetic_intensity` - Ratio of FLOPs to L2 bytes (FLOP/byte)
+- `compute.l1_arithmetic_intensity` - Ratio of FLOPs to L1 bytes (FLOP/byte)
+
 ## CLI Options
 
 ```
 metrix [options] <command>
 
 Options:
-  --profile, -p      Use pre-defined profile (quick, memory)
+  --profile, -p      Use pre-defined profile (quick, memory, compute)
   --metrics, -m      Comma-separated list of metrics
   --time-only        Only collect timing
   --kernel, -k       Filter by kernel name substring
diff --git a/src/metrix/src/metrix/backends/base.py b/src/metrix/src/metrix/backends/base.py
index a935595c..e9495fa4 100644
--- a/src/metrix/src/metrix/backends/base.py
+++ b/src/metrix/src/metrix/backends/base.py
@@ -162,6 +162,10 @@ def _split_counters_into_passes(self, counters: List[str]) -> List[List[str]]:
         Returns:
             List of counter lists, one per profiling pass
         """
+        # Handle empty counters (timing-only mode) - return single pass with no counters
+        if not counters:
+            return [[]]
+            
         counter_groups = self._get_counter_groups()
         max_per_pass = 14  # Conservative limit for most AMD GPUs
 
diff --git a/src/metrix/src/metrix/backends/gfx1201.py b/src/metrix/src/metrix/backends/gfx1201.py
index 18a4ceee..fae8bb7d 100644
--- a/src/metrix/src/metrix/backends/gfx1201.py
+++ b/src/metrix/src/metrix/backends/gfx1201.py
@@ -83,6 +83,24 @@ def _bytes_transferred_hbm(self, GRBM_GUI_ACTIVE):
         """
         return 0.0
 
+    @metric("memory.bytes_transferred_l2")
+    def _bytes_transferred_l2(self):
+        """
+        Total bytes transferred through L2 cache
+
+        Formula: TCC_REQ_sum * 128 (L2 cache line size is 128 bytes)
+        """
+        return 0.0
+
+    @metric("memory.bytes_transferred_l1")
+    def _bytes_transferred_l1(self):
+        """
+        Total bytes transferred through L1 cache
+
+        Formula: TCP_TOTAL_CACHE_ACCESSES_sum * cache_line_size (architecture-dependent)
+        """
+        return 0.0
+
     # Cache metrics
 
     @metric("memory.l2_hit_rate")
@@ -173,3 +191,49 @@ def _atomic_latency(self):
 
         return 0.0
 
+    # Compute metrics
+
+    @metric("compute.total_flops")
+    def _total_flops(self):
+        """
+        Total floating-point operations performed by the kernel
+
+        Formula: 64 * (FP16 + FP32 + FP64) + 512 * MFMA
+        """
+        return 0.0
+
+    @metric("compute.hbm_gflops")
+    def _hbm_gflops(self):
+        """
+        Compute throughput (GFLOPS) normalized by kernel execution time
+
+        Formula: (total_flops / 1e9) / time_seconds
+        """
+        return 0.0
+
+    @metric("compute.hbm_arithmetic_intensity")
+    def _hbm_arithmetic_intensity(self):
+        """
+        HBM Arithmetic Intensity: ratio of floating-point operations to HBM bytes transferred (FLOP/byte)
+
+        Formula: total_flops / hbm_bytes
+        """
+        return 0.0
+
+    @metric("compute.l2_arithmetic_intensity")
+    def _l2_arithmetic_intensity(self):
+        """
+        L2 Arithmetic Intensity: ratio of floating-point operations to L2 cache bytes accessed (FLOP/byte)
+
+        Formula: total_flops / l2_bytes
+        """
+        return 0.0
+
+    @metric("compute.l1_arithmetic_intensity")
+    def _l1_arithmetic_intensity(self):
+        """
+        L1 Arithmetic Intensity: ratio of floating-point operations to L1 cache bytes accessed (FLOP/byte)
+
+        Formula: total_flops / l1_bytes
+        """
+        return 0.0
diff --git a/src/metrix/src/metrix/backends/gfx942.py b/src/metrix/src/metrix/backends/gfx942.py
index f1e63d3b..9791a707 100644
--- a/src/metrix/src/metrix/backends/gfx942.py
+++ b/src/metrix/src/metrix/backends/gfx942.py
@@ -58,6 +58,7 @@ def _get_counter_groups(self) -> List[List[str]]:
             [
                 "SQ_LDS_BANK_CONFLICT",
                 "TCC_EA0_WRREQ_sum",
+                "TCC_EA0_WRREQ_64B_sum",  
                 "TCC_EA0_ATOMIC_LEVEL_sum",
                 "TCC_EA0_ATOMIC_sum",
                 "GRBM_GUI_ACTIVE",
@@ -68,6 +69,7 @@ def _get_counter_groups(self) -> List[List[str]]:
                 "TCP_TOTAL_ACCESSES_sum",
                 "TCC_HIT_sum",
                 "TCC_MISS_sum",
+                "TCC_REQ_sum",  
             ],
             # Group 3: Memory instructions and read requests (from SQ_INSTS_VMEM)
             # Note: RDREQ can only be collected here, not with atomics!
@@ -76,9 +78,33 @@ def _get_counter_groups(self) -> List[List[str]]:
                 "TCP_TCC_WRITE_REQ_sum",
                 "TCP_TOTAL_CACHE_ACCESSES_sum",
                 "TCC_EA0_RDREQ_sum",
+                "TCC_EA0_RDREQ_32B_sum",
+                "TCC_BUBBLE_sum",
                 "SQ_INSTS_VMEM_RD",
                 "SQ_INSTS_VMEM_WR",
             ],
+            # Group 4: FP16 and FP32 VALU instructions (for FLOPS calculations)
+            [
+                "SQ_INSTS_VALU_ADD_F16",
+                "SQ_INSTS_VALU_MUL_F16",
+                "SQ_INSTS_VALU_TRANS_F16",
+                "SQ_INSTS_VALU_FMA_F16",
+                "SQ_INSTS_VALU_ADD_F32",
+                "SQ_INSTS_VALU_MUL_F32",
+                "SQ_INSTS_VALU_TRANS_F32",
+                "SQ_INSTS_VALU_FMA_F32",
+            ],
+            # Group 5: FP64 VALU instructions and MFMA instructions
+            [
+                "SQ_INSTS_VALU_ADD_F64",
+                "SQ_INSTS_VALU_MUL_F64",
+                "SQ_INSTS_VALU_TRANS_F64",
+                "SQ_INSTS_VALU_FMA_F64",
+                "SQ_INSTS_VALU_MFMA_MOPS_F16",
+                "SQ_INSTS_VALU_MFMA_MOPS_BF16",
+                "SQ_INSTS_VALU_MFMA_MOPS_F32",
+                "SQ_INSTS_VALU_MFMA_MOPS_F64",
+            ],
         ]
 
     def _run_rocprof(self, command: str, counters: List[str],
@@ -90,15 +116,20 @@ def _run_rocprof(self, command: str, counters: List[str],
     # Memory bandwidth metrics
 
     @metric("memory.hbm_read_bandwidth")
-    def _hbm_read_bandwidth(self, TCC_EA0_RDREQ_sum, GRBM_GUI_ACTIVE):
+    def _hbm_read_bandwidth(self, TCC_EA0_RDREQ_sum, TCC_EA0_RDREQ_32B_sum, TCC_BUBBLE_sum, GRBM_GUI_ACTIVE):
         """
         HBM read bandwidth in GB/s
 
-        Formula: (read_requests * 64 bytes) / (active_cycles / clock_freq)
+        Formula: (128B_requests * 128 + 64B_requests * 64 + 32B_requests * 32) / (active_cycles / clock_freq)
 
         Note: TCC_EA0_RDREQ_sum aggregates across all memory controllers on MI300
+              TCC_BUBBLE_sum counts 128B read requests
         """
-        bytes_read = TCC_EA0_RDREQ_sum * 64  # Each request is 64 bytes
+        # Calculate bytes with 32B/64B/128B distinction
+        bytes_read_128B = TCC_BUBBLE_sum * 128
+        bytes_read_64B = (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64
+        bytes_read_32B = TCC_EA0_RDREQ_32B_sum * 32
+        bytes_read = bytes_read_128B + bytes_read_64B + bytes_read_32B
 
         if GRBM_GUI_ACTIVE == 0:
             return 0.0
@@ -107,15 +138,18 @@ def _hbm_read_bandwidth(self, TCC_EA0_RDREQ_sum, GRBM_GUI_ACTIVE):
         return (bytes_read / 1e9) / time_seconds if time_seconds > 0 else 0.0
 
     @metric("memory.hbm_write_bandwidth")
-    def _hbm_write_bandwidth(self, TCC_EA0_WRREQ_sum, GRBM_GUI_ACTIVE):
+    def _hbm_write_bandwidth(self, TCC_EA0_WRREQ_sum, TCC_EA0_WRREQ_64B_sum, GRBM_GUI_ACTIVE):
         """
-        HBM write bandwidth in GB/s
+        HBM write bandwidth in GB/s (with 32B/64B request granularity)
 
-        Formula: (write_requests * 64 bytes) / (active_cycles / clock_freq)
+        Formula: (64B_requests * 64 + 32B_requests * 32) / (active_cycles / clock_freq)
 
         Note: TCC_EA0_WRREQ_sum aggregates across all memory controllers on MI300
         """
-        bytes_written = TCC_EA0_WRREQ_sum * 64  # Each request is 64 bytes
+        # Calculate bytes with 32B/64B distinction
+        bytes_written_64B = TCC_EA0_WRREQ_64B_sum * 64
+        bytes_written_32B = (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32
+        bytes_written = bytes_written_64B + bytes_written_32B
 
         if GRBM_GUI_ACTIVE == 0:
             return 0.0
@@ -124,15 +158,22 @@ def _hbm_write_bandwidth(self, TCC_EA0_WRREQ_sum, GRBM_GUI_ACTIVE):
         return (bytes_written / 1e9) / time_seconds if time_seconds > 0 else 0.0
 
     @metric("memory.hbm_bandwidth_utilization")
-    def _hbm_bandwidth_utilization(self, TCC_EA0_RDREQ_sum, TCC_EA0_WRREQ_sum, GRBM_GUI_ACTIVE):
+    def _hbm_bandwidth_utilization(self, TCC_EA0_RDREQ_sum, TCC_EA0_RDREQ_32B_sum, TCC_BUBBLE_sum,
+                                   TCC_EA0_WRREQ_sum, TCC_EA0_WRREQ_64B_sum, GRBM_GUI_ACTIVE):
         """
         HBM bandwidth utilization as percentage of peak
 
         Formula: (actual_bandwidth / peak_bandwidth) * 100
 
         Note: TCC_EA0_* counters aggregate across all memory controllers on MI300
+              TCC_BUBBLE_sum counts 128B read requests
         """
-        total_bytes = (TCC_EA0_RDREQ_sum + TCC_EA0_WRREQ_sum) * 64
+        # Calculate bytes with 32B/64B/128B distinction
+        bytes_read = (TCC_BUBBLE_sum * 128 + 
+                      (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64 +
+                      TCC_EA0_RDREQ_32B_sum * 32)
+        bytes_written = TCC_EA0_WRREQ_64B_sum * 64 + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32
+        total_bytes = bytes_read + bytes_written
 
         if GRBM_GUI_ACTIVE == 0:
             return 0.0
@@ -143,15 +184,40 @@ def _hbm_bandwidth_utilization(self, TCC_EA0_RDREQ_sum, TCC_EA0_WRREQ_sum, GRBM_
         return (actual_bw_gbs / self.device_specs.hbm_bandwidth_gbs) * 100
 
     @metric("memory.bytes_transferred_hbm")
-    def _bytes_transferred_hbm(self, TCC_EA0_RDREQ_sum, TCC_EA0_WRREQ_sum):
+    def _bytes_transferred_hbm(self, TCC_EA0_RDREQ_sum, TCC_EA0_RDREQ_32B_sum, TCC_BUBBLE_sum,
+                               TCC_EA0_WRREQ_sum, TCC_EA0_WRREQ_64B_sum):
         """
         Total bytes transferred through HBM
 
-        Formula: (read_requests + write_requests) * 64 bytes
+        Formula: (128B_read_requests * 128 + 64B_read_requests * 64 + 32B_read_requests * 32 +
+                  64B_write_requests * 64 + 32B_write_requests * 32)
 
         Note: TCC_EA0_* counters aggregate across all memory controllers on MI300
+              TCC_BUBBLE_sum counts 128B read requests
+        """
+        bytes_read = (TCC_BUBBLE_sum * 128 + 
+                      (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64 +
+                      TCC_EA0_RDREQ_32B_sum * 32)
+        bytes_written = TCC_EA0_WRREQ_64B_sum * 64 + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32
+        return bytes_read + bytes_written
+
+    @metric("memory.bytes_transferred_l2")
+    def _bytes_transferred_l2(self, TCC_REQ_sum):
+        """
+        Total bytes transferred through L2 cache
+
+        Formula: TCC_REQ_sum * 128 (L2 cache line size is 128 bytes)
+        """
+        return TCC_REQ_sum * 128
+
+    @metric("memory.bytes_transferred_l1")
+    def _bytes_transferred_l1(self, TCP_TOTAL_CACHE_ACCESSES_sum):
         """
-        return (TCC_EA0_RDREQ_sum + TCC_EA0_WRREQ_sum) * 64
+        Total bytes transferred through L1 cache
+
+        Formula: TCP_TOTAL_CACHE_ACCESSES_sum * 128 (L1 cache line size is 128 bytes)
+        """
+        return TCP_TOTAL_CACHE_ACCESSES_sum * 128
 
     # Cache metrics
 
@@ -278,3 +344,244 @@ def _atomic_latency(self, TCC_EA0_ATOMIC_LEVEL_sum, TCC_EA0_ATOMIC_sum):
 
         return TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum
 
+    # Compute metrics
+
+    @metric("compute.total_flops")
+    def _total_flops(self,
+                     SQ_INSTS_VALU_ADD_F16, SQ_INSTS_VALU_MUL_F16, SQ_INSTS_VALU_TRANS_F16, SQ_INSTS_VALU_FMA_F16,
+                     SQ_INSTS_VALU_ADD_F32, SQ_INSTS_VALU_MUL_F32, SQ_INSTS_VALU_TRANS_F32, SQ_INSTS_VALU_FMA_F32,
+                     SQ_INSTS_VALU_ADD_F64, SQ_INSTS_VALU_MUL_F64, SQ_INSTS_VALU_TRANS_F64, SQ_INSTS_VALU_FMA_F64,
+                     SQ_INSTS_VALU_MFMA_MOPS_F16, SQ_INSTS_VALU_MFMA_MOPS_BF16,
+                     SQ_INSTS_VALU_MFMA_MOPS_F32, SQ_INSTS_VALU_MFMA_MOPS_F64):
+        """
+        Total floating-point operations performed by the kernel
+
+        Formula: 64 * (FP16 + FP32 + FP64) + 512 * MFMA
+        - 64 operations per wave (wavefront size = 64)
+        - FMA counts as 2 operations (multiply + add)
+        - MFMA instructions produce 512 operations per instruction
+        """
+        fops = 64 * (
+            (
+                SQ_INSTS_VALU_ADD_F16 +
+                SQ_INSTS_VALU_MUL_F16 +
+                SQ_INSTS_VALU_TRANS_F16 +
+                SQ_INSTS_VALU_FMA_F16 * 2
+            ) +
+            (
+                SQ_INSTS_VALU_ADD_F32 +
+                SQ_INSTS_VALU_MUL_F32 +
+                SQ_INSTS_VALU_TRANS_F32 +
+                SQ_INSTS_VALU_FMA_F32 * 2
+            ) +
+            (
+                SQ_INSTS_VALU_ADD_F64 +
+                SQ_INSTS_VALU_MUL_F64 +
+                SQ_INSTS_VALU_TRANS_F64 +
+                SQ_INSTS_VALU_FMA_F64 * 2
+            )
+        ) + 512 * (
+            SQ_INSTS_VALU_MFMA_MOPS_F16 +
+            SQ_INSTS_VALU_MFMA_MOPS_BF16 +
+            SQ_INSTS_VALU_MFMA_MOPS_F32 +
+            SQ_INSTS_VALU_MFMA_MOPS_F64
+        )
+
+        return fops
+
+    @metric("compute.hbm_gflops")
+    def _hbm_gflops(self,
+                    SQ_INSTS_VALU_ADD_F16, SQ_INSTS_VALU_MUL_F16, SQ_INSTS_VALU_TRANS_F16, SQ_INSTS_VALU_FMA_F16,
+                    SQ_INSTS_VALU_ADD_F32, SQ_INSTS_VALU_MUL_F32, SQ_INSTS_VALU_TRANS_F32, SQ_INSTS_VALU_FMA_F32,
+                    SQ_INSTS_VALU_ADD_F64, SQ_INSTS_VALU_MUL_F64, SQ_INSTS_VALU_TRANS_F64, SQ_INSTS_VALU_FMA_F64,
+                    SQ_INSTS_VALU_MFMA_MOPS_F16, SQ_INSTS_VALU_MFMA_MOPS_BF16,
+                    SQ_INSTS_VALU_MFMA_MOPS_F32, SQ_INSTS_VALU_MFMA_MOPS_F64,
+                    GRBM_GUI_ACTIVE):
+        """
+        Compute throughput (GFLOPS) normalized by kernel execution time
+
+        Formula: (total_flops / 1e9) / time_seconds
+        """
+        # Calculate total FLOPS (same as compute.total_flops)
+        fops = 64 * (
+            (
+                SQ_INSTS_VALU_ADD_F16 +
+                SQ_INSTS_VALU_MUL_F16 +
+                SQ_INSTS_VALU_TRANS_F16 +
+                SQ_INSTS_VALU_FMA_F16 * 2
+            ) +
+            (
+                SQ_INSTS_VALU_ADD_F32 +
+                SQ_INSTS_VALU_MUL_F32 +
+                SQ_INSTS_VALU_TRANS_F32 +
+                SQ_INSTS_VALU_FMA_F32 * 2
+            ) +
+            (
+                SQ_INSTS_VALU_ADD_F64 +
+                SQ_INSTS_VALU_MUL_F64 +
+                SQ_INSTS_VALU_TRANS_F64 +
+                SQ_INSTS_VALU_FMA_F64 * 2
+            )
+        ) + 512 * (
+            SQ_INSTS_VALU_MFMA_MOPS_F16 +
+            SQ_INSTS_VALU_MFMA_MOPS_BF16 +
+            SQ_INSTS_VALU_MFMA_MOPS_F32 +
+            SQ_INSTS_VALU_MFMA_MOPS_F64
+        )
+
+        if GRBM_GUI_ACTIVE == 0:
+            return 0.0
+
+        time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6)
+        gflops = (fops / 1e9) / time_seconds if time_seconds > 0 else 0.0
+
+        return gflops
+
+    @metric("compute.hbm_arithmetic_intensity")
+    def _hbm_arithmetic_intensity(self,
+                                   SQ_INSTS_VALU_ADD_F16, SQ_INSTS_VALU_MUL_F16, SQ_INSTS_VALU_TRANS_F16, SQ_INSTS_VALU_FMA_F16,
+                                   SQ_INSTS_VALU_ADD_F32, SQ_INSTS_VALU_MUL_F32, SQ_INSTS_VALU_TRANS_F32, SQ_INSTS_VALU_FMA_F32,
+                                   SQ_INSTS_VALU_ADD_F64, SQ_INSTS_VALU_MUL_F64, SQ_INSTS_VALU_TRANS_F64, SQ_INSTS_VALU_FMA_F64,
+                                   SQ_INSTS_VALU_MFMA_MOPS_F16, SQ_INSTS_VALU_MFMA_MOPS_BF16,
+                                   SQ_INSTS_VALU_MFMA_MOPS_F32, SQ_INSTS_VALU_MFMA_MOPS_F64,
+                                   TCC_EA0_RDREQ_sum, TCC_EA0_RDREQ_32B_sum, TCC_BUBBLE_sum,
+                                   TCC_EA0_WRREQ_sum, TCC_EA0_WRREQ_64B_sum):
+        """
+        HBM Arithmetic Intensity: ratio of floating-point operations to HBM bytes transferred (FLOP/byte)
+
+        Formula: total_flops / hbm_bytes
+        """
+        # Calculate total FLOPS
+        fops = 64 * (
+            (
+                SQ_INSTS_VALU_ADD_F16 +
+                SQ_INSTS_VALU_MUL_F16 +
+                SQ_INSTS_VALU_TRANS_F16 +
+                SQ_INSTS_VALU_FMA_F16 * 2
+            ) +
+            (
+                SQ_INSTS_VALU_ADD_F32 +
+                SQ_INSTS_VALU_MUL_F32 +
+                SQ_INSTS_VALU_TRANS_F32 +
+                SQ_INSTS_VALU_FMA_F32 * 2
+            ) +
+            (
+                SQ_INSTS_VALU_ADD_F64 +
+                SQ_INSTS_VALU_MUL_F64 +
+                SQ_INSTS_VALU_TRANS_F64 +
+                SQ_INSTS_VALU_FMA_F64 * 2
+            )
+        ) + 512 * (
+            SQ_INSTS_VALU_MFMA_MOPS_F16 +
+            SQ_INSTS_VALU_MFMA_MOPS_BF16 +
+            SQ_INSTS_VALU_MFMA_MOPS_F32 +
+            SQ_INSTS_VALU_MFMA_MOPS_F64
+        )
+
+        # Calculate HBM bytes (with 32B/64B/128B distinction)
+        hbm_rd = (TCC_BUBBLE_sum * 128 + 
+                  (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64 +
+                  TCC_EA0_RDREQ_32B_sum * 32)
+        hbm_wr = TCC_EA0_WRREQ_64B_sum * 64 + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32
+        hbm_bytes = hbm_rd + hbm_wr
+
+        # Arithmetic intensity = FLOP / byte
+        ai_hbm = fops / hbm_bytes if hbm_bytes > 0 else 0.0
+
+        return ai_hbm
+
+    @metric("compute.l2_arithmetic_intensity")
+    def _l2_arithmetic_intensity(self,
+                                  SQ_INSTS_VALU_ADD_F16, SQ_INSTS_VALU_MUL_F16, SQ_INSTS_VALU_TRANS_F16, SQ_INSTS_VALU_FMA_F16,
+                                  SQ_INSTS_VALU_ADD_F32, SQ_INSTS_VALU_MUL_F32, SQ_INSTS_VALU_TRANS_F32, SQ_INSTS_VALU_FMA_F32,
+                                  SQ_INSTS_VALU_ADD_F64, SQ_INSTS_VALU_MUL_F64, SQ_INSTS_VALU_TRANS_F64, SQ_INSTS_VALU_FMA_F64,
+                                  SQ_INSTS_VALU_MFMA_MOPS_F16, SQ_INSTS_VALU_MFMA_MOPS_BF16,
+                                  SQ_INSTS_VALU_MFMA_MOPS_F32, SQ_INSTS_VALU_MFMA_MOPS_F64,
+                                  TCC_REQ_sum):
+        """
+        L2 Arithmetic Intensity: ratio of floating-point operations to L2 cache bytes accessed (FLOP/byte)
+
+        Formula: total_flops / l2_bytes
+        """
+        # Calculate total FLOPS
+        fops = 64 * (
+            (
+                SQ_INSTS_VALU_ADD_F16 +
+                SQ_INSTS_VALU_MUL_F16 +
+                SQ_INSTS_VALU_TRANS_F16 +
+                SQ_INSTS_VALU_FMA_F16 * 2
+            ) +
+            (
+                SQ_INSTS_VALU_ADD_F32 +
+                SQ_INSTS_VALU_MUL_F32 +
+                SQ_INSTS_VALU_TRANS_F32 +
+                SQ_INSTS_VALU_FMA_F32 * 2
+            ) +
+            (
+                SQ_INSTS_VALU_ADD_F64 +
+                SQ_INSTS_VALU_MUL_F64 +
+                SQ_INSTS_VALU_TRANS_F64 +
+                SQ_INSTS_VALU_FMA_F64 * 2
+            )
+        ) + 512 * (
+            SQ_INSTS_VALU_MFMA_MOPS_F16 +
+            SQ_INSTS_VALU_MFMA_MOPS_BF16 +
+            SQ_INSTS_VALU_MFMA_MOPS_F32 +
+            SQ_INSTS_VALU_MFMA_MOPS_F64
+        )
+
+        # Calculate L2 bytes (L2 cache line is 128 bytes)
+        l2_bytes = TCC_REQ_sum * 128
+
+        # Arithmetic intensity = FLOP / byte
+        ai_l2 = fops / l2_bytes if l2_bytes > 0 else 0.0
+
+        return ai_l2
+
+    @metric("compute.l1_arithmetic_intensity")
+    def _l1_arithmetic_intensity(self,
+                                  SQ_INSTS_VALU_ADD_F16, SQ_INSTS_VALU_MUL_F16, SQ_INSTS_VALU_TRANS_F16, SQ_INSTS_VALU_FMA_F16,
+                                  SQ_INSTS_VALU_ADD_F32, SQ_INSTS_VALU_MUL_F32, SQ_INSTS_VALU_TRANS_F32, SQ_INSTS_VALU_FMA_F32,
+                                  SQ_INSTS_VALU_ADD_F64, SQ_INSTS_VALU_MUL_F64, SQ_INSTS_VALU_TRANS_F64, SQ_INSTS_VALU_FMA_F64,
+                                  SQ_INSTS_VALU_MFMA_MOPS_F16, SQ_INSTS_VALU_MFMA_MOPS_BF16,
+                                  SQ_INSTS_VALU_MFMA_MOPS_F32, SQ_INSTS_VALU_MFMA_MOPS_F64,
+                                  TCP_TOTAL_CACHE_ACCESSES_sum):
+        """
+        L1 Arithmetic Intensity: ratio of floating-point operations to L1 cache bytes accessed (FLOP/byte)
+
+        Formula: total_flops / l1_bytes
+        """
+        # Calculate total FLOPS
+        fops = 64 * (
+            (
+                SQ_INSTS_VALU_ADD_F16 +
+                SQ_INSTS_VALU_MUL_F16 +
+                SQ_INSTS_VALU_TRANS_F16 +
+                SQ_INSTS_VALU_FMA_F16 * 2
+            ) +
+            (
+                SQ_INSTS_VALU_ADD_F32 +
+                SQ_INSTS_VALU_MUL_F32 +
+                SQ_INSTS_VALU_TRANS_F32 +
+                SQ_INSTS_VALU_FMA_F32 * 2
+            ) +
+            (
+                SQ_INSTS_VALU_ADD_F64 +
+                SQ_INSTS_VALU_MUL_F64 +
+                SQ_INSTS_VALU_TRANS_F64 +
+                SQ_INSTS_VALU_FMA_F64 * 2
+            )
+        ) + 512 * (
+            SQ_INSTS_VALU_MFMA_MOPS_F16 +
+            SQ_INSTS_VALU_MFMA_MOPS_BF16 +
+            SQ_INSTS_VALU_MFMA_MOPS_F32 +
+            SQ_INSTS_VALU_MFMA_MOPS_F64
+        )
+
+        # Calculate L1 bytes (L1 cache line is 128 bytes on gfx942)
+        l1_bytes = TCP_TOTAL_CACHE_ACCESSES_sum * 128
+
+        # Arithmetic intensity = FLOP / byte
+        ai_l1 = fops / l1_bytes if l1_bytes > 0 else 0.0
+
+        return ai_l1
\ No newline at end of file
diff --git a/src/metrix/src/metrix/cli/info_cmd.py b/src/metrix/src/metrix/cli/info_cmd.py
index 7c1b5944..55627a96 100644
--- a/src/metrix/src/metrix/cli/info_cmd.py
+++ b/src/metrix/src/metrix/cli/info_cmd.py
@@ -3,13 +3,16 @@
 """
 
 from ..metrics import METRIC_CATALOG, METRIC_PROFILES
+from ..backends import get_backend, detect_or_default
 
 
 def info_command(args):
     """Execute info command"""
 
     if args.info_type == "metric":
-        show_metric_info(args.name)
+        # Get architecture from args if available, otherwise auto-detect
+        arch = getattr(args, 'arch', None) or detect_or_default()
+        show_metric_info(args.name, arch)
     elif args.info_type == "profile":
         show_profile_info(args.name)
     elif args.info_type == "counter":
@@ -18,7 +21,7 @@ def info_command(args):
     return 0
 
 
-def show_metric_info(metric_name):
+def show_metric_info(metric_name, arch="gfx942"):
     """Show detailed metric information"""
 
     if metric_name not in METRIC_CATALOG:
@@ -37,9 +40,21 @@ def show_metric_info(metric_name):
     print(f"Unit:        {metric_def['unit']}")
     print(f"Category:    {metric_def['category'].value}")
 
-    print(f"\nRequired Hardware Counters:")
-    for counter in metric_def['derived_from']:
-        print(f"  • {counter}")
+    # Show actual hardware counters from the backend (architecture-specific)
+    print(f"\nRequired Hardware Counters ({arch}):")
+    try:
+        backend = get_backend(arch)
+        actual_counters = backend.get_metric_counters(metric_name)
+        for counter in actual_counters:
+            print(f"  • {counter}")
+    except ValueError as e:
+        # Metric not implemented in this backend
+        print(f"  ⚠️  Metric not implemented for {arch}")
+        # Fall back to catalog's derived_from as documentation
+        if 'derived_from' in metric_def:
+            print(f"\n  Conceptual counters (from catalog):")
+            for counter in metric_def['derived_from']:
+                print(f"    • {counter}")
 
     if 'interpretation' in metric_def:
         print(f"\nInterpretation Guide:")
diff --git a/src/metrix/src/metrix/metrics/catalog.py b/src/metrix/src/metrix/metrics/catalog.py
index 61dcb0c7..ed0ec5d6 100644
--- a/src/metrix/src/metrix/metrics/catalog.py
+++ b/src/metrix/src/metrix/metrics/catalog.py
@@ -3,6 +3,7 @@
 """
 
 from .memory_metrics import MEMORY_METRICS
+from .compute_metrics import COMPUTE_METRICS
 
 # ═══════════════════════════════════════════════════════════════════
 # COMPLETE METRIC CATALOG
@@ -10,7 +11,8 @@
 
 METRIC_CATALOG = {
     **MEMORY_METRICS,
-    # Will add compute, occupancy, bottleneck metrics later
+    **COMPUTE_METRICS,
+    # Will add occupancy, bottleneck metrics later
 }
 
 # ═══════════════════════════════════════════════════════════════════
@@ -80,6 +82,23 @@
             "memory.coalescing_efficiency",
         ],
         "estimated_passes": 1
+    },
+
+    "compute": {
+        "description": "Compute and arithmetic intensity analysis",
+        "metrics": [
+            "compute.total_flops",
+            "compute.hbm_gflops",
+            "compute.hbm_arithmetic_intensity",
+            "compute.l2_arithmetic_intensity",
+            "compute.l1_arithmetic_intensity",
+        ],
+        "estimated_passes": 3,
+        "focus": "compute_performance",
+        "typical_bottlenecks": [
+            "low_arithmetic_intensity",
+            "memory_bound_kernel"
+        ]
     }
 }
 
diff --git a/src/metrix/src/metrix/metrics/compute_metrics.py b/src/metrix/src/metrix/metrics/compute_metrics.py
new file mode 100644
index 00000000..26942ec1
--- /dev/null
+++ b/src/metrix/src/metrix/metrics/compute_metrics.py
@@ -0,0 +1,320 @@
+"""
+Compute-focused metric definitions (FLOPS, Arithmetic Intensity)
+Based on Omnipilot's calculate_hbm_arithmetic_intensity() implementation
+
+NOTE: The `derived_from` field contains CONCEPTUAL counter names for documentation.
+Actual hardware counter names vary by architecture (e.g., TCC_EA_* vs TCC_EA0_*).
+For architecture-specific counter names, see the backend implementations in
+metrix/backends/gfx942.py, gfx1201.py, etc.
+"""
+
+from .categories import MetricCategory
+
+# ═══════════════════════════════════════════════════════════════════
+# COMPUTE THROUGHPUT METRICS
+# ═══════════════════════════════════════════════════════════════════
+
+COMPUTE_THROUGHPUT_METRICS = {
+    "compute.total_flops": {
+        "name": "Total FLOPS",
+        "description": "Total floating-point operations performed by the kernel",
+        "unit": "FLOPS",
+        "category": MetricCategory.COMPUTE,
+        # NOTE: Only SQ_INSTS_VALU_* instruction counters feed this metric; no
+        # HBM (TCC_EA*) counters are involved. The names listed are conceptual --
+        # see the module docstring for where architecture-specific names live.
+        "derived_from": [
+            # FP16 instructions
+            "SQ_INSTS_VALU_ADD_F16",
+            "SQ_INSTS_VALU_MUL_F16",
+            "SQ_INSTS_VALU_TRANS_F16",
+            "SQ_INSTS_VALU_FMA_F16",
+            # FP32 instructions
+            "SQ_INSTS_VALU_ADD_F32",
+            "SQ_INSTS_VALU_MUL_F32",
+            "SQ_INSTS_VALU_TRANS_F32",
+            "SQ_INSTS_VALU_FMA_F32",
+            # FP64 instructions
+            "SQ_INSTS_VALU_ADD_F64",
+            "SQ_INSTS_VALU_MUL_F64",
+            "SQ_INSTS_VALU_TRANS_F64",
+            "SQ_INSTS_VALU_FMA_F64",
+            # MFMA instructions (Matrix FMA)
+            "SQ_INSTS_VALU_MFMA_MOPS_F16",
+            "SQ_INSTS_VALU_MFMA_MOPS_BF16",
+            "SQ_INSTS_VALU_MFMA_MOPS_F32",
+            "SQ_INSTS_VALU_MFMA_MOPS_F64",
+        ],
+        "formula": """
+            # 64 operations per wave (wavefront size = 64)
+            # FMA counts as 2 operations (multiply + add)
+            # MFMA instructions produce 512 operations per instruction
+            
+            fops = 64 * (
+                (
+                    SQ_INSTS_VALU_ADD_F16 +
+                    SQ_INSTS_VALU_MUL_F16 +
+                    SQ_INSTS_VALU_TRANS_F16 +
+                    SQ_INSTS_VALU_FMA_F16 * 2
+                ) +
+                (
+                    SQ_INSTS_VALU_ADD_F32 +
+                    SQ_INSTS_VALU_MUL_F32 +
+                    SQ_INSTS_VALU_TRANS_F32 +
+                    SQ_INSTS_VALU_FMA_F32 * 2
+                ) +
+                (
+                    SQ_INSTS_VALU_ADD_F64 +
+                    SQ_INSTS_VALU_MUL_F64 +
+                    SQ_INSTS_VALU_TRANS_F64 +
+                    SQ_INSTS_VALU_FMA_F64 * 2
+                )
+            ) + 512 * (
+                SQ_INSTS_VALU_MFMA_MOPS_F16 +
+                SQ_INSTS_VALU_MFMA_MOPS_BF16 +
+                SQ_INSTS_VALU_MFMA_MOPS_F32 +
+                SQ_INSTS_VALU_MFMA_MOPS_F64
+            )
+            
+            return fops
+        """
+    },
+
+    "compute.hbm_gflops": {
+        "name": "HBM Compute Throughput",
+        "description": "Compute throughput (GFLOPS) normalized by kernel execution time",
+        "unit": "GFLOPS",
+        "category": MetricCategory.COMPUTE,
+        "derived_from": [
+            # All FLOPS counters
+            "SQ_INSTS_VALU_ADD_F16", "SQ_INSTS_VALU_MUL_F16", "SQ_INSTS_VALU_TRANS_F16", "SQ_INSTS_VALU_FMA_F16",
+            "SQ_INSTS_VALU_ADD_F32", "SQ_INSTS_VALU_MUL_F32", "SQ_INSTS_VALU_TRANS_F32", "SQ_INSTS_VALU_FMA_F32",
+            "SQ_INSTS_VALU_ADD_F64", "SQ_INSTS_VALU_MUL_F64", "SQ_INSTS_VALU_TRANS_F64", "SQ_INSTS_VALU_FMA_F64",
+            "SQ_INSTS_VALU_MFMA_MOPS_F16", "SQ_INSTS_VALU_MFMA_MOPS_BF16", 
+            "SQ_INSTS_VALU_MFMA_MOPS_F32", "SQ_INSTS_VALU_MFMA_MOPS_F64",
+            "GRBM_GUI_ACTIVE"
+        ],
+        "formula": """
+            # Calculate total FLOPS (same as compute.total_flops)
+            fops = 64 * (
+                (
+                    SQ_INSTS_VALU_ADD_F16 +
+                    SQ_INSTS_VALU_MUL_F16 +
+                    SQ_INSTS_VALU_TRANS_F16 +
+                    SQ_INSTS_VALU_FMA_F16 * 2
+                ) +
+                (
+                    SQ_INSTS_VALU_ADD_F32 +
+                    SQ_INSTS_VALU_MUL_F32 +
+                    SQ_INSTS_VALU_TRANS_F32 +
+                    SQ_INSTS_VALU_FMA_F32 * 2
+                ) +
+                (
+                    SQ_INSTS_VALU_ADD_F64 +
+                    SQ_INSTS_VALU_MUL_F64 +
+                    SQ_INSTS_VALU_TRANS_F64 +
+                    SQ_INSTS_VALU_FMA_F64 * 2
+                )
+            ) + 512 * (
+                SQ_INSTS_VALU_MFMA_MOPS_F16 +
+                SQ_INSTS_VALU_MFMA_MOPS_BF16 +
+                SQ_INSTS_VALU_MFMA_MOPS_F32 +
+                SQ_INSTS_VALU_MFMA_MOPS_F64
+            )
+            
+            # Convert to GFLOPS
+            time_seconds = GRBM_GUI_ACTIVE / (gpu_freq_mhz * 1e6)
+            gflops = (fops / 1e9) / time_seconds if time_seconds > 0 else 0
+            
+            return gflops
+        """,
+        "device_specific": True
+    }
+}
+
+# ═══════════════════════════════════════════════════════════════════
+# ARITHMETIC INTENSITY METRICS
+# ═══════════════════════════════════════════════════════════════════
+
+ARITHMETIC_INTENSITY_METRICS = {
+    "compute.hbm_arithmetic_intensity": {
+        "name": "HBM Arithmetic Intensity",
+        "description": "Ratio of floating-point operations to HBM bytes transferred (FLOP/byte)",
+        "unit": "FLOP/byte",
+        "category": MetricCategory.COMPUTE,
+        "derived_from": [
+            # FLOPS counters (same across architectures)
+            "SQ_INSTS_VALU_ADD_F16", "SQ_INSTS_VALU_MUL_F16", "SQ_INSTS_VALU_TRANS_F16", "SQ_INSTS_VALU_FMA_F16",
+            "SQ_INSTS_VALU_ADD_F32", "SQ_INSTS_VALU_MUL_F32", "SQ_INSTS_VALU_TRANS_F32", "SQ_INSTS_VALU_FMA_F32",
+            "SQ_INSTS_VALU_ADD_F64", "SQ_INSTS_VALU_MUL_F64", "SQ_INSTS_VALU_TRANS_F64", "SQ_INSTS_VALU_FMA_F64",
+            "SQ_INSTS_VALU_MFMA_MOPS_F16", "SQ_INSTS_VALU_MFMA_MOPS_BF16", 
+            "SQ_INSTS_VALU_MFMA_MOPS_F32", "SQ_INSTS_VALU_MFMA_MOPS_F64",
+            # HBM bandwidth counters - conceptual names (actual names vary by arch)
+            "TCC_EA_RDREQ_32B_sum", "TCC_EA_RDREQ_sum", "TCC_BUBBLE_sum",
+            "TCC_EA_WRREQ_64B_sum", "TCC_EA_WRREQ_sum",
+        ],
+        "formula": """
+            # Calculate total FLOPS
+            fops = 64 * (
+                (
+                    SQ_INSTS_VALU_ADD_F16 +
+                    SQ_INSTS_VALU_MUL_F16 +
+                    SQ_INSTS_VALU_TRANS_F16 +
+                    SQ_INSTS_VALU_FMA_F16 * 2
+                ) +
+                (
+                    SQ_INSTS_VALU_ADD_F32 +
+                    SQ_INSTS_VALU_MUL_F32 +
+                    SQ_INSTS_VALU_TRANS_F32 +
+                    SQ_INSTS_VALU_FMA_F32 * 2
+                ) +
+                (
+                    SQ_INSTS_VALU_ADD_F64 +
+                    SQ_INSTS_VALU_MUL_F64 +
+                    SQ_INSTS_VALU_TRANS_F64 +
+                    SQ_INSTS_VALU_FMA_F64 * 2
+                )
+            ) + 512 * (
+                SQ_INSTS_VALU_MFMA_MOPS_F16 +
+                SQ_INSTS_VALU_MFMA_MOPS_BF16 +
+                SQ_INSTS_VALU_MFMA_MOPS_F32 +
+                SQ_INSTS_VALU_MFMA_MOPS_F64
+            )
+            
+            # Calculate HBM bytes (with 32B/64B/128B distinction)
+            # Note: TCC_BUBBLE_sum counts 128B read requests on MI300
+            hbm_rd = (TCC_BUBBLE_sum * 128 + 
+                     (TCC_EA_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA_RDREQ_32B_sum) * 64 +
+                     TCC_EA_RDREQ_32B_sum * 32)
+            hbm_wr = (TCC_EA_WRREQ_64B_sum * 64 + 
+                     (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)
+            hbm_bytes = hbm_rd + hbm_wr
+            
+            # Arithmetic intensity = FLOP / byte
+            ai_hbm = fops / hbm_bytes if hbm_bytes > 0 else 0
+            
+            return ai_hbm
+        """,
+        "interpretation": {
+            "excellent": (10, float('inf'), "Compute bound - excellent FLOP/byte ratio"),
+            "good": (5, 10, "Good balance between compute and memory"),
+            "fair": (1, 5, "Memory bound - moderate FLOP/byte ratio"),
+            "poor": (0, 1, "Heavily memory bound - low FLOP/byte ratio")
+        }
+    },
+
+    "compute.l2_arithmetic_intensity": {
+        "name": "L2 Arithmetic Intensity",
+        "description": "Ratio of floating-point operations to L2 cache bytes accessed (FLOP/byte)",
+        "unit": "FLOP/byte",
+        "category": MetricCategory.COMPUTE,
+        "derived_from": [
+            # FLOPS counters
+            "SQ_INSTS_VALU_ADD_F16", "SQ_INSTS_VALU_MUL_F16", "SQ_INSTS_VALU_TRANS_F16", "SQ_INSTS_VALU_FMA_F16",
+            "SQ_INSTS_VALU_ADD_F32", "SQ_INSTS_VALU_MUL_F32", "SQ_INSTS_VALU_TRANS_F32", "SQ_INSTS_VALU_FMA_F32",
+            "SQ_INSTS_VALU_ADD_F64", "SQ_INSTS_VALU_MUL_F64", "SQ_INSTS_VALU_TRANS_F64", "SQ_INSTS_VALU_FMA_F64",
+            "SQ_INSTS_VALU_MFMA_MOPS_F16", "SQ_INSTS_VALU_MFMA_MOPS_BF16", 
+            "SQ_INSTS_VALU_MFMA_MOPS_F32", "SQ_INSTS_VALU_MFMA_MOPS_F64",
+            # L2 cache counters
+            "TCC_REQ_sum",
+        ],
+        "formula": """
+            # Calculate total FLOPS
+            fops = 64 * (
+                (
+                    SQ_INSTS_VALU_ADD_F16 +
+                    SQ_INSTS_VALU_MUL_F16 +
+                    SQ_INSTS_VALU_TRANS_F16 +
+                    SQ_INSTS_VALU_FMA_F16 * 2
+                ) +
+                (
+                    SQ_INSTS_VALU_ADD_F32 +
+                    SQ_INSTS_VALU_MUL_F32 +
+                    SQ_INSTS_VALU_TRANS_F32 +
+                    SQ_INSTS_VALU_FMA_F32 * 2
+                ) +
+                (
+                    SQ_INSTS_VALU_ADD_F64 +
+                    SQ_INSTS_VALU_MUL_F64 +
+                    SQ_INSTS_VALU_TRANS_F64 +
+                    SQ_INSTS_VALU_FMA_F64 * 2
+                )
+            ) + 512 * (
+                SQ_INSTS_VALU_MFMA_MOPS_F16 +
+                SQ_INSTS_VALU_MFMA_MOPS_BF16 +
+                SQ_INSTS_VALU_MFMA_MOPS_F32 +
+                SQ_INSTS_VALU_MFMA_MOPS_F64
+            )
+            
+            # Calculate L2 bytes (L2 cache line is 128 bytes)
+            l2_bytes = TCC_REQ_sum * 128
+            
+            # Arithmetic intensity = FLOP / byte
+            ai_l2 = fops / l2_bytes if l2_bytes > 0 else 0
+            
+            return ai_l2
+        """
+    },
+
+    "compute.l1_arithmetic_intensity": {
+        "name": "L1 Arithmetic Intensity",
+        "description": "Ratio of floating-point operations to L1 cache bytes accessed (FLOP/byte)",
+        "unit": "FLOP/byte",
+        "category": MetricCategory.COMPUTE,
+        "derived_from": [
+            # FLOPS counters
+            "SQ_INSTS_VALU_ADD_F16", "SQ_INSTS_VALU_MUL_F16", "SQ_INSTS_VALU_TRANS_F16", "SQ_INSTS_VALU_FMA_F16",
+            "SQ_INSTS_VALU_ADD_F32", "SQ_INSTS_VALU_MUL_F32", "SQ_INSTS_VALU_TRANS_F32", "SQ_INSTS_VALU_FMA_F32",
+            "SQ_INSTS_VALU_ADD_F64", "SQ_INSTS_VALU_MUL_F64", "SQ_INSTS_VALU_TRANS_F64", "SQ_INSTS_VALU_FMA_F64",
+            "SQ_INSTS_VALU_MFMA_MOPS_F16", "SQ_INSTS_VALU_MFMA_MOPS_BF16", 
+            "SQ_INSTS_VALU_MFMA_MOPS_F32", "SQ_INSTS_VALU_MFMA_MOPS_F64",
+            # L1 cache counters (NOTE(review): 64 B/access here vs 128 B in memory.bytes_transferred_l1 -- confirm)
+            "TCP_TOTAL_CACHE_ACCESSES_sum",
+        ],
+        "formula": """
+            # Calculate total FLOPS
+            fops = 64 * (
+                (
+                    SQ_INSTS_VALU_ADD_F16 +
+                    SQ_INSTS_VALU_MUL_F16 +
+                    SQ_INSTS_VALU_TRANS_F16 +
+                    SQ_INSTS_VALU_FMA_F16 * 2
+                ) +
+                (
+                    SQ_INSTS_VALU_ADD_F32 +
+                    SQ_INSTS_VALU_MUL_F32 +
+                    SQ_INSTS_VALU_TRANS_F32 +
+                    SQ_INSTS_VALU_FMA_F32 * 2
+                ) +
+                (
+                    SQ_INSTS_VALU_ADD_F64 +
+                    SQ_INSTS_VALU_MUL_F64 +
+                    SQ_INSTS_VALU_TRANS_F64 +
+                    SQ_INSTS_VALU_FMA_F64 * 2
+                )
+            ) + 512 * (
+                SQ_INSTS_VALU_MFMA_MOPS_F16 +
+                SQ_INSTS_VALU_MFMA_MOPS_BF16 +
+                SQ_INSTS_VALU_MFMA_MOPS_F32 +
+                SQ_INSTS_VALU_MFMA_MOPS_F64
+            )
+            
+            # Calculate L1 bytes (L1 cache line is 64 bytes)
+            l1_bytes = TCP_TOTAL_CACHE_ACCESSES_sum * 64
+            
+            # Arithmetic intensity = FLOP / byte
+            ai_l1 = fops / l1_bytes if l1_bytes > 0 else 0
+            
+            return ai_l1
+        """
+    }
+}
+
+# ═══════════════════════════════════════════════════════════════════
+# COMBINED COMPUTE METRIC CATALOG
+# ═══════════════════════════════════════════════════════════════════
+
+COMPUTE_METRICS = {
+    **COMPUTE_THROUGHPUT_METRICS,
+    **ARITHMETIC_INTENSITY_METRICS
+}
diff --git a/src/metrix/src/metrix/metrics/memory_metrics.py b/src/metrix/src/metrix/metrics/memory_metrics.py
index 0b3178cc..890d9008 100644
--- a/src/metrix/src/metrix/metrics/memory_metrics.py
+++ b/src/metrix/src/metrix/metrics/memory_metrics.py
@@ -1,6 +1,11 @@
 """
 Memory-focused metric definitions
 Top-down approach: Define what we want to know, not how to measure it
+
+NOTE: The `derived_from` field contains CONCEPTUAL counter names for documentation.
+Actual hardware counter names vary by architecture (e.g., TCC_EA_* vs TCC_EA0_*).
+For architecture-specific counter names, see the backend implementations in
+metrix/backends/gfx942.py, gfx1201.py, etc.
 """
 
 from .categories import MetricCategory
@@ -15,6 +20,9 @@
         "description": "Percentage of peak HBM (High Bandwidth Memory) bandwidth utilized",
         "unit": "percent",
         "category": MetricCategory.MEMORY_BANDWIDTH,
+        # NOTE: These are conceptual counter names. Actual names vary by architecture:
+        # - MI300 (gfx942): TCC_EA0_RDREQ_sum, TCC_EA0_WRREQ_sum
+        # - MI200 (gfx90a): TCC_EA_RDREQ_sum, TCC_EA_WRREQ_sum
         "derived_from": [
             "TCC_EA_RDREQ_sum",      # Read requests to memory controller
             "TCC_EA_WRREQ_sum",      # Write requests to memory controller
@@ -84,6 +92,30 @@
         "formula": """
             return (TCC_EA_RDREQ_sum + TCC_EA_WRREQ_sum) * 64
         """
+    },
+
+    "memory.bytes_transferred_l2": {
+        "name": "Total L2 Bytes Transferred",
+        "description": "Total bytes accessed through L2 cache",
+        "unit": "bytes",
+        "category": MetricCategory.MEMORY_BANDWIDTH,
+        "derived_from": ["TCC_REQ_sum"],
+        "formula": """
+            # L2 cache line is 128 bytes
+            return TCC_REQ_sum * 128
+        """
+    },
+
+    "memory.bytes_transferred_l1": {
+        "name": "Total L1 Bytes Transferred",
+        "description": "Total bytes accessed through L1 cache",
+        "unit": "bytes",
+        "category": MetricCategory.MEMORY_BANDWIDTH,
+        "derived_from": ["TCP_TOTAL_CACHE_ACCESSES_sum"],
+        "formula": """
+            # L1 cache line is 128 bytes on gfx942
+            return TCP_TOTAL_CACHE_ACCESSES_sum * 128
+        """
     }
 }
 
diff --git a/src/metrix/tests/integration/test_all_metrics_displayed.py b/src/metrix/tests/integration/test_all_metrics_displayed.py
index 31f06d7d..a080b22b 100644
--- a/src/metrix/tests/integration/test_all_metrics_displayed.py
+++ b/src/metrix/tests/integration/test_all_metrics_displayed.py
@@ -1,8 +1,8 @@
 """
-Integration test to verify all 11 metrics are displayed
+Integration test to verify all metrics are displayed
 
-This test specifically validates the bug fix where only 7 out of 11 metrics
-were being displayed due to MetricComputer receiving wrong parameter type.
+This test validates that all memory and compute metrics are properly
+computed and displayed by the metrix profiler.
 """
 
 import pytest
@@ -35,10 +35,10 @@ def vector_add_binary(tmp_path):
 
 @pytest.mark.integration
 @pytest.mark.timeout(60)
-def test_all_11_metrics_are_displayed(vector_add_binary):
-    """Verify that all 11 metrics are computed and displayed"""
+def test_all_memory_metrics_are_displayed(vector_add_binary):
+    """Verify that all 12 memory metrics are computed and displayed"""
     result = subprocess.run(
-        ["metrix", "-n", "1", "--aggregate", str(vector_add_binary)],
+        ["metrix", "-n", "1", "--aggregate", "--profile", "memory", str(vector_add_binary)],
         capture_output=True,
         text=True,
         timeout=60,
@@ -47,8 +47,8 @@ def test_all_11_metrics_are_displayed(vector_add_binary):
     assert result.returncode == 0, f"stderr: {result.stderr}"
     output = result.stdout
 
-    # List of all 11 expected metrics (friendly names as displayed)
-    expected_metrics = [
+    # List of all expected memory metrics (friendly names as displayed)
+    expected_memory_metrics = [
         # Memory Bandwidth (5 metrics)
         "HBM Read Bandwidth",
         "HBM Write Bandwidth",
@@ -67,15 +67,15 @@ def test_all_11_metrics_are_displayed(vector_add_binary):
     ]
 
     missing_metrics = []
-    for metric in expected_metrics:
+    for metric in expected_memory_metrics:
         if metric not in output:
             missing_metrics.append(metric)
 
     assert (
         len(missing_metrics) == 0
-    ), f"Missing metrics: {missing_metrics}\n\nOutput:\n{output}"
+    ), f"Missing memory metrics: {missing_metrics}\n\nOutput:\n{output}"
 
-    print(f"✓ All {len(expected_metrics)} metrics displayed successfully")
+    print(f"✓ All {len(expected_memory_metrics)} memory metrics displayed successfully")
 
 
 @pytest.mark.integration
@@ -83,7 +83,7 @@ def test_all_11_metrics_are_displayed(vector_add_binary):
 def test_bandwidth_metrics_have_values(vector_add_binary):
     """Verify bandwidth metrics compute to non-zero values"""
     result = subprocess.run(
-        ["metrix", "-n", "1", "--aggregate", "--verbose", str(vector_add_binary)],
+        ["metrix", "-n", "1", "--aggregate", str(vector_add_binary)],
         capture_output=True,
         text=True,
         timeout=60,
@@ -109,10 +109,45 @@ def test_bandwidth_metrics_have_values(vector_add_binary):
             assert "0.00 percent" not in line, "HBM Bandwidth Utilization is zero!"
 
 
+@pytest.mark.integration
+@pytest.mark.timeout(120)
+def test_all_compute_metrics_are_displayed(vector_add_binary):
+    """Run the ``compute`` profile once and check every compute metric is shown.
+
+    Each friendly display name below is searched for in the captured stdout of
+    the ``metrix`` CLI; names that are absent are listed in the failure message.
+    """
+    argv = ["metrix", "-n", "1", "--aggregate", "--profile", "compute"]
+    argv.append(str(vector_add_binary))
+    completed = subprocess.run(argv, capture_output=True, text=True, timeout=120)
+
+    assert completed.returncode == 0, f"stderr: {completed.stderr}"
+    stdout_text = completed.stdout
+
+    # Friendly names as displayed in the report
+    display_names = [
+        "Total FLOPS",
+        "HBM Compute Throughput",
+        "HBM Arithmetic Intensity",
+        "L2 Arithmetic Intensity",
+        "L1 Arithmetic Intensity",
+    ]
+
+    # Gather all absent names first so a single failure reports every one
+    absent = [name for name in display_names if name not in stdout_text]
+
+    assert (
+        len(absent) == 0
+    ), f"Missing compute metrics: {absent}\n\nOutput:\n{stdout_text}"
+
+    summary = f"✓ All {len(display_names)} compute metrics displayed successfully"
+    print(summary)
+
+
 @pytest.mark.integration
 @pytest.mark.timeout(60)
-def test_json_output_has_all_metrics(vector_add_binary, tmp_path):
-    """Verify JSON output contains all 11 metrics"""
+def test_json_output_has_memory_metrics(vector_add_binary, tmp_path):
+    """Verify JSON output contains all memory metrics"""
     output_file = tmp_path / "results.json"
 
     result = subprocess.run(
@@ -121,6 +156,8 @@ def test_json_output_has_all_metrics(vector_add_binary, tmp_path):
             "-n",
             "1",
             "--aggregate",
+            "--profile",
+            "memory",
             "-o",
             str(output_file),
             str(vector_add_binary),
@@ -148,14 +185,57 @@ def test_json_output_has_all_metrics(vector_add_binary, tmp_path):
     assert "duration_us" in kernel_data
     assert "metrics" in kernel_data
 
-    # Count metrics
-    num_metrics = len(kernel_data["metrics"])
-    assert (
-        num_metrics == 12
-    ), f"Expected 12 metrics, got {num_metrics}: {list(kernel_data['metrics'].keys())}"
-
-    # Verify the 4 bandwidth metrics that were previously failing
+    # Verify key memory bandwidth metrics
     assert "memory.hbm_bandwidth_utilization" in kernel_data["metrics"]
     assert "memory.hbm_read_bandwidth" in kernel_data["metrics"]
     assert "memory.hbm_write_bandwidth" in kernel_data["metrics"]
     assert "memory.l2_bandwidth" in kernel_data["metrics"]
+
+
+@pytest.mark.integration
+@pytest.mark.timeout(120)
+def test_json_output_has_compute_metrics(vector_add_binary, tmp_path):
+    """Write a ``compute`` profile run to JSON and check its compute metric keys.
+
+    The CLI is asked to write ``results.json`` under ``tmp_path``. The first
+    entry of the parsed file must carry ``duration_us`` plus a ``metrics``
+    entry containing every ``compute.*`` id listed below.
+    """
+    import json
+
+    json_path = tmp_path / "results.json"
+    argv = [
+        "metrix",
+        "-n",
+        "1",
+        "--aggregate",
+        "--profile",
+        "compute",
+        "-o",
+        str(json_path),
+        str(vector_add_binary),
+    ]
+    completed = subprocess.run(argv, capture_output=True, text=True, timeout=120)
+
+    assert completed.returncode == 0, f"stderr: {completed.stderr}"
+    assert json_path.exists()
+
+    with open(json_path) as handle:
+        parsed = json.load(handle)
+
+    assert len(parsed) > 0, "No kernels in JSON output"
+
+    # Inspect only the first kernel/dispatch entry
+    entry = parsed[list(parsed.keys())[0]]
+    for required_field in ("duration_us", "metrics"):
+        assert required_field in entry
+
+    reported = entry["metrics"]
+    for metric_id in (
+        "compute.total_flops",
+        "compute.hbm_gflops",
+        "compute.hbm_arithmetic_intensity",
+        "compute.l2_arithmetic_intensity",
+        "compute.l1_arithmetic_intensity",
+    ):
+        assert metric_id in reported
\ No newline at end of file
diff --git a/src/metrix/tests/integration/test_cli_integration.py b/src/metrix/tests/integration/test_cli_integration.py
index 4c3b173b..330a6fbb 100644
--- a/src/metrix/tests/integration/test_cli_integration.py
+++ b/src/metrix/tests/integration/test_cli_integration.py
@@ -36,7 +36,7 @@ def test_cli_time_only_aggregated():
             "metrix",
             "profile",
             "--time-only",
-            "--runs",
+            "--num-replays",
             "3",
             "--aggregate",
             str(VECTOR_ADD),
@@ -103,3 +103,73 @@ def test_cli_list_metrics():
 
     assert result.returncode == 0
     assert "memory.l2_hit_rate" in result.stdout
+
+
+def test_cli_list_metrics_includes_compute():
+    """`metrix list metrics` must exit cleanly and mention the compute metric ids."""
+    argv = ["metrix", "list", "metrics"]
+    listing = subprocess.run(argv, capture_output=True, text=True, timeout=5)
+
+    assert listing.returncode == 0
+    # Two representative ids stand in for the whole compute family
+    for metric_id in ("compute.total_flops", "compute.hbm_arithmetic_intensity"):
+        assert metric_id in listing.stdout
+
+
+def test_cli_list_profiles_includes_compute():
+    """`metrix list profiles` must exit cleanly and print the text "COMPUTE"."""
+    argv = ["metrix", "list", "profiles"]
+    listing = subprocess.run(argv, capture_output=True, text=True, timeout=5)
+
+    exit_code, printed = listing.returncode, listing.stdout
+    assert exit_code == 0
+    assert "COMPUTE" in printed
+
+
+@pytest.mark.timeout(120)
+@pytest.mark.skipif(not VECTOR_ADD.exists(), reason="vector_add not compiled")
+def test_cli_compute_profile():
+    """Profile vector_add once with ``--profile compute`` and check the report.
+
+    Passes when the CLI exits with status 0, the kernel name appears in stdout,
+    and stdout contains at least one compute-related marker string.
+    """
+    argv = ["metrix", "profile", "--profile", "compute"]
+    argv += ["-n", "1", "--aggregate", str(VECTOR_ADD)]
+    completed = subprocess.run(
+        argv,
+        capture_output=True,
+        text=True,
+        timeout=120,
+    )
+    report = completed.stdout
+
+    assert completed.returncode == 0, f"Command failed: {completed.stderr}"
+    assert "vector_add" in report
+    # Any one of these markers is accepted as evidence of compute output
+    markers = ("COMPUTE", "Total FLOPS", "Arithmetic Intensity")
+    assert any(marker in report for marker in markers)
+
+
+@pytest.mark.timeout(120)
+@pytest.mark.skipif(not VECTOR_ADD.exists(), reason="vector_add not compiled")
+def test_cli_compute_metric_directly():
+    """Request ``compute.total_flops`` alone via ``--metrics`` and check stdout.
+
+    Passes when the CLI exits with status 0 and stdout mentions both the kernel
+    name and the substring "FLOPS".
+    """
+    argv = ["metrix", "--metrics", "compute.total_flops", "-n", "1", str(VECTOR_ADD)]
+    completed = subprocess.run(
+        argv,
+        capture_output=True,
+        text=True,
+        timeout=120,
+    )
+    report = completed.stdout
+
+    assert completed.returncode == 0, f"Command failed: {completed.stderr}"
+    assert "vector_add" in report
+    # Matches the "Total FLOPS" display name as well as any bare "FLOPS" text
+    assert "FLOPS" in report
\ No newline at end of file
diff --git a/src/metrix/tests/unit/backends/test_gfx942_metrics.py b/src/metrix/tests/unit/backends/test_gfx942_metrics.py
index 44098f1f..7cbbe5de 100644
--- a/src/metrix/tests/unit/backends/test_gfx942_metrics.py
+++ b/src/metrix/tests/unit/backends/test_gfx942_metrics.py
@@ -146,40 +146,73 @@ def test_no_lds_instructions(self):
 
 
 class TestBandwidthMetrics:
-    """Test HBM bandwidth computations"""
+    """Test HBM bandwidth computations with 32B/64B/128B request granularity"""
 
-    def test_hbm_read_bandwidth(self):
-        """Test read bandwidth calculation"""
+    def test_hbm_read_bandwidth_64b_only(self):
+        """Test read bandwidth with only 64B requests"""
         backend = GFX942Backend()
         backend._raw_data = {
-            'TCC_EA0_RDREQ_sum': 1000,
-            'TCC_EA1_RDREQ_sum': 1000,
-            'GRBM_GUI_ACTIVE': 2100000  # 1 ms at 2.1 GHz
+            'TCC_EA0_RDREQ_sum': 1000,      # Total read requests
+            'TCC_EA0_RDREQ_32B_sum': 0,     # No 32B requests
+            'TCC_BUBBLE_sum': 0,            # No 128B requests
+            'GRBM_GUI_ACTIVE': 2100000      # 1 ms at 2.1 GHz
+        }
+
+        result = backend._hbm_read_bandwidth()
+        # (1000 * 64 bytes) / 0.001 seconds = 64 MB/s = 0.064 GB/s
+        assert 0.06 < result < 0.07
+
+    def test_hbm_read_bandwidth_mixed_sizes(self):
+        """Test read bandwidth with mixed request sizes"""
+        backend = GFX942Backend()
+        backend._raw_data = {
+            'TCC_EA0_RDREQ_sum': 1000,      # Total requests
+            'TCC_EA0_RDREQ_32B_sum': 200,   # 200 × 32B = 6400 bytes
+            'TCC_BUBBLE_sum': 300,          # 300 × 128B = 38400 bytes
+            # Remaining: 1000 - 200 - 300 = 500 × 64B = 32000 bytes
+            # Total: 6400 + 38400 + 32000 = 76800 bytes
+            'GRBM_GUI_ACTIVE': 2100000      # 1 ms at 2.1 GHz
         }
 
         result = backend._hbm_read_bandwidth()
-        # (2000 requests * 64 bytes) / 0.001 seconds = 128 MB/s = 0.128 GB/s
-        assert 0.1 < result < 0.2
+        # 76800 / 1e9 / 0.001 = 0.0768 GB/s
+        assert 0.07 < result < 0.08
+
+    def test_hbm_write_bandwidth_64b_only(self):
+        """Test write bandwidth with only 64B requests"""
+        backend = GFX942Backend()
+        backend._raw_data = {
+            'TCC_EA0_WRREQ_sum': 1000,      # Total write requests
+            'TCC_EA0_WRREQ_64B_sum': 1000,  # All are 64B
+            'GRBM_GUI_ACTIVE': 2100000      # 1 ms at 2.1 GHz
+        }
+
+        result = backend._hbm_write_bandwidth()
+        # (1000 * 64 bytes) / 0.001 seconds = 64 MB/s = 0.064 GB/s
+        assert 0.06 < result < 0.07
 
-    def test_hbm_write_bandwidth(self):
-        """Test write bandwidth calculation"""
+    def test_hbm_write_bandwidth_mixed_sizes(self):
+        """Test write bandwidth with mixed 32B and 64B requests"""
         backend = GFX942Backend()
         backend._raw_data = {
-            'TCC_EA0_WRREQ_sum': 500,
-            'TCC_EA1_WRREQ_sum': 500,
-            'GRBM_GUI_ACTIVE': 2100000  # 1 ms at 2.1 GHz
+            'TCC_EA0_WRREQ_sum': 1000,      # Total write requests
+            'TCC_EA0_WRREQ_64B_sum': 600,   # 600 × 64B = 38400 bytes
+            # Remaining: 1000 - 600 = 400 × 32B = 12800 bytes
+            # Total: 38400 + 12800 = 51200 bytes
+            'GRBM_GUI_ACTIVE': 2100000      # 1 ms at 2.1 GHz
         }
 
         result = backend._hbm_write_bandwidth()
-        # (1000 requests * 64 bytes) / 0.001 seconds = 64 MB/s = 0.064 GB/s
-        assert 0.05 < result < 0.1
+        # 51200 / 1e9 / 0.001 = 0.0512 GB/s
+        assert 0.05 < result < 0.06
 
     def test_zero_active_cycles(self):
         """Handle zero active cycles"""
         backend = GFX942Backend()
         backend._raw_data = {
             'TCC_EA0_RDREQ_sum': 1000,
-            'TCC_EA1_RDREQ_sum': 1000,
+            'TCC_EA0_RDREQ_32B_sum': 0,
+            'TCC_BUBBLE_sum': 0,
             'GRBM_GUI_ACTIVE': 0
         }
 
@@ -188,36 +221,38 @@ def test_zero_active_cycles(self):
 
 
 class TestAtomicLatency:
-    """Test atomic operation latency computation"""
+    """Test L2 cache atomic operation latency computation"""
 
     def test_low_latency(self):
         """10 cycles per atomic operation"""
         backend = GFX942Backend()
         backend._raw_data = {
-            'SQ_INSTS_GDS': 1000,
-            'GDS_BUSY': 10000
+            'TCC_EA0_ATOMIC_sum': 1000,        # 1000 atomic operations
+            'TCC_EA0_ATOMIC_LEVEL_sum': 10000  # 10000 total cycles
         }
 
         result = backend._atomic_latency()
+        # 10000 / 1000 = 10 cycles per atomic
         assert result == 10.0
 
     def test_high_latency(self):
         """1000 cycles per atomic (contention)"""
         backend = GFX942Backend()
         backend._raw_data = {
-            'SQ_INSTS_GDS': 100,
-            'GDS_BUSY': 100000
+            'TCC_EA0_ATOMIC_sum': 100,           # 100 atomic operations
+            'TCC_EA0_ATOMIC_LEVEL_sum': 100000   # 100000 total cycles
         }
 
         result = backend._atomic_latency()
+        # 100000 / 100 = 1000 cycles per atomic
         assert result == 1000.0
 
     def test_no_atomics(self):
         """Handle zero atomic instructions"""
         backend = GFX942Backend()
         backend._raw_data = {
-            'SQ_INSTS_GDS': 0,
-            'GDS_BUSY': 5000
+            'TCC_EA0_ATOMIC_sum': 0,
+            'TCC_EA0_ATOMIC_LEVEL_sum': 5000
         }
 
         result = backend._atomic_latency()
@@ -251,3 +286,205 @@ def test_get_required_counters(self):
         assert "TCC_MISS_sum" in counters
         assert len(counters) == 2
 
+    def test_discovers_compute_metrics(self):
+        """get_available_metrics() must report each compute metric listed below"""
+        discovered = GFX942Backend().get_available_metrics()
+        for name in (
+            "compute.total_flops",
+            "compute.hbm_gflops",
+            "compute.hbm_arithmetic_intensity",
+            "compute.l2_arithmetic_intensity",
+            "compute.l1_arithmetic_intensity",
+        ):
+            assert name in discovered
+
+
+class TestComputeMetrics:
+    """Unit tests for GFX942Backend compute metrics (FLOP counts, GFLOPS, arithmetic intensity)"""
+
+    def _get_zero_flops_counters(self):
+        """Helper: return a fresh counter dict with every FLOP-related counter set to 0"""
+        return {
+            'SQ_INSTS_VALU_ADD_F16': 0, 'SQ_INSTS_VALU_MUL_F16': 0,
+            'SQ_INSTS_VALU_TRANS_F16': 0, 'SQ_INSTS_VALU_FMA_F16': 0,
+            'SQ_INSTS_VALU_ADD_F32': 0, 'SQ_INSTS_VALU_MUL_F32': 0,
+            'SQ_INSTS_VALU_TRANS_F32': 0, 'SQ_INSTS_VALU_FMA_F32': 0,
+            'SQ_INSTS_VALU_ADD_F64': 0, 'SQ_INSTS_VALU_MUL_F64': 0,
+            'SQ_INSTS_VALU_TRANS_F64': 0, 'SQ_INSTS_VALU_FMA_F64': 0,
+            'SQ_INSTS_VALU_MFMA_MOPS_F16': 0, 'SQ_INSTS_VALU_MFMA_MOPS_BF16': 0,
+            'SQ_INSTS_VALU_MFMA_MOPS_F32': 0, 'SQ_INSTS_VALU_MFMA_MOPS_F64': 0,
+        }
+
+    def test_total_flops_fp32_add(self):
+        """FP32 add instructions contribute 64 FLOPs each (one per thread in the wave)"""
+        backend = GFX942Backend()
+        backend._raw_data = self._get_zero_flops_counters()
+        backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 100
+
+        result = backend._total_flops()
+        # 64 threads per wave * 100 instructions = 6400 FLOPs
+        assert result == 6400
+
+    def test_total_flops_fma_counts_double(self):
+        """FMA instructions count as 2 operations (multiply + add) per thread"""
+        backend = GFX942Backend()
+        backend._raw_data = self._get_zero_flops_counters()
+        backend._raw_data['SQ_INSTS_VALU_FMA_F32'] = 100
+
+        result = backend._total_flops()
+        # 64 threads * 100 FMA * 2 ops = 12800 FLOPs
+        assert result == 12800
+
+    def test_total_flops_mfma_high_throughput(self):
+        """Each counted MFMA op contributes 512 operations to the total"""
+        backend = GFX942Backend()
+        backend._raw_data = self._get_zero_flops_counters()
+        backend._raw_data['SQ_INSTS_VALU_MFMA_MOPS_F32'] = 10
+
+        result = backend._total_flops()
+        # 512 ops * 10 instructions = 5120 FLOPs
+        assert result == 5120
+
+    def test_total_flops_mixed_precision(self):
+        """F16, F32 and F64 adds are summed with the same 64-per-instruction weight"""
+        backend = GFX942Backend()
+        backend._raw_data = self._get_zero_flops_counters()
+        backend._raw_data['SQ_INSTS_VALU_ADD_F16'] = 100  # 6400 FLOPs
+        backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 50   # 3200 FLOPs
+        backend._raw_data['SQ_INSTS_VALU_ADD_F64'] = 25   # 1600 FLOPs
+
+        result = backend._total_flops()
+        assert result == 6400 + 3200 + 1600
+
+    def test_total_flops_zero(self):
+        """All-zero counters must produce a total of 0 FLOPs"""
+        backend = GFX942Backend()
+        backend._raw_data = self._get_zero_flops_counters()
+
+        result = backend._total_flops()
+        assert result == 0
+
+    def test_hbm_gflops_calculation(self):
+        """GFLOPS is derived from the FLOP count and the active-cycle count"""
+        backend = GFX942Backend()
+        backend._raw_data = self._get_zero_flops_counters()
+        backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 1000000  # 64M FLOPs
+        backend._raw_data['GRBM_GUI_ACTIVE'] = 2100000  # 1 ms at 2.1 GHz
+
+        result = backend._hbm_gflops()
+        # 64M FLOPs / 0.001 seconds = 64 GFLOPS
+        assert 60 < result < 70
+
+    def test_hbm_gflops_zero_time(self):
+        """Zero active cycles must yield 0.0 GFLOPS"""
+        backend = GFX942Backend()
+        backend._raw_data = self._get_zero_flops_counters()
+        backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 1000
+        backend._raw_data['GRBM_GUI_ACTIVE'] = 0
+
+        result = backend._hbm_gflops()
+        assert result == 0.0
+
+    def test_hbm_arithmetic_intensity(self):
+        """HBM arithmetic intensity is total FLOPs divided by HBM bytes"""
+        backend = GFX942Backend()
+        backend._raw_data = self._get_zero_flops_counters()
+        backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 1000  # 64000 FLOPs
+        # HBM counters: simple case with only 64B reads
+        backend._raw_data['TCC_EA0_RDREQ_sum'] = 1000
+        backend._raw_data['TCC_EA0_RDREQ_32B_sum'] = 0
+        backend._raw_data['TCC_BUBBLE_sum'] = 0
+        backend._raw_data['TCC_EA0_WRREQ_sum'] = 0
+        backend._raw_data['TCC_EA0_WRREQ_64B_sum'] = 0
+
+        result = backend._hbm_arithmetic_intensity()
+        # 64000 FLOPs / (1000 * 64 bytes) = 64000 / 64000 = 1.0 FLOP/byte
+        assert abs(result - 1.0) < 1e-9  # float quotient: compare with tolerance
+
+    def test_hbm_arithmetic_intensity_zero_bytes(self):
+        """Zero HBM bytes transferred must yield 0.0 FLOP/byte"""
+        backend = GFX942Backend()
+        backend._raw_data = self._get_zero_flops_counters()
+        backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 1000
+        backend._raw_data['TCC_EA0_RDREQ_sum'] = 0
+        backend._raw_data['TCC_EA0_RDREQ_32B_sum'] = 0
+        backend._raw_data['TCC_BUBBLE_sum'] = 0
+        backend._raw_data['TCC_EA0_WRREQ_sum'] = 0
+        backend._raw_data['TCC_EA0_WRREQ_64B_sum'] = 0
+
+        result = backend._hbm_arithmetic_intensity()
+        assert result == 0.0
+
+    def test_l2_arithmetic_intensity(self):
+        """L2 arithmetic intensity is total FLOPs divided by L2 bytes"""
+        backend = GFX942Backend()
+        backend._raw_data = self._get_zero_flops_counters()
+        backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 1000  # 64000 FLOPs
+        backend._raw_data['TCC_REQ_sum'] = 500  # 500 * 128 = 64000 bytes
+
+        result = backend._l2_arithmetic_intensity()
+        # 64000 FLOPs / 64000 bytes = 1.0 FLOP/byte
+        assert abs(result - 1.0) < 1e-9  # float quotient: compare with tolerance
+
+    def test_l2_arithmetic_intensity_zero_bytes(self):
+        """Zero L2 bytes must yield 0.0 FLOP/byte"""
+        backend = GFX942Backend()
+        backend._raw_data = self._get_zero_flops_counters()
+        backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 1000
+        backend._raw_data['TCC_REQ_sum'] = 0
+
+        result = backend._l2_arithmetic_intensity()
+        assert result == 0.0
+
+    def test_l1_arithmetic_intensity(self):
+        """L1 arithmetic intensity is total FLOPs divided by L1 bytes"""
+        backend = GFX942Backend()
+        backend._raw_data = self._get_zero_flops_counters()
+        backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 1000  # 64000 FLOPs
+        backend._raw_data['TCP_TOTAL_CACHE_ACCESSES_sum'] = 500  # 500 * 128 = 64000 bytes
+
+        result = backend._l1_arithmetic_intensity()
+        # 64000 FLOPs / 64000 bytes = 1.0 FLOP/byte
+        assert abs(result - 1.0) < 1e-9  # float quotient: compare with tolerance
+
+    def test_l1_arithmetic_intensity_zero_bytes(self):
+        """Zero L1 bytes must yield 0.0 FLOP/byte"""
+        backend = GFX942Backend()
+        backend._raw_data = self._get_zero_flops_counters()
+        backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 1000
+        backend._raw_data['TCP_TOTAL_CACHE_ACCESSES_sum'] = 0
+
+        result = backend._l1_arithmetic_intensity()
+        assert result == 0.0
+
+    def test_high_arithmetic_intensity_compute_bound(self):
+        """Many FLOPs over few HBM bytes gives a high AI (compute-bound kernel)"""
+        backend = GFX942Backend()
+        backend._raw_data = self._get_zero_flops_counters()
+        # Lots of compute, little memory
+        backend._raw_data['SQ_INSTS_VALU_MFMA_MOPS_F32'] = 1000  # 512000 FLOPs
+        backend._raw_data['TCC_EA0_RDREQ_sum'] = 100  # 6400 bytes
+        backend._raw_data['TCC_EA0_RDREQ_32B_sum'] = 0
+        backend._raw_data['TCC_BUBBLE_sum'] = 0
+        backend._raw_data['TCC_EA0_WRREQ_sum'] = 0
+        backend._raw_data['TCC_EA0_WRREQ_64B_sum'] = 0
+
+        result = backend._hbm_arithmetic_intensity()
+        # 512000 / 6400 = 80 FLOP/byte (very compute-bound)
+        assert abs(result - 80.0) < 1e-9  # float quotient: compare with tolerance
+
+    def test_low_arithmetic_intensity_memory_bound(self):
+        """Few FLOPs over many HBM bytes gives a low AI (memory-bound kernel)"""
+        backend = GFX942Backend()
+        backend._raw_data = self._get_zero_flops_counters()
+        # Little compute, lots of memory
+        backend._raw_data['SQ_INSTS_VALU_ADD_F32'] = 100  # 6400 FLOPs
+        backend._raw_data['TCC_EA0_RDREQ_sum'] = 10000  # 640000 bytes
+        backend._raw_data['TCC_EA0_RDREQ_32B_sum'] = 0
+        backend._raw_data['TCC_BUBBLE_sum'] = 0
+        backend._raw_data['TCC_EA0_WRREQ_sum'] = 0
+        backend._raw_data['TCC_EA0_WRREQ_64B_sum'] = 0
+
+        result = backend._hbm_arithmetic_intensity()
+        # 6400 / 640000 = 0.01 FLOP/byte (very memory-bound); 0.01 is inexact in binary
+        assert abs(result - 0.01) < 1e-9
diff --git a/src/metrix/tests/unit/test_api.py b/src/metrix/tests/unit/test_api.py
index 1448f887..8eed2427 100644
--- a/src/metrix/tests/unit/test_api.py
+++ b/src/metrix/tests/unit/test_api.py
@@ -14,14 +14,13 @@ def test_init_default(self):
         """Test default initialization"""
         profiler = Metrix()
         assert profiler.arch == "gfx942"
-        assert profiler.verbose == False
         assert profiler.backend is not None
 
     def test_init_custom_arch(self):
         """Test custom architecture"""
-        profiler = Metrix(arch="gfx942", verbose=True)
+        profiler = Metrix(arch="gfx942")
         assert profiler.arch == "gfx942"
-        assert profiler.verbose == True
+        assert profiler.backend is not None
 
 
 class TestMetrixMetricListing:
@@ -35,6 +34,16 @@ def test_list_metrics(self):
         assert "memory.l2_hit_rate" in metrics
         assert "memory.hbm_bandwidth_utilization" in metrics
 
+    def test_list_metrics_includes_compute(self):
+        """list_metrics() must expose each of the compute metrics below"""
+        listed = Metrix().list_metrics()
+        for metric_name in (
+            "compute.total_flops", "compute.hbm_gflops",
+            "compute.hbm_arithmetic_intensity",
+            "compute.l2_arithmetic_intensity", "compute.l1_arithmetic_intensity",
+        ):
+            assert metric_name in listed
+
     def test_list_profiles(self):
         """Test listing profiles"""
         profiler = Metrix()
@@ -42,6 +51,12 @@ def test_list_profiles(self):
         assert "quick" in profiles
         assert "memory" in profiles
 
+    def test_list_profiles_includes_compute(self):
+        """The "compute" profile must be offered by list_profiles()"""
+        available_profiles = Metrix().list_profiles()
+        # Membership only: "quick" and "memory" are checked by test_list_profiles.
+        assert "compute" in available_profiles
+
     def test_get_metric_info(self):
         """Test getting metric information"""
         profiler = Metrix()
@@ -49,6 +64,20 @@ def test_get_metric_info(self):
         assert info['name'] == "L2 Cache Hit Rate"
         assert info['unit'] == "percent"
 
+    def test_get_compute_metric_info(self):
+        """get_metric_info() reports the name and unit of compute.total_flops"""
+        metadata = Metrix().get_metric_info("compute.total_flops")
+        display_name, unit = metadata['name'], metadata['unit']
+        assert display_name == "Total FLOPS"
+        assert unit == "FLOPS"
+
+    def test_get_arithmetic_intensity_info(self):
+        """get_metric_info() reports the name and unit of the HBM arithmetic intensity"""
+        metadata = Metrix().get_metric_info("compute.hbm_arithmetic_intensity")
+        display_name, unit = metadata['name'], metadata['unit']
+        assert display_name == "HBM Arithmetic Intensity"
+        assert unit == "FLOP/byte"
+
     def test_get_unknown_metric_raises(self):
         """Test getting info for unknown metric raises error"""
         profiler = Metrix()