From 42148414962fabd7795cd6f36cd26cd50c95b19f Mon Sep 17 00:00:00 2001 From: whatghost Date: Wed, 27 May 2026 05:08:40 +0000 Subject: [PATCH 1/2] fix bugs when device is None in get_full_tflops_approx and add b200 tflops --- python/triton_dist/kernels/nvidia/gemm_perf_model.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/python/triton_dist/kernels/nvidia/gemm_perf_model.py b/python/triton_dist/kernels/nvidia/gemm_perf_model.py index 62c4f2a92..a7b2d88c7 100644 --- a/python/triton_dist/kernels/nvidia/gemm_perf_model.py +++ b/python/triton_dist/kernels/nvidia/gemm_perf_model.py @@ -120,6 +120,7 @@ def get_tflops_approx(device: torch.dtype, num_ctas: int, num_warps: int, dtype: def get_full_tflops_approx(dtype: torch.dtype, device: Optional[torch.device] = None): + device = torch.cuda.current_device() if device is None else device prop = torch.cuda.get_device_properties(device) return get_tflops_approx(device, prop.multi_processor_count, 4, dtype) @@ -135,7 +136,9 @@ def get_tensorcore_dtype_support(device_id=0): (8, 9): [torch.float16, torch.bfloat16, torch.float32, torch.int8, torch.float8_e4m3fn, torch.float8_e5m2], # Ada L40S/RTX 40xx # Hopper - (9, 0): [torch.float16, torch.bfloat16, torch.float8_e4m3fn, torch.float8_e5m2, torch.int8] + (9, 0): [torch.float16, torch.bfloat16, torch.float8_e4m3fn, torch.float8_e5m2, torch.int8], + # Blackwell + (10, 0): [torch.float16, torch.bfloat16, torch.float8_e4m3fn, torch.float8_e5m2, torch.int8], } return DTYPE_MAP.get(cap, [torch.float16, torch.float32]) @@ -177,6 +180,8 @@ def get_tensorcore_tflops_by_device_name(dtype, device_id=0): return 989 * (2 / dtype.itemsize) if device_name == "NVIDIA H20": return 148 * (2 / dtype.itemsize) + if device_name == "NVIDIA B200": + return 2250 * (2 / dtype.itemsize) logging.warning( f"device {device_name} not listed here. 
calculate tflops by estimation, or you can report it to developers.") @@ -206,6 +211,7 @@ def get_dram_gbps_by_device_name(device_name: str): "NVIDIA H100 SXM": 3958, "NVIDIA H100 NVL": 3341, "NVIDIA H800": 3350, + "NVIDIA B200": 8000, } return _DRAM_GBPS[device_name] @@ -227,8 +233,8 @@ def estimate_gemm_sol_time_ms(M: int, N: int, K: int, dtype=torch.bfloat16): if __name__ == "__main__": print(f"DRAM: {get_dram_gbps():0.2f} GB/s") print(f"DRAM by approx: {triton.testing.get_dram_gbps():0.2f} GB/s") - print(f"DRAM by device name: {get_dram_gbps_by_device_name(torch.cuda.get_device_name(0)):0.2f} GB/s") - print(f"TFLOPS: {get_tensorcore_tflops(torch.float16):0.2f} TFLOPS") + # print(f"DRAM by device name: {get_dram_gbps_by_device_name(torch.cuda.get_device_name(0)):0.2f} GB/s") + # print(f"TFLOPS: {get_tensorcore_tflops(torch.float16):0.2f} TFLOPS") num_sms = torch.cuda.get_device_properties(0).multi_processor_count print(f"TFLOPS by approx: {get_tensorcore_tflops_by_calc(0, num_sms, 4, torch.float16):0.2f} TFLOPS") print(f"TFLOPS by device name: {get_tensorcore_tflops_by_device_name(torch.float16):0.2f} TFLOPS") From 01d13c17969720053536d46b3b798adc4f19ad15 Mon Sep 17 00:00:00 2001 From: whatghost Date: Wed, 27 May 2026 05:19:11 +0000 Subject: [PATCH 2/2] uncomment debug msg --- python/triton_dist/kernels/nvidia/gemm_perf_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/triton_dist/kernels/nvidia/gemm_perf_model.py b/python/triton_dist/kernels/nvidia/gemm_perf_model.py index a7b2d88c7..ee23d0223 100644 --- a/python/triton_dist/kernels/nvidia/gemm_perf_model.py +++ b/python/triton_dist/kernels/nvidia/gemm_perf_model.py @@ -233,8 +233,8 @@ def estimate_gemm_sol_time_ms(M: int, N: int, K: int, dtype=torch.bfloat16): if __name__ == "__main__": print(f"DRAM: {get_dram_gbps():0.2f} GB/s") print(f"DRAM by approx: {triton.testing.get_dram_gbps():0.2f} GB/s") - print(f"DRAM by device name: 
{get_dram_gbps_by_device_name(torch.cuda.get_device_name(0)):0.2f} GB/s") - # print(f"TFLOPS: {get_tensorcore_tflops(torch.float16):0.2f} TFLOPS") + print(f"DRAM by device name: {get_dram_gbps_by_device_name(torch.cuda.get_device_name(0)):0.2f} GB/s") + print(f"TFLOPS: {get_tensorcore_tflops(torch.float16):0.2f} TFLOPS") num_sms = torch.cuda.get_device_properties(0).multi_processor_count print(f"TFLOPS by approx: {get_tensorcore_tflops_by_calc(0, num_sms, 4, torch.float16):0.2f} TFLOPS") print(f"TFLOPS by device name: {get_tensorcore_tflops_by_device_name(torch.float16):0.2f} TFLOPS")