From a98d263ee26c39a1c82a703b85731d13ed72d593 Mon Sep 17 00:00:00 2001 From: Ilia Sergachev Date: Fri, 12 Jun 2026 12:48:21 +0000 Subject: [PATCH] [GPU] Fix family-conditional PTX compilation. --- .../gpu/llvm_gpu_backend/nvptx_backend.cc | 22 +++++++++++++------ .../gpu/llvm_gpu_backend/nvptx_backend.h | 11 ++++++++-- .../llvm_gpu_backend/nvptx_backend_test.cc | 21 ++++++++++++++++++ xla/service/gpu/nvptx_compiler.cc | 10 ++++----- 4 files changed, 50 insertions(+), 14 deletions(-) diff --git a/xla/service/gpu/llvm_gpu_backend/nvptx_backend.cc b/xla/service/gpu/llvm_gpu_backend/nvptx_backend.cc index 81130c6bb23f9..ec2b5327c60db 100644 --- a/xla/service/gpu/llvm_gpu_backend/nvptx_backend.cc +++ b/xla/service/gpu/llvm_gpu_backend/nvptx_backend.cc @@ -20,7 +20,6 @@ limitations under the License. #include #include #include -#include #include #include "absl/base/call_once.h" @@ -30,7 +29,6 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "xla/tsl/platform/status_macros.h" -#include "third_party/gpus/cuda/include/cuda.h" #include "llvm/ADT/FloatingPointMode.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/LazyCallGraph.h" @@ -59,6 +57,7 @@ limitations under the License. #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/IPO/Internalize.h" #include "llvm/Transforms/Scalar.h" +#include "third_party/gpus/cuda/include/cuda.h" #include "xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" #include "xla/service/gpu/llvm_gpu_backend/load_ir_module.h" #include "xla/service/gpu/llvm_gpu_backend/nvptx_libdevice_path.h" @@ -239,7 +238,13 @@ std::vector GetNVPTXBackendOptions( return backend_llvm_opts; } -std::string GetSmName(se::CudaComputeCapability compute_capability) { +constexpr se::CudaComputeCapability kSupportedVersions[] = { + {12, 1}, {12, 0}, {11, 0}, {10, 3}, {10, 0}, {9, 0}, {8, 9}, {8, 7}, + {8, 6}, {8, 0}, {7, 5}, {7, 2}, {7, 0}, {6, 2}, {6, 1}, {6, 0}, + {5, 3}, {5, 2}, {5, 0}, {3, 7}, {3, 5}, {3, 2}, {3, 0}}; + +se::CudaComputeCapability ResolveSupportedComputeCapability( + se::CudaComputeCapability compute_capability) { using CudaComputeCapabilities = se::CudaComputeCapability::CudaComputeCapabilities; @@ -248,10 +253,6 @@ std::string GetSmName(se::CudaComputeCapability compute_capability) { se::CudaComputeCapability::FeatureExtension::kNone; // If the current compute capability isn't known, fallback to the // most recent version before it. - constexpr stream_executor::CudaComputeCapability kSupportedVersions[] = { - {12, 1}, {12, 0}, {11, 0}, {10, 3}, {10, 0}, {9, 0}, {8, 9}, {8, 7}, - {8, 6}, {8, 0}, {7, 5}, {7, 2}, {7, 0}, {6, 2}, {6, 1}, {6, 0}, - {5, 3}, {5, 2}, {5, 0}, {3, 7}, {3, 5}, {3, 2}, {3, 0}}; // Initialize to the least supported version, which acts as a safe fallback auto target_compute_capability = kSupportedVersions[std::size(kSupportedVersions) - 1]; @@ -284,6 +285,13 @@ std::string GetSmName(se::CudaComputeCapability compute_capability) { se::CudaComputeCapability::FeatureExtension::kFamilyCompatibleFeatures; } + return target_compute_capability; +} + +std::string GetSmName(se::CudaComputeCapability compute_capability) { + se::CudaComputeCapability target_compute_capability = + ResolveSupportedComputeCapability(compute_capability); + // If the current CC isn't supported by LLVM and it is newer then // the max supported LLVM version, do not warn about it. The end // user can't do anything about this. E.g., PTX compiled for SM75 will diff --git a/xla/service/gpu/llvm_gpu_backend/nvptx_backend.h b/xla/service/gpu/llvm_gpu_backend/nvptx_backend.h index 525641d90b46c..ebf63a2c11744 100644 --- a/xla/service/gpu/llvm_gpu_backend/nvptx_backend.h +++ b/xla/service/gpu/llvm_gpu_backend/nvptx_backend.h @@ -24,13 +24,20 @@ limitations under the License. #include "absl/status/statusor.h" #include "llvm/IR/Module.h" #include "llvm/Target/TargetMachine.h" -#include "xla/service/gpu/llvm_gpu_backend/ptx_version_util.h" +#include "xla/stream_executor/cuda/cuda_compute_capability.h" #include "xla/stream_executor/device_description.h" -#include "xla/stream_executor/semantic_version.h" #include "xla/xla.pb.h" namespace xla::gpu::nvptx { +// Resolves the compute capability that XLA actually compiles for given the +// compute capability of the target device. If the device's compute capability +// is not directly supported by the bundled LLVM/ptxas, this returns the most +// advanced supported compute capability that the device can run, potentially +// with the family ("f") feature extension enabled. +stream_executor::CudaComputeCapability ResolveSupportedComputeCapability( + stream_executor::CudaComputeCapability compute_capability); + // Gets the GPU name as it's known to LLVM for a given compute // capability. If we see an unrecognized compute capability, we // return the highest one that is known and below the selected device. diff --git a/xla/service/gpu/llvm_gpu_backend/nvptx_backend_test.cc b/xla/service/gpu/llvm_gpu_backend/nvptx_backend_test.cc index b2c17f85aecc6..9b0a462299478 100644 --- a/xla/service/gpu/llvm_gpu_backend/nvptx_backend_test.cc +++ b/xla/service/gpu/llvm_gpu_backend/nvptx_backend_test.cc @@ -64,6 +64,27 @@ TEST(UtilsTest, TestGetSmName) { ASSERT_EQ(nvptx::GetSmName(se::CudaComputeCapability{13, 0}), "sm_121"); } +TEST(UtilsTest, UnknownCapabilityFallsBackToFamilyCompatible) { + using FeatureExtension = se::CudaComputeCapability::FeatureExtension; + // Directly supported compute capabilities keep their feature extension. + EXPECT_EQ(nvptx::ResolveSupportedComputeCapability(se::CudaComputeCapability{ + 10, 0, FeatureExtension::kAcceleratedFeatures}), + (se::CudaComputeCapability{ + 10, 0, FeatureExtension::kAcceleratedFeatures})); + // An unknown compute capability within a known major version falls back to + // the latest supported minor version with the family compatible extension. + // This mirrors a yet-unreleased device (e.g. sm_1099a) where ptxas only knows + // about sm_103f. + EXPECT_EQ(nvptx::ResolveSupportedComputeCapability(se::CudaComputeCapability{ + 10, 99, FeatureExtension::kAcceleratedFeatures}), + (se::CudaComputeCapability{ + 10, 3, FeatureExtension::kFamilyCompatibleFeatures})); + // When no family-compatible extension is available, don't use any. + EXPECT_EQ(nvptx::ResolveSupportedComputeCapability(se::CudaComputeCapability{ + 9, 99, FeatureExtension::kAcceleratedFeatures}), + (se::CudaComputeCapability{9, 0, FeatureExtension::kNone})); +} + using VersionPair = std::pair; using PtxVersionFromCudaVersionTest = ::testing::TestWithParam; diff --git a/xla/service/gpu/nvptx_compiler.cc b/xla/service/gpu/nvptx_compiler.cc index 5c7b67b903f60..d3ebe57f25e09 100644 --- a/xla/service/gpu/nvptx_compiler.cc +++ b/xla/service/gpu/nvptx_compiler.cc @@ -558,8 +558,8 @@ NVPTXCompiler::CompileTargetBinary( se::cuda::CompilationOptions compilation_options = PtxCompileOptionsFromDebugOptions(module_config.debug_options()); - se::CudaComputeCapability cc = - *device_description.gpu_compute_capability().cuda_compute_capability(); + se::CudaComputeCapability cc = nvptx::ResolveSupportedComputeCapability( + *device_description.gpu_compute_capability().cuda_compute_capability()); // This may print multiple lines per HLO compilation because of the // parallelized compilation of LLVM modules. @@ -620,8 +620,8 @@ absl::StatusOr> NVPTXCompiler::LinkModules( return std::vector{}; } - auto cc = - device_description.gpu_compute_capability().cuda_compute_capability(); + se::CudaComputeCapability cc = nvptx::ResolveSupportedComputeCapability( + *device_description.gpu_compute_capability().cuda_compute_capability()); ASSIGN_OR_RETURN(const se::cuda::CompilationProvider* compilation_provider, GetCompilationProvider(debug_options, stream_exec)); @@ -640,7 +640,7 @@ absl::StatusOr> NVPTXCompiler::LinkModules( << compilation_provider->name(); ASSIGN_OR_RETURN( se::cuda::Assembly assembly, - compilation_provider->CompileAndLink(*cc, inputs, compilation_options)); + compilation_provider->CompileAndLink(cc, inputs, compilation_options)); return std::move(assembly.cubin); }