From 08863add202f78e9c99ee8e442ac276cf2093db8 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 9 Jun 2026 00:19:07 +0000 Subject: [PATCH 1/2] [tsl] Add support for spatial partitioning of BFC-managed address range --- xla/debug_options_flags.cc | 13 +- xla/pjrt/gpu/gpu_helpers.cc | 13 +- xla/pjrt/gpu/gpu_helpers.h | 9 +- xla/pjrt/gpu/se_gpu_pjrt_client.cc | 68 +- .../integrations/tf_allocator_adapter.cc | 19 +- .../integrations/tf_allocator_adapter.h | 16 +- xla/tsl/framework/BUILD | 2 +- xla/tsl/framework/allocator.h | 39 +- xla/tsl/framework/bfc_allocator.cc | 605 +++++++++++++----- xla/tsl/framework/bfc_allocator.h | 256 +++++++- xla/tsl/framework/bfc_allocator_test.cc | 539 ++++++++++++++-- xla/xla.proto | 5 + 12 files changed, 1315 insertions(+), 269 deletions(-) diff --git a/xla/debug_options_flags.cc b/xla/debug_options_flags.cc index 59429e76e636c..307e704dc8d71 100644 --- a/xla/debug_options_flags.cc +++ b/xla/debug_options_flags.cc @@ -105,7 +105,8 @@ absl::StatusOr> ParseRepeatedEnumModifiers( namespace { template -static auto FindRepeatedFieldValue(google::protobuf::RepeatedField* list, T value) { +static auto FindRepeatedFieldValue(google::protobuf::RepeatedField* list, + T value) { for (auto it = list->begin(); it != list->end(); ++it) { if (*it == value) { return it; @@ -300,6 +301,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_gpu_experimental_dynamic_slice_fusion_verify_offsets(false); opts.set_xla_gpu_nccl_termination_timeout_seconds(-1); opts.set_xla_gpu_enable_nccl_user_buffers(false); + opts.set_xla_gpu_enable_allocator_spatial_partitioning(true); opts.set_xla_gpu_experimental_enable_nccl_symmetric_buffers(false); opts.set_xla_gpu_experimental_enable_nvshmem(false); opts.set_xla_gpu_enable_nccl_comm_splitting(true); @@ -2025,6 +2027,14 @@ void MakeDebugOptionsFlags(std::vector* flag_list, "Enables NCCL User Buffer Registration. collective_memory_size in the " "allocator config must also be set to a non-zero value that is large " "enough to meet peak collective memory usage.")); + flag_list->push_back(tsl::Flag( + "xla_gpu_enable_allocator_spatial_partitioning", + bool_setter_for( + &DebugOptions::set_xla_gpu_enable_allocator_spatial_partitioning), + debug_options->xla_gpu_enable_allocator_spatial_partitioning(), + "Enables spatial partitioning of the GPU BFC allocator so default and " + "collective allocations share one fixed address range. Requires BFC " + "preallocation.")); flag_list->push_back(tsl::Flag( "xla_gpu_experimental_enable_nccl_symmetric_buffers", bool_setter_for( @@ -3250,7 +3260,6 @@ void MakeDebugOptionsFlags(std::vector* flag_list, bool_setter_for(&DebugOptions::set_xla_gpu_log_minmax), debug_options->xla_gpu_log_minmax(), "If true, log min/max values from kernel outputs.")); - flag_list->push_back(tsl::Flag( "xla_early_exit_with_layouts", bool_setter_for(&DebugOptions::set_xla_early_exit_with_layouts), diff --git a/xla/pjrt/gpu/gpu_helpers.cc b/xla/pjrt/gpu/gpu_helpers.cc index 50c5a21fe72d3..f2942acee38a4 100644 --- a/xla/pjrt/gpu/gpu_helpers.cc +++ b/xla/pjrt/gpu/gpu_helpers.cc @@ -103,8 +103,12 @@ absl::StatusOr> CreateBFCAllocator( se::StreamExecutor* executor, double memory_fraction, bool preallocate, std::optional gpu_system_memory_size, const std::vector& sub_allocator_alloc_visitors, - const std::vector& - sub_allocator_free_visitors) { + const std::vector& sub_allocator_free_visitors, + bool enable_spatial_partitioning) { + if (enable_spatial_partitioning && !preallocate) { + return InvalidArgument( + "Spatial partitioning of the BFC allocator requires preallocate=true."); + } bool enable_unified_memory; absl::Status status = tsl::ReadBoolFromEnvVar("TF_FORCE_UNIFIED_MEMORY", false, &enable_unified_memory); @@ -164,13 +168,14 @@ absl::StatusOr> CreateBFCAllocator( tsl::BFCAllocator::Options opts; opts.allow_growth = !preallocate; + opts.enable_spatial_partitioning = enable_spatial_partitioning; return std::make_shared( std::move(sub_allocator), allocator_memory, absl::StrCat("GPU_", device_ordinal, "_bfc"), opts); } // Builds a BFCAllocator for all local GPUs that uses collective memory. -absl::StatusOr> CreateCollectiveBFCAllocator( +absl::StatusOr> CreateCollectiveBFCAllocator( se::StreamExecutor* executor, double memory_fraction, size_t collective_memory_size) { int device_ordinal = executor->device_ordinal(); @@ -205,7 +210,7 @@ absl::StatusOr> CreateCollectiveBFCAllocator( tsl::BFCAllocator::Options opts; opts.allow_growth = !preallocate; - return std::make_shared( + return std::make_unique( std::move(sub_allocator), allocator_memory, absl::StrCat("GPU_collectivememory_", device_ordinal, "_bfc"), opts); } diff --git a/xla/pjrt/gpu/gpu_helpers.h b/xla/pjrt/gpu/gpu_helpers.h index f037bb5115329..3d3866873067e 100644 --- a/xla/pjrt/gpu/gpu_helpers.h +++ b/xla/pjrt/gpu/gpu_helpers.h @@ -47,15 +47,18 @@ void EnablePeerAccess(absl::Span executors); absl::StatusOr> GetGpuHostAllocator( se::StreamExecutor* executor); -// Builds a BFCAllocator for all local GPUs. +// Builds a BFCAllocator for all local GPUs. When enable_spatial_partitioning +// is set, the allocator serves collective (upper-end) and default (lower-end) +// requests from one shared address range; this requires preallocate=true. absl::StatusOr> CreateBFCAllocator( se::StreamExecutor* executor, double memory_fraction, bool preallocate, std::optional gpu_system_memory_size, const std::vector& sub_allocator_alloc_visitors, - const std::vector& sub_allocator_free_visitors); + const std::vector& sub_allocator_free_visitors, + bool enable_spatial_partitioning = false); // Builds a BFCAllocator for all local GPUs that uses collective memory. -absl::StatusOr> CreateCollectiveBFCAllocator( +absl::StatusOr> CreateCollectiveBFCAllocator( se::StreamExecutor* executor, double memory_fraction, size_t collective_memory_size); diff --git a/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/xla/pjrt/gpu/se_gpu_pjrt_client.cc index 6bd3f0fe27bf3..3f1ab375add4e 100644 --- a/xla/pjrt/gpu/se_gpu_pjrt_client.cc +++ b/xla/pjrt/gpu/se_gpu_pjrt_client.cc @@ -1329,14 +1329,20 @@ GetStreamExecutorGpuDeviceAllocator( const std::map>& addressable_devices) { std::vector allocators; + const DebugOptions& debug_options = xla::GetDebugOptionsFromFlags(); GpuAllocatorConfig::Kind effective_kind = allocator_config.kind; - if (GetDebugOptionsFromFlags().xla_gpu_command_buffer_update_mode() != + if (debug_options.xla_gpu_command_buffer_update_mode() != DebugOptions::ALWAYS_UPDATE && effective_kind != GpuAllocatorConfig::Kind::kVmm) { LOG(WARNING) << "xla_gpu_command_buffer_update_mode requires the " "VMM allocator. Overriding allocator kind to kVmm."; effective_kind = GpuAllocatorConfig::Kind::kVmm; } + + // Set when a single preallocated BFC allocator serves both default and + // collective memory via spatial partitioning; suppresses the separate + // collective allocator below. + bool shared_collective_pool = false; switch (effective_kind) { case GpuAllocatorConfig::Kind::kCudaAsync: { for (const auto& ordinal_and_device : addressable_devices) { @@ -1356,6 +1362,13 @@ GetStreamExecutorGpuDeviceAllocator( case GpuAllocatorConfig::Kind::kDefault: case GpuAllocatorConfig::Kind::kBFC: { LOG(INFO) << "Using BFC allocator."; + // With the spatial-partitioning flag enabled, preallocation lets one BFC + // allocator over a fixed address range serve both default (lower end) and + // collective (upper end) memory, so no separate collective allocator is + // created. Otherwise, use the separate collective allocator below. + shared_collective_pool = + allocator_config.preallocate && + debug_options.xla_gpu_enable_allocator_spatial_partitioning(); for (const auto& ordinal_and_device : addressable_devices) { ASSIGN_OR_RETURN( auto bfc_allocator, @@ -1364,11 +1377,29 @@ GetStreamExecutorGpuDeviceAllocator( allocator_config.preallocate, allocator_config.gpu_system_memory_size, allocator_config.sub_allocator_alloc_visitors, - allocator_config.sub_allocator_free_visitors)); + allocator_config.sub_allocator_free_visitors, + /*enable_spatial_partitioning=*/ + shared_collective_pool)); allocators.push_back( - {std::move(bfc_allocator), - ordinal_and_device.second->compute_stream(), + {bfc_allocator, ordinal_and_device.second->compute_stream(), /*memory_space=*/(int)xla::gpu::MemorySpaceColor::kDefault}); + if (shared_collective_pool) { + size_t collective_memory_alignment = + tsl::Allocator::kAllocatorAlignment; + if (auto* collectives = + gpu::GpuCollectives::Default(platform->Name())) { + collective_memory_alignment = + collectives->SymmetricMemoryAlignment(); + } + allocators.push_back( + {std::move(bfc_allocator), + ordinal_and_device.second->compute_stream(), + /*memory_space=*/(int)xla::gpu::MemorySpaceColor::kCollective, + /*device_ordinal=*/std::nullopt, + /*platform=*/nullptr, + /*min_alignment=*/collective_memory_alignment, + /*allocation_end=*/tsl::AllocationEnd::kUpper}); + } } break; } @@ -1402,18 +1433,22 @@ GetStreamExecutorGpuDeviceAllocator( } } - // Add any additional allocators for alternate memory spaces. - for (const auto& ordinal_and_device : addressable_devices) { - ASSIGN_OR_RETURN( - auto collective_bfc_allocator, - CreateCollectiveBFCAllocator( - ordinal_and_device.second->executor(), - /*memory_fraction=*/1.0 - allocator_config.memory_fraction, - allocator_config.collective_memory_size)); - allocators.push_back( - {std::move(collective_bfc_allocator), - ordinal_and_device.second->compute_stream(), - /*memory_space=*/(int)xla::gpu::MemorySpaceColor::kCollective}); + // Add a separate collective allocator unless the default BFC allocator + // already serves collective memory from its shared, spatially partitioned + // pool. + if (!shared_collective_pool) { + for (const auto& ordinal_and_device : addressable_devices) { + ASSIGN_OR_RETURN( + auto collective_bfc_allocator, + CreateCollectiveBFCAllocator( + ordinal_and_device.second->executor(), + /*memory_fraction=*/1.0 - allocator_config.memory_fraction, + allocator_config.collective_memory_size)); + allocators.push_back( + {std::move(collective_bfc_allocator), + ordinal_and_device.second->compute_stream(), + /*memory_space=*/(int)xla::gpu::MemorySpaceColor::kCollective}); + } } for (const auto& ordinal_and_device : addressable_devices) { @@ -1426,7 +1461,6 @@ GetStreamExecutorGpuDeviceAllocator( } #if defined(GOOGLE_CUDA) && CUDA_VERSION >= 11020 - const auto& debug_options = xla::GetDebugOptionsFromFlags(); if (debug_options.xla_gpu_temp_buffer_use_separate_color()) { // Add memory allocator to allocate memory buffers with persistent temp // memory space color. diff --git a/xla/stream_executor/integrations/tf_allocator_adapter.cc b/xla/stream_executor/integrations/tf_allocator_adapter.cc index 48099deec3bfb..93492bf988c45 100644 --- a/xla/stream_executor/integrations/tf_allocator_adapter.cc +++ b/xla/stream_executor/integrations/tf_allocator_adapter.cc @@ -44,19 +44,23 @@ limitations under the License. namespace stream_executor { TfAllocatorAdapter::TfAllocatorAdapter(tsl::Allocator* wrapped, Stream* stream, - size_t min_alignment) + size_t min_alignment, + tsl::AllocationEnd allocation_end) : DeviceAddressAllocator(CHECK_NOTNULL(stream)->parent()->GetPlatform()), wrapped_(wrapped), stream_(stream), - min_alignment_(min_alignment) {} + min_alignment_(min_alignment), + allocation_end_(allocation_end) {} TfAllocatorAdapter::TfAllocatorAdapter(tsl::Allocator* wrapped, const Platform* platform, - size_t min_alignment) + size_t min_alignment, + tsl::AllocationEnd allocation_end) : DeviceAddressAllocator(platform), wrapped_(wrapped), stream_(nullptr), - min_alignment_(min_alignment) {} + min_alignment_(min_alignment), + allocation_end_(allocation_end) {} TfAllocatorAdapter::~TfAllocatorAdapter() {} @@ -65,6 +69,7 @@ absl::StatusOr> TfAllocatorAdapter::Allocate( int64_t memory_space) { tsl::AllocationAttributes attrs; attrs.retry_on_failure = retry_on_failure; + attrs.allocation_end = allocation_end_; void* data = nullptr; if (size != 0) { data = wrapped_->AllocateRaw(min_alignment_, size, attrs); @@ -131,11 +136,13 @@ MultiDeviceAdapter::MultiDeviceAdapter(const Platform* platform, if (info.stream != nullptr) { per_device_allocators[device_ordinal] = std::make_shared(info.allocator.get(), - info.stream, info.min_alignment); + info.stream, info.min_alignment, + info.allocation_end); } else { per_device_allocators[device_ordinal] = std::make_shared( - info.allocator.get(), info.platform, info.min_alignment); + info.allocator.get(), info.platform, info.min_alignment, + info.allocation_end); } VLOG(3) << absl::StrFormat( "MultiDeviceAdapter: device_ordinal=%d memory_space=%d " diff --git a/xla/stream_executor/integrations/tf_allocator_adapter.h b/xla/stream_executor/integrations/tf_allocator_adapter.h index ae83c98963f1e..64717a17ee41e 100644 --- a/xla/stream_executor/integrations/tf_allocator_adapter.h +++ b/xla/stream_executor/integrations/tf_allocator_adapter.h @@ -54,14 +54,19 @@ class TfAllocatorAdapter : public DeviceAddressAllocator { // Different memory spaces may require different alignment // (e.g. symmetric memory requires higher alignment than // default memory used for on-device compute). + // + // allocation_end: which end of a spatially partitioned allocator to serve + // requests from. Ignored by allocators that do not partition. TfAllocatorAdapter( tsl::Allocator* wrapped, Stream* stream, - size_t min_alignment = tsl::Allocator::kAllocatorAlignment); + size_t min_alignment = tsl::Allocator::kAllocatorAlignment, + tsl::AllocationEnd allocation_end = tsl::AllocationEnd::kLower); // Constructor for cases where `stream` is not available. TfAllocatorAdapter( tsl::Allocator* wrapped, const Platform* platform, - size_t min_alignment = tsl::Allocator::kAllocatorAlignment); + size_t min_alignment = tsl::Allocator::kAllocatorAlignment, + tsl::AllocationEnd allocation_end = tsl::AllocationEnd::kLower); ~TfAllocatorAdapter() override; @@ -88,6 +93,7 @@ class TfAllocatorAdapter : public DeviceAddressAllocator { tsl::Allocator* wrapped_; Stream* stream_; size_t min_alignment_; + tsl::AllocationEnd allocation_end_; }; // Adapter class that wraps per-device TF allocators with corresponding streams @@ -117,6 +123,11 @@ class MultiDeviceAdapter : public DeviceAddressAllocator { // min_alignment: minimum alignment passed to tsl::Allocator::AllocateRaw. // Symmetric/collective memory typically needs higher // alignment than default compute buffers. + // + // allocation_end: which end of a spatially partitioned allocator to serve + // from. When one BFC allocator backs both kDefault and + // kCollective, the kCollective entry uses kUpper so its + // offsets stay independent of default-memory activity. struct AllocatorInfo { std::shared_ptr allocator; Stream* stream; @@ -124,6 +135,7 @@ class MultiDeviceAdapter : public DeviceAddressAllocator { std::optional device_ordinal = std::nullopt; const Platform* platform = nullptr; size_t min_alignment = tsl::Allocator::kAllocatorAlignment; + tsl::AllocationEnd allocation_end = tsl::AllocationEnd::kLower; }; MultiDeviceAdapter(const Platform* platform, diff --git a/xla/tsl/framework/BUILD b/xla/tsl/framework/BUILD index c1bcdc4531539..d923ccca00fbe 100644 --- a/xla/tsl/framework/BUILD +++ b/xla/tsl/framework/BUILD @@ -232,8 +232,8 @@ tsl_cc_test( "//xla/tsl/platform:test_benchmark", "//xla/tsl/platform:test_main", "@com_google_absl//absl/base", + "@com_google_absl//absl/base:no_destructor", "@com_google_absl//absl/synchronization", - "@tsl//tsl/platform:platform_port", ], ) diff --git a/xla/tsl/framework/allocator.h b/xla/tsl/framework/allocator.h index 2e7b60f7a5c44..3c5618d527f41 100644 --- a/xla/tsl/framework/allocator.h +++ b/xla/tsl/framework/allocator.h @@ -16,30 +16,52 @@ limitations under the License. #ifndef XLA_TSL_FRAMEWORK_ALLOCATOR_H_ #define XLA_TSL_FRAMEWORK_ALLOCATOR_H_ -#include - #include #include #include #include #include +#include #include "xla/tsl/platform/logging.h" #include "xla/tsl/platform/macros.h" #include "tsl/platform/numa.h" namespace tsl { +// Selects which end of an allocator's pre-allocated address range a request +// should be served from, for allocators that spatially partition their range. +// Allocators that do not spatially partition ignore this and always behave as +// kLower. +enum class AllocationEnd : uint8_t { + kLower, // Carve from the lower-address end of the range (grows up). + kUpper, // Carve from the upper-address end of the range (grows down). +}; + +template +void AbslStringify(Sink& sink, AllocationEnd end) { + switch (end) { + case AllocationEnd::kLower: + sink.Append("lower"); + return; + case AllocationEnd::kUpper: + sink.Append("upper"); + return; + } +} + // Attributes for a single allocation call. Different calls to the same // allocator could potentially have different allocation attributes. struct AllocationAttributes { AllocationAttributes() = default; AllocationAttributes(bool retry_on_failure, bool allocation_will_be_logged, - std::function* freed_by_func) + std::function* freed_by_func, + AllocationEnd allocation_end = AllocationEnd::kLower) : retry_on_failure(retry_on_failure), allocation_will_be_logged(allocation_will_be_logged), - freed_by_func(freed_by_func) {} + freed_by_func(freed_by_func), + allocation_end(allocation_end) {} // If the first attempt to allocate the memory fails, the allocation should // wait and retry (with a timeout). @@ -59,6 +81,11 @@ struct AllocationAttributes { // returned. std::function* freed_by_func = nullptr; // Not owned. + // Which end of the allocator's pre-allocated address range to serve this + // request from. Only honored by allocators configured for spatial + // partitioning. + AllocationEnd allocation_end = AllocationEnd::kLower; + AllocationAttributes(const AllocationAttributes&) = delete; void operator=(const AllocationAttributes&) = delete; }; @@ -251,7 +278,7 @@ class AllocatorWrapper : public Allocator { public: explicit AllocatorWrapper(Allocator* wrapped) : wrapped_(wrapped) {} - ~AllocatorWrapper() override {} + ~AllocatorWrapper() override = default; // Returns the wrapped allocator to which all calls are delegated. Allocator* wrapped() const { return wrapped_; } @@ -399,7 +426,7 @@ class SubAllocator { SubAllocator(const std::vector& alloc_visitors, const std::vector& free_visitors); - virtual ~SubAllocator() {} + virtual ~SubAllocator() = default; // Allocates at least num_bytes. Returns actual number of bytes allocated in // bytes_received. The caller can safely use the full bytes_received sized // buffer following the returned pointer. diff --git a/xla/tsl/framework/bfc_allocator.cc b/xla/tsl/framework/bfc_allocator.cc index 519014b972172..d14c96c8b4688 100644 --- a/xla/tsl/framework/bfc_allocator.cc +++ b/xla/tsl/framework/bfc_allocator.cc @@ -60,11 +60,16 @@ BFCAllocator::BFCAllocator(std::unique_ptr sub_allocator, size_t total_memory, const std::string& name, const Options& opts) : opts_(opts), + free_chunk_tag_(opts.enable_spatial_partitioning ? ChunkTag::kCentralGap + : ChunkTag::kLower), coalesce_regions_(sub_allocator->SupportsCoalescing()), sub_allocator_(std::move(sub_allocator)), name_(name), - free_chunks_list_(kInvalidChunkHandle), + unused_chunk_handle_head_(kInvalidChunkHandle), next_allocation_id_(1) { + CHECK(!opts.enable_spatial_partitioning || !opts.allow_growth) // Crash OK + << "Spatial partitioning requires a single fixed address range " + "(allow_growth=false)."; if (opts.allow_growth) { // 2MiB smallest initial allocation, unless total memory available // is less. @@ -83,6 +88,13 @@ BFCAllocator::BFCAllocator(std::unique_ptr sub_allocator, memory_limit_ = total_memory; stats_.bytes_limit = static_cast(total_memory); + // Cap on how much a chunk may exceed the requested size before we split it. + // If the user did not set a fraction, default to 128MB. + max_internal_fragmentation_bytes_ = + (opts.fragmentation_fraction > 0.0) + ? opts.fragmentation_fraction * memory_limit_ + : 128 << 20; + // Create a bunch of bins of various good sizes. // We create bins to fit all possible ranges that cover the @@ -210,6 +222,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) { c->prev = kInvalidChunkHandle; c->next = kInvalidChunkHandle; c->freed_at_count = 0; + c->tag = free_chunk_tag_; region_manager_.set_handle(c->ptr, h); @@ -228,17 +241,18 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) { prev_chunk->next = h; } - // Maybe merge adjacent chunks and insert the chunk into the right bin. - InsertFreeChunkIntoBin(TryToCoalesce(h, /*ignore_freed_at=*/false)); + // Maybe merge adjacent chunks and insert the chunk into the right free + // structure. In spatial mode, a fresh region becomes the central gap. + InsertFreeChunk(TryToCoalesce(h, /*ignore_freed_at=*/false)); return true; } BFCAllocator::ChunkHandle BFCAllocator::AllocateChunk() { - if (free_chunks_list_ != kInvalidChunkHandle) { - ChunkHandle h = free_chunks_list_; + if (unused_chunk_handle_head_ != kInvalidChunkHandle) { + ChunkHandle h = unused_chunk_handle_head_; Chunk* c = ChunkFromHandle(h); - free_chunks_list_ = c->next; + unused_chunk_handle_head_ = c->next; return h; } ChunkHandle h = chunks_.size(); @@ -250,8 +264,8 @@ void BFCAllocator::DeallocateChunk(ChunkHandle h) { Chunk* c = ChunkFromHandle(h); c->allocation_id = -1; c->bin_num = kInvalidBinNum; - c->next = free_chunks_list_; - free_chunks_list_ = h; + c->next = unused_chunk_handle_head_; + unused_chunk_handle_head_ = h; } void* BFCAllocator::AllocateRawInternalWithRetry( @@ -263,7 +277,8 @@ void* BFCAllocator::AllocateRawInternalWithRetry( freed_by_count = (*allocation_attr.freed_by_func)(); } - void* r = AllocateRawInternal(alignment, num_bytes, false, freed_by_count); + void* r = AllocateRawInternal(alignment, num_bytes, false, freed_by_count, + allocation_attr.allocation_end); if (ABSL_PREDICT_TRUE(r != nullptr)) { return r; } @@ -275,7 +290,8 @@ void* BFCAllocator::AllocateRawInternalWithRetry( if (allocation_attr.freed_by_func != nullptr) { freed_by_count = (*allocation_attr.freed_by_func)(); } - return AllocateRawInternal(a, nb, v, freed_by_count); + return AllocateRawInternal(a, nb, v, freed_by_count, + allocation_attr.allocation_end); }, kMaxMillisToWait, alignment, num_bytes); return r; @@ -285,6 +301,12 @@ void* BFCAllocator::AllocateRaw(size_t alignment, size_t num_bytes, const AllocationAttributes& allocation_attr) { VLOG(3) << "AllocateRaw " << Name() << " " << num_bytes << " alignment=" << alignment; + // Only spatially partitioned allocators serve upper-end requests; everything + // else must leave allocation_end at its kLower default. This lets the + // allocation path skip partitioning branches: with allocation_end always + // kLower, requests only ever land in AllocateChunkFromLowEnd. + DCHECK(opts_.enable_spatial_partitioning || + allocation_attr.allocation_end == AllocationEnd::kLower); void* result = [&] { if (!opts_.allow_retry_on_failure || !allocation_attr.retry_on_failure) { // If we have globally disabled retry-on-failure and fail to allocate an @@ -308,8 +330,9 @@ void* BFCAllocator::AllocateRaw(size_t alignment, size_t num_bytes, if (allocation_attr.freed_by_func != nullptr) { freed_by_count = (*allocation_attr.freed_by_func)(); } - void* res = AllocateRawInternal(alignment, num_bytes, dump_log_on_failure, - freed_by_count); + void* res = + AllocateRawInternal(alignment, num_bytes, dump_log_on_failure, + freed_by_count, allocation_attr.allocation_end); if (res == nullptr) { int32_t counter_value = log_counter.load(std::memory_order_relaxed); if (counter_value < kMaxFailureLogs) { @@ -317,6 +340,7 @@ void* BFCAllocator::AllocateRaw(size_t alignment, size_t num_bytes, LOG(WARNING) << "Allocator (" << Name() << ") ran out of memory trying " << "to allocate " << strings::HumanReadableNumBytes(num_bytes) + << " from the " << allocation_attr.allocation_end << " end" << " with freed_by_count=" << freed_by_count << "." << (!allocation_attr.retry_on_failure ? " The caller indicates that this is not a failure, but" @@ -335,7 +359,6 @@ void* BFCAllocator::AllocateRaw(size_t alignment, size_t num_bytes, return result; } -// static size_t BFCAllocator::RoundedBytes(size_t bytes) { size_t rounded_bytes = (kMinAllocationSize * @@ -344,6 +367,35 @@ size_t BFCAllocator::RoundedBytes(size_t bytes) { return rounded_bytes; } +uintptr_t BFCAllocator::AlignUp(uintptr_t ptr, size_t alignment) { + CHECK(absl::has_single_bit(alignment)) + << "alignment must be a power of 2, got " << alignment; + const uintptr_t mask = static_cast(alignment) - 1; + return (ptr + mask) & ~mask; +} + +uintptr_t BFCAllocator::AlignDown(uintptr_t ptr, size_t alignment) { + CHECK(absl::has_single_bit(alignment)) + << "alignment must be a power of 2, got " << alignment; + const uintptr_t mask = static_cast(alignment) - 1; + return ptr & ~mask; +} + +size_t BFCAllocator::LowEndAlignmentPadding(uintptr_t chunk_start, + size_t alignment) { + return RoundedBytes(AlignUp(chunk_start, alignment) - chunk_start); +} + +uintptr_t BFCAllocator::HighEndAlignedStart(uintptr_t chunk_start, + size_t chunk_size, + size_t rounded_bytes, + size_t alignment) { + if (ABSL_PREDICT_FALSE(chunk_size < rounded_bytes)) { + return 0; + } + return AlignDown(chunk_start + chunk_size - rounded_bytes, alignment); +} + bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_) { // Do nothing if garbage collection is off. @@ -416,12 +468,12 @@ void BFCAllocator::DeallocateRegions( } VLOG(2) << "Deallocate region with ptr = " << it->ptr(); - // Remove all chunk registrations from Bins. + // Remove all chunk registrations from free structures. ChunkHandle h = region_manager_.get_handle(it->ptr()); while (h != kInvalidChunkHandle) { const Chunk* c = ChunkFromHandle(h); - if (c->bin_num != kInvalidBinNum) { - RemoveFreeChunkFromBin(h); + if (!c->in_use()) { + RemoveFreeChunk(h); } auto h_to_delete = h; h = c->next; @@ -437,7 +489,8 @@ void BFCAllocator::DeallocateRegions( void* BFCAllocator::AllocateRawInternal(size_t alignment, size_t num_bytes, bool dump_log_on_failure, - uint64_t freed_before) { + uint64_t freed_before, + AllocationEnd allocation_end) { if (ABSL_PREDICT_FALSE(num_bytes == 0)) { VLOG(2) << "tried to allocate 0 bytes"; return nullptr; @@ -460,8 +513,8 @@ void* BFCAllocator::AllocateRawInternal(size_t alignment, size_t num_bytes, // Merge timestamped chunks whose counts have become safe for general use. MergeTimestampedChunks(0); } - void* ptr = - FindChunkPtr(bin_num, rounded_bytes, num_bytes, alignment, freed_before); + void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, alignment, + freed_before, allocation_end); if (ABSL_PREDICT_TRUE(ptr != nullptr)) { AddTraceMe("MemoryAllocation", ptr); return ptr; @@ -470,7 +523,7 @@ void* BFCAllocator::AllocateRawInternal(size_t alignment, size_t num_bytes, // Try to extend if (Extend(alignment, rounded_bytes)) { ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, alignment, - freed_before); + freed_before, allocation_end); if (ptr != nullptr) { AddTraceMe("MemoryAllocation", ptr); return ptr; @@ -484,7 +537,7 @@ void* BFCAllocator::AllocateRawInternal(size_t alignment, size_t num_bytes, // size is formed. if (MergeTimestampedChunks(rounded_bytes)) { ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, alignment, - freed_before); + freed_before, allocation_end); if (ptr != nullptr) { AddTraceMe("MemoryAllocation", ptr); return ptr; @@ -499,7 +552,7 @@ void* BFCAllocator::AllocateRawInternal(size_t alignment, size_t num_bytes, if (DeallocateFreeRegions(rounded_bytes) && Extend(alignment, rounded_bytes)) { ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, alignment, - freed_before); + freed_before, allocation_end); if (ptr != nullptr) { AddTraceMe("MemoryAllocation", ptr); return ptr; @@ -514,20 +567,20 @@ void* BFCAllocator::AllocateRawInternal(size_t alignment, size_t num_bytes, LOG(WARNING) << "Allocator (" << Name() << ") ran out of memory trying " << "to allocate " << strings::HumanReadableNumBytes(num_bytes) - << " (rounded to " << rounded_bytes << ")" << "requested by op " + << " (rounded to " << rounded_bytes << ")" + << " from the " << allocation_end << " end requested by op " << tsl::profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation() .pending_op_name - << "\nIf the cause is memory fragmentation maybe the environment " - << "variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will " - << "improve the situation. \nCurrent allocation summary follows." - << "\nCurrent allocation summary follows."; - DumpMemoryLog(rounded_bytes); + << "\nIf this is caused by memory fragmentation, the environment " + << "variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' may improve " + << "the situation.\nCurrent allocation summary follows."; + DumpMemoryLog(rounded_bytes, allocation_end); LOG(WARNING) << RenderOccupancy(); } return nullptr; } -int64_t BFCAllocator::LargestFreeChunk() { +size_t BFCAllocator::LargestBinnedFreeChunk() { for (int i = kNumBins - 1; i >= 0; i--) { if (!BinFromIndex(i)->free_chunks.empty()) { return ChunkFromHandle(*BinFromIndex(i)->free_chunks.rbegin())->size; @@ -536,10 +589,34 @@ int64_t BFCAllocator::LargestFreeChunk() { return 0; } +size_t BFCAllocator::LargestBinnedFreeChunk(AllocationEnd allocation_end) { + ChunkTag tag = ChunkTagOf(allocation_end); + for (int i = kNumBins - 1; i >= 0; i--) { + Bin* b = BinFromIndex(i); + for (auto it = b->free_chunks.rbegin(); it != b->free_chunks.rend(); ++it) { + const Chunk* chunk = ChunkFromHandle(*it); + if (chunk->tag == tag) { + return chunk->size; + } + } + } + return 0; +} + +size_t BFCAllocator::LargestFreeChunk() { + size_t largest = LargestBinnedFreeChunk(); + if (central_gap_ != kInvalidChunkHandle) { + largest = std::max(largest, ChunkFromHandle(central_gap_)->size); + } + return largest; +} + double BFCAllocator::GetFragmentation() { int64_t bytes_available = *stats_.pool_bytes - stats_.bytes_in_use; DCHECK_GE(bytes_available, 0); - return static_cast(bytes_available - LargestFreeChunk()) / + size_t largest_free_chunk = LargestFreeChunk(); + return static_cast(bytes_available - + static_cast(largest_free_chunk)) / bytes_available; } @@ -580,115 +657,223 @@ void BFCAllocator::AddTraceMe(absl::string_view traceme_name, void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes, size_t alignment, - uint64_t freed_before) { - // First identify the first bin that could satisfy rounded_bytes. - for (; bin_num < kNumBins; bin_num++) { - // Start searching from the first bin for the smallest chunk that fits - // rounded_bytes. - Bin* b = BinFromIndex(bin_num); - for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end(); - ++citer) { - BFCAllocator::ChunkHandle h = (*citer); - BFCAllocator::Chunk* chunk = ChunkFromHandle(h); + uint64_t freed_before, + AllocationEnd allocation_end) { + // Spatial partitioning keeps three contiguous spans by address: + // + // [ kLower (grows up) ][ kCentralGap ][ kUpper (grows down) ] + // + // A request first reuses a free hole with its own tag from the size bins. + // Only if no same-tag hole fits does it carve from the one central gap. + // Because neither end can create or consume the other end's interior holes, + // lower placements are independent of upper activity and upper placements are + // independent of lower activity, except when lower and upper allocations + // exhaust the central gap. + if (void* ptr = FindTaggedChunkPtr(bin_num, rounded_bytes, num_bytes, + alignment, freed_before, allocation_end)) { + return ptr; + } + return FindChunkPtrInCentralGap(rounded_bytes, num_bytes, alignment, + freed_before, allocation_end); +} + +void* BFCAllocator::FindTaggedChunkPtr(BinNum bin_num, size_t rounded_bytes, + size_t num_bytes, size_t alignment, + uint64_t freed_before, + AllocationEnd allocation_end) { + const ChunkTag requested_tag = ChunkTagOf(allocation_end); + for (BinNum bn = bin_num; bn < kNumBins; bn++) { + Bin* b = BinFromIndex(bn); + for (ChunkHandle h : b->free_chunks) { + Chunk* chunk = ChunkFromHandle(h); DCHECK(!chunk->in_use()); + if (ABSL_PREDICT_FALSE(chunk->tag != requested_tag)) { + continue; + } if (ABSL_PREDICT_FALSE(freed_before > 0) && freed_before < chunk->freed_at_count) { continue; } - - // Compute how many bytes we need to skip at the front of this chunk - // to reach the requested alignment boundary. - uintptr_t ptr_int = absl::bit_cast(chunk->ptr); - size_t align_padding = - (alignment - (ptr_int & (alignment - 1))) % alignment; - // Round padding up to kMinAllocationSize so the prefix chunk is valid. - align_padding = RoundedBytes(align_padding); - - if (chunk->size >= rounded_bytes + align_padding) { - // We found an existing chunk that fits us that wasn't in use, so remove - // it from the free bin structure prior to using. - RemoveFreeChunkIterFromBin(&b->free_chunks, citer); - - // If alignment requires it, split off the unaligned prefix as a - // separate free chunk. - if (align_padding > 0) { - SplitChunk(h, align_padding); - // After splitting, h still points to the prefix chunk (size = - // align_padding). The new aligned chunk is h's next and was - // inserted into a free bin by SplitChunk. - chunk = ChunkFromHandle(h); - // Put the prefix back into the free bin. - InsertFreeChunkIntoBin(h); - // Advance to the aligned chunk and remove it from its free bin - // so we can use it (and potentially split it again below). - h = chunk->next; - chunk = ChunkFromHandle(h); - RemoveFreeChunkFromBin(h); + const uintptr_t chunk_start = absl::bit_cast(chunk->ptr); + if (ABSL_PREDICT_FALSE(allocation_end == AllocationEnd::kUpper)) { + const uintptr_t aligned_start = HighEndAlignedStart( + chunk_start, chunk->size, rounded_bytes, alignment); + if (ABSL_PREDICT_FALSE(aligned_start < chunk_start)) { + continue; } - - // If we can break the size of the chunk into two reasonably large - // pieces, do don't waste more than max_internal_fragmentation_bytes on - // padding. If this threshold is not set by the user, then use 128MB as - // the default. - const int64_t max_internal_fragmentation_bytes = - (opts_.fragmentation_fraction > 0.0) - ? opts_.fragmentation_fraction * memory_limit_ - : 128 << 20; - - if (chunk->size >= rounded_bytes * 2 || - static_cast(chunk->size) - rounded_bytes >= - max_internal_fragmentation_bytes) { - SplitChunk(h, rounded_bytes); - chunk = ChunkFromHandle(h); // Update chunk pointer in case it moved + } else { + const size_t align_padding = + LowEndAlignmentPadding(chunk_start, alignment); + if (ABSL_PREDICT_FALSE(chunk->size < rounded_bytes + align_padding)) { + continue; } + } + RemoveFreeChunkFromBin(h); + return allocation_end == AllocationEnd::kUpper + ? AllocateChunkFromHighEnd(h, rounded_bytes, num_bytes, + alignment) + : AllocateChunkFromLowEnd(h, rounded_bytes, num_bytes, + alignment); + } + } - // The requested size of the returned chunk is what the user - // has allocated. - chunk->requested_size = num_bytes; - // Assign a unique id and increment the id counter, marking the - // chunk as being in use. - chunk->allocation_id = next_allocation_id_++; - - // Update stats. - ++stats_.num_allocs; - stats_.bytes_in_use += chunk->size; - if (stats_.bytes_in_use > stats_.peak_bytes_in_use) { - VLOG(2) << "New Peak memory usage of " << stats_.bytes_in_use - << " bytes for " << Name(); - } - stats_.peak_bytes_in_use = - std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use); - stats_.largest_alloc_size = - std::max(stats_.largest_alloc_size, chunk->size); + return nullptr; +} -#ifdef TENSORFLOW_MEM_DEBUG - if (ShouldRecordOpName()) { - const auto& annotation = - profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation(); - if (!annotation.pending_op_name.empty()) { - chunk->op_name = annotation.pending_op_name; - } else { - LOG(INFO) << "missing pending_op_name for " << Name() << "\n" - << CurrentStackTrace(); - chunk->op_name = nullptr; - } - chunk->action_count = ++action_counter_; - chunk->step_id = annotation.pending_step_id; - int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE; - size_history_[slot] = stats_.bytes_in_use; - } -#endif +void* BFCAllocator::FindChunkPtrInCentralGap(size_t rounded_bytes, + size_t num_bytes, size_t alignment, + uint64_t freed_before, + AllocationEnd allocation_end) { + if (ABSL_PREDICT_FALSE(central_gap_ == kInvalidChunkHandle)) { + return nullptr; + } - VLOG(4) << "Returning: " << chunk->ptr; - if (VLOG_IS_ON(4)) { - LOG(INFO) << "A: " << RenderOccupancy(); - } - return chunk->ptr; - } + ChunkHandle h = central_gap_; + Chunk* chunk = ChunkFromHandle(h); + DCHECK(!chunk->in_use()); + DCHECK_EQ(chunk->tag, ChunkTag::kCentralGap); + if (ABSL_PREDICT_FALSE(freed_before > 0) && + freed_before < chunk->freed_at_count) { + return nullptr; + } + + const uintptr_t chunk_start = absl::bit_cast(chunk->ptr); + if (ABSL_PREDICT_FALSE(allocation_end == AllocationEnd::kUpper)) { + const uintptr_t aligned_start = + HighEndAlignedStart(chunk_start, chunk->size, rounded_bytes, alignment); + if (ABSL_PREDICT_FALSE(aligned_start < chunk_start)) { + return nullptr; + } + } else { + const size_t align_padding = LowEndAlignmentPadding(chunk_start, alignment); + if (ABSL_PREDICT_FALSE(chunk->size < rounded_bytes + align_padding)) { + return nullptr; } } - return nullptr; + RemoveFreeChunk(h); + return allocation_end == AllocationEnd::kUpper + ? AllocateChunkFromHighEnd(h, rounded_bytes, num_bytes, alignment) + : AllocateChunkFromLowEnd(h, rounded_bytes, num_bytes, alignment); +} + +void* BFCAllocator::AllocateChunkFromLowEnd(ChunkHandle h, size_t rounded_bytes, + size_t num_bytes, + size_t alignment) { + Chunk* chunk = ChunkFromHandle(h); + + // If alignment requires it, split off the unaligned prefix as a separate free + // chunk. For a central-gap carve, that prefix falls below the new lower + // allocation and therefore becomes a lower-owned interior hole. + uintptr_t ptr_int = absl::bit_cast(chunk->ptr); + const size_t align_padding = LowEndAlignmentPadding(ptr_int, alignment); + if (ABSL_PREDICT_FALSE(align_padding > 0)) { + SplitChunk(h, align_padding); + chunk = ChunkFromHandle(h); + ChunkHandle aligned_h = chunk->next; + RemoveFreeChunk(aligned_h); + chunk->tag = ChunkTag::kLower; + InsertFreeChunk(h); + h = aligned_h; + chunk = ChunkFromHandle(h); + } + + // If we can break the size of the chunk into two reasonably large pieces, + // don't waste more than max_internal_fragmentation_bytes_ on padding. The + // trailing remainder keeps the source chunk's tag (the central gap keeps its + // tag; a lower hole stays lower). + if (chunk->size >= rounded_bytes * 2 || + static_cast(chunk->size) - rounded_bytes >= + max_internal_fragmentation_bytes_) { + SplitChunk(h, rounded_bytes); + chunk = ChunkFromHandle(h); // Update chunk pointer in case it moved. + } + + // The in-use chunk gets the lower tag. + chunk->tag = ChunkTag::kLower; + FinishChunkAllocation(chunk, num_bytes); + return chunk->ptr; +} + +void* BFCAllocator::AllocateChunkFromHighEnd(ChunkHandle h, + size_t rounded_bytes, + size_t num_bytes, + size_t alignment) { + Chunk* chunk = ChunkFromHandle(h); + + uintptr_t chunk_start = absl::bit_cast(chunk->ptr); + const uintptr_t aligned_start = + HighEndAlignedStart(chunk_start, chunk->size, rounded_bytes, alignment); + CHECK_GE(aligned_start, chunk_start); // Crash OK + const size_t prefix_size = aligned_start - chunk_start; + + // Split off everything below the aligned start as a free prefix, so the + // allocation lands at the high end of the chunk and grows downward. For a + // central-gap carve, this prefix remains the central gap. + if (ABSL_PREDICT_TRUE(prefix_size > 0)) { + SplitChunk(h, prefix_size); + ChunkHandle aligned_h = ChunkFromHandle(h)->next; + RemoveFreeChunk(aligned_h); + InsertFreeChunk(h); + h = aligned_h; + chunk = ChunkFromHandle(h); + } + + // Split off any aligned-up suffix as a free remainder. Set the tag before + // splitting so the suffix inherits kUpper directly. + chunk->tag = ChunkTag::kUpper; + if (chunk->size > rounded_bytes) { + SplitChunk(h, rounded_bytes); + chunk = ChunkFromHandle(h); + } + + // The in-use chunk gets the upper tag. + chunk->tag = ChunkTag::kUpper; + FinishChunkAllocation(chunk, num_bytes); + return chunk->ptr; +} + +void BFCAllocator::FinishChunkAllocation(Chunk* chunk, size_t num_bytes) { + // The requested size of the returned chunk is what the user has allocated. + chunk->requested_size = num_bytes; + // Assign a unique id and increment the id counter, marking the chunk as being + // in use. + chunk->allocation_id = next_allocation_id_++; + + // Update stats. + ++stats_.num_allocs; + stats_.bytes_in_use += chunk->size; + if (stats_.bytes_in_use > stats_.peak_bytes_in_use) { + VLOG(2) << "New Peak memory usage of " << stats_.bytes_in_use + << " bytes for " << Name(); + } + stats_.peak_bytes_in_use = + std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use); + stats_.largest_alloc_size = + std::max(stats_.largest_alloc_size, chunk->size); + +#ifdef TENSORFLOW_MEM_DEBUG + if (ShouldRecordOpName()) { + const auto& annotation = + profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation(); + if (!annotation.pending_op_name.empty()) { + chunk->op_name = annotation.pending_op_name; + } else { + LOG(INFO) << "missing pending_op_name for " << Name() << "\n" + << CurrentStackTrace(); + chunk->op_name = nullptr; + } + chunk->action_count = ++action_counter_; + chunk->step_id = annotation.pending_step_id; + int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE; + size_history_[slot] = stats_.bytes_in_use; + } +#endif + + VLOG(4) << "Returning: " << chunk->ptr; + if (VLOG_IS_ON(4)) { + LOG(INFO) << "A: " << RenderOccupancy(); + } } void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) { @@ -713,6 +898,9 @@ void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) { // It inherits the freed time. new_chunk->freed_at_count = c->freed_at_count; + // It inherits the tag; callers update the in-use piece after splitting. + new_chunk->tag = c->tag; + // Maintain the pointers. // c <-> c_neighbor becomes // c <-> new_chunk <-> c_neighbor @@ -725,8 +913,8 @@ void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) { c_neighbor->prev = h_new_chunk; } - // Add the newly free chunk to the free bin. - InsertFreeChunkIntoBin(h_new_chunk); + // Add the newly free chunk to the appropriate free structure. + InsertFreeChunk(h_new_chunk); } void BFCAllocator::DeallocateRaw(void* ptr) { @@ -759,13 +947,13 @@ void BFCAllocator::DeallocateRawInternal(void* ptr) { // Consider coalescing it. if (ABSL_PREDICT_FALSE(timing_counter_ != nullptr)) { - InsertFreeChunkIntoBin(h); + InsertFreeChunk(h); timestamped_chunks_.push_back(h); } else { - InsertFreeChunkIntoBin(TryToCoalesce(h, false)); + InsertFreeChunk(TryToCoalesce(h, false)); } - // TraceMe needs to be added after MarkFree and InsertFreeChunkIntoBin for + // TraceMe needs to be added after MarkFree and InsertFreeChunk for // correct aggregation stats (bytes_in_use, fragmentation). AddTraceMe("MemoryDeallocation", chunk_ptr, req_bytes, alloc_bytes); @@ -774,10 +962,19 @@ void BFCAllocator::DeallocateRawInternal(void* ptr) { } } +BFCAllocator::ChunkTag BFCAllocator::MergedChunkTag(ChunkTag a, + ChunkTag b) const { + // Two free holes with the same tag keep that tag (an interior hole still + // belongs to its end). Any other combination -- a hole merging with the + // central gap, or lower-end and upper-end holes meeting after the gap is + // exhausted -- yields a kCentralGap span reusable by either end. + return a == b ? a : ChunkTag::kCentralGap; +} + // Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is c1. // We merge Chunk(h2) into Chunk(h1). -void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1, - BFCAllocator::ChunkHandle h2) { +void BFCAllocator::MergeChunks(BFCAllocator::ChunkHandle h1, + BFCAllocator::ChunkHandle h2) { Chunk* c1 = ChunkFromHandle(h1); Chunk* c2 = ChunkFromHandle(h2); // We can only merge chunks that are not in use. @@ -802,6 +999,9 @@ void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1, // Set the new size c1->size += c2->size; + // Combine tags: merging with the central gap grows the gap. + c1->tag = MergedChunkTag(c1->tag, c2->tag); + // Pick latest free time. c1->freed_at_count = std::max(c1->freed_at_count, c2->freed_at_count); @@ -816,15 +1016,54 @@ void BFCAllocator::DeleteChunk(ChunkHandle h) { DeallocateChunk(h); } -void BFCAllocator::InsertFreeChunkIntoBin(BFCAllocator::ChunkHandle h) { +void BFCAllocator::InsertFreeChunk(BFCAllocator::ChunkHandle h) { Chunk* c = ChunkFromHandle(h); - CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum)); + CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum)); // Crash OK + if (c->tag == ChunkTag::kCentralGap) { + CHECK_EQ(central_gap_, kInvalidChunkHandle) // Crash OK + << "spatial partitioning expects one central gap"; + central_gap_ = h; + return; + } + CHECK_NE(c->tag, ChunkTag::kCentralGap); // Crash OK BinNum bin_num = BinNumForSize(c->size); Bin* new_bin = BinFromIndex(bin_num); c->bin_num = bin_num; new_bin->free_chunks.insert(h); } +void BFCAllocator::RemoveFreeChunk(BFCAllocator::ChunkHandle h) { + Chunk* c = ChunkFromHandle(h); + CHECK(!c->in_use()); // Crash OK + if (c->tag == ChunkTag::kCentralGap && c->bin_num == kInvalidBinNum) { + CHECK_EQ(central_gap_, h); // Crash OK + central_gap_ = kInvalidChunkHandle; + return; + } + RemoveFreeChunkFromBin(h); +} + +void BFCAllocator::ReturnBoundaryChunkToGap(BFCAllocator::ChunkHandle h) { + if (ABSL_PREDICT_TRUE(free_chunk_tag_ != ChunkTag::kCentralGap)) { + return; + } + Chunk* c = ChunkFromHandle(h); + CHECK(!c->in_use()); // Crash OK + if (ABSL_PREDICT_TRUE(c->tag == ChunkTag::kLower)) { + ChunkHandle n = c->next; + if (n == kInvalidChunkHandle || + ChunkFromHandle(n)->tag != ChunkTag::kLower) { + c->tag = ChunkTag::kCentralGap; + } + } else if (ABSL_PREDICT_FALSE(c->tag == ChunkTag::kUpper)) { + ChunkHandle p = c->prev; + if (p == kInvalidChunkHandle || + ChunkFromHandle(p)->tag != ChunkTag::kUpper) { + c->tag = ChunkTag::kCentralGap; + } + } +} + void BFCAllocator::RemoveFreeChunkIterFromBin( BFCAllocator::Bin::FreeChunkSet* free_chunks, const BFCAllocator::Bin::FreeChunkSet::iterator& citer) { @@ -850,9 +1089,16 @@ void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) { // Mark the chunk as no longer in use. c->allocation_id = -1; - // Optionally record the free time. + // Optionally record the free time. Timestamped chunks are kept in their + // original lower/upper tag until they become safe to merge; otherwise a + // pending boundary free could create a second central gap. if (ABSL_PREDICT_FALSE(timing_counter_ != nullptr)) { c->freed_at_count = timing_counter_->next(); + } else { + c->freed_at_count = 0; + } + if (ABSL_PREDICT_TRUE(c->freed_at_count == 0)) { + ReturnBoundaryChunkToGap(h); } // Updates the stats. @@ -880,8 +1126,8 @@ BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h, Chunk* n = ChunkFromHandle(c->next); if ((n->freed_at_count == 0) || ignore_freed_at) { VLOG(4) << "Merging c->next " << n->ptr << " with c " << c->ptr; - RemoveFreeChunkFromBin(c->next); - Merge(h, c->next); + RemoveFreeChunk(c->next); + MergeChunks(h, c->next); } } @@ -891,8 +1137,8 @@ BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h, if ((n->freed_at_count == 0) || ignore_freed_at) { VLOG(4) << "Merging c " << c->ptr << " into c->prev " << n->ptr; coalesced_chunk = c->prev; - RemoveFreeChunkFromBin(c->prev); - Merge(c->prev, h); + RemoveFreeChunk(c->prev); + MergeChunks(c->prev, h); } } @@ -966,9 +1212,12 @@ bool BFCAllocator::MergeTimestampedChunks(size_t required_bytes) { Chunk* c = ChunkFromHandle(h); DCHECK_NE(c->bin_num, kInvalidBinNum); DCHECK(!c->in_use()); - RemoveFreeChunkFromBin(h); + RemoveFreeChunk(h); + if (c->freed_at_count == 0 || required_bytes > 0) { + ReturnBoundaryChunkToGap(h); + } ChunkHandle new_h = TryToCoalesce(h, (required_bytes > 0)); - InsertFreeChunkIntoBin(new_h); + InsertFreeChunk(new_h); if (required_bytes > 0) { c = ChunkFromHandle(new_h); if (new_h != h && c->freed_at_count > 0) { @@ -1087,7 +1336,8 @@ std::string BFCAllocator::RenderOccupancy() { return rendered; } -void BFCAllocator::DumpMemoryLog(size_t num_bytes) { +void BFCAllocator::DumpMemoryLog(size_t num_bytes, + AllocationEnd allocation_end) { const std::array bin_infos = get_bin_debug_info(); LOG(INFO) << "BFCAllocator dump for " << Name(); for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) { @@ -1135,6 +1385,7 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes) { std::string buf = absl::StrCat( (c->in_use() ? "InUse" : "Free "), " at ", absl::Hex(absl::bit_cast(c->ptr)), " of size ", c->size); + absl::StrAppend(&buf, " tag ", c->tag); #ifdef TENSORFLOW_MEM_DEBUG if (ShouldRecordOpName()) { absl::StrAppend(&buf, " by op ", c->op_name, " action_count ", @@ -1157,18 +1408,56 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes) { << strings::HumanReadableNumBytes(it.first * it.second); total_bytes += (it.first * it.second); } + size_t memory_limit = memory_limit_; + size_t pool_bytes = static_cast(*stats_.pool_bytes); + size_t unallocated_bytes = memory_limit - pool_bytes; + size_t free_pool_bytes = + pool_bytes - static_cast(stats_.bytes_in_use); + size_t largest_binned_free_chunk = LargestBinnedFreeChunk(); + size_t largest_compatible_binned_free_chunk = + LargestBinnedFreeChunk(allocation_end); + size_t central_gap_bytes = central_gap_ == kInvalidChunkHandle + ? 0 + : ChunkFromHandle(central_gap_)->size; + size_t largest_free_chunk = + std::max(largest_binned_free_chunk, central_gap_bytes); + size_t largest_compatible_free_chunk = + std::max(largest_compatible_binned_free_chunk, central_gap_bytes); + LOG(INFO) << "Sum Total of in-use chunks: " << strings::HumanReadableNumBytes(total_bytes); - LOG(INFO) << "Total size in pool: " - << strings::HumanReadableNumBytes(*stats_.pool_bytes) - << " memory_limit_: " - << strings::HumanReadableNumBytes(memory_limit_) - << " available size: " - << strings::HumanReadableNumBytes(memory_limit_ - - *stats_.pool_bytes) - << " curr_region_allocation_bytes_: " - << strings::HumanReadableNumBytes(curr_region_allocation_bytes_); - LOG(INFO) << "Stats: \n" << stats_.DebugString(); + LOG(INFO) << "Allocator memory summary: rounded request " + << strings::HumanReadableNumBytes(num_bytes) + << ", total free in pool " + << strings::HumanReadableNumBytes(free_pool_bytes) + << ", largest free chunk usable by " << allocation_end << " end " + << strings::HumanReadableNumBytes(largest_compatible_free_chunk) + << ", unallocated bytes " + << strings::HumanReadableNumBytes(unallocated_bytes) + << ", pool size " << strings::HumanReadableNumBytes(pool_bytes) + << ", pool limit " << strings::HumanReadableNumBytes(memory_limit); + if (central_gap_ != kInvalidChunkHandle) { + LOG(INFO) << "Spatial partitioning summary for " << allocation_end + << " end: central gap " + << strings::HumanReadableNumBytes(central_gap_bytes) + << ", largest same-end binned free chunk " + << strings::HumanReadableNumBytes( + largest_compatible_binned_free_chunk) + << ", largest binned free chunk from any end " + << strings::HumanReadableNumBytes(largest_binned_free_chunk) + << ". The central gap is shared by lower/upper allocations " + "and is not inserted into a bin."; + } + LOG(INFO) << "Allocator region growth hint: next region allocation target " + << strings::HumanReadableNumBytes(curr_region_allocation_bytes_) + << (opts_.allow_growth + ? " (growth enabled)." + : " (growth disabled; fixed-size pool; not an attempted " + "allocation)."); + + AllocatorStats stats = stats_; + stats.largest_free_block_bytes = static_cast(largest_free_chunk); + LOG(INFO) << "Stats: \n" << stats.DebugString(); } void BFCAllocator::MaybeWriteMemoryMap() { @@ -1265,7 +1554,9 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() { std::optional BFCAllocator::GetStats() { absl::MutexLock l(mutex_); - return stats_; + AllocatorStats stats = stats_; + stats.largest_free_block_bytes = static_cast(LargestFreeChunk()); + return stats; } bool BFCAllocator::ClearStats() { @@ -1283,6 +1574,12 @@ BFCAllocator::get_bin_debug_info() { ChunkHandle h = region_manager_.get_handle(region.ptr()); while (h != kInvalidChunkHandle) { const Chunk* c = ChunkFromHandle(h); + if (!c->in_use() && c->tag == ChunkTag::kCentralGap && + c->bin_num == kInvalidBinNum) { + CHECK_EQ(central_gap_, h); // Crash OK + h = c->next; + continue; + } BinNum bin_num = BinNumForSize(c->size); BinDebugInfo& bin_info = bin_infos[bin_num]; bin_info.total_bytes_in_bin += c->size; diff --git a/xla/tsl/framework/bfc_allocator.h b/xla/tsl/framework/bfc_allocator.h index 91cd993ed49a4..1c0ba329018bc 100644 --- a/xla/tsl/framework/bfc_allocator.h +++ b/xla/tsl/framework/bfc_allocator.h @@ -47,14 +47,50 @@ class MemoryDump; namespace tsl { using tensorflow::MemoryDump; -// A memory allocator that implements a 'best-fit with coalescing' -// algorithm. This is essentially a very simple version of Doug Lea's -// malloc (dlmalloc). +// A memory allocator that implements best-fit with coalescing (BFC), a +// simple dlmalloc-style allocator for arenas where most allocations go through +// this interface. +// +// See prior art: https://gee.cs.oswego.edu/dl/html/malloc.html +// +// High-level model: +// +// - Backing memory comes from the SubAllocator as AllocationRegions. With +// Options::allow_growth=true the allocator grows by adding regions up to +// total_memory; with Options::allow_growth=false it reserves one fixed region +// during construction. stats_.bytes_reserved tracks bytes held from the +// SubAllocator, while stats_.bytes_in_use tracks bytes currently live for +// clients. +// +// - Each AllocationRegion is represented as an ordered sequence of Chunks that +// cover the region without gaps. This is boundary-tag-style bookkeeping: the +// allocator can find physically adjacent chunks and coalesce neighboring free +// chunks, even though the metadata lives in Chunk objects instead of literal +// dlmalloc headers/trailers. A Chunk is either entirely in use or entirely +// free. Allocations split free chunks when needed, and frees coalesce +// adjacent free chunks to repair fragmentation. +// +// - Free chunks are indexed by size-class Bins. Each Bin stores ChunkHandles in +// a FreeChunkSet ordered by chunk size and then address. Allocation starts in +// the smallest viable bin, scans upward, and uses the smallest fitting chunk. +// Allocated chunks are never in a Bin. +// +// - AllocationAttributes::allocation_end controls placement. Without spatial +// partitioning all requests use AllocationEnd::kLower, and ordinary free +// chunks stay in ChunkTag::kLower, which is classic BFC behavior. +// +// - With Options::enable_spatial_partitioning=true, which requires +// Options::allow_growth=false, the fixed address range is split into +// lower-end ownership, one central gap, and upper-end ownership. +// AllocationEnd::kLower requests grow upward, and AllocationEnd::kUpper +// requests grow downward. ChunkTag records ownership: kLower and kUpper for +// allocated chunks and same-tag interior holes, and kCentralGap for the +// central gap. The central gap is tracked by central_gap_ instead of being +// inserted into a Bin. Each end first reuses binned holes with its own tag, +// then carves from the central gap. This keeps each end's placements +// independent of activity from the opposite end except when lower and upper +// allocations exhaust the central gap. // -// The goal of this allocator is to support defragmentation via -// coalescing. One assumption we make is that the process using this -// allocator owns pretty much all of the memory, and that nearly -// all requests to allocate memory go through this interface. class BFCAllocator : public Allocator { public: struct Options { @@ -75,7 +111,36 @@ class BFCAllocator : public Allocator { // Controls when a chunk should be split, if its size exceeds the requested // allocation size. double fragmentation_fraction = 0; + + // If true, the allocator spatially partitions a single pre-allocated + // address range by serving requests from either end. AllocationEnd::kLower + // requests grow up from the low address; AllocationEnd::kUpper requests + // grow down from the high address; a central gap sits in between: + // + // low address high address + // |------------------------------------------------------------| + // | lower-end owned ---> central gap <--- upper-end owned | + // |------------------------------------------------------------| + // + // The split is fully dynamic with no hard boundary: a request carves from + // the central gap or reuses a free hole of its OWN tag, but never the + // other end's tagged interior holes. When a buffer at either end of the + // central gap is freed it rejoins the gap, growing it, and adjacent holes + // with the same tag cascade back in turn -- so e.g. allocating 100% lower, + // freeing it, then allocating 100% upper is fully supported. The only + // failure is true exhaustion: lower and upper meeting with no gap left. + // + // Because neither end ever carves the other's interior holes, each end's + // placement is a pure function of that end's request sequence and is never + // perturbed by activity from the opposite end, except when lower and upper + // allocations exhaust the central gap. That makes offsets reproducible + // across processes that issue the same requests for that end in the same + // order, e.g. symmetric collective buffers across ranks. + // + // Requires allow_growth=false (a single fixed address range). + bool enable_spatial_partitioning = false; }; + BFCAllocator(std::unique_ptr sub_allocator, size_t total_memory, const std::string& name, const Options& opts); @@ -122,7 +187,8 @@ class BFCAllocator : public Allocator { void* AllocateRawInternal(size_t alignment, size_t num_bytes, bool dump_log_on_failure, - uint64_t freed_before_count); + uint64_t freed_before_count, + AllocationEnd allocation_end); void* AllocateRawInternalWithRetry( size_t alignment, size_t num_bytes, @@ -147,9 +213,15 @@ class BFCAllocator : public Allocator { bool MergeTimestampedChunks(size_t required_bytes) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - // Return the largest free chunk bytes from the largest bin in constant time. - // The free chunks are sorted by size (and then address) in a bin. - int64_t LargestFreeChunk() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Return the largest binned free chunk. Free chunks are sorted by size (and + // then address) in a bin. + size_t LargestBinnedFreeChunk() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + size_t LargestBinnedFreeChunk(AllocationEnd allocation_end) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Return the largest free chunk, including the central gap when spatial + // partitioning is enabled. + size_t LargestFreeChunk() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Add TraceMe (in memory allocation and deallocation) for memory stats // profiling. The chunk_ptr is passed to get information such as address, @@ -172,6 +244,38 @@ class BFCAllocator : public Allocator { // The following means that the largest bin'd chunk size is 256 << 21 = 512MB. static constexpr int kNumBins = 21; + // Tag describing a chunk's ownership state. Spatial partitioning keeps three + // contiguous spans by address: + // + // [ kLower (grows up) ][ kCentralGap ][ kUpper (grows down) ] + // + // A request may carve from the contiguous kCentralGap span or reuse a + // free hole with its OWN tag, but never the other end's tagged holes. This + // keeps each end's offsets a pure function of that end's request sequence. + // The split between lower-end, central-gap, and upper-end spans is fully + // dynamic with no hard boundary: when a boundary chunk is freed it rejoins + // the central gap, growing it, and adjacent same-tag holes cascade back in + // turn. So e.g. allocating 100% kLower, freeing it, then allocating 100% + // kUpper is supported -- the freed lower space cascades back into one + // central gap that the upper end can then consume. + enum class ChunkTag : uint8_t { + kCentralGap, // The single central gap between lower-end and upper-end + // ownership. Either end may carve from it. + kLower, // Lower-end-owned: in use, or a free hole reusable only by the + // lower end until it rejoins the gap. + kUpper, // Upper-end-owned: in use, or a free hole reusable only by the + // upper end until it rejoins the gap. + }; + + template + friend void AbslStringify(Sink& sink, ChunkTag tag); + + // The tag owned by an allocation from `allocation_end`. + static ChunkTag ChunkTagOf(AllocationEnd allocation_end) { + return allocation_end == AllocationEnd::kUpper ? ChunkTag::kUpper + : ChunkTag::kLower; + } + // A Chunk points to a piece of memory that's either entirely free or entirely // in use by one user memory allocation. // @@ -218,6 +322,11 @@ class BFCAllocator : public Allocator { // Optional count when this chunk was most recently made free. uint64_t freed_at_count = 0; + // Ownership state for this chunk (see ChunkTag). A chunk in the central + // gap is kCentralGap; interior free holes keep their tag until they + // rejoin the gap. + ChunkTag tag = ChunkTag::kCentralGap; + bool in_use() const { return allocation_id != -1; } #ifdef TENSORFLOW_MEM_DEBUG @@ -227,8 +336,8 @@ class BFCAllocator : public Allocator { int64 action_count = 0; #endif - std::string DebugString(BFCAllocator* a, - bool recurse) ABSL_NO_THREAD_SAFETY_ANALYSIS { + std::string DebugString(BFCAllocator* a, bool recurse) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(a->mutex_) { std::string dbg; absl::StrAppend( &dbg, " Size: ", strings::HumanReadableNumBytes(size), @@ -272,12 +381,11 @@ class BFCAllocator : public Allocator { } private: - BFCAllocator* allocator_; // The parent allocator + BFCAllocator* allocator_; // The parent allocator. }; using FreeChunkSet = absl::btree_set; // List of free chunks within the bin, sorted by chunk size. - // Chunk * not owned. FreeChunkSet free_chunks; Bin(BFCAllocator* allocator, size_t bs) : bin_size(bs), free_chunks(ChunkComparator(allocator)) {} @@ -454,6 +562,24 @@ class BFCAllocator : public Allocator { // Returns 'bytes' rounded up to the next highest kMinAllocationSize. static size_t RoundedBytes(size_t bytes); + // Returns the first aligned address at or above 'ptr'. Alignment must be a + // power of two. + static uintptr_t AlignUp(uintptr_t ptr, size_t alignment); + + // Returns the last aligned address at or below 'ptr'. Alignment must be a + // power of two. + static uintptr_t AlignDown(uintptr_t ptr, size_t alignment); + + // Bytes to skip at the low end of a free chunk so the allocation starts + // aligned. The padding is rounded so it can be represented as a Chunk when + // split from the allocation. + static size_t LowEndAlignmentPadding(uintptr_t chunk_start, size_t alignment); + + // Start address for an allocation carved from the high end of a free chunk. + // Returns an address below `chunk_start` if the allocation cannot fit. + static uintptr_t HighEndAlignedStart(uintptr_t chunk_start, size_t chunk_size, + size_t rounded_bytes, size_t alignment); + // Try to add a new memory region that can satisfy an allocation of // 'rounded_bytes' bytes. Returns true on success and false on // failure. @@ -473,9 +599,42 @@ class BFCAllocator : public Allocator { ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Returns a pointer to an underlying allocated chunk of size - // 'rounded_bytes' aligned to 'alignment'. + // 'rounded_bytes' aligned to 'alignment', served from 'allocation_end'. void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes, - size_t alignment, uint64_t freed_before) + size_t alignment, uint64_t freed_before, + AllocationEnd allocation_end) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Best-fit scan restricted to binned interior holes owned by + // 'allocation_end'. Returns the user pointer, or nullptr if no same-tag hole + // fits. + void* FindTaggedChunkPtr(BinNum bin_num, size_t rounded_bytes, + size_t num_bytes, size_t alignment, + uint64_t freed_before, AllocationEnd allocation_end) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Carves from the central gap. In spatial partitioning mode the gap is + // tracked directly by central_gap_ instead of being inserted into bins. + void* FindChunkPtrInCentralGap(size_t rounded_bytes, size_t num_bytes, + size_t alignment, uint64_t freed_before, + AllocationEnd allocation_end) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Carves an allocation of 'num_bytes' (rounded to 'rounded_bytes') out of the + // free chunk 'h', which must already have been removed from its free + // structure. The low variant grows up from the chunk's low address (the + // default); the high variant grows down from the chunk's high address. Both + // return the user pointer. + void* AllocateChunkFromLowEnd(ChunkHandle h, size_t rounded_bytes, + size_t num_bytes, size_t alignment) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + void* AllocateChunkFromHighEnd(ChunkHandle h, size_t rounded_bytes, + size_t num_bytes, size_t alignment) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Marks 'chunk' in use and updates allocation stats. Common tail of the two + // AllocateChunkFrom*End helpers. + void FinishChunkAllocation(Chunk* chunk, size_t num_bytes) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Splits the chunk specified by 'h' into two chunks, one at least @@ -483,13 +642,29 @@ class BFCAllocator : public Allocator { void SplitChunk(ChunkHandle h, size_t num_bytes) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Tag of the free chunk formed by merging two adjacent free neighbors: + // the common tag if both holes have the same tag (an interior hole keeps + // its end), otherwise kCentralGap -- so a hole merging with the central gap, + // or lower and upper holes becoming adjacent, yields space reusable by either + // end. + ChunkTag MergedChunkTag(ChunkTag a, ChunkTag b) const; + // Merges the two chunk handles. Requires that the chunks are // contiguous in their allocation. - void Merge(ChunkHandle h, ChunkHandle h2) + void MergeChunks(ChunkHandle h, ChunkHandle h2) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - // Adds the chunk 'h' to the proper free bin. - void InsertFreeChunkIntoBin(ChunkHandle h) + // Adds the chunk 'h' to the free data structure. Spatial partitioning + // keeps the single central gap out of the bins and bins only lower/upper + // interior holes; classic BFC inserts every free chunk into a size bin. + void InsertFreeChunk(ChunkHandle h) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Removes the chunk 'h' from the free data structure. + void RemoveFreeChunk(ChunkHandle h) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Reclassifies a just-freed lower/upper boundary chunk as kCentralGap when it + // is no longer interior to its tag. + void ReturnBoundaryChunkToGap(ChunkHandle h) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Removes the free chunk pointed to by 'c' from the set free_chunks. @@ -507,7 +682,8 @@ class BFCAllocator : public Allocator { void DeleteChunk(ChunkHandle h) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); std::string RenderOccupancy() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - void DumpMemoryLog(size_t num_bytes) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + void DumpMemoryLog(size_t num_bytes, AllocationEnd allocation_end) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); tensorflow::MemoryDump RecordMemoryMapInternal() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); void MaybeWriteMemoryMap() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); @@ -546,6 +722,11 @@ class BFCAllocator : public Allocator { // Structures immutable after construction size_t memory_limit_ = 0; + // Maximum bytes a chunk may exceed the requested size before it is split, to + // bound internal fragmentation. Derived from Options::fragmentation_fraction + // and memory_limit_ once at construction. + int64_t max_internal_fragmentation_bytes_ = 0; + // Map from bin size to Bin Bin* BinFromIndex(BinNum index) { return reinterpret_cast(&(bins_space_[index * sizeof(Bin)])); @@ -564,6 +745,11 @@ class BFCAllocator : public Allocator { const Options opts_; + // Tag assigned to newly-created free chunks. Classic BFC keeps ordinary + // free chunks in kLower; spatial partitioning starts each fixed region as + // the kCentralGap span. + const ChunkTag free_chunk_tag_; + // The size of the current region allocation. size_t curr_region_allocation_bytes_; @@ -587,8 +773,13 @@ class BFCAllocator : public Allocator { std::vector chunks_ ABSL_GUARDED_BY(mutex_); - // Pointer to head of linked list of free Chunks - ChunkHandle free_chunks_list_ ABSL_GUARDED_BY(mutex_); + // Head of a singly-linked list of unused Chunk metadata slots in chunks_. + // The list reuses Chunk::next while the slot is inactive. + ChunkHandle unused_chunk_handle_head_ ABSL_GUARDED_BY(mutex_); + + // The single central gap in spatial partitioning mode. It is not present in + // any Bin; lower/upper interior free holes remain binned. + ChunkHandle central_gap_ ABSL_GUARDED_BY(mutex_) = kInvalidChunkHandle; // Counter containing the next unique identifier to assign to a // newly-created chunk. @@ -609,6 +800,25 @@ class BFCAllocator : public Allocator { void operator=(const BFCAllocator&) = delete; }; +//===----------------------------------------------------------------------===// +// Stringification of enums. +//===----------------------------------------------------------------------===// + +template +void AbslStringify(Sink& sink, BFCAllocator::ChunkTag tag) { + switch (tag) { + case BFCAllocator::ChunkTag::kCentralGap: + sink.Append("central_gap"); + return; + case BFCAllocator::ChunkTag::kLower: + sink.Append("lower"); + return; + case BFCAllocator::ChunkTag::kUpper: + sink.Append("upper"); + return; + } +} + } // namespace tsl #endif // XLA_TSL_FRAMEWORK_BFC_ALLOCATOR_H_ diff --git a/xla/tsl/framework/bfc_allocator_test.cc b/xla/tsl/framework/bfc_allocator_test.cc index 8489a72a580c0..0d2b878939fe5 100644 --- a/xla/tsl/framework/bfc_allocator_test.cc +++ b/xla/tsl/framework/bfc_allocator_test.cc @@ -17,41 +17,71 @@ limitations under the License. #include #include +#include #include #include +#include #include #include +#include #include +#include #include #include "absl/base/casts.h" +#include "absl/base/no_destructor.h" #include "absl/synchronization/blocking_counter.h" #include "xla/tsl/framework/allocator.h" #include "xla/tsl/platform/env.h" #include "xla/tsl/platform/test.h" #include "xla/tsl/platform/test_benchmark.h" #include "xla/tsl/platform/threadpool.h" -#include "tsl/platform/mem.h" namespace tsl { namespace { -// Minimal SubAllocator backed by port::AlignedMalloc for host memory. -class MallocSubAllocator : public SubAllocator { +static constexpr size_t kAlignment = Allocator::kAllocatorAlignment; + +static const absl::NoDestructor kUpper( + /*retry_on_failure=*/false, /*allocation_will_be_logged=*/false, + /*freed_by_func=*/nullptr, AllocationEnd::kUpper); + +static const absl::NoDestructor kLower( + /*retry_on_failure=*/false, /*allocation_will_be_logged=*/false, + /*freed_by_func=*/nullptr, AllocationEnd::kLower); + +// SubAllocator that hands out fake (non-dereferenceable) addresses without +// allocating any real memory. It bump-allocates from a large, fixed virtual +// base so addresses are unique, well-aligned, and consistent. This lets tests +// exercise huge pools and verify the exact addresses BFC returns without +// touching device memory. +class FakeSubAllocator : public SubAllocator { public: - MallocSubAllocator() : SubAllocator({}, {}) {} + // kBase is a high, page-aligned constant so returned addresses look like + // plausible device pointers and never collide with real ones. + static constexpr uintptr_t kBase = uintptr_t{1} << 40; + + explicit FakeSubAllocator( + std::optional hardcoded_alignment = std::nullopt) + : SubAllocator({}, {}), hardcoded_alignment_(hardcoded_alignment) {} void* Alloc(size_t alignment, size_t num_bytes, size_t* bytes_received) override { - void* ptr = port::AlignedMalloc(num_bytes, - static_cast(alignment)); + const size_t effective_alignment = hardcoded_alignment_.value_or(alignment); + uintptr_t aligned = + (next_ + (effective_alignment - 1)) & ~(effective_alignment - 1); + next_ = aligned + num_bytes; *bytes_received = num_bytes; - return ptr; + return absl::bit_cast(aligned); } - void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); } + void Free(void* ptr, size_t num_bytes) override {} bool SupportsCoalescing() const override { return false; } + + private: + std::optional hardcoded_alignment_; + uintptr_t next_ = kBase; }; // Helper to check pointer alignment. @@ -60,7 +90,7 @@ bool IsAligned(const void* ptr, size_t alignment) { } TEST(BFCAllocatorTest, AllocateAndFree) { - BFCAllocator alloc(std::make_unique(), + BFCAllocator alloc(std::make_unique(), /*total_memory=*/1 << 20, /*name=*/"test", BFCAllocator::Options{}); @@ -70,14 +100,14 @@ TEST(BFCAllocatorTest, AllocateAndFree) { } TEST(BFCAllocatorTest, DefaultAlignment) { - BFCAllocator alloc(std::make_unique(), + BFCAllocator alloc(std::make_unique(), /*total_memory=*/1 << 20, /*name=*/"test", BFCAllocator::Options{}); // BFC always returns pointers aligned to at least kAllocatorAlignment (64). - void* ptr = alloc.AllocateRaw(Allocator::kAllocatorAlignment, 1); + void* ptr = alloc.AllocateRaw(kAlignment, 1); ASSERT_NE(ptr, nullptr); - EXPECT_TRUE(IsAligned(ptr, Allocator::kAllocatorAlignment)); + EXPECT_TRUE(IsAligned(ptr, kAlignment)); alloc.DeallocateRaw(ptr); } @@ -87,13 +117,13 @@ class BFCAllocatorAlignmentTest : public ::testing::TestWithParam {}; TEST_P(BFCAllocatorAlignmentTest, RespectsRequestedAlignment) { const size_t alignment = GetParam(); - BFCAllocator alloc(std::make_unique(), + BFCAllocator alloc(std::make_unique(), /*total_memory=*/1 << 20, /*name=*/"test", BFCAllocator::Options{}); // Allocate a small block first to push the arena cursor off any "lucky" // alignment, then allocate with the requested alignment. - void* filler = alloc.AllocateRaw(Allocator::kAllocatorAlignment, 256); + void* filler = alloc.AllocateRaw(kAlignment, 256); ASSERT_NE(filler, nullptr); constexpr int kTrials = 8; @@ -121,7 +151,7 @@ INSTANTIATE_TEST_SUITE_P(Alignments, BFCAllocatorAlignmentTest, // randomized order across multiple iterations. This exercises chunk splitting, // alignment padding, coalescing on free, and reuse of freed chunks. TEST(BFCAllocatorTest, StressAllocFree) { - BFCAllocator alloc(std::make_unique(), + BFCAllocator alloc(std::make_unique(), /*total_memory=*/16 << 20, /*name=*/"stress", BFCAllocator::Options{}); @@ -177,39 +207,20 @@ TEST(BFCAllocatorTest, StressAllocFree) { } } -// SubAllocator that always returns 256-byte (kMinAllocationSize) aligned -// memory but ignores higher alignment requests. This simulates GPU allocators -// like DeviceMemAllocator where cudaMalloc returns 256-byte aligned memory -// regardless of the requested alignment. -class GpuLikeSubAllocator : public SubAllocator { - public: - GpuLikeSubAllocator() : SubAllocator({}, {}) {} - - void* Alloc(size_t /*alignment*/, size_t num_bytes, - size_t* bytes_received) override { - // Always align to 256 bytes, ignoring the requested alignment. - void* ptr = port::AlignedMalloc(num_bytes, std::align_val_t{256}); - *bytes_received = num_bytes; - return ptr; - } - - void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); } - - bool SupportsCoalescing() const override { return false; } -}; - -// Verify that BFC still respects alignment even when the sub-allocator only -// provides 256-byte aligned regions (as GPU sub-allocators do). -TEST(BFCAllocatorTest, AlignmentWithGpuLikeSubAllocator) { - BFCAllocator alloc(std::make_unique(), - /*total_memory=*/1 << 20, /*name=*/"gpu_like", - BFCAllocator::Options{}); +// Verify that BFC respects requested alignment even when the sub-allocator +// ignores it and returns addresses aligned above the required minimum. +TEST(BFCAllocatorTest, AlignmentWithHardcodedSubAllocatorAlignment) { + constexpr size_t kHardcodedAlignment = 256; + BFCAllocator alloc(std::make_unique(kHardcodedAlignment), + /*total_memory=*/1 << 20, + /*name=*/"hardcoded_alignment", BFCAllocator::Options{}); // Push the cursor off any lucky alignment. - void* filler = alloc.AllocateRaw(Allocator::kAllocatorAlignment, 256); + void* filler = alloc.AllocateRaw(kAlignment, 256); ASSERT_NE(filler, nullptr); - constexpr std::array kAlignments = {256, 512, 1024, 4096}; + constexpr std::array kAlignments = {kHardcodedAlignment, 512, 1024, + 4096}; constexpr int kTrials = 8; for (size_t alignment : kAlignments) { @@ -225,20 +236,409 @@ TEST(BFCAllocatorTest, AlignmentWithGpuLikeSubAllocator) { alloc.DeallocateRaw(filler); } +//===----------------------------------------------------------------------===// +// Spatial partitioning tests. +//===----------------------------------------------------------------------===// + +TEST(BFCAllocatorTest, SpatialAllocatesFromEnds) { + BFCAllocator::Options opts; + opts.allow_growth = false; + opts.enable_spatial_partitioning = true; + BFCAllocator alloc(std::make_unique(), + /*total_memory=*/4096, /*name=*/"spatial", opts); + + void* lower = alloc.AllocateRaw(kAlignment, 256); + ASSERT_NE(lower, nullptr); + + void* upper = alloc.AllocateRaw(kAlignment, 256, *kUpper); + ASSERT_NE(upper, nullptr); + + EXPECT_EQ(absl::bit_cast(upper) - absl::bit_cast(lower), + 4096 - 256); + + alloc.DeallocateRaw(upper); + alloc.DeallocateRaw(lower); +} + +// Lower activity does not perturb upper offsets. +TEST(BFCAllocatorTest, SpatialKeepsUpperOffset) { + BFCAllocator::Options opts; + opts.allow_growth = false; + opts.enable_spatial_partitioning = true; + + BFCAllocator alloc_a(std::make_unique(), + /*total_memory=*/4096, /*name=*/"spatial_a", opts); + BFCAllocator alloc_b(std::make_unique(), + /*total_memory=*/4096, /*name=*/"spatial_b", opts); + + void* lower_a = alloc_a.AllocateRaw(kAlignment, 256); + void* upper_a = alloc_a.AllocateRaw(kAlignment, 512, *kUpper); + void* lower_b = alloc_b.AllocateRaw(kAlignment, 256); + void* extra_lower_b = alloc_b.AllocateRaw(kAlignment, 1024); + void* upper_b = alloc_b.AllocateRaw(kAlignment, 512, *kUpper); + + ASSERT_NE(lower_a, nullptr); + ASSERT_NE(upper_a, nullptr); + ASSERT_NE(lower_b, nullptr); + ASSERT_NE(extra_lower_b, nullptr); + ASSERT_NE(upper_b, nullptr); + + const uintptr_t upper_offset_a = + absl::bit_cast(upper_a) - absl::bit_cast(lower_a); + const uintptr_t upper_offset_b = + absl::bit_cast(upper_b) - absl::bit_cast(lower_b); + EXPECT_EQ(upper_offset_a, upper_offset_b); + + alloc_a.DeallocateRaw(upper_a); + alloc_a.DeallocateRaw(lower_a); + alloc_b.DeallocateRaw(upper_b); + alloc_b.DeallocateRaw(extra_lower_b); + alloc_b.DeallocateRaw(lower_b); +} + +// Upper must not reuse a non-boundary lower hole. +TEST(BFCAllocatorTest, SpatialSkipsLowerHole) { + BFCAllocator::Options opts; + opts.allow_growth = false; + opts.enable_spatial_partitioning = true; + BFCAllocator alloc(std::make_unique(), + /*total_memory=*/1024, /*name=*/"spatial", opts); + + // Fill the whole region: two lower chunks then one upper chunk, leaving no + // central gap. + void* lower_a = alloc.AllocateRaw(kAlignment, 256); + ASSERT_NE(lower_a, nullptr); + void* lower_b = alloc.AllocateRaw(kAlignment, 256); + ASSERT_NE(lower_b, nullptr); + void* upper = alloc.AllocateRaw(kAlignment, 512, *kUpper); + ASSERT_NE(upper, nullptr); + + // lower_a is trapped below live lower_b. + alloc.DeallocateRaw(lower_a); + + // Upper must not reuse the trapped lower hole. + void* trapped = alloc.AllocateRaw(kAlignment, 256, *kUpper); + EXPECT_EQ(trapped, nullptr); + + alloc.DeallocateRaw(upper); + alloc.DeallocateRaw(lower_b); +} + +// Boundary frees rejoin the central gap. +TEST(BFCAllocatorTest, SpatialLowerReclaimsGap) { + BFCAllocator::Options opts; + opts.allow_growth = false; + opts.enable_spatial_partitioning = true; + BFCAllocator alloc(std::make_unique(), + /*total_memory=*/2048, /*name=*/"spatial", opts); + + void* upper = alloc.AllocateRaw(kAlignment, 1024, *kUpper); + ASSERT_NE(upper, nullptr); + alloc.DeallocateRaw(upper); + + void* lower = alloc.AllocateRaw(kAlignment, 2048); + ASSERT_NE(lower, nullptr); + alloc.DeallocateRaw(lower); +} + +// The dynamic boundary moves with frees, but ownership is still enforced. +TEST(BFCAllocatorTest, SpatialReclaimsGap) { + BFCAllocator::Options opts; + opts.allow_growth = false; + opts.enable_spatial_partitioning = true; + BFCAllocator alloc(std::make_unique(), + /*total_memory=*/4096, /*name=*/"spatial", opts); + + void* lower0 = alloc.AllocateRaw(kAlignment, 1024); + ASSERT_NE(lower0, nullptr); + + void* upper0 = alloc.AllocateRaw(kAlignment, 512, *kUpper); + ASSERT_NE(upper0, nullptr); + alloc.DeallocateRaw(upper0); + + // Adjacent upper free space rejoins the central gap. + void* lower1 = alloc.AllocateRaw(kAlignment, 512); + ASSERT_NE(lower1, nullptr); + void* upper1 = alloc.AllocateRaw(kAlignment, 512, *kUpper); + ASSERT_NE(upper1, nullptr); + EXPECT_EQ(upper1, upper0); + alloc.DeallocateRaw(upper1); + + // Lower claims the remaining central gap. + void* lower2 = alloc.AllocateRaw(kAlignment, 2560); + ASSERT_NE(lower2, nullptr); + EXPECT_LE(absl::bit_cast(lower2), + absl::bit_cast(upper1)); + EXPECT_EQ(absl::bit_cast(lower2) + 2560, + absl::bit_cast(upper1) + 512); + + // Upper must not cross back into lower-owned space. + void* upper2 = alloc.AllocateRaw(kAlignment, 256, *kUpper); + EXPECT_EQ(upper2, nullptr); + + alloc.DeallocateRaw(lower2); + alloc.DeallocateRaw(lower1); + alloc.DeallocateRaw(lower0); +} + +TEST(BFCAllocatorTest, SpatialUpperAlignmentSuffix) { + BFCAllocator::Options opts; + opts.allow_growth = false; + opts.enable_spatial_partitioning = true; + BFCAllocator alloc(std::make_unique(), + /*total_memory=*/4096, /*name=*/"spatial", opts); + const uintptr_t base = FakeSubAllocator::kBase; + + void* upper = alloc.AllocateRaw(1024, 256, *kUpper); + ASSERT_NE(upper, nullptr); + EXPECT_EQ(absl::bit_cast(upper), base + 3072); + + // The alignment suffix above `upper` is upper-owned. + void* lower = alloc.AllocateRaw(kAlignment, 3072); + ASSERT_NE(lower, nullptr); + EXPECT_EQ(absl::bit_cast(lower), base); + + void* crossed = alloc.AllocateRaw(kAlignment, 768, *kLower); + EXPECT_EQ(crossed, nullptr); + + alloc.DeallocateRaw(lower); + alloc.DeallocateRaw(upper); +} + +TEST(BFCAllocatorTest, SpatialLowerAlignmentPrefix) { + BFCAllocator::Options opts; + opts.allow_growth = false; + opts.enable_spatial_partitioning = true; + BFCAllocator alloc(std::make_unique(), + /*total_memory=*/4096, /*name=*/"spatial", opts); + const uintptr_t base = FakeSubAllocator::kBase; + + void* lower0 = alloc.AllocateRaw(kAlignment, 256); + ASSERT_NE(lower0, nullptr); + EXPECT_EQ(absl::bit_cast(lower0), base); + + void* lower1 = alloc.AllocateRaw(1024, 256); + ASSERT_NE(lower1, nullptr); + EXPECT_EQ(absl::bit_cast(lower1), base + 1024); + + void* upper = alloc.AllocateRaw(kAlignment, 2816, *kUpper); + ASSERT_NE(upper, nullptr); + EXPECT_EQ(absl::bit_cast(upper), base + 1280); + + // The alignment prefix below lower1 is lower-owned. + void* crossed = alloc.AllocateRaw(kAlignment, 768, *kUpper); + EXPECT_EQ(crossed, nullptr); + + alloc.DeallocateRaw(upper); + alloc.DeallocateRaw(lower1); + alloc.DeallocateRaw(lower0); +} + +// A fully freed lower range reforms the central gap for upper allocations. +TEST(BFCAllocatorTest, SpatialUpperReclaimsAfterLowerFill) { + BFCAllocator::Options opts; + opts.allow_growth = false; + opts.enable_spatial_partitioning = true; + constexpr size_t kPool = size_t{1} << 30; // 1 GiB. + BFCAllocator alloc(std::make_unique(), + /*total_memory=*/kPool, /*name=*/"repro", opts); + const uintptr_t base = FakeSubAllocator::kBase; + + // Lower fills the entire pool, then frees it. + constexpr size_t kChunk = size_t{32} << 20; // 32 MiB. + constexpr int kNumChunks = kPool / kChunk; // 32 chunks exactly fill 1 GiB. + std::vector lower_ptrs; + lower_ptrs.reserve(kNumChunks); + for (int i = 0; i < kNumChunks; ++i) { + void* p = alloc.AllocateRaw(kAlignment, kChunk); + ASSERT_NE(p, nullptr) << "lower fill failed at chunk " << i; + lower_ptrs.push_back(p); + } + + // Boundary coalescing should reform one whole-pool gap. + for (void* p : lower_ptrs) { + alloc.DeallocateRaw(p); + } + + // Upper should now allocate from the top of the reformed gap. + constexpr size_t kUpperBytes = 18 << 20; + void* upper = alloc.AllocateRaw(kAlignment, kUpperBytes, *kUpper); + ASSERT_NE(upper, nullptr) << "upper should reclaim the freed pool"; + EXPECT_EQ(absl::bit_cast(upper) + kUpperBytes, base + kPool) + << "upper allocation should be anchored at the top of the pool"; + alloc.DeallocateRaw(upper); +} + +TEST(BFCAllocatorTest, SpatialReusesOwnHoles) { + BFCAllocator::Options opts; + opts.allow_growth = false; + opts.enable_spatial_partitioning = true; + BFCAllocator alloc(std::make_unique(), + /*total_memory=*/4096, /*name=*/"spatial", opts); + const uintptr_t base = FakeSubAllocator::kBase; + + void* lower0 = alloc.AllocateRaw(kAlignment, 256); + void* lower_hole = alloc.AllocateRaw(kAlignment, 256); + void* lower_guard = alloc.AllocateRaw(kAlignment, 256); + ASSERT_NE(lower0, nullptr); + ASSERT_NE(lower_hole, nullptr); + ASSERT_NE(lower_guard, nullptr); + EXPECT_EQ(absl::bit_cast(lower_hole), base + 256); + + void* upper0 = alloc.AllocateRaw(kAlignment, 256, *kUpper); + void* upper_hole = alloc.AllocateRaw(kAlignment, 256, *kUpper); + void* upper_guard = alloc.AllocateRaw(kAlignment, 256, *kUpper); + ASSERT_NE(upper0, nullptr); + ASSERT_NE(upper_hole, nullptr); + ASSERT_NE(upper_guard, nullptr); + EXPECT_EQ(absl::bit_cast(upper_hole), base + 3584); + + alloc.DeallocateRaw(lower_hole); + alloc.DeallocateRaw(upper_hole); + + // Own binned holes are reused before the central gap. + void* lower_reuse = alloc.AllocateRaw(kAlignment, 256); + ASSERT_NE(lower_reuse, nullptr); + EXPECT_EQ(lower_reuse, lower_hole); + + void* upper_reuse = alloc.AllocateRaw(kAlignment, 256, *kUpper); + ASSERT_NE(upper_reuse, nullptr); + EXPECT_EQ(upper_reuse, upper_hole); + + alloc.DeallocateRaw(upper_reuse); + alloc.DeallocateRaw(upper_guard); + alloc.DeallocateRaw(upper0); + alloc.DeallocateRaw(lower_reuse); + alloc.DeallocateRaw(lower_guard); + alloc.DeallocateRaw(lower0); +} + +// Identical upper allocation sequences should produce identical offsets. +TEST(BFCAllocatorTest, SpatialUpperOffsetsStable) { + constexpr size_t kPool = size_t{512} << 20; + constexpr size_t kUpperAlignment = 512; + // Fixed upper sizes, identical across simulated ranks. + const std::array kUpperSizes = {4 << 20, 16 << 20, 1 << 20, + 18 << 20, 2 << 20, 8 << 20, + 4 << 20, 32 << 20}; + + // Run the fixed upper sequence with randomized lower churn. + auto run = [&](uint32_t lower_seed) -> std::vector { + BFCAllocator::Options opts; + opts.allow_growth = false; + opts.enable_spatial_partitioning = true; + BFCAllocator alloc(std::make_unique(), kPool, "sym", + opts); + const uintptr_t base = FakeSubAllocator::kBase; + + std::mt19937 rng(lower_seed); + std::vector> live_lower; // (ptr, bytes) + size_t live_lower_bytes = 0; + // Keep utilization away from true exhaustion. + constexpr size_t kLowerCap = kPool / 2; + const std::array kLowerSizes = {256, 1 << 20, 8 << 20, 32 << 20, + 64 << 20}; + auto churn_lower = [&] { + // A random burst of lower allocations and frees, leaving some live. + const int ops = std::uniform_int_distribution(0, 6)(rng); + for (int i = 0; i < ops; ++i) { + if (!live_lower.empty() && + std::uniform_int_distribution(0, 2)(rng) == 0) { + size_t idx = std::uniform_int_distribution( + 0, live_lower.size() - 1)(rng); + alloc.DeallocateRaw(live_lower[idx].first); + live_lower_bytes -= live_lower[idx].second; + live_lower.erase(live_lower.begin() + idx); + } else { + size_t bytes = kLowerSizes[std::uniform_int_distribution( + 0, kLowerSizes.size() - 1)(rng)]; + if (live_lower_bytes + bytes > kLowerCap) { + continue; + } + void* p = alloc.AllocateRaw(kUpperAlignment, bytes); + if (p) { + live_lower.push_back({p, bytes}); + live_lower_bytes += bytes; + } + } + } + }; + + std::vector offsets; + std::vector live_upper; + for (size_t bytes : kUpperSizes) { + churn_lower(); + void* p = alloc.AllocateRaw(kUpperAlignment, bytes, *kUpper); + EXPECT_NE(p, nullptr) + << "upper alloc failed under lower churn (seed " << lower_seed << ")"; + offsets.push_back(p ? absl::bit_cast(p) - base + : std::numeric_limits::max()); + if (p) { + live_upper.push_back(p); + } + // Occasionally free an earlier upper temp, mimicking short-lived S(1). + if (live_upper.size() > 2) { + alloc.DeallocateRaw(live_upper.front()); + live_upper.erase(live_upper.begin()); + } + } + return offsets; + }; + + const std::vector rank0 = run(/*lower_seed=*/1); + for (uint32_t seed = 2; seed <= 32; ++seed) { + EXPECT_EQ(run(seed), rank0) + << "upper offsets diverged for lower_seed=" << seed; + } +} + +TEST(BFCAllocatorTest, SpatialUnderContention) { + BFCAllocator::Options opts; + opts.allow_growth = false; + opts.enable_spatial_partitioning = true; + BFCAllocator alloc(std::make_unique(), + /*total_memory=*/64 << 20, /*name=*/"contention", opts); + + constexpr int kNumThreads = 8; + constexpr int kItersPerThread = 1000; + constexpr size_t kBytes = 1024; + + std::atomic failures{0}; + tsl::thread::ThreadPool threads(tsl::Env::Default(), "spatial_contention", + kNumThreads); + absl::BlockingCounter counter(kNumThreads); + for (int t = 0; t < kNumThreads; ++t) { + threads.Schedule([&] { + for (int i = 0; i < kItersPerThread; ++i) { + void* lower = alloc.AllocateRaw(kAlignment, kBytes, *kLower); + void* upper = alloc.AllocateRaw(kAlignment, kBytes, *kUpper); + if (!lower || !upper || !IsAligned(lower, kAlignment) || + !IsAligned(upper, kAlignment)) { + failures.fetch_add(1, std::memory_order_relaxed); + } + alloc.DeallocateRaw(lower); + alloc.DeallocateRaw(upper); + } + counter.DecrementCount(); + }); + } + counter.Wait(); + EXPECT_EQ(failures.load(std::memory_order_relaxed), 0); +} + //===----------------------------------------------------------------------===// // Performance benchmarks. //===----------------------------------------------------------------------===// static constexpr size_t kBenchAllocSize = 1024; -static constexpr size_t kBenchAlignment = Allocator::kAllocatorAlignment; static void BM_AllocAndFree(benchmark::State& state) { - BFCAllocator alloc(std::make_unique(), + BFCAllocator alloc(std::make_unique(), /*total_memory=*/256 << 20, /*name=*/"bench", BFCAllocator::Options{}); for (auto _ : state) { - void* ptr = alloc.AllocateRaw(kBenchAlignment, kBenchAllocSize); + void* ptr = alloc.AllocateRaw(kAlignment, kBenchAllocSize); alloc.DeallocateRaw(ptr); } state.SetItemsProcessed(state.iterations()); @@ -248,14 +648,14 @@ BENCHMARK(BM_AllocAndFree); static void BM_AllocBatchThenFree(benchmark::State& state) { int batch = state.range(0); - BFCAllocator alloc(std::make_unique(), + BFCAllocator alloc(std::make_unique(), /*total_memory=*/256 << 20, /*name=*/"bench", BFCAllocator::Options{}); std::vector ptrs(batch); for (auto _ : state) { for (int i = 0; i < batch; ++i) { - ptrs[i] = alloc.AllocateRaw(kBenchAlignment, kBenchAllocSize); + ptrs[i] = alloc.AllocateRaw(kAlignment, kBenchAllocSize); } for (int i = 0; i < batch; ++i) { alloc.DeallocateRaw(ptrs[i]); @@ -266,11 +666,48 @@ static void BM_AllocBatchThenFree(benchmark::State& state) { BENCHMARK(BM_AllocBatchThenFree)->Arg(100)->Arg(1000); +//===----------------------------------------------------------------------===// +// Spatial allocation benchmarks. +//===----------------------------------------------------------------------===// + +static void BM_SpatialAllocBatchThenFree(benchmark::State& state) { + const int batch = state.range(0); + BFCAllocator::Options opts; + opts.allow_growth = false; + opts.enable_spatial_partitioning = true; + BFCAllocator alloc(std::make_unique(), + /*total_memory=*/256 << 20, /*name=*/"bench", opts); + + std::vector lower_ptrs(batch); + std::vector upper_ptrs(batch); + for (auto _ : state) { + for (int i = 0; i < batch; ++i) { + lower_ptrs[i] = alloc.AllocateRaw(kAlignment, kBenchAllocSize); + tsl::testing::DoNotOptimize(lower_ptrs[i]); + } + for (int i = 0; i < batch; ++i) { + upper_ptrs[i] = alloc.AllocateRaw(kAlignment, kBenchAllocSize, *kUpper); + tsl::testing::DoNotOptimize(upper_ptrs[i]); + } + for (int i = 0; i < batch; ++i) { + alloc.DeallocateRaw(lower_ptrs[i]); + alloc.DeallocateRaw(upper_ptrs[i]); + } + } + state.SetItemsProcessed(state.iterations() * batch * 2); +} + +BENCHMARK(BM_SpatialAllocBatchThenFree)->Arg(100)->Arg(1000); + +//===----------------------------------------------------------------------===// +// Contention benchmarks. +//===----------------------------------------------------------------------===// + static void BM_AllocAndFreeUnderContention(benchmark::State& state) { size_t num_threads = state.range(0); static constexpr int kItersPerThread = 10000; - BFCAllocator alloc(std::make_unique(), + BFCAllocator alloc(std::make_unique(), /*total_memory=*/256 << 20, /*name=*/"bench", BFCAllocator::Options{}); tsl::thread::ThreadPool threads(tsl::Env::Default(), "bench", num_threads); @@ -280,7 +717,7 @@ static void BM_AllocAndFreeUnderContention(benchmark::State& state) { for (int t = 0; t < num_threads; ++t) { threads.Schedule([&] { for (int i = 0; i < kItersPerThread; ++i) { - void* ptr = alloc.AllocateRaw(kBenchAlignment, kBenchAllocSize); + void* ptr = alloc.AllocateRaw(kAlignment, kBenchAllocSize); alloc.DeallocateRaw(ptr); } counter.DecrementCount(); diff --git a/xla/xla.proto b/xla/xla.proto index 22d844074ff38..0c6ff7059981c 100644 --- a/xla/xla.proto +++ b/xla/xla.proto @@ -645,6 +645,11 @@ message DebugOptions { // dimension. optional bool xla_gpu_enable_all_gather_combine_by_dim = 254; + // Enables spatial partitioning of the GPU BFC allocator so default and + // collective allocations share one fixed address range. Requires BFC + // preallocation. + optional bool xla_gpu_enable_allocator_spatial_partitioning = 494; + optional bool xla_gpu_enable_analytical_latency_estimator = 255; // Enables NCCL Speed-of-Light (SoL) analytical cost model From ba67293791a3d52084a2c7bdc63df8e3d5ff3b2c Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 12 Jun 2026 23:02:26 +0000 Subject: [PATCH 2/2] [tsl] Add ScopedAllocationTrace to help debugging OOMs --- xla/pjrt/gpu/BUILD | 18 ++-- xla/pjrt/gpu/se_gpu_pjrt_client.cc | 28 ++++- xla/tsl/framework/BUILD | 18 ++++ xla/tsl/framework/bfc_allocator.cc | 38 +++++++ xla/tsl/framework/bfc_allocator.h | 5 + xla/tsl/framework/bfc_allocator_test.cc | 41 ++++++++ xla/tsl/framework/scoped_allocation_trace.cc | 84 +++++++++++++++ xla/tsl/framework/scoped_allocation_trace.h | 101 +++++++++++++++++++ 8 files changed, 322 insertions(+), 11 deletions(-) create mode 100644 xla/tsl/framework/scoped_allocation_trace.cc create mode 100644 xla/tsl/framework/scoped_allocation_trace.h diff --git a/xla/pjrt/gpu/BUILD b/xla/pjrt/gpu/BUILD index d73a34f66ec72..e7ec790839263 100644 --- a/xla/pjrt/gpu/BUILD +++ b/xla/pjrt/gpu/BUILD @@ -63,6 +63,7 @@ cc_library( ":gpu_metrics", ":se_gpu_pjrt_runtime_abi_version", ":se_gpu_topology_description", + "//xla:debug_options_flags", "//xla:executable_run_options", "//xla:future", "//xla:literal", @@ -88,6 +89,7 @@ cc_library( "//xla/core/collectives:communicator", "//xla/core/collectives:rank_id", "//xla/hlo/builder:xla_computation", + "//xla/hlo/ir:hlo", "//xla/pjrt:async_work_runner", "//xla/pjrt:common_pjrt_client", "//xla/pjrt:device_event", @@ -134,8 +136,10 @@ cc_library( "//xla/service:transfer_manager", "//xla/service/gpu:buffer_allocations", "//xla/service/gpu:gpu_constants", + "//xla/service/gpu:gpu_executable", "//xla/service/gpu:gpu_executable_run_options", "//xla/service/gpu:gpu_memory_space_assignment", + "//xla/service/gpu:stream_executor_util", "//xla/stream_executor:device_address", "//xla/stream_executor:device_address_allocator", "//xla/stream_executor:device_description", @@ -146,6 +150,9 @@ cc_library( "//xla/stream_executor:stream", "//xla/stream_executor:stream_executor_h", "//xla/stream_executor:vmm_device_address_allocator", + "//xla/stream_executor/cuda:cuda_compute_capability", + "//xla/stream_executor/cuda:cuda_device_address_vmm_allocator", + "//xla/stream_executor/gpu:gpu_cudamallocasync_allocator", "//xla/stream_executor/integrations:device_mem_allocator", "//xla/stream_executor/integrations:tf_allocator_adapter", "//xla/tsl/concurrency:async_value", @@ -155,6 +162,7 @@ cc_library( "//xla/tsl/framework:bfc_allocator", "//xla/tsl/framework:device_id", "//xla/tsl/framework:device_id_impl", + "//xla/tsl/framework:scoped_allocation_trace", "//xla/tsl/lib/strings:proto_serialization", "//xla/tsl/platform:env", "//xla/tsl/platform:errors", @@ -183,6 +191,7 @@ cc_library( "@com_google_absl//absl/time", "@com_google_absl//absl/types:span", "@llvm-project//mlir:IR", + "@local_config_cuda//cuda:cudart_headers", "@tsl//tsl/platform:casts", "@tsl//tsl/platform:env", "@tsl//tsl/platform:errors", @@ -196,25 +205,16 @@ cc_library( "@tsl//tsl/profiler/lib:traceme", ] + if_cuda_or_rocm([ # keep sorted - "//xla:debug_options_flags", "//xla/service/gpu:gpu_compiler", - "//xla/service/gpu:gpu_executable", - "//xla/service/gpu:stream_executor_util", ]) + if_cuda([ # keep sorted - "//xla/stream_executor/cuda:cuda_compute_capability", - "//xla/stream_executor/cuda:cuda_device_address_vmm_allocator", - "//xla/stream_executor/gpu:gpu_cudamallocasync_allocator", "@local_config_cuda//cuda:cuda_headers", ]) + if_rocm([ # keep sorted "@local_config_rocm//rocm:rocm_headers", ]) + if_sycl([ # keep sorted - "//xla:debug_options_flags", "//xla/service/gpu:gpu_compiler", - "//xla/service/gpu:gpu_executable", - "//xla/service/gpu:stream_executor_util", "@local_config_sycl//sycl:sycl_headers", ]), ) diff --git a/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/xla/pjrt/gpu/se_gpu_pjrt_client.cc index 3f1ab375add4e..a876d9b39dee5 100644 --- a/xla/pjrt/gpu/se_gpu_pjrt_client.cc +++ b/xla/pjrt/gpu/se_gpu_pjrt_client.cc @@ -63,6 +63,7 @@ limitations under the License. #include "xla/executable_run_options.h" #include "xla/future.h" #include "xla/hlo/builder/xla_computation.h" +#include "xla/hlo/ir/hlo_input_output_alias_config.h" #include "xla/layout.h" #include "xla/pjrt/async_work_runner.h" #include "xla/pjrt/buffer_sequencing_event.h" @@ -118,6 +119,7 @@ limitations under the License. #include "xla/tsl/concurrency/async_value_ref.h" #include "xla/tsl/concurrency/ref_count.h" #include "xla/tsl/framework/allocator.h" +#include "xla/tsl/framework/scoped_allocation_trace.h" #include "xla/tsl/platform/env.h" #include "xla/tsl/platform/errors.h" #include "xla/tsl/platform/statusor.h" @@ -2037,6 +2039,11 @@ StreamExecutorGpuClient::RunAsync( "[", device_ordinal, "] GpuExecutable::ExecuteAsyncOnStreamImpl(", gpu_exec->name(), ")")); + // Attribute all device memory allocation to the gpu executable. + tsl::ScopedAllocationTrace allocation_trace( + "xla.execute", + {{"executable", gpu_exec->name()}, {"device", device_ordinal}}); + // GpuExecutable always bound to a single GpuContext during its execution, so // we activate it once to skip expensive context activations later. auto activation = executor->Activate(); @@ -2127,14 +2134,21 @@ StreamExecutorGpuClient::RunAsync( } } else { // Allocate each allocation that might escape, or is the temp buffer. - CHECK(allocation.maybe_live_out() || - allocation.IsPreallocatedTempBuffer()); + bool is_live_out = allocation.maybe_live_out(); + bool is_temp_buffer = allocation.IsPreallocatedTempBuffer(); + CHECK(is_live_out || is_temp_buffer); // Crash OK + int64_t buffer_size = allocation.size(); if (auto it = allocate_granularity.find(allocation.color()); it != allocate_granularity.end()) { buffer_size = RoundUpTo(buffer_size, it->second); } if (buffer_size > 0) { + tsl::ScopedAllocationTrace allocation_trace( + "xla.buffer", {{"kind", is_temp_buffer ? "temp" : "live_out"}, + {"allocation_index", i}, + {"requested_bytes", buffer_size}, + {"memory_space", allocation.color()}}); ASSIGN_OR_RETURN( se::ScopedDeviceAddress owning_buffer, memory_allocator->Allocate(device_ordinal, buffer_size, @@ -2199,6 +2213,16 @@ StreamExecutorGpuClient::RunAsync( "buffer is not donated; allocating a fresh buffer"; int64_t allocation_size = ShapeUtil::ByteSizeOf( ShapeUtil::GetSubshape(gpu_exec->result_shape(), index)); + const HloInputOutputAliasConfig::Alias& alias = + *output_info.alias_config; + const bool must_alias = alias.must_alias(); + tsl::ScopedAllocationTrace copy_protection_trace( + "xla.buffer", + {{"kind", "live_out_copy_protection"}, + {"allocation_index", output_info.allocation_index}, + {"requested_bytes", allocation_size}, + {"memory_space", allocation->color()}, + {"alias_kind", must_alias ? "must_alias" : "may_alias"}}); absl::StatusOr> allocated_buffer = memory_allocator->Allocate(device_ordinal, allocation_size, /*retry_on_failure=*/true, diff --git a/xla/tsl/framework/BUILD b/xla/tsl/framework/BUILD index d923ccca00fbe..8c23401aff40b 100644 --- a/xla/tsl/framework/BUILD +++ b/xla/tsl/framework/BUILD @@ -185,6 +185,20 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "scoped_allocation_trace", + srcs = ["scoped_allocation_trace.cc"], + hdrs = ["scoped_allocation_trace.h"], + features = ["parse_headers"], + visibility = ["//visibility:public"], + deps = [ + "//xla/tsl/platform:logging", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + ], +) + cc_library( name = "bfc_allocator", srcs = [ @@ -198,6 +212,7 @@ cc_library( deps = [ ":allocator", ":metrics", + ":scoped_allocation_trace", ":shared_counter", "//xla/tsl/lib/core:bits", "//xla/tsl/platform:env", @@ -226,13 +241,16 @@ tsl_cc_test( deps = [ ":allocator", ":bfc_allocator", + ":scoped_allocation_trace", "//xla/tsl/platform:env", "//xla/tsl/platform:env_impl", # buildcleaner: keep "//xla/tsl/platform:test", "//xla/tsl/platform:test_benchmark", "//xla/tsl/platform:test_main", "@com_google_absl//absl/base", + "@com_google_absl//absl/base:log_severity", "@com_google_absl//absl/base:no_destructor", + "@com_google_absl//absl/log:scoped_mock_log", "@com_google_absl//absl/synchronization", ], ) diff --git a/xla/tsl/framework/bfc_allocator.cc b/xla/tsl/framework/bfc_allocator.cc index d14c96c8b4688..384a80d34437e 100644 --- a/xla/tsl/framework/bfc_allocator.cc +++ b/xla/tsl/framework/bfc_allocator.cc @@ -35,10 +35,12 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "absl/numeric/bits.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "absl/strings/string_view.h" #include "absl/synchronization/mutex.h" #include "xla/tsl/framework/allocator.h" #include "xla/tsl/framework/allocator_retry.h" +#include "xla/tsl/framework/scoped_allocation_trace.h" #include "xla/tsl/platform/env.h" #include "xla/tsl/platform/file_system.h" #include "xla/tsl/platform/logging.h" @@ -56,6 +58,25 @@ const uint64_t kDefaultMemoryFilterMask = tsl::profiler::TraceMeFiltersToMask( constexpr BFCAllocator::ChunkHandle BFCAllocator::kInvalidChunkHandle; +static std::string AllocationAnnotationFrameDebugString( + const ScopedAllocationTrace::Frame& frame) { + if (frame.args.empty()) { + return frame.name; + } + return absl::StrCat(frame.name, "{", + absl::StrJoin(frame.args, ", ", absl::PairFormatter("=")), + "}"); +} + +static std::string AllocationAnnotationSnapshotDebugString( + const ScopedAllocationTrace::Snapshot& snapshot) { + return absl::StrJoin( + snapshot.frames, " / ", + [](std::string* out, const ScopedAllocationTrace::Frame& frame) { + absl::StrAppend(out, AllocationAnnotationFrameDebugString(frame)); + }); +} + BFCAllocator::BFCAllocator(std::unique_ptr sub_allocator, size_t total_memory, const std::string& name, const Options& opts) @@ -223,6 +244,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) { c->next = kInvalidChunkHandle; c->freed_at_count = 0; c->tag = free_chunk_tag_; + c->allocation_annotation.reset(); region_manager_.set_handle(c->ptr, h); @@ -264,6 +286,7 @@ void BFCAllocator::DeallocateChunk(ChunkHandle h) { Chunk* c = ChunkFromHandle(h); c->allocation_id = -1; c->bin_num = kInvalidBinNum; + c->allocation_annotation.reset(); c->next = unused_chunk_handle_head_; unused_chunk_handle_head_ = h; } @@ -839,6 +862,13 @@ void BFCAllocator::FinishChunkAllocation(Chunk* chunk, size_t num_bytes) { // Assign a unique id and increment the id counter, marking the chunk as being // in use. chunk->allocation_id = next_allocation_id_++; + ScopedAllocationTrace::Snapshot allocation_annotation = + ScopedAllocationTrace::Current(); + if (allocation_annotation.frames.empty()) { + chunk->allocation_annotation.reset(); + } else { + chunk->allocation_annotation = std::move(allocation_annotation); + } // Update stats. ++stats_.num_allocs; @@ -901,6 +931,8 @@ void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) { // It inherits the tag; callers update the in-use piece after splitting. new_chunk->tag = c->tag; + new_chunk->allocation_annotation.reset(); + // Maintain the pointers. // c <-> c_neighbor becomes // c <-> new_chunk <-> c_neighbor @@ -1088,6 +1120,7 @@ void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) { // Mark the chunk as no longer in use. c->allocation_id = -1; + c->allocation_annotation.reset(); // Optionally record the free time. Timestamped chunks are kept in their // original lower/upper tag until they become safe to merge; otherwise a @@ -1392,6 +1425,11 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes, c->action_count, " step ", c->step_id); } #endif + if (c->in_use() && c->allocation_annotation.has_value()) { + absl::StrAppend( + &buf, " allocation_annotation ", + AllocationAnnotationSnapshotDebugString(*c->allocation_annotation)); + } absl::StrAppend(&buf, " next ", c->next); if (timing_counter_) { absl::StrAppend(&buf, " freed_at_count ", c->freed_at_count); diff --git a/xla/tsl/framework/bfc_allocator.h b/xla/tsl/framework/bfc_allocator.h index 1c0ba329018bc..4601fddc5e5f7 100644 --- a/xla/tsl/framework/bfc_allocator.h +++ b/xla/tsl/framework/bfc_allocator.h @@ -36,6 +36,7 @@ limitations under the License. #include "absl/synchronization/mutex.h" #include "xla/tsl/framework/allocator.h" #include "xla/tsl/framework/allocator_retry.h" +#include "xla/tsl/framework/scoped_allocation_trace.h" #include "xla/tsl/framework/shared_counter.h" #include "xla/tsl/lib/core/bits.h" #include "xla/tsl/platform/logging.h" @@ -327,6 +328,10 @@ class BFCAllocator : public Allocator { // rejoin the gap. ChunkTag tag = ChunkTag::kCentralGap; + // Snapshot of the thread-local allocation annotation stack captured when + // this chunk became in-use. Cleared when the chunk is freed. + std::optional allocation_annotation; + bool in_use() const { return allocation_id != -1; } #ifdef TENSORFLOW_MEM_DEBUG diff --git a/xla/tsl/framework/bfc_allocator_test.cc b/xla/tsl/framework/bfc_allocator_test.cc index 0d2b878939fe5..eeb8376b915e6 100644 --- a/xla/tsl/framework/bfc_allocator_test.cc +++ b/xla/tsl/framework/bfc_allocator_test.cc @@ -29,9 +29,12 @@ limitations under the License. #include #include "absl/base/casts.h" +#include "absl/base/log_severity.h" #include "absl/base/no_destructor.h" +#include "absl/log/scoped_mock_log.h" #include "absl/synchronization/blocking_counter.h" #include "xla/tsl/framework/allocator.h" +#include "xla/tsl/framework/scoped_allocation_trace.h" #include "xla/tsl/platform/env.h" #include "xla/tsl/platform/test.h" #include "xla/tsl/platform/test_benchmark.h" @@ -40,6 +43,11 @@ limitations under the License. namespace tsl { namespace { +using ::testing::_; +using ::testing::AllOf; +using ::testing::AtLeast; +using ::testing::HasSubstr; + static constexpr size_t kAlignment = Allocator::kAllocatorAlignment; static const absl::NoDestructor kUpper( @@ -111,6 +119,39 @@ TEST(BFCAllocatorTest, DefaultAlignment) { alloc.DeallocateRaw(ptr); } +TEST(BFCAllocatorTest, OomLogsAllocationAnnotations) { + BFCAllocator::Options opts; + opts.allow_growth = false; + opts.allow_retry_on_failure = false; + BFCAllocator alloc(std::make_unique(), + /*total_memory=*/1024, /*name=*/"annotated", opts); + + void* ptr = nullptr; + { + ScopedAllocationTrace exec_scope("xla.execute", + {{"executable", "module"}, {"device", 7}}); + ScopedAllocationTrace buffer_scope( + "xla.buffer", {{"kind", "live_out"}, {"allocation_index", 3}}); + ptr = alloc.AllocateRaw(kAlignment, 512); + } + ASSERT_NE(ptr, nullptr); + + absl::ScopedMockLog log(absl::MockLogDefault::kIgnoreUnexpected); + EXPECT_CALL( + log, + Log(absl::LogSeverity::kInfo, _, + AllOf(HasSubstr("InUse at"), HasSubstr("allocation_annotation"), + HasSubstr("xla.execute{executable=module, device=7}"), + HasSubstr("xla.buffer{kind=live_out, allocation_index=3}")))) + .Times(AtLeast(1)); + log.StartCapturingLogs(); + + EXPECT_EQ(alloc.AllocateRaw(kAlignment, 2048), nullptr); + + log.StopCapturingLogs(); + alloc.DeallocateRaw(ptr); +} + // Parameterized test that verifies alignment is respected for various // power-of-two alignments from 32 bytes to 4096 bytes. class BFCAllocatorAlignmentTest : public ::testing::TestWithParam {}; diff --git a/xla/tsl/framework/scoped_allocation_trace.cc b/xla/tsl/framework/scoped_allocation_trace.cc new file mode 100644 index 0000000000000..5b351b1452cba --- /dev/null +++ b/xla/tsl/framework/scoped_allocation_trace.cc @@ -0,0 +1,84 @@ +/* Copyright 2026 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "xla/tsl/framework/scoped_allocation_trace.h" + +#include +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "xla/tsl/platform/logging.h" + +namespace tsl { +namespace { + +struct AnnotationState { + std::vector frames; +}; + +AnnotationState& ThreadAnnotationState() { + static thread_local AnnotationState state; // NOLINT + return state; +} + +} // namespace + +ScopedAllocationTrace::Arg::Arg(absl::string_view key, + const absl::AlphaNum& value + ABSL_ATTRIBUTE_LIFETIME_BOUND) + : key(key), value(value.Piece()) {} + +ScopedAllocationTrace::Frame::Frame(absl::string_view name) : name(name) {} + +ScopedAllocationTrace::Frame& ScopedAllocationTrace::Frame::Add( + absl::string_view key, const absl::AlphaNum& value) { + args.emplace_back(std::string(key), std::string(value.Piece())); + return *this; +} + +ScopedAllocationTrace::Snapshot::Snapshot(std::vector frames) + : frames(std::move(frames)) {} + +ScopedAllocationTrace::ScopedAllocationTrace(absl::string_view name, + std::initializer_list args) { + Frame frame(name); + frame.args.reserve(args.size()); + for (const Arg& arg : args) { + frame.args.emplace_back(std::string(arg.key), std::string(arg.value)); + } + AnnotationState& state = ThreadAnnotationState(); + state.frames.push_back(std::move(frame)); +} + +ScopedAllocationTrace::ScopedAllocationTrace(Frame frame) { + AnnotationState& state = ThreadAnnotationState(); + state.frames.push_back(std::move(frame)); +} + +ScopedAllocationTrace::~ScopedAllocationTrace() { + AnnotationState& state = ThreadAnnotationState(); + DCHECK(!state.frames.empty()); + state.frames.pop_back(); +} + +ScopedAllocationTrace::Snapshot ScopedAllocationTrace::Current() { + return Snapshot(ThreadAnnotationState().frames); +} + +} // namespace tsl diff --git a/xla/tsl/framework/scoped_allocation_trace.h b/xla/tsl/framework/scoped_allocation_trace.h new file mode 100644 index 0000000000000..452fb675b9d2b --- /dev/null +++ b/xla/tsl/framework/scoped_allocation_trace.h @@ -0,0 +1,101 @@ +/* Copyright 2026 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef XLA_TSL_FRAMEWORK_SCOPED_ALLOCATION_TRACE_H_ +#define XLA_TSL_FRAMEWORK_SCOPED_ALLOCATION_TRACE_H_ + +#include +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" + +namespace tsl { + +// Thread-local metadata for passing higher-level allocation details down to +// allocators. XLA/PJRT code can describe what an allocation represents, and +// allocators may snapshot that stack to connect low-level chunks back to +// high-level execution state during OOM diagnostics. +// +// This complements ScopedMemoryDebugAnnotation: that API exposes current +// pending op/shape metadata for memory profiling, while this API keeps explicit +// allocation trace frames for allocator diagnostics. This is scoped to the +// current thread and does not automatically propagate to other threads. This +// differs from third_party/tsl/tsl/platform/context.h, whose context can be +// automatically captured and propagated through XLA thread pools. +// +// Allocator implementations may optionally snapshot Current() when a buffer +// becomes live and attach it to internal metadata. Callers should not assume +// every allocator records it. +// +// Example: +// +// ScopedAllocationTrace exec_scope( +// "xla.execute", +// {{"executable", executable_name}, {"device", device_ordinal}}); +// +// void* ptr = allocator->AllocateRaw(alignment, bytes); +// +class ScopedAllocationTrace { + public: + // Key/value pair encoded into an allocation trace frame. + struct Arg { + Arg(absl::string_view key, + const absl::AlphaNum& value ABSL_ATTRIBUTE_LIFETIME_BOUND); + + Arg(const Arg&) = delete; + void operator=(const Arg&) = delete; + + absl::string_view key; + absl::string_view value; + }; + + // Single allocation trace scope frame. + struct Frame { + explicit Frame(absl::string_view name); + + Frame& Add(absl::string_view key, + const absl::AlphaNum& value ABSL_ATTRIBUTE_LIFETIME_BOUND); + + std::string name; + std::vector> args; + }; + + // Copy of the current thread-local trace frame stack. + struct Snapshot { + explicit Snapshot(std::vector frames); + + std::vector frames; + }; + + explicit ScopedAllocationTrace(absl::string_view name, + std::initializer_list args = {}); + explicit ScopedAllocationTrace(Frame frame); + + ScopedAllocationTrace(ScopedAllocationTrace&&) = delete; + + ~ScopedAllocationTrace(); + + // Returns a copy of the current thread's annotation stack. The returned + // snapshot is independent from later scope changes and can have no frames. + static Snapshot Current(); +}; + +} // namespace tsl + +#endif // XLA_TSL_FRAMEWORK_SCOPED_ALLOCATION_TRACE_H_