From 08863add202f78e9c99ee8e442ac276cf2093db8 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@openxla.org>
Date: Tue, 9 Jun 2026 00:19:07 +0000
Subject: [PATCH 1/2] [tsl] Add support for spatial partitioning of BFC-managed
 address range

---
 xla/debug_options_flags.cc                    |  13 +-
 xla/pjrt/gpu/gpu_helpers.cc                   |  13 +-
 xla/pjrt/gpu/gpu_helpers.h                    |   9 +-
 xla/pjrt/gpu/se_gpu_pjrt_client.cc            |  68 +-
 .../integrations/tf_allocator_adapter.cc      |  19 +-
 .../integrations/tf_allocator_adapter.h       |  16 +-
 xla/tsl/framework/BUILD                       |   2 +-
 xla/tsl/framework/allocator.h                 |  39 +-
 xla/tsl/framework/bfc_allocator.cc            | 605 +++++++++++++-----
 xla/tsl/framework/bfc_allocator.h             | 256 +++++++-
 xla/tsl/framework/bfc_allocator_test.cc       | 539 ++++++++++++++--
 xla/xla.proto                                 |   5 +
 12 files changed, 1315 insertions(+), 269 deletions(-)
diff --git a/xla/debug_options_flags.cc b/xla/debug_options_flags.cc
index 59429e76e636c..307e704dc8d71 100644
--- a/xla/debug_options_flags.cc
+++ b/xla/debug_options_flags.cc
@@ -105,7 +105,8 @@ absl::StatusOr<std::vector<RepeatedFlagModifier>> ParseRepeatedEnumModifiers(
 namespace {
 
 template <typename T>
-static auto FindRepeatedFieldValue(google::protobuf::RepeatedField<int>* list, T value) {
+static auto FindRepeatedFieldValue(google::protobuf::RepeatedField<int>* list,
+                                   T value) {
   for (auto it = list->begin(); it != list->end(); ++it) {
     if (*it == value) {
       return it;
@@ -300,6 +301,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_gpu_experimental_dynamic_slice_fusion_verify_offsets(false);
   opts.set_xla_gpu_nccl_termination_timeout_seconds(-1);
   opts.set_xla_gpu_enable_nccl_user_buffers(false);
+  opts.set_xla_gpu_enable_allocator_spatial_partitioning(true);
   opts.set_xla_gpu_experimental_enable_nccl_symmetric_buffers(false);
   opts.set_xla_gpu_experimental_enable_nvshmem(false);
   opts.set_xla_gpu_enable_nccl_comm_splitting(true);
@@ -2025,6 +2027,14 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
       "Enables NCCL User Buffer Registration. collective_memory_size in the "
       "allocator config must also be set to a non-zero value that is large "
       "enough to meet peak collective memory usage."));
+  flag_list->push_back(tsl::Flag(
+      "xla_gpu_enable_allocator_spatial_partitioning",
+      bool_setter_for(
+          &DebugOptions::set_xla_gpu_enable_allocator_spatial_partitioning),
+      debug_options->xla_gpu_enable_allocator_spatial_partitioning(),
+      "Enables spatial partitioning of the GPU BFC allocator so default and "
+      "collective allocations share one fixed address range. Requires BFC "
+      "preallocation."));
   flag_list->push_back(tsl::Flag(
       "xla_gpu_experimental_enable_nccl_symmetric_buffers",
       bool_setter_for(
@@ -3250,7 +3260,6 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
                 bool_setter_for(&DebugOptions::set_xla_gpu_log_minmax),
                 debug_options->xla_gpu_log_minmax(),
                 "If true, log min/max values from kernel outputs."));
-
   flag_list->push_back(tsl::Flag(
       "xla_early_exit_with_layouts",
       bool_setter_for(&DebugOptions::set_xla_early_exit_with_layouts),
diff --git a/xla/pjrt/gpu/gpu_helpers.cc b/xla/pjrt/gpu/gpu_helpers.cc
index 50c5a21fe72d3..f2942acee38a4 100644
--- a/xla/pjrt/gpu/gpu_helpers.cc
+++ b/xla/pjrt/gpu/gpu_helpers.cc
@@ -103,8 +103,12 @@ absl::StatusOr<std::shared_ptr<tsl::BFCAllocator>> CreateBFCAllocator(
     se::StreamExecutor* executor, double memory_fraction, bool preallocate,
     std::optional<int64_t> gpu_system_memory_size,
     const std::vector<tsl::SubAllocator::Visitor>& sub_allocator_alloc_visitors,
-    const std::vector<tsl::SubAllocator::Visitor>&
-        sub_allocator_free_visitors) {
+    const std::vector<tsl::SubAllocator::Visitor>& sub_allocator_free_visitors,
+    bool enable_spatial_partitioning) {
+  if (enable_spatial_partitioning && !preallocate) {
+    return InvalidArgument(
+        "Spatial partitioning of the BFC allocator requires preallocate=true.");
+  }
   bool enable_unified_memory;
   absl::Status status = tsl::ReadBoolFromEnvVar("TF_FORCE_UNIFIED_MEMORY",
                                                 false, &enable_unified_memory);
@@ -164,13 +168,14 @@ absl::StatusOr<std::shared_ptr<tsl::BFCAllocator>> CreateBFCAllocator(
 
   tsl::BFCAllocator::Options opts;
   opts.allow_growth = !preallocate;
+  opts.enable_spatial_partitioning = enable_spatial_partitioning;
   return std::make_shared<tsl::BFCAllocator>(
       std::move(sub_allocator), allocator_memory,
       absl::StrCat("GPU_", device_ordinal, "_bfc"), opts);
 }
 
 // Builds a BFCAllocator for all local GPUs that uses collective memory.
-absl::StatusOr<std::shared_ptr<tsl::BFCAllocator>> CreateCollectiveBFCAllocator(
+absl::StatusOr<std::unique_ptr<tsl::BFCAllocator>> CreateCollectiveBFCAllocator(
     se::StreamExecutor* executor, double memory_fraction,
     size_t collective_memory_size) {
   int device_ordinal = executor->device_ordinal();
@@ -205,7 +210,7 @@ absl::StatusOr<std::shared_ptr<tsl::BFCAllocator>> CreateCollectiveBFCAllocator(
 
   tsl::BFCAllocator::Options opts;
   opts.allow_growth = !preallocate;
-  return std::make_shared<tsl::BFCAllocator>(
+  return std::make_unique<tsl::BFCAllocator>(
       std::move(sub_allocator), allocator_memory,
       absl::StrCat("GPU_collectivememory_", device_ordinal, "_bfc"), opts);
 }
diff --git a/xla/pjrt/gpu/gpu_helpers.h b/xla/pjrt/gpu/gpu_helpers.h
index f037bb5115329..3d3866873067e 100644
--- a/xla/pjrt/gpu/gpu_helpers.h
+++ b/xla/pjrt/gpu/gpu_helpers.h
@@ -47,15 +47,18 @@ void EnablePeerAccess(absl::Span<se::StreamExecutor* const> executors);
 absl::StatusOr<std::unique_ptr<tsl::BFCAllocator>> GetGpuHostAllocator(
     se::StreamExecutor* executor);
 
-// Builds a BFCAllocator for all local GPUs.
+// Builds a BFCAllocator for all local GPUs. With enable_spatial_partitioning,
+// one shared address range serves default (lower-end) and collective
+// (upper-end) requests; returns InvalidArgument unless preallocate is true.
 absl::StatusOr<std::shared_ptr<tsl::BFCAllocator>> CreateBFCAllocator(
     se::StreamExecutor* executor, double memory_fraction, bool preallocate,
     std::optional<int64_t> gpu_system_memory_size,
     const std::vector<tsl::SubAllocator::Visitor>& sub_allocator_alloc_visitors,
-    const std::vector<tsl::SubAllocator::Visitor>& sub_allocator_free_visitors);
+    const std::vector<tsl::SubAllocator::Visitor>& sub_allocator_free_visitors,
+    bool enable_spatial_partitioning = false);
 
 // Builds a BFCAllocator for all local GPUs that uses collective memory.
-absl::StatusOr<std::shared_ptr<tsl::BFCAllocator>> CreateCollectiveBFCAllocator(
+absl::StatusOr<std::unique_ptr<tsl::BFCAllocator>> CreateCollectiveBFCAllocator(
     se::StreamExecutor* executor, double memory_fraction,
     size_t collective_memory_size);
 
diff --git a/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index 6bd3f0fe27bf3..3f1ab375add4e 100644
--- a/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -1329,14 +1329,20 @@ GetStreamExecutorGpuDeviceAllocator(
     const std::map<int, std::unique_ptr<LocalDeviceState>>&
         addressable_devices) {
   std::vector<se::MultiDeviceAdapter::AllocatorInfo> allocators;
+  const DebugOptions& debug_options = xla::GetDebugOptionsFromFlags();
   GpuAllocatorConfig::Kind effective_kind = allocator_config.kind;
-  if (GetDebugOptionsFromFlags().xla_gpu_command_buffer_update_mode() !=
+  if (debug_options.xla_gpu_command_buffer_update_mode() !=
           DebugOptions::ALWAYS_UPDATE &&
       effective_kind != GpuAllocatorConfig::Kind::kVmm) {
     LOG(WARNING) << "xla_gpu_command_buffer_update_mode requires the "
                     "VMM allocator. Overriding allocator kind to kVmm.";
     effective_kind = GpuAllocatorConfig::Kind::kVmm;
   }
+
+  // Set when a single preallocated BFC allocator serves both default and
+  // collective memory via spatial partitioning; suppresses the separate
+  // collective allocator below.
+  bool shared_collective_pool = false;
   switch (effective_kind) {
     case GpuAllocatorConfig::Kind::kCudaAsync: {
       for (const auto& ordinal_and_device : addressable_devices) {
@@ -1356,6 +1362,13 @@ GetStreamExecutorGpuDeviceAllocator(
     case GpuAllocatorConfig::Kind::kDefault:
     case GpuAllocatorConfig::Kind::kBFC: {
       LOG(INFO) << "Using BFC allocator.";
+      // With the spatial-partitioning flag enabled, preallocation lets one BFC
+      // allocator over a fixed address range serve both default (lower end) and
+      // collective (upper end) memory, so no separate collective allocator is
+      // created. Otherwise, use the separate collective allocator below.
+      shared_collective_pool =
+          allocator_config.preallocate &&
+          debug_options.xla_gpu_enable_allocator_spatial_partitioning();
       for (const auto& ordinal_and_device : addressable_devices) {
         ASSIGN_OR_RETURN(
             auto bfc_allocator,
@@ -1364,11 +1377,29 @@ GetStreamExecutorGpuDeviceAllocator(
                                allocator_config.preallocate,
                                allocator_config.gpu_system_memory_size,
                                allocator_config.sub_allocator_alloc_visitors,
-                               allocator_config.sub_allocator_free_visitors));
+                               allocator_config.sub_allocator_free_visitors,
+                               /*enable_spatial_partitioning=*/
+                               shared_collective_pool));
         allocators.push_back(
-            {std::move(bfc_allocator),
-             ordinal_and_device.second->compute_stream(),
+            {bfc_allocator, ordinal_and_device.second->compute_stream(),
              /*memory_space=*/(int)xla::gpu::MemorySpaceColor::kDefault});
+        if (shared_collective_pool) {
+          size_t collective_memory_alignment =
+              tsl::Allocator::kAllocatorAlignment;
+          if (auto* collectives =
+                  gpu::GpuCollectives::Default(platform->Name())) {
+            collective_memory_alignment =
+                collectives->SymmetricMemoryAlignment();
+          }
+          allocators.push_back(
+              {std::move(bfc_allocator),
+               ordinal_and_device.second->compute_stream(),
+               /*memory_space=*/(int)xla::gpu::MemorySpaceColor::kCollective,
+               /*device_ordinal=*/std::nullopt,
+               /*platform=*/nullptr,
+               /*min_alignment=*/collective_memory_alignment,
+               /*allocation_end=*/tsl::AllocationEnd::kUpper});
+        }
       }
       break;
     }
@@ -1402,18 +1433,22 @@ GetStreamExecutorGpuDeviceAllocator(
     }
   }
 
-  // Add any additional allocators for alternate memory spaces.
-  for (const auto& ordinal_and_device : addressable_devices) {
-    ASSIGN_OR_RETURN(
-        auto collective_bfc_allocator,
-        CreateCollectiveBFCAllocator(
-            ordinal_and_device.second->executor(),
-            /*memory_fraction=*/1.0 - allocator_config.memory_fraction,
-            allocator_config.collective_memory_size));
-    allocators.push_back(
-        {std::move(collective_bfc_allocator),
-         ordinal_and_device.second->compute_stream(),
-         /*memory_space=*/(int)xla::gpu::MemorySpaceColor::kCollective});
+  // Add a separate collective allocator unless the default BFC allocator
+  // already serves collective memory from its shared, spatially partitioned
+  // pool.
+  if (!shared_collective_pool) {
+    for (const auto& ordinal_and_device : addressable_devices) {
+      ASSIGN_OR_RETURN(
+          auto collective_bfc_allocator,
+          CreateCollectiveBFCAllocator(
+              ordinal_and_device.second->executor(),
+              /*memory_fraction=*/1.0 - allocator_config.memory_fraction,
+              allocator_config.collective_memory_size));
+      allocators.push_back(
+          {std::move(collective_bfc_allocator),
+           ordinal_and_device.second->compute_stream(),
+           /*memory_space=*/(int)xla::gpu::MemorySpaceColor::kCollective});
+    }
   }
 
   for (const auto& ordinal_and_device : addressable_devices) {
@@ -1426,7 +1461,6 @@ GetStreamExecutorGpuDeviceAllocator(
   }
 
 #if defined(GOOGLE_CUDA) && CUDA_VERSION >= 11020
-  const auto& debug_options = xla::GetDebugOptionsFromFlags();
   if (debug_options.xla_gpu_temp_buffer_use_separate_color()) {
     // Add memory allocator to allocate memory buffers with persistent temp
     // memory space color.
diff --git a/xla/stream_executor/integrations/tf_allocator_adapter.cc b/xla/stream_executor/integrations/tf_allocator_adapter.cc
index 48099deec3bfb..93492bf988c45 100644
--- a/xla/stream_executor/integrations/tf_allocator_adapter.cc
+++ b/xla/stream_executor/integrations/tf_allocator_adapter.cc
@@ -44,19 +44,23 @@ limitations under the License.
 namespace stream_executor {
 
 TfAllocatorAdapter::TfAllocatorAdapter(tsl::Allocator* wrapped, Stream* stream,
-                                       size_t min_alignment)
+                                       size_t min_alignment,
+                                       tsl::AllocationEnd allocation_end)
     : DeviceAddressAllocator(CHECK_NOTNULL(stream)->parent()->GetPlatform()),
       wrapped_(wrapped),
       stream_(stream),
-      min_alignment_(min_alignment) {}
+      min_alignment_(min_alignment),
+      allocation_end_(allocation_end) {}
 
 TfAllocatorAdapter::TfAllocatorAdapter(tsl::Allocator* wrapped,
                                        const Platform* platform,
-                                       size_t min_alignment)
+                                       size_t min_alignment,
+                                       tsl::AllocationEnd allocation_end)
     : DeviceAddressAllocator(platform),
       wrapped_(wrapped),
       stream_(nullptr),
-      min_alignment_(min_alignment) {}
+      min_alignment_(min_alignment),
+      allocation_end_(allocation_end) {}
 
 TfAllocatorAdapter::~TfAllocatorAdapter() {}
 
@@ -65,6 +69,7 @@ absl::StatusOr<ScopedDeviceAddress<uint8_t>> TfAllocatorAdapter::Allocate(
     int64_t memory_space) {
   tsl::AllocationAttributes attrs;
   attrs.retry_on_failure = retry_on_failure;
+  attrs.allocation_end = allocation_end_;
   void* data = nullptr;
   if (size != 0) {
     data = wrapped_->AllocateRaw(min_alignment_, size, attrs);
@@ -131,11 +136,13 @@ MultiDeviceAdapter::MultiDeviceAdapter(const Platform* platform,
     if (info.stream != nullptr) {
       per_device_allocators[device_ordinal] =
           std::make_shared<TfAllocatorAdapter>(info.allocator.get(),
-                                               info.stream, info.min_alignment);
+                                               info.stream, info.min_alignment,
+                                               info.allocation_end);
     } else {
       per_device_allocators[device_ordinal] =
           std::make_shared<TfAllocatorAdapter>(
-              info.allocator.get(), info.platform, info.min_alignment);
+              info.allocator.get(), info.platform, info.min_alignment,
+              info.allocation_end);
     }
     VLOG(3) << absl::StrFormat(
         "MultiDeviceAdapter: device_ordinal=%d memory_space=%d "
diff --git a/xla/stream_executor/integrations/tf_allocator_adapter.h b/xla/stream_executor/integrations/tf_allocator_adapter.h
index ae83c98963f1e..64717a17ee41e 100644
--- a/xla/stream_executor/integrations/tf_allocator_adapter.h
+++ b/xla/stream_executor/integrations/tf_allocator_adapter.h
@@ -54,14 +54,19 @@ class TfAllocatorAdapter : public DeviceAddressAllocator {
   //                Different memory spaces may require different alignment
   //                (e.g. symmetric memory requires higher alignment than
   //                default memory used for on-device compute).
+  //
+  // allocation_end: which end of a spatially partitioned allocator to serve
+  //                 requests from. Ignored by allocators that do not partition.
   TfAllocatorAdapter(
       tsl::Allocator* wrapped, Stream* stream,
-      size_t min_alignment = tsl::Allocator::kAllocatorAlignment);
+      size_t min_alignment = tsl::Allocator::kAllocatorAlignment,
+      tsl::AllocationEnd allocation_end = tsl::AllocationEnd::kLower);
 
   // Constructor for cases where `stream` is not available.
   TfAllocatorAdapter(
       tsl::Allocator* wrapped, const Platform* platform,
-      size_t min_alignment = tsl::Allocator::kAllocatorAlignment);
+      size_t min_alignment = tsl::Allocator::kAllocatorAlignment,
+      tsl::AllocationEnd allocation_end = tsl::AllocationEnd::kLower);
 
   ~TfAllocatorAdapter() override;
 
@@ -88,6 +93,7 @@ class TfAllocatorAdapter : public DeviceAddressAllocator {
   tsl::Allocator* wrapped_;
   Stream* stream_;
   size_t min_alignment_;
+  tsl::AllocationEnd allocation_end_;
 };
 
 // Adapter class that wraps per-device TF allocators with corresponding streams
@@ -117,6 +123,11 @@ class MultiDeviceAdapter : public DeviceAddressAllocator {
   // min_alignment:  minimum alignment passed to tsl::Allocator::AllocateRaw.
   //                 Symmetric/collective memory typically needs higher
   //                 alignment than default compute buffers.
+  //
+  // allocation_end: which end of a spatially partitioned allocator to serve
+  //                 from. When one BFC allocator backs both kDefault and
+  //                 kCollective, the kCollective entry uses kUpper so its
+  //                 offsets stay independent of default-memory activity.
   struct AllocatorInfo {
     std::shared_ptr<tsl::Allocator> allocator;
     Stream* stream;
@@ -124,6 +135,7 @@ class MultiDeviceAdapter : public DeviceAddressAllocator {
     std::optional<int32_t> device_ordinal = std::nullopt;
     const Platform* platform = nullptr;
     size_t min_alignment = tsl::Allocator::kAllocatorAlignment;
+    tsl::AllocationEnd allocation_end = tsl::AllocationEnd::kLower;
   };
 
   MultiDeviceAdapter(const Platform* platform,
diff --git a/xla/tsl/framework/BUILD b/xla/tsl/framework/BUILD
index c1bcdc4531539..d923ccca00fbe 100644
--- a/xla/tsl/framework/BUILD
+++ b/xla/tsl/framework/BUILD
@@ -232,8 +232,8 @@ tsl_cc_test(
         "//xla/tsl/platform:test_benchmark",
         "//xla/tsl/platform:test_main",
         "@com_google_absl//absl/base",
+        "@com_google_absl//absl/base:no_destructor",
         "@com_google_absl//absl/synchronization",
-        "@tsl//tsl/platform:platform_port",
     ],
 )
 
diff --git a/xla/tsl/framework/allocator.h b/xla/tsl/framework/allocator.h
index 2e7b60f7a5c44..3c5618d527f41 100644
--- a/xla/tsl/framework/allocator.h
+++ b/xla/tsl/framework/allocator.h
@@ -16,30 +16,52 @@ limitations under the License.
 #ifndef XLA_TSL_FRAMEWORK_ALLOCATOR_H_
 #define XLA_TSL_FRAMEWORK_ALLOCATOR_H_
 
-#include <stdlib.h>
-
 #include <cstdint>
 #include <functional>
 #include <optional>
 #include <string>
 #include <vector>
 
+#include <stdlib.h>
 #include "xla/tsl/platform/logging.h"
 #include "xla/tsl/platform/macros.h"
 #include "tsl/platform/numa.h"
 
 namespace tsl {
 
+// Selects which end of an allocator's pre-allocated address range a request
+// should be served from, for allocators that spatially partition their range.
+// Allocators that do not spatially partition ignore this and always behave as
+// kLower.
+enum class AllocationEnd : uint8_t {
+  kLower,  // Carve from the lower-address end of the range (grows up).
+  kUpper,  // Carve from the upper-address end of the range (grows down).
+};
+
+// Abseil stringify hook: prints "lower"/"upper", or "unknown" if out of range.
+template <typename Sink>
+void AbslStringify(Sink& sink, AllocationEnd end) {
+  switch (end) {
+    case AllocationEnd::kLower:
+      return sink.Append("lower");
+    case AllocationEnd::kUpper:
+      return sink.Append("upper");
+  }
+  sink.Append("unknown");  // Reached only for values outside the enum.
+}
+
 // Attributes for a single allocation call. Different calls to the same
 // allocator could potentially have different allocation attributes.
 struct AllocationAttributes {
   AllocationAttributes() = default;
 
   AllocationAttributes(bool retry_on_failure, bool allocation_will_be_logged,
-                       std::function<uint64_t()>* freed_by_func)
+                       std::function<uint64_t()>* freed_by_func,
+                       AllocationEnd allocation_end = AllocationEnd::kLower)
       : retry_on_failure(retry_on_failure),
         allocation_will_be_logged(allocation_will_be_logged),
-        freed_by_func(freed_by_func) {}
+        freed_by_func(freed_by_func),
+        allocation_end(allocation_end) {}
 
   // If the first attempt to allocate the memory fails, the allocation should
   // wait and retry (with a timeout).
@@ -59,6 +81,11 @@ struct AllocationAttributes {
   // returned.
   std::function<uint64_t()>* freed_by_func = nullptr;  // Not owned.
 
+  // Which end of the allocator's pre-allocated address range to serve this
+  // request from. Only honored by allocators configured for spatial
+  // partitioning.
+  AllocationEnd allocation_end = AllocationEnd::kLower;
+
   AllocationAttributes(const AllocationAttributes&) = delete;
   void operator=(const AllocationAttributes&) = delete;
 };
@@ -251,7 +278,7 @@ class AllocatorWrapper : public Allocator {
  public:
   explicit AllocatorWrapper(Allocator* wrapped) : wrapped_(wrapped) {}
 
-  ~AllocatorWrapper() override {}
+  ~AllocatorWrapper() override = default;
 
   // Returns the wrapped allocator to which all calls are delegated.
   Allocator* wrapped() const { return wrapped_; }
@@ -399,7 +426,7 @@ class SubAllocator {
   SubAllocator(const std::vector<Visitor>& alloc_visitors,
                const std::vector<Visitor>& free_visitors);
 
-  virtual ~SubAllocator() {}
+  virtual ~SubAllocator() = default;
   // Allocates at least num_bytes. Returns actual number of bytes allocated in
   // bytes_received. The caller can safely use the full bytes_received sized
   // buffer following the returned pointer.
diff --git a/xla/tsl/framework/bfc_allocator.cc b/xla/tsl/framework/bfc_allocator.cc
index 519014b972172..d14c96c8b4688 100644
--- a/xla/tsl/framework/bfc_allocator.cc
+++ b/xla/tsl/framework/bfc_allocator.cc
@@ -60,11 +60,16 @@ BFCAllocator::BFCAllocator(std::unique_ptr<SubAllocator> sub_allocator,
                            size_t total_memory, const std::string& name,
                            const Options& opts)
     : opts_(opts),
+      free_chunk_tag_(opts.enable_spatial_partitioning ? ChunkTag::kCentralGap
+                                                       : ChunkTag::kLower),
       coalesce_regions_(sub_allocator->SupportsCoalescing()),
       sub_allocator_(std::move(sub_allocator)),
       name_(name),
-      free_chunks_list_(kInvalidChunkHandle),
+      unused_chunk_handle_head_(kInvalidChunkHandle),
       next_allocation_id_(1) {
+  CHECK(!opts.enable_spatial_partitioning || !opts.allow_growth)  // Crash OK
+      << "Spatial partitioning requires a single fixed address range "
+         "(allow_growth=false).";
   if (opts.allow_growth) {
     // 2MiB smallest initial allocation, unless total memory available
     // is less.
@@ -83,6 +88,13 @@ BFCAllocator::BFCAllocator(std::unique_ptr<SubAllocator> sub_allocator,
   memory_limit_ = total_memory;
   stats_.bytes_limit = static_cast<int64_t>(total_memory);
 
+  // Cap on how much a chunk may exceed the requested size before we split it.
+  // If the user did not set a fraction, default to 128MiB (128 << 20 bytes).
+  max_internal_fragmentation_bytes_ =
+      (opts.fragmentation_fraction > 0.0)
+          ? opts.fragmentation_fraction * memory_limit_
+          : 128 << 20;
+
   // Create a bunch of bins of various good sizes.
 
   // We create bins to fit all possible ranges that cover the
@@ -210,6 +222,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
   c->prev = kInvalidChunkHandle;
   c->next = kInvalidChunkHandle;
   c->freed_at_count = 0;
+  c->tag = free_chunk_tag_;
 
   region_manager_.set_handle(c->ptr, h);
 
@@ -228,17 +241,18 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
     prev_chunk->next = h;
   }
 
-  // Maybe merge adjacent chunks and insert the chunk into the right bin.
-  InsertFreeChunkIntoBin(TryToCoalesce(h, /*ignore_freed_at=*/false));
+  // Maybe merge adjacent chunks and insert the chunk into the right free
+  // structure. In spatial mode, a fresh region becomes the central gap.
+  InsertFreeChunk(TryToCoalesce(h, /*ignore_freed_at=*/false));
 
   return true;
 }
 
 BFCAllocator::ChunkHandle BFCAllocator::AllocateChunk() {
-  if (free_chunks_list_ != kInvalidChunkHandle) {
-    ChunkHandle h = free_chunks_list_;
+  if (unused_chunk_handle_head_ != kInvalidChunkHandle) {
+    ChunkHandle h = unused_chunk_handle_head_;
     Chunk* c = ChunkFromHandle(h);
-    free_chunks_list_ = c->next;
+    unused_chunk_handle_head_ = c->next;
     return h;
   }
   ChunkHandle h = chunks_.size();
@@ -250,8 +264,8 @@ void BFCAllocator::DeallocateChunk(ChunkHandle h) {
   Chunk* c = ChunkFromHandle(h);
   c->allocation_id = -1;
   c->bin_num = kInvalidBinNum;
-  c->next = free_chunks_list_;
-  free_chunks_list_ = h;
+  c->next = unused_chunk_handle_head_;
+  unused_chunk_handle_head_ = h;
 }
 
 void* BFCAllocator::AllocateRawInternalWithRetry(
@@ -263,7 +277,8 @@ void* BFCAllocator::AllocateRawInternalWithRetry(
     freed_by_count = (*allocation_attr.freed_by_func)();
   }
 
-  void* r = AllocateRawInternal(alignment, num_bytes, false, freed_by_count);
+  void* r = AllocateRawInternal(alignment, num_bytes, false, freed_by_count,
+                                allocation_attr.allocation_end);
   if (ABSL_PREDICT_TRUE(r != nullptr)) {
     return r;
   }
@@ -275,7 +290,8 @@ void* BFCAllocator::AllocateRawInternalWithRetry(
         if (allocation_attr.freed_by_func != nullptr) {
           freed_by_count = (*allocation_attr.freed_by_func)();
         }
-        return AllocateRawInternal(a, nb, v, freed_by_count);
+        return AllocateRawInternal(a, nb, v, freed_by_count,
+                                   allocation_attr.allocation_end);
       },
       kMaxMillisToWait, alignment, num_bytes);
   return r;
@@ -285,6 +301,12 @@ void* BFCAllocator::AllocateRaw(size_t alignment, size_t num_bytes,
                                 const AllocationAttributes& allocation_attr) {
   VLOG(3) << "AllocateRaw " << Name() << " " << num_bytes
           << " alignment=" << alignment;
+  // Only spatially partitioned allocators serve upper-end requests; everything
+  // else must leave allocation_end at its kLower default. This lets the
+  // allocation path skip partitioning branches: with allocation_end always
+  // kLower, requests only ever land in AllocateChunkFromLowEnd.
+  DCHECK(opts_.enable_spatial_partitioning ||
+         allocation_attr.allocation_end == AllocationEnd::kLower);
   void* result = [&] {
     if (!opts_.allow_retry_on_failure || !allocation_attr.retry_on_failure) {
       // If we have globally disabled retry-on-failure and fail to allocate an
@@ -308,8 +330,9 @@ void* BFCAllocator::AllocateRaw(size_t alignment, size_t num_bytes,
       if (allocation_attr.freed_by_func != nullptr) {
         freed_by_count = (*allocation_attr.freed_by_func)();
       }
-      void* res = AllocateRawInternal(alignment, num_bytes, dump_log_on_failure,
-                                      freed_by_count);
+      void* res =
+          AllocateRawInternal(alignment, num_bytes, dump_log_on_failure,
+                              freed_by_count, allocation_attr.allocation_end);
       if (res == nullptr) {
         int32_t counter_value = log_counter.load(std::memory_order_relaxed);
         if (counter_value < kMaxFailureLogs) {
@@ -317,6 +340,7 @@ void* BFCAllocator::AllocateRaw(size_t alignment, size_t num_bytes,
           LOG(WARNING)
               << "Allocator (" << Name() << ") ran out of memory trying "
               << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
+              << " from the " << allocation_attr.allocation_end << " end"
               << " with freed_by_count=" << freed_by_count << "."
               << (!allocation_attr.retry_on_failure
                       ? " The caller indicates that this is not a failure, but"
@@ -335,7 +359,6 @@ void* BFCAllocator::AllocateRaw(size_t alignment, size_t num_bytes,
   return result;
 }
 
-// static
 size_t BFCAllocator::RoundedBytes(size_t bytes) {
   size_t rounded_bytes =
       (kMinAllocationSize *
@@ -344,6 +367,35 @@ size_t BFCAllocator::RoundedBytes(size_t bytes) {
   return rounded_bytes;
 }
 
+// Rounds `ptr` up to a multiple of `alignment`, which must be a power of 2.
+uintptr_t BFCAllocator::AlignUp(uintptr_t ptr, size_t alignment) {
+  CHECK(absl::has_single_bit(alignment))
+      << "alignment must be a power of 2, got " << alignment;
+  return (ptr + (alignment - 1)) & ~(static_cast<uintptr_t>(alignment) - 1);
+}
+
+// Rounds `ptr` down to a multiple of `alignment`, which must be a power of 2.
+uintptr_t BFCAllocator::AlignDown(uintptr_t ptr, size_t alignment) {
+  CHECK(absl::has_single_bit(alignment))
+      << "alignment must be a power of 2, got " << alignment;
+  return ptr & ~(static_cast<uintptr_t>(alignment) - 1);
+}
+
+// Gap from `chunk_start` to its next aligned address, via RoundedBytes().
+size_t BFCAllocator::LowEndAlignmentPadding(uintptr_t chunk_start,
+                                            size_t alignment) {
+  return RoundedBytes(AlignUp(chunk_start, alignment) - chunk_start);
+}
+
+// AlignDown of (chunk end - rounded_bytes); 0 when the chunk is too small.
+uintptr_t BFCAllocator::HighEndAlignedStart(uintptr_t chunk_start,
+                                            size_t chunk_size,
+                                            size_t rounded_bytes,
+                                            size_t alignment) {
+  if (ABSL_PREDICT_FALSE(chunk_size < rounded_bytes)) return 0;
+  return AlignDown(chunk_start + chunk_size - rounded_bytes, alignment);
+}
+
 bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes)
     ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
   // Do nothing if garbage collection is off.
@@ -416,12 +468,12 @@ void BFCAllocator::DeallocateRegions(
     }
 
     VLOG(2) << "Deallocate region with ptr = " << it->ptr();
-    // Remove all chunk registrations from Bins.
+    // Remove all chunk registrations from free structures.
     ChunkHandle h = region_manager_.get_handle(it->ptr());
     while (h != kInvalidChunkHandle) {
       const Chunk* c = ChunkFromHandle(h);
-      if (c->bin_num != kInvalidBinNum) {
-        RemoveFreeChunkFromBin(h);
+      if (!c->in_use()) {
+        RemoveFreeChunk(h);
       }
       auto h_to_delete = h;
       h = c->next;
@@ -437,7 +489,8 @@ void BFCAllocator::DeallocateRegions(
 
 void* BFCAllocator::AllocateRawInternal(size_t alignment, size_t num_bytes,
                                         bool dump_log_on_failure,
-                                        uint64_t freed_before) {
+                                        uint64_t freed_before,
+                                        AllocationEnd allocation_end) {
   if (ABSL_PREDICT_FALSE(num_bytes == 0)) {
     VLOG(2) << "tried to allocate 0 bytes";
     return nullptr;
@@ -460,8 +513,8 @@ void* BFCAllocator::AllocateRawInternal(size_t alignment, size_t num_bytes,
     // Merge timestamped chunks whose counts have become safe for general use.
     MergeTimestampedChunks(0);
   }
-  void* ptr =
-      FindChunkPtr(bin_num, rounded_bytes, num_bytes, alignment, freed_before);
+  void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, alignment,
+                           freed_before, allocation_end);
   if (ABSL_PREDICT_TRUE(ptr != nullptr)) {
     AddTraceMe("MemoryAllocation", ptr);
     return ptr;
@@ -470,7 +523,7 @@ void* BFCAllocator::AllocateRawInternal(size_t alignment, size_t num_bytes,
   // Try to extend
   if (Extend(alignment, rounded_bytes)) {
     ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, alignment,
-                       freed_before);
+                       freed_before, allocation_end);
     if (ptr != nullptr) {
       AddTraceMe("MemoryAllocation", ptr);
       return ptr;
@@ -484,7 +537,7 @@ void* BFCAllocator::AllocateRawInternal(size_t alignment, size_t num_bytes,
     // size is formed.
     if (MergeTimestampedChunks(rounded_bytes)) {
       ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, alignment,
-                         freed_before);
+                         freed_before, allocation_end);
       if (ptr != nullptr) {
         AddTraceMe("MemoryAllocation", ptr);
         return ptr;
@@ -499,7 +552,7 @@ void* BFCAllocator::AllocateRawInternal(size_t alignment, size_t num_bytes,
   if (DeallocateFreeRegions(rounded_bytes) &&
       Extend(alignment, rounded_bytes)) {
     ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, alignment,
-                       freed_before);
+                       freed_before, allocation_end);
     if (ptr != nullptr) {
       AddTraceMe("MemoryAllocation", ptr);
       return ptr;
@@ -514,20 +567,20 @@ void* BFCAllocator::AllocateRawInternal(size_t alignment, size_t num_bytes,
     LOG(WARNING)
         << "Allocator (" << Name() << ") ran out of memory trying "
         << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
-        << " (rounded to " << rounded_bytes << ")" << "requested by op "
+        << " (rounded to " << rounded_bytes << ")"
+        << " from the " << allocation_end << " end requested by op "
         << tsl::profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation()
                .pending_op_name
-        << "\nIf the cause is memory fragmentation maybe the environment "
-        << "variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will "
-        << "improve the situation. \nCurrent allocation summary follows."
-        << "\nCurrent allocation summary follows.";
-    DumpMemoryLog(rounded_bytes);
+        << "\nIf this is caused by memory fragmentation, the environment "
+        << "variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' may improve "
+        << "the situation.\nCurrent allocation summary follows.";
+    DumpMemoryLog(rounded_bytes, allocation_end);
     LOG(WARNING) << RenderOccupancy();
   }
   return nullptr;
 }
 
-int64_t BFCAllocator::LargestFreeChunk() {
+size_t BFCAllocator::LargestBinnedFreeChunk() {
   for (int i = kNumBins - 1; i >= 0; i--) {
     if (!BinFromIndex(i)->free_chunks.empty()) {
       return ChunkFromHandle(*BinFromIndex(i)->free_chunks.rbegin())->size;
@@ -536,10 +589,34 @@ int64_t BFCAllocator::LargestFreeChunk() {
   return 0;
 }
 
+// Size of the largest binned free chunk tagged for `allocation_end`, else 0.
+size_t BFCAllocator::LargestBinnedFreeChunk(AllocationEnd allocation_end) {
+  const ChunkTag wanted_tag = ChunkTagOf(allocation_end);
+  for (int bin_index = kNumBins - 1; bin_index >= 0; --bin_index) {
+    auto& candidates = BinFromIndex(bin_index)->free_chunks;
+    for (auto rit = candidates.rbegin(); rit != candidates.rend(); ++rit) {
+      if (const Chunk* c = ChunkFromHandle(*rit); c->tag == wanted_tag) {
+        return c->size;
+      }
+    }
+  }
+  return 0;
+}
+
+// Largest free chunk overall. The central gap is tracked outside the bins, so
+// it is compared against the best binned chunk here.
+size_t BFCAllocator::LargestFreeChunk() {
+  const bool has_gap = central_gap_ != kInvalidChunkHandle;
+  const size_t gap_bytes = has_gap ? ChunkFromHandle(central_gap_)->size : 0;
+  return std::max(LargestBinnedFreeChunk(), gap_bytes);
+}
+
 double BFCAllocator::GetFragmentation() {
   int64_t bytes_available = *stats_.pool_bytes - stats_.bytes_in_use;
   DCHECK_GE(bytes_available, 0);
-  return static_cast<double>(bytes_available - LargestFreeChunk()) /
+  if (bytes_available == 0) return 0.0;  // Full pool: avoid 0/0 (NaN).
+  const int64_t largest = static_cast<int64_t>(LargestFreeChunk());
+  return static_cast<double>(bytes_available - largest) /
          bytes_available;
 }
 
@@ -580,115 +657,223 @@ void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
 
 void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
                                  size_t num_bytes, size_t alignment,
-                                 uint64_t freed_before) {
-  // First identify the first bin that could satisfy rounded_bytes.
-  for (; bin_num < kNumBins; bin_num++) {
-    // Start searching from the first bin for the smallest chunk that fits
-    // rounded_bytes.
-    Bin* b = BinFromIndex(bin_num);
-    for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end();
-         ++citer) {
-      BFCAllocator::ChunkHandle h = (*citer);
-      BFCAllocator::Chunk* chunk = ChunkFromHandle(h);
+                                 uint64_t freed_before,
+                                 AllocationEnd allocation_end) {
+  // Spatial partitioning keeps three contiguous spans by address:
+  //
+  //   [ kLower (grows up) ][ kCentralGap ][ kUpper (grows down) ]
+  //
+  // A request first reuses a free hole with its own tag from the size bins.
+  // Only if no same-tag hole fits does it carve from the one central gap.
+  // Because neither end can create or consume the other end's interior holes,
+  // lower placements are independent of upper activity and upper placements are
+  // independent of lower activity, except when lower and upper allocations
+  // exhaust the central gap.
+  if (void* ptr = FindTaggedChunkPtr(bin_num, rounded_bytes, num_bytes,
+                                     alignment, freed_before, allocation_end)) {
+    return ptr;
+  }
+  return FindChunkPtrInCentralGap(rounded_bytes, num_bytes, alignment,
+                                  freed_before, allocation_end);
+}
+
+void* BFCAllocator::FindTaggedChunkPtr(BinNum bin_num, size_t rounded_bytes,
+                                       size_t num_bytes, size_t alignment,
+                                       uint64_t freed_before,
+                                       AllocationEnd allocation_end) {
+  const ChunkTag requested_tag = ChunkTagOf(allocation_end);
+  for (BinNum bn = bin_num; bn < kNumBins; bn++) {
+    Bin* b = BinFromIndex(bn);
+    for (ChunkHandle h : b->free_chunks) {
+      Chunk* chunk = ChunkFromHandle(h);
       DCHECK(!chunk->in_use());
+      if (ABSL_PREDICT_FALSE(chunk->tag != requested_tag)) {
+        continue;
+      }
       if (ABSL_PREDICT_FALSE(freed_before > 0) &&
           freed_before < chunk->freed_at_count) {
         continue;
       }
-
-      // Compute how many bytes we need to skip at the front of this chunk
-      // to reach the requested alignment boundary.
-      uintptr_t ptr_int = absl::bit_cast<uintptr_t>(chunk->ptr);
-      size_t align_padding =
-          (alignment - (ptr_int & (alignment - 1))) % alignment;
-      // Round padding up to kMinAllocationSize so the prefix chunk is valid.
-      align_padding = RoundedBytes(align_padding);
-
-      if (chunk->size >= rounded_bytes + align_padding) {
-        // We found an existing chunk that fits us that wasn't in use, so remove
-        // it from the free bin structure prior to using.
-        RemoveFreeChunkIterFromBin(&b->free_chunks, citer);
-
-        // If alignment requires it, split off the unaligned prefix as a
-        // separate free chunk.
-        if (align_padding > 0) {
-          SplitChunk(h, align_padding);
-          // After splitting, h still points to the prefix chunk (size =
-          // align_padding). The new aligned chunk is h's next and was
-          // inserted into a free bin by SplitChunk.
-          chunk = ChunkFromHandle(h);
-          // Put the prefix back into the free bin.
-          InsertFreeChunkIntoBin(h);
-          // Advance to the aligned chunk and remove it from its free bin
-          // so we can use it (and potentially split it again below).
-          h = chunk->next;
-          chunk = ChunkFromHandle(h);
-          RemoveFreeChunkFromBin(h);
+      const uintptr_t chunk_start = absl::bit_cast<uintptr_t>(chunk->ptr);
+      if (ABSL_PREDICT_FALSE(allocation_end == AllocationEnd::kUpper)) {
+        const uintptr_t aligned_start = HighEndAlignedStart(
+            chunk_start, chunk->size, rounded_bytes, alignment);
+        if (ABSL_PREDICT_FALSE(aligned_start < chunk_start)) {
+          continue;
         }
-
-        // If we can break the size of the chunk into two reasonably large
-        // pieces, do don't waste more than max_internal_fragmentation_bytes on
-        // padding. If this threshold is not set by the user, then use 128MB as
-        // the default.
-        const int64_t max_internal_fragmentation_bytes =
-            (opts_.fragmentation_fraction > 0.0)
-                ? opts_.fragmentation_fraction * memory_limit_
-                : 128 << 20;
-
-        if (chunk->size >= rounded_bytes * 2 ||
-            static_cast<int64_t>(chunk->size) - rounded_bytes >=
-                max_internal_fragmentation_bytes) {
-          SplitChunk(h, rounded_bytes);
-          chunk = ChunkFromHandle(h);  // Update chunk pointer in case it moved
+      } else {
+        const size_t align_padding =
+            LowEndAlignmentPadding(chunk_start, alignment);
+        if (ABSL_PREDICT_FALSE(chunk->size < rounded_bytes + align_padding)) {
+          continue;
         }
+      }
+      RemoveFreeChunkFromBin(h);
+      return allocation_end == AllocationEnd::kUpper
+                 ? AllocateChunkFromHighEnd(h, rounded_bytes, num_bytes,
+                                            alignment)
+                 : AllocateChunkFromLowEnd(h, rounded_bytes, num_bytes,
+                                           alignment);
+    }
+  }
 
-        // The requested size of the returned chunk is what the user
-        // has allocated.
-        chunk->requested_size = num_bytes;
-        // Assign a unique id and increment the id counter, marking the
-        // chunk as being in use.
-        chunk->allocation_id = next_allocation_id_++;
-
-        // Update stats.
-        ++stats_.num_allocs;
-        stats_.bytes_in_use += chunk->size;
-        if (stats_.bytes_in_use > stats_.peak_bytes_in_use) {
-          VLOG(2) << "New Peak memory usage of " << stats_.bytes_in_use
-                  << " bytes for " << Name();
-        }
-        stats_.peak_bytes_in_use =
-            std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
-        stats_.largest_alloc_size =
-            std::max<std::size_t>(stats_.largest_alloc_size, chunk->size);
+  return nullptr;
+}
 
-#ifdef TENSORFLOW_MEM_DEBUG
-        if (ShouldRecordOpName()) {
-          const auto& annotation =
-              profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation();
-          if (!annotation.pending_op_name.empty()) {
-            chunk->op_name = annotation.pending_op_name;
-          } else {
-            LOG(INFO) << "missing pending_op_name for " << Name() << "\n"
-                      << CurrentStackTrace();
-            chunk->op_name = nullptr;
-          }
-          chunk->action_count = ++action_counter_;
-          chunk->step_id = annotation.pending_step_id;
-          int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
-          size_history_[slot] = stats_.bytes_in_use;
-        }
-#endif
+void* BFCAllocator::FindChunkPtrInCentralGap(size_t rounded_bytes,
+                                             size_t num_bytes, size_t alignment,
+                                             uint64_t freed_before,
+                                             AllocationEnd allocation_end) {
+  if (ABSL_PREDICT_FALSE(central_gap_ == kInvalidChunkHandle)) {
+    return nullptr;
+  }
 
-        VLOG(4) << "Returning: " << chunk->ptr;
-        if (VLOG_IS_ON(4)) {
-          LOG(INFO) << "A: " << RenderOccupancy();
-        }
-        return chunk->ptr;
-      }
+  ChunkHandle h = central_gap_;
+  Chunk* chunk = ChunkFromHandle(h);
+  DCHECK(!chunk->in_use());
+  DCHECK_EQ(chunk->tag, ChunkTag::kCentralGap);
+  if (ABSL_PREDICT_FALSE(freed_before > 0) &&
+      freed_before < chunk->freed_at_count) {
+    return nullptr;
+  }
+
+  const uintptr_t chunk_start = absl::bit_cast<uintptr_t>(chunk->ptr);
+  if (ABSL_PREDICT_FALSE(allocation_end == AllocationEnd::kUpper)) {
+    const uintptr_t aligned_start =
+        HighEndAlignedStart(chunk_start, chunk->size, rounded_bytes, alignment);
+    if (ABSL_PREDICT_FALSE(aligned_start < chunk_start)) {
+      return nullptr;
+    }
+  } else {
+    const size_t align_padding = LowEndAlignmentPadding(chunk_start, alignment);
+    if (ABSL_PREDICT_FALSE(chunk->size < rounded_bytes + align_padding)) {
+      return nullptr;
     }
   }
 
-  return nullptr;
+  RemoveFreeChunk(h);
+  return allocation_end == AllocationEnd::kUpper
+             ? AllocateChunkFromHighEnd(h, rounded_bytes, num_bytes, alignment)
+             : AllocateChunkFromLowEnd(h, rounded_bytes, num_bytes, alignment);
+}
+
+// Carves an allocation from the low end of free chunk `h`.
+void* BFCAllocator::AllocateChunkFromLowEnd(ChunkHandle h, size_t rounded_bytes,
+                                            size_t num_bytes,
+                                            size_t alignment) {
+  Chunk* chunk = ChunkFromHandle(h);
+
+  // If alignment requires it, split off the unaligned prefix as a separate free
+  // chunk. For a central-gap carve, that prefix falls below the new lower
+  // allocation and therefore becomes a lower-owned interior hole.
+  uintptr_t ptr_int = absl::bit_cast<uintptr_t>(chunk->ptr);
+  const size_t align_padding = LowEndAlignmentPadding(ptr_int, alignment);
+  if (ABSL_PREDICT_FALSE(align_padding > 0)) {
+    SplitChunk(h, align_padding);
+    chunk = ChunkFromHandle(h);
+    ChunkHandle aligned_h = chunk->next;
+    RemoveFreeChunk(aligned_h);
+    chunk->tag = ChunkTag::kLower;
+    InsertFreeChunk(h);
+    h = aligned_h;
+    chunk = ChunkFromHandle(h);
+  }
+
+  // Split off the tail only if the chunk is at least twice the request or the
+  // slack reaches max_internal_fragmentation_bytes_; otherwise the slack stays
+  // in this allocation. A split-off tail keeps the source chunk's tag.
+  if (chunk->size >= rounded_bytes * 2 ||
+      static_cast<int64_t>(chunk->size) - rounded_bytes >=
+          max_internal_fragmentation_bytes_) {
+    SplitChunk(h, rounded_bytes);
+    chunk = ChunkFromHandle(h);  // Update chunk pointer in case it moved.
+  }
+
+  // Retag the in-use chunk; a central-gap carve reaches here as kCentralGap.
+  chunk->tag = ChunkTag::kLower;
+  FinishChunkAllocation(chunk, num_bytes);
+  return chunk->ptr;
+}
+
+// Carves an allocation from the high end of free chunk `h` (CHECKs it fits).
+void* BFCAllocator::AllocateChunkFromHighEnd(ChunkHandle h,
+                                             size_t rounded_bytes,
+                                             size_t num_bytes,
+                                             size_t alignment) {
+  Chunk* chunk = ChunkFromHandle(h);
+  uintptr_t chunk_start = absl::bit_cast<uintptr_t>(chunk->ptr);
+  const uintptr_t aligned_start =
+      HighEndAlignedStart(chunk_start, chunk->size, rounded_bytes, alignment);
+  CHECK_GE(aligned_start, chunk_start);  // Crash OK
+  const size_t prefix_size = aligned_start - chunk_start;
+
+  // Split off everything below the aligned start as a free prefix so the
+  // allocation lands at the high end of the chunk. The prefix keeps the source
+  // chunk's tag: an upper hole stays upper, a central-gap carve stays the gap.
+  if (ABSL_PREDICT_TRUE(prefix_size > 0)) {
+    SplitChunk(h, prefix_size);
+    ChunkHandle aligned_h = ChunkFromHandle(h)->next;
+    RemoveFreeChunk(aligned_h);
+    InsertFreeChunk(h);
+    h = aligned_h;
+    chunk = ChunkFromHandle(h);
+  }
+
+  // HighEndAlignedStart aligns down, which can leave bytes above the
+  // allocation. Tag first so SplitChunk hands that free suffix kUpper.
+  chunk->tag = ChunkTag::kUpper;
+  if (chunk->size > rounded_bytes) {
+    SplitChunk(h, rounded_bytes);
+    chunk = ChunkFromHandle(h);
+  }
+
+  // Already kUpper from the assignment above; restated for the in-use chunk.
+  chunk->tag = ChunkTag::kUpper;
+  FinishChunkAllocation(chunk, num_bytes);
+  return chunk->ptr;
+}
+
+void BFCAllocator::FinishChunkAllocation(Chunk* chunk, size_t num_bytes) {
+  // The requested size of the returned chunk is what the user has allocated.
+  chunk->requested_size = num_bytes;
+  // Assign a unique id and increment the id counter, marking the chunk as being
+  // in use.
+  chunk->allocation_id = next_allocation_id_++;
+
+  // Update stats.
+  ++stats_.num_allocs;
+  stats_.bytes_in_use += chunk->size;
+  if (stats_.bytes_in_use > stats_.peak_bytes_in_use) {
+    VLOG(2) << "New Peak memory usage of " << stats_.bytes_in_use
+            << " bytes for " << Name();
+  }
+  stats_.peak_bytes_in_use =
+      std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
+  stats_.largest_alloc_size =
+      std::max<std::size_t>(stats_.largest_alloc_size, chunk->size);
+
+#ifdef TENSORFLOW_MEM_DEBUG
+  if (ShouldRecordOpName()) {
+    const auto& annotation =
+        profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation();
+    if (!annotation.pending_op_name.empty()) {
+      chunk->op_name = annotation.pending_op_name;
+    } else {
+      LOG(INFO) << "missing pending_op_name for " << Name() << "\n"
+                << CurrentStackTrace();
+      chunk->op_name = nullptr;
+    }
+    chunk->action_count = ++action_counter_;
+    chunk->step_id = annotation.pending_step_id;
+    int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
+    size_history_[slot] = stats_.bytes_in_use;
+  }
+#endif
+
+  VLOG(4) << "Returning: " << chunk->ptr;
+  if (VLOG_IS_ON(4)) {
+    LOG(INFO) << "A: " << RenderOccupancy();
+  }
 }
 
 void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
@@ -713,6 +898,9 @@ void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
   // It inherits the freed time.
   new_chunk->freed_at_count = c->freed_at_count;
 
+  // It inherits the tag; callers update the in-use piece after splitting.
+  new_chunk->tag = c->tag;
+
   // Maintain the pointers.
   // c <-> c_neighbor becomes
   // c <-> new_chunk <-> c_neighbor
@@ -725,8 +913,8 @@ void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
     c_neighbor->prev = h_new_chunk;
   }
 
-  // Add the newly free chunk to the free bin.
-  InsertFreeChunkIntoBin(h_new_chunk);
+  // Add the newly free chunk to the appropriate free structure.
+  InsertFreeChunk(h_new_chunk);
 }
 
 void BFCAllocator::DeallocateRaw(void* ptr) {
@@ -759,13 +947,13 @@ void BFCAllocator::DeallocateRawInternal(void* ptr) {
 
   // Consider coalescing it.
   if (ABSL_PREDICT_FALSE(timing_counter_ != nullptr)) {
-    InsertFreeChunkIntoBin(h);
+    InsertFreeChunk(h);
     timestamped_chunks_.push_back(h);
   } else {
-    InsertFreeChunkIntoBin(TryToCoalesce(h, false));
+    InsertFreeChunk(TryToCoalesce(h, false));
   }
 
-  // TraceMe needs to be added after MarkFree and InsertFreeChunkIntoBin for
+  // TraceMe needs to be added after MarkFree and InsertFreeChunk for
   // correct aggregation stats (bytes_in_use, fragmentation).
   AddTraceMe("MemoryDeallocation", chunk_ptr, req_bytes, alloc_bytes);
 
@@ -774,10 +962,19 @@ void BFCAllocator::DeallocateRawInternal(void* ptr) {
   }
 }
 
+BFCAllocator::ChunkTag BFCAllocator::MergedChunkTag(ChunkTag a,
+                                                    ChunkTag b) const {
+  // Two free holes with the same tag keep that tag (an interior hole still
+  // belongs to its end). Any other combination -- a hole merging with the
+  // central gap, or lower-end and upper-end holes meeting after the gap is
+  // exhausted -- yields a kCentralGap span reusable by either end.
+  return a == b ? a : ChunkTag::kCentralGap;
+}
+
 // Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is c1.
 // We merge Chunk(h2) into Chunk(h1).
-void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1,
-                         BFCAllocator::ChunkHandle h2) {
+void BFCAllocator::MergeChunks(BFCAllocator::ChunkHandle h1,
+                               BFCAllocator::ChunkHandle h2) {
   Chunk* c1 = ChunkFromHandle(h1);
   Chunk* c2 = ChunkFromHandle(h2);
   // We can only merge chunks that are not in use.
@@ -802,6 +999,9 @@ void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1,
   // Set the new size
   c1->size += c2->size;
 
+  // Combine tags: merging with the central gap grows the gap.
+  c1->tag = MergedChunkTag(c1->tag, c2->tag);
+
   // Pick latest free time.
   c1->freed_at_count = std::max(c1->freed_at_count, c2->freed_at_count);
 
@@ -816,15 +1016,54 @@ void BFCAllocator::DeleteChunk(ChunkHandle h) {
   DeallocateChunk(h);
 }
 
-void BFCAllocator::InsertFreeChunkIntoBin(BFCAllocator::ChunkHandle h) {
+void BFCAllocator::InsertFreeChunk(BFCAllocator::ChunkHandle h) {
   Chunk* c = ChunkFromHandle(h);
-  CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
+  CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));  // Crash OK
+  if (c->tag == ChunkTag::kCentralGap) {
+    CHECK_EQ(central_gap_, kInvalidChunkHandle)  // Crash OK
+        << "spatial partitioning expects one central gap";
+    central_gap_ = h;
+    return;
+  }
+  // Central gap handled above; every other free chunk goes into a size bin.
   BinNum bin_num = BinNumForSize(c->size);
   Bin* new_bin = BinFromIndex(bin_num);
   c->bin_num = bin_num;
   new_bin->free_chunks.insert(h);
 }
 
+void BFCAllocator::RemoveFreeChunk(BFCAllocator::ChunkHandle h) {
+  Chunk* c = ChunkFromHandle(h);
+  CHECK(!c->in_use());  // Crash OK
+  if (c->tag == ChunkTag::kCentralGap && c->bin_num == kInvalidBinNum) {
+    CHECK_EQ(central_gap_, h);  // Crash OK
+    central_gap_ = kInvalidChunkHandle;
+    return;
+  }
+  RemoveFreeChunkFromBin(h);
+}
+
+// No-op unless free_chunk_tag_ is kCentralGap. Otherwise looks at the neighbor
+// on the side of `h` that faces the central gap (`next` for a kLower chunk,
+// `prev` for a kUpper chunk); if it is absent or carries a different tag, `h`
+// is retagged kCentralGap. `h` must be free.
+void BFCAllocator::ReturnBoundaryChunkToGap(BFCAllocator::ChunkHandle h) {
+  if (ABSL_PREDICT_TRUE(free_chunk_tag_ != ChunkTag::kCentralGap)) {
+    return;
+  }
+  Chunk* freed = ChunkFromHandle(h);
+  CHECK(!freed->in_use());  // Crash OK
+  if (freed->tag != ChunkTag::kLower && freed->tag != ChunkTag::kUpper) {
+    return;
+  }
+  const ChunkHandle gap_side =
+      freed->tag == ChunkTag::kLower ? freed->next : freed->prev;
+  if (gap_side == kInvalidChunkHandle ||
+      ChunkFromHandle(gap_side)->tag != freed->tag) {
+    freed->tag = ChunkTag::kCentralGap;
+  }
+}
+
 void BFCAllocator::RemoveFreeChunkIterFromBin(
     BFCAllocator::Bin::FreeChunkSet* free_chunks,
     const BFCAllocator::Bin::FreeChunkSet::iterator& citer) {
@@ -850,9 +1089,16 @@ void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) {
   // Mark the chunk as no longer in use.
   c->allocation_id = -1;
 
-  // Optionally record the free time.
+  // Optionally record the free time. Timestamped chunks are kept in their
+  // original lower/upper tag until they become safe to merge; otherwise a
+  // pending boundary free could create a second central gap.
   if (ABSL_PREDICT_FALSE(timing_counter_ != nullptr)) {
     c->freed_at_count = timing_counter_->next();
+  } else {
+    c->freed_at_count = 0;
+  }
+  if (ABSL_PREDICT_TRUE(c->freed_at_count == 0)) {
+    ReturnBoundaryChunkToGap(h);
   }
 
   // Updates the stats.
@@ -880,8 +1126,8 @@ BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h,
     Chunk* n = ChunkFromHandle(c->next);
     if ((n->freed_at_count == 0) || ignore_freed_at) {
       VLOG(4) << "Merging c->next " << n->ptr << " with c " << c->ptr;
-      RemoveFreeChunkFromBin(c->next);
-      Merge(h, c->next);
+      RemoveFreeChunk(c->next);
+      MergeChunks(h, c->next);
     }
   }
 
@@ -891,8 +1137,8 @@ BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h,
     if ((n->freed_at_count == 0) || ignore_freed_at) {
       VLOG(4) << "Merging c " << c->ptr << " into c->prev " << n->ptr;
       coalesced_chunk = c->prev;
-      RemoveFreeChunkFromBin(c->prev);
-      Merge(c->prev, h);
+      RemoveFreeChunk(c->prev);
+      MergeChunks(c->prev, h);
     }
   }
 
@@ -966,9 +1212,12 @@ bool BFCAllocator::MergeTimestampedChunks(size_t required_bytes) {
       Chunk* c = ChunkFromHandle(h);
       DCHECK_NE(c->bin_num, kInvalidBinNum);
       DCHECK(!c->in_use());
-      RemoveFreeChunkFromBin(h);
+      RemoveFreeChunk(h);
+      if (c->freed_at_count == 0 || required_bytes > 0) {
+        ReturnBoundaryChunkToGap(h);
+      }
       ChunkHandle new_h = TryToCoalesce(h, (required_bytes > 0));
-      InsertFreeChunkIntoBin(new_h);
+      InsertFreeChunk(new_h);
       if (required_bytes > 0) {
         c = ChunkFromHandle(new_h);
         if (new_h != h && c->freed_at_count > 0) {
@@ -1087,7 +1336,8 @@ std::string BFCAllocator::RenderOccupancy() {
   return rendered;
 }
 
-void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
+void BFCAllocator::DumpMemoryLog(size_t num_bytes,
+                                 AllocationEnd allocation_end) {
   const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info();
   LOG(INFO) << "BFCAllocator dump for " << Name();
   for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
@@ -1135,6 +1385,7 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
       std::string buf = absl::StrCat(
           (c->in_use() ? "InUse" : "Free "), " at ",
           absl::Hex(absl::bit_cast<uintptr_t>(c->ptr)), " of size ", c->size);
+      absl::StrAppend(&buf, " tag ", c->tag);
 #ifdef TENSORFLOW_MEM_DEBUG
       if (ShouldRecordOpName()) {
         absl::StrAppend(&buf, " by op ", c->op_name, " action_count ",
@@ -1157,18 +1408,56 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
               << strings::HumanReadableNumBytes(it.first * it.second);
     total_bytes += (it.first * it.second);
   }
+  size_t memory_limit = memory_limit_;
+  size_t pool_bytes = static_cast<size_t>(*stats_.pool_bytes);
+  size_t unallocated_bytes = memory_limit - pool_bytes;
+  size_t free_pool_bytes =
+      pool_bytes - static_cast<size_t>(stats_.bytes_in_use);
+  size_t largest_binned_free_chunk = LargestBinnedFreeChunk();
+  size_t largest_compatible_binned_free_chunk =
+      LargestBinnedFreeChunk(allocation_end);
+  size_t central_gap_bytes = central_gap_ == kInvalidChunkHandle
+                                 ? 0
+                                 : ChunkFromHandle(central_gap_)->size;
+  size_t largest_free_chunk =
+      std::max(largest_binned_free_chunk, central_gap_bytes);
+  size_t largest_compatible_free_chunk =
+      std::max(largest_compatible_binned_free_chunk, central_gap_bytes);
+
   LOG(INFO) << "Sum Total of in-use chunks: "
             << strings::HumanReadableNumBytes(total_bytes);
-  LOG(INFO) << "Total size in pool: "
-            << strings::HumanReadableNumBytes(*stats_.pool_bytes)
-            << " memory_limit_: "
-            << strings::HumanReadableNumBytes(memory_limit_)
-            << " available size: "
-            << strings::HumanReadableNumBytes(memory_limit_ -
-                                              *stats_.pool_bytes)
-            << " curr_region_allocation_bytes_: "
-            << strings::HumanReadableNumBytes(curr_region_allocation_bytes_);
-  LOG(INFO) << "Stats: \n" << stats_.DebugString();
+  LOG(INFO) << "Allocator memory summary: rounded request "
+            << strings::HumanReadableNumBytes(num_bytes)
+            << ", total free in pool "
+            << strings::HumanReadableNumBytes(free_pool_bytes)
+            << ", largest free chunk usable by " << allocation_end << " end "
+            << strings::HumanReadableNumBytes(largest_compatible_free_chunk)
+            << ", unallocated bytes "
+            << strings::HumanReadableNumBytes(unallocated_bytes)
+            << ", pool size " << strings::HumanReadableNumBytes(pool_bytes)
+            << ", pool limit " << strings::HumanReadableNumBytes(memory_limit);
+  if (central_gap_ != kInvalidChunkHandle) {
+    LOG(INFO) << "Spatial partitioning summary for " << allocation_end
+              << " end: central gap "
+              << strings::HumanReadableNumBytes(central_gap_bytes)
+              << ", largest same-end binned free chunk "
+              << strings::HumanReadableNumBytes(
+                     largest_compatible_binned_free_chunk)
+              << ", largest binned free chunk from any end "
+              << strings::HumanReadableNumBytes(largest_binned_free_chunk)
+              << ". The central gap is shared by lower/upper allocations "
+                 "and is not inserted into a bin.";
+  }
+  LOG(INFO) << "Allocator region growth hint: next region allocation target "
+            << strings::HumanReadableNumBytes(curr_region_allocation_bytes_)
+            << (opts_.allow_growth
+                    ? " (growth enabled)."
+                    : " (growth disabled; fixed-size pool; not an attempted "
+                      "allocation).");
+
+  AllocatorStats stats = stats_;
+  stats.largest_free_block_bytes = static_cast<int64_t>(largest_free_chunk);
+  LOG(INFO) << "Stats: \n" << stats.DebugString();
 }
 
 void BFCAllocator::MaybeWriteMemoryMap() {
@@ -1265,7 +1554,9 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() {
 
 std::optional<AllocatorStats> BFCAllocator::GetStats() {
   absl::MutexLock l(mutex_);
-  return stats_;
+  AllocatorStats stats = stats_;
+  stats.largest_free_block_bytes = static_cast<int64_t>(LargestFreeChunk());
+  return stats;
 }
 
 bool BFCAllocator::ClearStats() {
@@ -1283,6 +1574,12 @@ BFCAllocator::get_bin_debug_info() {
     ChunkHandle h = region_manager_.get_handle(region.ptr());
     while (h != kInvalidChunkHandle) {
       const Chunk* c = ChunkFromHandle(h);
+      if (!c->in_use() && c->tag == ChunkTag::kCentralGap &&
+          c->bin_num == kInvalidBinNum) {
+        CHECK_EQ(central_gap_, h);  // Crash OK
+        h = c->next;
+        continue;
+      }
       BinNum bin_num = BinNumForSize(c->size);
       BinDebugInfo& bin_info = bin_infos[bin_num];
       bin_info.total_bytes_in_bin += c->size;
diff --git a/xla/tsl/framework/bfc_allocator.h b/xla/tsl/framework/bfc_allocator.h
index 91cd993ed49a4..1c0ba329018bc 100644
--- a/xla/tsl/framework/bfc_allocator.h
+++ b/xla/tsl/framework/bfc_allocator.h
@@ -47,14 +47,50 @@ class MemoryDump;
 namespace tsl {
 using tensorflow::MemoryDump;
 
-// A memory allocator that implements a 'best-fit with coalescing'
-// algorithm.  This is essentially a very simple version of Doug Lea's
-// malloc (dlmalloc).
+// A memory allocator that implements best-fit with coalescing (BFC), a
+// simple dlmalloc-style allocator for arenas where most allocations go through
+// this interface.
+//
+// See prior art: https://gee.cs.oswego.edu/dl/html/malloc.html
+//
+// High-level model:
+//
+// - Backing memory comes from the SubAllocator as AllocationRegions. With
+//   Options::allow_growth=true the allocator grows by adding regions up to
+//   total_memory; with Options::allow_growth=false it reserves one fixed region
+//   during construction. stats_.bytes_reserved tracks bytes held from the
+//   SubAllocator, while stats_.bytes_in_use tracks bytes currently live for
+//   clients.
+//
+// - Each AllocationRegion is represented as an ordered sequence of Chunks that
+//   cover the region without gaps. This is boundary-tag-style bookkeeping: the
+//   allocator can find physically adjacent chunks and coalesce neighboring free
+//   chunks, even though the metadata lives in Chunk objects instead of literal
+//   dlmalloc headers/trailers. A Chunk is either entirely in use or entirely
+//   free. Allocations split free chunks when needed, and frees coalesce
+//   adjacent free chunks to repair fragmentation.
+//
+// - Free chunks are indexed by size-class Bins. Each Bin stores ChunkHandles in
+//   a FreeChunkSet ordered by chunk size and then address. Allocation starts in
+//   the smallest viable bin, scans upward, and uses the smallest fitting chunk.
+//   Allocated chunks are never in a Bin.
+//
+// - AllocationAttributes::allocation_end controls placement. Without spatial
+//   partitioning all requests use AllocationEnd::kLower, and ordinary free
+//   chunks stay in ChunkTag::kLower, which is classic BFC behavior.
+//
+// - With Options::enable_spatial_partitioning=true, which requires
+//   Options::allow_growth=false, the fixed address range is split into
+//   lower-end ownership, one central gap, and upper-end ownership.
+//   AllocationEnd::kLower requests grow upward, and AllocationEnd::kUpper
+//   requests grow downward. ChunkTag records ownership: kLower and kUpper for
+//   allocated chunks and same-tag interior holes, and kCentralGap for the
+//   central gap. The central gap is tracked by central_gap_ instead of being
+//   inserted into a Bin. Each end first reuses binned holes with its own tag,
+//   then carves from the central gap. This keeps each end's placements
+//   independent of activity from the opposite end except when lower and upper
+//   allocations exhaust the central gap.
 //
-// The goal of this allocator is to support defragmentation via
-// coalescing.  One assumption we make is that the process using this
-// allocator owns pretty much all of the memory, and that nearly
-// all requests to allocate memory go through this interface.
 class BFCAllocator : public Allocator {
  public:
   struct Options {
@@ -75,7 +111,36 @@ class BFCAllocator : public Allocator {
     // Controls when a chunk should be split, if its size exceeds the requested
     // allocation size.
     double fragmentation_fraction = 0;
+
+    // If true, the allocator spatially partitions a single pre-allocated
+    // address range by serving requests from either end. AllocationEnd::kLower
+    // requests grow up from the low address; AllocationEnd::kUpper requests
+    // grow down from the high address; a central gap sits in between:
+    //
+    //   low address                                      high address
+    //   |------------------------------------------------------------|
+    //   | lower-end owned --->   central gap   <--- upper-end owned |
+    //   |------------------------------------------------------------|
+    //
+    // The split is fully dynamic with no hard boundary: a request carves from
+    // the central gap or reuses a free hole of its OWN tag, but never the
+    // other end's tagged interior holes. When a buffer at either end of the
+    // central gap is freed it rejoins the gap, growing it, and adjacent holes
+    // with the same tag cascade back in turn -- so e.g. allocating 100% lower,
+    // freeing it, then allocating 100% upper is fully supported. The only
+    // failure is true exhaustion: lower and upper meeting with no gap left.
+    //
+    // Because neither end ever carves the other's interior holes, each end's
+    // placement is a pure function of that end's request sequence and is never
+    // perturbed by activity from the opposite end, except when lower and upper
+    // allocations exhaust the central gap. That makes offsets reproducible
+    // across processes that issue the same requests for that end in the same
+    // order, e.g. symmetric collective buffers across ranks.
+    //
+    // Requires allow_growth=false (a single fixed address range).
+    bool enable_spatial_partitioning = false;
   };
+
   BFCAllocator(std::unique_ptr<SubAllocator> sub_allocator, size_t total_memory,
                const std::string& name, const Options& opts);
 
@@ -122,7 +187,8 @@ class BFCAllocator : public Allocator {
 
   void* AllocateRawInternal(size_t alignment, size_t num_bytes,
                             bool dump_log_on_failure,
-                            uint64_t freed_before_count);
+                            uint64_t freed_before_count,
+                            AllocationEnd allocation_end);
 
   void* AllocateRawInternalWithRetry(
       size_t alignment, size_t num_bytes,
@@ -147,9 +213,15 @@ class BFCAllocator : public Allocator {
   bool MergeTimestampedChunks(size_t required_bytes)
       ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
-  // Return the largest free chunk bytes from the largest bin in constant time.
-  // The free chunks are sorted by size (and then address) in a bin.
-  int64_t LargestFreeChunk() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  // Return the size in bytes of the largest binned free chunk. Free chunks
+  // are sorted by size (and then address) in a bin.
+  size_t LargestBinnedFreeChunk() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  size_t LargestBinnedFreeChunk(AllocationEnd allocation_end)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Return the size in bytes of the largest free chunk, including the central
+  // gap when spatial partitioning is enabled.
+  size_t LargestFreeChunk() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   // Add TraceMe (in memory allocation and deallocation) for memory stats
   // profiling. The chunk_ptr is passed to get information such as address,
@@ -172,6 +244,38 @@ class BFCAllocator : public Allocator {
   // The following means that the largest bin'd chunk size is 256 << 21 = 512MB.
   static constexpr int kNumBins = 21;
 
+  // Tag describing a chunk's ownership state. Spatial partitioning keeps three
+  // contiguous spans by address:
+  //
+  //   [ kLower (grows up) ][ kCentralGap ][ kUpper (grows down) ]
+  //
+  // A request may carve from the contiguous kCentralGap span or reuse a
+  // free hole with its OWN tag, but never the other end's tagged holes. This
+  // keeps each end's offsets a pure function of that end's request sequence.
+  // The split between lower-end, central-gap, and upper-end spans is fully
+  // dynamic with no hard boundary: when a boundary chunk is freed it rejoins
+  // the central gap, growing it, and adjacent same-tag holes cascade back in
+  // turn. So e.g. allocating 100% kLower, freeing it, then allocating 100%
+  // kUpper is supported -- the freed lower space cascades back into one
+  // central gap that the upper end can then consume.
+  enum class ChunkTag : uint8_t {
+    kCentralGap,  // The single central gap between lower-end and upper-end
+                  // ownership. Either end may carve from it.
+    kLower,  // Lower-end-owned: in use, or a free hole reusable only by the
+             // lower end until it rejoins the gap.
+    kUpper,  // Upper-end-owned: in use, or a free hole reusable only by the
+             // upper end until it rejoins the gap.
+  };
+
+  template <typename Sink>
+  friend void AbslStringify(Sink& sink, ChunkTag tag);
+
+  // Tag owned by allocations from `allocation_end`; non-kUpper maps to kLower.
+  static ChunkTag ChunkTagOf(AllocationEnd allocation_end) {
+    return allocation_end == AllocationEnd::kUpper ? ChunkTag::kUpper
+                                                   : ChunkTag::kLower;
+  }
+
   // A Chunk points to a piece of memory that's either entirely free or entirely
   // in use by one user memory allocation.
   //
@@ -218,6 +322,11 @@ class BFCAllocator : public Allocator {
     // Optional count when this chunk was most recently made free.
     uint64_t freed_at_count = 0;
 
+    // Ownership state for this chunk (see ChunkTag). A chunk in the central
+    // gap is kCentralGap; interior free holes keep their tag until they
+    // rejoin the gap.
+    ChunkTag tag = ChunkTag::kCentralGap;
+
     bool in_use() const { return allocation_id != -1; }
 
 #ifdef TENSORFLOW_MEM_DEBUG
@@ -227,8 +336,8 @@ class BFCAllocator : public Allocator {
     int64 action_count = 0;
 #endif
 
-    std::string DebugString(BFCAllocator* a,
-                            bool recurse) ABSL_NO_THREAD_SAFETY_ANALYSIS {
+    std::string DebugString(BFCAllocator* a, bool recurse)
+        ABSL_EXCLUSIVE_LOCKS_REQUIRED(a->mutex_) {
       std::string dbg;
       absl::StrAppend(
           &dbg, "  Size: ", strings::HumanReadableNumBytes(size),
@@ -272,12 +381,11 @@ class BFCAllocator : public Allocator {
       }
 
      private:
-      BFCAllocator* allocator_;  // The parent allocator
+      BFCAllocator* allocator_;  // The parent allocator.
     };
 
     using FreeChunkSet = absl::btree_set<ChunkHandle, ChunkComparator>;
     // List of free chunks within the bin, sorted by chunk size.
-    // Chunk * not owned.
     FreeChunkSet free_chunks;
     Bin(BFCAllocator* allocator, size_t bs)
         : bin_size(bs), free_chunks(ChunkComparator(allocator)) {}
@@ -454,6 +562,24 @@ class BFCAllocator : public Allocator {
   // Returns 'bytes' rounded up to the next highest kMinAllocationSize.
   static size_t RoundedBytes(size_t bytes);
 
+  // Returns the first aligned address at or above 'ptr'. Alignment must be a
+  // power of two.
+  static uintptr_t AlignUp(uintptr_t ptr, size_t alignment);
+
+  // Returns the last aligned address at or below 'ptr'. Alignment must be a
+  // power of two.
+  static uintptr_t AlignDown(uintptr_t ptr, size_t alignment);
+
+  // Bytes to skip at the low end of a free chunk so the allocation starts
+  // aligned. The padding is rounded so it can be represented as a Chunk when
+  // split from the allocation.
+  static size_t LowEndAlignmentPadding(uintptr_t chunk_start, size_t alignment);
+
+  // Start address for an allocation carved from the high end of a free chunk.
+  // Returns an address below `chunk_start` if the allocation cannot fit.
+  static uintptr_t HighEndAlignedStart(uintptr_t chunk_start, size_t chunk_size,
+                                       size_t rounded_bytes, size_t alignment);
+
   // Try to add a new memory region that can satisfy an allocation of
   // 'rounded_bytes' bytes.  Returns true on success and false on
   // failure.
@@ -473,9 +599,42 @@ class BFCAllocator : public Allocator {
       ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   // Returns a pointer to an underlying allocated chunk of size
-  // 'rounded_bytes' aligned to 'alignment'.
+  // 'rounded_bytes' aligned to 'alignment', served from 'allocation_end'.
   void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes,
-                     size_t alignment, uint64_t freed_before)
+                     size_t alignment, uint64_t freed_before,
+                     AllocationEnd allocation_end)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Best-fit scan restricted to binned interior holes owned by
+  // 'allocation_end'. Returns the user pointer, or nullptr if no same-tag hole
+  // fits.
+  void* FindTaggedChunkPtr(BinNum bin_num, size_t rounded_bytes,
+                           size_t num_bytes, size_t alignment,
+                           uint64_t freed_before, AllocationEnd allocation_end)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Carves from the central gap. In spatial partitioning mode the gap is
+  // tracked directly by central_gap_ instead of being inserted into bins.
+  void* FindChunkPtrInCentralGap(size_t rounded_bytes, size_t num_bytes,
+                                 size_t alignment, uint64_t freed_before,
+                                 AllocationEnd allocation_end)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Carves an allocation of 'num_bytes' (rounded to 'rounded_bytes') out of the
+  // free chunk 'h', which must already have been removed from its free
+  // structure. The low variant grows up from the chunk's low address (the
+  // default); the high variant grows down from the chunk's high address. Both
+  // return the user pointer.
+  void* AllocateChunkFromLowEnd(ChunkHandle h, size_t rounded_bytes,
+                                size_t num_bytes, size_t alignment)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  void* AllocateChunkFromHighEnd(ChunkHandle h, size_t rounded_bytes,
+                                 size_t num_bytes, size_t alignment)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Marks 'chunk' in use and updates allocation stats. Common tail of the two
+  // AllocateChunkFrom*End helpers.
+  void FinishChunkAllocation(Chunk* chunk, size_t num_bytes)
       ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   // Splits the chunk specified by 'h' into two chunks, one at least
@@ -483,13 +642,29 @@ class BFCAllocator : public Allocator {
   void SplitChunk(ChunkHandle h, size_t num_bytes)
       ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
+  // Tag of the free chunk formed by merging two adjacent free neighbors:
+  // the common tag if both holes have the same tag (an interior hole keeps
+  // its end), otherwise kCentralGap -- so a hole merging with the central gap,
+  // or lower and upper holes becoming adjacent, yields space reusable by either
+  // end.
+  ChunkTag MergedChunkTag(ChunkTag a, ChunkTag b) const;
+
   // Merges the two chunk handles.  Requires that the chunks are
   // contiguous in their allocation.
-  void Merge(ChunkHandle h, ChunkHandle h2)
+  void MergeChunks(ChunkHandle h, ChunkHandle h2)
       ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
-  // Adds the chunk 'h' to the proper free bin.
-  void InsertFreeChunkIntoBin(ChunkHandle h)
+  // Adds the chunk 'h' to the free data structure. Spatial partitioning
+  // keeps the single central gap out of the bins and bins only lower/upper
+  // interior holes; classic BFC inserts every free chunk into a size bin.
+  void InsertFreeChunk(ChunkHandle h) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Removes the chunk 'h' from the free data structure.
+  void RemoveFreeChunk(ChunkHandle h) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Reclassifies a just-freed lower/upper boundary chunk as kCentralGap when it
+  // is no longer interior to its tag.
+  void ReturnBoundaryChunkToGap(ChunkHandle h)
       ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   // Removes the free chunk pointed to by 'c' from the set free_chunks.
@@ -507,7 +682,8 @@ class BFCAllocator : public Allocator {
   void DeleteChunk(ChunkHandle h) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   std::string RenderOccupancy() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-  void DumpMemoryLog(size_t num_bytes) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  void DumpMemoryLog(size_t num_bytes, AllocationEnd allocation_end)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
   tensorflow::MemoryDump RecordMemoryMapInternal()
       ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
   void MaybeWriteMemoryMap() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
@@ -546,6 +722,11 @@ class BFCAllocator : public Allocator {
   // Structures immutable after construction
   size_t memory_limit_ = 0;
 
+  // Maximum bytes a chunk may exceed the requested size before it is split, to
+  // bound internal fragmentation. Derived from Options::fragmentation_fraction
+  // and memory_limit_ once at construction.
+  int64_t max_internal_fragmentation_bytes_ = 0;
+
   // Map from bin size to Bin
   Bin* BinFromIndex(BinNum index) {
     return reinterpret_cast<Bin*>(&(bins_space_[index * sizeof(Bin)]));
@@ -564,6 +745,11 @@ class BFCAllocator : public Allocator {
 
   const Options opts_;
 
+  // Tag assigned to newly-created free chunks. Classic BFC keeps ordinary
+  // free chunks in kLower; spatial partitioning starts each fixed region as
+  // the kCentralGap span.
+  const ChunkTag free_chunk_tag_;
+
   // The size of the current region allocation.
   size_t curr_region_allocation_bytes_;
 
@@ -587,8 +773,13 @@ class BFCAllocator : public Allocator {
 
   std::vector<Chunk> chunks_ ABSL_GUARDED_BY(mutex_);
 
-  // Pointer to head of linked list of free Chunks
-  ChunkHandle free_chunks_list_ ABSL_GUARDED_BY(mutex_);
+  // Head of a singly-linked list of unused Chunk metadata slots in chunks_.
+  // The list reuses Chunk::next while the slot is inactive.
+  ChunkHandle unused_chunk_handle_head_ ABSL_GUARDED_BY(mutex_);
+
+  // The single central gap in spatial partitioning mode. It is not present in
+  // any Bin; lower/upper interior free holes remain binned.
+  ChunkHandle central_gap_ ABSL_GUARDED_BY(mutex_) = kInvalidChunkHandle;
 
   // Counter containing the next unique identifier to assign to a
   // newly-created chunk.
@@ -609,6 +800,25 @@ class BFCAllocator : public Allocator {
   void operator=(const BFCAllocator&) = delete;
 };
 
+//===----------------------------------------------------------------------===//
+// Stringification of enums.
+//===----------------------------------------------------------------------===//
+
+template <typename Sink>  // Appends the snake_case name of `tag` to `sink`.
+void AbslStringify(Sink& sink, BFCAllocator::ChunkTag tag) {
+  switch (tag) {  // No default: every ChunkTag enumerator is handled below.
+    case BFCAllocator::ChunkTag::kCentralGap:
+      sink.Append("central_gap");
+      return;
+    case BFCAllocator::ChunkTag::kLower:
+      sink.Append("lower");
+      return;
+    case BFCAllocator::ChunkTag::kUpper:
+      sink.Append("upper");
+      return;
+  }
+}
+
 }  // namespace tsl
 
 #endif  // XLA_TSL_FRAMEWORK_BFC_ALLOCATOR_H_
diff --git a/xla/tsl/framework/bfc_allocator_test.cc b/xla/tsl/framework/bfc_allocator_test.cc
index 8489a72a580c0..0d2b878939fe5 100644
--- a/xla/tsl/framework/bfc_allocator_test.cc
+++ b/xla/tsl/framework/bfc_allocator_test.cc
@@ -17,41 +17,71 @@ limitations under the License.
 
 #include <algorithm>
 #include <array>
+#include <atomic>
 #include <cstddef>
 #include <cstdint>
+#include <limits>
 #include <memory>
 #include <new>
+#include <optional>
 #include <random>
+#include <utility>
 #include <vector>
 
 #include "absl/base/casts.h"
+#include "absl/base/no_destructor.h"
 #include "absl/synchronization/blocking_counter.h"
 #include "xla/tsl/framework/allocator.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/test.h"
 #include "xla/tsl/platform/test_benchmark.h"
 #include "xla/tsl/platform/threadpool.h"
-#include "tsl/platform/mem.h"
 
 namespace tsl {
 namespace {
 
-// Minimal SubAllocator backed by port::AlignedMalloc for host memory.
-class MallocSubAllocator : public SubAllocator {
+static constexpr size_t kAlignment = Allocator::kAllocatorAlignment;
+
+static const absl::NoDestructor<AllocationAttributes> kUpper(
+    /*retry_on_failure=*/false, /*allocation_will_be_logged=*/false,
+    /*freed_by_func=*/nullptr, AllocationEnd::kUpper);
+
+static const absl::NoDestructor<AllocationAttributes> kLower(
+    /*retry_on_failure=*/false, /*allocation_will_be_logged=*/false,
+    /*freed_by_func=*/nullptr, AllocationEnd::kLower);
+
+// SubAllocator that hands out fake (non-dereferenceable) addresses without
+// allocating any real memory. It bump-allocates from a large, fixed virtual
+// base so addresses are unique, well-aligned, and consistent. This lets tests
+// exercise huge pools and verify the exact addresses BFC returns without
+// touching device memory.
+class FakeSubAllocator : public SubAllocator {
  public:
-  MallocSubAllocator() : SubAllocator({}, {}) {}
+  // kBase is a high, page-aligned constant so returned addresses look like
+  // plausible device pointers and are unlikely to collide with real ones.
+  static constexpr uintptr_t kBase = uintptr_t{1} << 40;
+
+  explicit FakeSubAllocator(
+      std::optional<size_t> hardcoded_alignment = std::nullopt)
+      : SubAllocator({}, {}), hardcoded_alignment_(hardcoded_alignment) {}
 
   void* Alloc(size_t alignment, size_t num_bytes,
               size_t* bytes_received) override {
-    void* ptr = port::AlignedMalloc(num_bytes,
-                                    static_cast<std::align_val_t>(alignment));
+    const size_t effective_alignment = hardcoded_alignment_.value_or(alignment);
+    uintptr_t aligned =
+        (next_ + (effective_alignment - 1)) & ~(effective_alignment - 1);
+    next_ = aligned + num_bytes;
     *bytes_received = num_bytes;
-    return ptr;
+    return absl::bit_cast<void*>(aligned);
   }
 
-  void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); }
+  void Free(void* ptr, size_t num_bytes) override {}
 
   bool SupportsCoalescing() const override { return false; }
+
+ private:
+  std::optional<size_t> hardcoded_alignment_;
+  uintptr_t next_ = kBase;
 };
 
 // Helper to check pointer alignment.
@@ -60,7 +90,7 @@ bool IsAligned(const void* ptr, size_t alignment) {
 }
 
 TEST(BFCAllocatorTest, AllocateAndFree) {
-  BFCAllocator alloc(std::make_unique<MallocSubAllocator>(),
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
                      /*total_memory=*/1 << 20, /*name=*/"test",
                      BFCAllocator::Options{});
 
@@ -70,14 +100,14 @@ TEST(BFCAllocatorTest, AllocateAndFree) {
 }
 
 TEST(BFCAllocatorTest, DefaultAlignment) {
-  BFCAllocator alloc(std::make_unique<MallocSubAllocator>(),
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
                      /*total_memory=*/1 << 20, /*name=*/"test",
                      BFCAllocator::Options{});
 
   // BFC always returns pointers aligned to at least kAllocatorAlignment (64).
-  void* ptr = alloc.AllocateRaw(Allocator::kAllocatorAlignment, 1);
+  void* ptr = alloc.AllocateRaw(kAlignment, 1);
   ASSERT_NE(ptr, nullptr);
-  EXPECT_TRUE(IsAligned(ptr, Allocator::kAllocatorAlignment));
+  EXPECT_TRUE(IsAligned(ptr, kAlignment));
   alloc.DeallocateRaw(ptr);
 }
 
@@ -87,13 +117,13 @@ class BFCAllocatorAlignmentTest : public ::testing::TestWithParam<size_t> {};
 
 TEST_P(BFCAllocatorAlignmentTest, RespectsRequestedAlignment) {
   const size_t alignment = GetParam();
-  BFCAllocator alloc(std::make_unique<MallocSubAllocator>(),
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
                      /*total_memory=*/1 << 20, /*name=*/"test",
                      BFCAllocator::Options{});
 
   // Allocate a small block first to push the arena cursor off any "lucky"
   // alignment, then allocate with the requested alignment.
-  void* filler = alloc.AllocateRaw(Allocator::kAllocatorAlignment, 256);
+  void* filler = alloc.AllocateRaw(kAlignment, 256);
   ASSERT_NE(filler, nullptr);
 
   constexpr int kTrials = 8;
@@ -121,7 +151,7 @@ INSTANTIATE_TEST_SUITE_P(Alignments, BFCAllocatorAlignmentTest,
 // randomized order across multiple iterations. This exercises chunk splitting,
 // alignment padding, coalescing on free, and reuse of freed chunks.
 TEST(BFCAllocatorTest, StressAllocFree) {
-  BFCAllocator alloc(std::make_unique<MallocSubAllocator>(),
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
                      /*total_memory=*/16 << 20, /*name=*/"stress",
                      BFCAllocator::Options{});
 
@@ -177,39 +207,20 @@ TEST(BFCAllocatorTest, StressAllocFree) {
   }
 }
 
-// SubAllocator that always returns 256-byte (kMinAllocationSize) aligned
-// memory but ignores higher alignment requests. This simulates GPU allocators
-// like DeviceMemAllocator where cudaMalloc returns 256-byte aligned memory
-// regardless of the requested alignment.
-class GpuLikeSubAllocator : public SubAllocator {
- public:
-  GpuLikeSubAllocator() : SubAllocator({}, {}) {}
-
-  void* Alloc(size_t /*alignment*/, size_t num_bytes,
-              size_t* bytes_received) override {
-    // Always align to 256 bytes, ignoring the requested alignment.
-    void* ptr = port::AlignedMalloc(num_bytes, std::align_val_t{256});
-    *bytes_received = num_bytes;
-    return ptr;
-  }
-
-  void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); }
-
-  bool SupportsCoalescing() const override { return false; }
-};
-
-// Verify that BFC still respects alignment even when the sub-allocator only
-// provides 256-byte aligned regions (as GPU sub-allocators do).
-TEST(BFCAllocatorTest, AlignmentWithGpuLikeSubAllocator) {
-  BFCAllocator alloc(std::make_unique<GpuLikeSubAllocator>(),
-                     /*total_memory=*/1 << 20, /*name=*/"gpu_like",
-                     BFCAllocator::Options{});
+// Verify that BFC respects requested alignment even when the sub-allocator
+// ignores it and always aligns regions to its own hardcoded alignment.
+TEST(BFCAllocatorTest, AlignmentWithHardcodedSubAllocatorAlignment) {
+  constexpr size_t kHardcodedAlignment = 256;
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(kHardcodedAlignment),
+                     /*total_memory=*/1 << 20,
+                     /*name=*/"hardcoded_alignment", BFCAllocator::Options{});
 
   // Push the cursor off any lucky alignment.
-  void* filler = alloc.AllocateRaw(Allocator::kAllocatorAlignment, 256);
+  void* filler = alloc.AllocateRaw(kAlignment, 256);
   ASSERT_NE(filler, nullptr);
 
-  constexpr std::array<size_t, 4> kAlignments = {256, 512, 1024, 4096};
+  constexpr std::array<size_t, 4> kAlignments = {kHardcodedAlignment, 512, 1024,
+                                                 4096};
   constexpr int kTrials = 8;
 
   for (size_t alignment : kAlignments) {
@@ -225,20 +236,409 @@ TEST(BFCAllocatorTest, AlignmentWithGpuLikeSubAllocator) {
   alloc.DeallocateRaw(filler);
 }
 
+//===----------------------------------------------------------------------===//
+// Spatial partitioning tests.
+//===----------------------------------------------------------------------===//
+
+TEST(BFCAllocatorTest, SpatialAllocatesFromEnds) {
+  BFCAllocator::Options opts;
+  opts.allow_growth = false;  // Required by enable_spatial_partitioning.
+  opts.enable_spatial_partitioning = true;
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
+                     /*total_memory=*/4096, /*name=*/"spatial", opts);
+
+  void* lower = alloc.AllocateRaw(kAlignment, 256);
+  ASSERT_NE(lower, nullptr);
+
+  void* upper = alloc.AllocateRaw(kAlignment, 256, *kUpper);
+  ASSERT_NE(upper, nullptr);
+
+  EXPECT_EQ(absl::bit_cast<uintptr_t>(upper) - absl::bit_cast<uintptr_t>(lower),
+            4096 - 256);  // 256-byte upper ends 4096 bytes above lower.
+
+  alloc.DeallocateRaw(upper);
+  alloc.DeallocateRaw(lower);
+}
+
+// Extra lower-end allocations must not change where an upper request lands.
+TEST(BFCAllocatorTest, SpatialKeepsUpperOffset) {
+  BFCAllocator::Options opts;
+  opts.allow_growth = false;  // Required by enable_spatial_partitioning.
+  opts.enable_spatial_partitioning = true;
+
+  BFCAllocator alloc_a(std::make_unique<FakeSubAllocator>(),
+                       /*total_memory=*/4096, /*name=*/"spatial_a", opts);
+  BFCAllocator alloc_b(std::make_unique<FakeSubAllocator>(),
+                       /*total_memory=*/4096, /*name=*/"spatial_b", opts);
+
+  void* lower_a = alloc_a.AllocateRaw(kAlignment, 256);
+  void* upper_a = alloc_a.AllocateRaw(kAlignment, 512, *kUpper);
+  void* lower_b = alloc_b.AllocateRaw(kAlignment, 256);
+  void* extra_lower_b = alloc_b.AllocateRaw(kAlignment, 1024);  // alloc_b only.
+  void* upper_b = alloc_b.AllocateRaw(kAlignment, 512, *kUpper);
+
+  ASSERT_NE(lower_a, nullptr);
+  ASSERT_NE(upper_a, nullptr);
+  ASSERT_NE(lower_b, nullptr);
+  ASSERT_NE(extra_lower_b, nullptr);
+  ASSERT_NE(upper_b, nullptr);
+
+  const uintptr_t upper_offset_a =
+      absl::bit_cast<uintptr_t>(upper_a) - absl::bit_cast<uintptr_t>(lower_a);
+  const uintptr_t upper_offset_b =
+      absl::bit_cast<uintptr_t>(upper_b) - absl::bit_cast<uintptr_t>(lower_b);
+  EXPECT_EQ(upper_offset_a, upper_offset_b);
+
+  alloc_a.DeallocateRaw(upper_a);
+  alloc_a.DeallocateRaw(lower_a);
+  alloc_b.DeallocateRaw(upper_b);
+  alloc_b.DeallocateRaw(extra_lower_b);
+  alloc_b.DeallocateRaw(lower_b);
+}
+
+// Upper must not reuse a non-boundary lower hole.
+TEST(BFCAllocatorTest, SpatialSkipsLowerHole) {
+  BFCAllocator::Options opts;
+  opts.allow_growth = false;  // Required by enable_spatial_partitioning.
+  opts.enable_spatial_partitioning = true;
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
+                     /*total_memory=*/1024, /*name=*/"spatial", opts);
+
+  // Fill the whole 1024-byte region (256 + 256 lower, 512 upper), leaving no
+  // central gap.
+  void* lower_a = alloc.AllocateRaw(kAlignment, 256);
+  ASSERT_NE(lower_a, nullptr);
+  void* lower_b = alloc.AllocateRaw(kAlignment, 256);
+  ASSERT_NE(lower_b, nullptr);
+  void* upper = alloc.AllocateRaw(kAlignment, 512, *kUpper);
+  ASSERT_NE(upper, nullptr);
+
+  // Freed lower_a becomes a hole trapped below the still-live lower_b.
+  alloc.DeallocateRaw(lower_a);
+
+  // Upper must not reuse the trapped lower hole.
+  void* trapped = alloc.AllocateRaw(kAlignment, 256, *kUpper);
+  EXPECT_EQ(trapped, nullptr);
+
+  alloc.DeallocateRaw(upper);
+  alloc.DeallocateRaw(lower_b);
+}
+
+// A freed boundary chunk rejoins the central gap, so the other end can use it.
+TEST(BFCAllocatorTest, SpatialLowerReclaimsGap) {
+  BFCAllocator::Options opts;
+  opts.allow_growth = false;  // Required by enable_spatial_partitioning.
+  opts.enable_spatial_partitioning = true;
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
+                     /*total_memory=*/2048, /*name=*/"spatial", opts);
+
+  void* upper = alloc.AllocateRaw(kAlignment, 1024, *kUpper);
+  ASSERT_NE(upper, nullptr);
+  alloc.DeallocateRaw(upper);
+
+  void* lower = alloc.AllocateRaw(kAlignment, 2048);  // Needs the entire pool.
+  ASSERT_NE(lower, nullptr);
+  alloc.DeallocateRaw(lower);
+}
+
+// The dynamic boundary moves with frees, but ownership is still enforced.
+TEST(BFCAllocatorTest, SpatialReclaimsGap) {
+  BFCAllocator::Options opts;
+  opts.allow_growth = false;  // Required by enable_spatial_partitioning.
+  opts.enable_spatial_partitioning = true;
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
+                     /*total_memory=*/4096, /*name=*/"spatial", opts);
+
+  void* lower0 = alloc.AllocateRaw(kAlignment, 1024);
+  ASSERT_NE(lower0, nullptr);
+
+  void* upper0 = alloc.AllocateRaw(kAlignment, 512, *kUpper);
+  ASSERT_NE(upper0, nullptr);
+  alloc.DeallocateRaw(upper0);
+
+  // Adjacent upper free space rejoins the central gap.
+  void* lower1 = alloc.AllocateRaw(kAlignment, 512);
+  ASSERT_NE(lower1, nullptr);
+  void* upper1 = alloc.AllocateRaw(kAlignment, 512, *kUpper);
+  ASSERT_NE(upper1, nullptr);
+  EXPECT_EQ(upper1, upper0);
+  alloc.DeallocateRaw(upper1);
+
+  // Lower claims the remaining central gap (4096 - 1024 - 512 = 2560 bytes).
+  void* lower2 = alloc.AllocateRaw(kAlignment, 2560);
+  ASSERT_NE(lower2, nullptr);
+  EXPECT_LE(absl::bit_cast<uintptr_t>(lower2),
+            absl::bit_cast<uintptr_t>(upper1));
+  EXPECT_EQ(absl::bit_cast<uintptr_t>(lower2) + 2560,
+            absl::bit_cast<uintptr_t>(upper1) + 512);
+
+  // Upper must not cross back into lower-owned space.
+  void* upper2 = alloc.AllocateRaw(kAlignment, 256, *kUpper);
+  EXPECT_EQ(upper2, nullptr);
+
+  alloc.DeallocateRaw(lower2);
+  alloc.DeallocateRaw(lower1);
+  alloc.DeallocateRaw(lower0);
+}
+
+TEST(BFCAllocatorTest, SpatialUpperAlignmentSuffix) {
+  BFCAllocator::Options opts;
+  opts.allow_growth = false;  // Required by enable_spatial_partitioning.
+  opts.enable_spatial_partitioning = true;
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
+                     /*total_memory=*/4096, /*name=*/"spatial", opts);
+  const uintptr_t base = FakeSubAllocator::kBase;
+
+  void* upper = alloc.AllocateRaw(1024, 256, *kUpper);  // 1024-byte alignment.
+  ASSERT_NE(upper, nullptr);
+  EXPECT_EQ(absl::bit_cast<uintptr_t>(upper), base + 3072);
+
+  // The 768-byte alignment suffix above `upper` is upper-owned.
+  void* lower = alloc.AllocateRaw(kAlignment, 3072);
+  ASSERT_NE(lower, nullptr);
+  EXPECT_EQ(absl::bit_cast<uintptr_t>(lower), base);
+
+  void* crossed = alloc.AllocateRaw(kAlignment, 768, *kLower);
+  EXPECT_EQ(crossed, nullptr);
+
+  alloc.DeallocateRaw(lower);
+  alloc.DeallocateRaw(upper);
+}
+
+TEST(BFCAllocatorTest, SpatialLowerAlignmentPrefix) {
+  BFCAllocator::Options opts;
+  opts.allow_growth = false;  // Required by enable_spatial_partitioning.
+  opts.enable_spatial_partitioning = true;
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
+                     /*total_memory=*/4096, /*name=*/"spatial", opts);
+  const uintptr_t base = FakeSubAllocator::kBase;
+
+  void* lower0 = alloc.AllocateRaw(kAlignment, 256);
+  ASSERT_NE(lower0, nullptr);
+  EXPECT_EQ(absl::bit_cast<uintptr_t>(lower0), base);
+
+  void* lower1 = alloc.AllocateRaw(1024, 256);  // 1024-byte alignment.
+  ASSERT_NE(lower1, nullptr);
+  EXPECT_EQ(absl::bit_cast<uintptr_t>(lower1), base + 1024);
+
+  void* upper = alloc.AllocateRaw(kAlignment, 2816, *kUpper);
+  ASSERT_NE(upper, nullptr);
+  EXPECT_EQ(absl::bit_cast<uintptr_t>(upper), base + 1280);
+
+  // The 768-byte alignment prefix below lower1 is lower-owned.
+  void* crossed = alloc.AllocateRaw(kAlignment, 768, *kUpper);
+  EXPECT_EQ(crossed, nullptr);
+
+  alloc.DeallocateRaw(upper);
+  alloc.DeallocateRaw(lower1);
+  alloc.DeallocateRaw(lower0);
+}
+
+// A fully freed lower range reforms the central gap for upper allocations.
+TEST(BFCAllocatorTest, SpatialUpperReclaimsAfterLowerFill) {
+  BFCAllocator::Options opts;
+  opts.allow_growth = false;  // Required by enable_spatial_partitioning.
+  opts.enable_spatial_partitioning = true;
+  constexpr size_t kPool = size_t{1} << 30;  // 1 GiB.
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
+                     /*total_memory=*/kPool, /*name=*/"repro", opts);
+  const uintptr_t base = FakeSubAllocator::kBase;
+
+  // Lower fills the entire pool, then frees it.
+  constexpr size_t kChunk = size_t{32} << 20;  // 32 MiB.
+  constexpr int kNumChunks = kPool / kChunk;   // 32 chunks exactly fill 1 GiB.
+  std::vector<void*> lower_ptrs;
+  lower_ptrs.reserve(kNumChunks);
+  for (int i = 0; i < kNumChunks; ++i) {
+    void* p = alloc.AllocateRaw(kAlignment, kChunk);
+    ASSERT_NE(p, nullptr) << "lower fill failed at chunk " << i;
+    lower_ptrs.push_back(p);
+  }
+
+  // Boundary coalescing should reform one whole-pool gap.
+  for (void* p : lower_ptrs) {
+    alloc.DeallocateRaw(p);
+  }
+
+  // Upper should now allocate from the top of the reformed gap.
+  constexpr size_t kUpperBytes = 18 << 20;  // 18 MiB.
+  void* upper = alloc.AllocateRaw(kAlignment, kUpperBytes, *kUpper);
+  ASSERT_NE(upper, nullptr) << "upper should reclaim the freed pool";
+  EXPECT_EQ(absl::bit_cast<uintptr_t>(upper) + kUpperBytes, base + kPool)
+      << "upper allocation should be anchored at the top of the pool";
+  alloc.DeallocateRaw(upper);
+}
+
+TEST(BFCAllocatorTest, SpatialReusesOwnHoles) {
+  BFCAllocator::Options opts;
+  opts.allow_growth = false;
+  opts.enable_spatial_partitioning = true;
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
+                     /*total_memory=*/4096, /*name=*/"spatial", opts);
+  const uintptr_t base = FakeSubAllocator::kBase;
+
+  void* lower0 = alloc.AllocateRaw(kAlignment, 256);
+  void* lower_hole = alloc.AllocateRaw(kAlignment, 256);
+  void* lower_guard = alloc.AllocateRaw(kAlignment, 256);
+  ASSERT_NE(lower0, nullptr);
+  ASSERT_NE(lower_hole, nullptr);
+  ASSERT_NE(lower_guard, nullptr);
+  EXPECT_EQ(absl::bit_cast<uintptr_t>(lower_hole), base + 256);
+
+  void* upper0 = alloc.AllocateRaw(kAlignment, 256, *kUpper);
+  void* upper_hole = alloc.AllocateRaw(kAlignment, 256, *kUpper);
+  void* upper_guard = alloc.AllocateRaw(kAlignment, 256, *kUpper);
+  ASSERT_NE(upper0, nullptr);
+  ASSERT_NE(upper_hole, nullptr);
+  ASSERT_NE(upper_guard, nullptr);
+  EXPECT_EQ(absl::bit_cast<uintptr_t>(upper_hole), base + 3584);
+
+  alloc.DeallocateRaw(lower_hole);  // Punch a hole among the lower chunks.
+  alloc.DeallocateRaw(upper_hole);  // Punch a hole among the upper chunks.
+
+  // Each side must get its own freed 256-byte hole back, not central-gap space.
+  void* lower_reuse = alloc.AllocateRaw(kAlignment, 256);
+  ASSERT_NE(lower_reuse, nullptr);
+  EXPECT_EQ(lower_reuse, lower_hole);
+
+  void* upper_reuse = alloc.AllocateRaw(kAlignment, 256, *kUpper);
+  ASSERT_NE(upper_reuse, nullptr);
+  EXPECT_EQ(upper_reuse, upper_hole);
+
+  alloc.DeallocateRaw(upper_reuse);
+  alloc.DeallocateRaw(upper_guard);
+  alloc.DeallocateRaw(upper0);
+  alloc.DeallocateRaw(lower_reuse);
+  alloc.DeallocateRaw(lower_guard);
+  alloc.DeallocateRaw(lower0);
+}
+
+// A fixed upper sequence must yield the same offsets under any lower churn.
+TEST(BFCAllocatorTest, SpatialUpperOffsetsStable) {
+  constexpr size_t kPool = size_t{512} << 20;  // 512 MiB.
+  constexpr size_t kUpperAlignment = 512;
+  // Fixed upper sizes, identical across simulated ranks.
+  const std::array<size_t, 8> kUpperSizes = {4 << 20,  16 << 20, 1 << 20,
+                                             18 << 20, 2 << 20,  8 << 20,
+                                             4 << 20,  32 << 20};
+
+  // Replays kUpperSizes amid churn seeded by lower_seed; returns offsets.
+  auto run = [&](uint32_t lower_seed) -> std::vector<uintptr_t> {
+    BFCAllocator::Options opts;
+    opts.allow_growth = false;
+    opts.enable_spatial_partitioning = true;
+    BFCAllocator alloc(std::make_unique<FakeSubAllocator>(), kPool, "sym",
+                       opts);
+    const uintptr_t base = FakeSubAllocator::kBase;
+
+    std::mt19937 rng(lower_seed);
+    std::vector<std::pair<void*, size_t>> live_lower;  // (ptr, bytes)
+    size_t live_lower_bytes = 0;
+    // Cap live lower bytes at half the pool to stay clear of exhaustion.
+    constexpr size_t kLowerCap = kPool / 2;
+    const std::array<size_t, 5> kLowerSizes = {256, 1 << 20, 8 << 20, 32 << 20,
+                                               64 << 20};
+    auto churn_lower = [&] {
+      // 0-6 random lower ops: free a live block (1 in 3) or try to allocate.
+      const int ops = std::uniform_int_distribution<int>(0, 6)(rng);
+      for (int i = 0; i < ops; ++i) {
+        if (!live_lower.empty() &&
+            std::uniform_int_distribution<int>(0, 2)(rng) == 0) {
+          size_t idx = std::uniform_int_distribution<size_t>(
+              0, live_lower.size() - 1)(rng);
+          alloc.DeallocateRaw(live_lower[idx].first);
+          live_lower_bytes -= live_lower[idx].second;
+          live_lower.erase(live_lower.begin() + idx);
+        } else {
+          size_t bytes = kLowerSizes[std::uniform_int_distribution<size_t>(
+              0, kLowerSizes.size() - 1)(rng)];
+          if (live_lower_bytes + bytes > kLowerCap) {
+            continue;
+          }
+          void* p = alloc.AllocateRaw(kUpperAlignment, bytes);
+          if (p) {
+            live_lower.push_back({p, bytes});
+            live_lower_bytes += bytes;
+          }
+        }
+      }
+    };
+
+    std::vector<uintptr_t> offsets;
+    std::vector<void*> live_upper;
+    for (size_t bytes : kUpperSizes) {
+      churn_lower();
+      void* p = alloc.AllocateRaw(kUpperAlignment, bytes, *kUpper);
+      EXPECT_NE(p, nullptr)
+          << "upper alloc failed under lower churn (seed " << lower_seed << ")";
+      offsets.push_back(p ? absl::bit_cast<uintptr_t>(p) - base
+                          : std::numeric_limits<uintptr_t>::max());
+      if (p) {
+        live_upper.push_back(p);
+      }
+      // Cap live upper temps at two by freeing the oldest (short-lived S(1)).
+      if (live_upper.size() > 2) {
+        alloc.DeallocateRaw(live_upper.front());
+        live_upper.erase(live_upper.begin());
+      }
+    }
+    return offsets;
+  };
+
+  const std::vector<uintptr_t> rank0 = run(/*lower_seed=*/1);
+  for (uint32_t seed = 2; seed <= 32; ++seed) {
+    EXPECT_EQ(run(seed), rank0)
+        << "upper offsets diverged for lower_seed=" << seed;
+  }
+}
+
+TEST(BFCAllocatorTest, SpatialUnderContention) {
+  BFCAllocator::Options opts;
+  opts.allow_growth = false;
+  opts.enable_spatial_partitioning = true;
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
+                     /*total_memory=*/64 << 20, /*name=*/"contention", opts);
+
+  constexpr int kNumThreads = 8;
+  constexpr int kItersPerThread = 1000;
+  constexpr size_t kBytes = 1024;
+
+  std::atomic<int> failures{0};  // Iterations with a null/misaligned result.
+  tsl::thread::ThreadPool threads(tsl::Env::Default(), "spatial_contention",
+                                  kNumThreads);
+  absl::BlockingCounter counter(kNumThreads);  // One count per worker.
+  for (int t = 0; t < kNumThreads; ++t) {
+    threads.Schedule([&] {
+      for (int i = 0; i < kItersPerThread; ++i) {
+        void* lower = alloc.AllocateRaw(kAlignment, kBytes, *kLower);
+        void* upper = alloc.AllocateRaw(kAlignment, kBytes, *kUpper);
+        if (!lower || !upper || !IsAligned(lower, kAlignment) ||
+            !IsAligned(upper, kAlignment)) {
+          failures.fetch_add(1, std::memory_order_relaxed);
+        }
+        alloc.DeallocateRaw(lower);  // NOTE(review): null if alloc failed.
+        alloc.DeallocateRaw(upper);
+      }
+      counter.DecrementCount();
+    });
+  }
+  counter.Wait();  // All workers are done before `failures` is read.
+  EXPECT_EQ(failures.load(std::memory_order_relaxed), 0);
+}
+
 //===----------------------------------------------------------------------===//
 // Performance benchmarks.
 //===----------------------------------------------------------------------===//
 
 static constexpr size_t kBenchAllocSize = 1024;
-static constexpr size_t kBenchAlignment = Allocator::kAllocatorAlignment;
 
 static void BM_AllocAndFree(benchmark::State& state) {
-  BFCAllocator alloc(std::make_unique<MallocSubAllocator>(),
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
                      /*total_memory=*/256 << 20, /*name=*/"bench",
                      BFCAllocator::Options{});
 
   for (auto _ : state) {
-    void* ptr = alloc.AllocateRaw(kBenchAlignment, kBenchAllocSize);
+    void* ptr = alloc.AllocateRaw(kAlignment, kBenchAllocSize);
     alloc.DeallocateRaw(ptr);
   }
   state.SetItemsProcessed(state.iterations());
@@ -248,14 +648,14 @@ BENCHMARK(BM_AllocAndFree);
 
 static void BM_AllocBatchThenFree(benchmark::State& state) {
   int batch = state.range(0);
-  BFCAllocator alloc(std::make_unique<MallocSubAllocator>(),
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
                      /*total_memory=*/256 << 20, /*name=*/"bench",
                      BFCAllocator::Options{});
 
   std::vector<void*> ptrs(batch);
   for (auto _ : state) {
     for (int i = 0; i < batch; ++i) {
-      ptrs[i] = alloc.AllocateRaw(kBenchAlignment, kBenchAllocSize);
+      ptrs[i] = alloc.AllocateRaw(kAlignment, kBenchAllocSize);
     }
     for (int i = 0; i < batch; ++i) {
       alloc.DeallocateRaw(ptrs[i]);
@@ -266,11 +666,48 @@ static void BM_AllocBatchThenFree(benchmark::State& state) {
 
 BENCHMARK(BM_AllocBatchThenFree)->Arg(100)->Arg(1000);
 
+//===----------------------------------------------------------------------===//
+// Spatial allocation benchmarks.
+//===----------------------------------------------------------------------===//
+
+static void BM_SpatialAllocBatchThenFree(benchmark::State& state) {
+  const int batch = state.range(0);  // Allocations per side per iteration.
+  BFCAllocator::Options opts;
+  opts.allow_growth = false;
+  opts.enable_spatial_partitioning = true;
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
+                     /*total_memory=*/256 << 20, /*name=*/"bench", opts);
+
+  std::vector<void*> lower_ptrs(batch);
+  std::vector<void*> upper_ptrs(batch);
+  for (auto _ : state) {
+    for (int i = 0; i < batch; ++i) {
+      lower_ptrs[i] = alloc.AllocateRaw(kAlignment, kBenchAllocSize);
+      tsl::testing::DoNotOptimize(lower_ptrs[i]);
+    }
+    for (int i = 0; i < batch; ++i) {
+      upper_ptrs[i] = alloc.AllocateRaw(kAlignment, kBenchAllocSize, *kUpper);
+      tsl::testing::DoNotOptimize(upper_ptrs[i]);
+    }
+    for (int i = 0; i < batch; ++i) {
+      alloc.DeallocateRaw(lower_ptrs[i]);
+      alloc.DeallocateRaw(upper_ptrs[i]);
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * batch * 2);  // Lower + upper.
+}
+
+BENCHMARK(BM_SpatialAllocBatchThenFree)->Arg(100)->Arg(1000);
+
+//===----------------------------------------------------------------------===//
+// Contention benchmarks.
+//===----------------------------------------------------------------------===//
+
 static void BM_AllocAndFreeUnderContention(benchmark::State& state) {
   size_t num_threads = state.range(0);
   static constexpr int kItersPerThread = 10000;
 
-  BFCAllocator alloc(std::make_unique<MallocSubAllocator>(),
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
                      /*total_memory=*/256 << 20, /*name=*/"bench",
                      BFCAllocator::Options{});
   tsl::thread::ThreadPool threads(tsl::Env::Default(), "bench", num_threads);
@@ -280,7 +717,7 @@ static void BM_AllocAndFreeUnderContention(benchmark::State& state) {
     for (int t = 0; t < num_threads; ++t) {
       threads.Schedule([&] {
         for (int i = 0; i < kItersPerThread; ++i) {
-          void* ptr = alloc.AllocateRaw(kBenchAlignment, kBenchAllocSize);
+          void* ptr = alloc.AllocateRaw(kAlignment, kBenchAllocSize);
           alloc.DeallocateRaw(ptr);
         }
         counter.DecrementCount();
diff --git a/xla/xla.proto b/xla/xla.proto
index 22d844074ff38..0c6ff7059981c 100644
--- a/xla/xla.proto
+++ b/xla/xla.proto
@@ -645,6 +645,11 @@ message DebugOptions {
   // dimension.
   optional bool xla_gpu_enable_all_gather_combine_by_dim = 254;
 
+  // Enables spatial partitioning of the GPU BFC allocator so default and
+  // collective allocations share one fixed address range. Requires BFC
+  // preallocation.
+  optional bool xla_gpu_enable_allocator_spatial_partitioning = 494;
+
   optional bool xla_gpu_enable_analytical_latency_estimator = 255;
 
   // Enables NCCL Speed-of-Light (SoL) analytical cost model

From ba67293791a3d52084a2c7bdc63df8e3d5ff3b2c Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@openxla.org>
Date: Fri, 12 Jun 2026 23:02:26 +0000
Subject: [PATCH 2/2] [tsl] Add ScopedAllocationTrace to help debugging OOMs

---
 xla/pjrt/gpu/BUILD                           |  18 ++--
 xla/pjrt/gpu/se_gpu_pjrt_client.cc           |  28 ++++-
 xla/tsl/framework/BUILD                      |  18 ++++
 xla/tsl/framework/bfc_allocator.cc           |  38 +++++++
 xla/tsl/framework/bfc_allocator.h            |   5 +
 xla/tsl/framework/bfc_allocator_test.cc      |  41 ++++++++
 xla/tsl/framework/scoped_allocation_trace.cc |  84 +++++++++++++++
 xla/tsl/framework/scoped_allocation_trace.h  | 101 +++++++++++++++++++
 8 files changed, 322 insertions(+), 11 deletions(-)
 create mode 100644 xla/tsl/framework/scoped_allocation_trace.cc
 create mode 100644 xla/tsl/framework/scoped_allocation_trace.h

diff --git a/xla/pjrt/gpu/BUILD b/xla/pjrt/gpu/BUILD
index d73a34f66ec72..e7ec790839263 100644
--- a/xla/pjrt/gpu/BUILD
+++ b/xla/pjrt/gpu/BUILD
@@ -63,6 +63,7 @@ cc_library(
         ":gpu_metrics",
         ":se_gpu_pjrt_runtime_abi_version",
         ":se_gpu_topology_description",
+        "//xla:debug_options_flags",
         "//xla:executable_run_options",
         "//xla:future",
         "//xla:literal",
@@ -88,6 +89,7 @@ cc_library(
         "//xla/core/collectives:communicator",
         "//xla/core/collectives:rank_id",
         "//xla/hlo/builder:xla_computation",
+        "//xla/hlo/ir:hlo",
         "//xla/pjrt:async_work_runner",
         "//xla/pjrt:common_pjrt_client",
         "//xla/pjrt:device_event",
@@ -134,8 +136,10 @@ cc_library(
         "//xla/service:transfer_manager",
         "//xla/service/gpu:buffer_allocations",
         "//xla/service/gpu:gpu_constants",
+        "//xla/service/gpu:gpu_executable",
         "//xla/service/gpu:gpu_executable_run_options",
         "//xla/service/gpu:gpu_memory_space_assignment",
+        "//xla/service/gpu:stream_executor_util",
         "//xla/stream_executor:device_address",
         "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:device_description",
@@ -146,6 +150,9 @@ cc_library(
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor:vmm_device_address_allocator",
+        "//xla/stream_executor/cuda:cuda_compute_capability",
+        "//xla/stream_executor/cuda:cuda_device_address_vmm_allocator",
+        "//xla/stream_executor/gpu:gpu_cudamallocasync_allocator",
         "//xla/stream_executor/integrations:device_mem_allocator",
         "//xla/stream_executor/integrations:tf_allocator_adapter",
         "//xla/tsl/concurrency:async_value",
@@ -155,6 +162,7 @@ cc_library(
         "//xla/tsl/framework:bfc_allocator",
         "//xla/tsl/framework:device_id",
         "//xla/tsl/framework:device_id_impl",
+        "//xla/tsl/framework:scoped_allocation_trace",
         "//xla/tsl/lib/strings:proto_serialization",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:errors",
@@ -183,6 +191,7 @@ cc_library(
         "@com_google_absl//absl/time",
         "@com_google_absl//absl/types:span",
         "@llvm-project//mlir:IR",
+        "@local_config_cuda//cuda:cudart_headers",
         "@tsl//tsl/platform:casts",
         "@tsl//tsl/platform:env",
         "@tsl//tsl/platform:errors",
@@ -196,25 +205,16 @@ cc_library(
         "@tsl//tsl/profiler/lib:traceme",
     ] + if_cuda_or_rocm([
         # keep sorted
-        "//xla:debug_options_flags",
         "//xla/service/gpu:gpu_compiler",
-        "//xla/service/gpu:gpu_executable",
-        "//xla/service/gpu:stream_executor_util",
     ]) + if_cuda([
         # keep sorted
-        "//xla/stream_executor/cuda:cuda_compute_capability",
-        "//xla/stream_executor/cuda:cuda_device_address_vmm_allocator",
-        "//xla/stream_executor/gpu:gpu_cudamallocasync_allocator",
         "@local_config_cuda//cuda:cuda_headers",
     ]) + if_rocm([
         # keep sorted
         "@local_config_rocm//rocm:rocm_headers",
     ]) + if_sycl([
         # keep sorted
-        "//xla:debug_options_flags",
         "//xla/service/gpu:gpu_compiler",
-        "//xla/service/gpu:gpu_executable",
-        "//xla/service/gpu:stream_executor_util",
         "@local_config_sycl//sycl:sycl_headers",
     ]),
 )
diff --git a/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index 3f1ab375add4e..a876d9b39dee5 100644
--- a/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -63,6 +63,7 @@ limitations under the License.
 #include "xla/executable_run_options.h"
 #include "xla/future.h"
 #include "xla/hlo/builder/xla_computation.h"
+#include "xla/hlo/ir/hlo_input_output_alias_config.h"
 #include "xla/layout.h"
 #include "xla/pjrt/async_work_runner.h"
 #include "xla/pjrt/buffer_sequencing_event.h"
@@ -118,6 +119,7 @@ limitations under the License.
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/ref_count.h"
 #include "xla/tsl/framework/allocator.h"
+#include "xla/tsl/framework/scoped_allocation_trace.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
@@ -2037,6 +2039,11 @@ StreamExecutorGpuClient::RunAsync(
       "[", device_ordinal, "] GpuExecutable::ExecuteAsyncOnStreamImpl(",
       gpu_exec->name(), ")"));
 
+  // Attribute device memory allocations in this scope to the GPU executable.
+  tsl::ScopedAllocationTrace allocation_trace(
+      "xla.execute",
+      {{"executable", gpu_exec->name()}, {"device", device_ordinal}});
+
   // GpuExecutable always bound to a single GpuContext during its execution, so
   // we activate it once to skip expensive context activations later.
   auto activation = executor->Activate();
@@ -2127,14 +2134,21 @@ StreamExecutorGpuClient::RunAsync(
         }
       } else {
         // Allocate each allocation that might escape, or is the temp buffer.
-        CHECK(allocation.maybe_live_out() ||
-              allocation.IsPreallocatedTempBuffer());
+        bool is_live_out = allocation.maybe_live_out();
+        bool is_temp_buffer = allocation.IsPreallocatedTempBuffer();
+        CHECK(is_live_out || is_temp_buffer);  // Crash OK
+
         int64_t buffer_size = allocation.size();
         if (auto it = allocate_granularity.find(allocation.color());
             it != allocate_granularity.end()) {
           buffer_size = RoundUpTo(buffer_size, it->second);
         }
         if (buffer_size > 0) {
+          tsl::ScopedAllocationTrace allocation_trace(
+              "xla.buffer", {{"kind", is_temp_buffer ? "temp" : "live_out"},
+                             {"allocation_index", i},
+                             {"requested_bytes", buffer_size},
+                             {"memory_space", allocation.color()}});
           ASSIGN_OR_RETURN(
               se::ScopedDeviceAddress<uint8_t> owning_buffer,
               memory_allocator->Allocate(device_ordinal, buffer_size,
@@ -2199,6 +2213,16 @@ StreamExecutorGpuClient::RunAsync(
                "buffer is not donated; allocating a fresh buffer";
         int64_t allocation_size = ShapeUtil::ByteSizeOf(
             ShapeUtil::GetSubshape(gpu_exec->result_shape(), index));
+        const HloInputOutputAliasConfig::Alias& alias =
+            *output_info.alias_config;
+        const bool must_alias = alias.must_alias();
+        tsl::ScopedAllocationTrace copy_protection_trace(
+            "xla.buffer",
+            {{"kind", "live_out_copy_protection"},
+             {"allocation_index", output_info.allocation_index},
+             {"requested_bytes", allocation_size},
+             {"memory_space", allocation->color()},
+             {"alias_kind", must_alias ? "must_alias" : "may_alias"}});
         absl::StatusOr<se::ScopedDeviceAddress<uint8_t>> allocated_buffer =
             memory_allocator->Allocate(device_ordinal, allocation_size,
                                        /*retry_on_failure=*/true,
diff --git a/xla/tsl/framework/BUILD b/xla/tsl/framework/BUILD
index d923ccca00fbe..8c23401aff40b 100644
--- a/xla/tsl/framework/BUILD
+++ b/xla/tsl/framework/BUILD
@@ -185,6 +185,20 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "scoped_allocation_trace",
+    srcs = ["scoped_allocation_trace.cc"],
+    hdrs = ["scoped_allocation_trace.h"],
+    features = ["parse_headers"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//xla/tsl/platform:logging",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:string_view",
+    ],
+)
+
 cc_library(
     name = "bfc_allocator",
     srcs = [
@@ -198,6 +212,7 @@ cc_library(
     deps = [
         ":allocator",
         ":metrics",
+        ":scoped_allocation_trace",
         ":shared_counter",
         "//xla/tsl/lib/core:bits",
         "//xla/tsl/platform:env",
@@ -226,13 +241,16 @@ tsl_cc_test(
     deps = [
         ":allocator",
         ":bfc_allocator",
+        ":scoped_allocation_trace",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:env_impl",  # buildcleaner: keep
         "//xla/tsl/platform:test",
         "//xla/tsl/platform:test_benchmark",
         "//xla/tsl/platform:test_main",
         "@com_google_absl//absl/base",
+        "@com_google_absl//absl/base:log_severity",
         "@com_google_absl//absl/base:no_destructor",
+        "@com_google_absl//absl/log:scoped_mock_log",
         "@com_google_absl//absl/synchronization",
     ],
 )
diff --git a/xla/tsl/framework/bfc_allocator.cc b/xla/tsl/framework/bfc_allocator.cc
index d14c96c8b4688..384a80d34437e 100644
--- a/xla/tsl/framework/bfc_allocator.cc
+++ b/xla/tsl/framework/bfc_allocator.cc
@@ -35,10 +35,12 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "absl/numeric/bits.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
 #include "xla/tsl/framework/allocator.h"
 #include "xla/tsl/framework/allocator_retry.h"
+#include "xla/tsl/framework/scoped_allocation_trace.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/file_system.h"
 #include "xla/tsl/platform/logging.h"
@@ -56,6 +58,25 @@ const uint64_t kDefaultMemoryFilterMask = tsl::profiler::TraceMeFiltersToMask(
 
 constexpr BFCAllocator::ChunkHandle BFCAllocator::kInvalidChunkHandle;
 
+static std::string AllocationAnnotationFrameDebugString(
+    const ScopedAllocationTrace::Frame& frame) {
+  if (frame.args.empty()) {
+    return frame.name;  // No args: bare name without braces.
+  }
+  return absl::StrCat(frame.name, "{",  // Renders as name{k1=v1, k2=v2}.
+                      absl::StrJoin(frame.args, ", ", absl::PairFormatter("=")),
+                      "}");
+}
+
+static std::string AllocationAnnotationSnapshotDebugString(
+    const ScopedAllocationTrace::Snapshot& snapshot) {
+  return absl::StrJoin(
+      snapshot.frames, " / ",  // Frames are joined in stored order.
+      [](std::string* out, const ScopedAllocationTrace::Frame& frame) {
+        absl::StrAppend(out, AllocationAnnotationFrameDebugString(frame));
+      });
+}
+
 BFCAllocator::BFCAllocator(std::unique_ptr<SubAllocator> sub_allocator,
                            size_t total_memory, const std::string& name,
                            const Options& opts)
@@ -223,6 +244,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
   c->next = kInvalidChunkHandle;
   c->freed_at_count = 0;
   c->tag = free_chunk_tag_;
+  c->allocation_annotation.reset();
 
   region_manager_.set_handle(c->ptr, h);
 
@@ -264,6 +286,7 @@ void BFCAllocator::DeallocateChunk(ChunkHandle h) {
   Chunk* c = ChunkFromHandle(h);
   c->allocation_id = -1;
   c->bin_num = kInvalidBinNum;
+  c->allocation_annotation.reset();
   c->next = unused_chunk_handle_head_;
   unused_chunk_handle_head_ = h;
 }
@@ -839,6 +862,13 @@ void BFCAllocator::FinishChunkAllocation(Chunk* chunk, size_t num_bytes) {
   // Assign a unique id and increment the id counter, marking the chunk as being
   // in use.
   chunk->allocation_id = next_allocation_id_++;
+  ScopedAllocationTrace::Snapshot allocation_annotation =
+      ScopedAllocationTrace::Current();
+  if (allocation_annotation.frames.empty()) {
+    chunk->allocation_annotation.reset();
+  } else {
+    chunk->allocation_annotation = std::move(allocation_annotation);
+  }
 
   // Update stats.
   ++stats_.num_allocs;
@@ -901,6 +931,8 @@ void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
   // It inherits the tag; callers update the in-use piece after splitting.
   new_chunk->tag = c->tag;
 
+  new_chunk->allocation_annotation.reset();
+
   // Maintain the pointers.
   // c <-> c_neighbor becomes
   // c <-> new_chunk <-> c_neighbor
@@ -1088,6 +1120,7 @@ void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) {
 
   // Mark the chunk as no longer in use.
   c->allocation_id = -1;
+  c->allocation_annotation.reset();
 
   // Optionally record the free time. Timestamped chunks are kept in their
   // original lower/upper tag until they become safe to merge; otherwise a
@@ -1392,6 +1425,11 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes,
                         c->action_count, " step ", c->step_id);
       }
 #endif
+      if (c->in_use() && c->allocation_annotation.has_value()) {
+        absl::StrAppend(
+            &buf, " allocation_annotation ",
+            AllocationAnnotationSnapshotDebugString(*c->allocation_annotation));
+      }
       absl::StrAppend(&buf, " next ", c->next);
       if (timing_counter_) {
         absl::StrAppend(&buf, " freed_at_count ", c->freed_at_count);
diff --git a/xla/tsl/framework/bfc_allocator.h b/xla/tsl/framework/bfc_allocator.h
index 1c0ba329018bc..4601fddc5e5f7 100644
--- a/xla/tsl/framework/bfc_allocator.h
+++ b/xla/tsl/framework/bfc_allocator.h
@@ -36,6 +36,7 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "xla/tsl/framework/allocator.h"
 #include "xla/tsl/framework/allocator_retry.h"
+#include "xla/tsl/framework/scoped_allocation_trace.h"
 #include "xla/tsl/framework/shared_counter.h"
 #include "xla/tsl/lib/core/bits.h"
 #include "xla/tsl/platform/logging.h"
@@ -327,6 +328,10 @@ class BFCAllocator : public Allocator {
     // rejoin the gap.
     ChunkTag tag = ChunkTag::kCentralGap;
 
+    // Snapshot of the thread-local allocation annotation stack captured when
+    // this chunk became in-use. Cleared when the chunk is freed.
+    std::optional<ScopedAllocationTrace::Snapshot> allocation_annotation;
+
     bool in_use() const { return allocation_id != -1; }
 
 #ifdef TENSORFLOW_MEM_DEBUG
diff --git a/xla/tsl/framework/bfc_allocator_test.cc b/xla/tsl/framework/bfc_allocator_test.cc
index 0d2b878939fe5..eeb8376b915e6 100644
--- a/xla/tsl/framework/bfc_allocator_test.cc
+++ b/xla/tsl/framework/bfc_allocator_test.cc
@@ -29,9 +29,12 @@ limitations under the License.
 #include <vector>
 
 #include "absl/base/casts.h"
+#include "absl/base/log_severity.h"
 #include "absl/base/no_destructor.h"
+#include "absl/log/scoped_mock_log.h"
 #include "absl/synchronization/blocking_counter.h"
 #include "xla/tsl/framework/allocator.h"
+#include "xla/tsl/framework/scoped_allocation_trace.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/test.h"
 #include "xla/tsl/platform/test_benchmark.h"
@@ -40,6 +43,11 @@ limitations under the License.
 namespace tsl {
 namespace {
 
+using ::testing::_;
+using ::testing::AllOf;
+using ::testing::AtLeast;
+using ::testing::HasSubstr;
+
 static constexpr size_t kAlignment = Allocator::kAllocatorAlignment;
 
 static const absl::NoDestructor<AllocationAttributes> kUpper(
@@ -111,6 +119,39 @@ TEST(BFCAllocatorTest, DefaultAlignment) {
   alloc.DeallocateRaw(ptr);
 }
 
+TEST(BFCAllocatorTest, OomLogsAllocationAnnotations) {
+  BFCAllocator::Options opts;
+  opts.allow_growth = false;
+  opts.allow_retry_on_failure = false;
+  BFCAllocator alloc(std::make_unique<FakeSubAllocator>(),
+                     /*total_memory=*/1024, /*name=*/"annotated", opts);
+
+  void* ptr = nullptr;
+  {
+    ScopedAllocationTrace exec_scope("xla.execute",
+                                     {{"executable", "module"}, {"device", 7}});
+    ScopedAllocationTrace buffer_scope(
+        "xla.buffer", {{"kind", "live_out"}, {"allocation_index", 3}});
+    ptr = alloc.AllocateRaw(kAlignment, 512);  // Made under both scopes.
+  }
+  ASSERT_NE(ptr, nullptr);
+
+  absl::ScopedMockLog log(absl::MockLogDefault::kIgnoreUnexpected);
+  EXPECT_CALL(
+      log,
+      Log(absl::LogSeverity::kInfo, _,
+          AllOf(HasSubstr("InUse at"), HasSubstr("allocation_annotation"),
+                HasSubstr("xla.execute{executable=module, device=7}"),
+                HasSubstr("xla.buffer{kind=live_out, allocation_index=3}"))))
+      .Times(AtLeast(1));
+  log.StartCapturingLogs();
+
+  EXPECT_EQ(alloc.AllocateRaw(kAlignment, 2048), nullptr);  // Exceeds pool.
+
+  log.StopCapturingLogs();
+  alloc.DeallocateRaw(ptr);
+}
+
 // Parameterized test that verifies alignment is respected for various
 // power-of-two alignments from 32 bytes to 4096 bytes.
 class BFCAllocatorAlignmentTest : public ::testing::TestWithParam<size_t> {};
diff --git a/xla/tsl/framework/scoped_allocation_trace.cc b/xla/tsl/framework/scoped_allocation_trace.cc
new file mode 100644
index 0000000000000..5b351b1452cba
--- /dev/null
+++ b/xla/tsl/framework/scoped_allocation_trace.cc
@@ -0,0 +1,84 @@
+/* Copyright 2026 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/tsl/framework/scoped_allocation_trace.h"
+
+#include <initializer_list>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/base/attributes.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "xla/tsl/platform/logging.h"
+
+namespace tsl {
+namespace {
+
+struct AnnotationState {
+  std::vector<ScopedAllocationTrace::Frame> frames;  // Outermost scope first.
+};
+
+AnnotationState& ThreadAnnotationState() {
+  static thread_local AnnotationState state;  // NOLINT
+  return state;
+}
+
+}  // namespace
+
+ScopedAllocationTrace::Arg::Arg(absl::string_view key,
+                                const absl::AlphaNum& value
+                                    ABSL_ATTRIBUTE_LIFETIME_BOUND)
+    : key(key), value(value.Piece()) {}
+
+ScopedAllocationTrace::Frame::Frame(absl::string_view name) : name(name) {}
+
+ScopedAllocationTrace::Frame& ScopedAllocationTrace::Frame::Add(
+    absl::string_view key, const absl::AlphaNum& value) {
+  args.emplace_back(std::string(key), std::string(value.Piece()));
+  return *this;
+}
+
+ScopedAllocationTrace::Snapshot::Snapshot(std::vector<Frame> frames)
+    : frames(std::move(frames)) {}
+
+ScopedAllocationTrace::ScopedAllocationTrace(absl::string_view name,
+                                             std::initializer_list<Arg> args) {
+  Frame frame(name);
+  frame.args.reserve(args.size());
+  for (const Arg& arg : args) {  // Copy each key/value into owned strings.
+    frame.args.emplace_back(std::string(arg.key), std::string(arg.value));
+  }
+  AnnotationState& state = ThreadAnnotationState();
+  state.frames.push_back(std::move(frame));  // Popped by the destructor.
+}
+
+ScopedAllocationTrace::ScopedAllocationTrace(Frame frame) {
+  AnnotationState& state = ThreadAnnotationState();
+  state.frames.push_back(std::move(frame));
+}
+
+ScopedAllocationTrace::~ScopedAllocationTrace() {
+  AnnotationState& state = ThreadAnnotationState();
+  DCHECK(!state.frames.empty());  // Each constructor pushed one frame.
+  state.frames.pop_back();
+}
+
+ScopedAllocationTrace::Snapshot ScopedAllocationTrace::Current() {
+  return Snapshot(ThreadAnnotationState().frames);  // Copies the frames.
+}
+
+}  // namespace tsl
diff --git a/xla/tsl/framework/scoped_allocation_trace.h b/xla/tsl/framework/scoped_allocation_trace.h
new file mode 100644
index 0000000000000..452fb675b9d2b
--- /dev/null
+++ b/xla/tsl/framework/scoped_allocation_trace.h
@@ -0,0 +1,101 @@
+/* Copyright 2026 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_TSL_FRAMEWORK_SCOPED_ALLOCATION_TRACE_H_
+#define XLA_TSL_FRAMEWORK_SCOPED_ALLOCATION_TRACE_H_
+
+#include <initializer_list>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/base/attributes.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+
+namespace tsl {
+
+// Thread-local metadata for passing higher-level allocation details down to
+// allocators. XLA/PJRT code can describe what an allocation represents, and
+// allocators may snapshot that stack to connect low-level chunks back to
+// high-level execution state during OOM diagnostics.
+//
+// This complements ScopedMemoryDebugAnnotation: that API exposes current
+// pending op/shape metadata for memory profiling, while this API keeps explicit
+// allocation trace frames for allocator diagnostics. This is scoped to the
+// current thread and does not automatically propagate to other threads. This
+// differs from third_party/tsl/tsl/platform/context.h, whose context can be
+// automatically captured and propagated through XLA thread pools.
+//
+// Allocator implementations may optionally snapshot Current() when a buffer
+// becomes live and attach it to internal metadata. Callers should not assume
+// every allocator records it.
+//
+// Example:
+//
+//   ScopedAllocationTrace exec_scope(
+//       "xla.execute",
+//       {{"executable", executable_name}, {"device", device_ordinal}});
+//
+//   void* ptr = allocator->AllocateRaw(alignment, bytes);
+//
+class ScopedAllocationTrace {
+ public:
+  // Non-owning key/value pair; only valid while the viewed arguments live.
+  struct Arg {
+    Arg(absl::string_view key,
+        const absl::AlphaNum& value ABSL_ATTRIBUTE_LIFETIME_BOUND);
+
+    Arg(const Arg&) = delete;
+    void operator=(const Arg&) = delete;
+
+    absl::string_view key;
+    absl::string_view value;
+  };
+
+  // Single allocation trace scope frame; owns its name and argument strings.
+  struct Frame {
+    explicit Frame(absl::string_view name);
+
+    // Appends an owned copy of `key`/`value`; returns *this for chaining.
+    Frame& Add(absl::string_view key, const absl::AlphaNum& value);
+
+    std::string name;
+    std::vector<std::pair<std::string, std::string>> args;
+  };
+
+  // Copy of the current thread-local trace frame stack.
+  struct Snapshot {
+    explicit Snapshot(std::vector<Frame> frames);
+
+    std::vector<Frame> frames;
+  };
+
+  // Pushes one frame onto the calling thread's stack until destruction.
+  explicit ScopedAllocationTrace(absl::string_view name,
+                                 std::initializer_list<Arg> args = {});
+  explicit ScopedAllocationTrace(Frame frame);
+
+  ScopedAllocationTrace(ScopedAllocationTrace&&) = delete;
+  ~ScopedAllocationTrace();
+
+  // Returns a copy of the current thread's annotation stack. The returned
+  // snapshot is independent from later scope changes and can have no frames.
+  static Snapshot Current();
+};
+
+}  // namespace tsl
+
+#endif  // XLA_TSL_FRAMEWORK_SCOPED_ALLOCATION_TRACE_H_