From f27ee56438c7fbfc8b0ce98654697a059af8394f Mon Sep 17 00:00:00 2001
From: Polykarpos Thomadakis <polykarpos.thomadakis@pnnl.gov>
Date: Fri, 27 Jun 2025 12:29:29 -0700
Subject: [PATCH 1/3] (#112) [GPU] Avoid redundant data transfers depending on
 the type of access required in a kernel (read/write/read-write)

---
 .../BlockedGpuToTriton/BlockedGpuToTriton.cpp |  3 +-
 .../GpuToBlockedGpu/GpuToBlockedGpu.cpp       | 34 +++++++++++++++
 .../PrepareGpuHost/PrepareGpuHost.cpp         | 41 +++++++++++++++----
 3 files changed, 70 insertions(+), 8 deletions(-)
diff --git a/lib/Conversion/BlockedGpuToTriton/BlockedGpuToTriton.cpp b/lib/Conversion/BlockedGpuToTriton/BlockedGpuToTriton.cpp
index 4d505d76..de19eab8 100644
--- a/lib/Conversion/BlockedGpuToTriton/BlockedGpuToTriton.cpp
+++ b/lib/Conversion/BlockedGpuToTriton/BlockedGpuToTriton.cpp
@@ -103,7 +103,8 @@ class ConvertGpuFuncToTritonFunc : public OpConversionPattern<mlir::gpu::GPUFunc
         rewriter.create<func::ReturnOp>(gpuFunc.getLoc());
         newFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                            rewriter.getUnitAttr());
-        
+        newFunc.setArgAttrsAttr(gpuFunc.getArgAttrsAttr());
+                
         return success();
     }
 
diff --git a/lib/Conversion/GpuToBlockedGpu/GpuToBlockedGpu.cpp b/lib/Conversion/GpuToBlockedGpu/GpuToBlockedGpu.cpp
index cc86fa94..73179199 100644
--- a/lib/Conversion/GpuToBlockedGpu/GpuToBlockedGpu.cpp
+++ b/lib/Conversion/GpuToBlockedGpu/GpuToBlockedGpu.cpp
@@ -197,6 +197,40 @@ class ConvertGpuToBlockedGpu: public CometGpuToBlockedGpuBase<ConvertGpuToBlocke
         mlir::gpu::GPUFuncOp funcOp = getOperation();
         mlir::OpBuilder builder(funcOp);
 
+        for(auto memrefArg : funcOp.getArguments())
+        {
+            if(mlir::isa<mlir::MemRefType>(memrefArg.getType()))
+            {
+                std::vector<Operation*> toExamine;
+                toExamine.insert(toExamine.end(), memrefArg.getUsers().begin(), memrefArg.getUsers().end());
+                for(size_t i = 0; i < toExamine.size(); i++)
+                {
+                    auto user = toExamine[i];
+                    if(mlir::isa<mlir::memref::LoadOp>(user))
+                    {
+                        funcOp.setArgAttr(memrefArg.getArgNumber(), "gpu.read", builder.getUnitAttr());
+                    }
+                    else if(mlir::isa<mlir::memref::StoreOp>(user))
+                    {
+                        funcOp.setArgAttr(memrefArg.getArgNumber(), "gpu.write", builder.getUnitAttr());
+                    }
+
+                    if(user->getNumResults() > 0)
+                    {
+                        for(auto res: user->getResults())
+                        {
+                            if(isa<MemRefType>(res.getType()))
+                            {
+                                // If the result is a memref, we need to check its users as well
+                                // to see if it is used in a store or load operation
+                                toExamine.insert(toExamine.end(), res.getUsers().begin(), res.getUsers().end());
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
         for(auto arg: funcOp.getArguments())
         {
             if(mlir::isa<mlir::MemRefType>(arg.getType()))
diff --git a/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp b/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp
index f3452174..1ad3cabe 100644
--- a/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp
+++ b/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp
@@ -1,4 +1,7 @@
 #include "comet/Conversion/PrepareGpuHost/PrepareGpuHostPass.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -26,6 +29,7 @@ class PrepareGpuHost
 
     std::map<std::string, Value> funcs;
     std::map<std::string, std::string> gpu_to_triton_kernel;
+    std::map<std::string, func::FuncOp> gpu_name_to_funcOp;
     std::map<std::string, triton::FuncOp> triton_name_to_triton_func_op;
     auto gpuModules = modOp.getOps<gpu::GPUModuleOp>();
     for(auto gpuModuleOp: gpuModules)
@@ -33,17 +37,20 @@ class PrepareGpuHost
         auto funcOps = gpuModuleOp.getOps<mlir::func::FuncOp>();
         for(func::FuncOp funcOp: llvm::make_early_inc_range(funcOps))
         {
+            std::map<size_t, std::vector<Attribute>> argsToSet;
             if(!funcOp->hasAttr(gpu::GPUDialect::getKernelFuncAttrName()))
             {
                 continue;
             }
             builder.setInsertionPoint(funcOp);
             SmallVector<Type, 4> newTypes;
-            for(auto argType: funcOp.getArgumentTypes())
+            for(auto arg: funcOp.getArguments())
             {
+                auto argType = arg.getType();
                 newTypes.push_back(argType);
                 if(MemRefType rankedType = dyn_cast<mlir::MemRefType>(argType))
                 {
+                    argsToSet[newTypes.size() - 1] = {funcOp.getArgAttr(arg.getArgNumber(), "gpu.read"), funcOp.getArgAttr(arg.getArgNumber(), "gpu.write")};
                     if(rankedType.hasRank())
                     {
                         newTypes.push_back(builder.getIndexType());
@@ -71,6 +78,18 @@ class PrepareGpuHost
             builder.create<func::ReturnOp>(funcOp.getLoc());
             newFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                                 builder.getUnitAttr());
+            for(auto [argNumber, attrs]: argsToSet)
+            {
+                if(attrs[0])
+                {
+                    newFunc.setArgAttr(argNumber, "gpu.read", attrs[0]);
+                }
+                if(attrs[1])
+                {
+                    newFunc.setArgAttr(argNumber, "gpu.write", attrs[1]);
+                }
+            }
+            gpu_name_to_funcOp[funcOp.getName().str()] = newFunc;
             funcOp->erase();
         }
     }
@@ -185,14 +204,22 @@ class PrepareGpuHost
                 if (mlir::gpu::LaunchFuncOp launchOp =
                         dyn_cast<mlir::gpu::LaunchFuncOp>(use.getOwner())) {
                     builder.setInsertionPoint(launchOp);
-                    builder.create<mlir::gpu::MemcpyOp>(launchOp->getLoc(), TypeRange(),
-                                                        ValueRange(),
-                                                        gpuAlloc.getMemref(), alloc);
+                    int offset = launchOp->getNumOperands() - launchOp.getNumKernelOperands();
+                    int operNum = use.getOperandNumber() - offset;
+                    if(gpu_name_to_funcOp[launchOp.getKernelName().str()].getArgAttr(operNum, "gpu.read"))
+                    {
+                        builder.create<mlir::gpu::MemcpyOp>(launchOp->getLoc(), TypeRange(),
+                                                            ValueRange(),
+                                                            gpuAlloc.getMemref(), alloc);
+                    }
                     use.set(gpuAlloc.getMemref());
                     builder.setInsertionPointAfter(launchOp);
-                    builder.create<mlir::gpu::MemcpyOp>(launchOp->getLoc(), TypeRange(),
-                                                        ValueRange(), alloc,
-                                                        gpuAlloc.getMemref());
+                    if(gpu_name_to_funcOp[launchOp.getKernelName().str()].getArgAttr(operNum, "gpu.write"))
+                    {
+                                            builder.create<mlir::gpu::MemcpyOp>(launchOp->getLoc(), TypeRange(),
+                                                                                ValueRange(), alloc,
+                                                                                gpuAlloc.getMemref());
+                    }
                 }
             }
         }

From 9fce842bcf55ba7df2357ea23906b72129d3f811 Mon Sep 17 00:00:00 2001
From: Polykarpos Thomadakis <polykarpos.thomadakis@pnnl.gov>
Date: Fri, 27 Jun 2025 15:49:38 -0700
Subject: [PATCH 2/3] [GPU] Added simple heuristic to detect cases where data
 are already in the device (because of a previous operation). Currently it's
 quite conservative as it does not track memref aliases and will just copy if
 an alias is detected.

---
 .../PrepareGpuHost/PrepareGpuHost.cpp         | 92 +++++++++++++++++--
 1 file changed, 85 insertions(+), 7 deletions(-)

diff --git a/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp b/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp
index 1ad3cabe..af3a5a15 100644
--- a/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp
+++ b/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp
@@ -1,7 +1,9 @@
 #include "comet/Conversion/PrepareGpuHost/PrepareGpuHostPass.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Attributes.h"
+#include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Operation.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -208,17 +210,19 @@ class PrepareGpuHost
                     int operNum = use.getOperandNumber() - offset;
                     if(gpu_name_to_funcOp[launchOp.getKernelName().str()].getArgAttr(operNum, "gpu.read"))
                     {
-                        builder.create<mlir::gpu::MemcpyOp>(launchOp->getLoc(), TypeRange(),
+                        auto gpuMemCpy = builder.create<mlir::gpu::MemcpyOp>(launchOp->getLoc(), TypeRange(),
                                                             ValueRange(),
                                                             gpuAlloc.getMemref(), alloc);
+                        gpuMemCpy->setAttr("gpu.read", builder.getUnitAttr());
                     }
                     use.set(gpuAlloc.getMemref());
                     builder.setInsertionPointAfter(launchOp);
                     if(gpu_name_to_funcOp[launchOp.getKernelName().str()].getArgAttr(operNum, "gpu.write"))
                     {
-                                            builder.create<mlir::gpu::MemcpyOp>(launchOp->getLoc(), TypeRange(),
-                                                                                ValueRange(), alloc,
-                                                                                gpuAlloc.getMemref());
+                        auto gpuMemCpy = builder.create<mlir::gpu::MemcpyOp>(launchOp->getLoc(), TypeRange(),
+                                                            ValueRange(), alloc,
+                                                            gpuAlloc.getMemref());
+                        gpuMemCpy->setAttr("gpu.write", builder.getUnitAttr());
                     }
                 }
             }
@@ -241,11 +245,85 @@ class PrepareGpuHost
         gpuAllocs.push_back(gpuAllocOp);
     });
 
-    std::vector<mlir::gpu::MemcpyOp> gpuCopies;
-    modOp->walk([&gpuCopies](mlir::gpu::MemcpyOp gpuCopy) {
-        gpuCopies.push_back(gpuCopy);
+
+    std::map<void*, std::vector<Operation*>> memEffects;
+    modOp->walk([&uniqueGpuAllocs, &memEffects](Operation* memEffect) {
+        for(auto op: memEffect->getOperands())
+        {
+            if(uniqueGpuAllocs.find(op) != uniqueGpuAllocs.end())
+            {
+                memEffects[op.getAsOpaquePointer()].push_back(memEffect);
+            }
+        }
     });
 
+    for(auto& [memref, effects]: memEffects)
+    {
+        bool copyIn = true;
+        std::vector<Operation*> copyDelete;
+        for(size_t i = 0; i < effects.size(); i++)
+        {
+            if(mlir::gpu::MemcpyOp gpuCopy = mlir::dyn_cast<mlir::gpu::MemcpyOp>(effects[i]))
+            {
+                if(!copyIn & gpuCopy->hasAttr("gpu.read"))
+                {
+                    // gpuCopy.dump();
+                    gpuCopy->erase();
+                }
+                else if(gpuCopy->hasAttr("gpu.read"))
+                {
+                    copyIn = false;
+                }
+                else if(gpuCopy->hasAttr("gpu.write"))
+                {
+                    copyIn = false;
+                    copyDelete.push_back(gpuCopy);
+                }
+            }
+            else if(mlir::memref::StoreOp store = mlir::dyn_cast<mlir::memref::StoreOp>(effects[i]))
+            {
+                copyIn = true;
+                if(!copyDelete.empty())
+                {
+                    copyDelete.pop_back();
+                }
+            }
+            else if(mlir::memref::CopyOp copy = mlir::dyn_cast<mlir::memref::CopyOp>(effects[i]))
+            {
+                if(copy.getTarget().getAsOpaquePointer() == memref)
+                {
+                    copyIn = true;
+                }
+            }
+            else if(mlir::memref::LoadOp load = mlir::dyn_cast<mlir::memref::LoadOp>(effects[i]))
+            {
+                if(!copyDelete.empty())
+                {
+                    copyDelete.pop_back();
+                }
+            }
+            else if(isa<memref::ExtractStridedMetadataOp, memref::DimOp, memref::RankOp>(effects[i]))
+            {
+                continue;
+            }
+            else // Unknown operation, be conservative
+            {
+                effects[i]->dump();
+                copyIn = true;
+                if(!copyDelete.empty())
+                {
+                    copyDelete.pop_back();
+                }  
+            }
+        }
+
+        for(auto toDelete: copyDelete)
+        {
+            // toDelete->dump();
+            toDelete->erase();
+        }
+    }
+
   }
 };
 

From 62b49d9c6877ce980bc4f692de0e1dbdfad13e00 Mon Sep 17 00:00:00 2001
From: Polykarpos Thomadakis <polykarpos.thomadakis@pnnl.gov>
Date: Fri, 27 Jun 2025 15:50:23 -0700
Subject: [PATCH 3/3] Removed debugging output

---
 lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp b/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp
index af3a5a15..62ebabda 100644
--- a/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp
+++ b/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp
@@ -267,7 +267,6 @@ class PrepareGpuHost
             {
                 if(!copyIn & gpuCopy->hasAttr("gpu.read"))
                 {
-                    // gpuCopy.dump();
                     gpuCopy->erase();
                 }
                 else if(gpuCopy->hasAttr("gpu.read"))
@@ -308,7 +307,6 @@ class PrepareGpuHost
             }
             else // Unknown operation, be conservative
             {
-                effects[i]->dump();
                 copyIn = true;
                 if(!copyDelete.empty())
                 {
@@ -319,7 +317,6 @@ class PrepareGpuHost
 
         for(auto toDelete: copyDelete)
         {
-            // toDelete->dump();
             toDelete->erase();
         }
     }