From f27ee56438c7fbfc8b0ce98654697a059af8394f Mon Sep 17 00:00:00 2001 From: Polykarpos Thomadakis Date: Fri, 27 Jun 2025 12:29:29 -0700 Subject: [PATCH 1/3] (#112) [GPU] Avoid redundant data transfers depending on the type of access required in a kernel (read/write/read-write) --- .../BlockedGpuToTriton/BlockedGpuToTriton.cpp | 3 +- .../GpuToBlockedGpu/GpuToBlockedGpu.cpp | 34 +++++++++++++++ .../PrepareGpuHost/PrepareGpuHost.cpp | 41 +++++++++++++++---- 3 files changed, 70 insertions(+), 8 deletions(-) diff --git a/lib/Conversion/BlockedGpuToTriton/BlockedGpuToTriton.cpp b/lib/Conversion/BlockedGpuToTriton/BlockedGpuToTriton.cpp index 4d505d76..de19eab8 100644 --- a/lib/Conversion/BlockedGpuToTriton/BlockedGpuToTriton.cpp +++ b/lib/Conversion/BlockedGpuToTriton/BlockedGpuToTriton.cpp @@ -103,7 +103,8 @@ class ConvertGpuFuncToTritonFunc : public OpConversionPattern(gpuFunc.getLoc()); newFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(), rewriter.getUnitAttr()); - + newFunc.setArgAttrsAttr(gpuFunc.getArgAttrsAttr()); + return success(); } diff --git a/lib/Conversion/GpuToBlockedGpu/GpuToBlockedGpu.cpp b/lib/Conversion/GpuToBlockedGpu/GpuToBlockedGpu.cpp index cc86fa94..73179199 100644 --- a/lib/Conversion/GpuToBlockedGpu/GpuToBlockedGpu.cpp +++ b/lib/Conversion/GpuToBlockedGpu/GpuToBlockedGpu.cpp @@ -197,6 +197,40 @@ class ConvertGpuToBlockedGpu: public CometGpuToBlockedGpuBase(memrefArg.getType())) + { + std::vector toExamine; + toExamine.insert(toExamine.end(), memrefArg.getUsers().begin(), memrefArg.getUsers().end()); + for(size_t i = 0; i < toExamine.size(); i++) + { + auto user = toExamine[i]; + if(mlir::isa(user)) + { + funcOp.setArgAttr(memrefArg.getArgNumber(), "gpu.read", builder.getUnitAttr()); + } + else if(mlir::isa(user)) + { + funcOp.setArgAttr(memrefArg.getArgNumber(), "gpu.write", builder.getUnitAttr()); + } + + if(user->getNumResults() > 0) + { + for(auto res: user->getResults()) + { + if(isa(res.getType())) + { + // If the result is a memref, we need to check its users as well + // to see if it is used in a store or load operation + toExamine.insert(toExamine.end(), res.getUsers().begin(), res.getUsers().end()); + } + } + } + } + } + } + for(auto arg: funcOp.getArguments()) { if(mlir::isa(arg.getType())) diff --git a/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp b/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp index f3452174..1ad3cabe 100644 --- a/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp +++ b/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp @@ -1,4 +1,7 @@ #include "comet/Conversion/PrepareGpuHost/PrepareGpuHostPass.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/Pass/Pass.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" @@ -26,6 +29,7 @@ class PrepareGpuHost std::map funcs; std::map gpu_to_triton_kernel; + std::map gpu_name_to_funcOp; std::map triton_name_to_triton_func_op; auto gpuModules = modOp.getOps(); for(auto gpuModuleOp: gpuModules) @@ -33,17 +37,20 @@ class PrepareGpuHost auto funcOps = gpuModuleOp.getOps(); for(func::FuncOp funcOp: llvm::make_early_inc_range(funcOps)) { + std::map> argsToSet; if(!funcOp->hasAttr(gpu::GPUDialect::getKernelFuncAttrName())) { continue; } builder.setInsertionPoint(funcOp); SmallVector newTypes; - for(auto argType: funcOp.getArgumentTypes()) + for(auto arg: funcOp.getArguments()) { + auto argType = arg.getType(); newTypes.push_back(argType); if(MemRefType rankedType = dyn_cast(argType)) { + argsToSet[newTypes.size() - 1] = {funcOp.getArgAttr(arg.getArgNumber(), "gpu.read"), funcOp.getArgAttr(arg.getArgNumber(), "gpu.write")}; if(rankedType.hasRank()) { newTypes.push_back(builder.getIndexType()); @@ -71,6 +78,18 @@ class PrepareGpuHost builder.create(funcOp.getLoc()); newFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(), builder.getUnitAttr()); + for(auto [argNumber, attrs]: argsToSet) + { + if(attrs[0]) + { + newFunc.setArgAttr(argNumber, "gpu.read", attrs[0]); + } + if(attrs[1]) + { + newFunc.setArgAttr(argNumber, "gpu.write", attrs[1]); + } + } + gpu_name_to_funcOp[funcOp.getName().str()] = newFunc; funcOp->erase(); } } @@ -185,14 +204,22 @@ class PrepareGpuHost if (mlir::gpu::LaunchFuncOp launchOp = dyn_cast(use.getOwner())) { builder.setInsertionPoint(launchOp); - builder.create(launchOp->getLoc(), TypeRange(), - ValueRange(), - gpuAlloc.getMemref(), alloc); + int offset = launchOp->getNumOperands() - launchOp.getNumKernelOperands(); + int operNum = use.getOperandNumber() - offset; + if(gpu_name_to_funcOp[launchOp.getKernelName().str()].getArgAttr(operNum, "gpu.read")) + { + builder.create(launchOp->getLoc(), TypeRange(), + ValueRange(), + gpuAlloc.getMemref(), alloc); + } use.set(gpuAlloc.getMemref()); builder.setInsertionPointAfter(launchOp); - builder.create(launchOp->getLoc(), TypeRange(), - ValueRange(), alloc, - gpuAlloc.getMemref()); + if(gpu_name_to_funcOp[launchOp.getKernelName().str()].getArgAttr(operNum, "gpu.write")) + { + builder.create(launchOp->getLoc(), TypeRange(), + ValueRange(), alloc, + gpuAlloc.getMemref()); + } } } } From 9fce842bcf55ba7df2357ea23906b72129d3f811 Mon Sep 17 00:00:00 2001 From: Polykarpos Thomadakis Date: Fri, 27 Jun 2025 15:49:38 -0700 Subject: [PATCH 2/3] [GPU] Added simple heuristic to detect cases where data are already in the device (because of a previous operation). Currently it's quite conservative as it does not track memref aliases and will just copy if an alias is detected. --- .../PrepareGpuHost/PrepareGpuHost.cpp | 92 +++++++++++++++++-- 1 file changed, 85 insertions(+), 7 deletions(-) diff --git a/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp b/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp index 1ad3cabe..af3a5a15 100644 --- a/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp +++ b/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp @@ -1,7 +1,9 @@ #include "comet/Conversion/PrepareGpuHost/PrepareGpuHostPass.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/Operation.h" #include "mlir/Pass/Pass.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" @@ -208,17 +210,19 @@ class PrepareGpuHost int operNum = use.getOperandNumber() - offset; if(gpu_name_to_funcOp[launchOp.getKernelName().str()].getArgAttr(operNum, "gpu.read")) { - builder.create(launchOp->getLoc(), TypeRange(), + auto gpuMemCpy = builder.create(launchOp->getLoc(), TypeRange(), ValueRange(), gpuAlloc.getMemref(), alloc); + gpuMemCpy->setAttr("gpu.read", builder.getUnitAttr()); } use.set(gpuAlloc.getMemref()); builder.setInsertionPointAfter(launchOp); if(gpu_name_to_funcOp[launchOp.getKernelName().str()].getArgAttr(operNum, "gpu.write")) { - builder.create(launchOp->getLoc(), TypeRange(), - ValueRange(), alloc, - gpuAlloc.getMemref()); + auto gpuMemCpy = builder.create(launchOp->getLoc(), TypeRange(), + ValueRange(), alloc, + gpuAlloc.getMemref()); + gpuMemCpy->setAttr("gpu.write", builder.getUnitAttr()); } } } @@ -241,11 +245,85 @@ class PrepareGpuHost gpuAllocs.push_back(gpuAllocOp); }); - std::vector gpuCopies; - modOp->walk([&gpuCopies](mlir::gpu::MemcpyOp gpuCopy) { - gpuCopies.push_back(gpuCopy); + + std::map> memEffects; + modOp->walk([&uniqueGpuAllocs, &memEffects](Operation* memEffect) { + for(auto op: memEffect->getOperands()) + { + if(uniqueGpuAllocs.find(op) != uniqueGpuAllocs.end()) + { + memEffects[op.getAsOpaquePointer()].push_back(memEffect); + } + } }); + for(auto& [memref, effects]: memEffects) + { + bool copyIn = true; + std::vector copyDelete; + for(size_t i = 0; i < effects.size(); i++) + { + if(mlir::gpu::MemcpyOp gpuCopy = mlir::dyn_cast(effects[i])) + { + if(!copyIn & gpuCopy->hasAttr("gpu.read")) + { + // gpuCopy.dump(); + gpuCopy->erase(); + } + else if(gpuCopy->hasAttr("gpu.read")) + { + copyIn = false; + } + else if(gpuCopy->hasAttr("gpu.write")) + { + copyIn = false; + copyDelete.push_back(gpuCopy); + } + } + else if(mlir::memref::StoreOp store = mlir::dyn_cast(effects[i])) + { + copyIn = true; + if(!copyDelete.empty()) + { + copyDelete.pop_back(); + } + } + else if(mlir::memref::CopyOp copy = mlir::dyn_cast(effects[i])) + { + if(copy.getTarget().getAsOpaquePointer() == memref) + { + copyIn = true; + } + } + else if(mlir::memref::LoadOp load = mlir::dyn_cast(effects[i])) + { + if(!copyDelete.empty()) + { + copyDelete.pop_back(); + } + } + else if(isa(effects[i])) + { + continue; + } + else // Unknown operation, be conservative + { + effects[i]->dump(); + copyIn = true; + if(!copyDelete.empty()) + { + copyDelete.pop_back(); + } + } + } + + for(auto toDelete: copyDelete) + { + // toDelete->dump(); + toDelete->erase(); + } + } + } }; From 62b49d9c6877ce980bc4f692de0e1dbdfad13e00 Mon Sep 17 00:00:00 2001 From: Polykarpos Thomadakis Date: Fri, 27 Jun 2025 15:50:23 -0700 Subject: [PATCH 3/3] Removed debugging output --- lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp b/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp index af3a5a15..62ebabda 100644 --- a/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp +++ b/lib/Conversion/PrepareGpuHost/PrepareGpuHost.cpp @@ -267,7 +267,6 @@ class PrepareGpuHost { if(!copyIn & gpuCopy->hasAttr("gpu.read")) { - // gpuCopy.dump(); gpuCopy->erase(); } else if(gpuCopy->hasAttr("gpu.read")) @@ -308,7 +307,6 @@ class PrepareGpuHost } else // Unknown operation, be conservative { - effects[i]->dump(); copyIn = true; if(!copyDelete.empty()) { @@ -319,7 +317,6 @@ class PrepareGpuHost for(auto toDelete: copyDelete) { - // toDelete->dump(); toDelete->erase(); } }