diff --git a/README.md b/README.md index 1dc9c115..4703fe14 100644 --- a/README.md +++ b/README.md @@ -227,6 +227,7 @@ $ mkdir build && cd build $ cmake -G Ninja .. \ -DCMAKE_BUILD_TYPE=RELEASE \ -DOP_OPTIMIZATION_BENCHMARKS=ON \ + -DCMAKE_CXX_COMPILER=clang++ \ -DBUDDY_MLIR_BUILD_DIR=/PATH/TO/BUDDY-MLIR/BUILD/ $ ninja @@ -234,6 +235,34 @@ $ ninja // - conv2d-nchw-fchw-benchmark // - matmul-benchmark ``` +### matmul-benchmark +`OpenMP` and `lld` LTO are required by matmul-benchmark. To ensure version compatibility with the project, it is recommended to use the LLVM toolchain built within `buddy-benchmark`. Follow the steps below: +- Build the LLVM toolchain with `lld` and `OpenMP`. +``` +$ cd buddy-mlir/llvm/build +$ cmake -G Ninja ../llvm \ + -DLLVM_ENABLE_PROJECTS="mlir;clang;lld;openmp" \ + -DLLVM_TARGETS_TO_BUILD="host;RISCV" \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLVM_ENABLE_RUNTIMES=all \ + -DOPENMP_ENABLE_LIBOMPTARGET=OFF \ + -DCMAKE_BUILD_TYPE=RELEASE +``` +- Use the `clang++` in `buddy-mlir/llvm/build/bin`. +``` +$ mkdir build && cd build +$ cmake -G Ninja .. \ + -DCMAKE_BUILD_TYPE=RELEASE \ + -DOP_OPTIMIZATION_BENCHMARKS=ON \ + -DCMAKE_CXX_COMPILER=/PATH/TO/BUDDY-MLIR/BUILD/bin/clang++ \ + -DBUDDY_MLIR_BUILD_DIR=/PATH/TO/BUDDY-MLIR/BUILD/ +$ ninja matmul-benchmark +``` +- `matmul-benchmark` needs to load the `libomp.so` in `buddy-mlir/llvm/build/lib` at run time. Here is a temporary way to make it visible without root access. + +``` +$ export LD_LIBRARY_PATH=/PATH/TO/BUDDY-MLIR/BUILD/lib/:$LD_LIBRARY_PATH +``` Run TVM operation optimization benchmark cases. - Install TVM ([steps](./thirdparty/README.md#tvm)). 
diff --git a/benchmarks/OpOptimization/MatMul/BatchMatMul.mlir b/benchmarks/OpOptimization/MatMul/BatchMatMul.mlir new file mode 100644 index 00000000..6ac9d9e2 --- /dev/null +++ b/benchmarks/OpOptimization/MatMul/BatchMatMul.mlir @@ -0,0 +1,8 @@ +module{ + func.func @bm_batch_matmul(%a : memref, %b : memref, %c : memref) { + linalg.batch_matmul + ins(%a, %b: memref, memref) + outs(%c: memref) + return + } +} diff --git a/benchmarks/OpOptimization/MatMul/BatchMatMulBroadcast.mlir b/benchmarks/OpOptimization/MatMul/BatchMatMulBroadcast.mlir new file mode 100644 index 00000000..6d6de890 --- /dev/null +++ b/benchmarks/OpOptimization/MatMul/BatchMatMulBroadcast.mlir @@ -0,0 +1,46 @@ +// The MLIR prototype of batchmatmul-optimize in buddy-opt. + +#map = affine_map<(d0) -> (d0 ceildiv STEP_PLACEHOLDER)> +#tail_len_map = affine_map<(d0) -> (d0 mod STEP_PLACEHOLDER)> +#if_set = affine_set<(d0)[s0] : (s0 - d0 * STEP_PLACEHOLDER >= STEP_PLACEHOLDER)> +#b_col_idx_tail_map = affine_map<(d0) -> (d0 * STEP_PLACEHOLDER)> + +func.func @batch_matmul_broadcast_STEP_PLACEHOLDER(%a : memref, %b : memref, %c : memref) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %step = arith.constant STEP_PLACEHOLDER : index + %c0_f32 = arith.constant 0.0 : f32 + %c0_f32_vec = vector.splat %c0_f32 : vector + + %a_row = memref.dim %a, %c1 : memref + %a_col = memref.dim %a, %c2 : memref + %b_row = memref.dim %b, %c1 : memref + %b_col = memref.dim %b, %c2 : memref + %batch = memref.dim %a, %c0 : memref + + %tail_len = affine.apply #tail_len_map(%b_col) + %mask_vec = vector.create_mask %tail_len : vector + + affine.parallel (%batch_idx) = (0) to (%batch){ // Affine.parallel can be lowered to the omp dialect, which enables batch-level parallelization. + affine.prefetch %a[%batch_idx, %a_row, %a_col], read, locality<3>, data : memref // Explicitly prefetch, about 5% faster on X86. 
+ affine.for %b_row_idx = 0 to %b_row { + affine.for %b_col_idx = 0 to #map(%b_col) { + %b_vec = affine.vector_load %b[%batch_idx, %b_row_idx, %b_col_idx * STEP_PLACEHOLDER] : memref, vector + %b_col_idx_tail = affine.apply #b_col_idx_tail_map(%b_col_idx) + affine.for %a_row_idx = 0 to %a_row { + %a_ele = affine.load %a[%batch_idx, %a_row_idx, %b_row_idx] : memref + %a_vec = vector.broadcast %a_ele : f32 to vector + %c_vec = affine.vector_load %c[%batch_idx, %a_row_idx, %b_col_idx * STEP_PLACEHOLDER] : memref, vector + %result_vec = vector.fma %a_vec, %b_vec, %c_vec : vector + affine.if #if_set(%b_col_idx)[%b_col] { + affine.vector_store %result_vec, %c[%batch_idx, %a_row_idx, %b_col_idx * STEP_PLACEHOLDER] : memref, vector + } else { + vector.maskedstore %c[%batch_idx, %a_row_idx, %b_col_idx_tail], %mask_vec, %result_vec : memref, vector, vector + } + } + } + } + } + return +} diff --git a/benchmarks/OpOptimization/MatMul/CMakeLists.txt b/benchmarks/OpOptimization/MatMul/CMakeLists.txt index f68fe934..2803ba1a 100644 --- a/benchmarks/OpOptimization/MatMul/CMakeLists.txt +++ b/benchmarks/OpOptimization/MatMul/CMakeLists.txt @@ -97,12 +97,88 @@ add_custom_command(OUTPUT matmul-scalar.o add_library(MatMulScalar STATIC matmul-scalar.o) set_target_properties(MatMulScalar PROPERTIES LINKER_LANGUAGE CXX) +add_custom_command(OUTPUT batch-matmul-scalar.o + COMMAND cat ${BUDDY_SOURCE_DIR}/benchmarks/OpOptimization/MatMul/BatchMatMul.mlir | + sed 's/bm_batch_matmul/batch_matmul_scalar/' | + ${LLVM_MLIR_BINARY_DIR}/mlir-opt + -convert-linalg-to-loops + -lower-affine + -convert-scf-to-cf + -convert-vector-to-llvm + -finalize-memref-to-llvm + -convert-arith-to-llvm + -llvm-request-c-wrappers + -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} + -mattr=${BUDDY_OPT_ATTR} --filetype=obj + -o 
${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/MatMul/batch-matmul-scalar.o +) +add_library(BatchMatMulScalar STATIC batch-matmul-scalar.o) +set_target_properties(BatchMatMulScalar PROPERTIES LINKER_LANGUAGE CXX) + +function(build_batch_matmul_broadcast step) + add_custom_command(OUTPUT batch-matmul-broadcast-${step}.o + COMMAND cat ${BUDDY_SOURCE_DIR}/benchmarks/OpOptimization/MatMul/BatchMatMul.mlir | + sed 's/bm_batch_matmul/batch_matmul_broadcast_${step}/g' | + ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt + -batchmatmul-optimize="vector-size=${step}" + -expand-strided-metadata + -affine-super-vectorize + -lower-affine + -convert-vector-to-llvm + -finalize-memref-to-llvm + -convert-scf-to-cf + -convert-linalg-to-llvm + -llvm-request-c-wrappers + -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} + -mattr=${BUDDY_OPT_ATTR} --filetype=obj + -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/MatMul/batch-matmul-broadcast-${step}.o + ) + add_library(BatchMatMulBroadcast${step} STATIC batch-matmul-broadcast-${step}.o) + set_target_properties(BatchMatMulBroadcast${step} PROPERTIES LINKER_LANGUAGE CXX) +endfunction() + +build_batch_matmul_broadcast(64) + +function(build_batch_matmul_broadcast_omp step) + add_custom_command(OUTPUT batch-matmul-broadcast-${step}-omp.o + COMMAND cat ${BUDDY_SOURCE_DIR}/benchmarks/OpOptimization/MatMul/BatchMatMulBroadcast.mlir | + sed 's/batch_matmul_broadcast_STEP_PLACEHOLDER/batch_matmul_broadcast_STEP_PLACEHOLDER_omp/g' | + sed 's/STEP_PLACEHOLDER/${step}/g' | + ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt + -expand-strided-metadata + -affine-super-vectorize + -lower-affine + -convert-scf-to-openmp + -convert-vector-to-llvm + -finalize-memref-to-llvm + -convert-scf-to-cf + -convert-linalg-to-llvm + -llvm-request-c-wrappers + -convert-openmp-to-llvm + -convert-func-to-llvm + -reconcile-unrealized-casts | + 
${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir | + ${CMAKE_CXX_COMPILER} -c -x ir -O3 --target=${BUDDY_OPT_TRIPLE} -fopenmp -march=native -flto + -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/MatMul/batch-matmul-broadcast-${step}-omp.o - + ) + add_library(BatchMatMulBroadcast${step}OMP STATIC batch-matmul-broadcast-${step}-omp.o) + set_target_properties(BatchMatMulBroadcast${step}OMP PROPERTIES LINKER_LANGUAGE CXX) +endfunction() + +build_batch_matmul_broadcast_omp(64) + add_executable(matmul-benchmark Main.cpp MatMulBenchmark.cpp ) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -fopenmp -flto -fuse-ld=lld") target_link_libraries(matmul-benchmark GoogleBenchmark @@ -114,4 +190,7 @@ target_link_libraries(matmul-benchmark MatMulBroadcast128 MatMulBroadcast256 MatMulScalar + BatchMatMulScalar + BatchMatMulBroadcast64 + BatchMatMulBroadcast64OMP ) diff --git a/benchmarks/OpOptimization/MatMul/Main.cpp b/benchmarks/OpOptimization/MatMul/Main.cpp index 4c90eb81..f2af7250 100644 --- a/benchmarks/OpOptimization/MatMul/Main.cpp +++ b/benchmarks/OpOptimization/MatMul/Main.cpp @@ -20,13 +20,15 @@ #include -void verification(); +void matmul_verification(); +void batch_matmul_verification(); int main(int argc, char **argv) { // Run benchmark. ::benchmark::Initialize(&argc, argv); ::benchmark::RunSpecifiedBenchmarks(); // Run correctness verification. 
- verification(); + matmul_verification(); + batch_matmul_verification(); return 0; } diff --git a/benchmarks/OpOptimization/MatMul/MatMulBenchmark.cpp b/benchmarks/OpOptimization/MatMul/MatMulBenchmark.cpp index aee79b17..50265166 100644 --- a/benchmarks/OpOptimization/MatMul/MatMulBenchmark.cpp +++ b/benchmarks/OpOptimization/MatMul/MatMulBenchmark.cpp @@ -18,6 +18,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include #include @@ -27,6 +28,10 @@ #define M 64 #define N 3136 #define K 576 +#define BATCH_M 128 +#define BATCH_N 784 +#define BATCH_K 72 +#define BATCH 16 // Helper functions and variables. namespace { @@ -62,6 +67,14 @@ void _mlir_ciface_matmul_broadcast_256(MemRef *A, MemRef *B, MemRef *C); void _mlir_ciface_matmul_scalar(MemRef *A, MemRef *B, MemRef *C); +void _mlir_ciface_batch_matmul_scalar(MemRef *A, MemRef *B, + MemRef *C); +void _mlir_ciface_batch_matmul_broadcast_64(MemRef *A, + MemRef *B, + MemRef *C); +void _mlir_ciface_batch_matmul_broadcast_64_omp(MemRef *A, + MemRef *B, + MemRef *C); } #define DEFINE_MATMUL_BENCHMARK(name, func) \ @@ -79,6 +92,21 @@ void _mlir_ciface_matmul_scalar(MemRef *A, MemRef *B, } \ } +#define DEFINE_BATCH_MATMUL_BENCHMARK(name, func) \ + void BM_BATCH_MATMUL_##name(benchmark::State &state) { \ + intptr_t sizesA[3] = {BATCH, BATCH_M, BATCH_K}; \ + intptr_t sizesB[3] = {BATCH, BATCH_K, BATCH_N}; \ + intptr_t sizesC[3] = {BATCH, BATCH_M, BATCH_N}; \ + \ + MemRef A(sizesA, 1.0); \ + MemRef B(sizesB, 1.0); \ + MemRef C(sizesC, 0); \ + \ + for (auto _ : state) { \ + func(&A, &B, &C); \ + } \ + } + DEFINE_MATMUL_BENCHMARK(OCV, _mlir_ciface_matmul_ocv) DEFINE_MATMUL_BENCHMARK(TRANSFORM, _mlir_ciface_matmul_transform) DEFINE_MATMUL_BENCHMARK(BROADCAST_16, _mlir_ciface_matmul_broadcast_16) @@ -87,6 +115,11 @@ DEFINE_MATMUL_BENCHMARK(BROADCAST_64, _mlir_ciface_matmul_broadcast_64) DEFINE_MATMUL_BENCHMARK(BROADCAST_128, _mlir_ciface_matmul_broadcast_128) 
DEFINE_MATMUL_BENCHMARK(BROADCAST_256, _mlir_ciface_matmul_broadcast_256) DEFINE_MATMUL_BENCHMARK(SCALAR, _mlir_ciface_matmul_scalar) +DEFINE_BATCH_MATMUL_BENCHMARK(SCALAR, _mlir_ciface_batch_matmul_scalar) +DEFINE_BATCH_MATMUL_BENCHMARK(BROADCAST_64, + _mlir_ciface_batch_matmul_broadcast_64) +DEFINE_BATCH_MATMUL_BENCHMARK(BROADCAST_64_OMP, + _mlir_ciface_batch_matmul_broadcast_64_omp) } // namespace // Register benchmark cases. @@ -98,15 +131,18 @@ BENCHMARK(BM_MATMUL_BROADCAST_32)->Unit(benchmark::kMillisecond); BENCHMARK(BM_MATMUL_BROADCAST_64)->Unit(benchmark::kMillisecond); BENCHMARK(BM_MATMUL_BROADCAST_128)->Unit(benchmark::kMillisecond); BENCHMARK(BM_MATMUL_BROADCAST_256)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_BATCH_MATMUL_SCALAR)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_BATCH_MATMUL_BROADCAST_64)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_BATCH_MATMUL_BROADCAST_64_OMP)->Unit(benchmark::kMillisecond); -/// Correctness Verification -/// The verification does not affect the performance. -/// - Set the scalar case as the criteria. -/// - Input elements are random numbers. -/// - Output elements are initialized to zero. -/// - Compare the output of various optimizations with the scalar version to -/// verify correctness. -void verification() { +// Correctness Verification +// The verification does not affect the performance. +// - Set the scalar case as the criteria. +// - Input elements are random numbers. +// - Output elements are initialized to zero. +// - Compare the output of various optimizations with the scalar version to +// verify correctness. +void matmul_verification() { // Set the random number generator. std::random_device rd; std::mt19937 generator(rd()); @@ -209,3 +245,68 @@ void verification() { std::cout << "-----------------------------------------------------------" << std::endl; } + +void batch_matmul_verification() { + // Set the random number generator. 
+ std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution distribution(1, 100); + + // Set the layout sizes of the input and output memref containers. + intptr_t sizesA[3] = {BATCH, BATCH_M, BATCH_K}; + intptr_t sizesB[3] = {BATCH, BATCH_K, BATCH_N}; + intptr_t sizesC[3] = {BATCH, BATCH_M, BATCH_N}; + + // Generate the input A and input B memref containers with random numbers. + const int inputASize = BATCH * (BATCH_M) * (BATCH_K); + // float inputARand[inputASize]; + auto inputARand = new std::array(); + for (int i = 0; i < inputASize; ++i) { + (*inputARand)[i] = distribution(generator); + } + MemRef inputAMemRef(inputARand->data(), sizesA); + + const int inputBSize = BATCH * (BATCH_K) * (BATCH_N); + // float inputBRand[inputBSize]; + auto inputBRand = new std::array(); + for (int i = 0; i < inputBSize; ++i) { + (*inputBRand)[i] = distribution(generator); + } + MemRef inputBMemRef(inputBRand->data(), sizesB); + + // Generate the output memref containers, initialized to zero. + const int outputSize = BATCH * (BATCH_M) * (BATCH_N); + MemRef outputScalar(sizesC, 0); + MemRef outputBroadcast64(sizesC, 0); + MemRef outputBroadcast64OMP(sizesC, 0); + + // Run all the batch matmul implementations. + _mlir_ciface_batch_matmul_scalar(&inputAMemRef, &inputBMemRef, &outputScalar); + _mlir_ciface_batch_matmul_broadcast_64(&inputAMemRef, &inputBMemRef, + &outputBroadcast64); + _mlir_ciface_batch_matmul_broadcast_64_omp(&inputAMemRef, &inputBMemRef, + &outputBroadcast64OMP); + + // Get the result arrays. + auto resultScalar = outputScalar.getData(); + auto resultBroadcast64 = outputBroadcast64.getData(); + auto resultBroadcast64OMP = outputBroadcast64OMP.getData(); + + // Print the verification results. + std::cout << "Batch Matmul Broadcast 64 case: " + << (areArraysEqual(resultScalar, resultBroadcast64, + outputSize / BATCH) + ? 
PASS + : FAIL) + << std::endl; + + std::cout << "Batch Matmul Broadcast 64 OpenMP case: " + << (areArraysEqual(resultScalar, resultBroadcast64OMP, + outputSize / BATCH) + ? PASS + : FAIL) + << std::endl; + + std::cout << "-----------------------------------------------------------" + << std::endl; +}