From ff2049a9953e367793a5c2c77231b01a31270e1b Mon Sep 17 00:00:00 2001 From: EllisLambda Date: Thu, 17 Aug 2023 03:11:27 +0800 Subject: [PATCH 1/3] [OpOptimization] Add BatchMatMul benchmark. --- .../OpOptimization/MatMul/BatchMatMul.mlir | 8 ++ .../MatMul/BatchMatMulBroadcast.mlir | 47 ++++++++ .../OpOptimization/MatMul/CMakeLists.txt | 49 +++++++++ benchmarks/OpOptimization/MatMul/Main.cpp | 6 +- .../OpOptimization/MatMul/MatMulBenchmark.cpp | 101 ++++++++++++++++-- 5 files changed, 201 insertions(+), 10 deletions(-) create mode 100644 benchmarks/OpOptimization/MatMul/BatchMatMul.mlir create mode 100644 benchmarks/OpOptimization/MatMul/BatchMatMulBroadcast.mlir diff --git a/benchmarks/OpOptimization/MatMul/BatchMatMul.mlir b/benchmarks/OpOptimization/MatMul/BatchMatMul.mlir new file mode 100644 index 00000000..6ac9d9e2 --- /dev/null +++ b/benchmarks/OpOptimization/MatMul/BatchMatMul.mlir @@ -0,0 +1,8 @@ +module{ + func.func @bm_batch_matmul(%a : memref, %b : memref, %c : memref) { + linalg.batch_matmul + ins(%a, %b: memref, memref) + outs(%c: memref) + return + } +} diff --git a/benchmarks/OpOptimization/MatMul/BatchMatMulBroadcast.mlir b/benchmarks/OpOptimization/MatMul/BatchMatMulBroadcast.mlir new file mode 100644 index 00000000..708e9d0d --- /dev/null +++ b/benchmarks/OpOptimization/MatMul/BatchMatMulBroadcast.mlir @@ -0,0 +1,47 @@ +// The MLIR prototype of batchmatmul-optimize in buddy-opt. 
+ +#map = affine_map<(d0) -> (d0 ceildiv STEP_PLACEHOLDER)> +func.func @batch_matmul_broadcast_STEP_PLACEHOLDER(%a : memref, %b : memref, %c : memref) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %step = arith.constant STEP_PLACEHOLDER : index + %c0_f32 = arith.constant 0.0 : f32 + %c0_f32_vec = vector.splat %c0_f32 : vector + + %a_row = memref.dim %a, %c1 : memref + %a_col = memref.dim %a, %c2 : memref + %b_row = memref.dim %b, %c1 : memref + %b_col = memref.dim %b, %c2 : memref + %batch = memref.dim %a, %c0 : memref + + affine.parallel (%batch_idx) = (0) to (%batch){ // Affine.parallel can be lowered to the omp dialect, which enables batch-level parallelization. + affine.prefetch %a[%batch_idx, %a_row, %a_col], read, locality<3>, data : memref // Explicitly prefetch, about 5% faster on X86. + affine.for %b_row_idx = 0 to %b_row { + affine.for %a_row_idx = 0 to %a_row { + affine.for %b_col_idx = 0 to #map(%b_col) { + %a_ele = affine.load %a[%batch_idx, %a_row_idx, %b_row_idx] : memref + %a_vec = vector.broadcast %a_ele : f32 to vector + // Check tail. 
+ %b_col_cur = arith.muli %b_col_idx, %step : index + %tail_len = arith.subi %b_col, %b_col_cur : index + %tail_flag = arith.cmpi sge, %tail_len, %step : index + scf.if %tail_flag { + %b_vec = affine.vector_load %b[%batch_idx, %b_row_idx, %b_col_idx * STEP_PLACEHOLDER] : memref, vector + %c_vec = affine.vector_load %c[%batch_idx, %a_row_idx, %b_col_idx * STEP_PLACEHOLDER] : memref, vector + %result_vec = vector.fma %a_vec, %b_vec, %c_vec : vector + affine.vector_store %result_vec, %c[%batch_idx, %a_row_idx, %b_col_idx * STEP_PLACEHOLDER] : memref, vector + } else { + %mask_vec = vector.create_mask %tail_len : vector + %b_col_idx_tail = arith.muli %b_col_idx, %step : index + %b_vec_tail = vector.maskedload %b[%batch_idx, %b_row_idx, %b_col_idx_tail], %mask_vec, %c0_f32_vec : memref, vector, vector into vector + %c_vec_tail = vector.maskedload %c[%batch_idx, %a_row_idx, %b_col_idx_tail], %mask_vec, %c0_f32_vec : memref, vector, vector into vector + %result_vec_tail = vector.fma %a_vec, %b_vec_tail, %c_vec_tail : vector + vector.maskedstore %c[%batch_idx, %a_row_idx, %b_col_idx_tail], %mask_vec, %result_vec_tail : memref, vector, vector + } + } + } + } + } + return +} diff --git a/benchmarks/OpOptimization/MatMul/CMakeLists.txt b/benchmarks/OpOptimization/MatMul/CMakeLists.txt index f68fe934..5107a391 100644 --- a/benchmarks/OpOptimization/MatMul/CMakeLists.txt +++ b/benchmarks/OpOptimization/MatMul/CMakeLists.txt @@ -97,6 +97,53 @@ add_custom_command(OUTPUT matmul-scalar.o add_library(MatMulScalar STATIC matmul-scalar.o) set_target_properties(MatMulScalar PROPERTIES LINKER_LANGUAGE CXX) +add_custom_command(OUTPUT batch-matmul-scalar.o + COMMAND cat ${BUDDY_SOURCE_DIR}/benchmarks/OpOptimization/MatMul/BatchMatMul.mlir | + sed 's/bm_batch_matmul/batch_matmul_scalar/' | + ${LLVM_MLIR_BINARY_DIR}/mlir-opt + -convert-linalg-to-loops + -lower-affine + -convert-scf-to-cf + -convert-vector-to-llvm + -finalize-memref-to-llvm + -convert-arith-to-llvm + -llvm-request-c-wrappers 
+ -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} + -mattr=${BUDDY_OPT_ATTR} --filetype=obj + -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/MatMul/batch-matmul-scalar.o +) +add_library(BatchMatMulScalar STATIC batch-matmul-scalar.o) +set_target_properties(BatchMatMulScalar PROPERTIES LINKER_LANGUAGE CXX) + +function(build_batch_matmul_broadcast step) + add_custom_command(OUTPUT batch-matmul-broadcast-${step}.o + COMMAND cat ${BUDDY_SOURCE_DIR}/benchmarks/OpOptimization/MatMul/BatchMatMul.mlir | + sed 's/bm_batch_matmul/batch_matmul_broadcast_${step}/g' | + ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt + -batchmatmul-optimize="step-placeholder=${step}" + -expand-strided-metadata + -lower-affine + -convert-vector-to-llvm + -finalize-memref-to-llvm + -convert-scf-to-cf + -convert-linalg-to-llvm + -llvm-request-c-wrappers + -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} + -mattr=${BUDDY_OPT_ATTR} --filetype=obj + -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/MatMul/batch-matmul-broadcast-${step}.o + ) + add_library(BatchMatMulBroadcast${step} STATIC batch-matmul-broadcast-${step}.o) + set_target_properties(BatchMatMulBroadcast${step} PROPERTIES LINKER_LANGUAGE CXX) +endfunction() + +build_batch_matmul_broadcast(64) + add_executable(matmul-benchmark Main.cpp MatMulBenchmark.cpp @@ -114,4 +161,6 @@ target_link_libraries(matmul-benchmark MatMulBroadcast128 MatMulBroadcast256 MatMulScalar + BatchMatMulScalar + BatchMatMulBroadcast64 ) diff --git a/benchmarks/OpOptimization/MatMul/Main.cpp b/benchmarks/OpOptimization/MatMul/Main.cpp index 4c90eb81..f2af7250 100644 --- a/benchmarks/OpOptimization/MatMul/Main.cpp +++ b/benchmarks/OpOptimization/MatMul/Main.cpp @@ -20,13 +20,15 @@ #include -void verification(); +void 
matmul_verification(); +void batch_matmul_verification(); int main(int argc, char **argv) { // Run benchmark. ::benchmark::Initialize(&argc, argv); ::benchmark::RunSpecifiedBenchmarks(); // Run correctness verification. - verification(); + matmul_verification(); + batch_matmul_verification(); return 0; } diff --git a/benchmarks/OpOptimization/MatMul/MatMulBenchmark.cpp b/benchmarks/OpOptimization/MatMul/MatMulBenchmark.cpp index aee79b17..9a798a6f 100644 --- a/benchmarks/OpOptimization/MatMul/MatMulBenchmark.cpp +++ b/benchmarks/OpOptimization/MatMul/MatMulBenchmark.cpp @@ -18,6 +18,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include #include @@ -27,6 +28,10 @@ #define M 64 #define N 3136 #define K 576 +#define BATCH_M 16 +#define BATCH_N 784 +#define BATCH_K 144 +#define BATCH 64 // Helper functions and variables. namespace { @@ -62,6 +67,11 @@ void _mlir_ciface_matmul_broadcast_256(MemRef *A, MemRef *B, MemRef *C); void _mlir_ciface_matmul_scalar(MemRef *A, MemRef *B, MemRef *C); +void _mlir_ciface_batch_matmul_scalar(MemRef *A, MemRef *B, + MemRef *C); +void _mlir_ciface_batch_matmul_broadcast_64(MemRef *A, + MemRef *B, + MemRef *C); } #define DEFINE_MATMUL_BENCHMARK(name, func) \ @@ -79,6 +89,21 @@ void _mlir_ciface_matmul_scalar(MemRef *A, MemRef *B, } \ } +#define DEFINE_BATCH_MATMUL_BENCHMARK(name, func) \ + void BM_BATCH_MATMUL_##name(benchmark::State &state) { \ + intptr_t sizesA[3] = {BATCH, BATCH_M, BATCH_K}; \ + intptr_t sizesB[3] = {BATCH, BATCH_K, BATCH_N}; \ + intptr_t sizesC[3] = {BATCH, BATCH_M, BATCH_N}; \ + \ + MemRef A(sizesA, 1.0); \ + MemRef B(sizesB, 1.0); \ + MemRef C(sizesC, 0); \ + \ + for (auto _ : state) { \ + func(&A, &B, &C); \ + } \ + } + DEFINE_MATMUL_BENCHMARK(OCV, _mlir_ciface_matmul_ocv) DEFINE_MATMUL_BENCHMARK(TRANSFORM, _mlir_ciface_matmul_transform) DEFINE_MATMUL_BENCHMARK(BROADCAST_16, _mlir_ciface_matmul_broadcast_16) @@ -87,6 +112,9 @@ 
DEFINE_MATMUL_BENCHMARK(BROADCAST_64, _mlir_ciface_matmul_broadcast_64) DEFINE_MATMUL_BENCHMARK(BROADCAST_128, _mlir_ciface_matmul_broadcast_128) DEFINE_MATMUL_BENCHMARK(BROADCAST_256, _mlir_ciface_matmul_broadcast_256) DEFINE_MATMUL_BENCHMARK(SCALAR, _mlir_ciface_matmul_scalar) +DEFINE_BATCH_MATMUL_BENCHMARK(SCALAR, _mlir_ciface_batch_matmul_scalar) +DEFINE_BATCH_MATMUL_BENCHMARK(BROADCAST_64, + _mlir_ciface_batch_matmul_broadcast_64) } // namespace // Register benchmark cases. @@ -98,15 +126,18 @@ BENCHMARK(BM_MATMUL_BROADCAST_32)->Unit(benchmark::kMillisecond); BENCHMARK(BM_MATMUL_BROADCAST_64)->Unit(benchmark::kMillisecond); BENCHMARK(BM_MATMUL_BROADCAST_128)->Unit(benchmark::kMillisecond); BENCHMARK(BM_MATMUL_BROADCAST_256)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_MATMUL_BROADCAST_256)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_BATCH_MATMUL_SCALAR)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_BATCH_MATMUL_BROADCAST_64)->Unit(benchmark::kMillisecond); -/// Correctness Verification -/// The verification does not affect the performance. -/// - Set the scalar case as the criteria. -/// - Input elements are random numbers. -/// - Output elements are initialized to zero. -/// - Compare the output of various optimizations with the scalar version to -/// verify correctness. -void verification() { +// Correctness Verification +// The verification does not affect the performance. +// - Set the scalar case as the criteria. +// - Input elements are random numbers. +// - Output elements are initialized to zero. +// - Compare the output of various optimizations with the scalar version to +// verify correctness. +void matmul_verification() { // Set the random number generator. std::random_device rd; std::mt19937 generator(rd()); @@ -206,6 +237,60 @@ void verification() { ? 
PASS : FAIL) << std::endl; + + std::cout << "-----------------------------------------------------------" + << std::endl; +} + +void batch_matmul_verification() { + // Set the random number generator. + std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution distribution(1, 100); + + // Set the layout sizes of input and output memref container. + intptr_t sizesA[3] = {BATCH, BATCH_M, BATCH_K}; + intptr_t sizesB[3] = {BATCH, BATCH_K, BATCH_N}; + intptr_t sizesC[3] = {BATCH, BATCH_M, BATCH_N}; + + // Generate input A and input B memref container with random numbers. + const int inputASize = BATCH * (BATCH_M) * (BATCH_K); + // float inputARand[inputASize]; + auto inputARand = new std::array(); + for (int i = 0; i < inputASize; ++i) { + (*inputARand)[i] = distribution(generator); + } + MemRef inputAMemRef(inputARand->data(), sizesA); + + const int inputBSize = BATCH * (BATCH_K) * (BATCH_N); + // float inputBRand[inputBSize]; + auto inputBRand = new std::array(); + for (int i = 0; i < inputBSize; ++i) { + (*inputBRand)[i] = distribution(generator); + } + MemRef inputBMemRef(inputBRand->data(), sizesB); + + // Generate output memref container with zero. + const int outputSize = BATCH * (BATCH_M) * (BATCH_N); + MemRef outputScalar(sizesC, 0); + MemRef outputBroadcast64(sizesC, 0); + + // Perform all the matmul implementation. + _mlir_ciface_batch_matmul_scalar(&inputAMemRef, &inputBMemRef, &outputScalar); + _mlir_ciface_batch_matmul_broadcast_64(&inputAMemRef, &inputBMemRef, + &outputBroadcast64); + + // Get the result array. + auto resultScalar = outputScalar.getData(); + auto resultBroadcast16 = outputBroadcast64.getData(); + + // Print the verfication result. + std::cout << "Batch Matmul Broadcast 64 case: " + << (areArraysEqual(resultScalar, resultBroadcast16, + outputSize / BATCH) + ? 
PASS + : FAIL) + << std::endl; std::cout << "-----------------------------------------------------------" << std::endl; } From 4e3e288f601ece5f06cf1545c6048d3c28dcb56c Mon Sep 17 00:00:00 2001 From: EllisLambda Date: Sat, 9 Sep 2023 18:34:10 +0800 Subject: [PATCH 2/3] [OpOptimization] Further optimize BatchMatMulBroadcast and add OpenMP tests. --- README.md | 2 + .../MatMul/BatchMatMulBroadcast.mlir | 41 +++++++++---------- .../OpOptimization/MatMul/CMakeLists.txt | 32 ++++++++++++++- .../OpOptimization/MatMul/MatMulBenchmark.cpp | 29 ++++++++++--- 4 files changed, 76 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 1dc9c115..ace03882 100644 --- a/README.md +++ b/README.md @@ -227,6 +227,7 @@ $ mkdir build && cd build $ cmake -G Ninja .. \ -DCMAKE_BUILD_TYPE=RELEASE \ -DOP_OPTIMIZATION_BENCHMARKS=ON \ + -DCMAKE_CXX_COMPILER=clang++ \ -DBUDDY_MLIR_BUILD_DIR=/PATH/TO/BUDDY-MLIR/BUILD/ $ ninja @@ -234,6 +235,7 @@ $ ninja // - conv2d-nchw-fchw-benchmark // - matmul-benchmark ``` +OpenMP is required in matmul-benchmark, make sure `libomp` and `libomp-dev` (on Ubuntu and Debian) / `libomp-devel` (on Redhat and SUSE) have been installed. Run TVM operation optimization benchmark cases. - Install TVM ([steps](./thirdparty/README.md#tvm)). diff --git a/benchmarks/OpOptimization/MatMul/BatchMatMulBroadcast.mlir b/benchmarks/OpOptimization/MatMul/BatchMatMulBroadcast.mlir index 708e9d0d..6d6de890 100644 --- a/benchmarks/OpOptimization/MatMul/BatchMatMulBroadcast.mlir +++ b/benchmarks/OpOptimization/MatMul/BatchMatMulBroadcast.mlir @@ -1,6 +1,10 @@ // The MLIR prototype of batchmatmul-optimize in buddy-opt. 
#map = affine_map<(d0) -> (d0 ceildiv STEP_PLACEHOLDER)> +#tail_len_map = affine_map<(d0) -> (d0 mod STEP_PLACEHOLDER)> +#if_set = affine_set<(d0)[s0] : (s0 - d0 * STEP_PLACEHOLDER >= STEP_PLACEHOLDER)> +#b_col_idx_tail_map = affine_map<(d0) -> (d0 * STEP_PLACEHOLDER)> + func.func @batch_matmul_broadcast_STEP_PLACEHOLDER(%a : memref, %b : memref, %c : memref) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -15,32 +19,27 @@ func.func @batch_matmul_broadcast_STEP_PLACEHOLDER(%a : memref, %b : %b_col = memref.dim %b, %c2 : memref %batch = memref.dim %a, %c0 : memref + %tail_len = affine.apply #tail_len_map(%b_col) + %mask_vec = vector.create_mask %tail_len : vector + affine.parallel (%batch_idx) = (0) to (%batch){ // Affine.parallel can be lowered to the omp dialect, which enables batch-level parallelization. affine.prefetch %a[%batch_idx, %a_row, %a_col], read, locality<3>, data : memref // Explicitly prefetch, about 5% faster on X86. affine.for %b_row_idx = 0 to %b_row { + affine.for %b_col_idx = 0 to #map(%b_col) { + %b_vec = affine.vector_load %b[%batch_idx, %b_row_idx, %b_col_idx * STEP_PLACEHOLDER] : memref, vector + %b_col_idx_tail = affine.apply #b_col_idx_tail_map(%b_col_idx) affine.for %a_row_idx = 0 to %a_row { - affine.for %b_col_idx = 0 to #map(%b_col) { - %a_ele = affine.load %a[%batch_idx, %a_row_idx, %b_row_idx] : memref - %a_vec = vector.broadcast %a_ele : f32 to vector - // Check tail. 
- %b_col_cur = arith.muli %b_col_idx, %step : index - %tail_len = arith.subi %b_col, %b_col_cur : index - %tail_flag = arith.cmpi sge, %tail_len, %step : index - scf.if %tail_flag { - %b_vec = affine.vector_load %b[%batch_idx, %b_row_idx, %b_col_idx * STEP_PLACEHOLDER] : memref, vector - %c_vec = affine.vector_load %c[%batch_idx, %a_row_idx, %b_col_idx * STEP_PLACEHOLDER] : memref, vector - %result_vec = vector.fma %a_vec, %b_vec, %c_vec : vector - affine.vector_store %result_vec, %c[%batch_idx, %a_row_idx, %b_col_idx * STEP_PLACEHOLDER] : memref, vector - } else { - %mask_vec = vector.create_mask %tail_len : vector - %b_col_idx_tail = arith.muli %b_col_idx, %step : index - %b_vec_tail = vector.maskedload %b[%batch_idx, %b_row_idx, %b_col_idx_tail], %mask_vec, %c0_f32_vec : memref, vector, vector into vector - %c_vec_tail = vector.maskedload %c[%batch_idx, %a_row_idx, %b_col_idx_tail], %mask_vec, %c0_f32_vec : memref, vector, vector into vector - %result_vec_tail = vector.fma %a_vec, %b_vec_tail, %c_vec_tail : vector - vector.maskedstore %c[%batch_idx, %a_row_idx, %b_col_idx_tail], %mask_vec, %result_vec_tail : memref, vector, vector - } - } + %a_ele = affine.load %a[%batch_idx, %a_row_idx, %b_row_idx] : memref + %a_vec = vector.broadcast %a_ele : f32 to vector + %c_vec = affine.vector_load %c[%batch_idx, %a_row_idx, %b_col_idx * STEP_PLACEHOLDER] : memref, vector + %result_vec = vector.fma %a_vec, %b_vec, %c_vec : vector + affine.if #if_set(%b_col_idx)[%b_col] { + affine.vector_store %result_vec, %c[%batch_idx, %a_row_idx, %b_col_idx * STEP_PLACEHOLDER] : memref, vector + } else { + vector.maskedstore %c[%batch_idx, %a_row_idx, %b_col_idx_tail], %mask_vec, %result_vec : memref, vector, vector + } } + } } } return diff --git a/benchmarks/OpOptimization/MatMul/CMakeLists.txt b/benchmarks/OpOptimization/MatMul/CMakeLists.txt index 5107a391..0d55de3d 100644 --- a/benchmarks/OpOptimization/MatMul/CMakeLists.txt +++ b/benchmarks/OpOptimization/MatMul/CMakeLists.txt @@ 
-125,6 +125,7 @@ function(build_batch_matmul_broadcast step) ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt -batchmatmul-optimize="step-placeholder=${step}" -expand-strided-metadata + -affine-super-vectorize -lower-affine -convert-vector-to-llvm -finalize-memref-to-llvm @@ -144,12 +145,40 @@ endfunction() build_batch_matmul_broadcast(64) +function(build_batch_matmul_broadcast_omp step) + add_custom_command(OUTPUT batch-matmul-broadcast-${step}-omp.o + COMMAND cat ${BUDDY_SOURCE_DIR}/benchmarks/OpOptimization/MatMul/BatchMatMulBroadcast.mlir | + sed 's/batch_matmul_broadcast_STEP_PLACEHOLDER/batch_matmul_broadcast_STEP_PLACEHOLDER_omp/g' | + sed 's/STEP_PLACEHOLDER/${step}/g' | + ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt + -expand-strided-metadata + -affine-super-vectorize + -lower-affine + -convert-scf-to-openmp + -convert-vector-to-llvm + -finalize-memref-to-llvm + -convert-scf-to-cf + -convert-linalg-to-llvm + -llvm-request-c-wrappers + -convert-openmp-to-llvm + -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir | + ${CMAKE_CXX_COMPILER} -c -x ir -O3 --target=${BUDDY_OPT_TRIPLE} -fopenmp -march=native -flto + -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/MatMul/batch-matmul-broadcast-${step}-omp.o - + ) + add_library(BatchMatMulBroadcast${step}OMP STATIC batch-matmul-broadcast-${step}-omp.o) + set_target_properties(BatchMatMulBroadcast${step}OMP PROPERTIES LINKER_LANGUAGE CXX) +endfunction() + +build_batch_matmul_broadcast_omp(64) + add_executable(matmul-benchmark Main.cpp MatMulBenchmark.cpp ) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -fopenmp -flto") target_link_libraries(matmul-benchmark GoogleBenchmark @@ -163,4 +192,5 @@ target_link_libraries(matmul-benchmark MatMulScalar BatchMatMulScalar BatchMatMulBroadcast64 + BatchMatMulBroadcast64OMP ) diff --git a/benchmarks/OpOptimization/MatMul/MatMulBenchmark.cpp 
b/benchmarks/OpOptimization/MatMul/MatMulBenchmark.cpp index 9a798a6f..274f691e 100644 --- a/benchmarks/OpOptimization/MatMul/MatMulBenchmark.cpp +++ b/benchmarks/OpOptimization/MatMul/MatMulBenchmark.cpp @@ -28,10 +28,10 @@ #define M 64 #define N 3136 #define K 576 -#define BATCH_M 16 +#define BATCH_M 128 #define BATCH_N 784 -#define BATCH_K 144 -#define BATCH 64 +#define BATCH_K 72 +#define BATCH 16 // Helper functions and variables. namespace { @@ -72,6 +72,9 @@ void _mlir_ciface_batch_matmul_scalar(MemRef *A, MemRef *B, void _mlir_ciface_batch_matmul_broadcast_64(MemRef *A, MemRef *B, MemRef *C); +void _mlir_ciface_batch_matmul_broadcast_64_omp(MemRef *A, + MemRef *B, + MemRef *C); } #define DEFINE_MATMUL_BENCHMARK(name, func) \ @@ -115,6 +118,8 @@ DEFINE_MATMUL_BENCHMARK(SCALAR, _mlir_ciface_matmul_scalar) DEFINE_BATCH_MATMUL_BENCHMARK(SCALAR, _mlir_ciface_batch_matmul_scalar) DEFINE_BATCH_MATMUL_BENCHMARK(BROADCAST_64, _mlir_ciface_batch_matmul_broadcast_64) +DEFINE_BATCH_MATMUL_BENCHMARK(BROADCAST_64_OMP, + _mlir_ciface_batch_matmul_broadcast_64_omp) } // namespace // Register benchmark cases. @@ -129,6 +134,7 @@ BENCHMARK(BM_MATMUL_BROADCAST_256)->Unit(benchmark::kMillisecond); BENCHMARK(BM_MATMUL_BROADCAST_256)->Unit(benchmark::kMillisecond); BENCHMARK(BM_BATCH_MATMUL_SCALAR)->Unit(benchmark::kMillisecond); BENCHMARK(BM_BATCH_MATMUL_BROADCAST_64)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_BATCH_MATMUL_BROADCAST_64_OMP)->Unit(benchmark::kMillisecond); // Correctness Verification // The verification does not affect the performance. @@ -237,7 +243,6 @@ void matmul_verification() { ? 
PASS : FAIL) << std::endl; - std::cout << "-----------------------------------------------------------" << std::endl; } @@ -274,23 +279,35 @@ void batch_matmul_verification() { const int outputSize = BATCH * (BATCH_M) * (BATCH_N); MemRef outputScalar(sizesC, 0); MemRef outputBroadcast64(sizesC, 0); + MemRef outputBroadcast64OMP(sizesC, 0); // Perform all the matmul implementation. _mlir_ciface_batch_matmul_scalar(&inputAMemRef, &inputBMemRef, &outputScalar); _mlir_ciface_batch_matmul_broadcast_64(&inputAMemRef, &inputBMemRef, &outputBroadcast64); + _mlir_ciface_batch_matmul_broadcast_64_omp(&inputAMemRef, &inputBMemRef, + &outputBroadcast64OMP); // Get the result array. auto resultScalar = outputScalar.getData(); - auto resultBroadcast16 = outputBroadcast64.getData(); + auto resultBroadcast64 = outputBroadcast64.getData(); + auto resultBroadcast64OMP = outputBroadcast64OMP.getData(); // Print the verfication result. std::cout << "Batch Matmul Broadcast 64 case: " - << (areArraysEqual(resultScalar, resultBroadcast16, + << (areArraysEqual(resultScalar, resultBroadcast64, + outputSize / BATCH) + ? PASS + : FAIL) + << std::endl; + + std::cout << "Batch Matmul Broadcast 64 OpenMP case: " + << (areArraysEqual(resultScalar, resultBroadcast64OMP, outputSize / BATCH) ? 
PASS : FAIL) << std::endl; + std::cout << "-----------------------------------------------------------" << std::endl; } From 07dedb9898f26b5f70a21ec44b6a6be956f5bc0f Mon Sep 17 00:00:00 2001 From: EllisLambda Date: Tue, 12 Sep 2023 18:36:38 +0800 Subject: [PATCH 3/3] [OpOptimization] Sync with buddy-compiler/buddy-mlir#201 --- README.md | 29 ++++++++++++++++++- .../OpOptimization/MatMul/CMakeLists.txt | 4 +-- .../OpOptimization/MatMul/MatMulBenchmark.cpp | 1 - 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ace03882..4703fe14 100644 --- a/README.md +++ b/README.md @@ -235,7 +235,34 @@ $ ninja // - conv2d-nchw-fchw-benchmark // - matmul-benchmark ``` -OpenMP is required in matmul-benchmark, make sure `libomp` and `libomp-dev` (on Ubuntu and Debian) / `libomp-devel` (on Redhat and SUSE) have been installed. +### matmul-benchmark +`OpenMP` and `lld` LTO are required by matmul-benchmark. To ensure version compatibility with the project, it's recommended to use the LLVM toolchains built within the `buddy-benchmark`. Follow the steps below: +- Build the LLVM toolchain with `lld` and `OpenMP`. +``` +$ cd buddy-mlir/llvm/build +$ cmake -G Ninja ../llvm \ + -DLLVM_ENABLE_PROJECTS="mlir;clang;lld;openmp" \ + -DLLVM_TARGETS_TO_BUILD="host;RISCV" \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLVM_ENABLE_RUNTIMES=all \ + -DOPENMP_ENABLE_LIBOMPTARGET=OFF \ + -DCMAKE_BUILD_TYPE=RELEASE +``` +- Use the `clang++` in `buddy-mlir/llvm/build/bin`. +``` +$ mkdir build && cd build +$ cmake -G Ninja .. \ + -DCMAKE_BUILD_TYPE=RELEASE \ + -DOP_OPTIMIZATION_BENCHMARKS=ON \ + -DCMAKE_CXX_COMPILER=/PATH/TO/BUDDY-MLIR/BUILD/bin/clang++ \ + -DBUDDY_MLIR_BUILD_DIR=/PATH/TO/BUDDY-MLIR/BUILD/ +$ ninja matmul-benchmark +``` +- `matmul-benchmark` needs to load the `libomp.so` in `buddy-mlir/llvm/build/lib` at run time; here is a temporary way to do that without root access.
+ +``` +$ export LD_LIBRARY_PATH=/PATH/TO/BUDDY-MLIR/BUILD/lib/:$LD_LIBRARY_PATH +``` Run TVM operation optimization benchmark cases. - Install TVM ([steps](./thirdparty/README.md#tvm)). diff --git a/benchmarks/OpOptimization/MatMul/CMakeLists.txt b/benchmarks/OpOptimization/MatMul/CMakeLists.txt index 0d55de3d..2803ba1a 100644 --- a/benchmarks/OpOptimization/MatMul/CMakeLists.txt +++ b/benchmarks/OpOptimization/MatMul/CMakeLists.txt @@ -123,7 +123,7 @@ function(build_batch_matmul_broadcast step) COMMAND cat ${BUDDY_SOURCE_DIR}/benchmarks/OpOptimization/MatMul/BatchMatMul.mlir | sed 's/bm_batch_matmul/batch_matmul_broadcast_${step}/g' | ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt - -batchmatmul-optimize="step-placeholder=${step}" + -batchmatmul-optimize="vector-size=${step}" -expand-strided-metadata -affine-super-vectorize -lower-affine @@ -178,7 +178,7 @@ add_executable(matmul-benchmark MatMulBenchmark.cpp ) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -fopenmp -flto") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -fopenmp -flto -fuse-ld=lld") target_link_libraries(matmul-benchmark GoogleBenchmark diff --git a/benchmarks/OpOptimization/MatMul/MatMulBenchmark.cpp b/benchmarks/OpOptimization/MatMul/MatMulBenchmark.cpp index 274f691e..50265166 100644 --- a/benchmarks/OpOptimization/MatMul/MatMulBenchmark.cpp +++ b/benchmarks/OpOptimization/MatMul/MatMulBenchmark.cpp @@ -131,7 +131,6 @@ BENCHMARK(BM_MATMUL_BROADCAST_32)->Unit(benchmark::kMillisecond); BENCHMARK(BM_MATMUL_BROADCAST_64)->Unit(benchmark::kMillisecond); BENCHMARK(BM_MATMUL_BROADCAST_128)->Unit(benchmark::kMillisecond); BENCHMARK(BM_MATMUL_BROADCAST_256)->Unit(benchmark::kMillisecond); -BENCHMARK(BM_MATMUL_BROADCAST_256)->Unit(benchmark::kMillisecond); BENCHMARK(BM_BATCH_MATMUL_SCALAR)->Unit(benchmark::kMillisecond); BENCHMARK(BM_BATCH_MATMUL_BROADCAST_64)->Unit(benchmark::kMillisecond); BENCHMARK(BM_BATCH_MATMUL_BROADCAST_64_OMP)->Unit(benchmark::kMillisecond);