diff --git a/benchmarks/DeepLearning/Models/MiniLM-L6/CMakeLists.txt b/benchmarks/DeepLearning/Models/MiniLM-L6/CMakeLists.txt index cafd13c8..679678fb 100644 --- a/benchmarks/DeepLearning/Models/MiniLM-L6/CMakeLists.txt +++ b/benchmarks/DeepLearning/Models/MiniLM-L6/CMakeLists.txt @@ -4,6 +4,7 @@ COMMAND ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/MiniLM-L6-200.mlir --linalg-bufferize --batchmatmul-optimize + # --matmul-optimize --convert-linalg-to-loops --func-bufferize --arith-bufferize diff --git a/benchmarks/DeepLearning/Models/ResNet-18/CMakeLists.txt b/benchmarks/DeepLearning/Models/ResNet-18/CMakeLists.txt index a5987187..1294d6dc 100644 --- a/benchmarks/DeepLearning/Models/ResNet-18/CMakeLists.txt +++ b/benchmarks/DeepLearning/Models/ResNet-18/CMakeLists.txt @@ -7,16 +7,19 @@ COMMAND --pass-pipeline="${RESNET18_TOSA_PIPELINE}" | ${LLVM_MLIR_BINARY_DIR}/mlir-opt --test-linalg-transform-patterns="test-generalize-pad-tensor" - --linalg-bufferize + --linalg-bufferize | + ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt + --conv-broadcast | + ${LLVM_MLIR_BINARY_DIR}/mlir-opt --convert-linalg-to-loops --func-bufferize --arith-bufferize --tensor-bufferize --finalizing-bufferize --convert-vector-to-scf + --lower-affine --convert-scf-to-cf --expand-strided-metadata - --lower-affine --convert-vector-to-llvm --memref-expand --arith-expand diff --git a/benchmarks/OpOptimization/CMakeLists.txt b/benchmarks/OpOptimization/CMakeLists.txt index f96942a1..aa700e8f 100644 --- a/benchmarks/OpOptimization/CMakeLists.txt +++ b/benchmarks/OpOptimization/CMakeLists.txt @@ -1,2 +1,5 @@ add_subdirectory(Conv2dNchwFchw) +# add_subdirectory(Conv2dNhwcHwcf) add_subdirectory(MatMul) +add_subdirectory(LinalgGeneric) +add_subdirectory(LLama) \ No newline at end of file diff --git a/benchmarks/OpOptimization/Conv2dNhwcHwcf/CMakeLists.txt b/benchmarks/OpOptimization/Conv2dNhwcHwcf/CMakeLists.txt new file mode 100644 index 00000000..6415a4e8 --- /dev/null +++ 
b/benchmarks/OpOptimization/Conv2dNhwcHwcf/CMakeLists.txt @@ -0,0 +1,99 @@ +# Scalar baseline: lower Conv2DNhwcHwcf.mlir through affine loops and compile it to an object file. +add_custom_command(OUTPUT mlir-conv2d_nhwc_hwcf_scalar.o + COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/Conv2DNhwcHwcf.mlir | + sed 's/conv2d_nhwc_hwcf/conv2d_nhwc_hwcf_scalar/' | + ${LLVM_MLIR_BINARY_DIR}/mlir-opt + -convert-linalg-to-affine-loops + -lower-affine + -convert-vector-to-llvm + -finalize-memref-to-llvm + -convert-scf-to-cf + -convert-linalg-to-llvm + -llvm-request-c-wrappers + -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR} + --filetype=obj -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/Conv2dNhwcHwcf/mlir-conv2d_nhwc_hwcf_scalar.o + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/Conv2DNhwcHwcf.mlir +) +add_library(Conv2dNhwcHwcfScalar STATIC mlir-conv2d_nhwc_hwcf_scalar.o) +set_target_properties(Conv2dNhwcHwcfScalar PROPERTIES LINKER_LANGUAGE CXX) + +# Builds one -conv-broadcast variant of the kernel as static library Conv2dNhwcHwcfBroadcast<step>. +# step: value handed to buddy-opt as -conv-broadcast=stride=<step>; it also suffixes the renamed +# MLIR function (conv2d_nhwc_hwcf_broadcast_<step>) and the generated object file. +function(build_conv2d_nhwc_hwcf_broadcast step) + add_custom_command(OUTPUT mlir_conv2d_nhwc_hwfc_${step}.o + COMMAND + cat ${CMAKE_CURRENT_SOURCE_DIR}/Conv2DNhwcHwcf.mlir | + sed 's/conv2d_nhwc_hwcf/conv2d_nhwc_hwcf_broadcast_${step}/' | + ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt + -conv-broadcast=stride=${step} + -convert-linalg-to-loops + -convert-vector-to-scf + -lower-affine + -convert-scf-to-cf + -convert-vector-to-llvm + -finalize-memref-to-llvm + -convert-arith-to-llvm + -llvm-request-c-wrappers + -convert-func-to-llvm + -convert-cf-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR} + --filetype=obj -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/Conv2dNhwcHwcf/mlir_conv2d_nhwc_hwfc_${step}.o + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/Conv2DNhwcHwcf.mlir + ) + add_library(Conv2dNhwcHwcfBroadcast${step} STATIC mlir_conv2d_nhwc_hwfc_${step}.o) + set_target_properties(Conv2dNhwcHwcfBroadcast${step} PROPERTIES LINKER_LANGUAGE CXX) +endfunction() +
+build_conv2d_nhwc_hwcf_broadcast(16) +build_conv2d_nhwc_hwcf_broadcast(32) +build_conv2d_nhwc_hwcf_broadcast(64) +build_conv2d_nhwc_hwcf_broadcast(128) +build_conv2d_nhwc_hwcf_broadcast(256) + +add_custom_command(OUTPUT mlir_conv2d_nhwc_hwfc_broadcast.o + COMMAND + ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/Conv2DNhwcHwcfBroadcast.mlir + -convert-linalg-to-loops + -convert-vector-to-scf + -lower-affine + -convert-scf-to-cf + -convert-vector-to-llvm + -finalize-memref-to-llvm + -convert-arith-to-llvm + -llvm-request-c-wrappers + -convert-func-to-llvm + -convert-cf-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR} + --filetype=obj -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/Conv2dNhwcHwcf/mlir_conv2d_nhwc_hwfc_broadcast.o +) + +add_library(Conv2dNhwcHwcfBroadcast STATIC mlir_conv2d_nhwc_hwfc_broadcast.o) +set_target_properties(Conv2dNhwcHwcfBroadcast PROPERTIES LINKER_LANGUAGE CXX) + +add_executable(conv2d-nhwc-hwcf-benchmark + Main.cpp + Conv2DNhwcHwcfBenchmark.cpp +) + +target_compile_options(conv2d-nhwc-hwcf-benchmark PRIVATE -march=native) # scoped to this target instead of mutating CMAKE_CXX_FLAGS +target_link_directories(conv2d-nhwc-hwcf-benchmark PRIVATE ${LLVM_MLIR_LIBRARY_DIR}) +target_link_libraries(conv2d-nhwc-hwcf-benchmark + GoogleBenchmark + Conv2dNhwcHwcfScalar + Conv2dNhwcHwcfBroadcast + Conv2dNhwcHwcfBroadcast16 + Conv2dNhwcHwcfBroadcast32 + Conv2dNhwcHwcfBroadcast64 + Conv2dNhwcHwcfBroadcast128 + Conv2dNhwcHwcfBroadcast256 + mlir_c_runner_utils + mlir_runner_utils +) diff --git a/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcf.mlir b/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcf.mlir new file mode 100644 index 00000000..c6fab05a --- /dev/null +++ b/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcf.mlir @@ -0,0 +1,6 @@ +func.func @conv2d_nhwc_hwcf(%input: memref, %filter: memref, %output: memref) { +
linalg.conv_2d_nhwc_hwcf + ins(%input, %filter : memref, memref) + outs(%output : memref) + return +} \ No newline at end of file diff --git a/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcfBenchmark.cpp b/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcfBenchmark.cpp new file mode 100644 index 00000000..46c51240 --- /dev/null +++ b/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcfBenchmark.cpp @@ -0,0 +1,187 @@ +//===- Conv2DNhwcHwcfBenchmark.cpp ----------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file implements the benchmark for Conv2d operation. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +// Define target layout. +#define INPUT_N 1 +#define INPUT_H 58 +#define INPUT_W 58 +#define INPUT_C 64 +#define KERNEL_H 3 +#define KERNEL_W 3 +#define KERNEL_C 64 +#define KERNEL_F 64 +#define OUTPUT_N 1 +#define OUTPUT_H 56 +#define OUTPUT_W 56 +#define OUTPUT_F 64 + +// Helper functions and variables. 
+namespace { +const std::string PASS = "\033[32mPASS\033[0m"; // ANSI SGR 32: green +const std::string FAIL = "\033[31mFAIL\033[0m"; // ANSI SGR 31: red, so a failure is visually distinct from PASS + +bool areArraysEqual(float array1[], float array2[], int size) { + for (int i = 0; i < size; ++i) + { + if (array1[i] != array2[i]) { + return false; + } + } + return true; +} +} // namespace + +namespace { + +// Declare the C interface. +extern "C" { +void _mlir_ciface_conv2d_nhwc_hwcf_scalar(MemRef *input, + MemRef *filter, + MemRef *output); +void _mlir_ciface_conv_2d_nhwc_hwcf(MemRef *input, + MemRef *filter, + MemRef *output); +void _mlir_ciface_conv2d_nhwc_hwcf_broadcast_16(MemRef *input, + MemRef *filter, + MemRef *output); +void _mlir_ciface_conv2d_nhwc_hwcf_broadcast_32(MemRef *input, + MemRef *filter, + MemRef *output); +void _mlir_ciface_conv2d_nhwc_hwcf_broadcast_64(MemRef *input, + MemRef *filter, + MemRef *output); +void _mlir_ciface_conv2d_nhwc_hwcf_broadcast_128(MemRef *input, + MemRef *filter, + MemRef *output); +void _mlir_ciface_conv2d_nhwc_hwcf_broadcast_256(MemRef *input, + MemRef *filter, + MemRef *output); +} + +#define DEFINE_BENCHMARK(name, func) \ + void BM_CONV2D_NHWC_HWCF_##name(benchmark::State &state) { \ + intptr_t sizesInput[4] = {INPUT_N, INPUT_H, INPUT_W, INPUT_C}; \ + intptr_t sizesKernel[4] = {KERNEL_H, KERNEL_W, KERNEL_C, KERNEL_F}; \ + intptr_t sizesOutput[4] = {OUTPUT_N, OUTPUT_H, OUTPUT_W, OUTPUT_F}; \ + MemRef input(sizesInput, 1.0); \ + MemRef filter(sizesKernel, 1.0); \ + MemRef output(sizesOutput, 0); \ + for (auto _ : state) { \ + func(&input, &filter, &output); \ + } \ + } + +DEFINE_BENCHMARK(SCALAR, _mlir_ciface_conv2d_nhwc_hwcf_scalar) +DEFINE_BENCHMARK(BROADCAST, _mlir_ciface_conv_2d_nhwc_hwcf) +DEFINE_BENCHMARK(BROADCAST_16, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_16) +DEFINE_BENCHMARK(BROADCAST_32, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_32) +DEFINE_BENCHMARK(BROADCAST_64, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_64) +DEFINE_BENCHMARK(BROADCAST_128, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_128)
+DEFINE_BENCHMARK(BROADCAST_256, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_256) +} // namespace + +BENCHMARK(BM_CONV2D_NHWC_HWCF_SCALAR)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_CONV2D_NHWC_HWCF_BROADCAST)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_CONV2D_NHWC_HWCF_BROADCAST_16)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_CONV2D_NHWC_HWCF_BROADCAST_32)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_CONV2D_NHWC_HWCF_BROADCAST_64)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_CONV2D_NHWC_HWCF_BROADCAST_128)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_CONV2D_NHWC_HWCF_BROADCAST_256)->Unit(benchmark::kMillisecond); + +#define DEFINE_VERIFICATION(name, func) \ + void VERIFICATION_##name(MemRef inputMemRef, \ + MemRef kernelMemRef, \ + float resultScalar[]) { \ + intptr_t sizesOutput[4] = {OUTPUT_N, OUTPUT_H, OUTPUT_W, OUTPUT_F}; \ + MemRef output##name(sizesOutput, 0); \ + func(&inputMemRef, &kernelMemRef, &output##name); \ + auto result##name = output##name.getData(); \ + const int outputSize = OUTPUT_N * OUTPUT_H * OUTPUT_W * OUTPUT_F; \ + std::cout << #name << "case: " \ + << (areArraysEqual(resultScalar, result##name, outputSize)\ + ? PASS \ + : FAIL) \ + << std::endl; \ + } + +DEFINE_VERIFICATION(BROADCAST, _mlir_ciface_conv_2d_nhwc_hwcf) +DEFINE_VERIFICATION(BROADCAST_16, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_16) +DEFINE_VERIFICATION(BROADCAST_32, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_32) +DEFINE_VERIFICATION(BROADCAST_64, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_64) +DEFINE_VERIFICATION(BROADCAST_128, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_128) +DEFINE_VERIFICATION(BROADCAST_256, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_256) + +void verification() { + // Set the random number generator. + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(1, 100); + + // Set the layout sizes of input and output memref container. 
+ intptr_t sizesInput[4] = {INPUT_N, INPUT_H, INPUT_W, INPUT_C}; + intptr_t sizesKernel[4] = {KERNEL_H, KERNEL_W, KERNEL_C, KERNEL_F}; + intptr_t sizesOutput[4] = {OUTPUT_N, OUTPUT_H, OUTPUT_W, OUTPUT_F}; + + // Generate input memref container with random numbers. + const int inputSize = INPUT_N * INPUT_H * INPUT_W * INPUT_C; + float inputRand[inputSize]; + for (int i = 0; i < inputSize; ++i) + { + inputRand[i] = dis(gen); + } + MemRef inputMemRef(inputRand, sizesInput); + + // Generate kernel memref container with random numbers. + const int kernelSize = KERNEL_H * KERNEL_W * KERNEL_C * KERNEL_F; + float kernelRand[kernelSize]; + for (int i = 0; i < kernelSize; ++i) + { + kernelRand[i] = dis(gen); + } + MemRef kernelMemRef(kernelRand, sizesKernel); + + // Generate a result using a scalar method for comparison during verification. + MemRef outputScalar(sizesOutput, 0); + _mlir_ciface_conv2d_nhwc_hwcf_scalar(&inputMemRef, &kernelMemRef, &outputScalar); + auto resultScalar = outputScalar.getData(); + + // Print the verification results. 
+ std::cout << "---------------------------------------------------------------" + "---------" + << std::endl; + std::cout << "Correctness Verification:" << std::endl; + + VERIFICATION_BROADCAST(inputMemRef, kernelMemRef, resultScalar); + + VERIFICATION_BROADCAST_16(inputMemRef, kernelMemRef, resultScalar); + VERIFICATION_BROADCAST_32(inputMemRef, kernelMemRef, resultScalar); + VERIFICATION_BROADCAST_64(inputMemRef, kernelMemRef, resultScalar); + VERIFICATION_BROADCAST_128(inputMemRef, kernelMemRef, resultScalar); + VERIFICATION_BROADCAST_256(inputMemRef, kernelMemRef, resultScalar); + + std::cout << "---------------------------------------------------------------" + "---------" + << std::endl; +} \ No newline at end of file diff --git a/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcfBroadcast.mlir b/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcfBroadcast.mlir new file mode 100644 index 00000000..1f18b4c0 --- /dev/null +++ b/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcfBroadcast.mlir @@ -0,0 +1,162 @@ +#map0 = affine_map<(d0, d1, d2, d3) -> (d2)> +#map1 = affine_map<(d0) -> (d0 ceildiv 32)> + +func.func @transpose(%input: memref, + %output: memref) { + + return +} + func.func @conv_2d_nhwc_hwcf(%input: memref, + %kernel: memref, + %output: memref) { + %c0 = arith.constant 0 : index + %c0_f32 = arith.constant 0.0 : f32 + %c0_f32_vec = vector.splat %c0_f32 : vector<32xf32> + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %c32 = arith.constant 32 : index + // Get the n size. (batch) + %n = memref.dim %input, %c0 : memref + // Get the f size. (feature) + %f = memref.dim %kernel, %c3 : memref + // Get the c size. (channel) + %c = memref.dim %kernel, %c2 : memref + // Get the 2D output size. (row and column) + %output_row = memref.dim %output, %c1 : memref + %output_col = memref.dim %output, %c2 : memref + // Get the 2D kernel size. 
(row and column) + %kernel_row = memref.dim %kernel, %c0 : memref + %kernel_col = memref.dim %kernel, %c1 : memref + + %input_row = memref.dim %input, %c1 : memref + %input_col = memref.dim %input, %c2 : memref + + // %input_transpose = memref.alloc(%n, %c, %input_row, %output_col) : memref + // %kernel_transpose = memref.alloc(%f, %c, %kernel_row, %kernel_col) : memref + %output_transpose = memref.alloc(%n, %f, %output_row, %output_col) : memref + + // affine.for %n_idx = %c0 to %n { + // affine.for %input_row_idx = %c0 to %input_row { + // affine.for %input_col_idx = %c0 to %input_col { + // affine.for %c_idx = %c0 to %c { + // %val = memref.load %input[%n_idx, %input_row_idx, %input_col_idx, %c_idx] : memref + // memref.store %val, %input_transpose[%n_idx, %c_idx, %input_row_idx, %input_col_idx] : memref + // } + // } + // } + // } + + // affine.for %f_idx = %c0 to %f { + // affine.for %input_row_idx = %c0 to %kernel_row { + // affine.for %input_col_idx = %c0 to %kernel_col { + // affine.for %c_idx = %c0 to %c { + // %val = memref.load %kernel[%input_row_idx, %input_col_idx, %c_idx, %f_idx] : memref + // memref.store %val, %kernel_transpose[%f_idx, %c_idx, %input_row_idx, %input_col_idx] : memref + // } + // } + // } + // } + + // affine.for %n_idx = %c0 to %n { + // affine.for %f_idx = %c0 to %f { + // affine.for %c_idx = %c0 to %c { + // affine.for %output_row_idx = %c0 to %output_row { + // affine.for %kernel_row_idx = %c0 to %kernel_row { + // affine.for %kernel_col_idx = %c0 to %kernel_col { + // affine.for %output_col_idx = %c0 to #map1(%output_col) { + // // Check sparsity. + // %kernel_ele = memref.load %kernel_transpose[%f_idx, %c_idx, %kernel_row_idx, %kernel_col_idx] : memref + // %sparsity_flag = arith.cmpf one, %kernel_ele, %c0_f32 : f32 + // scf.if %sparsity_flag { + // // Check tail. 
+ // %kernel_vec = vector.broadcast %kernel_ele : f32 to vector<32xf32> + // %output_col_cur = arith.muli %output_col_idx, %c32 : index + // %tail_len = arith.subi %output_col, %output_col_cur : index + // %tail_flag = arith.cmpi sge, %tail_len, %c32 : index + // scf.if %tail_flag { + // %input_vec = affine.vector_load %input_transpose[%n_idx, %c_idx, %output_row_idx + %kernel_row_idx, %kernel_col_idx + %output_col_idx * 32] : memref, vector<32xf32> + // %output_vec = affine.vector_load %output_transpose[%n_idx, %f_idx, %output_row_idx, %output_col_idx * 32] : memref, vector<32xf32> + // %result_vec = vector.fma %input_vec, %kernel_vec, %output_vec : vector<32xf32> + // affine.vector_store %result_vec, %output_transpose[%n_idx, %f_idx, %output_row_idx, %output_col_idx * 32] : memref, vector<32xf32> + // } else { + // %mask_vec = vector.create_mask %tail_len : vector<32xi1> + // %input_row_idx_tail = arith.addi %output_row_idx, %kernel_row_idx : index + // %output_col_idx_tail = arith.muli %output_col_idx, %c32 : index + // %input_col_idx_tail = arith.addi %kernel_col_idx, %output_col_idx_tail : index + // %input_vec_tail = vector.maskedload %input_transpose[%n_idx, %c_idx, %input_row_idx_tail, %input_col_idx_tail], %mask_vec, %c0_f32_vec : memref, vector<32xi1>, vector<32xf32> into vector<32xf32> + // %output_vec_tail = vector.maskedload %output_transpose[%n_idx, %f_idx, %output_row_idx, %output_col_idx_tail], %mask_vec, %c0_f32_vec : memref, vector<32xi1>, vector<32xf32> into vector<32xf32> + // %result_vec_tail = vector.fma %input_vec_tail, %kernel_vec, %output_vec_tail : vector<32xf32> + // vector.maskedstore %output_transpose[%n_idx, %f_idx, %output_row_idx, %output_col_idx_tail], %mask_vec, %result_vec_tail : memref, vector<32xi1>, vector<32xf32> + // } + // } + // } + // } + // } + // } + // } + // } + // } + + affine.for %n_idx = %c0 to %n { + affine.for %output_row_idx = %c0 to %output_row { + affine.for %kernel_row_idx = %c0 to %kernel_row { + affine.for 
%kernel_col_idx = %c0 to %kernel_col { + affine.for %output_col_idx = %c0 to #map1(%output_col) { + affine.for %c_idx = %c0 to %c { + affine.for %f_idx = %c0 to %f { + // Check sparsity. + %kernel_ele = memref.load %kernel[%kernel_row_idx, %kernel_col_idx, %c_idx, %f_idx] : memref + %sparsity_flag = arith.cmpf one, %kernel_ele, %c0_f32 : f32 + scf.if %sparsity_flag { + // Check tail. + %kernel_vec = vector.broadcast %kernel_ele : f32 to vector<32xf32> + %output_col_cur = arith.muli %output_col_idx, %c32 : index + %tail_len = arith.subi %output_col, %output_col_cur : index + %tail_flag = arith.cmpi sge, %tail_len, %c32 : index + %input_row_idx_tail = arith.addi %output_row_idx, %kernel_row_idx : index + %output_col_idx_tail = arith.muli %output_col_idx, %c32 : index + %input_col_idx_tail = arith.addi %kernel_col_idx, %output_col_idx_tail : index + %mask_vec = vector.create_mask %tail_len : vector<32xi1> + %input_vec_tail = vector.transfer_read %input[%n_idx, %input_row_idx_tail, %input_col_idx_tail, %c_idx], %c0_f32, %mask_vec {permutation_map = #map0, in_bounds = [true]} : memref, vector<32xf32> + %output_vec_tail = vector.maskedload %output_transpose[%n_idx, %f_idx, %output_row_idx, %output_col_idx_tail], %mask_vec, %c0_f32_vec : memref, vector<32xi1>, vector<32xf32> into vector<32xf32> + %result_vec_tail = vector.fma %input_vec_tail, %kernel_vec, %output_vec_tail : vector<32xf32> + // vector.transfer_write %result_vec_tail, %output[%n_idx, %output_row_idx, %output_col_idx_tail, %f_idx], %mask_vec {permutation_map = #map0, in_bounds = [true]} : vector<32xf32>, memref + vector.maskedstore %output_transpose[%n_idx, %f_idx, %output_row_idx, %output_col_idx_tail], %mask_vec, %result_vec_tail : memref, vector<32xi1>, vector<32xf32> + } + } + } + } + } + } + } + } + + // affine.for %n_idx = %c0 to %n { + // affine.for %output_row_idx = %c0 to %output_row { + // affine.for %output_col_idx = %c0 to %output_col { + // affine.for %f_idx = %c0 to #map1(%f) { + // // Check 
tail. + // %f_cur = arith.muli %f_idx, %c32 : index + // %tail_len = arith.subi %f, %f_cur : index + // %tail_flag = arith.cmpi sge, %tail_len, %c32 : index + // %mask_vec = vector.create_mask %tail_len : vector<32xi1> + // %input_vec_tail = vector.transfer_read %output_transpose[%n_idx, %f_cur, %output_row_idx, %output_col_idx], %c0_f32, %mask_vec {permutation_map = #map0, in_bounds = [true]} : memref, vector<32xf32> + // // vector.transfer_write %result_vec_tail, %output[%n_idx, %output_row_idx, %output_col_idx_tail, %f_idx], %mask_vec {permutation_map = #map0, in_bounds = [true]} : vector<32xf32>, memref + // vector.maskedstore %output[%n_idx, %output_row_idx, %output_col_idx, %f_idx], %mask_vec, %input_vec_tail {permutation_map = #map0, in_bounds = [true]} : memref, vector<32xi1>, vector<32xf32> + // } + // } + // } + // } + affine.for %n_idx = %c0 to %n { + affine.for %output_row_idx = %c0 to %output_row { + affine.for %output_col_idx = %c0 to %output_col { + affine.for %f_idx = %c0 to %f { + %val = memref.load %output_transpose[%n_idx, %f_idx, %output_row_idx, %output_col_idx] : memref + memref.store %val, %output[%n_idx, %output_row_idx, %output_col_idx, %f_idx] : memref + } + } + } + } + return + } \ No newline at end of file diff --git a/benchmarks/OpOptimization/Conv2dNhwcHwcf/Main.cpp b/benchmarks/OpOptimization/Conv2dNhwcHwcf/Main.cpp new file mode 100644 index 00000000..265e8ab2 --- /dev/null +++ b/benchmarks/OpOptimization/Conv2dNhwcHwcf/Main.cpp @@ -0,0 +1,32 @@ +//===- Main.cpp -----------------------------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This is the main file of the Conv2d NHWC-HWCF benchmark. +// +//===----------------------------------------------------------------------===// + +#include + +void verification(); + +int main(int argc, char **argv) { + // Parse the benchmark command-line flags and run the selected benchmarks. + ::benchmark::Initialize(&argc, argv); + ::benchmark::RunSpecifiedBenchmarks(); + // Run correctness verification (defined in Conv2DNhwcHwcfBenchmark.cpp). + verification(); + return 0; +} \ No newline at end of file diff --git a/benchmarks/OpOptimization/LLama/CMakeLists.txt b/benchmarks/OpOptimization/LLama/CMakeLists.txt new file mode 100644 index 00000000..012528cb --- /dev/null +++ b/benchmarks/OpOptimization/LLama/CMakeLists.txt @@ -0,0 +1,123 @@ +add_custom_command(OUTPUT mlir-llama-origin.o + COMMAND + ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${CMAKE_CURRENT_SOURCE_DIR}/llama_origin.mlir + # -convert-linalg-to-affine-loops + # -lower-affine + # -convert-vector-to-llvm + # -finalize-memref-to-llvm + # -convert-scf-to-cf + # -convert-linalg-to-llvm + # -llvm-request-c-wrappers + # -convert-func-to-llvm + -one-shot-bufferize='bufferize-function-boundaries' + -llvm-request-c-wrappers + -test-lower-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR} + -filetype=obj -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/LLama/mlir-llama-origin.o +) + +add_library(LLamaOrigin STATIC mlir-llama-origin.o) +set_target_properties(LLamaOrigin PROPERTIES
LINKER_LANGUAGE CXX) + +add_custom_command(OUTPUT mlir-llama-tiling.o + COMMAND + cat ${CMAKE_CURRENT_SOURCE_DIR}/llama_origin.mlir | + sed 's/base/tiling/' | + ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt + -polyhedral-tiling='tile-sizes=32,32,32,32' + # -generic-vectorization=tile-sizes=0,32,32,32 + # -convert-vector-to-scf + # -linalg-bufferize + # -convert-linalg-to-loops + # -convert-vector-to-llvm + # -lower-affine + # -convert-scf-to-cf + # -func-bufferize + # -finalizing-bufferize + # -finalize-memref-to-llvm + # -llvm-request-c-wrappers + # -convert-func-to-llvm + -reconcile-unrealized-casts | + # Use the configured LLVM toolchain; a developer-local absolute path breaks every other checkout. + ${LLVM_MLIR_BINARY_DIR}/mlir-opt + -convert-linalg-to-affine-loops + -affine-super-vectorize='virtual-vector-size=16' + -one-shot-bufferize='bufferize-function-boundaries' + -llvm-request-c-wrappers + -test-lower-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR} + -filetype=obj -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/LLama/mlir-llama-tiling.o +) + +add_library(LLamaTiling STATIC mlir-llama-tiling.o) +set_target_properties(LLamaTiling PROPERTIES LINKER_LANGUAGE CXX) + + +add_custom_command(OUTPUT mlir-affine-vec.o + COMMAND + cat ${CMAKE_CURRENT_SOURCE_DIR}/llama_origin.mlir | + sed 's/base/affine_vec/' | + ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt + -convert-linalg-to-affine-loops + # -affine-loop-tile='tile-sizes=32,32,32,32' + # -linalg-bufferize + # -convert-linalg-to-loops + # -convert-vector-to-llvm + # -lower-affine + # -convert-scf-to-cf + # -func-bufferize + # -finalizing-bufferize + # -finalize-memref-to-llvm + # -llvm-request-c-wrappers + # -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-opt + # -vectorize-affine-loop-nest + -affine-super-vectorize='virtual-vector-size=128' +
-one-shot-bufferize='bufferize-function-boundaries' + -llvm-request-c-wrappers + -test-lower-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR} + -filetype=obj -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/LLama/mlir-affine-vec.o +) + +add_library(AffineVec STATIC mlir-affine-vec.o) +set_target_properties(AffineVec PROPERTIES LINKER_LANGUAGE CXX) + +# add_custom_command(OUTPUT mlir-manual-opt.o +# COMMAND +# ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${CMAKE_CURRENT_SOURCE_DIR}/linalg_manual_opt.mlir +# # -one-shot-bufferize='bufferize-function-boundaries' +# -llvm-request-c-wrappers +# -test-lower-to-llvm +# | +# ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir | +# ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR} +# -filetype=obj -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/LinalgGeneric/mlir-manual-opt.o +# ) + +# add_library(ManualOpt STATIC mlir-manual-opt.o) +# set_target_properties(ManualOpt PROPERTIES LINKER_LANGUAGE CXX) + +add_executable(llama-benchmark + Main.cpp + LLamaBenchmark.cpp +) + +target_compile_options(llama-benchmark PRIVATE -march=native) # scoped to this target instead of mutating CMAKE_CXX_FLAGS +target_link_directories(llama-benchmark PRIVATE ${LLVM_MLIR_LIBRARY_DIR}) +target_link_libraries(llama-benchmark + GoogleBenchmark + LLamaOrigin + LLamaTiling + AffineVec + # ManualOpt + mlir_c_runner_utils + mlir_runner_utils +) diff --git a/benchmarks/OpOptimization/LLama/LLamaBenchmark.cpp b/benchmarks/OpOptimization/LLama/LLamaBenchmark.cpp new file mode 100644 index 00000000..1e2696f6 --- /dev/null +++ b/benchmarks/OpOptimization/LLama/LLamaBenchmark.cpp @@ -0,0 +1,119 @@ +//===- LLamaBenchmark.cpp ----------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file implements the benchmark for Linalg Generic operation. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +// Define target layout. +#define INPUT_H 1 +#define INPUT_W 512 +#define OUTPUT_C 1024 +#define OUTPUT_F 4096 + +namespace { + +// Helper functions and variables. +const std::string PASS = "\033[32mPASS\033[0m"; // ANSI SGR 32: green +const std::string FAIL = "\033[31mFAIL\033[0m"; // ANSI SGR 31: red, so a failure is visually distinct from PASS +bool areArraysEqual(float array1[], float array2[], int size) { + for (int i = 0; i < size; ++i) + { + if (array1[i] != array2[i]) { + return false; + } + } + return true; +} + +void setValue(MemRef &in) { + int sizes = in.getSizes()[1] * in.getSizes()[0]; + for (int i = 0; i < sizes; ++i) { + in[i] = i; + } +} + +// Declare the C interface.
+// C-interface wrappers of the three kernel variants timed below.
+extern "C" {
+void _mlir_ciface_base(MemRef *input, MemRef *output);
+void _mlir_ciface_tiling(MemRef *input, MemRef *output);
+void _mlir_ciface_affine_vec(MemRef *input, MemRef *output);
+}
+
+// Defines BM_GENERIC_<name>, a Google Benchmark body that builds one rank-3
+// input (constructed with 2.0) and one rank-3 output (constructed with 3.0)
+// and calls `func` on them once per benchmark iteration. The same buffers are
+// reused across iterations; the output is not reset in between.
+#define DEFINE_BENCHMARK(name, func) \
+  void BM_GENERIC_##name(benchmark::State &state) { \
+    intptr_t sizesInput[3] = {INPUT_H, INPUT_W, OUTPUT_F}; \
+    intptr_t sizesOutput[3] = {INPUT_H, INPUT_W, OUTPUT_C}; \
+    MemRef input(sizesInput, 2.0); \
+    MemRef output(sizesOutput, 3.0); \
+    for (auto _ : state) { \
+      func(&input, &output); \
+    } \
+  }
+
+
+DEFINE_BENCHMARK(BASE, _mlir_ciface_base)
+DEFINE_BENCHMARK(TILING, _mlir_ciface_tiling)
+DEFINE_BENCHMARK(AFFINEVEC, _mlir_ciface_affine_vec)
+} // namespace
+
+// Register the three benchmarks; timings are reported in milliseconds.
+BENCHMARK(BM_GENERIC_BASE)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_GENERIC_TILING)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_GENERIC_AFFINEVEC)->Unit(benchmark::kMillisecond);
+
+// Defines VERIFICATION_<name>: runs `func` on the by-value input/output and
+// prints PASS or FAIL depending on whether the first 4096 output elements
+// equal `resultScalar`.
+// NOTE(review): both instantiations below are commented out and
+// verification() has no active statements, so no correctness check currently
+// runs for the kernels benchmarked above.
+#define DEFINE_VERIFICATION(name, func) \
+  void VERIFICATION_##name(MemRef input, \
+                           MemRef output, \
+                           float resultScalar[]) { \
+    func(&input, &output); \
+    auto result##name = output.getData(); \
+    std::cout << #name << "case: " \
+              << (areArraysEqual(resultScalar, result##name, 4096) \
+                      ? PASS \
+                      : FAIL) \
+              << std::endl;\
+  }
+
+// DEFINE_VERIFICATION(TRANSFORM_TILING, _mlir_ciface_transform_tiling)
+// DEFINE_VERIFICATION(MANUL, _mlir_ciface_manul)
+
+// Called from main() after the benchmarks. Currently a no-op: every statement
+// in the body is commented out.
+void verification() {
+  // intptr_t sizesInput[2] = {INPUT_H, INPUT_W};
+  // intptr_t sizesOutput[1] = {OUTPUT_W};
+  // MemRef input(sizesInput, 1.0);
+  // MemRef output(sizesOutput, 1.0);
+  // _mlir_ciface_origin(&input, &output);
+  // auto resultScalar = output.getData();
+  // // Print the verification results.
+  // std::cout << "---------------------------------------------------------------"
+  //              "---------"
+  //           << std::endl;
+  // std::cout << "Correctness Verification:" << std::endl;
+  // MemRef newOutput1(sizesOutput, 1.0);
+  // VERIFICATION_TRANSFORM_TILING(input, newOutput1, resultScalar);
+  // MemRef newOutput2(sizesOutput, 1.0);
+  // VERIFICATION_MANUL(input, newOutput2, resultScalar);
+
+  // std::cout << "---------------------------------------------------------------"
+  //              "---------"
+  //           << std::endl;
+
+}
diff --git a/benchmarks/OpOptimization/LLama/Main.cpp b/benchmarks/OpOptimization/LLama/Main.cpp
new file mode 100644
index 00000000..4e4e9f7d
--- /dev/null
+++ b/benchmarks/OpOptimization/LLama/Main.cpp
@@ -0,0 +1,32 @@
+//===- Main.cpp -----------------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the main file of the LLama operation benchmark.
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE(review): the header name is missing from this #include — presumably
+// the Google Benchmark header, given the ::benchmark calls below; confirm.
+#include
+
+void verification();
+
+// Runs the registered benchmarks, then the correctness verification.
+int main(int argc, char **argv) {
+  // Run benchmark.
+  ::benchmark::Initialize(&argc, argv);
+  ::benchmark::RunSpecifiedBenchmarks();
+  // Run correctness verification.
+  verification();
+  return 0;
+}
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/LLama/llama_manul.mlir b/benchmarks/OpOptimization/LLama/llama_manul.mlir
new file mode 100644
index 00000000..b3d1974e
--- /dev/null
+++ b/benchmarks/OpOptimization/LLama/llama_manul.mlir
@@ -0,0 +1,58 @@
+// Scalar loop nest: for every (%arg2, %arg3, %arg4, %arg5) it performs
+// %arg1[%arg2, %arg3, %arg5] += %arg0[%arg2, %arg3, %arg4] / 4096.0.
+module {
+  func.func @base(%arg0: memref<1x512x4096xf32>, %arg1: memref<1x512x1024xf32>) {
+    %cst = arith.constant 4.096000e+03 : f32
+    affine.for %arg2 = 0 to 1 {
+      affine.for %arg3 = 0 to 512 {
+        affine.for %arg4 = 0 to 4096 {
+          affine.for %arg5 = 0 to 1024 {
+            %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<1x512x4096xf32>
+            %1 = affine.load %arg1[%arg2, %arg3, %arg5] : memref<1x512x1024xf32>
+            %2 = arith.divf %0, %cst : f32
+            %3 = arith.addf %2, %1 : f32
+            affine.store %3, %arg1[%arg2, %arg3, %arg5] : memref<1x512x1024xf32>
+          }
+        }
+      }
+    }
+    return
+  }
+}
+
+
+
+// Tiled variant of the loop nest above: the outer loops step by 32 and the
+// inner loops are bounded with `min` maps.
+// NOTE(review): this module looks unfinished — the %arg7 loop below has a
+// `step` keyword with no value and no body, so the file will not parse as
+// written, and the module defines a second function named @base. Confirm
+// before wiring this file into a build.
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0) -> (512-d0, 32)>
+#map3 = affine_map<(d0) -> (4096-d0, 32)>
+// #map3 = affine_map<(d0) -> (1024-d0, 32)>
+module {
+  func.func @base(%arg0: memref<1x512x4096xf32>, %arg1: memref<1x512x1024xf32>) {
+    %cst = arith.constant 4.096000e+03 : f32
+    %c1024 = arith.constant 1024 : index
+    %c4096 = arith.constant 4096 : index
+    %c0 = arith.constant 0 : index
+    %c32 = arith.constant 32 : index
+    %c512 = arith.constant 512 : index
+    %c1 = arith.constant 1 : index
+    affine.for %arg2 = %c0 to %c1 step 1 {
+      affine.for %arg3 = %c0 to %c512 step 32 {
+        affine.for %arg4 = %c0 to %c4096 step 32 {
+          affine.for %arg5 = %c0 to %c1024 step 32 {
+            // %subview = memref.subview %arg0[0, %arg2, %arg3] [1, 32, 32] [1, 1, 1] : memref<1x512x4096xf32> to memref<1x32x32xf32, strided<[2097152, 4096, 1], offset: ?>>
+            // %subview_0 = memref.subview %arg1[0, %arg2, %arg4] [1, 32, 32] [1, 1, 1] : memref<1x512x1024xf32> to memref<1x32x32xf32, strided<[524288, 1024, 1], offset: ?>>
+            affine.for %arg6 = %c0 to min #map2(%arg3) step 1 {
+              affine.for %arg7 = %c0 to min #map3(%arg4) step
+            }
+            // linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%subview : memref<1x32x32xf32, strided<[2097152, 4096, 1], offset: ?>>) outs(%subview_0 : memref<1x32x32xf32, strided<[524288, 1024, 1], offset: ?>>) {
+            // ^bb0(%in: f32, %out: f32):
+            //   %0 = arith.divf %in, %cst : f32
+            //   %1 = arith.addf %0, %out : f32
+            //   linalg.yield %1 : f32
+            // }
+          }
+        }
+      }
+    }
+    return
+  }
+}
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/LLama/llama_origin.mlir b/benchmarks/OpOptimization/LLama/llama_origin.mlir
new file mode 100644
index 00000000..6df5c610
--- /dev/null
+++ b/benchmarks/OpOptimization/LLama/llama_origin.mlir
@@ -0,0 +1,14 @@
+// linalg.generic form of the update
+// %arg1[d0, d1, d3] += %arg0[d0, d1, d2] / 4096.0 over the (d0, d1, d2, d3)
+// iteration space; d2, which the output map drops, is the "reduction" iterator.
+#map8 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+#map9 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+module {
+  func.func @base(%arg0: memref<1x512x4096xf32>, %arg1: memref<1x512x1024xf32>) {
+    linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%arg0 : memref<1x512x4096xf32>) outs(%arg1 : memref<1x512x1024xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2372 = arith.constant 4.096000e+03 : f32
+      %4230 = arith.divf %in, %cst_2372 : f32
+      %4231 = arith.addf %4230, %out : f32
+      linalg.yield %4231 : f32
+    }
+    return
+  }
+}
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/LinalgGeneric/CMakeLists.txt b/benchmarks/OpOptimization/LinalgGeneric/CMakeLists.txt
new file mode 100644
index 00000000..2d78a16d
--- /dev/null
+++ b/benchmarks/OpOptimization/LinalgGeneric/CMakeLists.txt
@@ -0,0 +1,83 @@
+# Baseline object: lower linalg_generic_origin.mlir with mlir-opt, translate it
+# to LLVM IR and compile it with llc into mlir-generic-origin.o.
+# NOTE(review): none of the three custom commands in this file lists DEPENDS,
+# so editing a .mlir input will not by itself cause the object to be rebuilt.
+add_custom_command(OUTPUT mlir-generic-origin.o
+  COMMAND
+    ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${CMAKE_CURRENT_SOURCE_DIR}/linalg_generic_origin.mlir
+      # -convert-linalg-to-affine-loops
+      # -lower-affine
+      # -convert-vector-to-llvm
+      # -finalize-memref-to-llvm
+      # -convert-scf-to-cf
+      # -convert-linalg-to-llvm
+      # -llvm-request-c-wrappers
+      # -convert-func-to-llvm
+      # -one-shot-bufferize='bufferize-function-boundaries'
+      -llvm-request-c-wrappers
+      -test-lower-to-llvm
+      -reconcile-unrealized-casts |
+    ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
+    ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR}
+      -filetype=obj -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/LinalgGeneric/mlir-generic-origin.o
+)
+
+add_library(GenericOrigin STATIC mlir-generic-origin.o)
+set_target_properties(GenericOrigin PROPERTIES LINKER_LANGUAGE CXX)
+
+# Tiled object: the same .mlir source with the first "origin" on each line
+# rewritten to "transform_tiling" by sed, run through buddy-opt's
+# -polyhedral-tiling (tile sizes 32,32) and then lowered like the baseline.
+add_custom_command(OUTPUT mlir-transform-tiling.o
+  COMMAND
+    cat ${CMAKE_CURRENT_SOURCE_DIR}/linalg_generic_origin.mlir |
+    sed 's/origin/transform_tiling/' |
+    ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
+      -polyhedral-tiling='tile-sizes=32,32'
+      # -linalg-bufferize
+      # -convert-linalg-to-loops
+      # -convert-vector-to-llvm
+      # -lower-affine
+      # -convert-scf-to-cf
+      # -func-bufferize
+      # -finalizing-bufferize
+      # -finalize-memref-to-llvm
+      # -llvm-request-c-wrappers
+      # -convert-func-to-llvm
+      -reconcile-unrealized-casts |
+    ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+      -one-shot-bufferize='bufferize-function-boundaries'
+      -llvm-request-c-wrappers
+      -test-lower-to-llvm
+      -reconcile-unrealized-casts |
+    ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
+    ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR}
+      -filetype=obj -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/LinalgGeneric/mlir-transform-tiling.o
+)
+
+add_library(TransformTiling STATIC mlir-transform-tiling.o)
+set_target_properties(TransformTiling PROPERTIES LINKER_LANGUAGE CXX)
+
+# Hand-written object: lower linalg_manual_opt.mlir (all functions in that file)
+# with mlir-opt and compile it with llc into mlir-manual-opt.o.
+add_custom_command(OUTPUT mlir-manual-opt.o
+  COMMAND
+    ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${CMAKE_CURRENT_SOURCE_DIR}/linalg_manual_opt.mlir
+      # -one-shot-bufferize='bufferize-function-boundaries'
+      -llvm-request-c-wrappers
+      -test-lower-to-llvm
+      |
+    ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
+    ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR}
+      -filetype=obj -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/LinalgGeneric/mlir-manual-opt.o
+)
+
+add_library(ManualOpt STATIC mlir-manual-opt.o)
+set_target_properties(ManualOpt PROPERTIES LINKER_LANGUAGE CXX)
+
+# Benchmark driver linking the three kernel objects with Google Benchmark and
+# the MLIR runner utility libraries.
+add_executable(linalg-generic-benchmark
+  Main.cpp
+  GenericBenchmark.cpp
+)
+
+# NOTE(review): appending -march=native to CMAKE_CXX_FLAGS applies to every C++
+# target configured in this directory, and the link line below uses the
+# keyword-less signature; consider target_compile_options(... PRIVATE ...) and
+# an explicit PRIVATE on target_link_libraries.
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+target_link_directories(linalg-generic-benchmark PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
+target_link_libraries(linalg-generic-benchmark
+  GoogleBenchmark
+  GenericOrigin
+  TransformTiling
+  ManualOpt
+  mlir_c_runner_utils
+  mlir_runner_utils
+)
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/LinalgGeneric/GenericBenchmark.cpp b/benchmarks/OpOptimization/LinalgGeneric/GenericBenchmark.cpp
new file mode 100644
index 00000000..c27f3106
--- /dev/null
+++ b/benchmarks/OpOptimization/LinalgGeneric/GenericBenchmark.cpp
@@ -0,0 +1,117 @@
+//===- GenericBenchmark.cpp ----------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the benchmark for Linalg Generic operation.
+//
+//===----------------------------------------------------------------------===//
+
+#include
+#include
+#include
+#include
+
+// Define target layout.
+#define INPUT_H 4096
+#define INPUT_W 4096
+#define OUTPUT_W 4096
+
+namespace {
+
+// Helper functions and variables.
+// ANSI-colored verdict strings: green for PASS, red for FAIL.
+const std::string PASS = "\033[32mPASS\033[0m";
+const std::string FAIL = "\033[31mFAIL\033[0m";
+
+// Returns true when the first `size` elements of both arrays compare equal
+// with operator!= never firing, i.e. an exact comparison with no tolerance.
+bool areArraysEqual(float array1[], float array2[], int size) {
+  for (int i = 0; i < size; ++i) {
+    if (array1[i] != array2[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Declare the C interface.
+// One `_mlir_ciface_` wrapper per kernel; each takes the input and the output
+// buffer, in that order.
+extern "C" {
+void _mlir_ciface_origin(MemRef *input, MemRef *output);
+void _mlir_ciface_transform_tiling(MemRef *input, MemRef *output);
+void _mlir_ciface_manul(MemRef *input, MemRef *output);
+void _mlir_ciface_manul2(MemRef *input, MemRef *output);
+void _mlir_ciface_manul3(MemRef *input, MemRef *output);
+}
+
+// Defines BM_GENERIC_<name>: builds an INPUT_H x INPUT_W input and an
+// OUTPUT_W-element output, both constructed with 1.0, and calls `func` on them
+// once per benchmark iteration. The buffers are reused across iterations; the
+// output is not reset in between.
+#define DEFINE_BENCHMARK(name, func) \
+  void BM_GENERIC_##name(benchmark::State &state) { \
+    intptr_t sizesInput[2] = {INPUT_H, INPUT_W}; \
+    intptr_t sizesOutput[1] = {OUTPUT_W}; \
+    MemRef input(sizesInput, 1.0); \
+    MemRef output(sizesOutput, 1.0); \
+    for (auto _ : state) { \
+      func(&input, &output); \
+    } \
+  }
+
+
+DEFINE_BENCHMARK(ORIGIN, _mlir_ciface_origin)
+DEFINE_BENCHMARK(TRANSFORM_TILING, _mlir_ciface_transform_tiling)
+DEFINE_BENCHMARK(MANUL, _mlir_ciface_manul)
+DEFINE_BENCHMARK(MANUL2, _mlir_ciface_manul2)
+// DEFINE_BENCHMARK(MANUL3, _mlir_ciface_manul3)
+} // namespace
+
+// Register the benchmarks; timings are reported in milliseconds.
+BENCHMARK(BM_GENERIC_ORIGIN)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_GENERIC_TRANSFORM_TILING)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_GENERIC_MANUL)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_GENERIC_MANUL2)->Unit(benchmark::kMillisecond);
+// BENCHMARK(BM_GENERIC_MANUL3)->Unit(benchmark::kMillisecond);
+
+// Defines VERIFICATION_<name>: runs `func` on the by-value input/output and
+// prints "<name> case: PASS" or "<name> case: FAIL" depending on whether the
+// first OUTPUT_W elements of the produced output equal `resultScalar`.
+#define DEFINE_VERIFICATION(name, func) \
+  void VERIFICATION_##name(MemRef input, \
+                           MemRef output, \
+                           float resultScalar[]) { \
+    func(&input, &output); \
+    auto result##name = output.getData(); \
+    std::cout << #name << " case: " \
+              << (areArraysEqual(resultScalar, result##name, OUTPUT_W) \
+                      ? PASS \
+                      : FAIL) \
+              << std::endl; \
+  }
+
+DEFINE_VERIFICATION(TRANSFORM_TILING, _mlir_ciface_transform_tiling)
+DEFINE_VERIFICATION(MANUL, _mlir_ciface_manul)
+
+// Runs _mlir_ciface_origin once to obtain the reference output, then checks
+// the transform-tiling and manual kernels against it, each starting from a
+// freshly constructed output buffer.
+void verification() {
+  intptr_t sizesInput[2] = {INPUT_H, INPUT_W};
+  intptr_t sizesOutput[1] = {OUTPUT_W};
+  MemRef input(sizesInput, 1.0);
+  MemRef output(sizesOutput, 1.0);
+  _mlir_ciface_origin(&input, &output);
+  // `output` stays alive until the end of this function, so the pointer taken
+  // here remains valid for both comparisons below.
+  auto resultScalar = output.getData();
+  // Print the verification results.
+  std::cout << "---------------------------------------------------------------"
+               "---------"
+            << std::endl;
+  std::cout << "Correctness Verification:" << std::endl;
+  MemRef newOutput1(sizesOutput, 1.0);
+  VERIFICATION_TRANSFORM_TILING(input, newOutput1, resultScalar);
+  MemRef newOutput2(sizesOutput, 1.0);
+  VERIFICATION_MANUL(input, newOutput2, resultScalar);
+
+  std::cout << "---------------------------------------------------------------"
+               "---------"
+            << std::endl;
+
+}
diff --git a/benchmarks/OpOptimization/LinalgGeneric/Main.cpp b/benchmarks/OpOptimization/LinalgGeneric/Main.cpp
new file mode 100644
index 00000000..4e4e9f7d
--- /dev/null
+++ b/benchmarks/OpOptimization/LinalgGeneric/Main.cpp
@@ -0,0 +1,32 @@
+//===- Main.cpp -----------------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the main file of the linalg generic benchmark.
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE(review): the header name is missing from this #include — presumably
+// the Google Benchmark header, given the ::benchmark calls below; confirm.
+#include
+
+// Defined in GenericBenchmark.cpp.
+void verification();
+
+// Runs the registered benchmarks, then the correctness verification.
+int main(int argc, char **argv) {
+  // Run benchmark.
+  ::benchmark::Initialize(&argc, argv);
+  ::benchmark::RunSpecifiedBenchmarks();
+  // Run correctness verification.
+  verification();
+  return 0;
+}
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/LinalgGeneric/linalg_generic_origin.mlir b/benchmarks/OpOptimization/LinalgGeneric/linalg_generic_origin.mlir
new file mode 100644
index 00000000..e6601f9a
--- /dev/null
+++ b/benchmarks/OpOptimization/LinalgGeneric/linalg_generic_origin.mlir
@@ -0,0 +1,11 @@
+// Baseline kernel: over the (d0, d1) iteration space it performs
+// %arg1[d1] = %arg0[d1, d0] + %arg1[d1], i.e. every element of input row d1 is
+// added into output element d1.
+// NOTE(review): d0 does not appear in the output map #map1 yet is declared
+// "parallel"; confirm whether it should be a "reduction" iterator.
+// #map2 is not referenced in this file.
+#map = affine_map<(d0, d1) -> (d1, d0)>
+#map1 = affine_map<(d0, d1) -> (d1)>
+#map2 = affine_map<(d0) -> (d0 + 256, 4096)>
+func.func @origin(%arg0: memref<4096x4096xf32>, %arg1: memref<4096xf32>) {
+  linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins (%arg0: memref<4096x4096xf32>) outs(%arg1: memref<4096xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %2 = arith.addf %in, %out : f32
+    linalg.yield %2 : f32
+  }
+  return
+}
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/LinalgGeneric/linalg_manual_opt.mlir b/benchmarks/OpOptimization/LinalgGeneric/linalg_manual_opt.mlir
new file mode 100644
index 00000000..51fbaf59
--- /dev/null
+++ b/benchmarks/OpOptimization/LinalgGeneric/linalg_manual_opt.mlir
@@ -0,0 +1,73 @@
+// Upper-bound maps used with `min`: a tile size versus the extent remaining
+// after offset d0. #map1 is used by @manul3 and #map2 by @manul2; #map is only
+// referenced from the commented-out variant inside @manul, and #map3 is not
+// referenced at all.
+#map = affine_map<(d0) -> (256, 4096-d0)>
+#map1 = affine_map<(d0) -> (64, 4096-d0)>
+#map2 = affine_map<(d0) -> (8, 4096-d0)>
+#map3 = affine_map<(d0) -> (4, 4096-d0)>
+// Vectorized variant: for every row %arg3 and every 32-wide column chunk
+// starting at %arg4 it performs
+// %arg1[%arg4 .. %arg4+31] += %arg0[%arg3, %arg4 .. %arg4+31].
+// NOTE(review): the output is indexed by the column here, whereas @manul2 and
+// the linalg.generic in linalg_generic_origin.mlir index the output by the row
+// of the input. The results agree only when the input is the same along both
+// axes (the benchmark fills it with a constant) — confirm this is intended.
+// %c256 and %c1 are not used by the active code in this function.
+func.func @manul(%arg0: memref<4096x4096xf32>, %arg1: memref<4096xf32>) {
+  %c0 = arith.constant 0 : index
+  %c256 = arith.constant 256 : index
+  %c4096 = arith.constant 4096 : index
+  %c1 = arith.constant 1 : index
+  affine.for %arg3 = %c0 to %c4096 step 1 {
+    affine.for %arg4 = %c0 to %c4096 step 32 {
+      %0 = affine.vector_load %arg0[%arg3, %arg4] : memref<4096x4096xf32>, vector<32xf32>
+      %1 = affine.vector_load %arg1[%arg4] : memref<4096xf32>, vector<32xf32>
+      %2 = arith.addf %0, %1 : vector<32xf32>
+      affine.vector_store %2, %arg1[%arg4] : memref<4096xf32>, vector<32xf32>
+    }
+  }
+  // Disabled 256x256-tiled variant, kept for reference:
+  // affine.for %arg3 = %c0 to %c4096 step 256 {
+  //   affine.for %arg4 = %c0 to %c4096 step 256 {
+  //     affine.for %arg5 = %c0 to min #map(%arg3) step 1 {
+  //       affine.for %arg6 = %c0 to min #map(%arg4) step 32 {
+  //         // %0 = affine.load %arg0[%arg3 + %arg5, %arg4 + %arg6] : memref<4096x4096xf32>
+  //         // %1 = affine.load %arg1[%arg3 + %arg5] : memref<4096xf32>
+  //         // %2 = arith.addf %0, %1 : f32
+  //         // affine.store %2, %arg1[%arg3 + %arg5] : memref<4096xf32>
+  //         %0 = affine.vector_load %arg0[%arg3 + %arg5, %arg4 + %arg6] : memref<4096x4096xf32>, vector<32xf32>
+  //         %1 = affine.vector_load %arg1[%arg4 + %arg6] : memref<4096xf32>, vector<32xf32>
+  //         %2 = arith.addf %0, %1 : vector<32xf32>
+  //         affine.vector_store %2, %arg1[%arg4 + %arg6] : memref<4096xf32>, vector<32xf32>
+  //       }
+  //     }
+  //   }
+  // }
+  return
+}
+
+// Scalar variant with the rows tiled by 8: for each row block starting at
+// %arg3, each column %arg4 and each row offset %arg5 within the block it
+// performs %arg1[%arg3 + %arg5] += %arg0[%arg3 + %arg5, %arg4].
+// %c256 and %c1 are not used in this function.
+func.func @manul2(%arg0: memref<4096x4096xf32>, %arg1: memref<4096xf32>) {
+  %c0 = arith.constant 0 : index
+  %c256 = arith.constant 256 : index
+  %c4096 = arith.constant 4096 : index
+  %c1 = arith.constant 1 : index
+  affine.for %arg3 = %c0 to %c4096 step 8 {
+    affine.for %arg4 = %c0 to %c4096 step 1 {
+      affine.for %arg5 = %c0 to min #map2(%arg3) {
+        %0 = affine.load %arg0[%arg3 + %arg5, %arg4] : memref<4096x4096xf32>
+        %1 = affine.load %arg1[%arg3 + %arg5] : memref<4096xf32>
+        %2 = arith.addf %0, %1 : f32
+        affine.store %2, %arg1[%arg3 + %arg5] : memref<4096xf32>
+      }
+    }
+  }
+  return
+}
+
+// 64x64-tiled, 32-wide vectorized variant: within each tile it performs
+// %arg1[c .. c+31] += %arg0[r, c .. c+31] with r = %arg3 + %arg5 and
+// c = %arg4 + %arg6.
+// NOTE(review): like @manul, the output is indexed by the column rather than
+// the row — confirm. %c256 and %c1 are not used in this function.
+func.func @manul3(%arg0: memref<4096x4096xf32>, %arg1: memref<4096xf32>) {
+  %c0 = arith.constant 0 : index
+  %c256 = arith.constant 256 : index
+  %c4096 = arith.constant 4096 : index
+  %c1 = arith.constant 1 : index
+  affine.for %arg3 = %c0 to %c4096 step 64 {
+    affine.for %arg4 = %c0 to %c4096 step 64 {
+      affine.for %arg5 = %c0 to min #map1(%arg3) step 1 {
+        affine.for %arg6 = %c0 to min #map1(%arg4) step 32 {
+          %0 = affine.vector_load %arg0[%arg3 + %arg5, %arg4 + %arg6] : memref<4096x4096xf32>, vector<32xf32>
+          %1 = affine.vector_load %arg1[%arg4 + %arg6] : memref<4096xf32>, vector<32xf32>
+          %2 = arith.addf %0, %1 : vector<32xf32>
+          affine.vector_store %2, %arg1[%arg4 + %arg6] : memref<4096xf32>, vector<32xf32>
+        }
+      }
+    }
+  }
+  return
+}
\ No newline at end of file