diff --git a/benchmarks/DeepLearning/Models/MiniLM-L6/CMakeLists.txt b/benchmarks/DeepLearning/Models/MiniLM-L6/CMakeLists.txt
index cafd13c8..679678fb 100644
--- a/benchmarks/DeepLearning/Models/MiniLM-L6/CMakeLists.txt
+++ b/benchmarks/DeepLearning/Models/MiniLM-L6/CMakeLists.txt
@@ -4,6 +4,7 @@ COMMAND
   ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/MiniLM-L6-200.mlir
       --linalg-bufferize
       --batchmatmul-optimize
+      # --matmul-optimize
       --convert-linalg-to-loops
       --func-bufferize
       --arith-bufferize
diff --git a/benchmarks/DeepLearning/Models/ResNet-18/CMakeLists.txt b/benchmarks/DeepLearning/Models/ResNet-18/CMakeLists.txt
index a5987187..1294d6dc 100644
--- a/benchmarks/DeepLearning/Models/ResNet-18/CMakeLists.txt
+++ b/benchmarks/DeepLearning/Models/ResNet-18/CMakeLists.txt
@@ -7,16 +7,19 @@ COMMAND
     --pass-pipeline="${RESNET18_TOSA_PIPELINE}" | 
   ${LLVM_MLIR_BINARY_DIR}/mlir-opt
     --test-linalg-transform-patterns="test-generalize-pad-tensor"
-    --linalg-bufferize
+    --linalg-bufferize |
+  ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt 
+    --conv-broadcast |
+  ${LLVM_MLIR_BINARY_DIR}/mlir-opt
     --convert-linalg-to-loops
     --func-bufferize
     --arith-bufferize
     --tensor-bufferize 
     --finalizing-bufferize
     --convert-vector-to-scf
+    --lower-affine
     --convert-scf-to-cf
     --expand-strided-metadata
-    --lower-affine
     --convert-vector-to-llvm
     --memref-expand
     --arith-expand
diff --git a/benchmarks/OpOptimization/CMakeLists.txt b/benchmarks/OpOptimization/CMakeLists.txt
index f96942a1..aa700e8f 100644
--- a/benchmarks/OpOptimization/CMakeLists.txt
+++ b/benchmarks/OpOptimization/CMakeLists.txt
@@ -1,2 +1,5 @@
 add_subdirectory(Conv2dNchwFchw)
+# NOTE(review): Conv2dNhwcHwcf is left out of the build; confirm this is intended — add_subdirectory(Conv2dNhwcHwcf)
 add_subdirectory(MatMul)
+add_subdirectory(LinalgGeneric)
+add_subdirectory(LLama)
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/Conv2dNhwcHwcf/CMakeLists.txt b/benchmarks/OpOptimization/Conv2dNhwcHwcf/CMakeLists.txt
new file mode 100644
index 00000000..6415a4e8
--- /dev/null
+++ b/benchmarks/OpOptimization/Conv2dNhwcHwcf/CMakeLists.txt
@@ -0,0 +1,93 @@
+add_custom_command(OUTPUT mlir-conv2d_nhwc_hwcf_scalar.o  # scalar reference kernel, lowered with upstream mlir-opt
+  COMMAND sed 's/conv2d_nhwc_hwcf/conv2d_nhwc_hwcf_scalar/' ${CMAKE_CURRENT_SOURCE_DIR}/Conv2DNhwcHwcf.mlir |
+          ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+            -convert-linalg-to-affine-loops
+            -lower-affine
+            -convert-vector-to-llvm
+            -finalize-memref-to-llvm
+            -convert-scf-to-cf
+            -convert-linalg-to-llvm
+            -llvm-request-c-wrappers
+            -convert-func-to-llvm
+            -reconcile-unrealized-casts |
+          ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+          ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR}
+            --filetype=obj -o ${CMAKE_CURRENT_BINARY_DIR}/mlir-conv2d_nhwc_hwcf_scalar.o  # same directory the relative OUTPUT resolves to
+  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/Conv2DNhwcHwcf.mlir  # regenerate the object when the kernel source changes
+)
+add_library(Conv2dNhwcHwcfScalar STATIC mlir-conv2d_nhwc_hwcf_scalar.o)
+set_target_properties(Conv2dNhwcHwcfScalar PROPERTIES LINKER_LANGUAGE CXX)
+
+# Defines the static library Conv2dNhwcHwcfBroadcast<step>: Conv2DNhwcHwcf.mlir is renamed
+# with sed and lowered through buddy-opt with -conv-broadcast=stride=<step>.
+function(build_conv2d_nhwc_hwcf_broadcast step)
+  add_custom_command(OUTPUT mlir_conv2d_nhwc_hwfc_${step}.o
+    COMMAND
+      sed 's/conv2d_nhwc_hwcf/conv2d_nhwc_hwcf_broadcast_${step}/' ${CMAKE_CURRENT_SOURCE_DIR}/Conv2DNhwcHwcf.mlir |
+      ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
+        -conv-broadcast=stride=${step}
+        -convert-linalg-to-loops
+        -convert-vector-to-scf
+        -lower-affine
+        -convert-scf-to-cf
+        -convert-vector-to-llvm
+        -finalize-memref-to-llvm
+        -convert-arith-to-llvm
+        -llvm-request-c-wrappers
+        -convert-func-to-llvm
+        -convert-cf-to-llvm
+        -reconcile-unrealized-casts |
+      ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
+      ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR}
+        --filetype=obj -o ${CMAKE_CURRENT_BINARY_DIR}/mlir_conv2d_nhwc_hwfc_${step}.o
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/Conv2DNhwcHwcf.mlir  # regenerate when the kernel source changes
+  )
+  add_library(Conv2dNhwcHwcfBroadcast${step} STATIC mlir_conv2d_nhwc_hwfc_${step}.o)
+  set_target_properties(Conv2dNhwcHwcfBroadcast${step} PROPERTIES LINKER_LANGUAGE CXX)
+endfunction()
+
+foreach(stride IN ITEMS 16 32 64 128 256)
+  build_conv2d_nhwc_hwcf_broadcast(${stride})
+endforeach()
+
+# Hand-written broadcast kernel from Conv2DNhwcHwcfBroadcast.mlir; no -conv-broadcast pass is applied to it.
+add_custom_command(OUTPUT mlir_conv2d_nhwc_hwfc_broadcast.o
+  COMMAND ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/Conv2DNhwcHwcfBroadcast.mlir
+    -convert-linalg-to-loops
+    -convert-vector-to-scf
+    -lower-affine
+    -convert-scf-to-cf
+    -convert-vector-to-llvm
+    -finalize-memref-to-llvm
+    -convert-arith-to-llvm
+    -llvm-request-c-wrappers
+    -convert-func-to-llvm
+    -convert-cf-to-llvm
+    -reconcile-unrealized-casts |
+  ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
+  ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR}
+    --filetype=obj -o ${CMAKE_CURRENT_BINARY_DIR}/mlir_conv2d_nhwc_hwfc_broadcast.o
+  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/Conv2DNhwcHwcfBroadcast.mlir  # regenerate when the kernel source changes
+)
+add_library(Conv2dNhwcHwcfBroadcast STATIC mlir_conv2d_nhwc_hwfc_broadcast.o)
+set_target_properties(Conv2dNhwcHwcfBroadcast PROPERTIES LINKER_LANGUAGE CXX)
+
+add_executable(conv2d-nhwc-hwcf-benchmark
+  Main.cpp
+  Conv2DNhwcHwcfBenchmark.cpp
+)
+# -march=native is attached to this target only instead of being appended to the global CMAKE_CXX_FLAGS.
+target_compile_options(conv2d-nhwc-hwcf-benchmark PRIVATE -march=native)
+target_link_directories(conv2d-nhwc-hwcf-benchmark PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
+target_link_libraries(conv2d-nhwc-hwcf-benchmark PRIVATE
+  GoogleBenchmark
+  Conv2dNhwcHwcfScalar
+  Conv2dNhwcHwcfBroadcast
+  Conv2dNhwcHwcfBroadcast16
+  Conv2dNhwcHwcfBroadcast32
+  Conv2dNhwcHwcfBroadcast64
+  Conv2dNhwcHwcfBroadcast128
+  Conv2dNhwcHwcfBroadcast256
+  mlir_c_runner_utils
+  mlir_runner_utils
+)
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcf.mlir b/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcf.mlir
new file mode 100644
index 00000000..c6fab05a
--- /dev/null
+++ b/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcf.mlir
@@ -0,0 +1,6 @@
+func.func @conv2d_nhwc_hwcf(%input: memref<?x?x?x?xf32>, %filter: memref<?x?x?x?xf32>, %output: memref<?x?x?x?xf32>) { // Reference kernel: one linalg convolution over dynamically shaped memrefs.
+    linalg.conv_2d_nhwc_hwcf 
+    ins(%input, %filter : memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) 
+    outs(%output : memref<?x?x?x?xf32>)
+    return
+}
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcfBenchmark.cpp b/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcfBenchmark.cpp
new file mode 100644
index 00000000..46c51240
--- /dev/null
+++ b/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcfBenchmark.cpp
@@ -0,0 +1,187 @@
+//===- Conv2DNhwcHwcfBenchmark.cpp ----------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the benchmark for Conv2d operation.
+//
+//===----------------------------------------------------------------------===//
+
+#include <benchmark/benchmark.h>
+#include <buddy/Core/Container.h>
+#include <iostream>
+#include <random>
+
+// Define target layout.
+#define INPUT_N 1
+#define INPUT_H 58
+#define INPUT_W 58
+#define INPUT_C 64
+#define KERNEL_H 3
+#define KERNEL_W 3
+#define KERNEL_C 64
+#define KERNEL_F 64
+#define OUTPUT_N 1
+#define OUTPUT_H 56
+#define OUTPUT_W 56
+#define OUTPUT_F 64
+
+// Helper functions and variables.
+namespace {
+// ANSI-colored status labels: PASS is green (32), FAIL is red (31).
+const std::string PASS = "\033[32mPASS\033[0m";
+const std::string FAIL = "\033[31mFAIL\033[0m";
+
+// Exact element-wise comparison of the first `size` floats of both arrays.
+bool areArraysEqual(float array1[], float array2[], int size) {
+  for (int i = 0; i < size; ++i) {
+    if (array1[i] != array2[i])
+      return false;
+  }
+  return true;
+}
+} // namespace
+
+namespace {
+
+// C interfaces of the MLIR kernels linked into this benchmark.
+extern "C" {
+void _mlir_ciface_conv2d_nhwc_hwcf_scalar(MemRef<float, 4> *input,
+                                          MemRef<float, 4> *filter,
+                                          MemRef<float, 4> *output);
+void _mlir_ciface_conv_2d_nhwc_hwcf(MemRef<float, 4> *input,
+                                    MemRef<float, 4> *filter,
+                                    MemRef<float, 4> *output);
+void _mlir_ciface_conv2d_nhwc_hwcf_broadcast_16(MemRef<float, 4> *input,
+                                                MemRef<float, 4> *filter,
+                                                MemRef<float, 4> *output);
+void _mlir_ciface_conv2d_nhwc_hwcf_broadcast_32(MemRef<float, 4> *input,
+                                                MemRef<float, 4> *filter,
+                                                MemRef<float, 4> *output);
+void _mlir_ciface_conv2d_nhwc_hwcf_broadcast_64(MemRef<float, 4> *input,
+                                                MemRef<float, 4> *filter,
+                                                MemRef<float, 4> *output);
+void _mlir_ciface_conv2d_nhwc_hwcf_broadcast_128(MemRef<float, 4> *input,
+                                                 MemRef<float, 4> *filter,
+                                                 MemRef<float, 4> *output);
+void _mlir_ciface_conv2d_nhwc_hwcf_broadcast_256(MemRef<float, 4> *input,
+                                                 MemRef<float, 4> *filter,
+                                                 MemRef<float, 4> *output);
+}
+// Defines BM_CONV2D_NHWC_HWCF_<name>, which times `func` on constant-filled containers.
+#define DEFINE_BENCHMARK(name, func)                                    \
+  void BM_CONV2D_NHWC_HWCF_##name(benchmark::State &state) {            \
+    intptr_t sizesInput[4] = {INPUT_N, INPUT_H, INPUT_W, INPUT_C};      \
+    intptr_t sizesKernel[4] = {KERNEL_H, KERNEL_W, KERNEL_C, KERNEL_F}; \
+    intptr_t sizesOutput[4] = {OUTPUT_N, OUTPUT_H, OUTPUT_W, OUTPUT_F}; \
+    MemRef<float, 4> input(sizesInput, 1.0);                            \
+    MemRef<float, 4> filter(sizesKernel, 1.0);                          \
+    MemRef<float, 4> output(sizesOutput, 0);                            \
+    for (auto _ : state) {                                              \
+      func(&input, &filter, &output);                                   \
+    }                                                                   \
+  }
+
+DEFINE_BENCHMARK(SCALAR, _mlir_ciface_conv2d_nhwc_hwcf_scalar)
+DEFINE_BENCHMARK(BROADCAST, _mlir_ciface_conv_2d_nhwc_hwcf)
+DEFINE_BENCHMARK(BROADCAST_16, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_16)
+DEFINE_BENCHMARK(BROADCAST_32, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_32)
+DEFINE_BENCHMARK(BROADCAST_64, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_64)
+DEFINE_BENCHMARK(BROADCAST_128, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_128)
+DEFINE_BENCHMARK(BROADCAST_256, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_256)
+} // namespace
+
+BENCHMARK(BM_CONV2D_NHWC_HWCF_SCALAR)->Unit(benchmark::kMillisecond); // Register each variant; times are reported in milliseconds.
+BENCHMARK(BM_CONV2D_NHWC_HWCF_BROADCAST)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_CONV2D_NHWC_HWCF_BROADCAST_16)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_CONV2D_NHWC_HWCF_BROADCAST_32)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_CONV2D_NHWC_HWCF_BROADCAST_64)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_CONV2D_NHWC_HWCF_BROADCAST_128)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_CONV2D_NHWC_HWCF_BROADCAST_256)->Unit(benchmark::kMillisecond);
+
+// Defines VERIFICATION_<name>: runs `func` and prints PASS/FAIL against `resultScalar`.
+#define DEFINE_VERIFICATION(name, func)                                 \
+  void VERIFICATION_##name(MemRef<float, 4> inputMemRef,                \
+                           MemRef<float, 4> kernelMemRef,               \
+                           float resultScalar[]) {                      \
+    intptr_t sizesOutput[4] = {OUTPUT_N, OUTPUT_H, OUTPUT_W, OUTPUT_F}; \
+    MemRef<float, 4> output##name(sizesOutput, 0);                      \
+    func(&inputMemRef, &kernelMemRef, &output##name);                   \
+    auto result##name = output##name.getData();                         \
+    const int outputSize = OUTPUT_N * OUTPUT_H * OUTPUT_W * OUTPUT_F;   \
+    const bool matches##name =                                          \
+        areArraysEqual(resultScalar, result##name, outputSize);         \
+    std::cout << #name << " case: " << (matches##name ? PASS : FAIL)    \
+              << std::endl;                                             \
+  }
+
+DEFINE_VERIFICATION(BROADCAST, _mlir_ciface_conv_2d_nhwc_hwcf)
+DEFINE_VERIFICATION(BROADCAST_16, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_16)
+DEFINE_VERIFICATION(BROADCAST_32, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_32)
+DEFINE_VERIFICATION(BROADCAST_64, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_64)
+DEFINE_VERIFICATION(BROADCAST_128, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_128)
+DEFINE_VERIFICATION(BROADCAST_256, _mlir_ciface_conv2d_nhwc_hwcf_broadcast_256)
+
+void verification() {
+  // Random source: a Mersenne Twister seeded from std::random_device,
+  // producing integers in [1, 100].
+  std::random_device entropy;
+  std::mt19937 engine(entropy());
+  std::uniform_int_distribution<int> pick(1, 100);
+
+  // Shapes of the input, kernel and output memref containers.
+  intptr_t sizesInput[4] = {INPUT_N, INPUT_H, INPUT_W, INPUT_C};
+  intptr_t sizesKernel[4] = {KERNEL_H, KERNEL_W, KERNEL_C, KERNEL_F};
+  intptr_t sizesOutput[4] = {OUTPUT_N, OUTPUT_H, OUTPUT_W, OUTPUT_F};
+
+  // Random input data, drawn first and then wrapped in a memref
+  // container.
+  const int inputCount = INPUT_N * INPUT_H * INPUT_W * INPUT_C;
+  float inputValues[inputCount];
+  for (int idx = 0; idx != inputCount; ++idx)
+    inputValues[idx] = pick(engine);
+  MemRef<float, 4> inputMemRef(inputValues, sizesInput);
+
+  // Random kernel data, drawn after the input from the same engine.
+  const int kernelCount = KERNEL_H * KERNEL_W * KERNEL_C * KERNEL_F;
+  float kernelValues[kernelCount];
+  for (int idx = 0; idx != kernelCount; ++idx)
+    kernelValues[idx] = pick(engine);
+  MemRef<float, 4> kernelMemRef(kernelValues, sizesKernel);
+
+  // The scalar kernel's output is the reference every other variant is
+  // compared against.
+  MemRef<float, 4> outputScalar(sizesOutput, 0);
+  _mlir_ciface_conv2d_nhwc_hwcf_scalar(&inputMemRef, &kernelMemRef,
+                                       &outputScalar);
+  auto resultScalar = outputScalar.getData();
+
+  // Print one result line per variant, framed by two separator rules.
+  const char *separator =
+      "---------------------------------------------------------------"
+      "---------";
+  std::cout << separator << std::endl;
+  std::cout << "Correctness Verification:" << std::endl;
+
+  VERIFICATION_BROADCAST(inputMemRef, kernelMemRef, resultScalar);
+
+  // Broadcast variants 16 through 256.
+  VERIFICATION_BROADCAST_16(inputMemRef, kernelMemRef, resultScalar);
+  VERIFICATION_BROADCAST_32(inputMemRef, kernelMemRef, resultScalar);
+  VERIFICATION_BROADCAST_64(inputMemRef, kernelMemRef, resultScalar);
+  VERIFICATION_BROADCAST_128(inputMemRef, kernelMemRef, resultScalar);
+  VERIFICATION_BROADCAST_256(inputMemRef, kernelMemRef, resultScalar);
+
+  std::cout << separator << std::endl;
+}
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcfBroadcast.mlir b/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcfBroadcast.mlir
new file mode 100644
index 00000000..1f18b4c0
--- /dev/null
+++ b/benchmarks/OpOptimization/Conv2dNhwcHwcf/Conv2DNhwcHwcfBroadcast.mlir
@@ -0,0 +1,105 @@
+#map0 = affine_map<(d0, d1, d2, d3) -> (d2)>
+#map1 = affine_map<(d0) -> (d0 ceildiv 32)>
+
+// Empty placeholder; nothing in this file calls it.
+func.func @transpose(%input: memref<?x?x?x?xf32>,
+                     %output: memref<?x?x?x?xf32>) {
+  return
+}
+
+// Hand-written conv_2d_nhwc_hwcf, vectorized along the output column.
+//
+// Every non-zero kernel element is broadcast to a 32-lane vector and fused
+// multiply-added with up to 32 input columns (read with `#map0`, i.e. along
+// the third memref dimension). Partial sums are kept in a temporary buffer
+// laid out as N x F x H x W so that the accumulator loads and stores are
+// contiguous along the output column. The buffer is seeded from `%output`
+// (linalg convolutions accumulate into their output operand), copied back
+// in NHWC order at the end, and then released.
+func.func @conv_2d_nhwc_hwcf(%input: memref<?x?x?x?xf32>,
+                             %kernel: memref<?x?x?x?xf32>,
+                             %output: memref<?x?x?x?xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c3 = arith.constant 3 : index
+  %c32 = arith.constant 32 : index
+  %c0_f32 = arith.constant 0.0 : f32
+  %c0_f32_vec = vector.splat %c0_f32 : vector<32xf32>
+
+  // Get the n size. (batch)
+  %n = memref.dim %input, %c0 : memref<?x?x?x?xf32>
+  // Get the f size. (feature)
+  %f = memref.dim %kernel, %c3 : memref<?x?x?x?xf32>
+  // Get the c size. (channel)
+  %c = memref.dim %kernel, %c2 : memref<?x?x?x?xf32>
+  // Get the 2D output size. (row and column)
+  %output_row = memref.dim %output, %c1 : memref<?x?x?x?xf32>
+  %output_col = memref.dim %output, %c2 : memref<?x?x?x?xf32>
+  // Get the 2D kernel size. (row and column)
+  %kernel_row = memref.dim %kernel, %c0 : memref<?x?x?x?xf32>
+  %kernel_col = memref.dim %kernel, %c1 : memref<?x?x?x?xf32>
+
+  // Temporary accumulator in N x F x H x W layout.
+  %output_transpose = memref.alloc(%n, %f, %output_row, %output_col) : memref<?x?x?x?xf32>
+
+  // `memref.alloc` returns uninitialized memory, so seed the accumulator
+  // with the current contents of the output before accumulating into it.
+  affine.for %n_idx = %c0 to %n {
+    affine.for %f_idx = %c0 to %f {
+      affine.for %output_row_idx = %c0 to %output_row {
+        affine.for %output_col_idx = %c0 to %output_col {
+          %val = memref.load %output[%n_idx, %output_row_idx, %output_col_idx, %f_idx] : memref<?x?x?x?xf32>
+          memref.store %val, %output_transpose[%n_idx, %f_idx, %output_row_idx, %output_col_idx] : memref<?x?x?x?xf32>
+        }
+      }
+    }
+  }
+
+  affine.for %n_idx = %c0 to %n {
+    affine.for %output_row_idx = %c0 to %output_row {
+      affine.for %kernel_row_idx = %c0 to %kernel_row {
+        affine.for %kernel_col_idx = %c0 to %kernel_col {
+          affine.for %output_col_idx = %c0 to #map1(%output_col) {
+            affine.for %c_idx = %c0 to %c {
+              affine.for %f_idx = %c0 to %f {
+                // Check sparsity.
+                %kernel_ele = memref.load %kernel[%kernel_row_idx, %kernel_col_idx, %c_idx, %f_idx] : memref<?x?x?x?xf32>
+                %sparsity_flag = arith.cmpf one, %kernel_ele, %c0_f32 : f32
+                scf.if %sparsity_flag {
+                  %kernel_vec = vector.broadcast %kernel_ele : f32 to vector<32xf32>
+                  // First output column of this 32-wide strip; the mask covers the columns that remain.
+                  %output_col_cur = arith.muli %output_col_idx, %c32 : index
+                  %tail_len = arith.subi %output_col, %output_col_cur : index
+                  %mask_vec = vector.create_mask %tail_len : vector<32xi1>
+                  %input_row_idx_tail = arith.addi %output_row_idx, %kernel_row_idx : index
+                  %input_col_idx_tail = arith.addi %kernel_col_idx, %output_col_cur : index
+                  %input_vec_tail = vector.transfer_read %input[%n_idx, %input_row_idx_tail, %input_col_idx_tail, %c_idx], %c0_f32, %mask_vec {permutation_map = #map0, in_bounds = [true]} : memref<?x?x?x?xf32>, vector<32xf32>
+                  %output_vec_tail = vector.maskedload %output_transpose[%n_idx, %f_idx, %output_row_idx, %output_col_cur], %mask_vec, %c0_f32_vec : memref<?x?x?x?xf32>, vector<32xi1>, vector<32xf32> into vector<32xf32>
+                  %result_vec_tail = vector.fma %input_vec_tail, %kernel_vec, %output_vec_tail : vector<32xf32>
+                  vector.maskedstore %output_transpose[%n_idx, %f_idx, %output_row_idx, %output_col_cur], %mask_vec, %result_vec_tail : memref<?x?x?x?xf32>, vector<32xi1>, vector<32xf32>
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Copy the accumulated result back into the NHWC output.
+  affine.for %n_idx = %c0 to %n {
+    affine.for %output_row_idx = %c0 to %output_row {
+      affine.for %output_col_idx = %c0 to %output_col {
+        affine.for %f_idx = %c0 to %f {
+          %val = memref.load %output_transpose[%n_idx, %f_idx, %output_row_idx, %output_col_idx] : memref<?x?x?x?xf32>
+          memref.store %val, %output[%n_idx, %output_row_idx, %output_col_idx, %f_idx] : memref<?x?x?x?xf32>
+        }
+      }
+    }
+  }
+
+  // Release the temporary so repeated calls do not leak it.
+  memref.dealloc %output_transpose : memref<?x?x?x?xf32>
+  return
+}
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/Conv2dNhwcHwcf/Main.cpp b/benchmarks/OpOptimization/Conv2dNhwcHwcf/Main.cpp
new file mode 100644
index 00000000..265e8ab2
--- /dev/null
+++ b/benchmarks/OpOptimization/Conv2dNhwcHwcf/Main.cpp
@@ -0,0 +1,32 @@
+//===- Main.cpp -----------------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the main file of the conv2d nhwc hwcf benchmark.
+//
+//===----------------------------------------------------------------------===//
+
+#include <benchmark/benchmark.h>
+
+void verification();
+
+int main(int argc, char **argv) {
+    // Hand the command-line flags to Google Benchmark, then run the registered benchmarks.
+    ::benchmark::Initialize(&argc, argv);
+    ::benchmark::RunSpecifiedBenchmarks();
+    // After timing, run the correctness verification (declared above, defined in another file).
+    verification();
+    return 0;
+}
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/LLama/CMakeLists.txt b/benchmarks/OpOptimization/LLama/CMakeLists.txt
new file mode 100644
index 00000000..012528cb
--- /dev/null
+++ b/benchmarks/OpOptimization/LLama/CMakeLists.txt
@@ -0,0 +1,78 @@
+# Baseline object: llama_origin.mlir lowered with upstream mlir-opt only.
+add_custom_command(OUTPUT mlir-llama-origin.o
+  COMMAND
+    ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${CMAKE_CURRENT_SOURCE_DIR}/llama_origin.mlir
+      -one-shot-bufferize='bufferize-function-boundaries'
+      -llvm-request-c-wrappers
+      -test-lower-to-llvm
+      -reconcile-unrealized-casts |
+    ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
+    ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR}
+      -filetype=obj -o ${CMAKE_CURRENT_BINARY_DIR}/mlir-llama-origin.o
+  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/llama_origin.mlir
+)
+add_library(LLamaOrigin STATIC mlir-llama-origin.o)
+set_target_properties(LLamaOrigin PROPERTIES LINKER_LANGUAGE CXX)
+
+# Tiling variant: sed rewrites the first "base" on each line to "tiling", buddy-opt applies
+# polyhedral tiling, then mlir-opt vectorizes the affine loops and lowers to LLVM.
+# NOTE(review): this step used a developer-local mlir-opt path; confirm the configured one supports these passes.
+add_custom_command(OUTPUT mlir-llama-tiling.o
+  COMMAND
+    sed 's/base/tiling/' ${CMAKE_CURRENT_SOURCE_DIR}/llama_origin.mlir |
+    ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
+      -polyhedral-tiling='tile-sizes=32,32,32,32'
+      -reconcile-unrealized-casts |
+    ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+      -convert-linalg-to-affine-loops
+      -affine-super-vectorize='virtual-vector-size=16'
+      -one-shot-bufferize='bufferize-function-boundaries'
+      -llvm-request-c-wrappers
+      -test-lower-to-llvm
+      -reconcile-unrealized-casts |
+    ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
+    ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR}
+      -filetype=obj -o ${CMAKE_CURRENT_BINARY_DIR}/mlir-llama-tiling.o
+  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/llama_origin.mlir
+)
+add_library(LLamaTiling STATIC mlir-llama-tiling.o)
+set_target_properties(LLamaTiling PROPERTIES LINKER_LANGUAGE CXX)
+
+# Affine-vectorization variant: sed rewrites the first "base" on each line to "affine_vec",
+# buddy-opt lowers linalg to affine loops, then mlir-opt vectorizes them and lowers to LLVM.
+add_custom_command(OUTPUT mlir-affine-vec.o
+  COMMAND
+    sed 's/base/affine_vec/' ${CMAKE_CURRENT_SOURCE_DIR}/llama_origin.mlir |
+    ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
+      -convert-linalg-to-affine-loops
+      -reconcile-unrealized-casts |
+    ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+      -affine-super-vectorize='virtual-vector-size=128'
+      -one-shot-bufferize='bufferize-function-boundaries'
+      -llvm-request-c-wrappers
+      -test-lower-to-llvm
+      -reconcile-unrealized-casts |
+    ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
+    ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR}
+      -filetype=obj -o ${CMAKE_CURRENT_BINARY_DIR}/mlir-affine-vec.o
+  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/llama_origin.mlir
+)
+add_library(AffineVec STATIC mlir-affine-vec.o)
+set_target_properties(AffineVec PROPERTIES LINKER_LANGUAGE CXX)
+
+add_executable(llama-benchmark
+  Main.cpp
+  LLamaBenchmark.cpp
+)
+
+# -march=native is attached to this target only instead of being appended to the global CMAKE_CXX_FLAGS.
+target_compile_options(llama-benchmark PRIVATE -march=native)
+target_link_directories(llama-benchmark PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
+target_link_libraries(llama-benchmark PRIVATE
+  GoogleBenchmark
+  LLamaOrigin
+  LLamaTiling
+  AffineVec
+  mlir_c_runner_utils
+  mlir_runner_utils
+)
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/LLama/LLamaBenchmark.cpp b/benchmarks/OpOptimization/LLama/LLamaBenchmark.cpp
new file mode 100644
index 00000000..1e2696f6
--- /dev/null
+++ b/benchmarks/OpOptimization/LLama/LLamaBenchmark.cpp
@@ -0,0 +1,119 @@
+//===- LLamaBenchmark.cpp ----------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the benchmark for the LLama kernel variants.
+//
+//===----------------------------------------------------------------------===//
+
+#include <benchmark/benchmark.h>
+#include <buddy/Core/Container.h>
+#include <iostream>
+#include <random>
+
+// Define target layout.
+#define INPUT_H 1     // dim 0 of both the input and the output buffer
+#define INPUT_W 512   // dim 1 of both the input and the output buffer
+#define OUTPUT_C 1024 // dim 2 of the output buffer
+#define OUTPUT_F 4096 // dim 2 of the input buffer, despite the OUTPUT_ prefix
+
+namespace {
+
+// Helper functions and variables.
+const std::string PASS = "\033[32mPASS\033[0m"; // ANSI green
+const std::string FAIL = "\033[31mFAIL\033[0m"; // ANSI red, distinct from PASS
+// Returns true when the first `size` elements of both arrays compare equal
+// with exact floating-point equality; a non-positive `size` yields true.
+bool areArraysEqual(float array1[], float array2[], int size) {
+  int idx = 0;
+  // Advance past the matching prefix; stop at the first mismatch.
+  while (idx < size && array1[idx] == array2[idx])
+    ++idx;
+  return idx >= size;
+}
+
+void setValue(MemRef<int, 2> &in) {
+  // Store each flat index at its own position, over size0 * size1 elements.
+  const int total = in.getSizes()[0] * in.getSizes()[1];
+  for (int idx = 0; idx < total; ++idx)
+    in[idx] = idx;
+}
+
+// C entry points of the compiled MLIR kernels, one per optimization variant.
+extern "C" {
+void _mlir_ciface_base(MemRef<float, 3> *input, MemRef<float, 3> *output);
+void _mlir_ciface_tiling(MemRef<float, 3> *input, MemRef<float, 3> *output);
+void _mlir_ciface_affine_vec(MemRef<float, 3> *input, MemRef<float, 3> *output);
+}
+
+// Generates BM_GENERIC_<name>, which repeatedly runs `func` on fixed buffers.
+#define DEFINE_BENCHMARK(name, func)                                          \
+  void BM_GENERIC_##name(benchmark::State &state) {                           \
+    intptr_t inputShape[3] = {INPUT_H, INPUT_W, OUTPUT_F};                    \
+    intptr_t outputShape[3] = {INPUT_H, INPUT_W, OUTPUT_C};                   \
+    MemRef<float, 3> input(inputShape, 2.0);                                  \
+    MemRef<float, 3> output(outputShape, 3.0);                                \
+    for (auto _ : state)                                                      \
+      func(&input, &output);                                                  \
+  }
+
+
+DEFINE_BENCHMARK(BASE, _mlir_ciface_base)             // defines BM_GENERIC_BASE
+DEFINE_BENCHMARK(TILING, _mlir_ciface_tiling)         // defines BM_GENERIC_TILING
+DEFINE_BENCHMARK(AFFINEVEC, _mlir_ciface_affine_vec)  // defines BM_GENERIC_AFFINEVEC
+} // namespace 
+
+BENCHMARK(BM_GENERIC_BASE)->Unit(benchmark::kMillisecond); // register; report in ms
+BENCHMARK(BM_GENERIC_TILING)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_GENERIC_AFFINEVEC)->Unit(benchmark::kMillisecond);
+
+// Generates VERIFICATION_<name>: runs `func`, then compares every output
+// element with `resultScalar`. Ranks match the 3-D kernels declared above.
+#define DEFINE_VERIFICATION(name, func)                                       \
+  void VERIFICATION_##name(MemRef<float, 3> input, MemRef<float, 3> output,  \
+                           float resultScalar[]) {                           \
+    func(&input, &output);                                                    \
+    auto result##name = output.getData();                                     \
+    std::cout << #name << "case: "                                            \
+              << (areArraysEqual(resultScalar, result##name,                  \
+                                 INPUT_H * INPUT_W * OUTPUT_C) ? PASS : FAIL) \
+              << std::endl;                                                   \
+  }
+
+// DEFINE_VERIFICATION(TRANSFORM_TILING, _mlir_ciface_transform_tiling)
+// DEFINE_VERIFICATION(MANUL, _mlir_ciface_manul)
+
+void verification() {
+  // intptr_t sizesInput[2] = {INPUT_H, INPUT_W};
+  // intptr_t sizesOutput[1] = {OUTPUT_W};
+  // MemRef<float, 2> input(sizesInput, 1.0);
+  // MemRef<float, 1> output(sizesOutput, 1.0);
+  // _mlir_ciface_origin(&input, &output);
+  // auto resultScalar = output.getData();
+  // // Print the verification results.
+  // std::cout << "---------------------------------------------------------------"
+  //              "---------"
+  //           << std::endl;
+  // std::cout << "Correctness Verification:" << std::endl;
+  // MemRef<float, 1> newOutput1(sizesOutput, 1.0);
+  // VERIFICATION_TRANSFORM_TILING(input, newOutput1, resultScalar);
+  // MemRef<float, 1> newOutput2(sizesOutput, 1.0);
+  // VERIFICATION_MANUL(input, newOutput2, resultScalar);
+
+  // std::cout << "---------------------------------------------------------------"
+  //              "---------"
+  //           << std::endl;
+  // NOTE(review): every check above is commented out, so this function is a no-op.
+}
diff --git a/benchmarks/OpOptimization/LLama/Main.cpp b/benchmarks/OpOptimization/LLama/Main.cpp
new file mode 100644
index 00000000..4e4e9f7d
--- /dev/null
+++ b/benchmarks/OpOptimization/LLama/Main.cpp
@@ -0,0 +1,32 @@
+//===- Main.cpp -----------------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the main file of the LLama benchmark.
+//
+//===----------------------------------------------------------------------===//
+
+#include <benchmark/benchmark.h>
+
+void verification();
+
+int main(int argc, char **argv) {
+  // Let Google Benchmark consume its command-line flags, then run the suite.
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+  // Once timing is done, run the separately defined correctness checks.
+  verification();
+  return 0;
+}
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/LLama/llama_manul.mlir b/benchmarks/OpOptimization/LLama/llama_manul.mlir
new file mode 100644
index 00000000..b3d1974e
--- /dev/null
+++ b/benchmarks/OpOptimization/LLama/llama_manul.mlir
@@ -0,0 +1,58 @@
+module {  // Plain loop nest: %arg1[b, i, j] += %arg0[b, i, k] / 4096 for every k.
+  func.func @base(%arg0: memref<1x512x4096xf32>, %arg1: memref<1x512x1024xf32>) {
+    %divisor = arith.constant 4.096000e+03 : f32
+    affine.for %b = 0 to 1 {
+      affine.for %i = 0 to 512 {
+        affine.for %k = 0 to 4096 {
+          affine.for %j = 0 to 1024 {
+            %lhs = affine.load %arg0[%b, %i, %k] : memref<1x512x4096xf32>
+            %acc = affine.load %arg1[%b, %i, %j] : memref<1x512x1024xf32>
+            %scaled = arith.divf %lhs, %divisor : f32
+            %sum = arith.addf %scaled, %acc : f32
+            affine.store %sum, %arg1[%b, %i, %j] : memref<1x512x1024xf32>
+          }
+        }
+      }
+    }
+    return
+  }
+}
+
+
+
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0) -> (512-d0, 32)>
+#map3 = affine_map<(d0) -> (4096-d0, 32)>
+// #map3 = affine_map<(d0) -> (1024-d0, 32)>
+module {
+  func.func @base(%arg0: memref<1x512x4096xf32>, %arg1: memref<1x512x1024xf32>) {
+    %cst = arith.constant 4.096000e+03 : f32
+    %c1024 = arith.constant 1024 : index
+    %c4096 = arith.constant 4096 : index
+    %c0 = arith.constant 0 : index
+    %c32 = arith.constant 32 : index
+    %c512 = arith.constant 512 : index
+    %c1 = arith.constant 1 : index
+    affine.for %arg2 = %c0 to %c1 step 1 {
+      affine.for %arg3 = %c0 to %c512 step 32 {
+        affine.for %arg4 = %c0 to %c4096 step 32 {
+          affine.for %arg5 = %c0 to %c1024 step 32 {
+            // %subview = memref.subview %arg0[0, %arg2, %arg3] [1, 32, 32] [1, 1, 1] : memref<1x512x4096xf32> to memref<1x32x32xf32, strided<[2097152, 4096, 1], offset: ?>>
+            // %subview_0 = memref.subview %arg1[0, %arg2, %arg4] [1, 32, 32] [1, 1, 1] : memref<1x512x1024xf32> to memref<1x32x32xf32, strided<[524288, 1024, 1], offset: ?>>
+            affine.for %arg6 = %c0 to min #map2(%arg3) step 1 {
+              affine.for %arg7 = %c0 to min #map3(%arg4) step // NOTE(review): unfinished draft - the step value and body of this loop are missing, so this module does not parse.
+            }
+            // linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%subview : memref<1x32x32xf32, strided<[2097152, 4096, 1], offset: ?>>) outs(%subview_0 : memref<1x32x32xf32, strided<[524288, 1024, 1], offset: ?>>) {
+            // ^bb0(%in: f32, %out: f32):
+            //   %0 = arith.divf %in, %cst : f32
+            //   %1 = arith.addf %0, %out : f32
+            //   linalg.yield %1 : f32
+            // }
+          }
+        }
+      }
+    }
+    return
+  }
+}
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/LLama/llama_origin.mlir b/benchmarks/OpOptimization/LLama/llama_origin.mlir
new file mode 100644
index 00000000..6df5c610
--- /dev/null
+++ b/benchmarks/OpOptimization/LLama/llama_origin.mlir
@@ -0,0 +1,14 @@
+#map8 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>  // input access
+#map9 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>  // output access
+module {
+  func.func @base(%arg0: memref<1x512x4096xf32>, %arg1: memref<1x512x1024xf32>) {
+    linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%arg0 : memref<1x512x4096xf32>) outs(%arg1 : memref<1x512x1024xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %divisor = arith.constant 4.096000e+03 : f32
+      %scaled = arith.divf %in, %divisor : f32
+      %sum = arith.addf %scaled, %out : f32  // accumulate into the output element
+      linalg.yield %sum : f32
+    }
+    return
+  }
+}
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/LinalgGeneric/CMakeLists.txt b/benchmarks/OpOptimization/LinalgGeneric/CMakeLists.txt
new file mode 100644
index 00000000..2d78a16d
--- /dev/null
+++ b/benchmarks/OpOptimization/LinalgGeneric/CMakeLists.txt
@@ -0,0 +1,83 @@
+# Build the unoptimized kernel: lower linalg_generic_origin.mlir with mlir-opt,
+# translate it to LLVM IR and compile it to an object file with llc.
+# The per-dialect lowering flags that used to be listed here
+# (-convert-linalg-to-affine-loops, -lower-affine, -convert-scf-to-cf,
+# -finalize-memref-to-llvm, -convert-func-to-llvm, ...) were commented out and
+# have been dropped; -test-lower-to-llvm is the lowering that is passed.
+add_custom_command(OUTPUT mlir-generic-origin.o
+  COMMAND
+    ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${CMAKE_CURRENT_SOURCE_DIR}/linalg_generic_origin.mlir
+      -llvm-request-c-wrappers
+      -test-lower-to-llvm
+      -reconcile-unrealized-casts |
+    ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
+    ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR}
+      -filetype=obj -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/LinalgGeneric/mlir-generic-origin.o
+  # Re-run the pipeline whenever the MLIR source changes; without DEPENDS the
+  # object is generated once and then silently goes stale.
+  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/linalg_generic_origin.mlir
+)
+
+add_library(GenericOrigin STATIC mlir-generic-origin.o)
+set_target_properties(GenericOrigin PROPERTIES LINKER_LANGUAGE CXX)
+
+# Build the tiled variant from the same source as the unoptimized kernel:
+#   1. sed renames the function so the C wrapper becomes
+#      _mlir_ciface_transform_tiling;
+#   2. buddy-opt applies -polyhedral-tiling with tile sizes 32,32;
+#   3. mlir-opt bufferizes and lowers to the LLVM dialect;
+#   4. mlir-translate and llc produce the object file.
+add_custom_command(OUTPUT mlir-transform-tiling.o
+  COMMAND
+    cat ${CMAKE_CURRENT_SOURCE_DIR}/linalg_generic_origin.mlir |
+    sed 's/origin/transform_tiling/' |
+    ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
+      -polyhedral-tiling='tile-sizes=32,32'
+      -reconcile-unrealized-casts |
+    ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+      -one-shot-bufferize='bufferize-function-boundaries'
+      -llvm-request-c-wrappers
+      -test-lower-to-llvm
+      -reconcile-unrealized-casts |
+    ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
+    ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR}
+      -filetype=obj -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/LinalgGeneric/mlir-transform-tiling.o
+  # Re-run the pipeline whenever the MLIR source changes; without DEPENDS the
+  # object is generated once and then silently goes stale.
+  DEPENDS
+    ${CMAKE_CURRENT_SOURCE_DIR}/linalg_generic_origin.mlir
+)
+
+add_library(TransformTiling STATIC mlir-transform-tiling.o)
+set_target_properties(TransformTiling PROPERTIES LINKER_LANGUAGE CXX)
+
+# Build the hand-optimized kernels; regenerate when linalg_manual_opt.mlir changes.
+add_custom_command(OUTPUT mlir-manual-opt.o
+  COMMAND
+    ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${CMAKE_CURRENT_SOURCE_DIR}/linalg_manual_opt.mlir
+      -llvm-request-c-wrappers
+      -test-lower-to-llvm |
+    ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
+    ${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR}
+      -filetype=obj -o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/LinalgGeneric/mlir-manual-opt.o
+  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/linalg_manual_opt.mlir
+)
+
+add_library(ManualOpt STATIC mlir-manual-opt.o)
+set_target_properties(ManualOpt PROPERTIES LINKER_LANGUAGE CXX)
+
+# Benchmark driver executable: program entry point plus the benchmark cases.
+add_executable(linalg-generic-benchmark Main.cpp GenericBenchmark.cpp)
+
+# Scope -march=native to this target; appending it to CMAKE_CXX_FLAGS would
+# apply it to every C++ target of this directory and its subdirectories.
+target_compile_options(linalg-generic-benchmark PRIVATE -march=native)
+target_link_directories(linalg-generic-benchmark PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
+target_link_libraries(linalg-generic-benchmark PRIVATE
+  GoogleBenchmark
+  GenericOrigin
+  TransformTiling
+  ManualOpt
+  mlir_c_runner_utils
+  mlir_runner_utils
+)
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/LinalgGeneric/GenericBenchmark.cpp b/benchmarks/OpOptimization/LinalgGeneric/GenericBenchmark.cpp
new file mode 100644
index 00000000..c27f3106
--- /dev/null
+++ b/benchmarks/OpOptimization/LinalgGeneric/GenericBenchmark.cpp
@@ -0,0 +1,117 @@
+//===- GenericBenchmark.cpp ----------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the benchmark for Linalg Generic operation.
+//
+//===----------------------------------------------------------------------===//
+
+#include <benchmark/benchmark.h>
+#include <buddy/Core/Container.h>
+#include <iostream>
+#include <random>
+
+// Define target layout.
+#define INPUT_H 4096  // dim 0 of the input matrix
+#define INPUT_W 4096  // dim 1 of the input matrix
+#define OUTPUT_W 4096 // length of the 1-D output vector
+
+namespace {
+
+// Helper functions and variables.
+const std::string PASS = "\033[32mPASS\033[0m"; // ANSI green
+const std::string FAIL = "\033[31mFAIL\033[0m"; // ANSI red, distinct from PASS
+// Exact element-wise comparison of the first `size` entries of two arrays;
+// returns true for a non-positive `size`.
+bool areArraysEqual(float array1[], float array2[], int size) {
+  int pos = 0;
+  // Skip the matching prefix; the scan ends at the first differing element.
+  while (pos < size && array1[pos] == array2[pos])
+    ++pos;
+  return pos >= size;
+}
+
+// Declare the C interface of the MLIR-generated kernels.
+extern "C" {
+void _mlir_ciface_origin(MemRef<float, 2> *input, MemRef<float, 1> *output);
+void _mlir_ciface_transform_tiling(MemRef<float, 2> *input, MemRef<float, 1> *output);
+void _mlir_ciface_manul(MemRef<float, 2> *input, MemRef<float, 1> *output);
+void _mlir_ciface_manul2(MemRef<float, 2> *input, MemRef<float, 1> *output);
+void _mlir_ciface_manul3(MemRef<float, 2> *input, MemRef<float, 1> *output);
+}
+
+// Generates BM_GENERIC_<name>, which repeatedly runs `func` on fixed buffers.
+#define DEFINE_BENCHMARK(name, func)                                          \
+  void BM_GENERIC_##name(benchmark::State &state) {                           \
+    intptr_t inputShape[2] = {INPUT_H, INPUT_W};                              \
+    intptr_t outputShape[1] = {OUTPUT_W};                                     \
+    MemRef<float, 2> input(inputShape, 1.0);                                  \
+    MemRef<float, 1> output(outputShape, 1.0);                                \
+    for (auto _ : state)                                                      \
+      func(&input, &output);                                                  \
+  }
+
+
+DEFINE_BENCHMARK(ORIGIN, _mlir_ciface_origin)                      // defines BM_GENERIC_ORIGIN
+DEFINE_BENCHMARK(TRANSFORM_TILING, _mlir_ciface_transform_tiling)  // defines BM_GENERIC_TRANSFORM_TILING
+DEFINE_BENCHMARK(MANUL, _mlir_ciface_manul)                        // defines BM_GENERIC_MANUL
+DEFINE_BENCHMARK(MANUL2, _mlir_ciface_manul2)                      // defines BM_GENERIC_MANUL2
+// DEFINE_BENCHMARK(MANUL3, _mlir_ciface_manul3)  (disabled: manul3 is declared but not benchmarked)
+} // namespace 
+
+BENCHMARK(BM_GENERIC_ORIGIN)->Unit(benchmark::kMillisecond); // register; report in ms
+BENCHMARK(BM_GENERIC_TRANSFORM_TILING)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_GENERIC_MANUL)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_GENERIC_MANUL2)->Unit(benchmark::kMillisecond);
+// BENCHMARK(BM_GENERIC_MANUL3)->Unit(benchmark::kMillisecond);  (disabled together with its definition)
+
+// Generates VERIFICATION_<name>: runs `func` and prints PASS when all
+// OUTPUT_W results equal `resultScalar` exactly, FAIL otherwise.
+#define DEFINE_VERIFICATION(name, func)                                       \
+  void VERIFICATION_##name(MemRef<float, 2> input, MemRef<float, 1> output,  \
+                           float resultScalar[]) {                           \
+    func(&input, &output);                                                    \
+    auto result##name = output.getData();                                     \
+    std::cout << #name << "case: "                                            \
+              << (areArraysEqual(resultScalar, result##name, OUTPUT_W)        \
+                      ? PASS : FAIL)                                          \
+              << std::endl;                                                   \
+  }
+
+DEFINE_VERIFICATION(TRANSFORM_TILING, _mlir_ciface_transform_tiling)
+DEFINE_VERIFICATION(MANUL, _mlir_ciface_manul)
+
+// Computes the expected result with _mlir_ciface_origin, then checks the
+// tiled and hand-written variants against it and prints PASS/FAIL for each.
+void verification() {
+  intptr_t inputShape[2] = {INPUT_H, INPUT_W};
+  intptr_t outputShape[1] = {OUTPUT_W};
+  MemRef<float, 2> input(inputShape, 1.0);
+  MemRef<float, 1> expected(outputShape, 1.0);
+  _mlir_ciface_origin(&input, &expected);
+  auto expectedData = expected.getData();
+  // Report header.
+  std::cout << "---------------------------------------------------------------"
+               "---------"
+            << std::endl;
+  std::cout << "Correctness Verification:" << std::endl;
+  MemRef<float, 1> tiledOutput(outputShape, 1.0);
+  VERIFICATION_TRANSFORM_TILING(input, tiledOutput, expectedData);
+  MemRef<float, 1> manualOutput(outputShape, 1.0);
+  VERIFICATION_MANUL(input, manualOutput, expectedData);
+  std::cout << "---------------------------------------------------------------"
+               "---------"
+            << std::endl;
+}
diff --git a/benchmarks/OpOptimization/LinalgGeneric/Main.cpp b/benchmarks/OpOptimization/LinalgGeneric/Main.cpp
new file mode 100644
index 00000000..4e4e9f7d
--- /dev/null
+++ b/benchmarks/OpOptimization/LinalgGeneric/Main.cpp
@@ -0,0 +1,32 @@
+//===- Main.cpp -----------------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the main file of the linalg generic benchmark.
+//
+//===----------------------------------------------------------------------===//
+
+#include <benchmark/benchmark.h>
+
+void verification();
+
+int main(int argc, char **argv) {
+  // Hand the command line to Google Benchmark and execute the selected cases.
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+  // Then compare the optimized kernels against the unoptimized one.
+  verification();
+  return 0;
+}
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/LinalgGeneric/linalg_generic_origin.mlir b/benchmarks/OpOptimization/LinalgGeneric/linalg_generic_origin.mlir
new file mode 100644
index 00000000..e6601f9a
--- /dev/null
+++ b/benchmarks/OpOptimization/LinalgGeneric/linalg_generic_origin.mlir
@@ -0,0 +1,11 @@
+#map = affine_map<(d0, d1) -> (d1, d0)>
+#map1 = affine_map<(d0, d1) -> (d1)>
+// %arg1[d1] += %arg0[d1, d0]. NOTE(review): d0 is not in the output map but is typed "parallel"; confirm.
+func.func @origin(%arg0: memref<4096x4096xf32>, %arg1: memref<4096xf32>) {
+  linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins (%arg0: memref<4096x4096xf32>) outs(%arg1: memref<4096xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %sum = arith.addf %in, %out : f32
+    linalg.yield %sum : f32
+  }
+  return
+}
\ No newline at end of file
diff --git a/benchmarks/OpOptimization/LinalgGeneric/linalg_manual_opt.mlir b/benchmarks/OpOptimization/LinalgGeneric/linalg_manual_opt.mlir
new file mode 100644
index 00000000..51fbaf59
--- /dev/null
+++ b/benchmarks/OpOptimization/LinalgGeneric/linalg_manual_opt.mlir
@@ -0,0 +1,73 @@
+#map = affine_map<(d0) -> (256, 4096-d0)>  // not used by any active code
+#map1 = affine_map<(d0) -> (64, 4096-d0)>  // upper bound (via min) of the 64-wide tiles in @manul3
+#map2 = affine_map<(d0) -> (8, 4096-d0)>   // upper bound (via min) of the 8-row tiles in @manul2
+#map3 = affine_map<(d0) -> (4, 4096-d0)>   // not used by any active code
+// Hand-vectorized variant: %arg1[j .. j+31] += %arg0[i, j .. j+31] for every row i.
+// NOTE(review): this sums columns while linalg_generic_origin.mlir sums rows; equal only for uniform input - confirm.
+func.func @manul(%arg0: memref<4096x4096xf32>, %arg1: memref<4096xf32>) {
+  %c0 = arith.constant 0 : index
+  %c4096 = arith.constant 4096 : index
+  affine.for %arg3 = %c0 to %c4096 step 1 {
+    affine.for %arg4 = %c0 to %c4096 step 32 {
+      %0 = affine.vector_load %arg0[%arg3, %arg4] : memref<4096x4096xf32>, vector<32xf32>
+      %1 = affine.vector_load %arg1[%arg4] : memref<4096xf32>, vector<32xf32>
+      %2 = arith.addf %0, %1 : vector<32xf32>
+      affine.vector_store %2, %arg1[%arg4] : memref<4096xf32>, vector<32xf32>
+    }
+  }
+  // affine.for %arg3 = %c0 to %c4096 step 256 {
+  //   affine.for %arg4 = %c0 to %c4096 step 256 {
+  //     affine.for %arg5 = %c0 to min #map(%arg3) step 1 {
+  //       affine.for %arg6 = %c0 to min #map(%arg4) step 32 {
+  //         // %0 = affine.load %arg0[%arg3 + %arg5, %arg4 + %arg6] : memref<4096x4096xf32>
+  //         // %1 = affine.load %arg1[%arg3 + %arg5] : memref<4096xf32>
+  //         // %2 = arith.addf %0, %1 : f32
+  //         // affine.store %2, %arg1[%arg3 + %arg5] : memref<4096xf32>
+  //         %0 = affine.vector_load %arg0[%arg3 + %arg5, %arg4 + %arg6] : memref<4096x4096xf32>, vector<32xf32>
+  //         %1 = affine.vector_load %arg1[%arg4 + %arg6] : memref<4096xf32>, vector<32xf32>
+  //         %2 = arith.addf %0, %1 : vector<32xf32>
+  //         affine.vector_store %2, %arg1[%arg4 + %arg6] : memref<4096xf32>, vector<32xf32>
+  //       }
+  //     }
+  //   }
+  // }
+  return
+}
+
+// Row-tiled scalar variant: for each tile of 8 rows, visits one column at a
+// time and accumulates %arg1[row] += %arg0[row, col] for the rows of the tile.
+func.func @manul2(%arg0: memref<4096x4096xf32>, %arg1: memref<4096xf32>) {
+  %c0 = arith.constant 0 : index
+  %c4096 = arith.constant 4096 : index
+  affine.for %arg3 = %c0 to %c4096 step 8 {
+    affine.for %arg4 = %c0 to %c4096 step 1 {
+      affine.for %arg5 = %c0 to min #map2(%arg3) {
+        %0 = affine.load %arg0[%arg3 + %arg5, %arg4] : memref<4096x4096xf32>
+        %1 = affine.load %arg1[%arg3 + %arg5] : memref<4096xf32>
+        %2 = arith.addf %0, %1 : f32
+        affine.store %2, %arg1[%arg3 + %arg5] : memref<4096xf32>
+      }
+    }
+  }
+  return
+}
+
+// 64x64-tiled, 32-lane vector variant: %arg1[j .. j+31] += %arg0[i, j .. j+31] per tile row i.
+// NOTE(review): like @manul this sums columns, unlike the row sums of @manul2 - confirm.
+func.func @manul3(%arg0: memref<4096x4096xf32>, %arg1: memref<4096xf32>) {
+  %c0 = arith.constant 0 : index
+  %c4096 = arith.constant 4096 : index
+  affine.for %arg3 = %c0 to %c4096 step 64 {
+    affine.for %arg4 = %c0 to %c4096 step 64 {
+      affine.for %arg5 = %c0 to min #map1(%arg3) step 1 {
+        affine.for %arg6 = %c0 to min #map1(%arg4) step 32 {
+          %0 = affine.vector_load %arg0[%arg3 + %arg5, %arg4 + %arg6] : memref<4096x4096xf32>, vector<32xf32>
+          %1 = affine.vector_load %arg1[%arg4 + %arg6] : memref<4096xf32>, vector<32xf32>
+          %2 = arith.addf %0, %1 : vector<32xf32>
+          affine.vector_store %2, %arg1[%arg4 + %arg6] : memref<4096xf32>, vector<32xf32>
+        }
+      }
+    }
+  }
+  return
+}
\ No newline at end of file