From 14a1665b3986cbf3496fed55beee04b418d8fd2f Mon Sep 17 00:00:00 2001 From: Zhighway777 <1990405801@qq.com> Date: Thu, 10 Jul 2025 20:04:25 +0800 Subject: [PATCH] docs: add Exo dynamic guide for Gemmini MatMulOp --- .../Gemmini/Ops/MatMulOp/ExoMatmul-modified.c | 308 ++++++++++++++++++ benchmarks/Gemmini/Ops/MatMulOp/README.md | 12 + .../Gemmini/Ops/MatMulOp/matmul_dynamic.py | 281 ++++++++++++++++ .../Ops/MatMulOp/test_flexible_kernels.py | 179 ++++++++++ 4 files changed, 780 insertions(+) create mode 100644 benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul-modified.c create mode 100644 benchmarks/Gemmini/Ops/MatMulOp/README.md create mode 100644 benchmarks/Gemmini/Ops/MatMulOp/matmul_dynamic.py create mode 100644 benchmarks/Gemmini/Ops/MatMulOp/test_flexible_kernels.py diff --git a/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul-modified.c b/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul-modified.c new file mode 100644 index 00000000..008f980a --- /dev/null +++ b/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul-modified.c @@ -0,0 +1,308 @@ +//===- ExoMatmul.c --------------------------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Exo-lang Matmul kernels. +// The kernels are generated from an exo-lang Python script. 
+// +//===----------------------------------------------------------------------===// + +#include "ExoUtils.h" +#include "gemmini.h" + +void matmul_4(const float* scale, bool act, const int8_t* A, const int8_t* B, int8_t* C ) { + gemmini_extended_config_st((256), (act), (scale)[0]); + + gemmini_extended_config_ex(WS, 0, 0, 1, 0, 0); + + gemmini_extended3_config_ld((256), 1.0f, 0, 2); + + gemmini_extended3_config_ld((256), 1.0f, 0, 1); + + gemmini_extended3_config_ld(0, 1.0f, 0, 0); + + int8_t *a = (int8_t*) ((uint64_t)gemm_malloc (16 * 16 * 4 * 4 * 4 * sizeof(int8_t))); + int8_t *b = (int8_t*) ((uint64_t)gemm_malloc (16 * 16 * 4 * 4 * 4 * 4 * sizeof(int8_t))); + int32_t *res = (int32_t*) ((uint32_t)gemm_acc_malloc (16 * 16 * 4 * 4 * sizeof(int32_t))); + for (int_fast32_t io = 0; io < 4; io++) { + for (int_fast32_t i = 0; i < 4; i++) { + for (int_fast32_t j = 0; j < 4; j++) { + gemmini_extended_mvin( 0, ((uint64_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))),(16), (16) ); + gemmini_extended_mvin( 0, ((uint64_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))),(16), (16) ); + gemmini_extended_mvin( 0, ((uint64_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))),(16), (16) ); + gemmini_extended_mvin( 0, ((uint64_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))),(16), (16) ); + if (j == 0) { + gemmini_extended_mvin2( &A[(16 * i + 64 * io) * (256)], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096))/16))), 16*(4), (16) ); + } + if (io == 0) { + if (i == 0) { + gemmini_extended_mvin3( &B[64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384))/16))), 16*(4), (16) ); + } + } + if (io == 0) { + if (i == 0) { + gemmini_extended_mvin3( &B[(16) * (256) + 64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 1024)/16))), 16*(4), (16) ); 
+ } + } + if (io == 0) { + if (i == 0) { + gemmini_extended_mvin3( &B[(32) * (256) + 64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (1024))/16))), 16*(4), (16) ); + } + } + if (io == 0) { + if (i == 0) { + gemmini_extended_mvin3( &B[(48) * (256) + 64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (1024))/16))), 16*(4), (16) ); + } + } + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + 
gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 1024)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 1024 + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 1024 + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 1024 + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (1024))/16))), 
(uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) 
+ ((i) * (4096) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_mvin2( &A[(16 * i + 64 * io) * (256) + 64], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 1024)/16))), 16*(4), (16) ); + gemmini_extended_mvin3( &B[(64) * (256) + 64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096)/16))), 16*(4), (16) ); + gemmini_extended_mvin3( &B[(80) * (256) + 64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + 1024)/16))), 16*(4), (16) ); + gemmini_extended_mvin3( &B[(96) * (256) + 64 * j], ((uint64_t) 
&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + (2) * (1024))/16))), 16*(4), (16) ); + gemmini_extended_mvin3( &B[(112) * (256) + 64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + (3) * (1024))/16))), 16*(4), (16) ); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 1024)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 1024)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 1024)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 1024)/16))), ~((uint32_t)0), (16), (16), 
16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + 1024)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 1024 + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + 1024 + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 1024 + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + 1024 + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 1024 + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + 1024 + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 1024 + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + (2) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) 
| 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 1024 + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + (2) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 1024 + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + (2) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 1024 + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + (2) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 1024 + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + (3) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 1024 + (3) * 
(256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + (3) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 1024 + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + (3) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 1024 + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + 4096 + (3) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + 1024 + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_mvin2( &A[(16 * i + 64 * io) * (256) + 128], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (1024))/16))), 16*(4), (16) ); + gemmini_extended_mvin3( &B[(128) * (256) + 64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096))/16))), 16*(4), (16) ); + gemmini_extended_mvin3( &B[(144) * (256) + 64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096) + 1024)/16))), 16*(4), (16) ); + gemmini_extended_mvin3( 
&B[(160) * (256) + 64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096) + (2) * (1024))/16))), 16*(4), (16) ); + gemmini_extended_mvin3( &B[(176) * (256) + 64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096) + (3) * (1024))/16))), 16*(4), (16) ); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + 
gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096) + 1024)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096) + 1024 + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096) + 1024 + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096) + 1024 + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 
16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096) + (2) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096) + (2) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096) + (2) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096) + (2) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * 
(16384) + (2) * (4096) + (3) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096) + (3) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096) + (3) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (2) * (4096) + (3) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (2) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_mvin2( &A[(16 * i + 64 * io) * (256) + 192], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (1024))/16))), 16*(4), (16) ); + 
gemmini_extended_mvin3( &B[(192) * (256) + 64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096))/16))), 16*(4), (16) ); + gemmini_extended_mvin3( &B[(208) * (256) + 64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + 1024)/16))), 16*(4), (16) ); + gemmini_extended_mvin3( &B[(224) * (256) + 64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + (2) * (1024))/16))), 16*(4), (16) ); + gemmini_extended_mvin3( &B[(240) * (256) + 64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + (3) * (1024))/16))), 16*(4), (16) ); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * 
(1024))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + 1024)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + 1024 + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + 1024 + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + 1024 + 
(3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + (2) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + (2) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + (2) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + (2) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * 
(256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + (3) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + (3) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + (3) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (4096) + (3) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (16384) + (3) * (4096) + (3) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + 
// clang-format off
// Exo-generated, fully unrolled Gemmini int8 matmul kernel.
//
// What the index arithmetic below shows:
//   * A is read in blocks of 16 rows with a row stride of 64 elements,
//     starting at row 16 * i + 3136 * io (io < 4, i < 196).
//   * B is read as four 16-row slabs (row offsets 0, 16, 32, 48) with a row
//     stride of 256 elements, 64 columns per j (j < 4).
//   * C is written as 16x16 tiles at row 16 * i + 3136 * io and column
//     64 * j + {0, 16, 32, 48}, with a row stride of 256 elements.
// NOTE(review): these extents suggest A is 12544x64, B is 64x256 and C is
// 12544x256 -- confirm against the caller.
//
// scale: scale[0] is forwarded to gemmini_extended_config_st.
// act:   forwarded unchanged to gemmini_extended_config_st.
void _exo_matmul_4(const float* scale, bool act, const int8_t* A, const int8_t* B, int8_t* C) {
  // Store / execute / load configuration. The 256 and 64 passed to the load
  // configs equal the row strides used to index B and A below; the last
  // argument is presumably the load-state id matching mvin3 / mvin2 / mvin
  // -- confirm against gemmini.h.
  gemmini_extended_config_st((256), (act), (scale)[0]);
  gemmini_extended_config_ex(WS, 0, 0, 1, 0, 0);
  gemmini_extended3_config_ld((256), 1.0f, 0, 2);
  gemmini_extended3_config_ld((64), 1.0f, 0, 1);
  gemmini_extended3_config_ld(0, 1.0f, 0, 0);

  // Buffers from the Gemmini allocators: a and b via gemm_malloc, res via
  // gemm_acc_malloc. All three are released at the end of the function.
  int8_t *a = (int8_t*) ((uint64_t)gemm_malloc (16 * 16 * 4 * 1 * 196 * sizeof(int8_t)));
  int8_t *b = (int8_t*) ((uint64_t)gemm_malloc (16 * 16 * 4 * 4 * 1 * 4 * sizeof(int8_t)));
  int32_t *res = (int32_t*) ((uint32_t)gemm_acc_malloc (16 * 16 * 4 * 4 * sizeof(int32_t)));
  for (int_fast32_t io = 0; io < 4; io++) {
    for (int_fast32_t i = 0; i < 196; i++) {
      for (int_fast32_t j = 0; j < 4; j++) {
        // Four mvin calls with source address 0 into the four res tiles of
        // this j -- presumably a zero-fill of the accumulators (confirm).
        gemmini_extended_mvin( 0, ((uint64_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))),(16), (16) );
        gemmini_extended_mvin( 0, ((uint64_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))),(16), (16) );
        gemmini_extended_mvin( 0, ((uint64_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))),(16), (16) );
        gemmini_extended_mvin( 0, ((uint64_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))),(16), (16) );
        // The A block for (io, i) is loaded once, on the first j iteration.
        if (j == 0) {
          gemmini_extended_mvin2( &A[(16 * i + 3136 * io) * (64)], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024))/16))), 16*(4), (16) );
        }
        // The four B slabs for column block j are loaded only while
        // io == 0 and i == 0; later iterations reuse what is already in b.
        if (io == 0) {
          if (i == 0) {
            gemmini_extended_mvin3( &B[64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096))/16))), 16*(4), (16) );
          }
        }
        if (io == 0) {
          if (i == 0) {
            gemmini_extended_mvin3( &B[(16) * (256) + 64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024)/16))), 16*(4), (16) );
          }
        }
        if (io == 0) {
          if (i == 0) {
            gemmini_extended_mvin3( &B[(32) * (256) + 64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024))/16))), 16*(4), (16) );
          }
        }
        if (io == 0) {
          if (i == 0) {
            gemmini_extended_mvin3( &B[(48) * (256) + 64 * j], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024))/16))), 16*(4), (16) );
          }
        }
        // 16 preload/compute pairs: for each of the four B slabs, one pair
        // per res tile. Every preload ORs 0x40000000 into the res address --
        // presumably Gemmini's accumulate flag (confirm against gemmini.h).
        // B slab 0, A tile at offset (i) * (1024).
        gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
        gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16);
        gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
        gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16);
        gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
        gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16);
        gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
        gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16);
        // B slab 1, A tile at offset (i) * (1024) + 256.
        gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
        gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16);
        gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024 + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
        gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16);
        gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024 + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
        gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16);
        gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024 + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
        gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16);
        // B slab 2, A tile at offset (i) * (1024) + (2) * (256).
        gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
        gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
        gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
        gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
        gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
        gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
        gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
        gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
        // B slab 3, A tile at offset (i) * (1024) + (3) * (256).
        gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
        gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
        gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
        gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
        gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
        gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
        gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
        gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
        // Write the four finished res tiles of this j to C, 16 columns apart.
        gemmini_extended_mvout( ((uint64_t) &C[(16 * i + 3136 * io) * (256) + 64 * j]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16)), (16), (16) );
        gemmini_extended_mvout( ((uint64_t) &C[(16 * i + 3136 * io) * (256) + 16 + 64 * j]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16)), (16), (16) );
        gemmini_extended_mvout( ((uint64_t) &C[(16 * i + 3136 * io) * (256) + 32 + 64 * j]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16)), (16), (16) );
        gemmini_extended_mvout( ((uint64_t) &C[(16 * i + 3136 * io) * (256) + 48 + 64 * j]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16)), (16), (16) );
      }
    }
  }
  // Release in reverse order of allocation.
  gemm_acc_free((uint32_t)(res));
  gemm_free((uint64_t)(b));
  gemm_free((uint64_t)(a));
}
// clang-format on
# Reference (unscheduled) int8 matrix multiply written as an Exo procedure.
#
# For every output element C[i, j] it accumulates A[i, k] * B[k, j] over k in
# an i32, then passes the sum through acc_scale (with `scale`), clamp and,
# when `act` is true, relu before storing the i8 result. The scheduling
# functions in this file specialize and transform this procedure.
@proc
def matmul_on_cpu(
    N: size,
    M: size,
    K: size,
    scale: f32,
    act: bool,
    A: i8[N, K] @ DRAM,
    B: i8[K, M] @ DRAM,
    C: i8[N, M] @ DRAM,
):
    for i in seq(0, N):
        for j in seq(0, M):
            # i32 accumulator for one output element.
            res: i32 @ DRAM
            res = 0.0
            for k in seq(0, K):
                a: i8 @ DRAM
                a = A[i, k]

                b: i8 @ DRAM
                b = B[k, j]

                # Widen both operands to i32 before multiplying.
                a2: i32
                b2: i32
                a2 = a
                b2 = b
                res += a2 * b2

            src_tmp: i32
            src_tmp = res
            tmp_res1: f32
            # Scaling: converts the fixed-point accumulator result to floating point.
            acc_scale(src_tmp, tmp_res1, scale)
            tmp_res2: i8
            # Clamp the floating-point result to the i8 range.
            clamp(tmp_res1, tmp_res2)
            if act == True:
                # Apply ReLU when an activation function is requested.
                tmp_res2 = relu(tmp_res2)
            C[i, j] = tmp_res2
def sched_matmul(
    name,
    NN,
    MM,
    KK,
):
    """Specialize ``matmul_on_cpu`` to a fixed shape and schedule it for Gemmini.

    Args:
        name: Name of the scheduled procedure. The unscheduled copy is named
            ``cpu_<name>``.
        NN: Value bound to the first size argument (``N``) by ``partial_eval``.
        MM: Value bound to the second size argument (``M``).
        KK: Value bound to the third size argument (``K``).

    Returns:
        ``(cpu, gemmini)``: the shape-specialized reference procedure and the
        procedure obtained after all scheduling steps below.

    The loop splits use ``perfect=True`` with factor 16 for ``i`` and factor
    64 for ``j`` and ``k``, so NN is expected to be a multiple of 16 and MM
    and KK multiples of 64.
    """
    cpu = rename(matmul_on_cpu, f"cpu_{name}")
    cpu = cpu.partial_eval(NN, MM, KK)

    gemmini = rename(cpu, name)

    # Move res, a and b into Gemmini's internal memories: res into the
    # accumulator, a and b into the scratchpad.
    gemmini = set_memory(gemmini, "res", GEMM_ACCUM)
    gemmini = set_memory(gemmini, "a", GEMM_SCRATCH)
    gemmini = set_memory(gemmini, "b", GEMM_SCRATCH)

    # Tile outer loops
    gemmini = divide_loop(gemmini, "i", 16, ["i", "i_in"], perfect=True)
    gemmini = old_reorder(gemmini, "i_in j")
    gemmini = divide_loop(gemmini, "j", 64, ["j", "j_in"], perfect=True)
    gemmini = divide_loop(gemmini, "j_in", 16, ["j_in_o", "j_in_i"], perfect=True)
    gemmini = old_reorder(gemmini, "j_in_o j_in_i")

    # Lift res allocations:
    #   1. declare res once as a tensor instead of declaring a scalar res in
    #      every loop iteration;
    #   2. make res one contiguous piece of memory.
    gemmini = old_lift_alloc(gemmini, "res : _ #0", n_lifts=2)
    gemmini = old_lift_alloc(gemmini, "res : _ #0", n_lifts=1, mode="col", size=16)

    # Fission outer blocks
    gemmini = old_fission_after(gemmini, "res[_] = 0.0 #0", n_lifts=3)
    gemmini = old_fission_after(gemmini, "for k in _:_ #0", n_lifts=3)
    gemmini = old_reorder(gemmini, "j_in_i j_in_o")
    # The next three reorders hoist k to the outermost position. Gemmini
    # computes in parallel, so several operands can be loaded at once and
    # processed together.
    gemmini = old_reorder(gemmini, "i_in k")
    gemmini = old_reorder(gemmini, "j_in_i k")
    gemmini = old_reorder(gemmini, "j_in_o k")

    # Fission inner blocks
    gemmini = divide_loop(gemmini, "k", 64, ["ko", "k"], perfect=True)
    gemmini = divide_loop(gemmini, "k", 16, ["k", "ki"], perfect=True)
    gemmini = old_lift_alloc(gemmini, "a : i8", n_lifts=3)
    gemmini = old_lift_alloc(gemmini, "a : _ #0", n_lifts=1, mode="col")
    gemmini = old_lift_alloc(gemmini, "a : _", n_lifts=2)
    gemmini = old_reorder(gemmini, "ki j_in_o")
    gemmini = old_reorder(gemmini, "ki j_in_i")
    gemmini = old_lift_alloc(gemmini, "b : i8", n_lifts=2)
    gemmini = old_lift_alloc(gemmini, "b : i8", n_lifts=1, mode="col")
    gemmini = old_lift_alloc(gemmini, "b : _", n_lifts=3)
    gemmini = old_fission_after(gemmini, "a[_] = _", n_lifts=5)
    gemmini = old_fission_after(gemmini, "b[_] = _", n_lifts=5)
    gemmini = old_reorder(gemmini, "j_in_i i_in")
    gemmini = old_reorder(gemmini, "ki i_in")
    gemmini = old_reorder(gemmini, "k i_in")
    gemmini = old_reorder(gemmini, "j_in_i ki")
    gemmini = old_reorder(gemmini, "j_in_o ki")
    gemmini = old_reorder(gemmini, "j_in_i i_in")

    # Replace with gemmini calls
    gemmini = replace_gemmini_calls(gemmini)

    # Inline and lift config
    gemmini = inline_lift_config(gemmini)

    return cpu, gemmini
def schedule_matmul_4(NN=256, MM=256, KK=256):
    """Build the optimized ``matmul_4`` Gemmini schedule for a fixed shape.

    Args:
        NN: Number of rows of the first matrix (multiple of 16).
        MM: Number of columns of the second matrix (multiple of 64).
        KK: Number of columns of the first matrix / rows of the second
            matrix (multiple of 64).

    Returns:
        ``(cpu, gemmini)`` as returned by ``sched_matmul``, with the
        additional optimization steps applied to ``gemmini``.

    Raises:
        AssertionError: If a dimension violates the divisibility rules above.
    """
    # Gemmini works on 16x16 tiles, so every dimension must be tile aligned.
    tile_size = 16
    assert NN % tile_size == 0, f"NN ({NN}) must be divisible by {tile_size}"
    assert MM % tile_size == 0, f"MM ({MM}) must be divisible by {tile_size}"
    assert KK % tile_size == 0, f"KK ({KK}) must be divisible by {tile_size}"

    # sched_matmul splits the j and k loops by 64 with perfect=True, the same
    # condition schedule_matmul_flexible checks before using it. Fail here
    # with a clear message instead of deep inside the scheduling pipeline.
    block_size = 64
    assert MM % block_size == 0, f"MM ({MM}) must be divisible by {block_size}"
    assert KK % block_size == 0, f"KK ({KK}) must be divisible by {block_size}"

    n_tiles = NN // tile_size
    # With at least four row tiles the i loop is split into an outer "io"
    # loop and an inner "i" loop of divide_factor iterations.
    has_outer_loop = n_tiles >= 4
    divide_factor = n_tiles // 4 if has_outer_loop else None

    cpu, gemmini = sched_matmul("matmul_4", NN, MM, KK)

    # Real optimization
    gemmini = old_unroll(gemmini, "ko")
    gemmini = old_lift_alloc(gemmini, "res:_")
    gemmini = simplify(gemmini)

    if has_outer_loop:
        gemmini = divide_loop(gemmini, "i", divide_factor, ["io", "i"], perfect=True)
        gemmini = old_lift_alloc(gemmini, "a : _", n_lifts=2)
        gemmini = old_lift_alloc(gemmini, "b : _", n_lifts=2)
    else:
        # For smaller matrices, do not perform outer block division
        gemmini = old_lift_alloc(gemmini, "a : _", n_lifts=1)
        gemmini = old_lift_alloc(gemmini, "b : _", n_lifts=1)

    # tile
    gemmini = old_lift_alloc(gemmini, "a : i8", n_lifts=1, keep_dims=False)
    gemmini = old_lift_alloc(gemmini, "b : i8", n_lifts=1, keep_dims=False)
    gemmini = old_lift_alloc(gemmini, "res : _", n_lifts=2, keep_dims=False)

    # Previously add_guard
    gemmini = simplify(gemmini)

    def do_fission(pattern, n):
        """Fission ``gemmini`` after the first match of ``pattern``, ``n`` levels up."""
        nonlocal gemmini
        gemmini = autofission(gemmini, gemmini.find(pattern).after(), n_lifts=n)

    do_fission("for j_in_o in _:_", 5)
    do_fission("do_ld_i8_block_id1(_)", 6)
    do_fission("for k in _:_", 6)

    # NOTE(review): the literal extent 4 used for the "j" and "io" loops
    # below presumably has to equal MM // 64 and the io trip count; confirm
    # before using shapes other than the ones already exercised.
    gemmini = add_loop(gemmini, "do_ld_i8_block_id1(_)", "j", 4, guard=True)
    # The guarded "i" loop spans the inner i extent: divide_factor when the
    # loop was split, otherwise all row tiles.
    inner_loop_size = divide_factor if has_outer_loop else n_tiles
    gemmini = add_loop(gemmini, "do_ld_i8_block_id2(_)", "i", inner_loop_size, guard=True)

    if has_outer_loop:
        gemmini = add_loop(gemmini, "if i == 0: _", "io", 4, guard=True)

        # Fuse_loop cleanup - only execute when there is an outer loop.
        # NOTE(review): the i/j fusion is kept under this guard because it
        # consumes divide_factor, which only exists when the i loop was
        # split; confirm small-N schedules do not need the j fusion.
        gemmini = old_reorder(gemmini, "i io")
        gemmini = old_reorder(gemmini, "k io")
        gemmini = old_reorder(gemmini, "j io")
        gemmini = add_loop(gemmini, "for j in _:_ #0", "io", 4)
        gemmini = fuse(gemmini, "for io in _:_ #0", "for io in _:_ #1")
        gemmini = fuse(gemmini, "for io in _:_ #0", "for io in _:_ #1")
        gemmini = fuse(
            gemmini, "for io in _:_ #0", "for io in _:_ #1", unsafe_disable_check=True
        )
        gemmini = add_loop(gemmini, "for j in _:_ #0", "i", divide_factor)
        gemmini = old_reorder(gemmini, "k i")
        gemmini = old_reorder(gemmini, "j i")
        gemmini = fuse(gemmini, "for i in _:_ #0", "for i in _:_ #1")
        gemmini = fuse(gemmini, "for i in _:_ #0", "for i in _:_ #1")
        gemmini = fuse(
            gemmini, "for i in _:_ #0", "for i in _:_ #1", unsafe_disable_check=True
        )
        gemmini = fuse(gemmini, "for j in _:_ #0", "for j in _:_ #1")
        gemmini = fuse(gemmini, "for j in _:_ #0", "for j in _:_ #1")
        gemmini = fuse(gemmini, "for j in _:_ #0", "for j in _:_ #1")

    gemmini = old_unroll(gemmini, "j_in_o")
    gemmini = old_unroll(gemmini, "k")

    return cpu, gemmini
def schedule_matmul_flexible(name, NN, MM, KK):
    """Schedule a matmul of any tile-aligned shape, falling back when needed.

    When MM and KK are multiples of 64 the full ``sched_matmul`` pipeline is
    tried first. If that is not possible, or if it raises, a procedure with
    only basic 16x16x16 loop tiling is returned instead.

    Args:
        name: Generated kernel name; the reference copy is ``cpu_<name>``.
        NN: Number of rows of the first matrix (multiple of 16).
        MM: Number of columns of the second matrix (multiple of 16).
        KK: Number of columns of the first matrix / rows of the second
            matrix (multiple of 16).

    Returns:
        ``(cpu, gemmini)``: the shape-specialized reference procedure and the
        scheduled procedure.

    Raises:
        AssertionError: If a dimension is not a multiple of 16.
    """
    # Verify dimensions are divisible by 16 (gemmini requirement)
    tile_size = 16
    assert NN % tile_size == 0, f"NN ({NN}) must be divisible by {tile_size}"
    assert MM % tile_size == 0, f"MM ({MM}) must be divisible by {tile_size}"
    assert KK % tile_size == 0, f"KK ({KK}) must be divisible by {tile_size}"

    # Reason reported when the basic schedule is used; replaced below if the
    # full schedule was attempted and failed.
    fallback_reason = f"MM={MM}, KK={KK} - not divisible by 64"

    # For dimensions divisible by 64, use the full gemmini optimization.
    if MM % 64 == 0 and KK % 64 == 0:
        try:
            return sched_matmul(name, NN, MM, KK)
        except Exception as e:
            # Deliberate best effort: any scheduling failure falls back to
            # the basic version instead of aborting the caller.
            print(f"Warning: Failed to use gemmini optimization for {name}: {e}")
            fallback_reason = f"MM={MM}, KK={KK} - full gemmini schedule failed"

    # Avoid the complex gemmini pipeline here so its loop-pattern matching
    # cannot fail on this shape.
    print(f"Info: Using CPU-only version for {name} ({fallback_reason})")

    cpu = rename(matmul_on_cpu, f"cpu_{name}")
    cpu = cpu.partial_eval(NN, MM, KK)

    # Basic memory setup: map the buffers to gemmini memory spaces but keep
    # the simple loop structure; no gemmini instructions are substituted.
    # NOTE(review): confirm that code generation accepts plain loads/stores
    # on GEMM_SCRATCH / GEMM_ACCUM buffers for this fallback.
    gemmini = rename(cpu, name)
    gemmini = set_memory(gemmini, "res", GEMM_ACCUM)
    gemmini = set_memory(gemmini, "a", GEMM_SCRATCH)
    gemmini = set_memory(gemmini, "b", GEMM_SCRATCH)

    # Very basic 16x16 tiling
    gemmini = divide_loop(gemmini, "i", 16, ["i", "i_in"], perfect=True)
    gemmini = divide_loop(gemmini, "j", 16, ["j", "j_in"], perfect=True)
    gemmini = divide_loop(gemmini, "k", 16, ["k", "ki"], perfect=True)

    return cpu, gemmini
def test_kernel_generation(name, nn, mm, kk):
    """Check that a kernel of the given shape can be generated.

    Args:
        name: Kernel name.
        nn: Number of rows of matrix A.
        mm: Number of columns of matrix B.
        kk: Number of columns of matrix A / rows of matrix B.

    Returns:
        tuple: ``(success, error_message, cpu_kernel, gemmini_kernel)``.
        On failure ``error_message`` is a string and both kernels are
        ``None``; on success ``error_message`` is ``None``.
    """
    print(f"Testing {name} with shape ({nn}, {mm}, {kk})...")

    try:
        # Generate the kernel pair; any failure is reported, not raised.
        cpu_kernel, gemmini_kernel = schedule_matmul_flexible(name, nn, mm, kk)
    except Exception as e:
        error_msg = f"Kernel generation failed: {e}"
        print(f"❌ FAILED: {name} - {error_msg}")
        print(f" Traceback: {traceback.format_exc()}")
        return False, error_msg, None, None

    # Minimal sanity check: both kernel objects must exist.
    if cpu_kernel is None or gemmini_kernel is None:
        return False, "Generated kernels are None", None, None

    print(f"✅ SUCCESS: {name} generated successfully")
    return True, None, cpu_kernel, gemmini_kernel


# This is a parameterized helper driven by main(), not a pytest test. Without
# this flag pytest collects it (test_* name in a test_*.py file) and errors
# out looking for fixtures named "name", "nn", "mm" and "kk".
test_kernel_generation.__test__ = False
def save_kernel_to_file(name, cpu_kernel, gemmini_kernel, output_dir="generated_kernels"):
    """Write the ``repr`` of a generated kernel pair to ``<output_dir>/<name>.py``.

    Args:
        name: Kernel name; also used as the output file stem.
        cpu_kernel: Reference kernel object.
        gemmini_kernel: Scheduled kernel object.
        output_dir: Directory to write into; created if it does not exist.

    Returns:
        True when the file was written, False when anything went wrong
        (the error is printed, never raised).
    """
    # NOTE(review): the file stores repr() of both kernels; confirm that
    # this yields source the downstream tooling can actually consume.
    lines = [
        "from __future__ import annotations\n",
        "from exo.platforms.gemmini import *\n",
        "from exo.stdlib.scheduling import *\n\n",
        f"# Generated kernels for {name}\n\n",
    ]
    try:
        lines.append(f"cpu_{name} = {repr(cpu_kernel)}\n\n")
        lines.append(f"{name} = {repr(gemmini_kernel)}\n")

        os.makedirs(output_dir, exist_ok=True)
        # Explicit UTF-8 so the output does not depend on the locale default.
        with open(os.path.join(output_dir, f"{name}.py"), "w", encoding="utf-8") as f:
            f.write("".join(lines))
    except Exception as e:
        # Saving is best effort: report the problem and let the caller go on.
        print(f"⚠️ Failed to save {name}: {e}")
        return False

    print(f"📁 Saved {name} to {output_dir}/{name}.py")
    return True
def main():
    """Generate every test shape, save the successes and print a summary.

    Returns:
        The number of shapes whose generation failed (0 when all succeeded).
        The script passes this value to ``sys.exit``.
    """
    print("=" * 60)
    print("Testing schedule_matmul_flexible with different shapes")
    print("=" * 60)

    # Test cases: (name, NN, MM, KK).
    # Every dimension must be divisible by 16.
    test_cases = [
        # Small square matrices
        ("matmul_16_16_16", 16, 16, 16),
        ("matmul_32_32_32", 32, 32, 32),
        ("matmul_48_48_48", 48, 48, 48),
        ("matmul_64_64_64", 64, 64, 64),

        # Fewer than four row tiles (NN // 16 < 4)
        ("matmul_32_64_32", 32, 64, 32),
        ("matmul_48_32_64", 48, 32, 64),

        # Four or more row tiles (NN // 16 >= 4)
        ("matmul_64_128_32", 64, 128, 32),
        ("matmul_80_64_96", 80, 64, 96),
        ("matmul_128_128_128", 128, 128, 128),
        ("matmul_256_256_256", 256, 256, 256),

        # Non-square shapes
        ("matmul_128_64_96", 128, 64, 96),
        ("matmul_256_128_64", 256, 128, 64),
        ("matmul_512_256_128", 512, 256, 128),

        # Large matrix
        ("matmul_512_512_512", 512, 512, 512),

        # Extreme aspect ratios
        ("matmul_16_512_16", 16, 512, 16),
        ("matmul_512_16_512", 512, 16, 512),
    ]

    # Run the tests
    results = []
    total_tests = len(test_cases)
    successful_kernels = []

    for i, (name, nn, mm, kk) in enumerate(test_cases, 1):
        print(f"\n[{i}/{total_tests}] ", end="")
        success, error, cpu_kernel, gemmini_kernel = test_kernel_generation(name, nn, mm, kk)
        results.append((name, nn, mm, kk, success, error))

        if success:
            successful_kernels.append((name, cpu_kernel, gemmini_kernel))

    # Save the kernels that were generated successfully
    print("\n📁 Saving generated kernels...")
    for name, cpu_kernel, gemmini_kernel in successful_kernels:
        save_kernel_to_file(name, cpu_kernel, gemmini_kernel)

    # Summary statistics
    print("\n" + "=" * 60)
    print("Test Results Summary")
    print("=" * 60)

    successful = len(successful_kernels)
    failed = total_tests - successful

    print(f"Total tests: {total_tests}")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")
    print(f"Success rate: {successful/total_tests*100:.1f}%")

    # Report which scheduling path each shape is eligible for.
    # schedule_matmul_flexible chooses by MM and KK being multiples of 64,
    # not by the number of row tiles, so that is what is shown here.
    print("\n📊 Scheduling path analysis:")
    for name, nn, mm, kk, success, error in results:
        if success:
            if mm % 64 == 0 and kk % 64 == 0:
                print(f" ✅ {name}: MM={mm}, KK={kk} -> full Gemmini schedule attempted")
            else:
                print(f" ✅ {name}: MM={mm}, KK={kk} -> basic 16x16 tiling only")

    # List the failed tests
    if failed > 0:
        print("\n❌ Failed tests:")
        for name, nn, mm, kk, success, error in results:
            if not success:
                print(f" - {name} ({nn}x{mm}x{kk}): {error[:100] if error else 'Unknown error'}...")

    # NOTE(review): verify the example command below against `exocc --help`;
    # the README next to this script invokes exocc with only the source file.
    print("\n" + "=" * 60)
    print("💡 Tips:")
    print("- Generated kernels are saved in 'generated_kernels/' directory")
    print("- You can use exocc to compile these kernels to C code")
    print("- Example: exocc generated_kernels/matmul_64_64_64.py -o matmul_64_64_64.c --target gemmini")
    print("=" * 60)

    # 0 means every test passed; otherwise the number of failures.
    return failed