From d823115c056618e9f475b5c08691cacb821bb238 Mon Sep 17 00:00:00 2001 From: joan mihai Date: Sat, 6 Jun 2026 14:43:28 +0200 Subject: [PATCH] SOCMIPO: Implementation of partial barrier + test --- config/cachepool.hjson | 2 +- .../cachepool_peripheral.sv | 6 +- .../cachepool_peripheral_reg.hjson | 13 ++ .../cachepool_peripheral_reg_pkg.sv | 50 ++++---- .../cachepool_peripheral_reg_top.sv | 43 ++++++- hardware/src/cachepool_cluster.sv | 10 +- hardware/src/cachepool_cluster_barrier.sv | 8 +- .../snRuntime/include/cachepool_peripheral.h | 14 +++ software/snRuntime/include/snrt.h | 3 + software/snRuntime/include/team.h | 1 + .../snRuntime/src/platforms/shared/start.c | 3 +- software/snRuntime/src/team.c | 5 + software/tests/CMakeLists.txt | 4 + software/tests/partial_barrier/main.c | 117 ++++++++++++++++++ .../tests/partial_barrier_benchmark/main.c | 117 ++++++++++++++++++ util/auto-benchmark/configs-ci.sh | 2 +- util/auto-benchmark/configs.sh | 2 +- 17 files changed, 369 insertions(+), 31 deletions(-) create mode 100644 software/tests/partial_barrier/main.c create mode 100644 software/tests/partial_barrier_benchmark/main.c diff --git a/config/cachepool.hjson b/config/cachepool.hjson index 2ac0947..f54b04e 100644 --- a/config/cachepool.hjson +++ b/config/cachepool.hjson @@ -12,7 +12,7 @@ data_width: 32, id_width_in: 6, // fixed for now id_width_out: 2, // fixed for now - user_width: 17, + user_width: 21, axi_cdc_enable: false, sw_rst_enable: true, axi_isolate_enable: false, diff --git a/hardware/cachepool_peripheral/cachepool_peripheral.sv b/hardware/cachepool_peripheral/cachepool_peripheral.sv index 6326cfa..8fb7eb0 100644 --- a/hardware/cachepool_peripheral/cachepool_peripheral.sv +++ b/hardware/cachepool_peripheral/cachepool_peripheral.sv @@ -45,7 +45,8 @@ module cachepool_peripheral output cache_insn_t l1d_insn_o, output logic l1d_insn_valid_o, input logic [NumTiles-1:0] l1d_insn_ready_i, - output logic [NumTiles-1:0] l1d_busy_o + output logic [NumTiles-1:0] 
l1d_busy_o, + output logic [NumTiles-1:0] barrier_participation_mask_o // SOCMIPO ); cachepool_peripheral_reg2hw_t reg2hw; @@ -200,4 +201,7 @@ module cachepool_peripheral // The hardware barrier is external and always reads `0`. assign hw2reg.hw_barrier.d = 0; + // write from software + assign barrier_participation_mask_o = reg2hw.hw_barrier_participation_mask.q; + endmodule diff --git a/hardware/cachepool_peripheral/cachepool_peripheral_reg.hjson b/hardware/cachepool_peripheral/cachepool_peripheral_reg.hjson index 79d7cda..4a60b70 100644 --- a/hardware/cachepool_peripheral/cachepool_peripheral_reg.hjson +++ b/hardware/cachepool_peripheral/cachepool_peripheral_reg.hjson @@ -274,6 +274,19 @@ name: "COMMIT", desc: "Commit the xbar offset configurations." }] + }, + { + name: "HW_BARRIER_PARTICIPATION_MASK", + desc: '''Hardware barrier participation mask register. This register determines the tiles that are + implicated in a barrier. Allows partial barrier implementation.''' + swaccess: "rw", + hwaccess: "hro", + resval: "15", + fields: [{ + bits: "3:0", + name: "HW_BARRIER_PARTICIPATION_MASK", + desc: "Hardware barrier participation mask register." 
+ }] } ] } diff --git a/hardware/cachepool_peripheral/cachepool_peripheral_reg_pkg.sv b/hardware/cachepool_peripheral/cachepool_peripheral_reg_pkg.sv index df8c6e6..8cfe0fd 100644 --- a/hardware/cachepool_peripheral/cachepool_peripheral_reg_pkg.sv +++ b/hardware/cachepool_peripheral/cachepool_peripheral_reg_pkg.sv @@ -90,6 +90,10 @@ package cachepool_peripheral_reg_pkg; logic q; } cachepool_peripheral_reg2hw_xbar_offset_commit_reg_t; + typedef struct packed { + logic [3:0] q; + } cachepool_peripheral_reg2hw_hw_barrier_participation_mask_reg_t; + typedef struct packed { logic [31:0] d; } cachepool_peripheral_hw2reg_hw_barrier_reg_t; @@ -115,24 +119,25 @@ package cachepool_peripheral_reg_pkg; // Register -> HW type typedef struct packed { - cachepool_peripheral_reg2hw_hart_select_mreg_t [1:0] hart_select; // [275:256] - cachepool_peripheral_reg2hw_cl_clint_set_reg_t cl_clint_set; // [255:223] - cachepool_peripheral_reg2hw_cl_clint_clear_reg_t cl_clint_clear; // [222:190] - cachepool_peripheral_reg2hw_hw_barrier_reg_t hw_barrier; // [189:158] - cachepool_peripheral_reg2hw_icache_prefetch_enable_reg_t icache_prefetch_enable; // [157:157] - cachepool_peripheral_reg2hw_spatz_status_reg_t spatz_status; // [156:156] - cachepool_peripheral_reg2hw_spatz_cycle_reg_t spatz_cycle; // [155:124] - cachepool_peripheral_reg2hw_cluster_boot_control_reg_t cluster_boot_control; // [123:92] - cachepool_peripheral_reg2hw_cluster_eoc_exit_reg_t cluster_eoc_exit; // [91:88] - cachepool_peripheral_reg2hw_cfg_l1d_spm_reg_t cfg_l1d_spm; // [87:78] - cachepool_peripheral_reg2hw_cfg_l1d_insn_reg_t cfg_l1d_insn; // [77:76] - cachepool_peripheral_reg2hw_cfg_l1d_tile_sel_reg_t cfg_l1d_tile_sel; // [75:44] - cachepool_peripheral_reg2hw_l1d_spm_commit_reg_t l1d_spm_commit; // [43:43] - cachepool_peripheral_reg2hw_l1d_insn_commit_reg_t l1d_insn_commit; // [42:42] - cachepool_peripheral_reg2hw_l1d_private_reg_t l1d_private; // [41:38] - cachepool_peripheral_reg2hw_l1d_addr_reg_t l1d_addr; // [37:6] - 
cachepool_peripheral_reg2hw_xbar_offset_reg_t xbar_offset; // [5:1] - cachepool_peripheral_reg2hw_xbar_offset_commit_reg_t xbar_offset_commit; // [0:0] + cachepool_peripheral_reg2hw_hart_select_mreg_t [1:0] hart_select; // [279:260] + cachepool_peripheral_reg2hw_cl_clint_set_reg_t cl_clint_set; // [259:227] + cachepool_peripheral_reg2hw_cl_clint_clear_reg_t cl_clint_clear; // [226:194] + cachepool_peripheral_reg2hw_hw_barrier_reg_t hw_barrier; // [193:162] + cachepool_peripheral_reg2hw_icache_prefetch_enable_reg_t icache_prefetch_enable; // [161:161] + cachepool_peripheral_reg2hw_spatz_status_reg_t spatz_status; // [160:160] + cachepool_peripheral_reg2hw_spatz_cycle_reg_t spatz_cycle; // [159:128] + cachepool_peripheral_reg2hw_cluster_boot_control_reg_t cluster_boot_control; // [127:96] + cachepool_peripheral_reg2hw_cluster_eoc_exit_reg_t cluster_eoc_exit; // [95:92] + cachepool_peripheral_reg2hw_cfg_l1d_spm_reg_t cfg_l1d_spm; // [91:82] + cachepool_peripheral_reg2hw_cfg_l1d_insn_reg_t cfg_l1d_insn; // [81:80] + cachepool_peripheral_reg2hw_cfg_l1d_tile_sel_reg_t cfg_l1d_tile_sel; // [79:48] + cachepool_peripheral_reg2hw_l1d_spm_commit_reg_t l1d_spm_commit; // [47:47] + cachepool_peripheral_reg2hw_l1d_insn_commit_reg_t l1d_insn_commit; // [46:46] + cachepool_peripheral_reg2hw_l1d_private_reg_t l1d_private; // [45:42] + cachepool_peripheral_reg2hw_l1d_addr_reg_t l1d_addr; // [41:10] + cachepool_peripheral_reg2hw_xbar_offset_reg_t xbar_offset; // [9:5] + cachepool_peripheral_reg2hw_xbar_offset_commit_reg_t xbar_offset_commit; // [4:4] + cachepool_peripheral_reg2hw_hw_barrier_participation_mask_reg_t hw_barrier_participation_mask; // [3:0] } cachepool_peripheral_reg2hw_t; // HW -> register type @@ -165,6 +170,7 @@ package cachepool_peripheral_reg_pkg; parameter logic [BlockAw-1:0] CACHEPOOL_PERIPHERAL_L1D_ADDR_OFFSET = 7'h 44; parameter logic [BlockAw-1:0] CACHEPOOL_PERIPHERAL_XBAR_OFFSET_OFFSET = 7'h 48; parameter logic [BlockAw-1:0] 
CACHEPOOL_PERIPHERAL_XBAR_OFFSET_COMMIT_OFFSET = 7'h 4c; + parameter logic [BlockAw-1:0] CACHEPOOL_PERIPHERAL_HW_BARRIER_PARTICIPATION_MASK_OFFSET = 7'h 50; // Reset values for hwext registers and their fields parameter logic [31:0] CACHEPOOL_PERIPHERAL_CL_CLINT_SET_RESVAL = 32'h 0; @@ -194,11 +200,12 @@ package cachepool_peripheral_reg_pkg; CACHEPOOL_PERIPHERAL_L1D_PRIVATE, CACHEPOOL_PERIPHERAL_L1D_ADDR, CACHEPOOL_PERIPHERAL_XBAR_OFFSET, - CACHEPOOL_PERIPHERAL_XBAR_OFFSET_COMMIT + CACHEPOOL_PERIPHERAL_XBAR_OFFSET_COMMIT, + CACHEPOOL_PERIPHERAL_HW_BARRIER_PARTICIPATION_MASK } cachepool_peripheral_id_e; // Register width information to check illegal writes - parameter logic [3:0] CACHEPOOL_PERIPHERAL_PERMIT [20] = '{ + parameter logic [3:0] CACHEPOOL_PERIPHERAL_PERMIT [21] = '{ 4'b 0011, // index[ 0] CACHEPOOL_PERIPHERAL_HART_SELECT_0 4'b 0011, // index[ 1] CACHEPOOL_PERIPHERAL_HART_SELECT_1 4'b 1111, // index[ 2] CACHEPOOL_PERIPHERAL_CL_CLINT_SET @@ -218,7 +225,8 @@ package cachepool_peripheral_reg_pkg; 4'b 0001, // index[16] CACHEPOOL_PERIPHERAL_L1D_PRIVATE 4'b 1111, // index[17] CACHEPOOL_PERIPHERAL_L1D_ADDR 4'b 0001, // index[18] CACHEPOOL_PERIPHERAL_XBAR_OFFSET - 4'b 0001 // index[19] CACHEPOOL_PERIPHERAL_XBAR_OFFSET_COMMIT + 4'b 0001, // index[19] CACHEPOOL_PERIPHERAL_XBAR_OFFSET_COMMIT + 4'b 0001 // index[20] CACHEPOOL_PERIPHERAL_HW_BARRIER_PARTICIPATION_MASK }; endpackage diff --git a/hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv b/hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv index c6ece73..9645b23 100644 --- a/hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv +++ b/hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv @@ -122,6 +122,9 @@ module cachepool_peripheral_reg_top #( logic xbar_offset_commit_qs; logic xbar_offset_commit_wd; logic xbar_offset_commit_we; + logic [3:0] hw_barrier_participation_mask_qs; + logic [3:0] hw_barrier_participation_mask_wd; + logic hw_barrier_participation_mask_we; // 
Register instances @@ -620,9 +623,36 @@ module cachepool_peripheral_reg_top #( ); + // R[hw_barrier_participation_mask]: V(False) + prim_subreg #( + .DW (4), + .SWACCESS("RW"), + .RESVAL (4'hf) + ) u_hw_barrier_participation_mask ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (hw_barrier_participation_mask_we), + .wd (hw_barrier_participation_mask_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), - logic [19:0] addr_hit; + // to internal hardware + .qe (), + .q (reg2hw.hw_barrier_participation_mask.q ), + + // to register interface (read) + .qs (hw_barrier_participation_mask_qs) + ); + + + + + logic [20:0] addr_hit; always_comb begin addr_hit = '0; addr_hit[ 0] = (reg_addr == CACHEPOOL_PERIPHERAL_HART_SELECT_0_OFFSET); @@ -645,6 +675,7 @@ module cachepool_peripheral_reg_top #( addr_hit[17] = (reg_addr == CACHEPOOL_PERIPHERAL_L1D_ADDR_OFFSET); addr_hit[18] = (reg_addr == CACHEPOOL_PERIPHERAL_XBAR_OFFSET_OFFSET); addr_hit[19] = (reg_addr == CACHEPOOL_PERIPHERAL_XBAR_OFFSET_COMMIT_OFFSET); + addr_hit[20] = (reg_addr == CACHEPOOL_PERIPHERAL_HW_BARRIER_PARTICIPATION_MASK_OFFSET); end assign addrmiss = (reg_re || reg_we) ? 
~|addr_hit : 1'b0 ; @@ -671,7 +702,8 @@ module cachepool_peripheral_reg_top #( (addr_hit[16] & (|(CACHEPOOL_PERIPHERAL_PERMIT[16] & ~reg_be))) | (addr_hit[17] & (|(CACHEPOOL_PERIPHERAL_PERMIT[17] & ~reg_be))) | (addr_hit[18] & (|(CACHEPOOL_PERIPHERAL_PERMIT[18] & ~reg_be))) | - (addr_hit[19] & (|(CACHEPOOL_PERIPHERAL_PERMIT[19] & ~reg_be))))); + (addr_hit[19] & (|(CACHEPOOL_PERIPHERAL_PERMIT[19] & ~reg_be))) | + (addr_hit[20] & (|(CACHEPOOL_PERIPHERAL_PERMIT[20] & ~reg_be))))); end assign hart_select_0_we = addr_hit[0] & reg_we & !reg_error; @@ -732,6 +764,9 @@ module cachepool_peripheral_reg_top #( assign xbar_offset_commit_we = addr_hit[19] & reg_we & !reg_error; assign xbar_offset_commit_wd = reg_wdata[0]; + assign hw_barrier_participation_mask_we = addr_hit[20] & reg_we & !reg_error; + assign hw_barrier_participation_mask_wd = reg_wdata[3:0]; + // Read data return always_comb begin reg_rdata_next = '0; @@ -816,6 +851,10 @@ module cachepool_peripheral_reg_top #( reg_rdata_next[0] = xbar_offset_commit_qs; end + addr_hit[20]: begin + reg_rdata_next[3:0] = hw_barrier_participation_mask_qs; + end + default: begin reg_rdata_next = '1; end diff --git a/hardware/src/cachepool_cluster.sv b/hardware/src/cachepool_cluster.sv index 0f4beb8..19b409f 100644 --- a/hardware/src/cachepool_cluster.sv +++ b/hardware/src/cachepool_cluster.sv @@ -895,10 +895,15 @@ module cachepool_cluster assign tcdm_start_address = (cluster_base_addr_i & TCDMMask); assign tcdm_end_address = (tcdm_start_address + TCDMSize) & TCDMMask; + + logic [NumTiles-1:0] use_barrier; // TODO: Connect to CSR - assign use_barrier = {NumTiles{1'b1}}; + //assign use_barrier = {NumTiles{1'b1}}; // for now all is set to 1 + logic [NumTiles-1:0] barrier_participation_mask; + assign use_barrier = barrier_participation_mask; + axi_cut #( .Bypass (0 ), @@ -1043,7 +1048,8 @@ module cachepool_cluster .l1d_insn_o (l1d_insn ), .l1d_insn_valid_o (l1d_insn_valid ), .l1d_insn_ready_i (l1d_insn_ready ), - .l1d_busy_o (l1d_busy 
) + .l1d_busy_o (l1d_busy ), + .barrier_participation_mask_o (barrier_participation_mask) ); endmodule diff --git a/hardware/src/cachepool_cluster_barrier.sv b/hardware/src/cachepool_cluster_barrier.sv index 7ed2ec6..5099c3f 100644 --- a/hardware/src/cachepool_cluster_barrier.sv +++ b/hardware/src/cachepool_cluster_barrier.sv @@ -31,7 +31,7 @@ module cachepool_cluster_barrier output axi_req_t [NrPorts-1:0] axi_mst_req_o, input axi_rsp_t [NrPorts-1:0] axi_mst_rsp_i, - input logic [NrPorts-1:0] barrier_i, + input logic [NrPorts-1:0] barrier_i, // participation barrier input addr_t cluster_periph_start_address_i ); @@ -45,8 +45,14 @@ module cachepool_cluster_barrier // FSM State of the barrier barrier_state_e [NrPorts-1:0] state_d, state_q; // the tiles participate in global barrier + + // (SOCMIP) PARTICIPATION MASK! + // (SOCMIP) map to hw then control in sw logic [NrPorts-1:0] barrier_d, barrier_q; + addr_t barrier_participation_mask_addr; + assign barrier_participation_mask_addr = cluster_periph_start_address_i + CACHEPOOL_PERIPHERAL_HW_BARRIER_PARTICIPATION_MASK_OFFSET; + // Infomation stored for response generation typedef struct packed { axi_id_t id; diff --git a/software/snRuntime/include/cachepool_peripheral.h b/software/snRuntime/include/cachepool_peripheral.h index aad79f5..92c737b 100644 --- a/software/snRuntime/include/cachepool_peripheral.h +++ b/software/snRuntime/include/cachepool_peripheral.h @@ -135,6 +135,20 @@ extern "C" { #define CACHEPOOL_PERIPHERAL_XBAR_OFFSET_COMMIT_REG_OFFSET 0x4c #define CACHEPOOL_PERIPHERAL_XBAR_OFFSET_COMMIT_COMMIT_BIT 0 +// Hardware barrier participation mask register. 
This register determines the +// tiles that are +#define CACHEPOOL_PERIPHERAL_HW_BARRIER_PARTICIPATION_MASK_REG_OFFSET 0x50 +#define CACHEPOOL_PERIPHERAL_HW_BARRIER_PARTICIPATION_MASK_HW_BARRIER_PARTICIPATION_MASK_MASK \ + 0xf +#define CACHEPOOL_PERIPHERAL_HW_BARRIER_PARTICIPATION_MASK_HW_BARRIER_PARTICIPATION_MASK_OFFSET \ + 0 +#define CACHEPOOL_PERIPHERAL_HW_BARRIER_PARTICIPATION_MASK_HW_BARRIER_PARTICIPATION_MASK_FIELD \ + ((bitfield_field32_t){ \ + .mask = \ + CACHEPOOL_PERIPHERAL_HW_BARRIER_PARTICIPATION_MASK_HW_BARRIER_PARTICIPATION_MASK_MASK, \ + .index = \ + CACHEPOOL_PERIPHERAL_HW_BARRIER_PARTICIPATION_MASK_HW_BARRIER_PARTICIPATION_MASK_OFFSET}) + #ifdef __cplusplus } // extern "C" #endif diff --git a/software/snRuntime/include/snrt.h b/software/snRuntime/include/snrt.h index ae91213..14cb38a 100644 --- a/software/snRuntime/include/snrt.h +++ b/software/snRuntime/include/snrt.h @@ -95,6 +95,9 @@ extern void snrt_wakeup(uint32_t mask); /// get pointer to barrier register extern uint32_t _snrt_barrier_reg_ptr(); +/// get pointer to participation barrier register +extern uint32_t _snrt_barrier_participation_mask_reg_ptr(); + /// get start address of global memory extern snrt_slice_t snrt_global_memory(); /// get start address of the cluster's tcdm memory diff --git a/software/snRuntime/include/team.h b/software/snRuntime/include/team.h index 9c1fb31..967d01b 100644 --- a/software/snRuntime/include/team.h +++ b/software/snRuntime/include/team.h @@ -41,5 +41,6 @@ struct snrt_team_root { struct snrt_allocator allocator; struct snrt_barrier cluster_barrier; uint32_t barrier_reg_ptr; + uint32_t barrier_participation_mask_reg_ptr; //SOCMIPO struct snrt_peripherals peripherals; }; diff --git a/software/snRuntime/src/platforms/shared/start.c b/software/snRuntime/src/platforms/shared/start.c index 9d5ceb8..eae11a8 100644 --- a/software/snRuntime/src/platforms/shared/start.c +++ b/software/snRuntime/src/platforms/shared/start.c @@ -59,7 +59,8 @@ void 
_snrt_init_team(uint32_t cluster_core_id, uint32_t cluster_core_num, team->cluster_mem.end = (uint64_t)spm_start + bootdata->tcdm_size; team->barrier_reg_ptr = (uint32_t)spm_start + bootdata->tcdm_size + CACHEPOOL_PERIPHERAL_HW_BARRIER_REG_OFFSET; - + team->barrier_participation_mask_reg_ptr = (uint32_t)spm_start + bootdata->tcdm_size + + CACHEPOOL_PERIPHERAL_HW_BARRIER_PARTICIPATION_MASK_REG_OFFSET; // Initialize cluster barrier team->cluster_barrier.barrier = 0; team->cluster_barrier.barrier_iteration = 0; diff --git a/software/snRuntime/src/team.c b/software/snRuntime/src/team.c index 8acd9e0..d612915 100644 --- a/software/snRuntime/src/team.c +++ b/software/snRuntime/src/team.c @@ -108,6 +108,11 @@ uint32_t _snrt_barrier_reg_ptr() { return _snrt_team_current->root->barrier_reg_ptr; } +// Socmipo +uint32_t _snrt_barrier_participation_mask_reg_ptr() { + return _snrt_team_current->root->barrier_participation_mask_reg_ptr; +} + snrt_slice_t snrt_global_memory() { return _snrt_team_current->root->global_mem; } diff --git a/software/tests/CMakeLists.txt b/software/tests/CMakeLists.txt index c8f47d8..ab490d4 100644 --- a/software/tests/CMakeLists.txt +++ b/software/tests/CMakeLists.txt @@ -53,6 +53,8 @@ macro(add_spatz_test_threeParam name file param1 param2 param3) endmacro() + + # Benchmark library add_library(benchmark benchmark/benchmark.c) add_library(spin_lock benchmark/spin_lock.c) @@ -84,6 +86,8 @@ set(SNITCH_TEST_PREFIX cachepool-) ## RLC add_spatz_test_zeroParam(spin-lock spin-lock/main.c) +add_spatz_test_zeroParam(partial_barrier partial_barrier/main.c)# SOCMIPO +add_spatz_test_zeroParam(partial_barrier_benchmark partial_barrier_benchmark/main.c)# SOCMIPO add_spatz_test_zeroParam(mcs-lock mcs-lock/main.c) add_spatz_test_zeroParam(byte-enable byte-enable/main.c) add_spatz_test_zeroParam(cache-line-rw-smoke cache-line-rw-smoke/main.c) diff --git a/software/tests/partial_barrier/main.c b/software/tests/partial_barrier/main.c new file mode 100644 index 
0000000..da3dfe8
--- /dev/null
+++ b/software/tests/partial_barrier/main.c
@@ -0,0 +1,117 @@
+// Partial-barrier test: the participation mask first selects tiles 1 & 3,
+// then tiles 0 & 2. In each stage only the selected tiles synchronise on the
+// hardware barrier; the other tiles skip it and run ahead to the software
+// barrier that closes the stage.
+// Author: Luca Caballero Cusin
+// NOTE(review): the <...> header names below were stripped from this patch; confirm.
+#include <snrt.h>
+#include <stdint.h>
+#include <stdio.h>
+#include "spin_lock.h"
+
+static uint32_t result __attribute__((section(".data")));
+static uint32_t printed __attribute__((section(".data")));
+static struct snrt_barrier sw_barrier __attribute__((section(".data")));
+spinlock_t lock;
+
+int main()
+{
+    volatile uint32_t *participation = (volatile uint32_t *)_snrt_barrier_participation_mask_reg_ptr();
+
+    // whoever arrives first, writes the barrier
+    // no race condition since they all write the same thing
+    *participation = 0b1111; // all tiles participate
+
+    result = 0;
+    snrt_cluster_hw_barrier();
+
+    // atomic fetch-and-add returns the old value: only the first core to arrive prints
+    if (__atomic_fetch_add(&printed, 1, __ATOMIC_RELAXED) == 0) {
+        spin_lock(&lock, 1);
+        printf("setting participation for tiles 1 and 3\n");
+        spin_unlock(&lock, 1);
+    }
+
+    // set participation mask
+    *participation = 0b1010; // tiles 1 and 3 only
+
+    // only tiles 1 and 3
+    if (snrt_cluster_tile_idx() == 1 || snrt_cluster_tile_idx() == 3)
+    {
+        // work
+        spin_lock(&lock, 1);
+        printf("core %u in tile %u reached work loop\n", snrt_cluster_core_idx(), snrt_cluster_tile_idx());
+        spin_unlock(&lock, 1);
+
+        spin_lock(&lock, 1);
+        result += snrt_cluster_core_idx();
+        spin_unlock(&lock, 1);
+
+        snrt_cluster_hw_barrier();
+
+        if (snrt_cluster_core_idx() == 4)
+        {
+            spin_lock(&lock, 1);
+            printf("tiles done working, result (expected = 76) = %u\n", result);
+            spin_unlock(&lock, 1);
+        }
+    }
+
+    if (snrt_cluster_core_idx() == 0 || snrt_cluster_core_idx() == 4 || snrt_cluster_core_idx() == 8 || snrt_cluster_core_idx() == 12)
+    {
+        spin_lock(&lock, 1);
+        printf("%u reached end of first stage\n", snrt_cluster_tile_idx());
+        spin_unlock(&lock, 1);
+    }
+    printed=0;
+
+    snrt_barrier(&sw_barrier, snrt_cluster_core_num());
+
+    if (__atomic_fetch_add(&printed, 1, __ATOMIC_RELAXED) == 0) {
+        spin_lock(&lock, 1);
+        printf("setting participation for tiles 0 and 2\n");
+        spin_unlock(&lock, 1);
+    }
+
+    // set participation mask
+    *participation = 0b0101; // tiles 0 and 2 only
+
+    // only tiles 0 and 2
+    if (snrt_cluster_tile_idx() == 0 || snrt_cluster_tile_idx() == 2)
+    {
+        // work
+        spin_lock(&lock, 1);
+        printf("core %u in tile %u reached work loop\n", snrt_cluster_core_idx(), snrt_cluster_tile_idx());
+        spin_unlock(&lock, 1);
+
+        spin_lock(&lock, 1);
+        result += snrt_cluster_core_idx();
+        spin_unlock(&lock, 1);
+
+        snrt_cluster_hw_barrier();
+        // Core 0 reports (core 4 is in tile 1 and never gets here); result keeps stage 1's 76 and adds 44.
+        if (snrt_cluster_core_idx() == 0)
+        {
+            spin_lock(&lock, 1);
+            printf("tiles done working, result (expected = 120) = %u\n", result);
+            spin_unlock(&lock, 1);
+        }
+    }
+
+
+
+    if (snrt_cluster_core_idx() == 0 || snrt_cluster_core_idx() == 4 || snrt_cluster_core_idx() == 8 || snrt_cluster_core_idx() == 12)
+    {
+        spin_lock(&lock, 1);
+        printf("%u reached end of second stage\n", snrt_cluster_tile_idx());
+        spin_unlock(&lock, 1);
+    }
+
+    // We use sw barrier for now, since crash happens if cores not synchronized at the end
+    // when implementing multiple barriers, we can assign one to 1 & 3 and another for
+    // all the cores at the very end
+
+    snrt_barrier(&sw_barrier, snrt_cluster_core_num());
+
+    return 0;
+}
diff --git a/software/tests/partial_barrier_benchmark/main.c b/software/tests/partial_barrier_benchmark/main.c
new file mode 100644
index 0000000..da3dfe8
--- /dev/null
+++ b/software/tests/partial_barrier_benchmark/main.c
@@ -0,0 +1,117 @@
+// Partial-barrier test: the participation mask first selects tiles 1 & 3,
+// then tiles 0 & 2. In each stage only the selected tiles synchronise on the
+// hardware barrier; the other tiles skip it and run ahead to the software
+// barrier that closes the stage.
+// Author: Luca Caballero Cusin
+// NOTE(review): the <...> header names below were stripped from this patch; confirm.
+#include <snrt.h>
+#include <stdint.h>
+#include <stdio.h>
+#include "spin_lock.h"
+
+static uint32_t result __attribute__((section(".data")));
+static uint32_t printed __attribute__((section(".data")));
+static struct snrt_barrier sw_barrier __attribute__((section(".data")));
+spinlock_t lock;
+
+int main()
+{
+    volatile uint32_t *participation = (volatile uint32_t *)_snrt_barrier_participation_mask_reg_ptr();
+
+    // whoever arrives first, writes the barrier
+    // no race condition since they all write the same thing
+    *participation = 0b1111; // all tiles participate
+
+    result = 0;
+    snrt_cluster_hw_barrier();
+
+    // atomic fetch-and-add returns the old value: only the first core to arrive prints
+    if (__atomic_fetch_add(&printed, 1, __ATOMIC_RELAXED) == 0) {
+        spin_lock(&lock, 1);
+        printf("setting participation for tiles 1 and 3\n");
+        spin_unlock(&lock, 1);
+    }
+
+    // set participation mask
+    *participation = 0b1010; // tiles 1 and 3 only
+
+    // only tiles 1 and 3
+    if (snrt_cluster_tile_idx() == 1 || snrt_cluster_tile_idx() == 3)
+    {
+        // work
+        spin_lock(&lock, 1);
+        printf("core %u in tile %u reached work loop\n", snrt_cluster_core_idx(), snrt_cluster_tile_idx());
+        spin_unlock(&lock, 1);
+
+        spin_lock(&lock, 1);
+        result += snrt_cluster_core_idx();
+        spin_unlock(&lock, 1);
+
+        snrt_cluster_hw_barrier();
+
+        if (snrt_cluster_core_idx() == 4)
+        {
+            spin_lock(&lock, 1);
+            printf("tiles done working, result (expected = 76) = %u\n", result);
+            spin_unlock(&lock, 1);
+        }
+    }
+
+    if (snrt_cluster_core_idx() == 0 || snrt_cluster_core_idx() == 4 || snrt_cluster_core_idx() == 8 || snrt_cluster_core_idx() == 12)
+    {
+        spin_lock(&lock, 1);
+        printf("%u reached end of first stage\n", snrt_cluster_tile_idx());
+        spin_unlock(&lock, 1);
+    }
+    printed=0;
+
+    snrt_barrier(&sw_barrier, snrt_cluster_core_num());
+
+    if (__atomic_fetch_add(&printed, 1, __ATOMIC_RELAXED) == 0) {
+        spin_lock(&lock, 1);
+        printf("setting participation for tiles 0 and 2\n");
+        spin_unlock(&lock, 1);
+    }
+
+    // set participation mask
+    *participation = 0b0101; // tiles 0 and 2 only
+
+    // only tiles 0 and 2
+    if (snrt_cluster_tile_idx() == 0 || snrt_cluster_tile_idx() == 2)
+    {
+        // work
+        spin_lock(&lock, 1);
+        printf("core %u in tile %u reached work loop\n", snrt_cluster_core_idx(), snrt_cluster_tile_idx());
+        spin_unlock(&lock, 1);
+
+        spin_lock(&lock, 1);
+        result += snrt_cluster_core_idx();
+        spin_unlock(&lock, 1);
+
+        snrt_cluster_hw_barrier();
+        // Core 0 reports (core 4 is in tile 1 and never gets here); result keeps stage 1's 76 and adds 44.
+        if (snrt_cluster_core_idx() == 0)
+        {
+            spin_lock(&lock, 1);
+            printf("tiles done working, result (expected = 120) = %u\n", result);
+            spin_unlock(&lock, 1);
+        }
+    }
+
+
+
+    if (snrt_cluster_core_idx() == 0 || snrt_cluster_core_idx() == 4 || snrt_cluster_core_idx() == 8 || snrt_cluster_core_idx() == 12)
+    {
+        spin_lock(&lock, 1);
+        printf("%u reached end of second stage\n", snrt_cluster_tile_idx());
+        spin_unlock(&lock, 1);
+    }
+
+    // We use sw barrier for now, since crash happens if cores not synchronized at the end
+    // when implementing multiple barriers, we can assign one to 1 & 3 and another for
+    // all the cores at the very end
+
+    snrt_barrier(&sw_barrier, snrt_cluster_core_num());
+
+    return 0;
+}
diff --git a/util/auto-benchmark/configs-ci.sh b/util/auto-benchmark/configs-ci.sh
index 11fe23e..98f0560 100644
--- a/util/auto-benchmark/configs-ci.sh
+++ b/util/auto-benchmark/configs-ci.sh
@@ -1,5 +1,5 @@
 # Configs and kernel suffixes (without prefix)
 CONFIGS="cachepool_fpu_512"
-KERNELS="spin-lock load-store_M16 fdotp-32b_M32768 gemv-opt_M512_N128_K32 fmatmul-32b_M32_N32_K32 fft-32b_M1024_N16 multi_producer_single_consumer_double_linked_list_M1_N1350_K10 byte-enable"
+KERNELS="partial_barrier_benchmark partial_barrier spin-lock load-store_M16 fdotp-32b_M32768 gemv-opt_M512_N128_K32 fmatmul-32b_M32_N32_K32 fft-32b_M1024_N16 multi_producer_single_consumer_double_linked_list_M1_N1350_K10 byte-enable"
 PREFIX="test-cachepool-" # common prefix for all kernels
 ROOT_PATH=../..
# adjust if needed (path to repo root) diff --git a/util/auto-benchmark/configs.sh b/util/auto-benchmark/configs.sh index 1a545be..18b5b67 100755 --- a/util/auto-benchmark/configs.sh +++ b/util/auto-benchmark/configs.sh @@ -5,7 +5,7 @@ CONFIGS="cachepool_fpu_128 cachepool_fpu_256 cachepool_fpu_512" # KERNELS="spin-lock fdotp-32b_M8192 fmatmul-32b_M32_N32_K32" # KERNELS="fdotp-32b_M65536 gemv-opt_M1024_N128_K32 gemv_M1024_N128_K32" -KERNELS="spin-lock fdotp-32b_M65536 gemv-opt_M1024_N128_K32 gemv_M1024_N128_K32 fmatmul-32b_M64_N64_K64 multi_producer_single_consumer_double_linked_list_M1_N1350_K100 byte-enable" +KERNELS="partial_barrier_benchmark partial_barrier spin-lock fdotp-32b_M65536 gemv-opt_M1024_N128_K32 gemv_M1024_N128_K32 fmatmul-32b_M64_N64_K64 multi_producer_single_consumer_double_linked_list_M1_N1350_K100 byte-enable" # KERNELS="spin-lock fdotp-32b_M32768" PREFIX="test-cachepool-" # common prefix for all kernels