diff --git a/software/tests/partial_barrier_benchmark/main.c b/software/tests/partial_barrier_benchmark/main.c index da3dfe8..cb002c0 100644 --- a/software/tests/partial_barrier_benchmark/main.c +++ b/software/tests/partial_barrier_benchmark/main.c @@ -1,7 +1,6 @@ -// This tiny test tests the partial barrier implementation by -// setting the participation mask only for tiles 1 & 3. -// Tiles 0 & 2 should be able to reach end of programs -// before 1 & 3 finish the work and pass the partial barrier. +// Synthetic benchmark for partial barrier implementation. +// Tiles 0 and 2 perform multiple small reductions +// Tiles 1 and 3 perform one big reduction // Author: Luca Caballero Cusin #include @@ -10,10 +9,21 @@ #include "spin_lock.h" static uint32_t result __attribute__((section(".data"))); +static uint32_t result_big __attribute__((section(".data"))); +static uint32_t result_small[4] __attribute__((section(".data"))); static uint32_t printed __attribute__((section(".data"))); +static uint32_t printed2 __attribute__((section(".data"))); static struct snrt_barrier sw_barrier __attribute__((section(".data"))); spinlock_t lock; + +static uint32_t v_big[256] __attribute__((section(".data"))); +static uint32_t v_small[64] __attribute__((section(".data"))); + +// Choose between partial or full barrier +#define PARTIAL + + int main() { volatile uint32_t *participation = (volatile uint32_t *)_snrt_barrier_participation_mask_reg_ptr(); @@ -23,89 +33,132 @@ int main() *participation = 0b1111; // all tiles participate result = 0; + printed = 0; snrt_cluster_hw_barrier(); - // atomic test-and-set: returns old value - if (__atomic_fetch_add(&printed, 1, __ATOMIC_RELAXED) == 0) { - spin_lock(&lock, 1); - printf("setting participation for tiles 1 and 3\n"); - spin_unlock(&lock, 1); - } + // set participation mask - *participation = 0b1010; // tiles 1 and 3 only + #ifdef PARTIAL + *participation = 0b0101; // tiles 0 and 2 only + #endif + // only tiles 1 and 3 if (snrt_cluster_tile_idx() == 1 || snrt_cluster_tile_idx() == 3) { + int local=0; // work + for (int i=0;i<256;i++){ + v_big[i]=1; + + } + #ifdef DEBUG spin_lock(&lock, 1); printf("core %u in tile %u reached work loop\n", snrt_cluster_core_idx(), snrt_cluster_tile_idx()); spin_unlock(&lock, 1); + #endif + if (snrt_cluster_tile_idx()==1) + for (int i=0; i< 32 ;i++){ + local+=v_big[(snrt_cluster_core_idx()%4)*32+i]; - spin_lock(&lock, 1); - result += snrt_cluster_core_idx(); - spin_unlock(&lock, 1); + } + + if (snrt_cluster_tile_idx()==3) + for (int i=0; i<32 ;i++){ + local+=v_big[(snrt_cluster_core_idx()%4)*32+i+128]; - snrt_cluster_hw_barrier(); - - if (snrt_cluster_core_idx() == 4) - { - spin_lock(&lock, 1); - printf("tiles done working, result (expected = 76) = %u\n", result); - spin_unlock(&lock, 1); } - } - if (snrt_cluster_core_idx() == 0 || snrt_cluster_core_idx() == 4 || snrt_cluster_core_idx() == 8 || snrt_cluster_core_idx() == 12) - { spin_lock(&lock, 1); - printf("%u reached end of first stage\n", snrt_cluster_tile_idx()); + result_big += local; spin_unlock(&lock, 1); - } - printed=0; - snrt_barrier(&sw_barrier, snrt_cluster_core_num()); + #ifdef PARTIAL + + #else + // necessary for tiles 0 and 2 to continue + snrt_cluster_hw_barrier(); + if (__atomic_fetch_add(&printed, 1, __ATOMIC_RELAXED) == 0) { + spin_lock(&lock, 1); + printf("1 and 3 reached barrier\n"); + spin_unlock(&lock, 1); + } + snrt_cluster_hw_barrier(); + snrt_cluster_hw_barrier(); + snrt_cluster_hw_barrier(); + + #endif + printed=0; + snrt_barrier(&sw_barrier, snrt_cluster_core_num()); + + if (__atomic_fetch_add(&printed, 1, __ATOMIC_RELAXED) == 0) { + spin_lock(&lock, 1); + printf("1 and 3 done with big red , result is : %d , expected :256 \n",result_big); + spin_unlock(&lock, 1); + } - if (__atomic_fetch_add(&printed, 1, __ATOMIC_RELAXED) == 0) { - spin_lock(&lock, 1); - printf("setting participation for tiles 0 and 2\n"); - spin_unlock(&lock, 1); + + } - // set participation mask - *participation = 0b0101; // tiles 0 and 2 only - // only tiles 0 and 2 if (snrt_cluster_tile_idx() == 0 || snrt_cluster_tile_idx() == 2) { - // work - spin_lock(&lock, 1); - printf("core %u in tile %u reached work loop\n", snrt_cluster_core_idx(), snrt_cluster_tile_idx()); - spin_unlock(&lock, 1); + for (int i=0;i<64;i++){ + v_small[i]=1; + } + for (int n=0;n<4;n++){ + int local=0; + // work + #ifdef DEBUG + spin_lock(&lock, 1); + printf("core %u in tile %u reached work loop\n", snrt_cluster_core_idx(), snrt_cluster_tile_idx()); + spin_unlock(&lock, 1); + #endif + if (snrt_cluster_tile_idx()==0) + for (int i=0; i< 8 ;i++){ + local+=v_small[(snrt_cluster_core_idx()%4)*8+i]; - spin_lock(&lock, 1); - result += snrt_cluster_core_idx(); - spin_unlock(&lock, 1); + } + + if (snrt_cluster_tile_idx()==2) + for (int i=0; i< 8 ;i++){ + local+=v_small[(snrt_cluster_core_idx()%4)*8+i+32]; + + } - snrt_cluster_hw_barrier(); - - if (snrt_cluster_core_idx() == 4) - { spin_lock(&lock, 1); - printf("tiles done working, result (expected = 76) = %u\n", result); + result_small[n] += local; spin_unlock(&lock, 1); + + printed2=0; + + // if full barrier, has to wait for big reduction to finish + snrt_cluster_hw_barrier(); + + if (__atomic_fetch_add(&printed2, 1, __ATOMIC_RELAXED) == 0) { + spin_lock(&lock, 1); + printf("0 and 2 done with small red %d , result is : %d, expected :64 \n",n,result_small[n]); + spin_unlock(&lock, 1); + } + + + } - } + + snrt_barrier(&sw_barrier, snrt_cluster_core_num()); + } if (snrt_cluster_core_idx() == 0 || snrt_cluster_core_idx() == 4 || snrt_cluster_core_idx() == 8 || snrt_cluster_core_idx() == 12) { spin_lock(&lock, 1); - printf("%u reached end of second stage\n", snrt_cluster_tile_idx()); + printf("%u reached end\n", snrt_cluster_tile_idx()); spin_unlock(&lock, 1); } + // We use sw barrier for now, since crash happens if cores not synchronized at the end // when implementing multiple barriers, we can assign one to 1 & 3 and another for