Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 102 additions & 49 deletions software/tests/partial_barrier_benchmark/main.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
// This tiny test tests the partial barrier implementation by
// setting the participation mask only for tiles 1 & 3.
// Tiles 0 & 2 should be able to reach end of programs
// before 1 & 3 finish the work and pass the partial barrier.
// Synthetic benchmark for partial barrier implementation.
// Tiles 0 and 2 perform multiple small reductions
// Tiles 1 and 3 perform one big reduction
// Author: Luca Caballero Cusin <lcaballero@student.ethz.ch>

#include <benchmark.h>
Expand All @@ -10,10 +9,21 @@
#include "spin_lock.h"

// Shared state for the benchmark. Everything below is placed in the
// ".data" section except `lock`.

// Shared accumulator: cores add their own core index to it while holding `lock`.
static uint32_t result __attribute__((section(".data")));
// Accumulator for the big reduction: per-core partial sums (`local`) are added
// to it while holding `lock`; it is printed against an expected value of 256.
static uint32_t result_big __attribute__((section(".data")));
// One accumulator per small-reduction iteration n (0..3); each one is printed
// against an expected value of 64.
static uint32_t result_small[4] __attribute__((section(".data")));
// Print-once flag: the core whose atomic fetch-add returns 0 prints the message.
static uint32_t printed __attribute__((section(".data")));
// Second print-once flag, used inside the small-reduction loop.
static uint32_t printed2 __attribute__((section(".data")));
// Software barrier object passed to snrt_barrier() together with
// snrt_cluster_core_num().
static struct snrt_barrier sw_barrier __attribute__((section(".data")));
// Spin lock taken around printf calls and updates of the shared accumulators.
spinlock_t lock;


// Input of the big reduction: set to all ones by tiles 1 and 3, then read in
// 32-element slices.
static uint32_t v_big[256] __attribute__((section(".data")));
// Input of the small reductions: set to all ones by tiles 0 and 2, then read in
// 8-element slices.
static uint32_t v_small[64] __attribute__((section(".data")));

// Choose between partial or full barrier
#define PARTIAL


int main()
{
volatile uint32_t *participation = (volatile uint32_t *)_snrt_barrier_participation_mask_reg_ptr();
Expand All @@ -23,89 +33,132 @@ int main()
*participation = 0b1111; // all tiles participate

result = 0;
printed = 0;
snrt_cluster_hw_barrier();

// atomic test-and-set: returns old value
if (__atomic_fetch_add(&printed, 1, __ATOMIC_RELAXED) == 0) {
spin_lock(&lock, 1);
printf("setting participation for tiles 1 and 3\n");
spin_unlock(&lock, 1);
}


// set participation mask
*participation = 0b1010; // tiles 1 and 3 only
#ifdef PARTIAL
*participation = 0b0101; // tiles 0 and 2 only
#endif


// only tiles 1 and 3
if (snrt_cluster_tile_idx() == 1 || snrt_cluster_tile_idx() == 3)
{
int local=0;
// work
for (int i=0;i<256;i++){
v_big[i]=1;

}
#ifdef DEBUG
spin_lock(&lock, 1);
printf("core %u in tile %u reached work loop\n", snrt_cluster_core_idx(), snrt_cluster_tile_idx());
spin_unlock(&lock, 1);
#endif
if (snrt_cluster_tile_idx()==1)
for (int i=0; i< 32 ;i++){
local+=v_big[(snrt_cluster_core_idx()%4)*32+i];

spin_lock(&lock, 1);
result += snrt_cluster_core_idx();
spin_unlock(&lock, 1);
}

if (snrt_cluster_tile_idx()==3)
for (int i=0; i<32 ;i++){
local+=v_big[(snrt_cluster_core_idx()%4)*32+i+128];

snrt_cluster_hw_barrier();

if (snrt_cluster_core_idx() == 4)
{
spin_lock(&lock, 1);
printf("tiles done working, result (expected = 76) = %u\n", result);
spin_unlock(&lock, 1);
}
}

if (snrt_cluster_core_idx() == 0 || snrt_cluster_core_idx() == 4 || snrt_cluster_core_idx() == 8 || snrt_cluster_core_idx() == 12)
{
spin_lock(&lock, 1);
printf("%u reached end of first stage\n", snrt_cluster_tile_idx());
result_big += local;
spin_unlock(&lock, 1);
}
printed=0;

snrt_barrier(&sw_barrier, snrt_cluster_core_num());
#ifdef PARTIAL

#else
// necessary for tiles 0 and 2 to continue
snrt_cluster_hw_barrier();
if (__atomic_fetch_add(&printed, 1, __ATOMIC_RELAXED) == 0) {
spin_lock(&lock, 1);
printf("1 and 3 reached barrier\n");
spin_unlock(&lock, 1);
}
snrt_cluster_hw_barrier();
snrt_cluster_hw_barrier();
snrt_cluster_hw_barrier();

#endif
printed=0;
snrt_barrier(&sw_barrier, snrt_cluster_core_num());

if (__atomic_fetch_add(&printed, 1, __ATOMIC_RELAXED) == 0) {
spin_lock(&lock, 1);
printf("1 and 3 done with big red , result is : %d , expected :256 \n",result_big);
spin_unlock(&lock, 1);
}

if (__atomic_fetch_add(&printed, 1, __ATOMIC_RELAXED) == 0) {
spin_lock(&lock, 1);
printf("setting participation for tiles 0 and 2\n");
spin_unlock(&lock, 1);


}

// set participation mask
*participation = 0b0101; // tiles 0 and 2 only

// only tiles 0 and 2
if (snrt_cluster_tile_idx() == 0 || snrt_cluster_tile_idx() == 2)
{
// work
spin_lock(&lock, 1);
printf("core %u in tile %u reached work loop\n", snrt_cluster_core_idx(), snrt_cluster_tile_idx());
spin_unlock(&lock, 1);
for (int i=0;i<64;i++){
v_small[i]=1;
}
for (int n=0;n<4;n++){
int local=0;
// work
#ifdef DEBUG
spin_lock(&lock, 1);
printf("core %u in tile %u reached work loop\n", snrt_cluster_core_idx(), snrt_cluster_tile_idx());
spin_unlock(&lock, 1);
#endif
if (snrt_cluster_tile_idx()==0)
for (int i=0; i< 8 ;i++){
local+=v_small[(snrt_cluster_core_idx()%4)*8+i];

spin_lock(&lock, 1);
result += snrt_cluster_core_idx();
spin_unlock(&lock, 1);
}

if (snrt_cluster_tile_idx()==2)
for (int i=0; i< 8 ;i++){
local+=v_small[(snrt_cluster_core_idx()%4)*8+i+32];

}

snrt_cluster_hw_barrier();

if (snrt_cluster_core_idx() == 4)
{
spin_lock(&lock, 1);
printf("tiles done working, result (expected = 76) = %u\n", result);
result_small[n] += local;
spin_unlock(&lock, 1);

printed2=0;

// if full barrier, has to wait for big reduction to finish
snrt_cluster_hw_barrier();

if (__atomic_fetch_add(&printed2, 1, __ATOMIC_RELAXED) == 0) {
spin_lock(&lock, 1);
printf("0 and 2 done with small red %d , result is : %d, expected :64 \n",n,result_small[n]);
spin_unlock(&lock, 1);
}



}
}


snrt_barrier(&sw_barrier, snrt_cluster_core_num());

}

if (snrt_cluster_core_idx() == 0 || snrt_cluster_core_idx() == 4 || snrt_cluster_core_idx() == 8 || snrt_cluster_core_idx() == 12)
{
spin_lock(&lock, 1);
printf("%u reached end of second stage\n", snrt_cluster_tile_idx());
printf("%u reached end\n", snrt_cluster_tile_idx());
spin_unlock(&lock, 1);
}


// We use a software barrier for now, since a crash occurs if the cores are not synchronized at the end;
// when implementing multiple barriers, we can assign one to 1 & 3 and another for
Expand Down
Loading