diff --git a/software/tests/partial_barrier_benchmark/main.c b/software/tests/partial_barrier_benchmark/main.c
index da3dfe8..cb002c0 100644
--- a/software/tests/partial_barrier_benchmark/main.c
+++ b/software/tests/partial_barrier_benchmark/main.c
@@ -1,7 +1,6 @@
-// This tiny test tests the partial barrier implementation by
-// setting the participation mask only for tiles 1 & 3.
-// Tiles 0 & 2 should be able to reach end of programs
-// before 1 & 3 finish the work and pass the partial barrier.
+// Synthetic benchmark for the partial barrier implementation.
+// Tiles 0 and 2 perform four small reductions, with a hardware barrier after each one.
+// Tiles 1 and 3 perform one big reduction.
 // Author: Luca Caballero Cusin <lcaballero@student.ethz.ch>
 
 #include <benchmark.h>
@@ -10,10 +9,21 @@
 #include "spin_lock.h"
 
 static uint32_t result __attribute__((section(".data")));
+static uint32_t result_big __attribute__((section(".data")));
+static uint32_t result_small[4] __attribute__((section(".data")));
 static uint32_t printed __attribute__((section(".data")));
+static uint32_t printed2 __attribute__((section(".data")));
 static struct snrt_barrier sw_barrier __attribute__((section(".data")));
 spinlock_t lock;
 
+
+static uint32_t v_big[256] __attribute__((section(".data")));
+static uint32_t v_small[64] __attribute__((section(".data")));
+
+// Define PARTIAL to use the partial barrier (tiles 0 and 2 only); remove the define to use the full barrier.
+#define PARTIAL
+
+
 int main()
 {
     volatile uint32_t *participation = (volatile uint32_t *)_snrt_barrier_participation_mask_reg_ptr();
@@ -23,89 +33,132 @@ int main()
     *participation = 0b1111; // all tiles participate
 
     result = 0;
+    printed = 0;
     snrt_cluster_hw_barrier();
 
-    // atomic test-and-set: returns old value
-    if (__atomic_fetch_add(&printed, 1, __ATOMIC_RELAXED) == 0) {
-        spin_lock(&lock, 1);
-        printf("setting participation for tiles 1 and 3\n");
-        spin_unlock(&lock, 1);
-    }
+
 
     // set participation mask
-    *participation = 0b1010; // tiles 1 and 3 only
+    #ifdef PARTIAL
+        *participation = 0b0101; // tiles 0 and 2 only
+    #endif
+    
 
     // only tiles 1 and 3
     if (snrt_cluster_tile_idx() == 1 || snrt_cluster_tile_idx() == 3)
     {
+        int local=0;
         // work
+        for (int i=0;i<256;i++){
+            v_big[i]=1;
+
+        }
+        #ifdef DEBUG
         spin_lock(&lock, 1);
         printf("core %u in tile %u reached work loop\n", snrt_cluster_core_idx(), snrt_cluster_tile_idx());
         spin_unlock(&lock, 1);
+        #endif
+        if (snrt_cluster_tile_idx()==1)
+            for (int i=0; i< 32 ;i++){
+            local+=v_big[(snrt_cluster_core_idx()%4)*32+i];
 
-        spin_lock(&lock, 1);
-        result += snrt_cluster_core_idx();
-        spin_unlock(&lock, 1);
+        }
+
+        if (snrt_cluster_tile_idx()==3)
+            for (int i=0; i<32  ;i++){
+            local+=v_big[(snrt_cluster_core_idx()%4)*32+i+128];
 
-        snrt_cluster_hw_barrier();
-        
-        if (snrt_cluster_core_idx() == 4)
-        {
-            spin_lock(&lock, 1);
-            printf("tiles done working, result (expected = 76) = %u\n", result);
-            spin_unlock(&lock, 1);
         }
-    }
 
-    if (snrt_cluster_core_idx() == 0 || snrt_cluster_core_idx() == 4 || snrt_cluster_core_idx() == 8 || snrt_cluster_core_idx() == 12)
-    {
         spin_lock(&lock, 1);
-        printf("%u reached end of first stage\n", snrt_cluster_tile_idx());
+        result_big += local;
         spin_unlock(&lock, 1);
-    }
-    printed=0;
 
-    snrt_barrier(&sw_barrier, snrt_cluster_core_num());
+        #ifdef PARTIAL
+            
+        #else
+            // full barrier only: tiles 1 and 3 must join each of the 4 hw barriers that tiles 0 and 2 wait on, or those tiles cannot continue
+            snrt_cluster_hw_barrier();
+            if (__atomic_fetch_add(&printed, 1, __ATOMIC_RELAXED) == 0) {
+                spin_lock(&lock, 1);
+                printf("1 and 3 reached barrier\n");
+                spin_unlock(&lock, 1);
+            }
+            snrt_cluster_hw_barrier();
+            snrt_cluster_hw_barrier();
+            snrt_cluster_hw_barrier();
+            
+        #endif
+        printed=0;
+        snrt_barrier(&sw_barrier, snrt_cluster_core_num());
+
+        if (__atomic_fetch_add(&printed, 1, __ATOMIC_RELAXED) == 0) {
+            spin_lock(&lock, 1);
+            printf("1 and 3 done with big red , result is : %d  , expected :256 \n",result_big);
+            spin_unlock(&lock, 1);
+        }
 
-    if (__atomic_fetch_add(&printed, 1, __ATOMIC_RELAXED) == 0) {
-        spin_lock(&lock, 1);
-        printf("setting participation for tiles 0 and 2\n");
-        spin_unlock(&lock, 1);
+        
+        
     }
 
-    // set participation mask
-    *participation = 0b0101; // tiles 0 and 2 only
-
     // only tiles 0 and 2
     if (snrt_cluster_tile_idx() == 0 || snrt_cluster_tile_idx() == 2)
     {
-        // work
-        spin_lock(&lock, 1);
-        printf("core %u in tile %u reached work loop\n", snrt_cluster_core_idx(), snrt_cluster_tile_idx());
-        spin_unlock(&lock, 1);
+        for (int i=0;i<64;i++){
+            v_small[i]=1;
+        }
+        for (int n=0;n<4;n++){
+            int local=0;
+            // work
+            #ifdef DEBUG
+            spin_lock(&lock, 1);
+            printf("core %u in tile %u reached work loop\n", snrt_cluster_core_idx(), snrt_cluster_tile_idx());
+            spin_unlock(&lock, 1);
+            #endif
+            if (snrt_cluster_tile_idx()==0)
+                for (int i=0; i< 8 ;i++){
+                local+=v_small[(snrt_cluster_core_idx()%4)*8+i];
 
-        spin_lock(&lock, 1);
-        result += snrt_cluster_core_idx();
-        spin_unlock(&lock, 1);
+            }
+
+            if (snrt_cluster_tile_idx()==2)
+                for (int i=0; i< 8 ;i++){
+                local+=v_small[(snrt_cluster_core_idx()%4)*8+i+32];
+
+            }
 
-        snrt_cluster_hw_barrier();
-        
-        if (snrt_cluster_core_idx() == 4)
-        {
             spin_lock(&lock, 1);
-            printf("tiles done working, result (expected = 76) = %u\n", result);
+            result_small[n] += local;
             spin_unlock(&lock, 1);
+
+            printed2=0;
+
+            // with the full barrier, tiles 0 and 2 wait here for tiles 1 and 3 (in the first iteration: until the big reduction has finished)
+            snrt_cluster_hw_barrier();
+
+            if (__atomic_fetch_add(&printed2, 1, __ATOMIC_RELAXED) == 0) {
+                spin_lock(&lock, 1);
+                printf("0 and 2 done with small red %d , result is : %d, expected :64  \n",n,result_small[n]);
+                spin_unlock(&lock, 1);
+            }
+
+
+            
         }
-    }
 
+        
+        snrt_barrier(&sw_barrier, snrt_cluster_core_num());
 
+    }
 
     if (snrt_cluster_core_idx() == 0 || snrt_cluster_core_idx() == 4 || snrt_cluster_core_idx() == 8 || snrt_cluster_core_idx() == 12)
     {
         spin_lock(&lock, 1);
-        printf("%u reached end of second stage\n", snrt_cluster_tile_idx());
+        printf("%u reached end\n", snrt_cluster_tile_idx());
         spin_unlock(&lock, 1);
     }
+  
 
     // We use sw barrier for now, since crash happens if cores not synchronized at the end
     // when implementing multiple barriers, we can assign one to 1 & 3 and another for