dendibakh · dendibakh · May 26, 2026 · May 24, 2026
diff --git a/labs/core_bound/compiler_intrinsics_1/README.md b/labs/core_bound/compiler_intrinsics_1/README.md
@@ -94,7 +94,7 @@ We use `_mm_set1_epi16` to create eight copies of our 16-bit `currentSum` in a 1
 _broadcasting_ the input.
 
 ```c++
-__m128_i current = _mm_set1_epi16(currentSum); // create vector of eight copies of currentSum
+__m128i current = _mm_set1_epi16(currentSum); // create vector of eight copies of currentSum
 ```
 
 We then process eight elements of our data per iteration. Here is how we convert the required values:

diff --git a/labs/core_bound/dep_chains_2/README.md b/labs/core_bound/dep_chains_2/README.md
@@ -117,7 +117,7 @@ void randomParticleMotion(std::vector<Particle>& particles, uint32_t seed) {
       particles[j+1].y += sine(angle_rad2) * particles[j+1].velocity;
     }
     if (sz % 2) {
-      angle = rng.gen();
+      uint32_t angle = rng.gen();
       float angle_rad = angle * DEGREE_TO_RADIAN;
       particles[sz-1].x += cosine(angle_rad) * particles[sz-1].velocity;
       particles[sz-1].y += sine(angle_rad) * particles[sz-1].velocity;

diff --git a/labs/core_bound/vectorization_2/README.md b/labs/core_bound/vectorization_2/README.md
@@ -112,7 +112,7 @@ uint32_t acc = 0;
 
 Next, we want to add as many integers from `blob` as possible in chunks so that we get no more than one 32-bit overflow
 per chunk of additions. This is quite easy to achieve; the largest value we will encounter from blob is 2<sup>16</sup> - 1, so
-if we never process more than 2<sup>15</sup> (`1 << 16`) numbers at a time, we will not cause the 32-bit accumulator
+if we never process more than 2<sup>16</sup> (`1 << 16`) numbers at a time, we will not cause the 32-bit accumulator
 to overflow more than once.
 
 We therefore introduce a chunk to the loop:
@@ -133,8 +133,8 @@ we know we have seen an unsigned integer overflow, and we increment `acc` by `1`
 
 ```c++
 constexpr size_t two_pow_16 = (1 << 16);
+uint32_t acc = 0, prev = 0;
 for (size_t i = 0; i < N; i += two_pow_16) {
-    uint32_t acc = 0, prev = 0;
     for (size_t j = i; j < i + two_pow_16 && j < N; j++) {
         acc += blob[j];
     }

diff --git a/labs/memory_bound/huge_pages_1/README.md b/labs/memory_bound/huge_pages_1/README.md
@@ -73,7 +73,7 @@ as these are not characteristics of a cache-friendly algorithm.
 
 ### Solution (Linux)
 
-The problem can be alleviated slightly through the use of huge tables. Before explaining what these are
+The problem can be alleviated slightly through the use of huge pages. Before explaining what these are
 and how they work, I'll add a brief explanation of paging and virtual memory.
 
 When a process (such as the program of our algorithm) is run by the operating system (OS),
@@ -132,7 +132,7 @@ inline auto allocateDoublesArray(size_t size) {
   auto pages_needed = total_array_size / page_2mb + (total_array_size % page_2mb != 0);
   size_t total_to_alloc = pages_needed * page_2mb;
 
-  void* ptr = mmap(nullptr, total_to_alloc, PROT_READ | PROT_WRITE | PROT_EXEC,
+  void* ptr = mmap(nullptr, total_to_alloc, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1 , 0);
   if (ptr == MAP_FAILED) {
     throw std::bad_alloc{};

diff --git a/labs/memory_bound/loop_interchange_1/README.md b/labs/memory_bound/loop_interchange_1/README.md
@@ -117,7 +117,7 @@ void multiply(Matrix &result, const Matrix &a, const Matrix &b) {
 
 The memory access pattern for the matrix `b` is suboptimal. The innermost loop iterates over `k`, which ends up jumping
 from row to row in the matrix; in terms of memory, the processor has to jump around in memory to go from `b[k][j]` to
-`b[k+1]j`. We want to rewrite this loop so that we keep the row fixed and iterate over each column in `a` and `b`, as
+`b[k+1][j]`. We want to rewrite this loop so that we keep the row fixed and iterate over each column in `a` and `b`, as
 this will traverse memory contiguously.
 
 To do this, we simply swap the iterators over `j` with those over `k`:

diff --git a/labs/memory_bound/mem_alignment_1/README.md b/labs/memory_bound/mem_alignment_1/README.md
@@ -120,7 +120,7 @@ multiple of the number of elements of `float` that will fit on a cache line.
 To do this, we create a function in `solution.cpp`:
 
 ```c++
-int get_next_multiple_of_elems_in_cache_line(int N) {
+int get_next_multiple(int N) {
   const auto y = ELEMS_PER_CACHE_LINE - 1;
   return N + y & ~y;
 }
@@ -134,7 +134,7 @@ inline constexpr int ELEMS_PER_CACHE_LINE = CACHELINE_SIZE / sizeof(float);
 
 `CACHELINE_SIZE` is a constant provided by the existing code.
 
-`get_next_multiple_of_cache_line` will return the first multiple of `ELEMS_PER_CACHE_LINE` that is greater than or
+`get_next_multiple` will return the first multiple of `ELEMS_PER_CACHE_LINE` that is greater than or
 equal to `N`, where `N` will be the number of columns in our matrix.
 
 Finally, we change the function `n_columns`:

diff --git a/labs/memory_bound/swmem_prefetch_1/README.md b/labs/memory_bound/swmem_prefetch_1/README.md
@@ -155,16 +155,28 @@ Then, let's rewrite our loop in `solution.cpp`:
 static constexpr auto prefetch_step = 16;
 int solution(const hash_map_t *hash_map, const std::vector<int> &lookups) {
   int result = 0;
+  const auto size = lookups.size(); 
+
+  if (size <= prefetch_step) {
+    for (std::size_t i = 0; i < size; i++) {
+      if (const int val = lookups[i]; hash_map->find(val)) {
+        result += getSumOfDigits(val);
+      }
+    }
+    return result;
+  }
 
-  for (auto i = 0; i + prefetch_step < lookups.size(); i++) {
-    if (const int val = lookups[i]; hash_map->find(val))
+  for (auto i = 0; i + prefetch_step < size; i++) {
+    if (const int val = lookups[i]; hash_map->find(val)) {
       result += getSumOfDigits(val);
+    }
     hash_map->prefetch_find(lookups[i + prefetch_step]);
   }
 
-  for (auto i = lookups.size() - prefetch_step; i < lookups.size(); i++) {
-    if (const int val = lookups[i]; hash_map->find(val))
+  for (auto i = size - prefetch_step; i < size; i++) {
+    if (const int val = lookups[i]; hash_map->find(val)) {
       result += getSumOfDigits(val);
+    }
   }
 
   return result;