diff --git a/labs/core_bound/compiler_intrinsics_1/README.md b/labs/core_bound/compiler_intrinsics_1/README.md index bbc74935..c898217e 100644 --- a/labs/core_bound/compiler_intrinsics_1/README.md +++ b/labs/core_bound/compiler_intrinsics_1/README.md @@ -94,7 +94,7 @@ We use `_mm_set1_epi16` to create eight copies of our 16-bit `currentSum` in a 1 _broadcasting_ the input. ```c++ -__m128_i current = _mm_set1_epi16(currentSum); // create vector of eight copies of currentSum +__m128i current = _mm_set1_epi16(currentSum); // create vector of eight copies of currentSum ``` We then process eight elements of our data per iteration. Here is how we convert the required values: diff --git a/labs/core_bound/dep_chains_2/README.md b/labs/core_bound/dep_chains_2/README.md index 80c62469..7b856ad8 100644 --- a/labs/core_bound/dep_chains_2/README.md +++ b/labs/core_bound/dep_chains_2/README.md @@ -117,7 +117,7 @@ void randomParticleMotion(std::vector& particles, uint32_t seed) { particles[j+1].y += sine(angle_rad2) * particles[j+1].velocity; } if (sz % 2) { - angle = rng.gen(); + uint32_t angle = rng.gen(); float angle_rad = angle * DEGREE_TO_RADIAN; particles[sz-1].x += cosine(angle_rad) * particles[sz-1].velocity; particles[sz-1].y += sine(angle_rad) * particles[sz-1].velocity; diff --git a/labs/core_bound/vectorization_2/README.md b/labs/core_bound/vectorization_2/README.md index 27f67edb..d54457eb 100644 --- a/labs/core_bound/vectorization_2/README.md +++ b/labs/core_bound/vectorization_2/README.md @@ -112,7 +112,7 @@ uint32_t acc = 0; Next, we want to add as many integers from `blob` as possible in chunks so that we get no more than one 32-bit overflow per chunk of additions. 
This is quite easy to achieve; the largest value we will encounter from blob is 2^16 - 1, so -if we never process more than 2^15 (`1 << 16`) numbers at a time, we will not cause the 32-bit accumulator +if we never process more than 2^16 - 1 numbers at a time, we will not cause the 32-bit accumulator to overflow more than once. We therefore introduce a chunk to the loop: @@ -133,8 +133,8 @@ we know we have seen an unsigned integer overflow, and we increment `acc` by `1` ```c++ constexpr size_t two_pow_16 = (1 << 16); +uint32_t acc = 0, prev = 0; for (size_t i = 0; i < N; i += two_pow_16) { - uint32_t acc = 0, prev = 0; for (size_t j = i; j < i + two_pow_16 && j < N; j++) { acc += blob[j]; } diff --git a/labs/memory_bound/huge_pages_1/README.md b/labs/memory_bound/huge_pages_1/README.md index fb266188..1050fd52 100644 --- a/labs/memory_bound/huge_pages_1/README.md +++ b/labs/memory_bound/huge_pages_1/README.md @@ -73,7 +73,7 @@ as these are not characteristics of a cache-friendly algorithm. ### Solution (Linux) -The problem can be alleviated slightly through the use of huge tables. Before explaining what these are +The problem can be alleviated slightly through the use of huge pages. Before explaining what these are and how they work, I'll add a brief explanation of paging and virtual memory. 
When a process (such as the program of our algorithm) is run by the operating system (OS), @@ -132,7 +132,7 @@ inline auto allocateDoublesArray(size_t size) { auto pages_needed = total_array_size / page_2mb + (total_array_size % page_2mb != 0); size_t total_to_alloc = pages_needed * page_2mb; - void* ptr = mmap(nullptr, total_to_alloc, PROT_READ | PROT_WRITE | PROT_EXEC, + void* ptr = mmap(nullptr, total_to_alloc, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1 , 0); if (ptr == MAP_FAILED) { throw std::bad_alloc{}; diff --git a/labs/memory_bound/loop_interchange_1/README.md b/labs/memory_bound/loop_interchange_1/README.md index e13bc0c4..c9ce935b 100644 --- a/labs/memory_bound/loop_interchange_1/README.md +++ b/labs/memory_bound/loop_interchange_1/README.md @@ -117,7 +117,7 @@ void multiply(Matrix &result, const Matrix &a, const Matrix &b) { The memory access pattern for the matrix `b` is suboptimal. The innermost loop iterates over `k`, which ends up jumping from row to row in the matrix; in terms of memory, the processor has to jump around in memory to go from `b[k][j]` to -`b[k+1]j`. We want to rewrite this loop so that we keep the row fixed and iterate over each column in `a` and `b`, as +`b[k+1][j]`. We want to rewrite this loop so that we keep the row fixed and iterate over each column in `a` and `b`, as this will traverse memory contiguously. To do this, we simply swap the iterators over `j` with those over `k`: diff --git a/labs/memory_bound/mem_alignment_1/README.md b/labs/memory_bound/mem_alignment_1/README.md index a1166de5..8675b154 100644 --- a/labs/memory_bound/mem_alignment_1/README.md +++ b/labs/memory_bound/mem_alignment_1/README.md @@ -120,7 +120,7 @@ multiple of the number of elements of `float` that will fit on a cache line. 
To do this, we create a function in `solution.cpp`: ```c++ -int get_next_multiple_of_elems_in_cache_line(int N) { +int get_next_multiple(int N) { const auto y = ELEMS_PER_CACHE_LINE - 1; return N + y & ~y; } @@ -134,7 +134,7 @@ inline constexpr int ELEMS_PER_CACHE_LINE = CACHELINE_SIZE / sizeof(float); `CACHELINE_SIZE` is a constant provided by the existing code. -`get_next_multiple_of_cache_line` will return the first multiple of `ELEMS_PER_CACHE_LINE` that is greater than or +`get_next_multiple` will return the first multiple of `ELEMS_PER_CACHE_LINE` that is greater than or equal to `N`, where `N` will be the number of columns in our matrix. Finally, we change the function `n_columns`: diff --git a/labs/memory_bound/swmem_prefetch_1/README.md b/labs/memory_bound/swmem_prefetch_1/README.md index 844468ea..625278aa 100644 --- a/labs/memory_bound/swmem_prefetch_1/README.md +++ b/labs/memory_bound/swmem_prefetch_1/README.md @@ -155,16 +155,28 @@ Then, let's rewrite our loop in `solution.cpp`: static constexpr auto prefetch_step = 16; int solution(const hash_map_t *hash_map, const std::vector &lookups) { int result = 0; + const auto size = lookups.size(); + + if (size <= prefetch_step) { + for (std::size_t i = 0; i < size; i++) { + if (const int val = lookups[i]; hash_map->find(val)) { + result += getSumOfDigits(val); + } + } + return result; + } - for (auto i = 0; i + prefetch_step < lookups.size(); i++) { - if (const int val = lookups[i]; hash_map->find(val)) + for (auto i = 0; i + prefetch_step < size; i++) { + if (const int val = lookups[i]; hash_map->find(val)) { result += getSumOfDigits(val); + } hash_map->prefetch_find(lookups[i + prefetch_step]); } - for (auto i = lookups.size() - prefetch_step; i < lookups.size(); i++) { - if (const int val = lookups[i]; hash_map->find(val)) + for (auto i = size - prefetch_step; i < size; i++) { + if (const int val = lookups[i]; hash_map->find(val)) { result += getSumOfDigits(val); + } } return result;