Memory-Cache-Simulator/bench2.cpp at main · hritik0101/Memory-Cache-Simulator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
/*
 * bench2.cpp — CS204 Cache Stress Benchmark
 * IIT Ropar | 2025-26
 *
 * Aggressively stresses a cache simulator. Designed so a realistic L1
 * config (32 KB, 8-way, 64 B lines) will see miss rates in the 20-60%
 * range, not the <5% you get from dijkstra_bench.
 *
 * Four phases, each defeating a different cache strength:
 *
 *  1. POINTER CHASE
 *     A linked list with 4 million nodes (~128 MB) traversed in a random
 *     permutation. Every access goes to a different cache line. Prefetcher
 *     useless; spatial locality zero; working set >> any cache.
 *
 *  2. CONFLICT THRASH
 *     Touches addresses chosen so they all map to the SAME cache set on a
 *     typical 32 KB 8-way cache (stride = 4 KB = num_sets * block_size).
 *     Forces conflict misses regardless of total cache size. LRU and FIFO
 *     behave very differently here.
 *
 *  3. CACHE-OBLIVIOUS TRANSPOSE
 *     8192 x 8192 byte matrix transpose. Naive transpose has terrible
 *     spatial locality on one of the two matrices. Working set = 64 MB,
 *     dwarfing any L1/L2.
 *
 *  4. REUSE-DISTANCE LADDER
 *     Repeatedly scans arrays of increasing size: 16 KB, 64 KB, 256 KB,
 *     1 MB, 4 MB. Each round visits every element once. This exposes
 *     the capacity-miss boundary of your cache — hit rate should drop
 *     sharply when working set exceeds cache size.
 *
 * Build (with -O0 to keep all loads/stores intact):
 *   g++ -O0 -std=c++17 -o bench2 bench2.cpp
 *
 * Trace with PIN:
 *   pin -t obj-intel64/mem_trace.so -o bench2.trace -- ./bench2
 */

#include <iostream>
#include <vector>
#include <random>
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <cstring>

// Volatile sink — keeps the optimiser from eliding work, even at -O0
static volatile uint64_t g_sink = 0;

// ════════════════════════════════════════════════════════════════════════════
//  Phase 1: Pointer chase through a randomly-permuted linked list
//
//  Each node holds the index of the NEXT node to visit. The "next" indices
//  form a random permutation, so successive accesses land on randomly-
//  scattered cache lines. The CPU prefetcher cannot predict the pattern.
//
//  Node size is padded to 64 B (one cache line) so each visit forces a
//  fresh line fetch — no spatial locality benefit.
// ════════════════════════════════════════════════════════════════════════════
struct alignas(64) Node {
    uint32_t next;
    uint32_t payload;
    char     pad[56];   // pad to 64 B
};

static void pointer_chase() {
    constexpr size_t N = 2 * 1024 * 1024;   // 2M nodes * 64 B = 128 MB

    std::cerr << "  allocating " << (N * sizeof(Node) / (1024*1024))
              << " MB for pointer chase...\n";
    std::vector<Node> nodes(N);

    // Build a random cycle visiting every node exactly once
    std::vector<uint32_t> perm(N);
    for (size_t i = 0; i < N; ++i) perm[i] = i;
    std::mt19937 rng(12345);
    std::shuffle(perm.begin() + 1, perm.end(), rng);

    // perm[i] is the i-th node visited; nodes[perm[i]].next = perm[i+1]
    for (size_t i = 0; i < N - 1; ++i)
        nodes[perm[i]].next = perm[i + 1];
    nodes[perm[N - 1]].next = perm[0];   // close the cycle

    // Chase pointers — 4 full laps for repeated reuse
    uint32_t cur = 0;
    for (int lap = 0; lap < 4; ++lap) {
        for (size_t i = 0; i < N; ++i) {
            g_sink += nodes[cur].payload;
            cur = nodes[cur].next;
        }
    }
}

// ════════════════════════════════════════════════════════════════════════════
//  Phase 2: Conflict thrash
//
//  For a 32 KB 8-way cache with 64 B lines:
//      num_sets = 32768 / (8 * 64) = 64
//      stride for same-set access = num_sets * 64 = 4096 B = 4 KB
//
//  Touching addresses 0, 4 KB, 8 KB, 12 KB, ... all map to set 0.
//  Touching 16 such addresses in a round-robin forces eviction on an
//  8-way cache regardless of how big the cache is overall.
//
//  This is the classic conflict-miss pattern. Direct-mapped caches die
//  here; high-associativity caches survive better; replacement policy
//  matters a lot when the round-robin set exceeds associativity.
// ════════════════════════════════════════════════════════════════════════════
static void conflict_thrash() {
    constexpr size_t STRIDE = 4096;       // bytes, hits same set on 32K 8-way
    constexpr size_t WAYS   = 16;         // intentionally > 8 to cause eviction
    constexpr size_t TOTAL  = STRIDE * WAYS;   // 64 KB

    std::cerr << "  allocating " << (TOTAL / 1024)
              << " KB for conflict thrash...\n";

    // aligned_alloc to a page boundary so set indices are predictable
    void* raw = std::aligned_alloc(4096, TOTAL);
    auto* buf = static_cast<volatile uint64_t*>(raw);

    // Initialise
    for (size_t i = 0; i < TOTAL / sizeof(uint64_t); ++i)
        buf[i] = i;

    // Hammer the same set: visit WAYS addresses, all aligned to STRIDE,
    // round-robin for many rounds
    for (int round = 0; round < 50000; ++round) {
        for (size_t w = 0; w < WAYS; ++w) {
            size_t idx = (w * STRIDE) / sizeof(uint64_t);
            g_sink += buf[idx];
        }
    }

    std::free(raw);
}

// ════════════════════════════════════════════════════════════════════════════
//  Phase 3: Naive matrix transpose, 8192 x 8192 bytes (64 MB)
//
//  A[i][j] read is sequential (good spatial locality).
//  B[j][i] write strides by 8192 bytes (one column step) — terrible.
//  Each write touches a new cache line; with 64 MB total, nothing stays
//  cached across iterations.
// ════════════════════════════════════════════════════════════════════════════
static void big_transpose() {
    constexpr size_t DIM = 8192;     // 8192 * 8192 = 64 MB

    std::cerr << "  allocating " << (2 * DIM * DIM / (1024*1024))
              << " MB for transpose...\n";

    auto* A = static_cast<uint8_t*>(std::malloc(DIM * DIM));
    auto* B = static_cast<uint8_t*>(std::malloc(DIM * DIM));

    // Init A — sequential, cache-friendly, just to bring it into the trace
    for (size_t i = 0; i < DIM * DIM; ++i) A[i] = uint8_t(i & 0xFF);

    // Naive transpose: B[j][i] = A[i][j]
    // Reads of A are row-major (friendly); writes to B stride by DIM (hostile)
    for (size_t i = 0; i < DIM; ++i)
        for (size_t j = 0; j < DIM; ++j)
            B[j * DIM + i] = A[i * DIM + j];

    // Consume — read transpose result column-major (now A is hostile)
    for (size_t j = 0; j < DIM; ++j)
        for (size_t i = 0; i < DIM; ++i)
            g_sink += B[i * DIM + j];

    std::free(A);
    std::free(B);
}

// ════════════════════════════════════════════════════════════════════════════
//  Phase 4: Reuse-distance ladder
//
//  Repeatedly scans arrays of geometrically increasing size. Each array
//  is scanned 8 times in a row, so the second-through-eighth scans should
//  HIT if the array fits in cache. Once the array exceeds cache size, hits
//  collapse to near zero.
//
//  This is the standard memory-mountain / cache-size probe pattern.
//  A clean run reveals the cache size as a step in the hit-rate curve.
// ════════════════════════════════════════════════════════════════════════════
static void reuse_ladder() {
    // Sizes in bytes: 16K, 64K, 256K, 1M, 4M, 16M
    constexpr size_t sizes[] = {
        16 * 1024,
        64 * 1024,
        256 * 1024,
        1 * 1024 * 1024,
        4 * 1024 * 1024,
        16 * 1024 * 1024,
    };

    for (size_t s : sizes) {
        std::cerr << "  ladder rung: " << (s / 1024) << " KB\n";
        size_t n_u64 = s / sizeof(uint64_t);
        std::vector<uint64_t> buf(n_u64);
        for (size_t i = 0; i < n_u64; ++i) buf[i] = i;

        // 8 sequential passes through this rung
        for (int pass = 0; pass < 8; ++pass) {
            for (size_t i = 0; i < n_u64; ++i) {
                g_sink += buf[i];
            }
        }
    }
}

// ════════════════════════════════════════════════════════════════════════════
//  main
// ════════════════════════════════════════════════════════════════════════════
int main() {
    std::cerr << "[1/4] Pointer chase (128 MB, random permutation)...\n";
    pointer_chase();

    std::cerr << "[2/4] Conflict thrash (4 KB stride, 16-way round-robin)...\n";
    conflict_thrash();

    std::cerr << "[3/4] Naive transpose (8192 x 8192, 64 MB)...\n";
    big_transpose();

    std::cerr << "[4/4] Reuse-distance ladder (16 KB to 16 MB)...\n";
    reuse_ladder();

    std::cerr << "[done] sink = " << g_sink << "\n";
    return 0;
}