-
Notifications
You must be signed in to change notification settings - Fork 490
Expand file tree
/
Copy pathvx_start.S
More file actions
265 lines (242 loc) · 8.18 KB
/
vx_start.S
File metadata and controls
265 lines (242 loc) · 8.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <VX_types.h>
#include <newlib.h>
#include "common.h"
#ifdef VX_CFG_VM_ENABLE
// Mode encodings used by the SATP CSR. Defined locally to avoid pulling
// sim/common/mem.h into the kernel build (these enum values are also
// declared there).
//
// The values follow the RISC-V privileged-spec satp.MODE field. In the code
// visible in this file they are only compared against VX_VM_ADDR_MODE in
// `#if` tests; the MODE bits actually written to satp are built separately
// with li/slli in the SATP setup sequences.
//
// BARE: satp.MODE = 0, no address translation.
#define BARE 0
// SV32: satp.MODE = 1 (the single MODE bit, bit 31, of the RV32 satp).
#define SV32 1
// SV39: satp.MODE = 8 (MODE field in bits 63:60 of the RV64 satp).
#define SV39 8
#endif
.section .init, "ax"
.global _start
.type _start, @function
_start:
#ifdef KMU_ENABLE
// ===========================================================================
// KMU entry model.
//
// _start and __vx_cta_entry label the same address: the single per-CTA
// entry routine. It brings the warp up to a C-callable state
// (SATP / gp / sp / tp+TLS / global ctors, each under its own build flag)
// and then runs the dispatch window at the bottom, which
//   * reads the kernel entry address from VX_CSR_CTA_ENTRY into s11,
//   * reads the kernel-arguments pointer from VX_CSR_MSCRATCH into a0,
//   * calls the kernel through s11, then drains (wsync) and retires the
//     warp (tmc x0).
//
// Register use: t0/t1 are scratch; sp is always set, gp only with NEED_GP,
// tp only with NEED_TLS; s11, a0 and ra are written by the dispatch window.
//
// NOTE(review): an earlier version of this comment described per-kernel
// entry stubs (`lla s11, <kernel> ; j __vx_cta_entry`) and a weak
// `kernel_main` fallback, with the kernel address handed over in s11. The
// code below overwrites s11 from VX_CSR_CTA_ENTRY immediately before the
// call and declares no weak kernel_main, so a caller-supplied s11 is not
// consumed by this routine. Confirm against the KMU / .vxbin loader whether
// such stubs still exist.
// ===========================================================================
    .global __vx_cta_entry
    .type   __vx_cta_entry, @function
__vx_cta_entry:
#ifdef VX_CFG_VM_ENABLE
    # Program SATP with the page-table base PPN and addressing mode. The
    # runtime has already populated the page table at VX_MEM_PAGE_TABLE_BASE_ADDR
    # before launching this kernel; we just point the per-core MMU at it.
    # STACK and PT regions are MMU-bypass per VMManager::need_trans, so it
    # is safe to do this before the stack is set up.
    # satp = (VX_MEM_PAGE_TABLE_BASE_ADDR >> VX_VM_PAGE_LOG2_SIZE) | mode bit.
    # For any other VX_VM_ADDR_MODE value satp is left untouched.
#if VX_VM_ADDR_MODE == SV39
    li      t0, VX_MEM_PAGE_TABLE_BASE_ADDR
    srli    t0, t0, VX_VM_PAGE_LOG2_SIZE
    li      t1, 1
    slli    t1, t1, 63                      # bit 63 set: MODE field = 8
    or      t0, t0, t1
    csrw    satp, t0
#elif VX_VM_ADDR_MODE == SV32
    li      t0, VX_MEM_PAGE_TABLE_BASE_ADDR
    srli    t0, t0, VX_VM_PAGE_LOG2_SIZE
    li      t1, 1
    slli    t1, t1, 31                      # bit 31 set: MODE field = 1
    or      t0, t0, t1
    csrw    satp, t0
#endif
#endif
#ifdef NEED_GP
    # set global pointer register. Linker relaxation is switched off around
    # this load, matching init_regs on the non-KMU path, so the load cannot
    # be relaxed into a gp-relative form that would read gp before it holds
    # a valid value.
    .option push
    .option norelax
    la      gp, __global_pointer
    .option pop
#endif
    # set stack pointer register:
    # sp = VX_MEM_STACK_BASE_ADDR - (mhartid << VX_MEM_STACK_LOG2_SIZE)
    LOAD_IMMEDIATE64(sp, VX_MEM_STACK_BASE_ADDR)
    csrr    t0, VX_CSR_MHARTID
    sll     t1, t0, VX_MEM_STACK_LOG2_SIZE
    sub     sp, sp, t1
#ifdef NEED_TLS
    # set thread pointer register: each hart gets its own TLS image in the
    # address space after the BSS region. The image spans .tdata + .tbss,
    # so the per-hart stride is __tls_block_size (= __tbss_offset +
    # __tbss_size), not __tbss_size alone; otherwise the TLS images of
    # adjacent harts overlap by __tbss_offset bytes. __tls_block_size is a
    # small absolute linker symbol; address it with lui+addi rather than la,
    # whose medany PC-relative expansion overflows on RV64.
    # tp = _end + mhartid * __tls_block_size (t0 still holds mhartid here).
    lui     t1, %hi(__tls_block_size)
    addi    t1, t1, %lo(__tls_block_size)
    mul     t0, t0, t1
    la      tp, _end
    add     tp, tp, t0
    # initialize TLS for this warp (__init_tls is defined elsewhere)
    call    __init_tls
#endif
#ifdef NEED_INITFINI
    # run global initialization functions
    call    __libc_init_array
#endif
    # Per-CTA dispatch window: the scheduler re-enters here for each new CTA on
    # this warp slot by rewinding PC over this exact 5-instruction (20-byte)
    # block. Read the kernel entry and kargs pointer from their CSRs (both
    # re-supplied per CTA by the KMU), then call.
    # Do not add, remove or resize instructions between here and tmc x0.
    csrr    s11, VX_CSR_CTA_ENTRY
    csrr    a0, VX_CSR_MSCRATCH
    # temporarily disable RVC to force a 4-byte call: the jalr is the only
    # compressible instruction in the window; the csrr/wsync/tmc are always
    # 4 bytes.
    .option push
    .option norvc
    jalr    ra, s11
    .option pop
    # drain all pending instructions this warp issued before retiring warp
    .insn r RISCV_CUSTOM0, 7, 0, x0, x0, x0 # wsync
    # shutdown warp.
    .insn r RISCV_CUSTOM0, 0, 0, x0, x0, x0 # tmc x0
    .size   __vx_cta_entry, .-__vx_cta_entry
    # _start aliases __vx_cta_entry, so give it the same extent; the non-KMU
    # branch sets the size of _start separately.
    .size   _start, .-_start
#else
// Non-KMU boot path of _start. Sequence:
//   - spawn the warps at init_regs_all and run init_regs on this warp,
//   - (NEED_TLS) spawn the warps at init_tls_all and run __init_tls here,
//   - (NEED_INITFINI) run __libc_init_array,
//   - call main, then tail-call exit.
// t0/t1 are scratch throughout.
    # initialize per-thread registers
    la      t1, init_regs_all                       # entry PC for the spawned warps
    csrr    t0, VX_CSR_NUM_WARPS                    # get num warps
    .insn r RISCV_CUSTOM0, 1, 0, zero, t0, t1       # wspawn t0, t1
    # tmc with an all-ones mask (presumably enables every thread of this
    # warp) so that init_regs runs on each of them
    li      t0, -1
    .insn r RISCV_CUSTOM0, 0, 0, zero, t0, zero     # tmc t0
    jal     ra, init_regs
    # back to mask 1 (presumably a single active thread)
    li      t0, 1
    .insn r RISCV_CUSTOM0, 0, 0, zero, t0, zero     # tmc t0
#ifdef NEED_TLS
    # initialize TLS for all warps, same spawn / widen / narrow pattern
    la      t1, init_tls_all                        # entry PC for the spawned warps
    csrr    t0, VX_CSR_NUM_WARPS                    # get num warps
    .insn r RISCV_CUSTOM0, 1, 0, zero, t0, t1       # wspawn t0, t1
    li      t0, -1
    .insn r RISCV_CUSTOM0, 0, 0, zero, t0, zero     # tmc t0
    call    __init_tls
    li      t0, 1
    .insn r RISCV_CUSTOM0, 0, 0, zero, t0, zero     # tmc t0
#endif
#ifdef NEED_INITFINI
    # run global initialization functions
    call    __libc_init_array
#endif
    # run the program entry point
    call    main
    # hand over to exit as a tail call; control does not come back here
    tail    exit
    .size   _start, .-_start
.section .text
// _Exit: publish the exit code and retire the calling warp.
//   In:    a0 = exit code
//   Clobb: t0
// Stores a0 as a 32-bit word at VX_MEM_IO_EXIT_CODE, then issues wsync
// followed by tmc x0. No return sequence follows the tmc.
    .global _Exit
    .type   _Exit, @function
_Exit:
    li      t0, VX_MEM_IO_EXIT_CODE                 # t0 = address of the exit-code word
    sw      a0, 0(t0)
    # wait for everything this warp has issued before it is retired
    .insn r RISCV_CUSTOM0, 7, 0, zero, zero, zero   # wsync
    # shutdown warp
    .insn r RISCV_CUSTOM0, 0, 0, zero, zero, zero   # tmc x0
.section .text
// init_regs: per-thread register setup for the non-KMU boot path.
//   Writes: satp (VX_CFG_VM_ENABLE, SV39 or SV32 only), gp (NEED_GP),
//           sp (always), tp (NEED_TLS)
//   Clobb:  t0, t1
//   Returns to the caller through ra.
    .local  init_regs
    .type   init_regs, @function
init_regs:
#ifdef VX_CFG_VM_ENABLE
    # SATP is per-core, but each warp on each core writes the same value
    # idempotently. This keeps the non-KMU init path symmetric with _start,
    # so spawned warps do not rely on an earlier write by warp 0 being
    # visible before any user-VA access they make.
    # satp = (VX_MEM_PAGE_TABLE_BASE_ADDR >> VX_VM_PAGE_LOG2_SIZE) | mode bit
#if (VX_VM_ADDR_MODE == SV39) || (VX_VM_ADDR_MODE == SV32)
    li      t0, VX_MEM_PAGE_TABLE_BASE_ADDR
    srli    t0, t0, VX_VM_PAGE_LOG2_SIZE
    li      t1, 1
#if VX_VM_ADDR_MODE == SV39
    slli    t1, t1, 63                      # mode bit for SV39
#else
    slli    t1, t1, 31                      # mode bit for SV32
#endif
    or      t0, t0, t1
    csrw    satp, t0
#endif
#endif
#ifdef NEED_GP
    # set global pointer register, with linker relaxation disabled for
    # this one load
    .option push
    .option norelax
    la      gp, __global_pointer
    .option pop
#endif
    # set stack pointer register:
    # sp = VX_MEM_STACK_BASE_ADDR - (mhartid << VX_MEM_STACK_LOG2_SIZE)
    LOAD_IMMEDIATE64(sp, VX_MEM_STACK_BASE_ADDR)
    csrr    t0, VX_CSR_MHARTID
    slli    t1, t0, VX_MEM_STACK_LOG2_SIZE
    sub     sp, sp, t1
#ifdef NEED_TLS
    # set thread pointer register: per-hart TLS image after the BSS region.
    # The stride is __tls_block_size (__tbss_offset + __tbss_size), not
    # __tbss_size alone; the symbol is loaded with lui+addi as an absolute
    # value. tp = _end + mhartid * __tls_block_size (t0 still holds mhartid).
    lui     t1, %hi(__tls_block_size)
    addi    t1, t1, %lo(__tls_block_size)
    mul     t0, t0, t1
    la      tp, _end
    add     tp, tp, t0
#endif
    ret
.section .text
// init_regs_all: address handed to wspawn by _start on the non-KMU path.
// Issues tmc with an all-ones mask, runs init_regs, then issues tmc x0
// (the warp-shutdown encoding used elsewhere in this file).
//   Clobb: t0, ra, plus whatever init_regs clobbers
    .local  init_regs_all
    .type   init_regs_all, @function
init_regs_all:
    li      t0, -1
    .insn r RISCV_CUSTOM0, 0, 0, zero, t0, zero     # tmc t0
    jal     ra, init_regs
    .insn r RISCV_CUSTOM0, 0, 0, zero, zero, zero   # tmc x0
    # NOTE(review): presumably never reached once tmc x0 has retired the
    # warp; ra was also overwritten by the jal above. Kept as in the original.
    ret
#ifdef NEED_TLS
.section .text
// init_tls_all: address handed to wspawn by _start on the non-KMU path when
// NEED_TLS is set. Issues tmc with an all-ones mask, calls __init_tls
// (defined elsewhere), then issues tmc x0 (the warp-shutdown encoding used
// elsewhere in this file).
//   Clobb: t0, ra, plus whatever __init_tls clobbers
    .local  init_tls_all
    .type   init_tls_all, @function
init_tls_all:
    li      t0, -1
    .insn r RISCV_CUSTOM0, 0, 0, zero, t0, zero     # tmc t0
    call    __init_tls
    .insn r RISCV_CUSTOM0, 0, 0, zero, zero, zero   # tmc x0
    # NOTE(review): presumably never reached once tmc x0 has retired the
    # warp; ra was also overwritten by the call above. Kept as in the original.
    ret
#endif
#endif
.section .data
// __dso_handle: weak, zero-initialised 4-byte object. Being weak, a strong
// definition from another object file takes precedence at link time.
// NOTE(review): presumably present to satisfy C++ runtime references
// (e.g. __cxa_atexit) - confirm against the toolchain in use.
    .weak   __dso_handle
__dso_handle:
    .4byte  0