-
Notifications
You must be signed in to change notification settings - Fork 66
System Calls
This page teaches you how system calls work and how to use and implement them in MentOS.
System calls are the only way userspace programs can request services from the kernel. They provide a controlled interface between user mode (ring 3) and kernel mode (ring 0).
Why do we need them?
- User programs cannot directly access hardware, memory management, or privileged CPU instructions
- The kernel must validate all requests to maintain security and stability
- System calls provide a stable API that programs can rely on
MentOS implements 60+ POSIX-compatible system calls across these categories:
- Process management - fork, exec, exit, wait, signals
- File operations - open, read, write, close, stat
- Memory management - brk, mmap, munmap
- IPC - semaphores, message queues, shared memory
- Networking - (future)
Userspace Program Kernel
----------------- ------
1. Call libc wrapper:
read(fd, buf, 100)
↓
2. Wrapper prepares arguments:
eax = syscall number (3)
ebx = fd
ecx = buf
edx = 100
↓
3. Execute INT 0x80 → 4. CPU switches to kernel mode
- Save user context (registers, stack)
- Switch to kernel stack
- Jump to syscall_handler()
↓
5. Dispatcher reads eax
- Validate syscall number
- Call sys_read(ebx, ecx, edx)
↓
6. sys_read executes:
- Validate file descriptor
- Check permissions
- Copy data to user buffer
- Return byte count in eax
↓
7. Return to userspace
- Restore user context
- Switch back to user mode
↓
8. read() returns eax value
to program
Syscall Invocation: INT 0x80
Registers:
- eax - Syscall number (e.g., 3 for read, 4 for write)
- ebx - First argument
- ecx - Second argument
- edx - Third argument
- esi - Fourth argument
- edi - Fifth argument
Return:
- eax - Return value (or negative error code)
// Your code
#include <unistd.h>
int main() {
write(1, "Hello\n", 6); // Write to stdout
return 0;
}Step-by-step execution:
// 1. Libc wrapper (lib/src/unistd/write.c)
/// @brief Libc wrapper that issues the write system call with three arguments.
/// @param fd     File descriptor to write to (e.g. 1 for stdout).
/// @param buf    Buffer holding the data to write.
/// @param nbytes Number of bytes to write from buf.
/// @return Result of the system call as produced by __syscall_return
///         (presumably -1 with errno set on failure - confirm against the macro).
ssize_t write(int fd, const void *buf, size_t nbytes)
{
// Raw value left in eax by the kernel.
long __res;
// Issues the syscall named "write" with fd, buf and nbytes as its arguments.
__inline_syscall_3(__res, write, fd, buf, nbytes);
// Converts the raw result to the wrapper's ssize_t return value.
__syscall_return(ssize_t, __res);
}
// 2. Interrupt/ISR dispatch (kernel/src/descriptor_tables/interrupt.c)
// INT 0x80 → syscall_handler() (installed by kernel/src/system/syscall.c)
// 3. Syscall dispatcher (kernel/src/system/syscall.c)
/// @brief Dispatches an INT 0x80 request to the matching kernel routine.
/// @param f Saved register frame of the caller. On entry eax holds the syscall
///          number and ebx/ecx/edx/esi/edi hold the arguments; on exit eax
///          holds the return value (negative error code on failure).
void syscall_handler(pt_regs_t *f)
{
    // Errors travel back to userspace as NEGATIVE values (the libc wrapper
    // tests `ret < 0`), so an unknown number must yield -ENOSYS, not ENOSYS.
    if (f->eax >= SYSCALL_NUMBER) {
        f->eax = -ENOSYS;
        return;
    }
    SystemCall5 fun = (SystemCall5)sys_call_table[f->eax];
    // A slot that was never registered is zero; calling it would jump to
    // address 0 in kernel mode.
    if (!fun) {
        f->eax = -ENOSYS;
        return;
    }
    unsigned args[5] = {0};
    if ((f->eax == __NR_fork) || (f->eax == __NR_clone) ||
        (f->eax == __NR_execve) || (f->eax == __NR_sigreturn)) {
        // These calls receive the whole register frame as their only argument
        // instead of the individual registers.
        args[0] = (uintptr_t)f;
    } else {
        args[0] = f->ebx;
        args[1] = f->ecx;
        args[2] = f->edx;
        args[3] = f->esi;
        args[4] = f->edi;
    }
    // The handler's result is handed back to the caller through eax.
    f->eax = fun(args[0], args[1], args[2], args[3], args[4]);
}
// 4. Actual syscall implementation (kernel/src/fs/read_write.c)
/// @brief Writes up to nbytes from buf to the file referenced by fd.
/// @param fd     File descriptor of the current process.
/// @param buf    Data to write.
/// @param nbytes Number of bytes to write.
/// @return Value returned by vfs_write (bytes written or its error code), or
///         -EBADF if fd is out of range, not open, or not open for writing.
ssize_t sys_write(int fd, const void *buf, size_t nbytes)
{
    task_struct *task = scheduler_get_current_process();
    // All three checks below describe a bad descriptor, which is EBADF.
    // EMFILE means "too many open files", ENOSYS "not implemented" and
    // EROFS "read-only filesystem"; none of them applies here.
    if (fd < 0 || fd >= task->max_fd) {
        return -EBADF;
    }
    vfs_file_descriptor_t *vfd = &task->fd_list[fd];
    if (vfd->file_struct == NULL) {
        return -EBADF;
    }
    if (!bitmask_check(vfd->flags_mask, O_WRONLY | O_RDWR)) {
        return -EBADF;
    }
    // Write at the current file position.
    ssize_t written = vfs_write(vfd->file_struct, buf, vfd->file_struct->f_pos, nbytes);
    // Advance the position only when data was actually written.
    if (written > 0) {
        vfd->file_struct->f_pos += written;
    }
    return written;
}
Let's write a simple program that directly uses a syscall without libc wrappers to understand the raw mechanism.
Create userspace/bin/syscall_demo.c:
// Direct syscall demo - bypasses libc wrappers
#include <stddef.h>
#include <system/syscall_types.h>
// Helper function to invoke syscalls
/// Issue a three-argument system call through INT 0x80.
/// The call number travels in eax and the arguments in ebx, ecx and edx;
/// whatever the kernel leaves in eax is handed back to the caller.
static inline long syscall3(long number, long arg1, long arg2, long arg3)
{
    long ret;
    // "memory" tells the compiler the kernel may read or write memory
    // reachable through the arguments.
    __asm__ volatile("int $0x80"
                     : "=a"(ret)
                     : "a"(number), "b"(arg1), "c"(arg2), "d"(arg3)
                     : "memory");
    return ret;
}
int main(void)
{
    // An array (not a pointer) lets sizeof compute the length from the
    // literal, so the byte count can never drift from the text.
    static const char message[] = "Hello from direct syscall!\n";
    // sys_write(STDOUT, message, length) - the trailing NUL is not written
    long written = syscall3(__NR_write, 1, (long)message, (long)(sizeof(message) - 1));
    // A negative result is the kernel's error code.
    return (written < 0) ? 1 : 0;
}
What's happening:
- syscall3() loads the syscall number into eax, arguments into ebx/ecx/edx, and executes int 0x80.
- The program uses libc for startup/exit, but bypasses libc wrappers for the write.
Add syscall_demo.c to the PROGRAM_LIST in userspace/bin/CMakeLists.txt and build as usual:
cd build
make programs
make filesystem
make qemu
Output:
Hello from direct syscall!
Now let's look at practical examples using libc wrappers (much easier than raw syscalls!).
Reading and writing files is the most common syscall pattern.
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
int main(void)
{
    char buffer[256];
    // 1. Open file (syscall: sys_open)
    int fd = open("/etc/motd", O_RDONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    // 2. Read data (syscall: sys_read); one byte is kept free for the terminator
    ssize_t bytes = read(fd, buffer, sizeof(buffer) - 1);
    if (bytes < 0) {
        perror("read");
        close(fd);
        return 1;
    }
    // 3. Null-terminate and display
    buffer[bytes] = '\0';
    // bytes is an ssize_t; cast it so the argument really matches %d
    // (the value is at most 255 here, so nothing is lost).
    printf("Read %d bytes:\n%s\n", (int)bytes, buffer);
    // 4. Close file (syscall: sys_close)
    close(fd);
    return 0;
}
Syscalls used:
-
open()→sys_open(pathname, flags, mode) -
read()→sys_read(fd, buffer, count) -
close()→sys_close(fd)
Key points:
- Always check return values (negative = error)
- File descriptors are integers (0=stdin, 1=stdout, 2=stderr, 3+ user files)
- Remember to close files to free kernel resources
Creating child processes requires fork() and understanding parent/child separation.
#include <unistd.h>
#include <sys/wait.h>
#include <stdio.h>
int main(void)
{
    printf("Parent PID: %d\n", getpid());
    // 1. Fork creates child process (syscall: sys_fork)
    pid_t pid = fork();
    if (pid < 0) {
        // Error
        perror("fork");
        return 1;
    }
    if (pid == 0) {
        // Child process
        printf("Child PID: %d, Parent PID: %d\n", getpid(), getppid());
        // Do child work
        sleep(2);
        printf("Child exiting\n");
        return 42; // Exit status
    }
    // Parent process
    printf("Parent created child PID: %d\n", pid);
    // Wait for child (syscall: sys_waitpid)
    int status = 0;
    pid_t child = waitpid(pid, &status, 0);
    if (child < 0) {
        // status is meaningless when waitpid itself failed
        perror("waitpid");
        return 1;
    }
    if (WIFEXITED(status)) {
        printf("Child %d exited with status %d\n",
               child, WEXITSTATUS(status));
    }
    return 0;
}
Syscalls used:
-
getpid()→sys_getpid() -
fork()→sys_fork()- creates child copy -
getppid()→sys_getppid()- get parent PID -
waitpid()→sys_waitpid(pid, status, options)
Output:
Parent PID: 5
Parent created child PID: 6
Child PID: 6, Parent PID: 5
Child exiting
Child 6 exited with status 42Key points:
- fork() returns twice: 0 in the child, the child's PID in the parent
- The child is an exact copy of the parent with separate memory
- The parent should call waitpid() to collect the child's exit status
- Use the WIFEXITED() and WEXITSTATUS() macros to decode the status
Combining fork() + execve() to run other programs.
#include <unistd.h>
#include <sys/wait.h>
#include <stdio.h>
int main(void)
{
    pid_t pid = fork();
    if (pid < 0) {
        // Without this check a failed fork would fall into the parent branch
        // and call waitpid(-1, ...), i.e. wait for ANY child.
        perror("fork");
        return 1;
    }
    if (pid == 0) {
        // Child: execute /bin/ls
        char *argv[] = {"/bin/ls", "-l", "/etc", NULL};
        char *envp[] = {NULL};
        // syscall: sys_execve - replaces child with /bin/ls
        execve("/bin/ls", argv, envp);
        // Only reached if execve fails
        perror("execve");
        return 1;
    }
    // Parent: wait for ls to finish
    if (waitpid(pid, NULL, 0) < 0) {
        perror("waitpid");
        return 1;
    }
    printf("ls finished\n");
    return 0;
}
Syscalls used:
-
fork()→sys_fork() -
execve()→sys_execve(path, argv, envp)- load new program -
waitpid()→sys_waitpid(pid, NULL, 0)
Key points:
- execve() replaces the calling process's memory
- Does not return on success (new program runs)
- Only returns -1 if it fails to load
- Arguments: program path, argv array, environment array
Signals are asynchronous notifications (like SIGINT from Ctrl+C).
#include <signal.h>
#include <stdlib.h> // exit()
#include <unistd.h>
#include <stdio.h>
/// Handler installed for SIGINT: reports the signal and terminates the process.
void signal_handler(int signum)
{
    printf("\nCaught signal %d (SIGINT)\n", signum);
    printf("Cleaning up...\n");
    exit(0);
}
int main(void)
{
    // Install signal handler (syscall: sys_sigaction)
    struct sigaction sa = {0};
    sa.sa_handler = signal_handler;
    if (sigaction(SIGINT, &sa, NULL) < 0) {
        // Without a handler Ctrl+C would just kill the process silently
        perror("sigaction");
        return 1;
    }
    printf("Running... press Ctrl+C to stop\n");
    while (1) {
        printf(".");
        fflush(stdout);
        sleep(1); // syscall: sys_nanosleep
    }
    return 0;
}
Syscalls used:
-
sigaction()→sys_sigaction(signum, act, oldact)- install handler -
sleep()→sys_nanosleep()- sleep for seconds -
exit()→sys_exit(status)- terminate
Output:
Running... press Ctrl+C to stop
.....^C
Caught signal 2 (SIGINT)
Cleaning up...Key points:
- Signals interrupt normal program flow
- Handler runs in same process context
- Common signals: SIGINT (2), SIGTERM (15), SIGCHLD (17), SIGKILL (9)
- SIGKILL and SIGSTOP cannot be caught
All syscalls return errors consistently.
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
int main(void)
{
    // Try to open non-existent file
    int fd = open("/non/existent/file", O_RDONLY);
    if (fd < 0) {
        // Syscall failed - save errno immediately: later library calls
        // (printf, strerror) are allowed to overwrite it.
        int err = errno;
        printf("Error code: %d\n", err);
        printf("Error name: %s\n", strerror(err));
        // Common errors:
        switch (err) {
        case ENOENT: // No such file or directory
            printf("File not found\n");
            break;
        case EACCES: // Permission denied
            printf("Access denied\n");
            break;
        case EMFILE: // Too many open files
            printf("File descriptor limit reached\n");
            break;
        default:
            printf("Unknown error\n");
            break;
        }
        return 1;
    }
    // Success path
    close(fd);
    return 0;
}
Output:
Error code: 2
Error name: No such file or directory
File not foundKey points:
- Negative return = error (usually -1)
- The errno global variable contains the error code
- Use perror() or strerror() for human-readable messages
- Common errors:
  - ENOENT (2) - File not found
  - EACCES (13) - Permission denied
  - EINVAL (22) - Invalid argument
  - ENOSYS (38) - Function not implemented
Want to add a new syscall to MentOS? Here's the complete process.
Let's add a simple hello() syscall that returns a greeting string.
Step 1: Add syscall number in kernel/inc/system/syscall.h:
#define __NR_exit 1
#define __NR_fork 2
#define __NR_read 3
// ... existing syscalls ...
#define __NR_hello 61 // Our new syscall number
#define SYSCALL_NUMBER 62 // Update total countStep 2: Implement kernel function in kernel/src/system/syscall.c:
/// @brief Copies a greeting message into a userspace buffer.
/// @param buffer User buffer that receives the message (no NUL terminator is written).
/// @param size Size of user buffer in bytes; longer messages are truncated to it.
/// @return Number of bytes written, or -EINVAL if buffer is NULL or size is 0.
int sys_hello(char *buffer, size_t size)
{
    // Array instead of pointer: the length is derived from the literal, so it
    // cannot drift out of sync the way a hand-counted constant can.
    static const char message[] = "Hello from kernel!\n";
    size_t msg_len = sizeof(message) - 1;
    // Validate user buffer
    if (!buffer || size == 0) {
        return -EINVAL;
    }
    // Don't overflow user buffer
    if (msg_len > size) {
        msg_len = size;
    }
    // Copy to userspace (simplified - should use copy_to_user)
    memcpy(buffer, message, msg_len);
    // msg_len is bounded by the message length, so it always fits in an int
    return (int)msg_len;
}
Step 3: Register in syscall table in kernel/src/system/syscall.c:
/// System call handlers table, indexed by syscall number.
/// The name matches the table that syscall_handler() indexes (sys_call_table).
/// Entries are stored as uintptr_t and cast to a function pointer by the dispatcher.
static uintptr_t sys_call_table[SYSCALL_NUMBER] = {
    [__NR_exit] = (uintptr_t)sys_exit,
    [__NR_fork] = (uintptr_t)sys_fork,
    [__NR_read] = (uintptr_t)sys_read,
    // ... existing syscalls ...
    [__NR_hello] = (uintptr_t)sys_hello, // Add our syscall
};
Step 4: Add libc wrapper in lib/inc/unistd.h:
/// Get greeting from kernel
/// @param buffer Buffer to store message
/// @param size Size of buffer
/// @return Number of bytes written, or -1 on error
int hello(char *buffer, size_t size);And in lib/src/unistd/hello.c:
#include <unistd.h>
#include <sys/errno.h>
_syscall2(int, hello, char *, buffer, size_t, size)The _syscall2 macro expands to:
int hello(char *buffer, size_t size)
{
int ret;
asm volatile(
"int $0x80"
: "=a"(ret)
: "a"(__NR_hello), "b"(buffer), "c"(size)
: "memory"
);
if (ret < 0) {
errno = -ret;
return -1;
}
return ret;
}Step 5: Test it! Create userspace/bin/hello_test.c:
#include <unistd.h>
#include <stdio.h>
int main(void)
{
char buffer[64];
int bytes = hello(buffer, sizeof(buffer));
if (bytes < 0) {
perror("hello");
return 1;
}
printf("Kernel says: %.*s", bytes, buffer);
return 0;
}Build and test:
cd build
make hello_test
./qemu.sh
# In MentOS:
/bin/hello_test
Output:
Kernel says: Hello from kernel!
- Syscall Numbers: Must be unique, sequential, update SYSCALL_NUMBER
- Kernel Function: Must validate all parameters (userspace can pass garbage!)
- Return Values: Positive/zero = success, negative = error code
- Memory Safety: Use copy_from_user()/copy_to_user() for kernel ↔ user transfers
- Libc Wrapper: Use _syscallN macros where N = number of arguments
- Documentation: Add Doxygen comments in header files
int sys_example(void *user_ptr, size_t size, int flags)
{
// 1. Validate pointers
if (!user_ptr) {
return -EINVAL;
}
// 2. Check size limits
if (size > MAX_SIZE) {
return -EINVAL;
}
// 3. Verify flags
if (flags & ~VALID_FLAGS_MASK) {
return -EINVAL;
}
// 4. Check permissions (example: must be superuser)
task_struct *task = scheduler_get_current_process();
if (task->uid != 0) {
return -EPERM;
}
// 5. Safely access user memory
char kernel_buf[256];
if (size > sizeof(kernel_buf)) {
return -ENOMEM;
}
// In real implementation, use copy_from_user()
memcpy(kernel_buf, user_ptr, size);
// Do work...
return 0; // Success
}MentOS implements 60+ POSIX syscalls. Here's a categorized reference.
| Syscall | Number | Description | Returns |
|---|---|---|---|
fork() |
2 | Create child process | 0 in child, PID in parent, -1 on error |
execve(path, argv, envp) |
11 | Execute program | Does not return on success |
exit(status) |
1 | Terminate process | Does not return |
waitpid(pid, status, options) |
7 | Wait for child | Child PID or -1 |
getpid() |
20 | Get process ID | PID |
getppid() |
64 | Get parent PID | Parent PID |
getpgid(pid) |
132 | Get process group | PGID or -1 |
setpgid(pid, pgid) |
57 | Set process group | 0 or -1 |
getsid(pid) |
147 | Get session ID | SID or -1 |
setsid() |
66 | Create new session | SID or -1 |
nice(increment) |
34 | Change priority | New nice value or -1 |
| Syscall | Number | Description | Returns |
|---|---|---|---|
getuid() |
24 | Get real user ID | UID |
geteuid() |
49 | Get effective user ID | EUID |
setuid(uid) |
23 | Set user ID | 0 or -1 |
setreuid(ruid, euid) |
70 | Set real/effective UID | 0 or -1 |
getgid() |
47 | Get real group ID | GID |
getegid() |
50 | Get effective group ID | EGID |
setgid(gid) |
46 | Set group ID | 0 or -1 |
setregid(rgid, egid) |
71 | Set real/effective GID | 0 or -1 |
| Syscall | Number | Description | Returns |
|---|---|---|---|
open(path, flags, mode) |
5 | Open/create file | FD or -1 |
close(fd) |
6 | Close file | 0 or -1 |
read(fd, buf, count) |
3 | Read from file | Bytes read, 0 on EOF, -1 on error |
write(fd, buf, count) |
4 | Write to file | Bytes written or -1 |
lseek(fd, offset, whence) |
19 | Seek in file | New offset or -1 |
creat(path, mode) |
8 | Create file | FD or -1 |
unlink(path) |
10 | Delete file | 0 or -1 |
stat(path, statbuf) |
106 | Get file status | 0 or -1 |
fstat(fd, statbuf) |
108 | Get file status by FD | 0 or -1 |
readlink(path, buf, size) |
85 | Read symlink | Bytes read or -1 |
symlink(target, linkpath) |
83 | Create symlink | 0 or -1 |
chmod(path, mode) |
15 | Change permissions | 0 or -1 |
chown(path, owner, group) |
182 | Change ownership | 0 or -1 |
| Syscall | Number | Description | Returns |
|---|---|---|---|
mkdir(path, mode) |
39 | Create directory | 0 or -1 |
rmdir(path) |
40 | Remove directory | 0 or -1 |
chdir(path) |
12 | Change working directory | 0 or -1 |
getcwd(buf, size) |
183 | Get working directory | Buffer pointer or NULL |
| Syscall | Number | Description | Returns |
|---|---|---|---|
brk(addr) |
45 | Set program break | 0 or -1 |
sbrk(increment) |
- | Adjust program break | New break or -1 |
| Syscall | Number | Description | Returns |
|---|---|---|---|
time(tloc) |
13 | Get current time | Seconds since epoch |
nanosleep(req, rem) |
162 | Sleep for duration | 0 or -1 |
| Syscall | Number | Description | Returns |
|---|---|---|---|
signal(signum, handler) |
48 | Install signal handler (old) | Previous handler or SIG_ERR |
sigaction(signum, act, oldact) |
67 | Install signal handler | 0 or -1 |
kill(pid, sig) |
37 | Send signal to process | 0 or -1 |
sigreturn() |
119 | Return from signal handler | Does not return |
alarm(seconds) |
27 | Set alarm timer | Remaining seconds |
| Syscall | Number | Description | Returns |
|---|---|---|---|
semget(key, nsems, flags) |
393 | Get semaphore set | ID or -1 |
semop(semid, ops, nops) |
394 | Semaphore operations | 0 or -1 |
semctl(semid, semnum, cmd, ...) |
394 | Semaphore control | Varies or -1 |
| Syscall | Number | Description | Returns |
|---|---|---|---|
msgget(key, flags) |
399 | Get message queue | ID or -1 |
msgsnd(msqid, msgp, size, flags) |
400 | Send message | 0 or -1 |
msgrcv(msqid, msgp, size, type, flags) |
401 | Receive message | Bytes received or -1 |
msgctl(msqid, cmd, buf) |
402 | Message queue control | 0 or -1 |
| Syscall | Number | Description | Returns |
|---|---|---|---|
shmget(key, size, flags) |
407 | Get shared memory segment | ID or -1 |
shmat(shmid, addr, flags) |
408 | Attach shared memory | Address or -1 |
shmdt(addr) |
409 | Detach shared memory | 0 or -1 |
shmctl(shmid, cmd, buf) |
410 | Shared memory control | 0 or -1 |
| Syscall | Number | Description | Returns |
|---|---|---|---|
uname(buf) |
122 | Get system information | 0 or -1 |
reboot(magic, magic2, cmd, arg) |
88 | Reboot system | Does not return or -1 |
| Code | Value | Message | Description |
|---|---|---|---|
EPERM |
1 | Operation not permitted | Insufficient privileges |
ENOENT |
2 | No such file or directory | File/path not found |
ESRCH |
3 | No such process | Invalid PID |
EINTR |
4 | Interrupted system call | Signal received during syscall |
EIO |
5 | I/O error | Hardware error |
ENXIO |
6 | No such device | Device not found |
E2BIG |
7 | Argument list too long | |
EBADF |
9 | Bad file descriptor | Invalid FD or FD not open |
ECHILD |
10 | No child processes | No children to wait for |
EAGAIN |
11 | Try again | Resource temporarily unavailable |
ENOMEM |
12 | Out of memory | Cannot allocate memory |
EACCES |
13 | Permission denied | File permission error |
EFAULT |
14 | Bad address | Invalid pointer |
EBUSY |
16 | Device or resource busy | |
EEXIST |
17 | File exists | File already exists (O_CREAT \| O_EXCL) |
ENOTDIR |
20 | Not a directory | Path component is not a directory |
EISDIR |
21 | Is a directory | Cannot write to directory |
EINVAL |
22 | Invalid argument | Bad parameter value |
ENFILE |
23 | Too many open files in system | System limit |
EMFILE |
24 | Too many open files | Process limit |
ENOSPC |
28 | No space left on device | Disk full |
ESPIPE |
29 | Illegal seek | Cannot seek on pipe/socket |
ENOSYS |
38 | Function not implemented | Syscall not supported |
Goal: Call read/write without using libc wrapper functions.
Steps:
- Create
userspace/bin/raw_syscall.c:
// Minimal program that writes without libc
#include <sys/syscall.h>
/// Program entry point used instead of main(): there is no libc startup code,
/// so the program must write and exit through raw INT 0x80 calls itself.
void _start(void) {
    // Write the greeting to stdout (fd=1). An array lets sizeof supply the
    // length, so it cannot go out of sync with the text.
    static const char msg[] = "Hello from raw syscall!\n";
    // Manually call sys_write (syscall 4)
    // eax=4, ebx=1 (fd), ecx=msg, edx=length without the trailing NUL
    long ret;
    asm volatile(
        "int $0x80"
        : "=a"(ret)
        : "a"(4), "b"(1), "c"(msg), "d"(sizeof(msg) - 1)
        : "memory" // the kernel reads the bytes msg points to
    );
    (void)ret; // nothing useful can be done with a write error here
    // Exit via syscall 1
    asm volatile(
        "int $0x80"
        : /* no output */
        : "a"(1), "b"(0)
    );
    while(1); // Unreachable, but required
}
- Compile with
-nostdlib and -nostartfiles
- Bonus: Try reading from stdin (fd=0) using syscall 3
Goal: Create a child process, monitor its execution, and reap it.
Program:
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h> // exit()
#include <sys/wait.h>
int main(void) {
    pid_t pid = fork();
    if (pid == 0) {
        // Child process
        printf("[Child] I am child PID %d\n", getpid());
        sleep(2);
        printf("[Child] Exiting\n");
        exit(0);
    } else if (pid > 0) {
        // Parent process
        printf("[Parent] Created child PID %d\n", pid);
        int status = 0;
        pid_t waited = wait(&status);
        if (waited < 0) {
            perror("wait");
            return 1;
        }
        // The value filled in by wait() is an encoded status word, not the
        // exit code itself; decode it with the W* macros.
        if (WIFEXITED(status)) {
            printf("[Parent] Child %d exited with status %d\n", waited, WEXITSTATUS(status));
        } else {
            printf("[Parent] Child %d did not exit normally\n", waited);
        }
    } else {
        printf("Fork failed!\n");
        return 1;
    }
    return 0;
}
Test:
- Compile and add to MentOS
- Run it - observe parent/child messages
- Try strace fork_demo to see syscalls (if implemented)
- Bonus: Implement waitpid() with PID polling
Goal: Use semaphore syscalls to synchronize processes.
Implementation:
#include <sys/sem.h>
#include <sys/ipc.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
int main() {
    // Create semaphore set with key 12345
    int semid = semget(12345, 1, IPC_CREAT | 0666);
    if (semid < 0) {
        perror("semget");
        exit(1);
    }
    printf("Created semaphore set: %d\n", semid);
    // Initialize semaphore to 1
    union semun {
        int val;
        struct semid_ds *buf;
        ushort *array;
    } arg;
    arg.val = 1;
    if (semctl(semid, 0, SETVAL, arg) < 0) {
        perror("semctl");
        exit(1);
    }
    // Try semaphore operations (wait, signal)
    struct sembuf ops[1];
    ops[0].sem_num = 0;
    ops[0].sem_op = -1; // Wait (decrement)
    ops[0].sem_flg = 0;
    printf("Waiting on semaphore...\n");
    // Only claim the semaphore was obtained when semop really succeeded
    if (semop(semid, ops, 1) < 0) {
        perror("semop (wait)");
        exit(1);
    }
    printf("Got semaphore!\n");
    // Signal (increment)
    ops[0].sem_op = 1;
    if (semop(semid, ops, 1) < 0) {
        perror("semop (signal)");
        exit(1);
    }
    printf("Released semaphore\n");
    return 0;
}
Test:
- Compile and run
- Try in two terminals to see synchronization
- Bonus: Implement producer/consumer with semaphores
Goal: See all syscalls a program makes.
Approach:
Use MentOS syscall debugging (if available):
# Enable syscall tracing
make qemu-gdb
# In GDB:
(gdb) b sys_read
(gdb) b sys_write
(gdb) c
# Run your program, hit breakpoints
Or write to kernel log:
// In kernel code
printk("syscall fork called\n");
Advanced: Implement a user-space syscall tracer that logs all calls!
- Kernel - Kernel architecture and components
- C Library - Standard library implementation
- IPC - Inter-process communication mechanisms
- File Systems - VFS and filesystem details
- Userspace Programs - Writing userspace programs
Key Source Files:
- kernel/inc/system/syscall.h - Syscall declarations
- kernel/src/system/syscall.c - Syscall dispatcher and table
- kernel/src/process/ - Process management syscalls
- kernel/src/fs/ - Filesystem syscalls
- kernel/src/ipc/ - IPC syscalls
- lib/inc/ - POSIX API headers
- lib/src/ - Syscall wrappers
External References:
- Linux syscall table
- POSIX.1-2017 specification
- The Linux Programming Interface - Michael Kerrisk's book