Memory Semantics
of 3
Code Examples
What is Copy-on-Write?
Naively, fork() would copy all of the parent’s memory to the child — that could be hundreds of megabytes! Instead, Linux uses copy-on-write (CoW): the child shares the parent’s memory pages until one of them writes to a page. Only then is that specific page copied. This makes fork() fast and memory-efficient.
Kernel creates new page table for child. Entries point to SAME physical pages as parent. Both parent and child pages marked
read-only.Hardware triggers a page fault because the page is read-only. The CPU traps into the kernel’s page fault handler.
Allocates a new physical page. Copies the old page content to it. Updates the faulting process’s page table to point to the new page. Marks new page read-write. Marks remaining shared page still read-only for the other process.
The faulting process now has its own writable copy of the page. Future writes to that page don’t trigger a fault. The other process still has the original.
shared, read-only
(parent’s)
(child’s new copy)
fork() is fast because of CoW. This example measures fork time with different memory sizes:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <time.h>
#include <sys/wait.h>
#define ITERATIONS 100
double time_fork(size_t mem_size, int do_write)
{
/* Allocate memory before forking */
char *buf = malloc(mem_size);
if (buf) memset(buf, 'A', mem_size); /* touch all pages */
struct timespec start, end;
clock_gettime(CLOCK_MONOTONIC, &start);
for (int i = 0; i < ITERATIONS; i++) {
pid_t pid = fork();
if (pid == -1) { perror("fork"); exit(1); }
if (pid == 0) {
if (do_write) {
/* Write to all pages: triggers CoW for each page */
memset(buf, 'B', mem_size);
}
/* No write: no CoW, very fast */
_exit(0);
}
wait(NULL);
}
clock_gettime(CLOCK_MONOTONIC, &end);
free(buf);
double elapsed = (end.tv_sec - start.tv_sec) * 1000.0
+ (end.tv_nsec - start.tv_nsec) / 1e6;
return elapsed / ITERATIONS;
}
int main(void)
{
size_t sizes[] = { 1*1024*1024, 10*1024*1024, 50*1024*1024 };
const char *names[] = { "1 MB", "10 MB", "50 MB" };
printf("%-8s %-20s %-20s\n",
"Size", "fork() no write (ms)", "fork() + write (ms)");
printf("%-8s %-20s %-20s\n",
"----", "--------------------", "-------------------");
for (int i = 0; i < 3; i++) {
double t_no_write = time_fork(sizes[i], 0);
double t_write = time_fork(sizes[i], 1);
printf("%-8s %-20.3f %-20.3f\n",
names[i], t_no_write, t_write);
}
printf("\nConclusion: fork() with CoW (no write) is fast\n");
printf("regardless of memory size. Writing triggers actual copy.\n");
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/wait.h>
long get_rss_kb(void)
{
FILE *f = fopen("/proc/self/status", "r");
if (!f) return -1;
char line[128];
long rss = -1;
while (fgets(line, sizeof(line), f)) {
if (strncmp(line, "VmRSS", 5) == 0) {
sscanf(line + 6, "%ld", &rss);
break;
}
}
fclose(f);
return rss;
}
int main(void)
{
size_t SIZE = 50 * 1024 * 1024; /* 50 MB */
char *buf = malloc(SIZE);
memset(buf, 'X', SIZE); /* actually allocate physical pages */
printf("[Parent before fork] RSS = %ld KB\n", get_rss_kb());
pid_t pid = fork();
if (pid == -1) { perror("fork"); exit(1); }
if (pid == 0) {
printf("[Child after fork, no write] RSS = %ld KB\n",
get_rss_kb());
/* Now write to all pages: triggers CoW */
memset(buf, 'Y', SIZE);
printf("[Child after write, CoW done] RSS = %ld KB\n",
get_rss_kb());
free(buf);
_exit(0);
}
wait(NULL);
printf("[Parent after child] RSS = %ld KB\n", get_rss_kb());
free(buf);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/wait.h>
#define ARRAY_SIZE (1024 * 1024) /* 1M integers = 4 MB */
/* Global array: lives in data/BSS segment */
static int big_array[ARRAY_SIZE];
int main(void)
{
/* Initialize: this touches all pages */
for (int i = 0; i < ARRAY_SIZE; i++)
big_array[i] = i;
printf("[Before fork] big_array[0]=%d, big_array[last]=%d\n",
big_array[0], big_array[ARRAY_SIZE-1]);
pid_t pid = fork();
if (pid == -1) { perror("fork"); exit(1); }
if (pid == 0) {
/* Child modifies ONLY first 10 elements */
/* This triggers CoW for just the FIRST page */
for (int i = 0; i < 10; i++)
big_array[i] = -1; /* CoW: first page copied */
printf("[Child ] big_array[0]=%d (modified)\n",
big_array[0]);
printf("[Child ] big_array[1000]=%d (still shared)\n",
big_array[1000]);
_exit(0);
}
wait(NULL);
/* Parent's array unchanged */
printf("[Parent] big_array[0]=%d (unchanged by child)\n",
big_array[0]);
printf("[Parent] big_array[1000]=%d\n", big_array[1000]);
return 0;
}
CoW defers physical memory copying until a write actually happens. After fork(), parent and child share the same physical pages (marked read-only). Only when a process writes to a page does the kernel copy that specific page. This makes fork() fast and memory-efficient, especially when fork() is followed immediately by exec() (no memory is ever copied in that case).
A write to any memory location in the shared pages. The hardware generates a page fault (because the shared pages are marked read-only). The kernel’s page fault handler detects it’s a CoW fault (not an actual access violation), allocates a new physical page, copies the content, updates the page table, and resumes the process.
Almost none. exec() discards all of the child’s address space and replaces it with the new program. Since exec() doesn’t write to any CoW pages before replacing them, no CoW copies are triggered. This is why the fork+exec combination is efficient despite the apparent overhead.
In CoW context, a page fault is a hardware exception triggered when a process tries to write to a read-only page. The CPU saves state and calls the OS page fault handler. The handler sees it’s a CoW page (not a true protection violation), makes a private copy, marks it writable, and restarts the faulting instruction transparently.
