Subtopic 2
Versions
Code Examples
How the Linux Kernel Decides Who Runs First
The Linux kernel’s decision about whether the parent or child runs first after fork() has changed across kernel versions. Modern Linux provides a tunable knob to control this behavior. Understanding the history helps you understand legacy code and write portable, correct programs.
| Kernel Version | Default Behavior | Rationale |
|---|---|---|
| Linux 2.2 and earlier | Parent ran first | Simpler to implement; parent holds the CPU it was using |
| Linux 2.4 | Parent ran first (usually) | Same approach, scheduler improvements |
| Linux 2.6 (early) | Child ran first | exec() usually follows; running child first avoids CoW copies the parent would cause |
| Linux 2.6.32+ (CFS) | Parent runs first (default) | Tunable via sched_child_runs_first=0. Avoids dirty page writes by parent before exec. |
| Modern Linux (5.x+) | Parent first (default) | sched_child_runs_first=0 by default. Programmer-controlled. |
# Check current value (0 = parent first, 1 = child first):
cat /proc/sys/kernel/sched_child_runs_first
# Set child-runs-first for testing (needs root):
echo 1 > /proc/sys/kernel/sched_child_runs_first
# Set parent-runs-first (default):
echo 0 > /proc/sys/kernel/sched_child_runs_first
# Also queryable via sysctl:
sysctl kernel.sched_child_runs_first
Note: even with sched_child_runs_first=1, on a multicore system both parent and child may run simultaneously on different cores. The knob only affects which process gets the first time slice on a single core. Synchronization is still required for correctness.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>
#include <time.h>
/*
 * Print the sched_child_runs_first setting (when it can be read), then fork
 * and let parent and child each report how many nanoseconds passed between a
 * timestamp taken just before fork() and their own first timestamp after it.
 *
 * Returns 0 on success; exits with status 1 if clock_gettime() or fork() fails.
 */
int main(void)
{
    /* Unbuffered stdout: each printf is written immediately, so nothing is
       left in a stdio buffer when the child leaves via _exit(). */
    setbuf(stdout, NULL);

    /* Read current sched_child_runs_first setting; skipped silently when the
       file cannot be opened or does not contain an integer. */
    FILE *f = fopen("/proc/sys/kernel/sched_child_runs_first", "r");
    if (f) {
        int val = 0;
        if (fscanf(f, "%d", &val) == 1) {
            printf("sched_child_runs_first = %d (%s runs first by default)\n",
                   val, val ? "child" : "parent");
        }
        fclose(f);
    }

    struct timespec start;
    if (clock_gettime(CLOCK_MONOTONIC, &start) == -1) {
        perror("clock_gettime");
        exit(1);
    }

    pid_t pid = fork();
    if (pid == -1) { perror("fork"); exit(1); }

    struct timespec now;
    if (clock_gettime(CLOCK_MONOTONIC, &now) == -1) {
        perror("clock_gettime");
        if (pid == 0)
            _exit(1);
        exit(1);
    }

    /* Use both tv_sec and tv_nsec: tv_nsec alone wraps back to 0 every
       second, which would make the offset wrong (even negative) whenever the
       two samples straddle a second boundary. */
    long offset = (long)(now.tv_sec - start.tv_sec) * 1000000000L
                  + (now.tv_nsec - start.tv_nsec);

    if (pid == 0) {
        printf("[Child PID=%d] Ran at ns offset: %ld\n",
               getpid(), offset);
        _exit(0);
    }

    printf("[Parent PID=%d] Ran at ns offset: %ld\n",
           getpid(), offset);
    wait(NULL);
    printf("(Whoever has smaller offset ran first)\n");
    return 0;
}
/* WHY CHILD-FIRST WAS PREFERRED IN EARLY 2.6:
Scenario: parent forks, child immediately execs.
If PARENT runs first after fork():
- Parent continues executing → may write to CoW pages
- Each write triggers a CoW copy (wasted work)
- Child then execs → discards all copied pages anyway
- Result: unnecessary memory copies
If CHILD runs first after fork():
- Child execs immediately → discards all pages
- No CoW copies happen at all
- Parent resumes with untouched shared pages
- Result: zero wasted memory copies
This is why early 2.6 kernels chose child-first.
Modern kernels switched back to parent-first because:
- On multicore systems, both run simultaneously anyway
- The optimization assumes exec follows immediately
- Overall system throughput was better with parent-first */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <time.h>
#include <sys/wait.h>
/*
 * Measure fork+exec time to see effect of parent writes before exec.
 *
 * Allocates and fills a 20 MiB buffer, then forks 100 times; each child
 * execs "true" while the parent only waits (it never writes to the buffer).
 * Prints the average wall-clock time per fork+exec+wait in milliseconds.
 *
 * Returns 0 on success, 1 if allocation, fork() or waitpid() fails.
 */
int main(void)
{
    enum { BUF_BYTES = 20 * 1024 * 1024, ITERATIONS = 100 };

    /* Large allocation to make CoW cost visible */
    char *buf = malloc(BUF_BYTES);
    if (buf == NULL) {
        perror("malloc");
        return 1;
    }
    memset(buf, 'X', BUF_BYTES);

    struct timespec s, e;
    clock_gettime(CLOCK_MONOTONIC, &s);

    for (int i = 0; i < ITERATIONS; i++) {
        pid_t pid = fork();
        if (pid == -1) {
            /* Without this check waitpid(-1, ...) would be called below. */
            perror("fork");
            free(buf);
            return 1;
        }
        if (pid == 0) {
            /* Child: exec immediately, no CoW triggered */
            char *argv[] = { "true", NULL };
            execvp("true", argv);
            /* Only reached when exec failed; 127 is the conventional
               "command could not be executed" status. */
            _exit(127);
        }
        /* Parent: does NOT write to buf before waiting */
        if (waitpid(pid, NULL, 0) == -1) {
            perror("waitpid");
            free(buf);
            return 1;
        }
    }

    clock_gettime(CLOCK_MONOTONIC, &e);
    double ms = ((e.tv_sec - s.tv_sec) * 1e3 +
                 (e.tv_nsec - s.tv_nsec) / 1e6) / ITERATIONS;
    printf("fork+exec (no parent write): %.3f ms/call\n", ms);

    free(buf);
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>
/*
 * WRONG: assumes parent runs before child.
 *
 * Deliberately broken demonstration. The child expects to observe the value
 * the parent assigns after fork(). That expectation fails for two reasons:
 * nothing guarantees the parent runs first, and the child works on its own
 * copy of `counter` taken at fork() time, so a later assignment in the parent
 * is never visible to it. The child therefore prints 0.
 *
 * Exits with status 1 if fork() fails.
 */
void bad_example(void)
{
    int counter = 0;

    /* Flush before forking so text already sitting in the stdio buffer is
       not inherited by the child and written a second time. */
    fflush(stdout);

    pid_t pid = fork();
    if (pid == -1) { perror("fork"); exit(1); }

    if (pid == 0) {
        /* Child assumes counter was already set by parent.
           WRONG: the parent's assignment below happens after fork() and
           only changes the parent's copy. */
        printf("[Child] counter = %d (expected 42!)\n", counter);
        /* _exit() skips stdio cleanup; flush explicitly or the line above is
           lost whenever stdout is not line-buffered (pipe or file). */
        fflush(stdout);
        _exit(0);
    }

    counter = 42; /* Parent sets counter (in the parent's copy only) */
    printf("[Parent] Set counter = %d\n", counter);
    wait(NULL);
}
/*
 * CORRECT: don't assume order; set values BEFORE fork.
 *
 * `counter` is assigned before fork(), so the child's copy already holds 42
 * no matter which process is scheduled first.
 *
 * Exits with status 1 if fork() fails.
 */
void good_example(void)
{
    int counter = 42; /* Set BEFORE fork — safe */

    /* Flush before forking so text already sitting in the stdio buffer is
       not inherited by the child and written a second time. */
    fflush(stdout);

    pid_t pid = fork();
    if (pid == -1) { perror("fork"); exit(1); }

    if (pid == 0) {
        printf("[Child] counter = %d (correct: set before fork)\n",
               counter);
        /* _exit() skips stdio cleanup; flush explicitly or the line above is
           lost whenever stdout is not line-buffered (pipe or file). */
        fflush(stdout);
        _exit(0);
    }

    wait(NULL);
}
/*
 * Run the order-dependent demo followed by the order-independent one,
 * printing a banner before each.
 */
int main(void)
{
    /* Unbuffered stdout: every printf is written immediately, so banners and
       the lines printed by forked children appear in order and are not lost
       when a child leaves via _exit(), even if output is redirected. */
    setbuf(stdout, NULL);

    printf("=== BAD (order-dependent) ===\n");
    bad_example();

    printf("\n=== GOOD (order-independent) ===\n");
    good_example();

    return 0;
}
It controls whether the child (1) or parent (0) gets the CPU first after fork() on a single core. Default is 0 (parent first) on Linux 2.6.32+. Setting it to 1 can optimize fork+exec patterns by letting the child exec and discard pages before the parent writes to them (avoiding unnecessary CoW copies).
To optimize the fork+exec pattern. If the child runs first and immediately calls exec(), it discards the shared address space before the parent writes to any pages. This means zero CoW copies. If the parent ran first, it might write to pages (triggering CoW copies) which would then be discarded when the child exec’d — wasted work.
No. On a multicore machine, parent and child can run simultaneously on different CPUs. sched_child_runs_first only determines which gets the first time slice on one CPU. The other process may start on another CPU at the same time. Explicit synchronization (signals, pipes, semaphores) is the only reliable way to enforce ordering.
