Why epoll Was Invented
select() and poll() have one fundamental weakness: every call requires passing the complete list of file descriptors to the kernel, and the kernel scans the entire list each time. For 10,000 connections this is an enormous waste, especially when only a handful are active at any moment.
epoll solves this by keeping the fd list inside the kernel. You add or remove fds once. When you call epoll_wait(), the kernel returns only the fds that are actually ready — it does not scan the whole list. The cost of retrieving events therefore scales with the number of ready fds rather than the number being monitored, which is why epoll is often described as O(1) with respect to the total number of fds.
nginx, Node.js, Redis, and virtually every other high-performance Linux server use epoll at their core.
The epoll Mental Model — Interest List and Ready List
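epoll works with two internal data structures inside the kernel: the interest list, which holds every fd you have registered with epoll_ctl(), and the ready list, which holds the subset of those fds that currently have events pending and is what epoll_wait() reports back to you.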
The Three epoll Functions
1. epoll_create() — Create an epoll Instance
/* Creating an epoll instance: the two prototypes, followed by typical usage. */
#include <sys/epoll.h>
int epoll_create(int size);
int epoll_create1(int flags); /* Preferred — use EPOLL_CLOEXEC */
/* Returns: epoll file descriptor on success, -1 on error */
/* size is ignored since Linux 2.6.8 but must be > 0 */
/* Use epoll_create1(EPOLL_CLOEXEC) in modern code */
/* Typical usage: create the instance once and keep the returned fd around; */
/* it is the epfd argument passed to epoll_ctl() and epoll_wait(). */
int epfd = epoll_create1(EPOLL_CLOEXEC);
if (epfd == -1) {
/* Creation failed: report the reason and exit with status 1 */
perror("epoll_create1");
exit(1);
}
/* epfd is just a regular fd — close it with close(epfd) when done */
2. epoll_ctl() — Add, Modify, or Remove File Descriptors
/* epoll_ctl() edits the interest list of the epoll instance epfd. */
/* op selects the operation, fd is the descriptor it applies to, and */
/* event describes what to watch for on that descriptor. */
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
/* Returns: 0 on success, -1 on error */
/* op values: */
EPOLL_CTL_ADD /* Add fd to interest list */
EPOLL_CTL_MOD /* Modify events for fd already in interest list */
EPOLL_CTL_DEL /* Remove fd from interest list */
/* struct epoll_event — used both to register interest (epoll_ctl) */
/* and to report what happened (epoll_wait) */
struct epoll_event {
uint32_t events; /* Bit mask: EPOLLIN, EPOLLOUT, EPOLLERR, etc. */
epoll_data_t data; /* User data — returned when event fires */
};
/* epoll_data_t is a union: all members share the same storage, so use */
/* exactly one of them and read back the same one you stored. */
typedef union epoll_data {
void *ptr; /* Pointer to anything */
int fd; /* Store the fd itself (most common) */
uint32_t u32; /* Arbitrary 32-bit value */
uint64_t u64; /* Arbitrary 64-bit value */
} epoll_data_t;
| Event Flag | Meaning |
|---|---|
| EPOLLIN | Data available for reading |
| EPOLLOUT | Space available for writing |
| EPOLLRDHUP | Peer closed write end (TCP half-close) |
| EPOLLERR | Error — always monitored automatically |
| EPOLLHUP | Hangup — always monitored automatically |
| EPOLLET | Use edge-triggered notification (default is level-triggered) |
| EPOLLONESHOT | Notify once only; then disable fd until re-armed with EPOLL_CTL_MOD |
3. epoll_wait() — Wait for Events
/* epoll_wait() waits for events on the epoll instance epfd and copies */
/* the ready ones into the caller-supplied events array. */
int epoll_wait(int epfd,
struct epoll_event *events,
int maxevents,
int timeout);
/* Returns: number of ready events, 0 on timeout, -1 on error */
/* On -1, check errno: EINTR means a signal interrupted the wait, */
/* and the call can simply be retried */
/* events — array you provide; kernel fills with ready events */
/* maxevents — size of your events array */
/* timeout — milliseconds; -1 = forever, 0 = immediate */
/* After epoll_wait() returns n > 0, iterate events[0..n-1] */
/* Each events[i].data.fd tells you which fd is ready */
/* (valid only if data.fd is the union member you stored at registration) */
/* Each events[i].events tells you what happened */
Complete epoll Echo Server Example
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#define PORT 8080
#define MAX_EVENTS 64
#define BUF_SIZE 1024
/* Helper: switch fd into nonblocking mode.
 * Fetches the current file status flags and stores them back with
 * O_NONBLOCK OR-ed in, so any flags already set are preserved.
 * If either fcntl() call fails, the error is reported with perror()
 * and the process exits with status 1. */
static void set_nonblocking(int fd)
{
    int current = fcntl(fd, F_GETFL, 0);

    if (current == -1) {
        perror("fcntl F_GETFL");
        exit(1);
    }

    int rc = fcntl(fd, F_SETFL, current | O_NONBLOCK);

    if (rc == -1) {
        perror("fcntl F_SETFL");
        exit(1);
    }
}
/* Helper: add fd to the interest list of epfd.
 *   epfd   - epoll instance to register with
 *   fd     - descriptor to monitor; also stored in data.fd so that
 *            epoll_wait() hands it back with each event
 *   events - event mask to watch for (EPOLLIN, EPOLLET, ...)
 * On failure the error is reported with perror() and the process
 * exits with status 1. */
static void epoll_add(int epfd, int fd, uint32_t events)
{
    struct epoll_event ev;

    /* data is a union that is wider than an int. Zero the whole struct
     * first so the bytes not covered by data.fd are well-defined instead
     * of stack garbage when the kernel copies and later returns them. */
    memset(&ev, 0, sizeof(ev));
    ev.events = events;
    ev.data.fd = fd; /* Store fd in data so we get it back in events */

    if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) == -1) {
        perror("epoll_ctl ADD");
        exit(1);
    }
}
/* Helper: stop watching a client and release its descriptor. */
static void drop_client(int epfd, int fd)
{
    /* Deregister while fd still refers to the open socket, then close. */
    epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL);
    close(fd);
}

/* Helper: send all len bytes of data on a nonblocking socket.
 * Short writes are continued and EINTR is retried. MSG_NOSIGNAL keeps a
 * vanished peer from killing the server with SIGPIPE.
 * Returns 0 when everything was sent, -1 (errno set by send) otherwise.
 * This example keeps no per-connection output queue, so EAGAIN (the
 * client's send buffer is full) is reported as a failure; a production
 * server would buffer the remainder and wait for EPOLLOUT instead. */
static int send_all(int fd, const char *data, size_t len)
{
    size_t sent = 0;

    while (sent < len) {
        ssize_t w = send(fd, data + sent, len - sent, MSG_NOSIGNAL);
        if (w == -1) {
            if (errno == EINTR)
                continue;
            return -1;
        }
        sent += (size_t)w;
    }
    return 0;
}

/* Echo server entry point.
 * Listens on PORT, registers the listening socket level-triggered and
 * every accepted client edge-triggered, then echoes whatever each client
 * sends until it disconnects. Setup failures terminate the process with
 * exit status 1; the loop ends only if epoll_wait() fails with an error
 * other than EINTR. */
int main(void)
{
    int listen_fd, conn_fd, epfd;
    struct sockaddr_in server_addr, client_addr;
    socklen_t client_len;
    struct epoll_event events[MAX_EVENTS];
    char buf[BUF_SIZE];
    int num_events, i;

    /* Create listening socket */
    listen_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (listen_fd == -1) { perror("socket"); exit(1); }

    /* SO_REUSEADDR is a convenience for quick restarts, so a failure
     * here is reported but does not stop the server. */
    int opt = 1;
    if (setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) == -1)
        perror("setsockopt SO_REUSEADDR");

    memset(&server_addr, 0, sizeof(server_addr));
    server_addr.sin_family = AF_INET;
    server_addr.sin_addr.s_addr = INADDR_ANY;
    server_addr.sin_port = htons(PORT);
    if (bind(listen_fd, (struct sockaddr *)&server_addr, sizeof(server_addr)) == -1) {
        perror("bind"); exit(1);
    }
    if (listen(listen_fd, SOMAXCONN) == -1) { perror("listen"); exit(1); }

    /* Make listening socket nonblocking */
    set_nonblocking(listen_fd);

    /* Step 1: Create epoll instance */
    epfd = epoll_create1(EPOLL_CLOEXEC);
    if (epfd == -1) { perror("epoll_create1"); exit(1); }

    /* Step 2: Add listening socket to interest list */
    epoll_add(epfd, listen_fd, EPOLLIN);
    printf("epoll echo server on port %d\n", PORT);

    /* Step 3: Event loop */
    while (1) {
        /* Block until events are ready */
        num_events = epoll_wait(epfd, events, MAX_EVENTS, -1);
        if (num_events == -1) {
            if (errno == EINTR) continue; /* Interrupted by signal — retry */
            perror("epoll_wait");
            break;
        }

        /* Process each ready event */
        for (i = 0; i < num_events; i++) {
            int fd = events[i].data.fd;

            /* Case 1: New connection on listening socket */
            if (fd == listen_fd) {
                /* accept() overwrites the length, so reset it every call */
                client_len = sizeof(client_addr);
                conn_fd = accept(listen_fd,
                                 (struct sockaddr *)&client_addr,
                                 &client_len);
                if (conn_fd == -1) {
                    /* A nonblocking accept can legitimately find nothing
                     * to accept; only real errors are worth logging. */
                    if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR)
                        perror("accept");
                    continue;
                }
                set_nonblocking(conn_fd);
                epoll_add(epfd, conn_fd, EPOLLIN | EPOLLET);
                printf("New client: fd %d from %s\n",
                       conn_fd, inet_ntoa(client_addr.sin_addr));
                continue;
            }

            /* Case 2: Error or hangup on a client fd */
            if (events[i].events & (EPOLLERR | EPOLLHUP | EPOLLRDHUP)) {
                printf("Client fd %d disconnected\n", fd);
                drop_client(epfd, fd);
                continue;
            }

            /* Case 3: Data available on a connected client */
            if (events[i].events & EPOLLIN) {
                /* With edge-triggered, read until EAGAIN */
                while (1) {
                    ssize_t n = read(fd, buf, sizeof(buf));
                    if (n == -1) {
                        if (errno == EINTR)
                            continue; /* Interrupted — try the read again */
                        if (errno == EAGAIN || errno == EWOULDBLOCK)
                            break; /* No more data right now */
                        perror("read");
                        drop_client(epfd, fd);
                        break;
                    }
                    if (n == 0) {
                        /* Client closed connection */
                        printf("Client fd %d closed connection\n", fd);
                        drop_client(epfd, fd);
                        break;
                    }
                    /* Echo back to client; give up on the client rather
                     * than silently losing part of its data. */
                    if (send_all(fd, buf, (size_t)n) == -1) {
                        perror("send");
                        drop_client(epfd, fd);
                        break;
                    }
                }
            }
        }
    }

    close(epfd);
    close(listen_fd);
    return 0;
}
epoll_ctl Operations — Add, Modify, Delete
/* ADD — start monitoring a new fd (readability only to begin with) */
struct epoll_event ev;
ev.events = EPOLLIN;
ev.data.fd = client_fd;
epoll_ctl(epfd, EPOLL_CTL_ADD, client_fd, &ev);
/* MODIFY — change which events to watch (e.g. add EPOLLOUT when you have data to send) */
/* The new mask replaces the old one, so EPOLLIN must be repeated to keep it */
ev.events = EPOLLIN | EPOLLOUT; /* Now also watch for writability */
ev.data.fd = client_fd;
epoll_ctl(epfd, EPOLL_CTL_MOD, client_fd, &ev);
/* DELETE — stop monitoring (call before close) */
epoll_ctl(epfd, EPOLL_CTL_DEL, client_fd, NULL); /* event can be NULL for DEL */
close(client_fd);
EPOLLONESHOT — One Event Then Disarm
When you add EPOLLONESHOT to the events, epoll will notify you once for the event and then automatically disarm the fd. To get more events for that fd, you must call epoll_ctl(EPOLL_CTL_MOD) to re-arm it.
This is useful in multi-threaded servers — you process the event in a thread pool and re-arm after processing, preventing two threads from getting events for the same fd simultaneously.
/* Add with EPOLLONESHOT: the fd is reported once and then disarmed */
/* (ev, fd and epfd are assumed to be declared by the surrounding code) */
ev.events = EPOLLIN | EPOLLONESHOT;
ev.data.fd = fd;
epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
/* After handling the event in your thread, re-arm */
/* EPOLL_CTL_MOD (not ADD) is used: the fd is still registered, only disarmed */
ev.events = EPOLLIN | EPOLLONESHOT; /* Re-arm for next event */
ev.data.fd = fd;
epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);
Using data.ptr — Storing Per-Connection State
/* Instead of just storing fd, store a pointer to your connection struct */
struct connection {
int fd;
char buf[4096];
size_t buf_len;
char client_ip[INET_ADDRSTRLEN];
};
struct connection *conn = malloc(sizeof(*conn));
conn->fd = new_fd;
strncpy(conn->client_ip, client_ip_str, INET_ADDRSTRLEN);
conn->buf_len = 0;
struct epoll_event ev;
ev.events = EPOLLIN | EPOLLET;
ev.data.ptr = conn; /* Store pointer, not just fd */
epoll_ctl(epfd, EPOLL_CTL_ADD, new_fd, &ev);
/* When epoll_wait returns: */
struct connection *c = (struct connection *)events[i].data.ptr;
read(c->fd, c->buf, sizeof(c->buf));
/* You have both the fd and all context without any lookup */
