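In a traditional PID-based system, the supervisor tracks processes by their PID, leading to potential race conditions:
1. The supervisor starts a worker process (PID 1234).
2. The worker exits, and the kernel marks PID 1234 as available for reuse.
3. A new, unrelated process is started and is assigned PID 1234.
4. The supervisor checks PID 1234, sees it exists, and mistakenly assumes the worker is still running.
How pidfd Ensures Stable Process Tracking
With pidfd, the supervisor tracks the worker using a stable file descriptor, instead of relying on PIDs.
How pidfd Works to Prevent This Bug
1. The supervisor starts a worker process (PID 1234).
2. The supervisor opens a pidfd for the worker (pidfd = 5).
   - The pidfd is a stable reference to the worker process.
   - The pidfd remains unique and cannot be reused.
3. The worker exits, and PID 1234 is freed by the kernel. pidfd = 5 is automatically marked as invalid by the pidfd layer.
4. A new process is assigned PID 1234. The supervisor still holds pidfd = 5.
5. The supervisor checks pidfd = 5: it refers only to the original worker, so the supervisor sees that the worker has exited and never confuses it with the new process.
Why pidfd is Powerful
A pidfd cannot be reused by another process. PIDs, by contrast, are drawn from a limited range: on 32-bit systems /proc/sys/kernel/pid_max can be at most 32768, while on 64-bit systems it can be as large as 4194304. The number of pidfds a process can hold open is bounded by its open-file-descriptor limit, which can be viewed or changed with ulimit -n (soft limit) and ulimit -Hn (hard limit).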
When a process exits, its exit code and status (e.g., whether it was killed by a signal) are stored in the kernel, and the parent process can retrieve this status only once using wait(), waitpid(), or waitid(). After a successful wait() (or its variants), the kernel cleans up the process’s status, fully reaping and removing it from the system, meaning any further calls to wait() on that PID will fail because the process no longer exists. A pidfd allows referring to a process safely and race-free, but it does not change the "at-most-once" rule of wait(). If another thread or process has already waited on and reaped the process, calling waitid() on pidfd will fail since the exit status is gone, though the pidfd itself remains valid but cannot retrieve the exit status again.
#include <sys/syscall.h>
#include <unistd.h>
#include <signal.h>
#include <fcntl.h>
#include <stdio.h>
// Demo: fork a worker, open a pidfd for it, and kill it through the pidfd.
//
// Returns 0 once SIGKILL has been sent to the worker, or 1 if fork(),
// pidfd_open or pidfd_send_signal fails.
int main(void) {
    pid_t worker_pid = fork();
    if (worker_pid < 0) {
        // Without this check a failed fork() would hand -1 to pidfd_open and
        // the failure would be reported against the wrong call.
        perror("fork failed");
        return 1;
    }
    if (worker_pid == 0) {
        // Worker process: just sleep
        sleep(100);
        return 0;
    }

    // Supervisor process: open a pidfd for the worker
    int pidfd = syscall(SYS_pidfd_open, worker_pid, 0);
    if (pidfd == -1) {
        perror("pidfd_open failed");
        return 1;
    }

    // Send SIGKILL to terminate the worker process
    if (syscall(SYS_pidfd_send_signal, pidfd, SIGKILL, NULL, 0) == -1) {
        // Report first (close() may overwrite errno), then release the fd.
        perror("pidfd_send_signal failed");
        close(pidfd);
        return 1;
    }

    printf("Worker process (PID %d) killed via pidfd.\n", worker_pid);
    close(pidfd); // Clean up
    return 0;
}
This is not possible to do directly from eBPF, but we can track the memory refault ratio in eBPF, also track the low-priority background processes, and then send a signal to kill or freeze them.
However, if we attach an eBPF program to tp/sched/sched_process_fork, I think we should be able to get the parent and child processes from the context of this hook and use process_madvise() or pidfd_send_signal(FREEZE or KILL) through a kfunc.
Sample eBPF program
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
// Hash map keyed by PID, holding at most 1024 entries.
// The fork handler looks up the forking parent's PID in this map and, on a
// hit, inserts the child's PID as a new key; the loader seeds the map with
// its own PID before attaching.
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 1024);
__type(key, pid_t);
// The value is declared as u64, so every writer must supply 8 bytes.
__type(value, u64);
} child_pid_map SEC(".maps");
// Tracepoint handler for sched_process_fork.
//
// If the forking parent's PID is already a key in child_pid_map (the PID
// seeded by the loader, or a child recorded by an earlier fork), the new
// child's PID is inserted as a key as well, with the parent's PID as value.
// Always returns 0.
SEC("tp/sched/sched_process_fork")
int handle_sched_process_fork(struct trace_event_raw_sched_process_fork *ctx) {
    pid_t parent_pid = ctx->parent_pid;
    pid_t child_pid = ctx->child_pid;

    // Forks from processes we are not tracking are ignored.
    if (!bpf_map_lookup_elem(&child_pid_map, &parent_pid))
        return 0;

    // The map value is declared as u64. Passing &parent_pid (a 4-byte pid_t)
    // would make the helper read 8 bytes from a 4-byte object, so widen the
    // PID into a correctly sized local first.
    u64 tracked_parent = (u64)parent_pid;
    bpf_map_update_elem(&child_pid_map, &child_pid, &tracked_parent, BPF_ANY);
    return 0;
}
// License string for this BPF object, placed in the "license" section.
char LICENSE[] SEC("license") = "GPL";
Sample eBPF program loader,
#include <stdio.h>
#include <unistd.h>
#include <signal.h>
#include <sys/syscall.h>
#include <linux/pidfd.h>
#include <bpf/libbpf.h>
#include "kill_child.skel.h"
// Event callback: reads a child PID from `data` and sends it SIGKILL via a pidfd.
//
// ctx  - unused.
// data - points to a pid_t holding the child's PID.
// len  - number of bytes available at `data`.
//
// Returns 0 when the record was handled (or skipped as malformed) and 1 when
// pidfd_open fails. A failing pidfd_send_signal is only reported.
//
// NOTE(review): the signature matches a libbpf ring-buffer sample callback,
// but no registration of this function is visible here - confirm how it is
// meant to be invoked.
// NOTE(review): the PID arrives as a plain number, so the process may already
// have exited (and the PID been reused) before pidfd_open runs.
static int handle_event(void *ctx, void *data, size_t len) {
    (void)ctx;

    // Never read a pid_t out of a missing or truncated record.
    if (data == NULL || len < sizeof(pid_t)) {
        fprintf(stderr, "Ignoring malformed event (%zu bytes)\n", len);
        return 0;
    }

    pid_t child_pid = *(pid_t *)data;
    printf("Detected child PID %d - sending SIGKILL\n", child_pid);

    // Send SIGKILL via pidfd
    int pidfd = syscall(SYS_pidfd_open, child_pid, 0);
    if (pidfd < 0) {
        perror("pidfd_open");
        return 1;
    }

    if (syscall(SYS_pidfd_send_signal, pidfd, SIGKILL, NULL, 0) < 0) {
        perror("pidfd_send_signal");
    }

    close(pidfd);
    return 0;
}
// Loader: opens and loads the BPF skeleton, registers this process's PID in
// child_pid_map, attaches the tracepoint program, then idles forever.
//
// Returns 1 if loading, the map update or the attach fails. The idle loop
// never exits on its own.
//
// NOTE(review): nothing in this function reads child_pid_map back or calls
// handle_event, so detected children are recorded by the BPF side but not
// acted on here - confirm whether a consumer is meant to be wired in.
int main(void) {
    struct kill_child_bpf *skel;
    int err;

    // Load and verify BPF application
    skel = kill_child_bpf__open_and_load();
    if (!skel) {
        fprintf(stderr, "Failed to load BPF skeleton\n");
        return 1;
    }

    // Register supervisor PID in the map.
    // The map value is declared as u64 on the BPF side, and libbpf rejects an
    // update whose value size differs from the map definition, so pass an
    // 8-byte value; the key remains a pid_t.
    pid_t supervisor_pid = getpid();
    unsigned long long marker = (unsigned long long)supervisor_pid;
    err = bpf_map__update_elem(skel->maps.child_pid_map,
                               &supervisor_pid, sizeof(supervisor_pid),
                               &marker, sizeof(marker),
                               BPF_ANY);
    if (err) {
        fprintf(stderr, "Failed to update BPF map\n");
        goto cleanup;
    }

    // Attach tracepoint handler
    err = kill_child_bpf__attach(skel);
    if (err) {
        fprintf(stderr, "Failed to attach BPF program\n");
        goto cleanup;
    }

    printf("Supervisor PID %d monitoring for child processes...\n", supervisor_pid);

    // Sleep to keep the program running
    while (1) {
        sleep(1);
    }

cleanup:
    kill_child_bpf__destroy(skel);
    // Only error paths reach this point, so report failure to the shell.
    return err ? 1 : 0;
}
Compilation,
# Generate vmlinux.h (included by kill_child.bpf.c) from the running kernel's BTF
bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h
# Build the BPF object; libbpf's bpf_tracing.h expects __TARGET_ARCH_x86 for x86-64
clang -g -O2 -target bpf -D__TARGET_ARCH_x86 -I/usr/include/x86_64-linux-gnu -c kill_child.bpf.c -o kill_child.bpf.o
# Generate the skeleton header used by the loader
bpftool gen skeleton kill_child.bpf.o > kill_child.skel.h
# Compile and link the user-space loader
clang -g -O2 -Wall -I . -c kill_child.c -o kill_child.o
clang -Wall -O2 -g kill_child.o -lbpf -lelf -lz -o kill_child
and run using sudo ./kill_child
Q) What happens if someone tampers with the pidfd in the file descriptor table before the process terminates (or while the process is still running)? This is the security side of the question: process hijacking.