feat: custom kernel build + boot fixes for working container runtime
Build a custom Tiny Core 17.0 kernel (6.18.2) with the config options that the stock kernel lacks for container workloads:

- CONFIG_CGROUP_BPF=y (cgroup v2 device control via BPF)
- CONFIG_DEVTMPFS=y (auto-create /dev device nodes)
- CONFIG_DEVTMPFS_MOUNT=y (auto-mount devtmpfs)
- CONFIG_MEMCG=y (memory cgroup controller for memory.max)
- CONFIG_CFS_BANDWIDTH=y (CPU bandwidth throttling for cpu.max)

Also strips unnecessary subsystems (sound, GPU, wireless, Bluetooth, KVM, etc.) for minimal footprint on a headless K8s edge appliance.

Init system fixes for successful boot-to-running-pods:

- Add switch_root in init.sh to escape initramfs (runc pivot_root)
- Add mountpoint guards in 00-early-mount.sh (skip if already mounted)
- Create essential device nodes after switch_root (kmsg, console, etc.)
- Enable cgroup v2 controller delegation with init process isolation
- Mount BPF filesystem for cgroup v2 device control
- Add mknod fallback from sysfs in 20-persistent-mount.sh for /dev/vda
- Move KubeSolo binary to /usr/bin (avoid /usr/local bind mount hiding)
- Generate /etc/machine-id in 60-hostname.sh (kubelet requires it)
- Pre-initialize iptables tables before kube-proxy starts
- Add nft_reject, nft_fib, xt_nfacct to kernel modules list

Build system changes:

- New build-kernel.sh script for custom kernel compilation
- Dockerfile.builder adds kernel build deps (flex, bison, libelf, etc.)
- Selective kernel module install (only modules.list + transitive deps)
- Install iptables-nft (xtables-nft-multi) + shared libs in rootfs

Tested: ISO boots in QEMU, node reaches Ready in ~35s, CoreDNS and local-path-provisioner pods start and run successfully.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,23 +1,62 @@
|
||||
#!/bin/sh
|
||||
# 00-early-mount.sh — Mount essential virtual filesystems
|
||||
|
||||
mount -t proc proc /proc 2>/dev/null || true
|
||||
mount -t sysfs sysfs /sys 2>/dev/null || true
|
||||
mount -t devtmpfs devtmpfs /dev 2>/dev/null || true
|
||||
mount -t tmpfs tmpfs /tmp
|
||||
mount -t tmpfs tmpfs /run
|
||||
# After switch_root, /proc /sys /dev are already mounted — only mount if missing
# Every mount below is wrapped in a `mountpoint -q` check, so a target that is
# already a mount point is left untouched.
|
||||
# /proc and /sys are best-effort: mount errors are silenced and ignored.
if ! mountpoint -q /proc 2>/dev/null; then
|
||||
mount -t proc proc /proc 2>/dev/null || true
|
||||
fi
|
||||
if ! mountpoint -q /sys 2>/dev/null; then
|
||||
mount -t sysfs sysfs /sys 2>/dev/null || true
|
||||
fi
|
||||
if ! mountpoint -q /dev 2>/dev/null; then
|
||||
# Prefer devtmpfs; if that mount fails, fall back to a plain tmpfs on /dev.
mount -t devtmpfs devtmpfs /dev 2>/dev/null || mount -t tmpfs tmpfs /dev
|
||||
fi
|
||||
# /tmp and /run: errors here are neither silenced nor masked with `|| true`.
# NOTE(review): whether a failed mount aborts the script depends on shell
# options (e.g. `set -e`) that are not visible in this view — confirm.
if ! mountpoint -q /tmp 2>/dev/null; then
|
||||
mount -t tmpfs tmpfs /tmp
|
||||
fi
|
||||
if ! mountpoint -q /run 2>/dev/null; then
|
||||
mount -t tmpfs tmpfs /run
|
||||
fi
|
||||
|
||||
mkdir -p /dev/pts /dev/shm
|
||||
mount -t devpts devpts /dev/pts
|
||||
mount -t tmpfs tmpfs /dev/shm
|
||||
if ! mountpoint -q /dev/pts 2>/dev/null; then
|
||||
mount -t devpts devpts /dev/pts
|
||||
fi
|
||||
if ! mountpoint -q /dev/shm 2>/dev/null; then
|
||||
mount -t tmpfs tmpfs /dev/shm
|
||||
fi
|
||||
|
||||
# Mount cgroup2 unified hierarchy
|
||||
# Ensure essential device nodes exist (devtmpfs may be incomplete after switch_root)
# Each line follows the same pattern: if the path does not exist, create a
# character device with `mknod -m MODE PATH c MAJOR MINOR`; mknod errors are
# silenced and ignored, so an already-populated /dev is left unchanged.
|
||||
[ -e /dev/console ] || mknod -m 600 /dev/console c 5 1 2>/dev/null || true
|
||||
[ -e /dev/null ] || mknod -m 666 /dev/null c 1 3 2>/dev/null || true
|
||||
[ -e /dev/zero ] || mknod -m 666 /dev/zero c 1 5 2>/dev/null || true
|
||||
[ -e /dev/kmsg ] || mknod -m 660 /dev/kmsg c 1 11 2>/dev/null || true
|
||||
[ -e /dev/random ] || mknod -m 666 /dev/random c 1 8 2>/dev/null || true
|
||||
[ -e /dev/urandom ] || mknod -m 666 /dev/urandom c 1 9 2>/dev/null || true
|
||||
[ -e /dev/tty ] || mknod -m 666 /dev/tty c 5 0 2>/dev/null || true
|
||||
|
||||
# Set up BusyBox mdev as hotplug handler (creates /dev nodes for new devices)
|
||||
# Registers /sbin/mdev by writing its path to /proc/sys/kernel/hotplug;
# `|| true` keeps a failed write from counting as an error.
echo /sbin/mdev > /proc/sys/kernel/hotplug 2>/dev/null || true
|
||||
# NOTE(review): `mdev -s` presumably scans sysfs once and creates nodes for
# devices that already exist — confirm against the BusyBox mdev documentation.
mdev -s 2>/dev/null || true
|
||||
|
||||
# Mount cgroup v2 unified hierarchy
|
||||
mkdir -p /sys/fs/cgroup
|
||||
mount -t cgroup2 cgroup2 /sys/fs/cgroup 2>/dev/null || {
|
||||
log_warn "cgroup v2 mount failed; attempting v1 fallback"
|
||||
mount -t tmpfs cgroup /sys/fs/cgroup
|
||||
for subsys in cpu cpuacct memory devices freezer pids; do
|
||||
mkdir -p "/sys/fs/cgroup/$subsys"
|
||||
mount -t cgroup -o "$subsys" "cgroup_${subsys}" "/sys/fs/cgroup/$subsys" 2>/dev/null || true
|
||||
if ! mountpoint -q /sys/fs/cgroup 2>/dev/null; then
|
||||
mount -t cgroup2 cgroup2 /sys/fs/cgroup 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# Enable ALL available controllers for child cgroups
|
||||
# Required: memory (memory.max), cpu (cpu.max), pids (pids.max)
|
||||
# First, move init process to its own cgroup so controllers can be enabled
|
||||
# (cgroup v2 "no internal process" rule for non-root cgroups)
|
||||
if [ -f /sys/fs/cgroup/cgroup.controllers ]; then
|
||||
mkdir -p /sys/fs/cgroup/init
|
||||
echo $$ > /sys/fs/cgroup/init/cgroup.procs 2>/dev/null || true
|
||||
for ctrl in $(cat /sys/fs/cgroup/cgroup.controllers); do
|
||||
echo "+${ctrl}" > /sys/fs/cgroup/cgroup.subtree_control 2>/dev/null || true
|
||||
done
|
||||
}
|
||||
fi
|
||||
|
||||
# Mount BPF filesystem (required for cgroup v2 device control via BPF)
|
||||
mkdir -p /sys/fs/bpf
|
||||
mount -t bpf bpf /sys/fs/bpf 2>/dev/null || true
|
||||
|
||||
Reference in New Issue
Block a user