feat: custom kernel build + boot fixes for working container runtime
Build a custom Tiny Core 17.0 kernel (6.18.2) with missing configs that the stock kernel lacks for container workloads:
- CONFIG_CGROUP_BPF=y (cgroup v2 device control via BPF)
- CONFIG_DEVTMPFS=y (auto-create /dev device nodes)
- CONFIG_DEVTMPFS_MOUNT=y (auto-mount devtmpfs)
- CONFIG_MEMCG=y (memory cgroup controller for memory.max)
- CONFIG_CFS_BANDWIDTH=y (CPU bandwidth throttling for cpu.max)

Also strips unnecessary subsystems (sound, GPU, wireless, Bluetooth, KVM, etc.) for minimal footprint on a headless K8s edge appliance.

Init system fixes for successful boot-to-running-pods:
- Add switch_root in init.sh to escape initramfs (runc pivot_root)
- Add mountpoint guards in 00-early-mount.sh (skip if already mounted)
- Create essential device nodes after switch_root (kmsg, console, etc.)
- Enable cgroup v2 controller delegation with init process isolation
- Mount BPF filesystem for cgroup v2 device control
- Add mknod fallback from sysfs in 20-persistent-mount.sh for /dev/vda
- Move KubeSolo binary to /usr/bin (avoid /usr/local bind mount hiding)
- Generate /etc/machine-id in 60-hostname.sh (kubelet requires it)
- Pre-initialize iptables tables before kube-proxy starts
- Add nft_reject, nft_fib, xt_nfacct to kernel modules list

Build system changes:
- New build-kernel.sh script for custom kernel compilation
- Dockerfile.builder adds kernel build deps (flex, bison, libelf, etc.)
- Selective kernel module install (only modules.list + transitive deps)
- Install iptables-nft (xtables-nft-multi) + shared libs in rootfs

Tested: ISO boots in QEMU, node reaches Ready in ~35s, CoreDNS and local-path-provisioner pods start and run successfully.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
33
init/init.sh
33
init/init.sh
@@ -16,6 +16,39 @@
|
||||
|
||||
set -e
|
||||
|
||||
# --- Switch root: escape initramfs so runc pivot_root works ---
# The kernel boots into an initramfs (rootfs), which is a special mount that
# doesn't support pivot_root. Container runtimes (runc) need pivot_root to
# set up container root filesystems. To fix this, we copy the rootfs to a
# tmpfs and switch_root to it, re-executing /sbin/init from the copy.
# The sentinel file /etc/.switched_root is created only inside the copy, so
# the second pass skips this section instead of looping forever.
if [ ! -f /etc/.switched_root ]; then
    # Best effort: these may already be mounted by the kernel.
    mount -t proc proc /proc 2>/dev/null || true
    mount -t sysfs sysfs /sys 2>/dev/null || true
    mount -t devtmpfs devtmpfs /dev 2>/dev/null || true

    mkdir -p /mnt/newroot
    mount -t tmpfs -o size=400M,mode=755 tmpfs /mnt/newroot

    echo "[init] Copying rootfs to tmpfs..." >&2
    # Copy each top-level directory explicitly (BusyBox cp -ax on rootfs is broken)
    for d in bin sbin usr lib lib64 etc var opt; do
        [ -d "/$d" ] || continue
        # A failed copy stays non-fatal, but it is reported: a partially
        # copied root is otherwise very hard to diagnose after the switch.
        if ! cp -a "/$d" /mnt/newroot/ 2>/dev/null; then
            echo "[init] WARNING: failed to copy /$d to new root" >&2
        fi
    done

    # Recreate mount point and special directories
    mkdir -p /mnt/newroot/proc /mnt/newroot/sys /mnt/newroot/dev
    mkdir -p /mnt/newroot/run /mnt/newroot/tmp /mnt/newroot/mnt
    touch /mnt/newroot/etc/.switched_root

    # Carry the already-mounted virtual filesystems over to the new root.
    mount --move /proc /mnt/newroot/proc
    mount --move /sys /mnt/newroot/sys
    mount --move /dev /mnt/newroot/dev

    echo "[init] Switching root..." >&2
    exec switch_root /mnt/newroot /sbin/init
fi
|
||||
|
||||
# --- PATH setup ---
|
||||
# Ensure /usr/local paths are in PATH (iptables, KubeSolo, etc.)
|
||||
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
|
||||
# iptables shared libraries live in /usr/local/lib
|
||||
export LD_LIBRARY_PATH="/usr/local/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
|
||||
|
||||
# --- Constants ---
|
||||
INIT_LIB="/usr/lib/kubesolo-os"
|
||||
INIT_STAGES="/usr/lib/kubesolo-os/init.d"
|
||||
|
||||
@@ -1,23 +1,62 @@
|
||||
#!/bin/sh
# 00-early-mount.sh — Mount essential virtual filesystems
#
# Each mount is guarded with `mountpoint -q`, so anything that is already
# mounted when this stage runs is left untouched rather than mounted twice.

# After switch_root, /proc /sys /dev are already mounted — only mount if missing
if ! mountpoint -q /proc 2>/dev/null; then
    mount -t proc proc /proc 2>/dev/null || true
fi
if ! mountpoint -q /sys 2>/dev/null; then
    mount -t sysfs sysfs /sys 2>/dev/null || true
fi
if ! mountpoint -q /dev 2>/dev/null; then
    # Fall back to a plain tmpfs when devtmpfs cannot be mounted; the mknod
    # calls below then populate the essential nodes.
    mount -t devtmpfs devtmpfs /dev 2>/dev/null || mount -t tmpfs tmpfs /dev
fi
if ! mountpoint -q /tmp 2>/dev/null; then
    mount -t tmpfs tmpfs /tmp
fi
if ! mountpoint -q /run 2>/dev/null; then
    mount -t tmpfs tmpfs /run
fi

mkdir -p /dev/pts /dev/shm
if ! mountpoint -q /dev/pts 2>/dev/null; then
    mount -t devpts devpts /dev/pts
fi
if ! mountpoint -q /dev/shm 2>/dev/null; then
    mount -t tmpfs tmpfs /dev/shm
fi

# Ensure essential device nodes exist (devtmpfs may be incomplete after switch_root)
[ -e /dev/console ] || mknod -m 600 /dev/console c 5 1 2>/dev/null || true
[ -e /dev/null ]    || mknod -m 666 /dev/null c 1 3 2>/dev/null || true
[ -e /dev/zero ]    || mknod -m 666 /dev/zero c 1 5 2>/dev/null || true
[ -e /dev/kmsg ]    || mknod -m 660 /dev/kmsg c 1 11 2>/dev/null || true
[ -e /dev/random ]  || mknod -m 666 /dev/random c 1 8 2>/dev/null || true
[ -e /dev/urandom ] || mknod -m 666 /dev/urandom c 1 9 2>/dev/null || true
[ -e /dev/tty ]     || mknod -m 666 /dev/tty c 5 0 2>/dev/null || true

# Set up BusyBox mdev as hotplug handler (creates /dev nodes for new devices).
# The write is guarded because a failing `>` redirect is reported by the shell
# before a later `2>/dev/null` on the same command takes effect.
if [ -w /proc/sys/kernel/hotplug ]; then
    echo /sbin/mdev > /proc/sys/kernel/hotplug 2>/dev/null || true
fi
mdev -s 2>/dev/null || true

# Mount cgroup v2 unified hierarchy
mkdir -p /sys/fs/cgroup
if ! mountpoint -q /sys/fs/cgroup 2>/dev/null; then
    mount -t cgroup2 cgroup2 /sys/fs/cgroup 2>/dev/null || true
fi

# Enable ALL available controllers for child cgroups
# Required: memory (memory.max), cpu (cpu.max), pids (pids.max)
# First, move init process to its own cgroup so controllers can be enabled
# (cgroup v2 "no internal process" rule for non-root cgroups)
if [ -f /sys/fs/cgroup/cgroup.controllers ]; then
    mkdir -p /sys/fs/cgroup/init
    echo $$ > /sys/fs/cgroup/init/cgroup.procs 2>/dev/null || true
    # Controllers are enabled one at a time so that one rejected controller
    # does not prevent the others from being delegated.
    for ctrl in $(cat /sys/fs/cgroup/cgroup.controllers); do
        echo "+${ctrl}" > /sys/fs/cgroup/cgroup.subtree_control 2>/dev/null || true
    done
fi

# Mount BPF filesystem (required for cgroup v2 device control via BPF)
mkdir -p /sys/fs/bpf
mount -t bpf bpf /sys/fs/bpf 2>/dev/null || true
|
||||
|
||||
@@ -8,11 +8,26 @@ if [ "$KUBESOLO_NOPERSIST" = "1" ]; then
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Load block device drivers before waiting (modules loaded later in stage 30,
# but we need virtio_blk available NOW for /dev/vda detection)
modprobe virtio_blk 2>/dev/null || true
# Trigger mdev to create device nodes after loading driver
mdev -s 2>/dev/null || true

# Fallback: create device node from sysfs if devtmpfs/mdev didn't.
# The sysfs "dev" attribute is read as "MAJOR:MINOR" and split on the colon.
DEV_NAME="${KUBESOLO_DATA_DEV##*/}"
if [ ! -b "$KUBESOLO_DATA_DEV" ] && [ -f "/sys/class/block/$DEV_NAME/dev" ]; then
    MAJMIN=$(cat "/sys/class/block/$DEV_NAME/dev")
    # Only report success when mknod actually created the node.
    if mknod "$KUBESOLO_DATA_DEV" b "${MAJMIN%%:*}" "${MAJMIN##*:}" 2>/dev/null; then
        log "Created $KUBESOLO_DATA_DEV via mknod ($MAJMIN)"
    else
        log "WARNING: mknod failed for $KUBESOLO_DATA_DEV ($MAJMIN)"
    fi
fi

# Wait for device to appear (USB, slow disks, virtio)
log "Waiting for data device: $KUBESOLO_DATA_DEV"
WAIT_SECS=30
for i in $(seq 1 "$WAIT_SECS"); do
    [ -b "$KUBESOLO_DATA_DEV" ] && break
    # Re-scan each second so a late-appearing device still gets its node.
    mdev -s 2>/dev/null || true
    sleep 1
done
|
||||
|
||||
|
||||
@@ -18,16 +18,16 @@ fi
|
||||
# Fallback: DHCP on first non-loopback interface
|
||||
log "Configuring network via DHCP"
|
||||
|
||||
# Bring up loopback
|
||||
ip link set lo up
|
||||
ip addr add 127.0.0.1/8 dev lo
|
||||
# Bring up loopback (use ifconfig for BusyBox compatibility)
|
||||
ifconfig lo 127.0.0.1 netmask 255.0.0.0 up 2>/dev/null || \
|
||||
{ ip link set lo up 2>/dev/null && ip addr add 127.0.0.1/8 dev lo 2>/dev/null; } || true
|
||||
|
||||
# Find first ethernet interface
|
||||
ETH_DEV=""
|
||||
for iface in /sys/class/net/*; do
|
||||
iface="$(basename "$iface")"
|
||||
case "$iface" in
|
||||
lo|docker*|veth*|br*|cni*) continue ;;
|
||||
lo|docker*|veth*|br*|cni*|dummy*|tunl*|sit*) continue ;;
|
||||
esac
|
||||
ETH_DEV="$iface"
|
||||
break
|
||||
@@ -39,7 +39,7 @@ if [ -z "$ETH_DEV" ]; then
|
||||
fi
|
||||
|
||||
log "Using interface: $ETH_DEV"
|
||||
ip link set "$ETH_DEV" up
|
||||
ifconfig "$ETH_DEV" up 2>/dev/null || ip link set "$ETH_DEV" up 2>/dev/null || true
|
||||
|
||||
# Run DHCP client (BusyBox udhcpc)
|
||||
if command -v udhcpc >/dev/null 2>&1; then
|
||||
|
||||
@@ -31,4 +31,17 @@ hostname "$HOSTNAME"
|
||||
echo "$HOSTNAME" > /etc/hostname
|
||||
echo "127.0.0.1 $HOSTNAME" >> /etc/hosts
|
||||
|
||||
# Generate /etc/machine-id if missing (kubelet requires it)
if [ ! -f /etc/machine-id ]; then
    # Restore the persisted ID only when it is non-empty; an empty file from
    # an earlier failed attempt must not be copied back.
    if [ -s "$DATA_MOUNT/etc-kubesolo/machine-id" ]; then
        cp "$DATA_MOUNT/etc-kubesolo/machine-id" /etc/machine-id
    else
        # Generate from hostname hash (deterministic across reboots).
        # The value is captured and checked for emptiness before it is
        # written: a pipeline's exit status is that of its last command
        # (cut), so a missing md5sum would otherwise go unnoticed and the
        # random-UUID fallback would never run.
        MACHINE_ID="$(printf '%s' "$HOSTNAME" | md5sum 2>/dev/null | cut -d' ' -f1)"
        if [ -z "$MACHINE_ID" ] && [ -r /proc/sys/kernel/random/uuid ]; then
            MACHINE_ID="$(tr -d '-' < /proc/sys/kernel/random/uuid)"
        fi

        if [ -n "$MACHINE_ID" ]; then
            printf '%s\n' "$MACHINE_ID" > /etc/machine-id
            # Persist for next boot
            cp /etc/machine-id "$DATA_MOUNT/etc-kubesolo/machine-id" 2>/dev/null || true
        else
            log "WARNING: could not generate /etc/machine-id"
        fi
    fi
fi
|
||||
|
||||
log_ok "Hostname set to: $HOSTNAME"
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
# This stage exec's KubeSolo as PID 1 (replacing init).
|
||||
# KubeSolo manages containerd, kubelet, API server, and all K8s components.
|
||||
|
||||
KUBESOLO_BIN="/usr/local/bin/kubesolo"
|
||||
KUBESOLO_BIN="/usr/bin/kubesolo"
|
||||
|
||||
if [ ! -x "$KUBESOLO_BIN" ]; then
|
||||
log_err "KubeSolo binary not found at $KUBESOLO_BIN"
|
||||
@@ -12,7 +12,7 @@ if [ ! -x "$KUBESOLO_BIN" ]; then
|
||||
fi
|
||||
|
||||
# Build KubeSolo command line
|
||||
KUBESOLO_ARGS="--path /var/lib/kubesolo --local-storage true"
|
||||
KUBESOLO_ARGS="--path /var/lib/kubesolo --local-storage"
|
||||
|
||||
# Add extra SANs if hostname resolves
|
||||
HOSTNAME="$(hostname)"
|
||||
@@ -30,6 +30,17 @@ if [ -f /etc/kubesolo/extra-flags ]; then
|
||||
KUBESOLO_ARGS="$KUBESOLO_ARGS $(cat /etc/kubesolo/extra-flags)"
|
||||
fi
|
||||
|
||||
# Pre-initialize iptables filter table and base chains.
# KubeSolo's kube-proxy uses iptables-restore (nf_tables backend) which needs
# the filter table to exist. Without this, the first iptables-restore fails
# with "RULE_APPEND failed (No such file or directory)".
if command -v iptables >/dev/null 2>&1; then
    # Listing is best effort and never aborts the boot, but failures are
    # collected so the log reflects what actually happened.
    IPT_FAILED=""
    for tbl in filter nat mangle; do
        iptables -t "$tbl" -L -n >/dev/null 2>&1 || IPT_FAILED="$IPT_FAILED $tbl"
    done
    if [ -z "$IPT_FAILED" ]; then
        log "Pre-initialized iptables tables (filter, nat, mangle)"
    else
        log "WARNING: iptables pre-initialization failed for:$IPT_FAILED"
    fi
fi
|
||||
|
||||
log "Starting KubeSolo: $KUBESOLO_BIN $KUBESOLO_ARGS"
|
||||
log "Kubeconfig will be at: /var/lib/kubesolo/pki/admin/admin.kubeconfig"
|
||||
|
||||
|
||||
@@ -29,11 +29,16 @@ wait_for_file() {
|
||||
return 1
|
||||
}
|
||||
|
||||
# Get IP address of an interface (BusyBox-safe: prefer ifconfig, fall back to ip)
#
# Arguments: $1 - interface name
# Outputs:   the first IPv4 address found on stdout; nothing if none is found
# Returns:   always 0, so callers running under `set -e` are not aborted
get_iface_ip() {
    iface="$1"
    _iface_addr=""

    if command -v ifconfig >/dev/null 2>&1; then
        # Two sed expressions cover both ifconfig output styles:
        # "inet addr:A.B.C.D" and "inet A.B.C.D".
        _iface_addr="$(ifconfig "$iface" 2>/dev/null | \
            sed -n 's/.*inet addr:\([0-9.]*\).*/\1/p;s/.*inet \([0-9.]*\).*/\1/p' | head -1)"
    fi

    # Fall back to ip when ifconfig is missing OR produced no address, and
    # print the result exactly once.
    if [ -z "$_iface_addr" ] && command -v ip >/dev/null 2>&1; then
        _iface_addr="$(ip -4 addr show "$iface" 2>/dev/null | \
            sed -n 's/.*inet \([0-9.]*\).*/\1/p' | head -1)"
    fi

    if [ -n "$_iface_addr" ]; then
        printf '%s\n' "$_iface_addr"
    fi
    return 0
}
|
||||
|
||||
# Check if running in a VM (useful for adjusting timeouts)
|
||||
|
||||
Reference in New Issue
Block a user