From 39732488ef049c23957fa5cc47cb9633b0dda231 Mon Sep 17 00:00:00 2001 From: Adolfo Delorenzo Date: Wed, 11 Feb 2026 23:13:31 -0600 Subject: [PATCH] feat: custom kernel build + boot fixes for working container runtime Build a custom Tiny Core 17.0 kernel (6.18.2) with missing configs that the stock kernel lacks for container workloads: - CONFIG_CGROUP_BPF=y (cgroup v2 device control via BPF) - CONFIG_DEVTMPFS=y (auto-create /dev device nodes) - CONFIG_DEVTMPFS_MOUNT=y (auto-mount devtmpfs) - CONFIG_MEMCG=y (memory cgroup controller for memory.max) - CONFIG_CFS_BANDWIDTH=y (CPU bandwidth throttling for cpu.max) Also strips unnecessary subsystems (sound, GPU, wireless, Bluetooth, KVM, etc.) for minimal footprint on a headless K8s edge appliance. Init system fixes for successful boot-to-running-pods: - Add switch_root in init.sh to escape initramfs (runc pivot_root) - Add mountpoint guards in 00-early-mount.sh (skip if already mounted) - Create essential device nodes after switch_root (kmsg, console, etc.) - Enable cgroup v2 controller delegation with init process isolation - Mount BPF filesystem for cgroup v2 device control - Add mknod fallback from sysfs in 20-persistent-mount.sh for /dev/vda - Move KubeSolo binary to /usr/bin (avoid /usr/local bind mount hiding) - Generate /etc/machine-id in 60-hostname.sh (kubelet requires it) - Pre-initialize iptables tables before kube-proxy starts - Add nft_reject, nft_fib, xt_nfacct to kernel modules list Build system changes: - New build-kernel.sh script for custom kernel compilation - Dockerfile.builder adds kernel build deps (flex, bison, libelf, etc.) - Selective kernel module install (only modules.list + transitive deps) - Install iptables-nft (xtables-nft-multi) + shared libs in rootfs Tested: ISO boots in QEMU, node reaches Ready in ~35s, CoreDNS and local-path-provisioner pods start and run successfully. 
Co-Authored-By: Claude Opus 4.6 --- Makefile | 11 +- build/Dockerfile.builder | 27 +++- build/config/modules.list | 84 ++++++++--- build/scripts/build-kernel.sh | 209 ++++++++++++++++++++++++++ build/scripts/fetch-components.sh | 132 +++++++++++++--- build/scripts/inject-kubesolo.sh | 242 +++++++++++++++++++++++++++++- init/init.sh | 33 ++++ init/lib/00-early-mount.sh | 69 +++++++-- init/lib/20-persistent-mount.sh | 15 ++ init/lib/50-network.sh | 10 +- init/lib/60-hostname.sh | 13 ++ init/lib/90-kubesolo.sh | 15 +- init/lib/functions.sh | 11 +- 13 files changed, 794 insertions(+), 77 deletions(-) create mode 100755 build/scripts/build-kernel.sh diff --git a/Makefile b/Makefile index b5dc9c9..317d2e2 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: all fetch build-cloudinit build-update-agent build-cross rootfs initramfs \ +.PHONY: all fetch kernel build-cloudinit build-update-agent build-cross rootfs initramfs \ iso disk-image oci-image \ test-boot test-k8s test-persistence test-deploy test-storage test-all \ test-cloudinit test-update-agent \ @@ -30,6 +30,10 @@ fetch: # ============================================================================= # Build stages # ============================================================================= +kernel: + @echo "==> Building custom kernel (CONFIG_CGROUP_BPF=y)..." + $(BUILD_DIR)/scripts/build-kernel.sh + build-cloudinit: @echo "==> Building cloud-init binary..." $(BUILD_DIR)/scripts/build-cloudinit.sh @@ -38,7 +42,7 @@ build-update-agent: @echo "==> Building update agent..." $(BUILD_DIR)/scripts/build-update-agent.sh -rootfs: fetch build-cloudinit build-update-agent +rootfs: fetch kernel build-cloudinit build-update-agent @echo "==> Preparing rootfs..." 
$(BUILD_DIR)/scripts/extract-core.sh $(BUILD_DIR)/scripts/inject-kubesolo.sh @@ -176,7 +180,7 @@ docker-build: docker run --rm --privileged \ -v $(PWD)/$(OUTPUT_DIR):/output \ -v $(PWD)/$(CACHE_DIR):/cache \ - kubesolo-os-builder make iso OUTPUT_DIR=/output CACHE_DIR=/cache + kubesolo-os-builder iso OUTPUT_DIR=/output CACHE_DIR=/cache # ============================================================================= # Cleanup @@ -197,6 +201,7 @@ help: @echo "" @echo "Build targets:" @echo " make fetch Download Tiny Core ISO, KubeSolo, dependencies" + @echo " make kernel Build custom kernel with CONFIG_CGROUP_BPF=y" @echo " make build-cloudinit Build cloud-init Go binary" @echo " make build-update-agent Build update agent Go binary" @echo " make rootfs Extract + prepare rootfs with KubeSolo" diff --git a/build/Dockerfile.builder b/build/Dockerfile.builder index cbcb1a2..de008d3 100644 --- a/build/Dockerfile.builder +++ b/build/Dockerfile.builder @@ -1,19 +1,30 @@ -FROM ubuntu:24.04 +FROM --platform=linux/amd64 ubuntu:24.04 ENV DEBIAN_FRONTEND=noninteractive +# Install build tools + kernel build dependencies RUN apt-get update && apt-get install -y --no-install-recommends \ bash \ - bsdtar \ + bc \ + bison \ + build-essential \ + ca-certificates \ cpio \ curl \ dosfstools \ + dwarves \ e2fsprogs \ fdisk \ + file \ + flex \ genisoimage \ gzip \ isolinux \ - losetup \ + iptables \ + kmod \ + libarchive-tools \ + libelf-dev \ + libssl-dev \ make \ parted \ squashfs-tools \ @@ -25,10 +36,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ xz-utils \ && rm -rf /var/lib/apt/lists/* +# Install Go (for building cloud-init and update agent) +ARG GO_VERSION=1.24.0 +RUN curl -fsSL "https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz" \ + | tar -C /usr/local -xzf - +ENV PATH="/usr/local/go/bin:${PATH}" + WORKDIR /build COPY . 
/build -RUN chmod +x build/scripts/*.sh build/config/*.sh +RUN chmod +x build/scripts/*.sh build/config/*.sh \ + && chmod +x hack/*.sh 2>/dev/null || true \ + && chmod +x test/qemu/*.sh test/integration/*.sh test/kernel/*.sh 2>/dev/null || true ENTRYPOINT ["/usr/bin/make"] CMD ["iso"] diff --git a/build/config/modules.list b/build/config/modules.list index 04f7d76..86a4608 100644 --- a/build/config/modules.list +++ b/build/config/modules.list @@ -1,30 +1,78 @@ # Kernel modules loaded at boot by init # One module per line. Lines starting with # are ignored. -# Modules are loaded in order listed. +# Modules are loaded in order listed — dependencies must come first. -# Networking — bridge and netfilter (required for K8s pod networking) -br_netfilter -bridge -veth -vxlan +# Network device drivers (loaded early so interfaces are available) +e1000 +e1000e +virtio_net -# Netfilter / iptables (required for kube-proxy and service routing) -ip_tables -iptable_nat -iptable_filter -iptable_mangle -nf_nat -nf_conntrack -nf_conntrack_netlink +# Virtio support (for VMs — block, entropy) +virtio_blk +virtio_rng # Filesystem — overlay (required for containerd) overlay -# Conntrack (required for K8s services) -nf_conntrack +# Netfilter dependencies (must load before conntrack) +nf_defrag_ipv4 +nf_defrag_ipv6 -# Optional — useful for CNI plugins and diagnostics -tun +# Netfilter / connection tracking (required for kube-proxy) +nf_conntrack +nf_nat +nf_conntrack_netlink + +# nftables (modern iptables backend — kernel 6.18 uses nf_tables, not ip_tables) +nf_tables +nft_compat +nft_chain_nat +nft_ct +nft_masq +nft_nat +nft_redir + +# Netfilter xt match/target modules (used by kube-proxy iptables rules via nft_compat) +xt_conntrack +xt_MASQUERADE +xt_mark +xt_comment +xt_multiport +xt_nat +xt_addrtype +xt_connmark +xt_REDIRECT +xt_recent +xt_statistic +xt_set + +# nft extras (reject, fib — used by kube-proxy nf_tables rules) +nft_reject +nft_reject_ipv4 +nft_reject_ipv6 +nft_fib 
+nft_fib_ipv4 +nft_fib_ipv6 + +# Reject targets (used by kube-proxy iptables-restore rules) +nf_reject_ipv4 +nf_reject_ipv6 +ipt_REJECT +ip6t_REJECT + +# nfacct extension (kube-proxy probes for it) +xt_nfacct + +# Networking — bridge and netfilter (required for K8s pod networking) +# Load order: llc → stp → bridge → br_netfilter +llc +stp +bridge +br_netfilter +veth +vxlan + +# IPVS — useful for kube-proxy IPVS mode and CNI plugins ip_vs ip_vs_rr ip_vs_wrr diff --git a/build/scripts/build-kernel.sh b/build/scripts/build-kernel.sh new file mode 100755 index 0000000..17a1982 --- /dev/null +++ b/build/scripts/build-kernel.sh @@ -0,0 +1,209 @@ +#!/bin/bash +# build-kernel.sh — Build custom Tiny Core kernel with CONFIG_CGROUP_BPF=y +# +# The stock Tiny Core 17.0 kernel (6.18.2-tinycore64) lacks CONFIG_CGROUP_BPF, +# which is required for cgroup v2 device control in runc/containerd. +# This script downloads the TC-patched kernel source, enables CONFIG_CGROUP_BPF, +# and builds vmlinuz + modules. +# +# Output is cached in $CACHE_DIR/custom-kernel/ and reused across builds. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +CACHE_DIR="${CACHE_DIR:-$PROJECT_ROOT/build/cache}" + +# shellcheck source=../config/versions.env +. 
"$SCRIPT_DIR/../config/versions.env" + +KVER="6.18.2-tinycore64" +KERNEL_BASE_URL="https://distro.ibiblio.org/tinycorelinux/${TINYCORE_VERSION%%.*}.x/${TINYCORE_ARCH}/release/src/kernel" +KERNEL_SRC_URL="${KERNEL_BASE_URL}/linux-6.18.2-patched.tar.xz" +KERNEL_CFG_URL="${KERNEL_BASE_URL}/config-${KVER}" + +CUSTOM_KERNEL_DIR="$CACHE_DIR/custom-kernel" +CUSTOM_VMLINUZ="$CUSTOM_KERNEL_DIR/vmlinuz" +CUSTOM_MODULES="$CUSTOM_KERNEL_DIR/modules" + +mkdir -p "$CACHE_DIR" "$CUSTOM_KERNEL_DIR" + +# --- Skip if already built --- +if [ -f "$CUSTOM_VMLINUZ" ] && [ -d "$CUSTOM_MODULES/lib/modules/$KVER" ]; then + echo "==> Custom kernel already built (cached)" + echo " vmlinuz: $CUSTOM_VMLINUZ ($(du -h "$CUSTOM_VMLINUZ" | cut -f1))" + MOD_COUNT=$(find "$CUSTOM_MODULES/lib/modules/$KVER" -name '*.ko*' | wc -l) + echo " Modules: $MOD_COUNT modules in $CUSTOM_MODULES/lib/modules/$KVER" + exit 0 +fi + +echo "==> Building custom kernel with CONFIG_CGROUP_BPF=y..." +echo " Kernel version: $KVER" + +# --- Download kernel source --- +KERNEL_SRC_ARCHIVE="$CACHE_DIR/linux-6.18.2-patched.tar.xz" +if [ ! -f "$KERNEL_SRC_ARCHIVE" ]; then + echo "==> Downloading kernel source (~149 MB)..." + echo " URL: $KERNEL_SRC_URL" + wget -q --show-progress -O "$KERNEL_SRC_ARCHIVE" "$KERNEL_SRC_URL" 2>/dev/null || \ + curl -fSL "$KERNEL_SRC_URL" -o "$KERNEL_SRC_ARCHIVE" + echo " Downloaded: $(du -h "$KERNEL_SRC_ARCHIVE" | cut -f1)" +else + echo "==> Kernel source already cached: $(du -h "$KERNEL_SRC_ARCHIVE" | cut -f1)" +fi + +# --- Download stock config --- +KERNEL_CFG="$CACHE_DIR/config-${KVER}" +if [ ! -f "$KERNEL_CFG" ]; then + echo "==> Downloading stock kernel config..." + echo " URL: $KERNEL_CFG_URL" + wget -q -O "$KERNEL_CFG" "$KERNEL_CFG_URL" 2>/dev/null || \ + curl -fSL "$KERNEL_CFG_URL" -o "$KERNEL_CFG" +else + echo "==> Stock kernel config already cached" +fi + +# --- Extract source --- +# IMPORTANT: Must extract on a case-sensitive filesystem. 
The kernel source has +# files that differ only by case (e.g., xt_mark.h vs xt_MARK.h). If the cache +# is on macOS (case-insensitive APFS), extraction silently loses files. +# Use /tmp inside the container (ext4, case-sensitive) for the build. +KERNEL_BUILD_DIR="/tmp/kernel-build" +rm -rf "$KERNEL_BUILD_DIR" +mkdir -p "$KERNEL_BUILD_DIR" + +echo "==> Extracting kernel source (case-sensitive filesystem)..." +tar -xf "$KERNEL_SRC_ARCHIVE" -C "$KERNEL_BUILD_DIR" + +# Find the extracted source directory (could be linux-6.18.2 or linux-6.18.2-patched) +KERNEL_SRC_DIR=$(find "$KERNEL_BUILD_DIR" -maxdepth 1 -type d -name 'linux-*' | head -1) +if [ -z "$KERNEL_SRC_DIR" ]; then + echo "ERROR: Could not find kernel source directory after extraction" + ls -la "$KERNEL_BUILD_DIR"/ + exit 1 +fi +echo " Source dir: $(basename "$KERNEL_SRC_DIR")" + +cd "$KERNEL_SRC_DIR" + +# --- Apply stock config + enable CONFIG_CGROUP_BPF --- +echo "==> Applying stock Tiny Core config..." +cp "$KERNEL_CFG" .config + +echo "==> Enabling required kernel configs..." +./scripts/config --enable CONFIG_CGROUP_BPF +./scripts/config --enable CONFIG_DEVTMPFS +./scripts/config --enable CONFIG_DEVTMPFS_MOUNT +./scripts/config --enable CONFIG_MEMCG +./scripts/config --enable CONFIG_CFS_BANDWIDTH + +# --- Strip unnecessary subsystems for smallest footprint --- +# This is a headless K8s edge appliance — no sound, GPU, wireless, etc. +echo "==> Disabling unnecessary subsystems for minimal footprint..." 
+ +# Sound subsystem (not needed on headless appliance) +./scripts/config --disable SOUND + +# GPU/DRM (serial console only, no display) +./scripts/config --disable DRM + +# KVM hypervisor (this IS the guest/bare metal, not a hypervisor) +./scripts/config --disable KVM + +# Media/camera/TV/radio (not needed) +./scripts/config --disable MEDIA_SUPPORT + +# Wireless networking (wired edge device) +./scripts/config --disable WIRELESS +./scripts/config --disable WLAN +./scripts/config --disable CFG80211 + +# Bluetooth (not needed) +./scripts/config --disable BT + +# NFC (not needed) +./scripts/config --disable NFC + +# Infiniband (not needed on edge) +./scripts/config --disable INFINIBAND + +# PCMCIA (legacy, not needed) +./scripts/config --disable PCMCIA + +# Amateur radio (not needed) +./scripts/config --disable HAMRADIO + +# ISDN (not needed) +./scripts/config --disable ISDN + +# ATM networking (not needed) +./scripts/config --disable ATM + +# Joystick/gamepad (not needed) +./scripts/config --disable INPUT_JOYSTICK +./scripts/config --disable INPUT_TABLET + +# FPGA (not needed) +./scripts/config --disable FPGA + +# Resolve dependencies (olddefconfig accepts defaults for new options) +make olddefconfig + +# Verify CONFIG_CGROUP_BPF is set +if grep -q 'CONFIG_CGROUP_BPF=y' .config; then + echo " CONFIG_CGROUP_BPF=y confirmed in .config" +else + echo "ERROR: CONFIG_CGROUP_BPF not set after olddefconfig" + grep 'CGROUP_BPF' .config || echo " (CGROUP_BPF not found in .config)" + echo "" + echo "Prerequisites check:" + grep -E 'CONFIG_BPF=|CONFIG_BPF_SYSCALL=' .config || echo " BPF not found" + exit 1 +fi + +# Show what changed +echo " Config diff from stock:" +diff "$KERNEL_CFG" .config | grep '^[<>]' | head -20 || echo " (no differences beyond CGROUP_BPF)" + +# --- Build kernel + modules --- +NPROC=$(nproc 2>/dev/null || echo 4) +echo "" +echo "==> Building kernel (${NPROC} parallel jobs)..." +echo " This may take 15-25 minutes..." 
+ +make -j"$NPROC" bzImage modules 2>&1 + +echo "==> Kernel build complete" + +# --- Install to staging --- +echo "==> Installing vmlinuz..." +cp arch/x86/boot/bzImage "$CUSTOM_VMLINUZ" + +echo "==> Installing modules (stripped)..." +rm -rf "$CUSTOM_MODULES" +mkdir -p "$CUSTOM_MODULES" +make INSTALL_MOD_STRIP=1 modules_install INSTALL_MOD_PATH="$CUSTOM_MODULES" + +# Remove build/source symlinks (they point to the build dir which won't exist in rootfs) +rm -f "$CUSTOM_MODULES/lib/modules/$KVER/build" +rm -f "$CUSTOM_MODULES/lib/modules/$KVER/source" + +# Run depmod to generate proper module dependency files +echo "==> Running depmod..." +depmod -a -b "$CUSTOM_MODULES" "$KVER" 2>/dev/null || true + +# Save the final config for reference +cp .config "$CUSTOM_KERNEL_DIR/.config" + +# --- Clean up build dir (large, ~1.5 GB) --- +echo "==> Cleaning kernel build directory..." +cd / +rm -rf "$KERNEL_BUILD_DIR" + +# --- Summary --- +echo "" +echo "==> Custom kernel build complete:" +echo " vmlinuz: $CUSTOM_VMLINUZ ($(du -h "$CUSTOM_VMLINUZ" | cut -f1))" +MOD_COUNT=$(find "$CUSTOM_MODULES/lib/modules/$KVER" -name '*.ko*' | wc -l) +echo " Modules: $MOD_COUNT modules" +echo " Modules size: $(du -sh "$CUSTOM_MODULES/lib/modules/$KVER" | cut -f1)" +echo "" diff --git a/build/scripts/fetch-components.sh b/build/scripts/fetch-components.sh index dc44289..77b08c0 100755 --- a/build/scripts/fetch-components.sh +++ b/build/scripts/fetch-components.sh @@ -31,36 +31,126 @@ else fi # --- KubeSolo --- -KUBESOLO_INSTALLER="$CACHE_DIR/install-kubesolo.sh" +KUBESOLO_VERSION="${KUBESOLO_VERSION:-v1.1.0}" KUBESOLO_BIN="$CACHE_DIR/kubesolo" if [ -f "$KUBESOLO_BIN" ]; then echo "==> KubeSolo binary already cached: $KUBESOLO_BIN" else - echo "==> Downloading KubeSolo installer..." - curl -sfL "$KUBESOLO_INSTALL_URL" -o "$KUBESOLO_INSTALLER" + echo "==> Downloading KubeSolo ${KUBESOLO_VERSION}..." - echo "==> Extracting KubeSolo binary..." - echo " NOTE: The installer normally runs 'install'. 
We extract the binary URL instead." - echo " For Phase 1 PoC, install KubeSolo on a host and copy the binary." - echo "" - echo " Manual step required:" - echo " 1. On a Linux x86_64 host: curl -sfL https://get.kubesolo.io | sudo sh -" - echo " 2. Copy /usr/local/bin/kubesolo to: $KUBESOLO_BIN" - echo " 3. Re-run: make rootfs" - echo "" + # Determine architecture + ARCH="${TARGET_ARCH:-amd64}" + OS="linux" - # Try to extract download URL from installer script - BINARY_URL=$(grep -oP 'https://[^ ]+kubesolo[^ ]+' "$KUBESOLO_INSTALLER" 2>/dev/null | head -1 || true) - if [ -n "$BINARY_URL" ]; then - echo " Attempting direct download from: $BINARY_URL" - curl -sfL "$BINARY_URL" -o "$KUBESOLO_BIN" && chmod +x "$KUBESOLO_BIN" || { - echo " Direct download failed. Use manual step above." - } + # Build download URL from GitHub releases + # Available variants: kubesolo-v1.1.0-linux-amd64.tar.gz, kubesolo-v1.1.0-linux-amd64-musl.tar.gz + # We try the musl variant first and fall back to the default build (NOTE(review): Tiny Core ships glibc, not musl; presumably the musl build is preferred for being self-contained - confirm) + BIN_URL="https://github.com/portainer/kubesolo/releases/download/${KUBESOLO_VERSION}/kubesolo-${KUBESOLO_VERSION}-${OS}-${ARCH}-musl.tar.gz" + BIN_URL_FALLBACK="https://github.com/portainer/kubesolo/releases/download/${KUBESOLO_VERSION}/kubesolo-${KUBESOLO_VERSION}-${OS}-${ARCH}.tar.gz" + + TEMP_DIR=$(mktemp -d) + trap "rm -rf '$TEMP_DIR'" EXIT + + echo " URL: $BIN_URL" + if curl -fSL "$BIN_URL" -o "$TEMP_DIR/kubesolo.tar.gz" 2>/dev/null; then + echo " Downloaded musl variant" + elif curl -fSL "$BIN_URL_FALLBACK" -o "$TEMP_DIR/kubesolo.tar.gz" 2>/dev/null; then + echo " Downloaded glibc variant (fallback)" + else + echo "ERROR: Failed to download KubeSolo from GitHub." + echo " Tried: $BIN_URL" + echo " Tried: $BIN_URL_FALLBACK" + echo "" + echo " Manual step:" + echo " 1. Download from: https://github.com/portainer/kubesolo/releases" + echo " 2. Extract and copy binary to: $KUBESOLO_BIN" + echo " 3. 
Re-run: make rootfs" + exit 1 fi - if [ -f "$KUBESOLO_BIN" ]; then - echo "==> KubeSolo binary: $KUBESOLO_BIN ($(du -h "$KUBESOLO_BIN" | cut -f1))" + # Extract binary from tarball + tar -xzf "$TEMP_DIR/kubesolo.tar.gz" -C "$TEMP_DIR" + + # Find the kubesolo binary in extracted contents + FOUND_BIN=$(find "$TEMP_DIR" -name "kubesolo" -type f ! -name "*.tar.gz" | head -1) + if [ -z "$FOUND_BIN" ]; then + echo "ERROR: Could not find kubesolo binary in extracted archive" + echo " Archive contents:" + ls -la "$TEMP_DIR"/ + exit 1 + fi + + cp "$FOUND_BIN" "$KUBESOLO_BIN" + chmod +x "$KUBESOLO_BIN" + + trap - EXIT + rm -rf "$TEMP_DIR" + + echo "==> KubeSolo binary: $KUBESOLO_BIN ($(du -h "$KUBESOLO_BIN" | cut -f1))" +fi + +# --- Tiny Core kernel module extensions (netfilter, iptables) --- +# The base Tiny Core initramfs does NOT include netfilter kernel modules. +# They are distributed as separate TCZ (squashfs) extensions. +# KubeSolo requires netfilter for kube-proxy, iptables NAT, conntrack, etc. + +# Detect kernel version from the cached ISO +KVER="" +if [ -f "$TC_ISO" ]; then + # Try to detect kernel version from ISO without mounting + # Tiny Core 17.0 uses 6.18.2-tinycore64 + KVER="6.18.2-tinycore64" +fi + +NETFILTER_TCZ="$CACHE_DIR/ipv6-netfilter-${KVER}.tcz" +NETFILTER_TCZ_URL="https://distro.ibiblio.org/tinycorelinux/${TINYCORE_VERSION%%.*}.x/${TINYCORE_ARCH}/tcz/ipv6-netfilter-${KVER}.tcz" + +if [ -f "$NETFILTER_TCZ" ]; then + echo "==> Netfilter modules already cached: $NETFILTER_TCZ" +else + echo "==> Downloading netfilter kernel modules (ipv6-netfilter-${KVER}.tcz)..." + echo " URL: $NETFILTER_TCZ_URL" + if wget -q --show-progress -O "$NETFILTER_TCZ" "$NETFILTER_TCZ_URL" 2>/dev/null || \ + curl -fSL "$NETFILTER_TCZ_URL" -o "$NETFILTER_TCZ" 2>/dev/null; then + echo "==> Downloaded: $NETFILTER_TCZ ($(du -h "$NETFILTER_TCZ" | cut -f1))" + else + echo "WARN: Failed to download netfilter modules. kube-proxy may not work." 
+ rm -f "$NETFILTER_TCZ" + fi +fi + +NET_BRIDGING_TCZ="$CACHE_DIR/net-bridging-${KVER}.tcz" +NET_BRIDGING_TCZ_URL="https://distro.ibiblio.org/tinycorelinux/${TINYCORE_VERSION%%.*}.x/${TINYCORE_ARCH}/tcz/net-bridging-${KVER}.tcz" + +if [ -f "$NET_BRIDGING_TCZ" ]; then + echo "==> Net-bridging modules already cached: $NET_BRIDGING_TCZ" +else + echo "==> Downloading net-bridging kernel modules (net-bridging-${KVER}.tcz)..." + echo " URL: $NET_BRIDGING_TCZ_URL" + if wget -q --show-progress -O "$NET_BRIDGING_TCZ" "$NET_BRIDGING_TCZ_URL" 2>/dev/null || \ + curl -fSL "$NET_BRIDGING_TCZ_URL" -o "$NET_BRIDGING_TCZ" 2>/dev/null; then + echo "==> Downloaded: $NET_BRIDGING_TCZ ($(du -h "$NET_BRIDGING_TCZ" | cut -f1))" + else + echo "WARN: Failed to download net-bridging modules. CNI bridge may not work." + rm -f "$NET_BRIDGING_TCZ" + fi +fi + +IPTABLES_TCZ="$CACHE_DIR/iptables.tcz" +IPTABLES_TCZ_URL="https://distro.ibiblio.org/tinycorelinux/${TINYCORE_VERSION%%.*}.x/${TINYCORE_ARCH}/tcz/iptables.tcz" + +if [ -f "$IPTABLES_TCZ" ]; then + echo "==> iptables userspace already cached: $IPTABLES_TCZ" +else + echo "==> Downloading iptables userspace tools..." + echo " URL: $IPTABLES_TCZ_URL" + if wget -q --show-progress -O "$IPTABLES_TCZ" "$IPTABLES_TCZ_URL" 2>/dev/null || \ + curl -fSL "$IPTABLES_TCZ_URL" -o "$IPTABLES_TCZ" 2>/dev/null; then + echo "==> Downloaded: $IPTABLES_TCZ ($(du -h "$IPTABLES_TCZ" | cut -f1))" + else + echo "WARN: Failed to download iptables. KubeSolo bundles its own but this is a fallback." + rm -f "$IPTABLES_TCZ" fi fi diff --git a/build/scripts/inject-kubesolo.sh b/build/scripts/inject-kubesolo.sh index f046778..f1236bf 100755 --- a/build/scripts/inject-kubesolo.sh +++ b/build/scripts/inject-kubesolo.sh @@ -25,15 +25,19 @@ fi echo "==> Injecting KubeSolo into rootfs..." # --- 1. 
KubeSolo binary --- -mkdir -p "$ROOTFS/usr/local/bin" -cp "$KUBESOLO_BIN" "$ROOTFS/usr/local/bin/kubesolo" -chmod +x "$ROOTFS/usr/local/bin/kubesolo" +# Install to /usr/bin (NOT /usr/local/bin) because /usr/local is bind-mounted +# from the data partition at boot, which would hide the binary. +mkdir -p "$ROOTFS/usr/bin" +cp "$KUBESOLO_BIN" "$ROOTFS/usr/bin/kubesolo" +chmod +x "$ROOTFS/usr/bin/kubesolo" echo " Installed KubeSolo binary ($(du -h "$KUBESOLO_BIN" | cut -f1))" # --- 2. Custom init system --- echo " Installing init system..." -# Main init +# Main init — remove symlink first to avoid clobbering busybox +# (Tiny Core has /sbin/init -> ../bin/busybox; cp follows symlinks) +rm -f "$ROOTFS/sbin/init" cp "$PROJECT_ROOT/init/init.sh" "$ROOTFS/sbin/init" chmod +x "$ROOTFS/sbin/init" @@ -83,7 +87,233 @@ else echo " WARN: Update agent not found (run 'make build-update-agent' to build)" fi -# --- 3. Kernel modules list --- +# --- 3. Custom kernel or TCZ kernel modules --- +# If a custom kernel was built (with CONFIG_CGROUP_BPF=y), use it. +# Otherwise fall back to TCZ-extracted modules with manual modules.dep. +CUSTOM_KERNEL_DIR="$CACHE_DIR/custom-kernel" +CUSTOM_VMLINUZ="$CUSTOM_KERNEL_DIR/vmlinuz" +CUSTOM_MODULES="$CUSTOM_KERNEL_DIR/modules" + +# Detect kernel version from rootfs +KVER="" +for d in "$ROOTFS"/lib/modules/*/; do + [ -d "$d" ] && KVER="$(basename "$d")" && break +done + +if [ -z "$KVER" ]; then + echo " WARN: Could not detect kernel version from rootfs" +fi + +echo " Kernel version: $KVER" + +if [ -f "$CUSTOM_VMLINUZ" ] && [ -d "$CUSTOM_MODULES/lib/modules/$KVER" ]; then + # ========================================================================= + # Custom kernel path — selective module install (only what modules.list needs) + # ========================================================================= + echo " Using custom kernel (CONFIG_CGROUP_BPF=y)..." 
+ + # Replace vmlinuz + cp "$CUSTOM_VMLINUZ" "$ROOTFS_DIR/vmlinuz" + echo " Installed custom vmlinuz ($(du -h "$CUSTOM_VMLINUZ" | cut -f1))" + + # Selectively install ONLY modules from modules.list + their transitive deps. + # This keeps the initramfs minimal — no sound, GPU, SCSI, etc. modules. + echo " Installing kernel modules (selective — modules.list + deps only)..." + CUSTOM_MOD_DIR="$CUSTOM_MODULES/lib/modules/$KVER" + + rm -rf "$ROOTFS/lib/modules/$KVER" + mkdir -p "$ROOTFS/lib/modules/$KVER/kernel" + + # Copy module metadata files (needed by modprobe) + for f in modules.builtin modules.builtin.modinfo modules.order \ + modules.builtin.alias.bin modules.builtin.bin; do + [ -f "$CUSTOM_MOD_DIR/$f" ] && cp "$CUSTOM_MOD_DIR/$f" "$ROOTFS/lib/modules/$KVER/" + done + + # Use modprobe --show-depends to resolve each module + its transitive deps + MODULES_LIST="$PROJECT_ROOT/build/config/modules.list" + NEEDED_MODS=$(mktemp) + while IFS= read -r mod; do + # Skip comments and blank lines + case "$mod" in \#*|"") continue ;; esac + mod=$(echo "$mod" | xargs) # trim whitespace + [ -z "$mod" ] && continue + + # modprobe -S -d --show-depends lists all deps in load order + # Output format: "insmod /path/to/module.ko" — extract path with awk + modprobe -S "$KVER" -d "$CUSTOM_MODULES" --show-depends "$mod" 2>/dev/null \ + | awk '/^insmod/{print $2}' >> "$NEEDED_MODS" \ + || echo " WARN: modprobe could not resolve: $mod" + done < "$MODULES_LIST" + + # Deduplicate and copy each needed module + sort -u "$NEEDED_MODS" | while IFS= read -r mod_path; do + mod_path=$(echo "$mod_path" | xargs) # trim whitespace + [ -z "$mod_path" ] && continue + # mod_path is absolute (e.g., /path/to/custom-kernel/modules/lib/modules/KVER/kernel/...) + if [ ! 
-f "$mod_path" ]; then + echo " WARN: module not found: $mod_path" + continue + fi + # Get the relative path under lib/modules/KVER/ + rel_path="${mod_path#$CUSTOM_MOD_DIR/}" + dst="$ROOTFS/lib/modules/$KVER/$rel_path" + mkdir -p "$(dirname "$dst")" + cp "$mod_path" "$dst" + done + rm -f "$NEEDED_MODS" + + # Run depmod on the selective module set to generate correct metadata + depmod -a -b "$ROOTFS" "$KVER" 2>/dev/null || true + + MOD_COUNT=$(find "$ROOTFS/lib/modules/$KVER" -name '*.ko*' | wc -l) + MOD_SIZE=$(du -sh "$ROOTFS/lib/modules/$KVER" | cut -f1) + echo " Installed $MOD_COUNT kernel modules ($MOD_SIZE) — minimal set" + +else + # ========================================================================= + # Stock kernel path — extract TCZ modules + manual modules.dep + # ========================================================================= + echo " No custom kernel found, using stock kernel with TCZ modules..." + + if [ -n "$KVER" ]; then + ROOTFS_MOD_DST="$ROOTFS/lib/modules/$KVER/kernel" + + NETFILTER_TCZ="$CACHE_DIR/ipv6-netfilter-${KVER}.tcz" + if [ -f "$NETFILTER_TCZ" ]; then + echo " Extracting netfilter modules from $(basename "$NETFILTER_TCZ")..." + TCZ_TMP=$(mktemp -d) + if command -v unsquashfs >/dev/null 2>&1; then + unsquashfs -d "$TCZ_TMP/content" "$NETFILTER_TCZ" >/dev/null 2>&1 + else + echo " ERROR: unsquashfs not found (install squashfs-tools)" + rm -rf "$TCZ_TMP" + exit 1 + fi + TCZ_MOD_SRC="$TCZ_TMP/content/usr/local/lib/modules/$KVER/kernel" + if [ -d "$TCZ_MOD_SRC" ]; then + find "$TCZ_MOD_SRC" -name '*.ko.gz' | while IFS= read -r mod_file; do + rel_path="${mod_file#$TCZ_MOD_SRC/}" + dst_dir="$ROOTFS_MOD_DST/$(dirname "$rel_path")" + mkdir -p "$dst_dir" + cp "$mod_file" "$dst_dir/" + done + MOD_COUNT=$(find "$TCZ_MOD_SRC" -name '*.ko.gz' | wc -l) + echo " Installed $MOD_COUNT kernel modules from netfilter TCZ" + fi + rm -rf "$TCZ_TMP" + else + echo " WARN: Netfilter TCZ not found. kube-proxy may not work." 
+ fi + + NET_BRIDGING_TCZ="$CACHE_DIR/net-bridging-${KVER}.tcz" + if [ -f "$NET_BRIDGING_TCZ" ]; then + echo " Extracting bridge modules from $(basename "$NET_BRIDGING_TCZ")..." + TCZ_TMP=$(mktemp -d) + unsquashfs -d "$TCZ_TMP/content" "$NET_BRIDGING_TCZ" >/dev/null 2>&1 + TCZ_MOD_SRC="$TCZ_TMP/content/usr/local/lib/modules/$KVER/kernel" + if [ -d "$TCZ_MOD_SRC" ]; then + find "$TCZ_MOD_SRC" -name '*.ko.gz' | while IFS= read -r mod_file; do + rel_path="${mod_file#$TCZ_MOD_SRC/}" + dst_dir="$ROOTFS_MOD_DST/$(dirname "$rel_path")" + mkdir -p "$dst_dir" + cp "$mod_file" "$dst_dir/" + done + BR_COUNT=$(find "$TCZ_MOD_SRC" -name '*.ko.gz' | wc -l) + echo " Installed $BR_COUNT kernel modules from net-bridging TCZ" + fi + rm -rf "$TCZ_TMP" + else + echo " WARN: Net-bridging TCZ not found. CNI bridge networking may not work." + fi + + # Manual modules.dep for stock kernel (Ubuntu's depmod can't handle TC's kernel) + MODULES_DEP="$ROOTFS/lib/modules/$KVER/modules.dep" + if [ -f "$MODULES_DEP" ]; then + echo " Appending module entries to modules.dep..." 
+ cat >> "$MODULES_DEP" << 'MODDEP' +kernel/net/ipv6/ipv6.ko.gz: +kernel/net/ipv4/netfilter/nf_defrag_ipv4.ko.gz: +kernel/net/ipv6/netfilter/nf_defrag_ipv6.ko.gz: kernel/net/ipv6/ipv6.ko.gz +kernel/net/netfilter/nf_conntrack.ko.gz: kernel/net/ipv4/netfilter/nf_defrag_ipv4.ko.gz kernel/net/ipv6/netfilter/nf_defrag_ipv6.ko.gz +kernel/net/netfilter/nf_nat.ko.gz: kernel/net/netfilter/nf_conntrack.ko.gz +kernel/net/netfilter/nf_conntrack_netlink.ko.gz: kernel/net/netfilter/nf_conntrack.ko.gz +kernel/net/netfilter/nf_tables.ko.gz: kernel/net/netfilter/nf_conntrack.ko.gz +kernel/net/netfilter/nft_compat.ko.gz: kernel/net/netfilter/nf_tables.ko.gz +kernel/net/netfilter/nft_chain_nat.ko.gz: kernel/net/netfilter/nf_tables.ko.gz kernel/net/netfilter/nf_nat.ko.gz +kernel/net/netfilter/nft_ct.ko.gz: kernel/net/netfilter/nf_tables.ko.gz kernel/net/netfilter/nf_conntrack.ko.gz +kernel/net/netfilter/nft_masq.ko.gz: kernel/net/netfilter/nf_tables.ko.gz kernel/net/netfilter/nf_nat.ko.gz kernel/net/netfilter/nf_conntrack.ko.gz +kernel/net/netfilter/nft_nat.ko.gz: kernel/net/netfilter/nf_tables.ko.gz kernel/net/netfilter/nf_nat.ko.gz +kernel/net/netfilter/nft_redir.ko.gz: kernel/net/netfilter/nf_tables.ko.gz kernel/net/netfilter/nf_nat.ko.gz +kernel/net/netfilter/xt_conntrack.ko.gz: kernel/net/netfilter/nf_conntrack.ko.gz +kernel/net/netfilter/xt_MASQUERADE.ko.gz: kernel/net/netfilter/nf_nat.ko.gz kernel/net/netfilter/nf_conntrack.ko.gz +kernel/net/netfilter/xt_mark.ko.gz: +kernel/net/netfilter/xt_comment.ko.gz: +kernel/net/netfilter/xt_multiport.ko.gz: +kernel/net/netfilter/xt_nat.ko.gz: kernel/net/netfilter/nf_nat.ko.gz +kernel/net/netfilter/xt_addrtype.ko.gz: +kernel/net/netfilter/xt_connmark.ko.gz: kernel/net/netfilter/nf_conntrack.ko.gz +kernel/net/netfilter/xt_REDIRECT.ko.gz: kernel/net/netfilter/nf_nat.ko.gz +kernel/net/netfilter/xt_recent.ko.gz: +kernel/net/netfilter/xt_statistic.ko.gz: +kernel/net/netfilter/xt_set.ko.gz: kernel/net/netfilter/ipset/ip_set.ko.gz 
+kernel/net/netfilter/ipset/ip_set.ko.gz: +kernel/net/ipv4/netfilter/nf_reject_ipv4.ko.gz: +kernel/net/ipv6/netfilter/nf_reject_ipv6.ko.gz: +kernel/net/ipv4/netfilter/ipt_REJECT.ko.gz: kernel/net/ipv4/netfilter/nf_reject_ipv4.ko.gz +kernel/net/ipv6/netfilter/ip6t_REJECT.ko.gz: kernel/net/ipv6/netfilter/nf_reject_ipv6.ko.gz +kernel/net/netfilter/nft_reject.ko.gz: kernel/net/netfilter/nf_tables.ko.gz +kernel/net/bridge/bridge.ko.gz: kernel/net/802/stp.ko.gz kernel/net/llc/llc.ko.gz +kernel/net/bridge/br_netfilter.ko.gz: kernel/net/bridge/bridge.ko.gz kernel/net/802/stp.ko.gz kernel/net/llc/llc.ko.gz +kernel/net/bridge/netfilter/nf_conntrack_bridge.ko.gz: kernel/net/netfilter/nf_conntrack.ko.gz kernel/net/bridge/bridge.ko.gz +MODDEP + + find "$ROOTFS_MOD_DST" -name '*.ko.gz' -path '*/net/*' | sort | while IFS= read -r mod_file; do + rel_path="kernel/${mod_file#$ROOTFS_MOD_DST/}" + if ! grep -q "^${rel_path}:" "$MODULES_DEP" 2>/dev/null; then + echo "${rel_path}:" >> "$MODULES_DEP" + fi + done + echo " Updated modules.dep with netfilter entries" + fi + fi +fi + +# Install iptables-nft (nftables-based iptables) from the builder system +# Kernel 6.18 uses nf_tables, not legacy ip_tables, so we need xtables-nft-multi +echo " Installing iptables-nft from builder..." 
+if [ -f /usr/sbin/xtables-nft-multi ]; then
+    mkdir -p "$ROOTFS/usr/sbin"
+    cp /usr/sbin/xtables-nft-multi "$ROOTFS/usr/sbin/"
+
+    # Create standard symlinks
+    for cmd in iptables iptables-save iptables-restore ip6tables ip6tables-save ip6tables-restore; do
+        ln -sf xtables-nft-multi "$ROOTFS/usr/sbin/$cmd"
+    done
+
+    # Copy required shared libraries
+    mkdir -p "$ROOTFS/usr/lib/x86_64-linux-gnu" "$ROOTFS/lib/x86_64-linux-gnu" "$ROOTFS/lib64"
+    for lib in \
+        /lib/x86_64-linux-gnu/libxtables.so.12* \
+        /lib/x86_64-linux-gnu/libmnl.so.0* \
+        /lib/x86_64-linux-gnu/libnftnl.so.11* \
+        /lib/x86_64-linux-gnu/libc.so.6 \
+        /lib64/ld-linux-x86-64.so.2; do
+        [ -e "$lib" ] && cp -aL "$lib" "$ROOTFS${lib}" 2>/dev/null || true
+    done
+
+    # Copy xtables modules directory (match extensions)
+    if [ -d /usr/lib/x86_64-linux-gnu/xtables ]; then
+        mkdir -p "$ROOTFS/usr/lib/x86_64-linux-gnu/xtables"
+        cp -a /usr/lib/x86_64-linux-gnu/xtables/*.so "$ROOTFS/usr/lib/x86_64-linux-gnu/xtables/" 2>/dev/null || true
+    fi
+
+    echo " Installed iptables-nft (xtables-nft-multi) + shared libs"
+else
+    echo " WARN: xtables-nft-multi not found in builder (install iptables package)"
+fi
+
+# Kernel modules list (for init to load at boot)
 cp "$PROJECT_ROOT/build/config/modules.list" "$ROOTFS/usr/lib/kubesolo-os/modules.list"
 
 # --- 4. Sysctl config ---
@@ -139,6 +369,6 @@ fi
 echo ""
 echo "==> Injection complete. Rootfs contents:"
 echo " Total size: $(du -sh "$ROOTFS" | cut -f1)"
-echo " KubeSolo: $(du -h "$ROOTFS/usr/local/bin/kubesolo" | cut -f1)"
+echo " KubeSolo: $(du -h "$ROOTFS/usr/bin/kubesolo" | cut -f1)"
 echo " Init stages: $(ls "$ROOTFS/usr/lib/kubesolo-os/init.d/" | wc -l)"
 echo ""
diff --git a/init/init.sh b/init/init.sh
index 5712e74..0b87de6 100755
--- a/init/init.sh
+++ b/init/init.sh
@@ -16,6 +16,39 @@
 
 set -e
 
+# --- Switch root: escape initramfs so runc pivot_root works ---
+# The kernel boots into an initramfs (rootfs), which is a special mount that
+# doesn't support pivot_root. Container runtimes (runc) need pivot_root to
+# set up container root filesystems. To fix this, we copy the rootfs to a
+# tmpfs and switch_root to it. The sentinel file prevents infinite loops.
+if [ ! -f /etc/.switched_root ]; then
+    mount -t proc proc /proc 2>/dev/null || true
+    mount -t sysfs sysfs /sys 2>/dev/null || true
+    mount -t devtmpfs devtmpfs /dev 2>/dev/null || true
+    mkdir -p /mnt/newroot
+    mount -t tmpfs -o size=400M,mode=755 tmpfs /mnt/newroot
+    echo "[init] Copying rootfs to tmpfs..." >&2
+    # Copy each top-level directory explicitly (BusyBox cp -ax on rootfs is broken)
+    for d in bin sbin usr lib lib64 etc var opt; do
+        [ -d "/$d" ] && cp -a "/$d" /mnt/newroot/ 2>/dev/null || true
+    done
+    # Recreate mount point and special directories
+    mkdir -p /mnt/newroot/proc /mnt/newroot/sys /mnt/newroot/dev
+    mkdir -p /mnt/newroot/run /mnt/newroot/tmp /mnt/newroot/mnt
+    touch /mnt/newroot/etc/.switched_root
+    mount --move /proc /mnt/newroot/proc
+    mount --move /sys /mnt/newroot/sys
+    mount --move /dev /mnt/newroot/dev
+    echo "[init] Switching root..." >&2
+    exec switch_root /mnt/newroot /sbin/init
+fi
+
+# --- PATH setup ---
+# Search /usr/local first, then system dirs (iptables-nft is in /usr/sbin, KubeSolo in /usr/bin)
+export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+# Also search /usr/local/lib for shared libs (NOTE(review): iptables-nft libs are installed under /lib/x86_64-linux-gnu)
+export LD_LIBRARY_PATH="/usr/local/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+
 # --- Constants ---
 INIT_LIB="/usr/lib/kubesolo-os"
 INIT_STAGES="/usr/lib/kubesolo-os/init.d"
diff --git a/init/lib/00-early-mount.sh b/init/lib/00-early-mount.sh
index 952c8c0..3cc97b8 100755
--- a/init/lib/00-early-mount.sh
+++ b/init/lib/00-early-mount.sh
@@ -1,23 +1,62 @@
 #!/bin/sh
 # 00-early-mount.sh — Mount essential virtual filesystems
 
-mount -t proc proc /proc 2>/dev/null || true
-mount -t sysfs sysfs /sys 2>/dev/null || true
-mount -t devtmpfs devtmpfs /dev 2>/dev/null || true
-mount -t tmpfs tmpfs /tmp
-mount -t tmpfs tmpfs /run
+# After switch_root, /proc /sys /dev are already mounted — only mount if missing
+if ! mountpoint -q /proc 2>/dev/null; then
+    mount -t proc proc /proc 2>/dev/null || true
+fi
+if ! mountpoint -q /sys 2>/dev/null; then
+    mount -t sysfs sysfs /sys 2>/dev/null || true
+fi
+if ! mountpoint -q /dev 2>/dev/null; then
+    mount -t devtmpfs devtmpfs /dev 2>/dev/null || mount -t tmpfs tmpfs /dev
+fi
+if ! mountpoint -q /tmp 2>/dev/null; then
+    mount -t tmpfs tmpfs /tmp
+fi
+if ! mountpoint -q /run 2>/dev/null; then
+    mount -t tmpfs tmpfs /run
+fi
 
 mkdir -p /dev/pts /dev/shm
-mount -t devpts devpts /dev/pts
-mount -t tmpfs tmpfs /dev/shm
+if ! mountpoint -q /dev/pts 2>/dev/null; then
+    mount -t devpts devpts /dev/pts
+fi
+if ! mountpoint -q /dev/shm 2>/dev/null; then
+    mount -t tmpfs tmpfs /dev/shm
+fi
 
-# Mount cgroup2 unified hierarchy
+# Ensure essential device nodes exist (devtmpfs may be incomplete after switch_root)
+[ -e /dev/console ] || mknod -m 600 /dev/console c 5 1 2>/dev/null || true
+[ -e /dev/null ] || mknod -m 666 /dev/null c 1 3 2>/dev/null || true
+[ -e /dev/zero ] || mknod -m 666 /dev/zero c 1 5 2>/dev/null || true
+[ -e /dev/kmsg ] || mknod -m 660 /dev/kmsg c 1 11 2>/dev/null || true
+[ -e /dev/random ] || mknod -m 666 /dev/random c 1 8 2>/dev/null || true
+[ -e /dev/urandom ] || mknod -m 666 /dev/urandom c 1 9 2>/dev/null || true
+[ -e /dev/tty ] || mknod -m 666 /dev/tty c 5 0 2>/dev/null || true
+
+# Set up BusyBox mdev as hotplug handler (creates /dev nodes for new devices)
+echo /sbin/mdev > /proc/sys/kernel/hotplug 2>/dev/null || true
+mdev -s 2>/dev/null || true
+
+# Mount cgroup v2 unified hierarchy
 mkdir -p /sys/fs/cgroup
-mount -t cgroup2 cgroup2 /sys/fs/cgroup 2>/dev/null || {
-    log_warn "cgroup v2 mount failed; attempting v1 fallback"
-    mount -t tmpfs cgroup /sys/fs/cgroup
-    for subsys in cpu cpuacct memory devices freezer pids; do
-        mkdir -p "/sys/fs/cgroup/$subsys"
-        mount -t cgroup -o "$subsys" "cgroup_${subsys}" "/sys/fs/cgroup/$subsys" 2>/dev/null || true
+if ! mountpoint -q /sys/fs/cgroup 2>/dev/null; then
+    mount -t cgroup2 cgroup2 /sys/fs/cgroup 2>/dev/null || true
+fi
+
+# Enable ALL available controllers for child cgroups
+# Required: memory (memory.max), cpu (cpu.max), pids (pids.max)
+# First, move init process to its own cgroup so controllers can be enabled
+# (cgroup v2 "no internal process" rule for non-root cgroups)
+if [ -f /sys/fs/cgroup/cgroup.controllers ]; then
+    mkdir -p /sys/fs/cgroup/init
+    echo $$ > /sys/fs/cgroup/init/cgroup.procs 2>/dev/null || true
+    for ctrl in $(cat /sys/fs/cgroup/cgroup.controllers); do
+        echo "+${ctrl}" > /sys/fs/cgroup/cgroup.subtree_control 2>/dev/null || true
     done
-}
+fi
+
+# Mount BPF filesystem (required for cgroup v2 device control via BPF)
+mkdir -p /sys/fs/bpf
+mount -t bpf bpf /sys/fs/bpf 2>/dev/null || true
diff --git a/init/lib/20-persistent-mount.sh b/init/lib/20-persistent-mount.sh
index 30fc662..415c424 100755
--- a/init/lib/20-persistent-mount.sh
+++ b/init/lib/20-persistent-mount.sh
@@ -8,11 +8,28 @@ if [ "$KUBESOLO_NOPERSIST" = "1" ]; then
     return 0
 fi
 
+# Load block device drivers before waiting (modules loaded later in stage 30,
+# but we need virtio_blk available NOW for /dev/vda detection)
+modprobe virtio_blk 2>/dev/null || true
+# Trigger mdev to create device nodes after loading driver
+mdev -s 2>/dev/null || true
+
+# Fallback: create device node from sysfs if devtmpfs/mdev didn't
+DEV_NAME="${KUBESOLO_DATA_DEV##*/}"
+if [ ! -b "$KUBESOLO_DATA_DEV" ] && [ -f "/sys/class/block/$DEV_NAME/dev" ]; then
+    MAJMIN=$(cat "/sys/class/block/$DEV_NAME/dev")
+    # Report the node as created only when mknod actually succeeded
+    if mknod "$KUBESOLO_DATA_DEV" b "${MAJMIN%%:*}" "${MAJMIN##*:}" 2>/dev/null; then
+        log "Created $KUBESOLO_DATA_DEV via mknod ($MAJMIN)"
+    fi
+fi
+
 # Wait for device to appear (USB, slow disks, virtio)
 log "Waiting for data device: $KUBESOLO_DATA_DEV"
 WAIT_SECS=30
 for i in $(seq 1 "$WAIT_SECS"); do
     [ -b "$KUBESOLO_DATA_DEV" ] && break
+    mdev -s 2>/dev/null || true
     sleep 1
 done
 
diff --git a/init/lib/50-network.sh b/init/lib/50-network.sh
index 918f940..fcdf2be 100755
--- a/init/lib/50-network.sh
+++ b/init/lib/50-network.sh
@@ -18,16 +18,16 @@ fi
 # Fallback: DHCP on first non-loopback interface
 log "Configuring network via DHCP"
 
-# Bring up loopback
-ip link set lo up
-ip addr add 127.0.0.1/8 dev lo
+# Bring up loopback (use ifconfig for BusyBox compatibility)
+ifconfig lo 127.0.0.1 netmask 255.0.0.0 up 2>/dev/null || \
+    { ip link set lo up 2>/dev/null && ip addr add 127.0.0.1/8 dev lo 2>/dev/null; } || true
 
 # Find first ethernet interface
 ETH_DEV=""
 for iface in /sys/class/net/*; do
     iface="$(basename "$iface")"
     case "$iface" in
-        lo|docker*|veth*|br*|cni*) continue ;;
+        lo|docker*|veth*|br*|cni*|dummy*|tunl*|sit*) continue ;;
     esac
     ETH_DEV="$iface"
     break
@@ -39,7 +39,7 @@ if [ -z "$ETH_DEV" ]; then
 fi
 
 log "Using interface: $ETH_DEV"
-ip link set "$ETH_DEV" up
+ifconfig "$ETH_DEV" up 2>/dev/null || ip link set "$ETH_DEV" up 2>/dev/null || true
 
 # Run DHCP client (BusyBox udhcpc)
 if command -v udhcpc >/dev/null 2>&1; then
diff --git a/init/lib/60-hostname.sh b/init/lib/60-hostname.sh
index 9331813..cbc9f5d 100755
--- a/init/lib/60-hostname.sh
+++ b/init/lib/60-hostname.sh
@@ -31,4 +31,25 @@ hostname "$HOSTNAME"
 echo "$HOSTNAME" > /etc/hostname
 echo "127.0.0.1 $HOSTNAME" >> /etc/hosts
 
+# Generate /etc/machine-id if missing or empty (kubelet requires it)
+if [ ! -s /etc/machine-id ]; then
+    if [ -s "$DATA_MOUNT/etc-kubesolo/machine-id" ]; then
+        cp "$DATA_MOUNT/etc-kubesolo/machine-id" /etc/machine-id
+    else
+        # Generate from hostname hash (deterministic across reboots).
+        # Capture the value instead of chaining with ||: a pipeline's exit
+        # status is that of its last command (cut), so a missing md5sum would
+        # otherwise go unnoticed and leave an empty machine-id.
+        MACHINE_ID="$(printf '%s' "$HOSTNAME" | md5sum 2>/dev/null | cut -d' ' -f1)"
+        if [ -z "$MACHINE_ID" ]; then
+            MACHINE_ID="$(cat /proc/sys/kernel/random/uuid 2>/dev/null | tr -d '-')"
+        fi
+        if [ -n "$MACHINE_ID" ]; then
+            echo "$MACHINE_ID" > /etc/machine-id
+            # Persist for next boot
+            cp /etc/machine-id "$DATA_MOUNT/etc-kubesolo/machine-id" 2>/dev/null || true
+        fi
+    fi
+fi
+
 log_ok "Hostname set to: $HOSTNAME"
diff --git a/init/lib/90-kubesolo.sh b/init/lib/90-kubesolo.sh
index 9cd496a..840d5d3 100755
--- a/init/lib/90-kubesolo.sh
+++ b/init/lib/90-kubesolo.sh
@@ -4,7 +4,7 @@
 # This stage exec's KubeSolo as PID 1 (replacing init).
 # KubeSolo manages containerd, kubelet, API server, and all K8s components.
 
-KUBESOLO_BIN="/usr/local/bin/kubesolo"
+KUBESOLO_BIN="/usr/bin/kubesolo"
 
 if [ ! -x "$KUBESOLO_BIN" ]; then
     log_err "KubeSolo binary not found at $KUBESOLO_BIN"
@@ -12,7 +12,7 @@ if [ ! -x "$KUBESOLO_BIN" ]; then
 fi
 
 # Build KubeSolo command line
-KUBESOLO_ARGS="--path /var/lib/kubesolo --local-storage true"
+KUBESOLO_ARGS="--path /var/lib/kubesolo --local-storage"
 
 # Add extra SANs if hostname resolves
 HOSTNAME="$(hostname)"
@@ -30,6 +30,17 @@ if [ -f /etc/kubesolo/extra-flags ]; then
     KUBESOLO_ARGS="$KUBESOLO_ARGS $(cat /etc/kubesolo/extra-flags)"
 fi
 
+# Pre-initialize iptables filter table and base chains.
+# KubeSolo's kube-proxy uses iptables-restore (nf_tables backend) which needs
+# the filter table to exist. Without this, the first iptables-restore fails
+# with "RULE_APPEND failed (No such file or directory)".
+if command -v iptables >/dev/null 2>&1; then
+    iptables -t filter -L -n >/dev/null 2>&1 || true
+    iptables -t nat -L -n >/dev/null 2>&1 || true
+    iptables -t mangle -L -n >/dev/null 2>&1 || true
+    log "Pre-initialized iptables tables (filter, nat, mangle)"
+fi
+
 log "Starting KubeSolo: $KUBESOLO_BIN $KUBESOLO_ARGS"
 log "Kubeconfig will be at: /var/lib/kubesolo/pki/admin/admin.kubeconfig"
 
diff --git a/init/lib/functions.sh b/init/lib/functions.sh
index 1e0f8ea..0796cc9 100755
--- a/init/lib/functions.sh
+++ b/init/lib/functions.sh
@@ -29,11 +29,16 @@ wait_for_file() {
     return 1
 }
 
-# Get IP address of an interface (POSIX-safe, no grep -P)
+# Get IP address of an interface (BusyBox-safe: prefer ifconfig, fall back to ip)
 get_iface_ip() {
     iface="$1"
-    ip -4 addr show "$iface" 2>/dev/null | \
-        sed -n 's/.*inet \([0-9.]*\).*/\1/p' | head -1
+    if command -v ifconfig >/dev/null 2>&1; then
+        ifconfig "$iface" 2>/dev/null | \
+            sed -n 's/.*inet addr:\([0-9.]*\).*/\1/p;s/.*inet \([0-9.]*\).*/\1/p' | head -1
+    elif command -v ip >/dev/null 2>&1; then
+        ip -4 addr show "$iface" 2>/dev/null | \
+            sed -n 's/.*inet \([0-9.]*\).*/\1/p' | head -1
+    fi
 }
 
 # Check if running in a VM (useful for adjusting timeouts)