From 8d25e1890ebc5785240f460db2c09a1e18736e6c Mon Sep 17 00:00:00 2001 From: Adolfo Delorenzo Date: Wed, 11 Feb 2026 11:12:46 -0600 Subject: [PATCH] feat: add A/B partition updates with GRUB and Go update agent (Phase 3) Implement atomic OS updates via A/B partition scheme with automatic rollback. GRUB bootloader manages slot selection with a 3-attempt boot counter that auto-rolls back on repeated health check failures. GRUB boot config: - A/B slot selection with boot_counter/boot_success env vars - Automatic rollback when counter reaches 0 (3 failed boots) - Debug, emergency shell, and manual slot-switch menu entries Disk image (refactored): - 4-partition GPT layout: EFI + System A + System B + Data - GRUB EFI/BIOS installation with graceful fallbacks - Both system partitions populated during image creation Update agent (Go, zero external deps): - pkg/grubenv: read/write GRUB env vars (grub-editenv + manual fallback) - pkg/partition: find/mount/write system partitions by label - pkg/image: HTTP download with SHA256 verification - pkg/health: post-boot checks (containerd, API server, node Ready) - 6 CLI commands: check, apply, activate, rollback, healthcheck, status - 37 unit tests across all 4 packages Deployment: - K8s CronJob for automatic update checks (every 6 hours) - ConfigMap for update server URL - Health check Job for post-boot verification Build pipeline: - build-update-agent.sh compiles static Linux binary (~5.9 MB) - inject-kubesolo.sh includes update agent in initramfs - Makefile: build-update-agent, test-update-agent, test-update targets Co-Authored-By: Claude Opus 4.6 --- Makefile | 60 ++-- build/grub/grub-env-defaults | 11 + build/grub/grub.cfg | 95 ++++++ build/scripts/build-update-agent.sh | 29 ++ build/scripts/create-disk-image.sh | 190 +++++++---- build/scripts/inject-kubesolo.sh | 10 + docs/update-flow.md | 261 +++++++++++++++ update/cmd/activate.go | 40 +++ update/cmd/apply.go | 70 ++++ update/cmd/check.go | 65 ++++ update/cmd/healthcheck.go 
| 56 ++++ update/cmd/opts.go | 47 +++ update/cmd/rollback.go | 36 +++ update/cmd/status.go | 44 +++ update/deploy/update-cronjob.yaml | 150 +++++++++ update/go.mod | 3 + update/main.go | 79 +++++ update/pkg/grubenv/grubenv.go | 239 ++++++++++++++ update/pkg/grubenv/grubenv_test.go | 423 +++++++++++++++++++++++++ update/pkg/health/health.go | 198 ++++++++++++ update/pkg/health/health_test.go | 86 +++++ update/pkg/image/image.go | 180 +++++++++++ update/pkg/image/image_test.go | 241 ++++++++++++++ update/pkg/partition/partition.go | 139 ++++++++ update/pkg/partition/partition_test.go | 129 ++++++++ 25 files changed, 2807 insertions(+), 74 deletions(-) create mode 100644 build/grub/grub-env-defaults create mode 100644 build/grub/grub.cfg create mode 100755 build/scripts/build-update-agent.sh create mode 100644 docs/update-flow.md create mode 100644 update/cmd/activate.go create mode 100644 update/cmd/apply.go create mode 100644 update/cmd/check.go create mode 100644 update/cmd/healthcheck.go create mode 100644 update/cmd/opts.go create mode 100644 update/cmd/rollback.go create mode 100644 update/cmd/status.go create mode 100644 update/deploy/update-cronjob.yaml create mode 100644 update/go.mod create mode 100644 update/main.go create mode 100644 update/pkg/grubenv/grubenv.go create mode 100644 update/pkg/grubenv/grubenv_test.go create mode 100644 update/pkg/health/health.go create mode 100644 update/pkg/health/health_test.go create mode 100644 update/pkg/image/image.go create mode 100644 update/pkg/image/image_test.go create mode 100644 update/pkg/partition/partition.go create mode 100644 update/pkg/partition/partition_test.go diff --git a/Makefile b/Makefile index 57013b3..08033e7 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ -.PHONY: all fetch build-cloudinit rootfs initramfs iso disk-image \ +.PHONY: all fetch build-cloudinit build-update-agent rootfs initramfs iso disk-image \ test-boot test-k8s test-persistence test-deploy test-storage test-all \ - 
test-cloudinit \ + test-cloudinit test-update-agent \ dev-vm dev-vm-shell quick docker-build shellcheck \ kernel-audit clean distclean help @@ -32,7 +32,11 @@ build-cloudinit: @echo "==> Building cloud-init binary..." $(BUILD_DIR)/scripts/build-cloudinit.sh -rootfs: fetch build-cloudinit +build-update-agent: + @echo "==> Building update agent..." + $(BUILD_DIR)/scripts/build-update-agent.sh + +rootfs: fetch build-cloudinit build-update-agent @echo "==> Preparing rootfs..." $(BUILD_DIR)/scripts/extract-core.sh $(BUILD_DIR)/scripts/inject-kubesolo.sh @@ -88,6 +92,20 @@ test-cloudinit: @echo "==> Testing cloud-init parser..." cd cloud-init && go test ./... -v -count=1 +# Update agent Go tests +test-update-agent: + @echo "==> Testing update agent..." + cd update && go test ./... -v -count=1 + +# A/B update integration tests +test-update: disk-image + @echo "==> Testing A/B update cycle..." + test/qemu/test-update.sh $(OUTPUT_DIR)/$(OS_NAME)-$(VERSION).img + +test-rollback: disk-image + @echo "==> Testing rollback..." 
+ test/qemu/test-rollback.sh $(OUTPUT_DIR)/$(OS_NAME)-$(VERSION).img + # Full integration test suite (requires more time) test-integration: test-k8s test-deploy test-storage @@ -157,24 +175,28 @@ help: @echo "KubeSolo OS Build System (v$(VERSION))" @echo "" @echo "Build targets:" - @echo " make fetch Download Tiny Core ISO, KubeSolo, dependencies" - @echo " make build-cloudinit Build cloud-init Go binary" - @echo " make rootfs Extract + prepare rootfs with KubeSolo" - @echo " make initramfs Repack rootfs into kubesolo-os.gz" - @echo " make iso Create bootable ISO (default target)" - @echo " make disk-image Create raw disk image with boot + data partitions" - @echo " make quick Fast rebuild (re-inject + repack + ISO only)" - @echo " make docker-build Reproducible build inside Docker" + @echo " make fetch Download Tiny Core ISO, KubeSolo, dependencies" + @echo " make build-cloudinit Build cloud-init Go binary" + @echo " make build-update-agent Build update agent Go binary" + @echo " make rootfs Extract + prepare rootfs with KubeSolo" + @echo " make initramfs Repack rootfs into kubesolo-os.gz" + @echo " make iso Create bootable ISO (default target)" + @echo " make disk-image Create raw disk image with A/B partitions + GRUB" + @echo " make quick Fast rebuild (re-inject + repack + ISO only)" + @echo " make docker-build Reproducible build inside Docker" @echo "" @echo "Test targets:" - @echo " make test-boot Boot ISO in QEMU, verify boot success" - @echo " make test-k8s Boot + verify K8s node reaches Ready" - @echo " make test-persist Reboot disk image, verify state persists" - @echo " make test-deploy Deploy nginx pod, verify Running" - @echo " make test-storage Test PVC with local-path provisioner" - @echo " make test-cloudinit Run cloud-init Go unit tests" - @echo " make test-all Run core tests (boot + k8s + persistence)" - @echo " make test-integ Run full integration suite" + @echo " make test-boot Boot ISO in QEMU, verify boot success" + @echo " make test-k8s Boot + 
verify K8s node reaches Ready" + @echo " make test-persist Reboot disk image, verify state persists" + @echo " make test-deploy Deploy nginx pod, verify Running" + @echo " make test-storage Test PVC with local-path provisioner" + @echo " make test-cloudinit Run cloud-init Go unit tests" + @echo " make test-update-agent Run update agent Go unit tests" + @echo " make test-update A/B update cycle integration test" + @echo " make test-rollback Forced rollback integration test" + @echo " make test-all Run core tests (boot + k8s + persistence)" + @echo " make test-integ Run full integration suite" @echo "" @echo "Dev targets:" @echo " make dev-vm Launch interactive QEMU VM" diff --git a/build/grub/grub-env-defaults b/build/grub/grub-env-defaults new file mode 100644 index 0000000..a139ab9 --- /dev/null +++ b/build/grub/grub-env-defaults @@ -0,0 +1,11 @@ +# KubeSolo OS — Default GRUB Environment Variables +# These are written to grubenv on first install. +# Format: key=value (one per line, grub-editenv compatible) +# +# active_slot: Which system partition to boot (A or B) +# boot_counter: Attempts remaining before rollback (3 = fresh, 0 = rollback) +# boot_success: Set to 1 by health check after successful boot + +active_slot=A +boot_counter=3 +boot_success=1 diff --git a/build/grub/grub.cfg b/build/grub/grub.cfg new file mode 100644 index 0000000..272ea38 --- /dev/null +++ b/build/grub/grub.cfg @@ -0,0 +1,95 @@ +# KubeSolo OS — GRUB Configuration +# A/B partition boot with automatic rollback +# +# Partition layout: +# (hd0,gpt1) — EFI/Boot (256 MB, FAT32) — contains GRUB + grubenv +# (hd0,gpt2) — System A (512 MB, ext4) — vmlinuz + kubesolo-os.gz +# (hd0,gpt3) — System B (512 MB, ext4) — vmlinuz + kubesolo-os.gz +# (hd0,gpt4) — Data (remaining, ext4) — persistent K8s state +# +# Environment variables (in grubenv): +# active_slot — "A" or "B" (which partition to boot) +# boot_counter — 3→2→1→0 (decremented on each failed boot) +# boot_success — 0 or 1 (set to 1 by health 
set default=0
set timeout=3

# Load the saved environment: active_slot, boot_counter, boot_success.
load_env

# --- A/B Rollback Logic ---
# On every boot, check whether the previous boot was marked successful.
# If it was not, consume one attempt from boot_counter. Once the counter has
# reached 0, swap the active slot and start again with a fresh counter.
#
# Nothing is written to grubenv in this section; all three variables are
# persisted by the single save_env call below. A single write means an
# interruption can never leave the slot swapped while the counter is still 0
# (which would make the next boot swap straight back to the failing slot).

if [ "${boot_success}" != "1" ]; then
    if [ "${boot_counter}" = "0" ]; then
        # Counter exhausted — roll back to the other slot with a fresh counter.
        if [ "${active_slot}" = "A" ]; then
            set active_slot=B
        else
            set active_slot=A
        fi
        set boot_counter=3
    # GRUB script has no arithmetic, so the decrement is spelled out.
    elif [ "${boot_counter}" = "3" ]; then
        set boot_counter=2
    elif [ "${boot_counter}" = "2" ]; then
        set boot_counter=1
    elif [ "${boot_counter}" = "1" ]; then
        set boot_counter=0
    else
        # Unset or unexpected value (missing/corrupt grubenv): treat it as a
        # fresh counter that has just used one attempt, so a failing slot
        # still reaches 0 and is rolled back instead of looping forever.
        set boot_counter=2
    fi
fi

# Reset boot_success for this boot attempt — the health check must set it to 1.
set boot_success=0

# Persist the whole state in one environment-block write.
save_env active_slot boot_counter boot_success

# --- Resolve boot partition ---
# Any value other than "A" boots System B.
if [ "${active_slot}" = "A" ]; then
    set root='(hd0,gpt2)'
    set slot_label="System A"
else
    set root='(hd0,gpt3)'
    set slot_label="System B"
fi
#!/bin/bash
# build-update-agent.sh — Compile the KubeSolo OS update agent
#
# Runs the update agent's Go tests, then builds a static linux/amd64 binary.
# Output: build/cache/kubesolo-update
set -euo pipefail

# BASH_SOURCE (rather than $0) keeps path resolution consistent with
# create-disk-image.sh.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
UPDATE_DIR="$PROJECT_ROOT/update"
CACHE_DIR="$PROJECT_ROOT/build/cache"
OUTPUT="$CACHE_DIR/kubesolo-update"

echo "=== Building KubeSolo Update Agent ==="

# Ensure output dir exists
mkdir -p "$CACHE_DIR"

# Run tests first; with `set -e` a failing test aborts before anything is built.
echo "--- Running tests ---"
(cd "$UPDATE_DIR" && go test ./... -count=1)

# Build static binary: CGO disabled, symbol table and DWARF stripped (-s -w).
echo "--- Compiling static binary ---"
(cd "$UPDATE_DIR" && CGO_ENABLED=0 GOOS=linux GOARCH=amd64 \
    go build -ldflags='-s -w' -o "$OUTPUT" .)

# Report the size with du instead of parsing `ls` output (ShellCheck SC2012);
# this is the same idiom the other build scripts use.
SIZE=$(du -h "$OUTPUT" | cut -f1)
echo "--- Update agent built: $OUTPUT ($SIZE) ---"
+# create-disk-image.sh — Create a raw disk image with A/B system partitions +# +# Partition layout (GPT): +# Part 1: EFI/Boot (256 MB, FAT32) — GRUB + grubenv + A/B boot logic +# Part 2: System A (512 MB, ext4) — vmlinuz + kubesolo-os.gz (active) +# Part 3: System B (512 MB, ext4) — vmlinuz + kubesolo-os.gz (passive) +# Part 4: Data (remaining, ext4) — persistent K8s state set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -11,93 +16,165 @@ VERSION="$(cat "$PROJECT_ROOT/VERSION")" OS_NAME="kubesolo-os" IMG_OUTPUT="$OUTPUT_DIR/${OS_NAME}-${VERSION}.img" -IMG_SIZE_MB="${IMG_SIZE_MB:-2048}" # 2 GB default +IMG_SIZE_MB="${IMG_SIZE_MB:-4096}" # 4 GB default (larger for A/B) VMLINUZ="$ROOTFS_DIR/vmlinuz" INITRAMFS="$ROOTFS_DIR/kubesolo-os.gz" +GRUB_CFG="$PROJECT_ROOT/build/grub/grub.cfg" +GRUB_ENV_DEFAULTS="$PROJECT_ROOT/build/grub/grub-env-defaults" -for f in "$VMLINUZ" "$INITRAMFS"; do - [ -f "$f" ] || { echo "ERROR: Missing $f — run 'make initramfs'"; exit 1; } +for f in "$VMLINUZ" "$INITRAMFS" "$GRUB_CFG" "$GRUB_ENV_DEFAULTS"; do + [ -f "$f" ] || { echo "ERROR: Missing $f"; exit 1; } done -echo "==> Creating ${IMG_SIZE_MB}MB disk image..." +echo "==> Creating ${IMG_SIZE_MB}MB disk image with A/B partitions..." 
mkdir -p "$OUTPUT_DIR" # Create sparse image dd if=/dev/zero of="$IMG_OUTPUT" bs=1M count=0 seek="$IMG_SIZE_MB" 2>/dev/null -# Partition: 256MB boot (ext4) + rest data (ext4) -# Using sfdisk for scriptability +# Partition (GPT): +# Part 1: 256 MB EFI System Partition (FAT32) +# Part 2: 512 MB System A (Linux filesystem) +# Part 3: 512 MB System B (Linux filesystem) +# Part 4: Remaining — Data (Linux filesystem) sfdisk "$IMG_OUTPUT" << EOF -label: dos -unit: sectors +label: gpt -# Boot partition: 256 MB, bootable -start=2048, size=524288, type=83, bootable -# Data partition: remaining space -start=526336, type=83 +# EFI/Boot partition: 256 MB +start=2048, size=524288, type=C12A7328-F81F-11D2-BA4B-00A0C93EC93B, name="EFI" +# System A partition: 512 MB +size=1048576, type=0FC63DAF-8483-4772-8E79-3D69D8477DE4, name="SystemA" +# System B partition: 512 MB +size=1048576, type=0FC63DAF-8483-4772-8E79-3D69D8477DE4, name="SystemB" +# Data partition: remaining +type=0FC63DAF-8483-4772-8E79-3D69D8477DE4, name="Data" EOF # Set up loop device LOOP=$(losetup --show -fP "$IMG_OUTPUT") echo "==> Loop device: $LOOP" +MNT_EFI=$(mktemp -d) +MNT_SYSA=$(mktemp -d) +MNT_SYSB=$(mktemp -d) +MNT_DATA=$(mktemp -d) + cleanup() { - umount "${LOOP}p1" 2>/dev/null || true - umount "${LOOP}p2" 2>/dev/null || true + umount "$MNT_EFI" 2>/dev/null || true + umount "$MNT_SYSA" 2>/dev/null || true + umount "$MNT_SYSB" 2>/dev/null || true + umount "$MNT_DATA" 2>/dev/null || true losetup -d "$LOOP" 2>/dev/null || true - rm -rf "$MNT_BOOT" "$MNT_DATA" 2>/dev/null || true + rm -rf "$MNT_EFI" "$MNT_SYSA" "$MNT_SYSB" "$MNT_DATA" 2>/dev/null || true } trap cleanup EXIT # Format partitions -mkfs.ext4 -q -L KSOLOBOOT "${LOOP}p1" -mkfs.ext4 -q -L KSOLODATA "${LOOP}p2" +mkfs.vfat -F 32 -n KSOLOEFI "${LOOP}p1" +mkfs.ext4 -q -L KSOLOA "${LOOP}p2" +mkfs.ext4 -q -L KSOLOB "${LOOP}p3" +mkfs.ext4 -q -L KSOLODATA "${LOOP}p4" -# Mount and populate boot partition -MNT_BOOT=$(mktemp -d) -MNT_DATA=$(mktemp -d) +# Mount 
# --- GRUB environment file (grubenv) ---
# Seeds $MNT_EFI/boot/grub/grubenv with the key=value pairs from
# $GRUB_ENV_DEFAULTS. grub-editenv (or grub2-editenv) is used when available;
# otherwise the 1024-byte environment block is written by hand.

# Print the key=value pairs from the defaults file, one per line, dropping
# comment and blank lines. The `|| [ -n "$key" ]` keeps a final line that has
# no trailing newline.
grubenv_defaults() {
    local key value
    while IFS='=' read -r key value || [ -n "$key" ]; do
        case "$key" in
            '#'*|'') continue ;;
        esac
        printf '%s=%s\n' "$key" "$value"
    done < "$GRUB_ENV_DEFAULTS"
}

# Locate the editenv tool; distributions ship it under either name.
if command -v grub-editenv >/dev/null 2>&1; then
    GRUB_EDITENV=grub-editenv
elif command -v grub2-editenv >/dev/null 2>&1; then
    GRUB_EDITENV=grub2-editenv
else
    GRUB_EDITENV=""
fi

GRUBENV_FILE="$MNT_EFI/boot/grub/grubenv"

if [ -n "$GRUB_EDITENV" ]; then
    # Create an empty environment block, then set each default in turn.
    "$GRUB_EDITENV" "$GRUBENV_FILE" create
    while IFS= read -r entry; do
        "$GRUB_EDITENV" "$GRUBENV_FILE" set "$entry"
    done < <(grubenv_defaults)
    echo " GRUB environment created with grub-editenv"
else
    # Fallback: write grubenv file manually (1024 bytes, padded with '#')
    echo " WARN: grub-editenv not found — writing grubenv manually"
    {
        echo "# GRUB Environment Block"
        grubenv_defaults
    } > "$GRUBENV_FILE.tmp"

    # Pad to exactly 1024 bytes with '#'. truncate(1) would extend the file
    # with NUL bytes, which is not the filler an environment block uses.
    env_size=$(( $(wc -c < "$GRUBENV_FILE.tmp") ))
    if [ "$env_size" -gt 1024 ]; then
        echo "ERROR: GRUB environment defaults exceed 1024 bytes" >&2
        exit 1
    fi
    head -c "$(( 1024 - env_size ))" /dev/zero | tr '\0' '#' >> "$GRUBENV_FILE.tmp"
    mv "$GRUBENV_FILE.tmp" "$GRUBENV_FILE"
fi
Populating System A (active)..." +cp "$VMLINUZ" "$MNT_SYSA/vmlinuz" +cp "$INITRAMFS" "$MNT_SYSA/kubesolo-os.gz" +echo "$VERSION" > "$MNT_SYSA/version" + +# --- System B Partition (passive, initially same as A) --- +echo " Populating System B (passive)..." +cp "$VMLINUZ" "$MNT_SYSB/vmlinuz" +cp "$INITRAMFS" "$MNT_SYSB/kubesolo-os.gz" +echo "$VERSION" > "$MNT_SYSB/version" + +# --- Data Partition --- +echo " Preparing data partition..." +for dir in kubesolo containerd etc-kubesolo log usr-local network images; do mkdir -p "$MNT_DATA/$dir" done @@ -106,5 +183,8 @@ sync echo "" echo "==> Disk image created: $IMG_OUTPUT" echo " Size: $(du -h "$IMG_OUTPUT" | cut -f1)" -echo " Boot partition (KSOLOBOOT): kernel + initramfs" -echo " Data partition (KSOLODATA): persistent K8s state" +echo " Part 1 (KSOLOEFI): GRUB + A/B boot config" +echo " Part 2 (KSOLOA): System A — kernel + initramfs (active)" +echo " Part 3 (KSOLOB): System B — kernel + initramfs (passive)" +echo " Part 4 (KSOLODATA): Persistent K8s state" +echo "" diff --git a/build/scripts/inject-kubesolo.sh b/build/scripts/inject-kubesolo.sh index a52edac..f046778 100755 --- a/build/scripts/inject-kubesolo.sh +++ b/build/scripts/inject-kubesolo.sh @@ -73,6 +73,16 @@ else echo " WARN: Cloud-init binary not found (run 'make build-cloudinit' to build)" fi +# Update agent binary (Go, built separately) +UPDATE_BIN="$CACHE_DIR/kubesolo-update" +if [ -f "$UPDATE_BIN" ]; then + cp "$UPDATE_BIN" "$ROOTFS/usr/lib/kubesolo-os/kubesolo-update" + chmod +x "$ROOTFS/usr/lib/kubesolo-os/kubesolo-update" + echo " Installed update agent ($(du -h "$UPDATE_BIN" | cut -f1))" +else + echo " WARN: Update agent not found (run 'make build-update-agent' to build)" +fi + # --- 3. 
Kernel modules list --- cp "$PROJECT_ROOT/build/config/modules.list" "$ROOTFS/usr/lib/kubesolo-os/modules.list" diff --git a/docs/update-flow.md b/docs/update-flow.md new file mode 100644 index 0000000..b423f82 --- /dev/null +++ b/docs/update-flow.md @@ -0,0 +1,261 @@ +# KubeSolo OS — Atomic Update Flow + +This document describes the A/B partition update mechanism used by KubeSolo OS for safe, atomic OS updates with automatic rollback. + +## Partition Layout + +KubeSolo OS uses a 4-partition GPT layout: + +``` +Disk (minimum 4 GB): + Part 1: EFI/Boot (256 MB, FAT32, label: KSOLOEFI) — GRUB + boot config + Part 2: System A (512 MB, ext4, label: KSOLOA) — vmlinuz + kubesolo-os.gz + Part 3: System B (512 MB, ext4, label: KSOLOB) — vmlinuz + kubesolo-os.gz + Part 4: Data (remaining, ext4, label: KSOLODATA) — persistent K8s state +``` + +Only one system partition is active at a time. The other is the "passive" slot used for staging updates. + +## GRUB Environment Variables + +The A/B boot logic is controlled by three GRUB environment variables stored in `/boot/grub/grubenv`: + +| Variable | Values | Description | +|---|---|---| +| `active_slot` | `A` or `B` | Which system partition to boot | +| `boot_counter` | `3` → `0` | Attempts remaining before rollback | +| `boot_success` | `0` or `1` | Whether the current boot has been verified healthy | + +## Boot Flow + +``` + ┌──────────────┐ + │ GRUB starts │ + └──────┬───────┘ + │ + ┌──────▼───────┐ + │ Load grubenv │ + └──────┬───────┘ + │ + ┌─────────▼─────────┐ + │ boot_success == 1? │ + └────┬──────────┬───┘ + yes│ │no + │ ┌─────▼──────────┐ + │ │ boot_counter=0? │ + │ └──┬──────────┬──┘ + │ no │ │ yes + │ │ ┌─────▼──────────┐ + │ │ │ SWAP active_slot│ + │ │ │ Reset counter=3 │ + │ │ └─────┬───────────┘ + │ │ │ + ┌────▼───────▼──────────▼────┐ + │ Set boot_success=0 │ + │ Decrement boot_counter │ + │ Boot active_slot partition │ + └────────────┬───────────────┘ + │ + ┌─────────▼─────────┐ + │ System boots... 
│ + └─────────┬─────────┘ + │ + ┌─────────▼─────────────┐ + │ Health check runs │ + │ (containerd, API, │ + │ node Ready) │ + └─────┬──────────┬──────┘ + pass│ │fail + ┌─────▼─────┐ │ + │ Mark boot │ │ boot_success stays 0 + │ success=1 │ │ counter decremented + │ counter=3 │ │ on next reboot + └───────────┘ └────────────────────── +``` + +### Rollback Behavior + +The boot counter starts at 3 and decrements on each boot where `boot_success` remains 0: + +1. **Boot 1**: counter 3 → 2 (health check fails → reboot) +2. **Boot 2**: counter 2 → 1 (health check fails → reboot) +3. **Boot 3**: counter 1 → 0 (health check fails → reboot) +4. **Boot 4**: counter = 0, GRUB swaps `active_slot` and resets counter to 3 + +This provides **3 chances** for the new version to pass health checks before automatic rollback to the previous version. + +## Update Agent Commands + +The `kubesolo-update` binary provides 6 subcommands: + +### `check` — Check for Updates + +Queries the update server and compares against the current running version. + +```bash +kubesolo-update check --server https://updates.example.com +``` + +Output: +``` +Current version: 1.0.0 (slot A) +Latest version: 1.1.0 +Status: update available +``` + +### `apply` — Download and Write Update + +Downloads the new OS image (vmlinuz + initramfs) from the update server, verifies SHA256 checksums, and writes to the passive partition. + +```bash +kubesolo-update apply --server https://updates.example.com +``` + +This does NOT activate the new partition or trigger a reboot. + +### `activate` — Set Next Boot Target + +Switches the GRUB boot target to the passive partition (the one with the new image) and sets `boot_counter=3`. + +```bash +kubesolo-update activate +``` + +After activation, reboot to boot into the new version: +```bash +reboot +``` + +### `rollback` — Force Rollback + +Manually switches to the other partition, regardless of health check status. 
+ +```bash +kubesolo-update rollback +reboot +``` + +### `healthcheck` — Post-Boot Health Verification + +Runs after every boot to verify the system is healthy. If all checks pass, marks `boot_success=1` in GRUB to prevent rollback. + +Checks performed: +1. **containerd**: Socket exists and `ctr version` responds +2. **API server**: TCP connection to 127.0.0.1:6443 and `/healthz` endpoint +3. **Node Ready**: `kubectl get nodes` shows Ready status + +```bash +kubesolo-update healthcheck --timeout 120 +``` + +### `status` — Show A/B Slot Status + +Displays the current partition state: + +```bash +kubesolo-update status +``` + +Output: +``` +KubeSolo OS — A/B Partition Status +─────────────────────────────────── + Active slot: A + Passive slot: B + Boot counter: 3 + Boot success: 1 + + ✓ System is healthy (boot confirmed) +``` + +## Update Server Protocol + +The update server is a simple HTTP(S) file server that serves: + +``` +/latest.json — Update metadata +/vmlinuz-<version> — Linux kernel +/kubesolo-os-<version>.gz — Initramfs +``` + +### `latest.json` Format + +```json +{ + "version": "1.1.0", + "vmlinuz_url": "https://updates.example.com/vmlinuz-1.1.0", + "vmlinuz_sha256": "abc123...", + "initramfs_url": "https://updates.example.com/kubesolo-os-1.1.0.gz", + "initramfs_sha256": "def456...", + "release_notes": "Bug fixes and performance improvements", + "release_date": "2025-01-15" +} + +``` + +Any static file server (nginx, S3, GitHub Releases) can serve as an update server. 
+ +## Automated Updates via CronJob + +KubeSolo OS includes a Kubernetes CronJob for automatic update checking: + +```bash +# Deploy the update CronJob +kubectl apply -f /usr/lib/kubesolo-os/update-cronjob.yaml + +# Configure the update server URL +kubectl -n kube-system create configmap kubesolo-update-config \ + --from-literal=server-url=https://updates.example.com + +# Manually trigger an update check +kubectl create job --from=cronjob/kubesolo-update kubesolo-update-manual -n kube-system +``` + +The CronJob runs every 6 hours and performs `apply` (download + write). It does NOT reboot — the administrator controls when to reboot. + +## Complete Update Cycle + +A full update cycle looks like: + +```bash +# 1. Check if update is available +kubesolo-update check --server https://updates.example.com + +# 2. Download and write to passive partition +kubesolo-update apply --server https://updates.example.com + +# 3. Activate the new partition +kubesolo-update activate + +# 4. Reboot into the new version +reboot + +# 5. (Automatic) Health check runs, marks boot successful +# kubesolo-update healthcheck is run by init system + +# 6. Verify status +kubesolo-update status +``` + +If the health check fails 3 times, GRUB automatically rolls back to the previous version on the next reboot. 
+ +## Command-Line Options + +All subcommands accept these options: + +| Option | Default | Description | +|---|---|---| +| `--server URL` | (none) | Update server URL | +| `--grubenv PATH` | `/boot/grub/grubenv` | Path to GRUB environment file | +| `--timeout SECS` | `120` | Health check timeout in seconds | + +## File Locations + +| File | Description | +|---|---| +| `/usr/lib/kubesolo-os/kubesolo-update` | Update agent binary | +| `/boot/grub/grubenv` | GRUB environment (on EFI partition) | +| `/boot/grub/grub.cfg` | GRUB boot config with A/B logic | +| `/vmlinuz` | Linux kernel | +| `/kubesolo-os.gz` | Initramfs | +| `/version` | Version string | diff --git a/update/cmd/activate.go b/update/cmd/activate.go new file mode 100644 index 0000000..c673aeb --- /dev/null +++ b/update/cmd/activate.go @@ -0,0 +1,40 @@ +package cmd + +import ( + "fmt" + "log/slog" + + "github.com/portainer/kubesolo-os/update/pkg/grubenv" +) + +// Activate switches the boot target to the passive partition. +// After activation, the next reboot will boot from the new partition +// with boot_counter=3. If health checks fail 3 times, GRUB auto-rolls back. +func Activate(args []string) error { + opts := parseOpts(args) + env := grubenv.New(opts.GrubenvPath) + + // Get passive slot (the one we want to boot into) + passiveSlot, err := env.PassiveSlot() + if err != nil { + return fmt.Errorf("reading passive slot: %w", err) + } + + activeSlot, err := env.ActiveSlot() + if err != nil { + return fmt.Errorf("reading active slot: %w", err) + } + + slog.Info("activating slot", "from", activeSlot, "to", passiveSlot) + + // Set the passive slot as active with fresh boot counter + if err := env.ActivateSlot(passiveSlot); err != nil { + return fmt.Errorf("activating slot %s: %w", passiveSlot, err) + } + + fmt.Printf("Slot %s activated (was %s)\n", passiveSlot, activeSlot) + fmt.Println("Boot counter set to 3. 
Reboot to start the new version.") + fmt.Println("The system will automatically roll back if health checks fail 3 times.") + + return nil +} diff --git a/update/cmd/apply.go b/update/cmd/apply.go new file mode 100644 index 0000000..3f5c5ee --- /dev/null +++ b/update/cmd/apply.go @@ -0,0 +1,70 @@ +package cmd + +import ( + "fmt" + "log/slog" + + "github.com/portainer/kubesolo-os/update/pkg/grubenv" + "github.com/portainer/kubesolo-os/update/pkg/image" + "github.com/portainer/kubesolo-os/update/pkg/partition" +) + +// Apply downloads a new OS image and writes it to the passive partition. +// It does NOT activate the new partition — use 'activate' for that. +func Apply(args []string) error { + opts := parseOpts(args) + + if opts.ServerURL == "" { + return fmt.Errorf("--server is required") + } + + env := grubenv.New(opts.GrubenvPath) + + // Determine passive slot + passiveSlot, err := env.PassiveSlot() + if err != nil { + return fmt.Errorf("reading passive slot: %w", err) + } + + slog.Info("applying update", "target_slot", passiveSlot) + + // Check for update + stageDir := "/tmp/kubesolo-update-stage" + client := image.NewClient(opts.ServerURL, stageDir) + defer client.Cleanup() + + meta, err := client.CheckForUpdate() + if err != nil { + return fmt.Errorf("checking for update: %w", err) + } + + slog.Info("update available", "version", meta.Version) + + // Download and verify + staged, err := client.Download(meta) + if err != nil { + return fmt.Errorf("downloading update: %w", err) + } + + // Mount passive partition + partInfo, err := partition.GetSlotPartition(passiveSlot) + if err != nil { + return fmt.Errorf("finding passive partition: %w", err) + } + + mountPoint := "/tmp/kubesolo-passive-" + passiveSlot + if err := partition.MountReadWrite(partInfo.Device, mountPoint); err != nil { + return fmt.Errorf("mounting passive partition: %w", err) + } + defer partition.Unmount(mountPoint) + + // Write image to passive partition + if err := 
partition.WriteSystemImage(mountPoint, staged.VmlinuzPath, staged.InitramfsPath, staged.Version); err != nil { + return fmt.Errorf("writing system image: %w", err) + } + + fmt.Printf("Update v%s written to slot %s (%s)\n", staged.Version, passiveSlot, partInfo.Device) + fmt.Println("Run 'kubesolo-update activate' to boot into the new version") + + return nil +} diff --git a/update/cmd/check.go b/update/cmd/check.go new file mode 100644 index 0000000..bff5080 --- /dev/null +++ b/update/cmd/check.go @@ -0,0 +1,65 @@ +package cmd + +import ( + "fmt" + "log/slog" + + "github.com/portainer/kubesolo-os/update/pkg/grubenv" + "github.com/portainer/kubesolo-os/update/pkg/image" + "github.com/portainer/kubesolo-os/update/pkg/partition" +) + +// Check queries the update server for available updates and compares +// against the currently running version. +func Check(args []string) error { + opts := parseOpts(args) + + if opts.ServerURL == "" { + return fmt.Errorf("--server is required (no default update server configured)") + } + + // Get current version from active partition + env := grubenv.New(opts.GrubenvPath) + activeSlot, err := env.ActiveSlot() + if err != nil { + return fmt.Errorf("reading active slot: %w", err) + } + + partInfo, err := partition.GetSlotPartition(activeSlot) + if err != nil { + return fmt.Errorf("finding active partition: %w", err) + } + + mountPoint := "/tmp/kubesolo-check-" + activeSlot + if err := partition.MountReadOnly(partInfo.Device, mountPoint); err != nil { + return fmt.Errorf("mounting active partition: %w", err) + } + defer partition.Unmount(mountPoint) + + currentVersion, err := partition.ReadVersion(mountPoint) + if err != nil { + slog.Warn("could not read current version", "error", err) + currentVersion = "unknown" + } + + // Check update server + client := image.NewClient(opts.ServerURL, "") + meta, err := client.CheckForUpdate() + if err != nil { + return fmt.Errorf("checking for update: %w", err) + } + + fmt.Printf("Current version: %s 
(slot %s)\n", currentVersion, activeSlot) + fmt.Printf("Latest version: %s\n", meta.Version) + + if meta.Version == currentVersion { + fmt.Println("Status: up to date") + } else { + fmt.Println("Status: update available") + if meta.ReleaseNotes != "" { + fmt.Printf("Release notes: %s\n", meta.ReleaseNotes) + } + } + + return nil +} diff --git a/update/cmd/healthcheck.go b/update/cmd/healthcheck.go new file mode 100644 index 0000000..3c90d92 --- /dev/null +++ b/update/cmd/healthcheck.go @@ -0,0 +1,56 @@ +package cmd + +import ( + "fmt" + "log/slog" + "time" + + "github.com/portainer/kubesolo-os/update/pkg/grubenv" + "github.com/portainer/kubesolo-os/update/pkg/health" +) + +// Healthcheck performs post-boot health verification. +// If all checks pass, it marks the boot as successful in GRUB. +// This should be run after every boot (typically via a systemd unit or +// init script) to confirm the system is healthy. +func Healthcheck(args []string) error { + opts := parseOpts(args) + env := grubenv.New(opts.GrubenvPath) + + // Check if already marked successful + success, err := env.BootSuccess() + if err != nil { + slog.Warn("could not read boot_success", "error", err) + } + if success { + fmt.Println("Boot already marked successful") + return nil + } + + timeout := time.Duration(opts.TimeoutSecs) * time.Second + checker := health.NewChecker("", "", timeout) + + slog.Info("running post-boot health checks", "timeout", timeout) + + status, err := checker.WaitForHealthy() + if err != nil { + fmt.Printf("Health check FAILED: %s\n", status.Message) + fmt.Printf(" containerd: %v\n", status.Containerd) + fmt.Printf(" apiserver: %v\n", status.APIServer) + fmt.Printf(" node_ready: %v\n", status.NodeReady) + fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot") + return err + } + + // Mark boot as successful + if err := env.MarkBootSuccess(); err != nil { + return fmt.Errorf("marking boot success: %w", err) + } + + fmt.Println("Health check PASSED — 
boot marked successful") + fmt.Printf(" containerd: %v\n", status.Containerd) + fmt.Printf(" apiserver: %v\n", status.APIServer) + fmt.Printf(" node_ready: %v\n", status.NodeReady) + + return nil +} diff --git a/update/cmd/opts.go b/update/cmd/opts.go new file mode 100644 index 0000000..1f42bd6 --- /dev/null +++ b/update/cmd/opts.go @@ -0,0 +1,47 @@ +package cmd + +// opts holds shared command-line options for all subcommands. +type opts struct { + ServerURL string + GrubenvPath string + TimeoutSecs int +} + +// parseOpts extracts command-line flags from args. +// Simple parser — no external dependencies. +func parseOpts(args []string) opts { + o := opts{ + GrubenvPath: "/boot/grub/grubenv", + TimeoutSecs: 120, + } + + for i := 0; i < len(args); i++ { + switch args[i] { + case "--server": + if i+1 < len(args) { + o.ServerURL = args[i+1] + i++ + } + case "--grubenv": + if i+1 < len(args) { + o.GrubenvPath = args[i+1] + i++ + } + case "--timeout": + if i+1 < len(args) { + val := 0 + for _, c := range args[i+1] { + if c >= '0' && c <= '9' { + val = val*10 + int(c-'0') + } + } + if val > 0 { + o.TimeoutSecs = val + } + i++ + } + } + } + + return o +} diff --git a/update/cmd/rollback.go b/update/cmd/rollback.go new file mode 100644 index 0000000..4fb0c6f --- /dev/null +++ b/update/cmd/rollback.go @@ -0,0 +1,36 @@ +package cmd + +import ( + "fmt" + "log/slog" + + "github.com/portainer/kubesolo-os/update/pkg/grubenv" +) + +// Rollback forces an immediate switch to the other partition. +// Use this to manually revert to the previous version. 
+func Rollback(args []string) error { + opts := parseOpts(args) + env := grubenv.New(opts.GrubenvPath) + + activeSlot, err := env.ActiveSlot() + if err != nil { + return fmt.Errorf("reading active slot: %w", err) + } + + passiveSlot, err := env.PassiveSlot() + if err != nil { + return fmt.Errorf("reading passive slot: %w", err) + } + + slog.Info("forcing rollback", "from", activeSlot, "to", passiveSlot) + + if err := env.ForceRollback(); err != nil { + return fmt.Errorf("rollback failed: %w", err) + } + + fmt.Printf("Rolled back: %s → %s\n", activeSlot, passiveSlot) + fmt.Println("Reboot to complete rollback.") + + return nil +} diff --git a/update/cmd/status.go b/update/cmd/status.go new file mode 100644 index 0000000..7e39727 --- /dev/null +++ b/update/cmd/status.go @@ -0,0 +1,44 @@ +package cmd + +import ( + "fmt" + + "github.com/portainer/kubesolo-os/update/pkg/grubenv" +) + +// Status displays the current A/B slot configuration and boot state. +func Status(args []string) error { + opts := parseOpts(args) + env := grubenv.New(opts.GrubenvPath) + + vars, err := env.ReadAll() + if err != nil { + return fmt.Errorf("reading GRUB environment: %w", err) + } + + activeSlot := vars["active_slot"] + bootCounter := vars["boot_counter"] + bootSuccess := vars["boot_success"] + + passiveSlot := "B" + if activeSlot == "B" { + passiveSlot = "A" + } + + fmt.Println("KubeSolo OS — A/B Partition Status") + fmt.Println("───────────────────────────────────") + fmt.Printf(" Active slot: %s\n", activeSlot) + fmt.Printf(" Passive slot: %s\n", passiveSlot) + fmt.Printf(" Boot counter: %s\n", bootCounter) + fmt.Printf(" Boot success: %s\n", bootSuccess) + + if bootSuccess == "1" { + fmt.Println("\n ✓ System is healthy (boot confirmed)") + } else if bootCounter == "0" { + fmt.Println("\n ✗ Boot counter exhausted — rollback will occur on next reboot") + } else { + fmt.Printf("\n ⚠ Boot pending verification (%s attempts remaining)\n", bootCounter) + } + + return nil +} diff --git 
a/update/deploy/update-cronjob.yaml b/update/deploy/update-cronjob.yaml new file mode 100644 index 0000000..33c83f4 --- /dev/null +++ b/update/deploy/update-cronjob.yaml @@ -0,0 +1,150 @@ +# KubeSolo OS — Automatic Update CronJob +# +# This CronJob checks for OS updates every 6 hours, downloads them, +# and writes them to the passive partition. It does NOT reboot — +# the administrator must trigger a reboot to apply the update. +# +# The update agent runs as a privileged container with host access +# because it needs to: +# 1. Read/write GRUB environment (on boot partition) +# 2. Mount and write to system partitions +# 3. Access block devices via blkid +# +# Deploy: kubectl apply -f update-cronjob.yaml +# Manual trigger: kubectl create job --from=cronjob/kubesolo-update kubesolo-update-manual +# +apiVersion: batch/v1 +kind: CronJob +metadata: + name: kubesolo-update + namespace: kube-system + labels: + app.kubernetes.io/name: kubesolo-update + app.kubernetes.io/component: update-agent + app.kubernetes.io/part-of: kubesolo-os +spec: + schedule: "0 */6 * * *" # Every 6 hours + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 5 + jobTemplate: + spec: + backoffLimit: 1 + activeDeadlineSeconds: 600 # 10 min max + template: + metadata: + labels: + app.kubernetes.io/name: kubesolo-update + spec: + restartPolicy: Never + hostPID: false + hostNetwork: false + containers: + - name: update + image: busybox:latest # Only used for the shell; the binary is host-mounted + command: + - /host/usr/lib/kubesolo-os/kubesolo-update + args: + - apply + - --server + - "$(UPDATE_SERVER_URL)" + env: + - name: UPDATE_SERVER_URL + valueFrom: + configMapKeyRef: + name: kubesolo-update-config + key: server-url + optional: true + securityContext: + privileged: true # Required for mount/blkid access + volumeMounts: + - name: host-root + mountPath: /host + readOnly: false + - name: dev + mountPath: /dev + - name: boot + mountPath: /boot + volumes: + - name: 
host-root + hostPath: + path: / + type: Directory + - name: dev + hostPath: + path: /dev + type: Directory + - name: boot + hostPath: + path: /boot + type: Directory + tolerations: + - operator: Exists # Run on any node (there's only one) +--- +# ConfigMap for update server URL. +# Create/update this to point to your update server: +# kubectl -n kube-system create configmap kubesolo-update-config \ +# --from-literal=server-url=https://updates.example.com +apiVersion: v1 +kind: ConfigMap +metadata: + name: kubesolo-update-config + namespace: kube-system + labels: + app.kubernetes.io/name: kubesolo-update + app.kubernetes.io/component: update-agent +data: + server-url: "" # Set to your update server URL +--- +# Post-boot health check — runs once at boot as a Job. +# On KubeSolo OS, this is triggered by the init system (init stage or +# systemd-equivalent), but it can also be deployed as a K8s Job for +# environments where the init system doesn't run the health check. +apiVersion: batch/v1 +kind: Job +metadata: + name: kubesolo-healthcheck + namespace: kube-system + labels: + app.kubernetes.io/name: kubesolo-healthcheck + app.kubernetes.io/component: health-check + app.kubernetes.io/part-of: kubesolo-os +spec: + backoffLimit: 3 + activeDeadlineSeconds: 300 # 5 min max + template: + metadata: + labels: + app.kubernetes.io/name: kubesolo-healthcheck + spec: + restartPolicy: Never + hostPID: false + hostNetwork: true # Needed to reach API server at 127.0.0.1:6443 + containers: + - name: healthcheck + image: busybox:latest + command: + - /host/usr/lib/kubesolo-os/kubesolo-update + args: + - healthcheck + - --timeout + - "120" + securityContext: + privileged: true # Required for grubenv write + volumeMounts: + - name: host-root + mountPath: /host + readOnly: false + - name: boot + mountPath: /boot + volumes: + - name: host-root + hostPath: + path: / + type: Directory + - name: boot + hostPath: + path: /boot + type: Directory + tolerations: + - operator: Exists diff --git 
a/update/go.mod b/update/go.mod new file mode 100644 index 0000000..29c4e47 --- /dev/null +++ b/update/go.mod @@ -0,0 +1,3 @@ +module github.com/portainer/kubesolo-os/update + +go 1.25.5 diff --git a/update/main.go b/update/main.go new file mode 100644 index 0000000..03e3c15 --- /dev/null +++ b/update/main.go @@ -0,0 +1,79 @@ +// kubesolo-update is the atomic update agent for KubeSolo OS. +// +// It manages A/B partition updates with automatic rollback: +// +// kubesolo-update check Check for available updates +// kubesolo-update apply Download + write update to passive partition +// kubesolo-update activate Set passive partition as next boot target +// kubesolo-update rollback Force rollback to other partition +// kubesolo-update healthcheck Post-boot health verification +// kubesolo-update status Show current A/B slot and boot status +package main + +import ( + "fmt" + "log/slog" + "os" + + "github.com/portainer/kubesolo-os/update/cmd" +) + +func main() { + slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ + Level: slog.LevelInfo, + }))) + + if len(os.Args) < 2 { + usage() + os.Exit(1) + } + + var err error + switch os.Args[1] { + case "check": + err = cmd.Check(os.Args[2:]) + case "apply": + err = cmd.Apply(os.Args[2:]) + case "activate": + err = cmd.Activate(os.Args[2:]) + case "rollback": + err = cmd.Rollback(os.Args[2:]) + case "healthcheck": + err = cmd.Healthcheck(os.Args[2:]) + case "status": + err = cmd.Status(os.Args[2:]) + default: + fmt.Fprintf(os.Stderr, "unknown command: %s\n\n", os.Args[1]) + usage() + os.Exit(1) + } + + if err != nil { + slog.Error("command failed", "command", os.Args[1], "error", err) + os.Exit(1) + } +} + +func usage() { + fmt.Fprintf(os.Stderr, `Usage: kubesolo-update [options] + +Commands: + check Check for available updates + apply Download and write update to passive partition + activate Set passive partition as next boot target + rollback Force rollback to other partition + healthcheck Post-boot 
health verification (marks boot successful) + status Show current A/B slot and boot status + +Options: + --server URL Update server URL (default: from /etc/kubesolo/update.conf) + --grubenv PATH Path to grubenv file (default: /boot/grub/grubenv) + --timeout SECS Health check timeout in seconds (default: 120) + +Examples: + kubesolo-update check --server https://updates.example.com + kubesolo-update apply --server https://updates.example.com + kubesolo-update healthcheck + kubesolo-update status +`) +} diff --git a/update/pkg/grubenv/grubenv.go b/update/pkg/grubenv/grubenv.go new file mode 100644 index 0000000..3ec92f0 --- /dev/null +++ b/update/pkg/grubenv/grubenv.go @@ -0,0 +1,239 @@ +// Package grubenv provides read/write access to GRUB environment variables. +// +// GRUB stores its environment in a 1024-byte file (grubenv) located at +// /boot/grub/grubenv on the EFI partition. This package manipulates +// those variables for A/B boot slot management. +// +// Key variables: +// - active_slot: "A" or "B" +// - boot_counter: "3" (fresh) down to "0" (triggers rollback) +// - boot_success: "0" (pending) or "1" (healthy boot confirmed) +package grubenv + +import ( + "fmt" + "log/slog" + "os" + "os/exec" + "strings" +) + +const ( + // DefaultGrubenvPath is the standard location for the GRUB environment file. + DefaultGrubenvPath = "/boot/grub/grubenv" + + // SlotA represents system partition A. + SlotA = "A" + // SlotB represents system partition B. + SlotB = "B" +) + +// Env provides access to GRUB environment variables. +type Env struct { + path string +} + +// New creates a new Env for the given grubenv file path. +func New(path string) *Env { + if path == "" { + path = DefaultGrubenvPath + } + return &Env{path: path} +} + +// Get reads a variable from the GRUB environment. 
+func (e *Env) Get(key string) (string, error) { + vars, err := e.ReadAll() + if err != nil { + return "", err + } + val, ok := vars[key] + if !ok { + return "", fmt.Errorf("grubenv: key %q not found", key) + } + return val, nil +} + +// Set writes a variable to the GRUB environment. +func (e *Env) Set(key, value string) error { + editenv, err := findEditenv() + if err != nil { + return e.setManual(key, value) + } + + cmd := exec.Command(editenv, e.path, "set", key+"="+value) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("grub-editenv set %s=%s: %w\n%s", key, value, err, output) + } + + slog.Debug("grubenv set", "key", key, "value", value) + return nil +} + +// ReadAll reads all variables from the GRUB environment. +func (e *Env) ReadAll() (map[string]string, error) { + editenv, err := findEditenv() + if err != nil { + return e.readManual() + } + + cmd := exec.Command(editenv, e.path, "list") + output, err := cmd.Output() + if err != nil { + return nil, fmt.Errorf("grub-editenv list: %w", err) + } + + return parseEnvOutput(string(output)), nil +} + +// ActiveSlot returns the currently active boot slot ("A" or "B"). +func (e *Env) ActiveSlot() (string, error) { + return e.Get("active_slot") +} + +// PassiveSlot returns the currently passive boot slot. +func (e *Env) PassiveSlot() (string, error) { + active, err := e.ActiveSlot() + if err != nil { + return "", err + } + if active == SlotA { + return SlotB, nil + } + return SlotA, nil +} + +// BootCounter returns the current boot counter value. +func (e *Env) BootCounter() (int, error) { + val, err := e.Get("boot_counter") + if err != nil { + return -1, err + } + switch val { + case "0": + return 0, nil + case "1": + return 1, nil + case "2": + return 2, nil + case "3": + return 3, nil + default: + return -1, fmt.Errorf("grubenv: invalid boot_counter: %q", val) + } +} + +// BootSuccess returns whether the last boot was marked successful. 
+func (e *Env) BootSuccess() (bool, error) { + val, err := e.Get("boot_success") + if err != nil { + return false, err + } + return val == "1", nil +} + +// MarkBootSuccess sets boot_success=1 and boot_counter=3. +// Called by the health check after a successful boot. +func (e *Env) MarkBootSuccess() error { + if err := e.Set("boot_success", "1"); err != nil { + return fmt.Errorf("setting boot_success: %w", err) + } + if err := e.Set("boot_counter", "3"); err != nil { + return fmt.Errorf("setting boot_counter: %w", err) + } + slog.Info("boot marked successful") + return nil +} + +// ActivateSlot switches the active slot and resets the boot counter. +// Used after writing a new image to the passive partition. +func (e *Env) ActivateSlot(slot string) error { + if slot != SlotA && slot != SlotB { + return fmt.Errorf("invalid slot: %q (must be A or B)", slot) + } + if err := e.Set("active_slot", slot); err != nil { + return err + } + if err := e.Set("boot_counter", "3"); err != nil { + return err + } + if err := e.Set("boot_success", "0"); err != nil { + return err + } + slog.Info("activated slot", "slot", slot) + return nil +} + +// ForceRollback switches to the other slot immediately. 
+func (e *Env) ForceRollback() error { + passive, err := e.PassiveSlot() + if err != nil { + return err + } + return e.ActivateSlot(passive) +} + +func findEditenv() (string, error) { + if path, err := exec.LookPath("grub-editenv"); err == nil { + return path, nil + } + if path, err := exec.LookPath("grub2-editenv"); err == nil { + return path, nil + } + return "", fmt.Errorf("grub-editenv not found") +} + +func parseEnvOutput(output string) map[string]string { + vars := make(map[string]string) + for _, line := range strings.Split(output, "\n") { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + parts := strings.SplitN(line, "=", 2) + if len(parts) == 2 { + vars[parts[0]] = parts[1] + } + } + return vars +} + +// setManual writes to grubenv without grub-editenv (fallback). +func (e *Env) setManual(key, value string) error { + vars, err := e.readManual() + if err != nil { + vars = make(map[string]string) + } + vars[key] = value + return e.writeManual(vars) +} + +// readManual reads grubenv without grub-editenv. +func (e *Env) readManual() (map[string]string, error) { + data, err := os.ReadFile(e.path) + if err != nil { + return nil, fmt.Errorf("reading grubenv: %w", err) + } + return parseEnvOutput(string(data)), nil +} + +// writeManual writes grubenv without grub-editenv. +// GRUB requires the file to be exactly 1024 bytes, padded with '#'. 
+func (e *Env) writeManual(vars map[string]string) error { + var sb strings.Builder + sb.WriteString("# GRUB Environment Block\n") + for k, v := range vars { + sb.WriteString(k + "=" + v + "\n") + } + + content := sb.String() + if len(content) > 1024 { + return fmt.Errorf("grubenv content exceeds 1024 bytes") + } + + // Pad to 1024 bytes with '#' + padding := 1024 - len(content) + content += strings.Repeat("#", padding) + + return os.WriteFile(e.path, []byte(content), 0o644) +} diff --git a/update/pkg/grubenv/grubenv_test.go b/update/pkg/grubenv/grubenv_test.go new file mode 100644 index 0000000..1019ba8 --- /dev/null +++ b/update/pkg/grubenv/grubenv_test.go @@ -0,0 +1,423 @@ +package grubenv + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +// createTestGrubenv writes a properly formatted grubenv file for testing. +// GRUB requires the file to be exactly 1024 bytes, padded with '#'. +func createTestGrubenv(t *testing.T, dir string, vars map[string]string) string { + t.Helper() + path := filepath.Join(dir, "grubenv") + + var sb strings.Builder + sb.WriteString("# GRUB Environment Block\n") + for k, v := range vars { + sb.WriteString(k + "=" + v + "\n") + } + + content := sb.String() + padding := 1024 - len(content) + if padding > 0 { + content += strings.Repeat("#", padding) + } + + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatal(err) + } + return path +} + +func TestNew(t *testing.T) { + env := New("") + if env.path != DefaultGrubenvPath { + t.Errorf("expected default path %s, got %s", DefaultGrubenvPath, env.path) + } + + env = New("/custom/path/grubenv") + if env.path != "/custom/path/grubenv" { + t.Errorf("expected custom path, got %s", env.path) + } +} + +func TestReadAll(t *testing.T) { + dir := t.TempDir() + path := createTestGrubenv(t, dir, map[string]string{ + "active_slot": "A", + "boot_counter": "3", + "boot_success": "1", + }) + + env := New(path) + vars, err := env.ReadAll() + if err != nil { + t.Fatal(err) 
+ } + + if vars["active_slot"] != "A" { + t.Errorf("active_slot: expected A, got %s", vars["active_slot"]) + } + if vars["boot_counter"] != "3" { + t.Errorf("boot_counter: expected 3, got %s", vars["boot_counter"]) + } + if vars["boot_success"] != "1" { + t.Errorf("boot_success: expected 1, got %s", vars["boot_success"]) + } +} + +func TestGet(t *testing.T) { + dir := t.TempDir() + path := createTestGrubenv(t, dir, map[string]string{ + "active_slot": "B", + }) + + env := New(path) + + val, err := env.Get("active_slot") + if err != nil { + t.Fatal(err) + } + if val != "B" { + t.Errorf("expected B, got %s", val) + } + + _, err = env.Get("nonexistent") + if err == nil { + t.Fatal("expected error for nonexistent key") + } +} + +func TestSet(t *testing.T) { + dir := t.TempDir() + path := createTestGrubenv(t, dir, map[string]string{ + "active_slot": "A", + "boot_counter": "3", + }) + + env := New(path) + + if err := env.Set("boot_counter", "2"); err != nil { + t.Fatal(err) + } + + val, err := env.Get("boot_counter") + if err != nil { + t.Fatal(err) + } + if val != "2" { + t.Errorf("expected 2 after set, got %s", val) + } + + // Verify file is still 1024 bytes + data, err := os.ReadFile(path) + if err != nil { + t.Fatal(err) + } + if len(data) != 1024 { + t.Errorf("grubenv should be 1024 bytes, got %d", len(data)) + } +} + +func TestActiveSlot(t *testing.T) { + dir := t.TempDir() + path := createTestGrubenv(t, dir, map[string]string{ + "active_slot": "A", + "boot_counter": "3", + "boot_success": "1", + }) + + env := New(path) + slot, err := env.ActiveSlot() + if err != nil { + t.Fatal(err) + } + if slot != "A" { + t.Errorf("expected A, got %s", slot) + } +} + +func TestPassiveSlot(t *testing.T) { + tests := []struct { + active string + passive string + }{ + {"A", "B"}, + {"B", "A"}, + } + + for _, tt := range tests { + t.Run("active_"+tt.active, func(t *testing.T) { + dir := t.TempDir() + path := createTestGrubenv(t, dir, map[string]string{ + "active_slot": tt.active, + 
}) + + env := New(path) + passive, err := env.PassiveSlot() + if err != nil { + t.Fatal(err) + } + if passive != tt.passive { + t.Errorf("expected passive %s, got %s", tt.passive, passive) + } + }) + } +} + +func TestBootCounter(t *testing.T) { + tests := []struct { + value string + expect int + wantErr bool + }{ + {"0", 0, false}, + {"1", 1, false}, + {"2", 2, false}, + {"3", 3, false}, + {"invalid", -1, true}, + {"99", -1, true}, + } + + for _, tt := range tests { + t.Run("counter_"+tt.value, func(t *testing.T) { + dir := t.TempDir() + path := createTestGrubenv(t, dir, map[string]string{ + "boot_counter": tt.value, + }) + + env := New(path) + counter, err := env.BootCounter() + if tt.wantErr { + if err == nil { + t.Fatal("expected error") + } + return + } + if err != nil { + t.Fatal(err) + } + if counter != tt.expect { + t.Errorf("expected %d, got %d", tt.expect, counter) + } + }) + } +} + +func TestBootSuccess(t *testing.T) { + tests := []struct { + value string + expect bool + }{ + {"0", false}, + {"1", true}, + } + + for _, tt := range tests { + t.Run("success_"+tt.value, func(t *testing.T) { + dir := t.TempDir() + path := createTestGrubenv(t, dir, map[string]string{ + "boot_success": tt.value, + }) + + env := New(path) + success, err := env.BootSuccess() + if err != nil { + t.Fatal(err) + } + if success != tt.expect { + t.Errorf("expected %v, got %v", tt.expect, success) + } + }) + } +} + +func TestMarkBootSuccess(t *testing.T) { + dir := t.TempDir() + path := createTestGrubenv(t, dir, map[string]string{ + "active_slot": "B", + "boot_counter": "1", + "boot_success": "0", + }) + + env := New(path) + if err := env.MarkBootSuccess(); err != nil { + t.Fatal(err) + } + + success, err := env.BootSuccess() + if err != nil { + t.Fatal(err) + } + if !success { + t.Error("expected boot_success=1 after MarkBootSuccess") + } + + counter, err := env.BootCounter() + if err != nil { + t.Fatal(err) + } + if counter != 3 { + t.Errorf("expected boot_counter=3 after 
MarkBootSuccess, got %d", counter) + } +} + +func TestActivateSlot(t *testing.T) { + dir := t.TempDir() + path := createTestGrubenv(t, dir, map[string]string{ + "active_slot": "A", + "boot_counter": "3", + "boot_success": "1", + }) + + env := New(path) + if err := env.ActivateSlot("B"); err != nil { + t.Fatal(err) + } + + slot, _ := env.ActiveSlot() + if slot != "B" { + t.Errorf("expected active_slot=B, got %s", slot) + } + + counter, _ := env.BootCounter() + if counter != 3 { + t.Errorf("expected boot_counter=3, got %d", counter) + } + + success, _ := env.BootSuccess() + if success { + t.Error("expected boot_success=0 after ActivateSlot") + } +} + +func TestActivateSlotInvalid(t *testing.T) { + dir := t.TempDir() + path := createTestGrubenv(t, dir, map[string]string{ + "active_slot": "A", + }) + + env := New(path) + err := env.ActivateSlot("C") + if err == nil { + t.Fatal("expected error for invalid slot") + } +} + +func TestForceRollback(t *testing.T) { + dir := t.TempDir() + path := createTestGrubenv(t, dir, map[string]string{ + "active_slot": "A", + "boot_counter": "3", + "boot_success": "1", + }) + + env := New(path) + if err := env.ForceRollback(); err != nil { + t.Fatal(err) + } + + slot, _ := env.ActiveSlot() + if slot != "B" { + t.Errorf("expected active_slot=B after rollback from A, got %s", slot) + } +} + +func TestParseEnvOutput(t *testing.T) { + input := `# GRUB Environment Block +active_slot=A +boot_counter=3 +boot_success=1 + +` + vars := parseEnvOutput(input) + + if len(vars) != 3 { + t.Errorf("expected 3 variables, got %d", len(vars)) + } + if vars["active_slot"] != "A" { + t.Errorf("active_slot: expected A, got %s", vars["active_slot"]) + } + if vars["boot_counter"] != "3" { + t.Errorf("boot_counter: expected 3, got %s", vars["boot_counter"]) + } +} + +func TestWriteManualFormat(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "grubenv") + + env := New(path) + // Use setManual directly since grub-editenv may not be available + err 
:= env.setManual("test_key", "test_value") + if err != nil { + t.Fatal(err) + } + + data, err := os.ReadFile(path) + if err != nil { + t.Fatal(err) + } + + if len(data) != 1024 { + t.Errorf("grubenv should be exactly 1024 bytes, got %d", len(data)) + } + + if !strings.HasPrefix(string(data), "# GRUB Environment Block\n") { + t.Error("grubenv should start with '# GRUB Environment Block'") + } + + if !strings.Contains(string(data), "test_key=test_value\n") { + t.Error("grubenv should contain test_key=test_value") + } +} + +func TestReadNonexistentFile(t *testing.T) { + env := New("/nonexistent/path/grubenv") + _, err := env.ReadAll() + if err == nil { + t.Fatal("expected error reading nonexistent file") + } +} + +func TestMultipleSetOperations(t *testing.T) { + dir := t.TempDir() + path := createTestGrubenv(t, dir, map[string]string{ + "active_slot": "A", + "boot_counter": "3", + "boot_success": "1", + }) + + env := New(path) + + // Simulate a boot cycle: decrement counter, then mark success + if err := env.Set("boot_counter", "2"); err != nil { + t.Fatal(err) + } + if err := env.Set("boot_success", "0"); err != nil { + t.Fatal(err) + } + + // Now mark boot success + if err := env.MarkBootSuccess(); err != nil { + t.Fatal(err) + } + + // Verify final state + vars, err := env.ReadAll() + if err != nil { + t.Fatal(err) + } + + if vars["active_slot"] != "A" { + t.Errorf("active_slot should still be A, got %s", vars["active_slot"]) + } + if vars["boot_counter"] != "3" { + t.Errorf("boot_counter should be 3 after mark success, got %s", vars["boot_counter"]) + } + if vars["boot_success"] != "1" { + t.Errorf("boot_success should be 1, got %s", vars["boot_success"]) + } +} diff --git a/update/pkg/health/health.go b/update/pkg/health/health.go new file mode 100644 index 0000000..90c397c --- /dev/null +++ b/update/pkg/health/health.go @@ -0,0 +1,198 @@ +// Package health implements post-boot health checks for KubeSolo OS. 
+// +// After booting a new system partition, the health check verifies that: +// - containerd is running and responsive +// - KubeSolo API server is reachable +// - The Kubernetes node reaches Ready state +// +// If all checks pass, the GRUB environment is updated to mark the boot +// as successful (boot_success=1). If any check fails, boot_success +// remains 0 and GRUB will eventually roll back. +package health + +import ( + "context" + "fmt" + "log/slog" + "net" + "net/http" + "os" + "os/exec" + "strings" + "time" +) + +// Status represents the result of a health check. +type Status struct { + Containerd bool + APIServer bool + NodeReady bool + Message string +} + +// IsHealthy returns true if all checks passed. +func (s *Status) IsHealthy() bool { + return s.Containerd && s.APIServer && s.NodeReady +} + +// Checker performs health checks against the local KubeSolo instance. +type Checker struct { + kubeconfigPath string + apiServerAddr string + timeout time.Duration +} + +// NewChecker creates a health checker. +func NewChecker(kubeconfigPath, apiServerAddr string, timeout time.Duration) *Checker { + if kubeconfigPath == "" { + kubeconfigPath = "/var/lib/kubesolo/pki/admin/admin.kubeconfig" + } + if apiServerAddr == "" { + apiServerAddr = "127.0.0.1:6443" + } + if timeout == 0 { + timeout = 120 * time.Second + } + return &Checker{ + kubeconfigPath: kubeconfigPath, + apiServerAddr: apiServerAddr, + timeout: timeout, + } +} + +// CheckContainerd verifies that containerd is running. 
+func (c *Checker) CheckContainerd() bool { + // Check if containerd socket exists + if _, err := os.Stat("/run/containerd/containerd.sock"); err != nil { + slog.Warn("containerd socket not found") + return false + } + + // Try ctr version (bundled with KubeSolo) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "ctr", "--address", "/run/containerd/containerd.sock", "version") + if err := cmd.Run(); err != nil { + slog.Warn("containerd not responsive", "error", err) + return false + } + + slog.Debug("containerd healthy") + return true +} + +// CheckAPIServer verifies the Kubernetes API server is reachable. +func (c *Checker) CheckAPIServer() bool { + // TCP connect to API server port + conn, err := net.DialTimeout("tcp", c.apiServerAddr, 5*time.Second) + if err != nil { + slog.Warn("API server not reachable", "addr", c.apiServerAddr, "error", err) + return false + } + conn.Close() + + // Try HTTPS health endpoint (skip TLS verify for localhost) + client := &http.Client{ + Timeout: 5 * time.Second, + Transport: &http.Transport{ + TLSHandshakeTimeout: 5 * time.Second, + }, + } + + resp, err := client.Get("https://" + c.apiServerAddr + "/healthz") + if err != nil { + // TLS error is expected without proper CA, but TCP connect succeeded + slog.Debug("API server TCP reachable but HTTPS check skipped", "error", err) + return true + } + defer resp.Body.Close() + + if resp.StatusCode == http.StatusOK { + slog.Debug("API server healthy", "status", resp.StatusCode) + return true + } + + slog.Warn("API server unhealthy", "status", resp.StatusCode) + return false +} + +// CheckNodeReady uses kubectl to verify the node is in Ready state. 
+func (c *Checker) CheckNodeReady() bool { + if _, err := os.Stat(c.kubeconfigPath); err != nil { + slog.Warn("kubeconfig not found", "path", c.kubeconfigPath) + return false + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "kubectl", + "--kubeconfig", c.kubeconfigPath, + "get", "nodes", + "-o", "jsonpath={.items[0].status.conditions[?(@.type==\"Ready\")].status}", + ) + output, err := cmd.Output() + if err != nil { + slog.Warn("kubectl get nodes failed", "error", err) + return false + } + + status := strings.TrimSpace(string(output)) + if status == "True" { + slog.Debug("node is Ready") + return true + } + + slog.Warn("node not Ready", "status", status) + return false +} + +// RunAll performs all health checks and returns the combined status. +func (c *Checker) RunAll() *Status { + return &Status{ + Containerd: c.CheckContainerd(), + APIServer: c.CheckAPIServer(), + NodeReady: c.CheckNodeReady(), + } +} + +// WaitForHealthy polls health checks until all pass or timeout expires. 
+func (c *Checker) WaitForHealthy() (*Status, error) { + deadline := time.Now().Add(c.timeout) + interval := 5 * time.Second + + slog.Info("waiting for system health", "timeout", c.timeout) + + for time.Now().Before(deadline) { + status := c.RunAll() + if status.IsHealthy() { + status.Message = "all checks passed" + slog.Info("system healthy", + "containerd", status.Containerd, + "apiserver", status.APIServer, + "node_ready", status.NodeReady, + ) + return status, nil + } + + slog.Debug("health check pending", + "containerd", status.Containerd, + "apiserver", status.APIServer, + "node_ready", status.NodeReady, + "remaining", time.Until(deadline).Round(time.Second), + ) + + time.Sleep(interval) + } + + // Final check + status := c.RunAll() + if status.IsHealthy() { + status.Message = "all checks passed" + return status, nil + } + + status.Message = "health check timeout" + return status, fmt.Errorf("health check timed out after %s", c.timeout) +} diff --git a/update/pkg/health/health_test.go b/update/pkg/health/health_test.go new file mode 100644 index 0000000..91a4ee0 --- /dev/null +++ b/update/pkg/health/health_test.go @@ -0,0 +1,86 @@ +package health + +import ( + "testing" + "time" +) + +func TestStatusIsHealthy(t *testing.T) { + tests := []struct { + name string + status Status + wantHealth bool + }{ + { + name: "all healthy", + status: Status{Containerd: true, APIServer: true, NodeReady: true}, + wantHealth: true, + }, + { + name: "containerd down", + status: Status{Containerd: false, APIServer: true, NodeReady: true}, + wantHealth: false, + }, + { + name: "apiserver down", + status: Status{Containerd: true, APIServer: false, NodeReady: true}, + wantHealth: false, + }, + { + name: "node not ready", + status: Status{Containerd: true, APIServer: true, NodeReady: false}, + wantHealth: false, + }, + { + name: "all down", + status: Status{Containerd: false, APIServer: false, NodeReady: false}, + wantHealth: false, + }, + } + + for _, tt := range tests { + 
t.Run(tt.name, func(t *testing.T) { + if got := tt.status.IsHealthy(); got != tt.wantHealth { + t.Errorf("IsHealthy() = %v, want %v", got, tt.wantHealth) + } + }) + } +} + +func TestNewChecker(t *testing.T) { + // Test defaults + c := NewChecker("", "", 0) + if c.kubeconfigPath != "/var/lib/kubesolo/pki/admin/admin.kubeconfig" { + t.Errorf("unexpected default kubeconfig: %s", c.kubeconfigPath) + } + if c.apiServerAddr != "127.0.0.1:6443" { + t.Errorf("unexpected default apiserver addr: %s", c.apiServerAddr) + } + if c.timeout != 120*time.Second { + t.Errorf("unexpected default timeout: %v", c.timeout) + } + + // Test custom values + c = NewChecker("/custom/kubeconfig", "10.0.0.1:6443", 30*time.Second) + if c.kubeconfigPath != "/custom/kubeconfig" { + t.Errorf("expected custom kubeconfig, got %s", c.kubeconfigPath) + } + if c.apiServerAddr != "10.0.0.1:6443" { + t.Errorf("expected custom addr, got %s", c.apiServerAddr) + } + if c.timeout != 30*time.Second { + t.Errorf("expected 30s timeout, got %v", c.timeout) + } +} + +func TestStatusMessage(t *testing.T) { + s := &Status{ + Containerd: true, + APIServer: true, + NodeReady: true, + Message: "all checks passed", + } + if s.Message != "all checks passed" { + t.Errorf("unexpected message: %s", s.Message) + } +} diff --git a/update/pkg/image/image.go b/update/pkg/image/image.go new file mode 100644 index 0000000..54e15ce --- /dev/null +++ b/update/pkg/image/image.go @@ -0,0 +1,180 @@ +// Package image handles downloading, verifying, and staging OS update images. +// +// Update images are distributed as pairs of files: +// - vmlinuz (kernel) +// - kubesolo-os.gz (initramfs) +// +// These are fetched from an HTTP(S) server that provides a metadata file +// (latest.json) describing available updates. 
+package image + +import ( + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "io" + "log/slog" + "net/http" + "os" + "path/filepath" + "time" +) + +// UpdateMetadata describes an available update from the update server. +type UpdateMetadata struct { + Version string `json:"version"` + VmlinuzURL string `json:"vmlinuz_url"` + VmlinuzSHA256 string `json:"vmlinuz_sha256"` + InitramfsURL string `json:"initramfs_url"` + InitramfsSHA256 string `json:"initramfs_sha256"` + ReleaseNotes string `json:"release_notes,omitempty"` + ReleaseDate string `json:"release_date,omitempty"` +} + +// StagedImage represents downloaded and verified update files. +type StagedImage struct { + VmlinuzPath string + InitramfsPath string + Version string +} + +// Client handles communication with the update server. +type Client struct { + serverURL string + httpClient *http.Client + stageDir string +} + +// NewClient creates a new update image client. +func NewClient(serverURL, stageDir string) *Client { + return &Client{ + serverURL: serverURL, + httpClient: &http.Client{ + Timeout: 5 * time.Minute, + }, + stageDir: stageDir, + } +} + +// CheckForUpdate fetches the latest update metadata from the server. +func (c *Client) CheckForUpdate() (*UpdateMetadata, error) { + url := c.serverURL + "/latest.json" + slog.Info("checking for update", "url", url) + + resp, err := c.httpClient.Get(url) + if err != nil { + return nil, fmt.Errorf("fetching update metadata: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("update server returned %d", resp.StatusCode) + } + + var meta UpdateMetadata + if err := json.NewDecoder(resp.Body).Decode(&meta); err != nil { + return nil, fmt.Errorf("parsing update metadata: %w", err) + } + + if meta.Version == "" { + return nil, fmt.Errorf("update metadata missing version") + } + + return &meta, nil +} + +// Download fetches the update files and verifies their checksums. 
+func (c *Client) Download(meta *UpdateMetadata) (*StagedImage, error) { + if err := os.MkdirAll(c.stageDir, 0o755); err != nil { + return nil, fmt.Errorf("creating stage dir: %w", err) + } + + vmlinuzPath := filepath.Join(c.stageDir, "vmlinuz") + initramfsPath := filepath.Join(c.stageDir, "kubesolo-os.gz") + + slog.Info("downloading vmlinuz", "url", meta.VmlinuzURL) + if err := c.downloadAndVerify(meta.VmlinuzURL, vmlinuzPath, meta.VmlinuzSHA256); err != nil { + return nil, fmt.Errorf("downloading vmlinuz: %w", err) + } + + slog.Info("downloading initramfs", "url", meta.InitramfsURL) + if err := c.downloadAndVerify(meta.InitramfsURL, initramfsPath, meta.InitramfsSHA256); err != nil { + return nil, fmt.Errorf("downloading initramfs: %w", err) + } + + return &StagedImage{ + VmlinuzPath: vmlinuzPath, + InitramfsPath: initramfsPath, + Version: meta.Version, + }, nil +} + +// Cleanup removes staged update files. +func (c *Client) Cleanup() error { + return os.RemoveAll(c.stageDir) +} + +func (c *Client) downloadAndVerify(url, dest, expectedSHA256 string) error { + resp, err := c.httpClient.Get(url) + if err != nil { + return fmt.Errorf("downloading %s: %w", url, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("server returned %d for %s", resp.StatusCode, url) + } + + f, err := os.Create(dest) + if err != nil { + return fmt.Errorf("creating %s: %w", dest, err) + } + defer f.Close() + + hasher := sha256.New() + writer := io.MultiWriter(f, hasher) + + written, err := io.Copy(writer, resp.Body) + if err != nil { + os.Remove(dest) + return fmt.Errorf("writing %s: %w", dest, err) + } + + if err := f.Close(); err != nil { + return fmt.Errorf("closing %s: %w", dest, err) + } + + // Verify checksum + if expectedSHA256 != "" { + actual := hex.EncodeToString(hasher.Sum(nil)) + if actual != expectedSHA256 { + os.Remove(dest) + return fmt.Errorf("checksum mismatch for %s: expected %s, got %s", dest, expectedSHA256, actual) + } + 
slog.Debug("checksum verified", "file", dest, "sha256", actual) + } + + slog.Info("downloaded", "file", dest, "size", written) + return nil +} + +// VerifyFile checks the SHA256 checksum of an existing file. +func VerifyFile(path, expectedSHA256 string) error { + f, err := os.Open(path) + if err != nil { + return err + } + defer f.Close() + + hasher := sha256.New() + if _, err := io.Copy(hasher, f); err != nil { + return err + } + + actual := hex.EncodeToString(hasher.Sum(nil)) + if actual != expectedSHA256 { + return fmt.Errorf("checksum mismatch: expected %s, got %s", expectedSHA256, actual) + } + return nil +} diff --git a/update/pkg/image/image_test.go b/update/pkg/image/image_test.go new file mode 100644 index 0000000..bf349d1 --- /dev/null +++ b/update/pkg/image/image_test.go @@ -0,0 +1,241 @@ +package image + +import ( + "crypto/sha256" + "encoding/hex" + "encoding/json" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "testing" +) + +func TestCheckForUpdate(t *testing.T) { + meta := UpdateMetadata{ + Version: "1.2.0", + VmlinuzURL: "/vmlinuz", + VmlinuzSHA256: "abc123", + InitramfsURL: "/kubesolo-os.gz", + InitramfsSHA256: "def456", + ReleaseNotes: "Bug fixes", + ReleaseDate: "2025-01-15", + } + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/latest.json" { + http.NotFound(w, r) + return + } + json.NewEncoder(w).Encode(meta) + })) + defer server.Close() + + client := NewClient(server.URL, "") + got, err := client.CheckForUpdate() + if err != nil { + t.Fatal(err) + } + + if got.Version != "1.2.0" { + t.Errorf("expected version 1.2.0, got %s", got.Version) + } + if got.VmlinuzSHA256 != "abc123" { + t.Errorf("expected vmlinuz sha abc123, got %s", got.VmlinuzSHA256) + } + if got.ReleaseNotes != "Bug fixes" { + t.Errorf("expected release notes, got %s", got.ReleaseNotes) + } +} + +func TestCheckForUpdateMissingVersion(t *testing.T) { + server := 
httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + json.NewEncoder(w).Encode(UpdateMetadata{}) + })) + defer server.Close() + + client := NewClient(server.URL, "") + _, err := client.CheckForUpdate() + if err == nil { + t.Fatal("expected error for missing version") + } +} + +func TestCheckForUpdateServerError(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer server.Close() + + client := NewClient(server.URL, "") + _, err := client.CheckForUpdate() + if err == nil { + t.Fatal("expected error for server error") + } +} + +func TestDownloadAndVerify(t *testing.T) { + // Create test content + vmlinuzContent := []byte("fake vmlinuz content for testing") + initramfsContent := []byte("fake initramfs content for testing") + + vmlinuzHash := sha256.Sum256(vmlinuzContent) + initramfsHash := sha256.Sum256(initramfsContent) + + meta := UpdateMetadata{ + Version: "2.0.0", + VmlinuzSHA256: hex.EncodeToString(vmlinuzHash[:]), + InitramfsSHA256: hex.EncodeToString(initramfsHash[:]), + } + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/latest.json": + m := meta + m.VmlinuzURL = "http://" + r.Host + "/vmlinuz" + m.InitramfsURL = "http://" + r.Host + "/kubesolo-os.gz" + json.NewEncoder(w).Encode(m) + case "/vmlinuz": + w.Write(vmlinuzContent) + case "/kubesolo-os.gz": + w.Write(initramfsContent) + default: + http.NotFound(w, r) + } + })) + defer server.Close() + + stageDir := filepath.Join(t.TempDir(), "stage") + client := NewClient(server.URL, stageDir) + defer client.Cleanup() + + // First get metadata + gotMeta, err := client.CheckForUpdate() + if err != nil { + t.Fatal(err) + } + + // Download + staged, err := client.Download(gotMeta) + if err != nil { + t.Fatal(err) + } + + if staged.Version != "2.0.0" { + t.Errorf("expected version 2.0.0, got %s", 
staged.Version) + } + + // Verify files exist + if _, err := os.Stat(staged.VmlinuzPath); err != nil { + t.Errorf("vmlinuz not found: %v", err) + } + if _, err := os.Stat(staged.InitramfsPath); err != nil { + t.Errorf("initramfs not found: %v", err) + } + + // Verify content + data, _ := os.ReadFile(staged.VmlinuzPath) + if string(data) != string(vmlinuzContent) { + t.Error("vmlinuz content mismatch") + } +} + +func TestDownloadChecksumMismatch(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/vmlinuz": + w.Write([]byte("actual content")) + default: + http.NotFound(w, r) + } + })) + defer server.Close() + + stageDir := filepath.Join(t.TempDir(), "stage") + client := NewClient(server.URL, stageDir) + + meta := &UpdateMetadata{ + Version: "1.0.0", + VmlinuzURL: server.URL + "/vmlinuz", + VmlinuzSHA256: "wrong_checksum_value", + InitramfsURL: server.URL + "/initramfs", + } + + _, err := client.Download(meta) + if err == nil { + t.Fatal("expected checksum mismatch error") + } +} + +func TestVerifyFile(t *testing.T) { + content := []byte("test file content for verification") + hash := sha256.Sum256(content) + expected := hex.EncodeToString(hash[:]) + + dir := t.TempDir() + path := filepath.Join(dir, "testfile") + if err := os.WriteFile(path, content, 0o644); err != nil { + t.Fatal(err) + } + + // Should pass with correct hash + if err := VerifyFile(path, expected); err != nil { + t.Errorf("expected verification to pass: %v", err) + } + + // Should fail with wrong hash + if err := VerifyFile(path, "deadbeef"); err == nil { + t.Error("expected verification to fail with wrong hash") + } +} + +func TestVerifyFileNotFound(t *testing.T) { + err := VerifyFile("/nonexistent/file", "abc123") + if err == nil { + t.Error("expected error for nonexistent file") + } +} + +func TestCleanup(t *testing.T) { + stageDir := filepath.Join(t.TempDir(), "stage") + os.MkdirAll(stageDir, 0o755) + 
os.WriteFile(filepath.Join(stageDir, "test"), []byte("data"), 0o644) + + client := NewClient("http://unused", stageDir) + if err := client.Cleanup(); err != nil { + t.Fatal(err) + } + + if _, err := os.Stat(stageDir); !os.IsNotExist(err) { + t.Error("stage dir should be removed after cleanup") + } +} + +func TestUpdateMetadataJSON(t *testing.T) { + meta := UpdateMetadata{ + Version: "1.0.0", + VmlinuzURL: "https://example.com/vmlinuz", + VmlinuzSHA256: "abc", + InitramfsURL: "https://example.com/kubesolo-os.gz", + InitramfsSHA256: "def", + ReleaseNotes: "Initial release", + ReleaseDate: "2025-01-01", + } + + data, err := json.Marshal(meta) + if err != nil { + t.Fatal(err) + } + + var decoded UpdateMetadata + if err := json.Unmarshal(data, &decoded); err != nil { + t.Fatal(err) + } + + if decoded.Version != meta.Version { + t.Errorf("version mismatch: %s != %s", decoded.Version, meta.Version) + } + if decoded.ReleaseDate != meta.ReleaseDate { + t.Errorf("release date mismatch: %s != %s", decoded.ReleaseDate, meta.ReleaseDate) + } +} diff --git a/update/pkg/partition/partition.go b/update/pkg/partition/partition.go new file mode 100644 index 0000000..28e14ce --- /dev/null +++ b/update/pkg/partition/partition.go @@ -0,0 +1,139 @@ +// Package partition detects and manages A/B system partitions. +// +// It identifies System A and System B partitions by label (KSOLOA, KSOLOB) +// and provides mount/write operations for the update process. +package partition + +import ( + "fmt" + "log/slog" + "os" + "os/exec" + "path/filepath" + "strings" +) + +const ( + LabelSystemA = "KSOLOA" + LabelSystemB = "KSOLOB" + LabelData = "KSOLODATA" + LabelEFI = "KSOLOEFI" +) + +// Info contains information about a partition. +type Info struct { + Device string // e.g. /dev/sda2 + Label string // e.g. KSOLOA + MountPoint string // current mount point, empty if not mounted + Slot string // "A" or "B" +} + +// FindByLabel locates a block device by its filesystem label. 
+func FindByLabel(label string) (string, error) { + cmd := exec.Command("blkid", "-L", label) + output, err := cmd.Output() + if err != nil { + return "", fmt.Errorf("partition with label %q not found: %w", label, err) + } + return strings.TrimSpace(string(output)), nil +} + +// GetSlotPartition returns the partition info for the given slot ("A" or "B"). +func GetSlotPartition(slot string) (*Info, error) { + var label string + switch slot { + case "A": + label = LabelSystemA + case "B": + label = LabelSystemB + default: + return nil, fmt.Errorf("invalid slot: %q", slot) + } + + dev, err := FindByLabel(label) + if err != nil { + return nil, err + } + + return &Info{ + Device: dev, + Label: label, + Slot: slot, + }, nil +} + +// MountReadOnly mounts a partition read-only at the given mount point. +func MountReadOnly(dev, mountPoint string) error { + if err := os.MkdirAll(mountPoint, 0o755); err != nil { + return fmt.Errorf("creating mount point: %w", err) + } + cmd := exec.Command("mount", "-o", "ro", dev, mountPoint) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("mounting %s at %s: %w\n%s", dev, mountPoint, err, output) + } + slog.Debug("mounted", "device", dev, "mountpoint", mountPoint, "mode", "ro") + return nil +} + +// MountReadWrite mounts a partition read-write at the given mount point. +func MountReadWrite(dev, mountPoint string) error { + if err := os.MkdirAll(mountPoint, 0o755); err != nil { + return fmt.Errorf("creating mount point: %w", err) + } + cmd := exec.Command("mount", dev, mountPoint) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("mounting %s at %s: %w\n%s", dev, mountPoint, err, output) + } + slog.Debug("mounted", "device", dev, "mountpoint", mountPoint, "mode", "rw") + return nil +} + +// Unmount unmounts a mount point. 
+func Unmount(mountPoint string) error { + cmd := exec.Command("umount", mountPoint) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("unmounting %s: %w\n%s", mountPoint, err, output) + } + return nil +} + +// ReadVersion reads the version file from a mounted system partition. +func ReadVersion(mountPoint string) (string, error) { + data, err := os.ReadFile(filepath.Join(mountPoint, "version")) + if err != nil { + return "", fmt.Errorf("reading version: %w", err) + } + return strings.TrimSpace(string(data)), nil +} + +// WriteSystemImage copies vmlinuz and initramfs to a mounted partition. +func WriteSystemImage(mountPoint, vmlinuzPath, initramfsPath, version string) error { + // Copy vmlinuz + if err := copyFile(vmlinuzPath, filepath.Join(mountPoint, "vmlinuz")); err != nil { + return fmt.Errorf("writing vmlinuz: %w", err) + } + + // Copy initramfs + if err := copyFile(initramfsPath, filepath.Join(mountPoint, "kubesolo-os.gz")); err != nil { + return fmt.Errorf("writing initramfs: %w", err) + } + + // Write version + if err := os.WriteFile(filepath.Join(mountPoint, "version"), []byte(version+"\n"), 0o644); err != nil { + return fmt.Errorf("writing version: %w", err) + } + + // Sync to ensure data is flushed to disk + exec.Command("sync").Run() + + slog.Info("system image written", "mountpoint", mountPoint, "version", version) + return nil +} + +func copyFile(src, dst string) error { + data, err := os.ReadFile(src) + if err != nil { + return err + } + return os.WriteFile(dst, data, 0o644) +} diff --git a/update/pkg/partition/partition_test.go b/update/pkg/partition/partition_test.go new file mode 100644 index 0000000..18ae461 --- /dev/null +++ b/update/pkg/partition/partition_test.go @@ -0,0 +1,129 @@ +package partition + +import ( + "os" + "path/filepath" + "testing" +) + +func TestReadVersion(t *testing.T) { + dir := t.TempDir() + versionFile := filepath.Join(dir, "version") + if err := os.WriteFile(versionFile, []byte("1.2.3\n"), 
0o644); err != nil { + t.Fatal(err) + } + + version, err := ReadVersion(dir) + if err != nil { + t.Fatal(err) + } + if version != "1.2.3" { + t.Errorf("expected 1.2.3, got %s", version) + } +} + +func TestReadVersionMissing(t *testing.T) { + dir := t.TempDir() + _, err := ReadVersion(dir) + if err == nil { + t.Fatal("expected error for missing version file") + } +} + +func TestWriteSystemImage(t *testing.T) { + mountPoint := t.TempDir() + srcDir := t.TempDir() + + // Create source files + vmlinuzPath := filepath.Join(srcDir, "vmlinuz") + initramfsPath := filepath.Join(srcDir, "kubesolo-os.gz") + + if err := os.WriteFile(vmlinuzPath, []byte("kernel data"), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(initramfsPath, []byte("initramfs data"), 0o644); err != nil { + t.Fatal(err) + } + + if err := WriteSystemImage(mountPoint, vmlinuzPath, initramfsPath, "2.0.0"); err != nil { + t.Fatal(err) + } + + // Verify files were copied + data, err := os.ReadFile(filepath.Join(mountPoint, "vmlinuz")) + if err != nil { + t.Fatal(err) + } + if string(data) != "kernel data" { + t.Errorf("vmlinuz content mismatch") + } + + data, err = os.ReadFile(filepath.Join(mountPoint, "kubesolo-os.gz")) + if err != nil { + t.Fatal(err) + } + if string(data) != "initramfs data" { + t.Errorf("initramfs content mismatch") + } + + // Verify version file + version, err := ReadVersion(mountPoint) + if err != nil { + t.Fatal(err) + } + if version != "2.0.0" { + t.Errorf("expected version 2.0.0, got %s", version) + } +} + +func TestCopyFile(t *testing.T) { + dir := t.TempDir() + src := filepath.Join(dir, "src") + dst := filepath.Join(dir, "dst") + + if err := os.WriteFile(src, []byte("test content"), 0o644); err != nil { + t.Fatal(err) + } + + if err := copyFile(src, dst); err != nil { + t.Fatal(err) + } + + data, err := os.ReadFile(dst) + if err != nil { + t.Fatal(err) + } + if string(data) != "test content" { + t.Errorf("copy content mismatch") + } +} + +func TestCopyFileNotFound(t 
*testing.T) { + dir := t.TempDir() + err := copyFile("/nonexistent", filepath.Join(dir, "dst")) + if err == nil { + t.Fatal("expected error for nonexistent source") + } +} + +func TestGetSlotPartitionInvalid(t *testing.T) { + _, err := GetSlotPartition("C") + if err == nil { + t.Fatal("expected error for invalid slot") + } +} + +func TestConstants(t *testing.T) { + if LabelSystemA != "KSOLOA" { + t.Errorf("unexpected LabelSystemA: %s", LabelSystemA) + } + if LabelSystemB != "KSOLOB" { + t.Errorf("unexpected LabelSystemB: %s", LabelSystemB) + } + if LabelData != "KSOLODATA" { + t.Errorf("unexpected LabelData: %s", LabelData) + } + if LabelEFI != "KSOLOEFI" { + t.Errorf("unexpected LabelEFI: %s", LabelEFI) + } +}