feat: add A/B partition updates with GRUB and Go update agent (Phase 3)

Implement atomic OS updates via A/B partition scheme with automatic
rollback. GRUB bootloader manages slot selection with a 3-attempt
boot counter that auto-rolls back on repeated health check failures.

GRUB boot config:
- A/B slot selection with boot_counter/boot_success env vars
- Automatic rollback when counter reaches 0 (3 failed boots)
- Debug, emergency shell, and manual slot-switch menu entries

Disk image (refactored):
- 4-partition GPT layout: EFI + System A + System B + Data
- GRUB EFI/BIOS installation with graceful fallbacks
- Both system partitions populated during image creation

Update agent (Go, zero external deps):
- pkg/grubenv: read/write GRUB env vars (grub-editenv + manual fallback)
- pkg/partition: find/mount/write system partitions by label
- pkg/image: HTTP download with SHA256 verification
- pkg/health: post-boot checks (containerd, API server, node Ready)
- 6 CLI commands: check, apply, activate, rollback, healthcheck, status
- 37 unit tests across all 4 packages

Deployment:
- K8s CronJob for automatic update checks (every 6 hours)
- ConfigMap for update server URL
- Health check Job for post-boot verification

Build pipeline:
- build-update-agent.sh compiles static Linux binary (~5.9 MB)
- inject-kubesolo.sh includes update agent in initramfs
- Makefile: build-update-agent, test-update-agent, test-update targets

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-11 11:12:46 -06:00
parent d900fa920e
commit 8d25e1890e
25 changed files with 2807 additions and 74 deletions

View File

@@ -0,0 +1,11 @@
# KubeSolo OS — Default GRUB Environment Variables
# These are written to grubenv on first install.
# Format: key=value (one per line, grub-editenv compatible)
# The image builder skips empty lines and lines starting with '#', and splits
# each remaining line at the first '='.
#
# active_slot: Which system partition to boot (A or B)
# boot_counter: Attempts remaining before rollback (3 = fresh, 0 = rollback)
# boot_success: Set to 1 by health check after successful boot
#
# boot_success starts at 1 so the very first boot is not counted as a failed
# attempt: grub.cfg only touches boot_counter when boot_success is not 1, and
# it resets boot_success to 0 at the start of every boot.
active_slot=A
boot_counter=3
boot_success=1

95
build/grub/grub.cfg Normal file
View File

@@ -0,0 +1,95 @@
# KubeSolo OS — GRUB Configuration
# A/B partition boot with automatic rollback
#
# Partition layout:
#   (hd0,gpt1) — EFI/Boot (256 MB, FAT32) — contains GRUB + grubenv
#   (hd0,gpt2) — System A (512 MB, ext4) — vmlinuz + kubesolo-os.gz
#   (hd0,gpt3) — System B (512 MB, ext4) — vmlinuz + kubesolo-os.gz
#   (hd0,gpt4) — Data (remaining, ext4) — persistent K8s state
#
# Environment variables (in grubenv):
#   active_slot  — "A" or "B" (which partition to boot)
#   boot_counter — 3→2→1→0 (decremented on each failed boot)
#   boot_success — 0 or 1 (set to 1 by health check post-boot)
#
# NOTE(review): the system partitions are addressed as (hd0,gptN). This assumes
# the boot disk is the first disk GRUB enumerates — confirm for multi-disk or
# removable-media targets.

set default=0
set timeout=3

# Load saved environment
load_env

# --- Sanitize environment ---
# A missing or corrupt grubenv leaves the variables empty. Anything that is
# not explicitly "B" is treated as slot A, so an unreadable environment boots
# the factory-default slot instead of silently falling through to System B.
if [ "${active_slot}" != "B" ]; then
    set active_slot=A
fi

# --- A/B Rollback Logic ---
# On every boot, check if the last boot was successful.
# If not, decrement the counter. If counter hits 0, swap slots.
if [ "${boot_success}" != "1" ]; then
    # Last boot failed — check counter
    if [ "${boot_counter}" = "0" ]; then
        # Counter exhausted — rollback to other slot with a fresh counter
        if [ "${active_slot}" = "A" ]; then
            set active_slot=B
        else
            set active_slot=A
        fi
        set boot_counter=3
    # Decrement counter (GRUB doesn't have arithmetic)
    elif [ "${boot_counter}" = "3" ]; then
        set boot_counter=2
    elif [ "${boot_counter}" = "2" ]; then
        set boot_counter=1
    elif [ "${boot_counter}" = "1" ]; then
        set boot_counter=0
    else
        # Unset or unrecognized value: treat it as a fresh counter (3) that
        # has just been decremented, so rollback stays reachable.
        set boot_counter=2
    fi
fi

# Reset boot_success for this boot attempt — health check must set it to 1
set boot_success=0

# Persist all three variables with a single save_env call. Writing them
# separately leaves a window (e.g. power loss between writes) in which the
# slot has already been swapped but the counter is still 0, which would make
# the next boot swap straight back to the broken slot.
save_env active_slot boot_counter boot_success

# --- Resolve boot partition ---
if [ "${active_slot}" = "A" ]; then
    set root='(hd0,gpt2)'
    set slot_label="System A"
else
    set root='(hd0,gpt3)'
    set slot_label="System B"
fi

# --- Menu Entries ---
menuentry "KubeSolo OS (${slot_label})" {
    echo "Booting KubeSolo OS from ${slot_label}..."
    echo "Boot counter: ${boot_counter}, Boot success: ${boot_success}"
    linux /vmlinuz kubesolo.data=LABEL=KSOLODATA quiet
    initrd /kubesolo-os.gz
}

menuentry "KubeSolo OS (${slot_label}) — Debug Mode" {
    echo "Booting KubeSolo OS (debug) from ${slot_label}..."
    linux /vmlinuz kubesolo.data=LABEL=KSOLODATA kubesolo.debug console=ttyS0,115200n8
    initrd /kubesolo-os.gz
}

menuentry "KubeSolo OS — Emergency Shell" {
    echo "Booting to emergency shell..."
    linux /vmlinuz kubesolo.shell console=ttyS0,115200n8
    initrd /kubesolo-os.gz
}

menuentry "KubeSolo OS — Boot Other Slot" {
    # Manually boot the passive slot (for testing). Only the in-memory root
    # is changed; active_slot in grubenv is left as it is.
    if [ "${active_slot}" = "A" ]; then
        set root='(hd0,gpt3)'
        echo "Booting from System B (passive)..."
    else
        set root='(hd0,gpt2)'
        echo "Booting from System A (passive)..."
    fi
    linux /vmlinuz kubesolo.data=LABEL=KSOLODATA kubesolo.debug console=ttyS0,115200n8
    initrd /kubesolo-os.gz
}

View File

@@ -0,0 +1,29 @@
#!/bin/bash
# build-update-agent.sh — Compile the KubeSolo OS update agent
#
# Builds a static Linux binary for the update agent.
# Output: build/cache/kubesolo-update
#
# Requires a Go toolchain on PATH. The unit tests are run before compiling,
# so a failing test aborts the build (set -e).
set -euo pipefail

# Resolve paths relative to this script; the project root is two levels up.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
UPDATE_DIR="$PROJECT_ROOT/update"
CACHE_DIR="$PROJECT_ROOT/build/cache"
OUTPUT="$CACHE_DIR/kubesolo-update"

echo "=== Building KubeSolo Update Agent ==="

# Fail early with a clear message instead of a bare "command not found"
if ! command -v go >/dev/null 2>&1; then
    echo "ERROR: Go toolchain not found in PATH" >&2
    exit 1
fi

# Ensure output dir exists
mkdir -p "$CACHE_DIR"

# Run tests first
echo "--- Running tests ---"
(cd "$UPDATE_DIR" && go test ./... -count=1)

# Build static binary (CGO disabled => no libc dependency; -s -w strips
# symbol and debug tables to shrink the binary)
echo "--- Compiling static binary ---"
(cd "$UPDATE_DIR" && CGO_ENABLED=0 GOOS=linux GOARCH=amd64 \
    go build -ldflags='-s -w' -o "$OUTPUT" .)

# Report the size without parsing `ls` output
SIZE=$(du -h "$OUTPUT" | cut -f1)
echo "--- Update agent built: $OUTPUT ($SIZE) ---"

View File

@@ -1,6 +1,11 @@
#!/bin/bash
# create-disk-image.sh — Create a raw disk image with boot + data partitions
# Phase 1: simple layout (boot + data). Phase 3 adds A/B system partitions.
# create-disk-image.sh — Create a raw disk image with A/B system partitions
#
# Partition layout (GPT):
# Part 1: EFI/Boot (256 MB, FAT32) — GRUB + grubenv + A/B boot logic
# Part 2: System A (512 MB, ext4) — vmlinuz + kubesolo-os.gz (active)
# Part 3: System B (512 MB, ext4) — vmlinuz + kubesolo-os.gz (passive)
# Part 4: Data (remaining, ext4) — persistent K8s state
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -11,93 +16,165 @@ VERSION="$(cat "$PROJECT_ROOT/VERSION")"
OS_NAME="kubesolo-os"
IMG_OUTPUT="$OUTPUT_DIR/${OS_NAME}-${VERSION}.img"
IMG_SIZE_MB="${IMG_SIZE_MB:-2048}" # 2 GB default
IMG_SIZE_MB="${IMG_SIZE_MB:-4096}" # 4 GB default (larger for A/B)
VMLINUZ="$ROOTFS_DIR/vmlinuz"
INITRAMFS="$ROOTFS_DIR/kubesolo-os.gz"
GRUB_CFG="$PROJECT_ROOT/build/grub/grub.cfg"
GRUB_ENV_DEFAULTS="$PROJECT_ROOT/build/grub/grub-env-defaults"
for f in "$VMLINUZ" "$INITRAMFS"; do
[ -f "$f" ] || { echo "ERROR: Missing $f — run 'make initramfs'"; exit 1; }
for f in "$VMLINUZ" "$INITRAMFS" "$GRUB_CFG" "$GRUB_ENV_DEFAULTS"; do
[ -f "$f" ] || { echo "ERROR: Missing $f"; exit 1; }
done
echo "==> Creating ${IMG_SIZE_MB}MB disk image..."
echo "==> Creating ${IMG_SIZE_MB}MB disk image with A/B partitions..."
mkdir -p "$OUTPUT_DIR"
# Create sparse image
dd if=/dev/zero of="$IMG_OUTPUT" bs=1M count=0 seek="$IMG_SIZE_MB" 2>/dev/null
# Partition: 256MB boot (ext4) + rest data (ext4)
# Using sfdisk for scriptability
# Partition (GPT):
# Part 1: 256 MB EFI System Partition (FAT32)
# Part 2: 512 MB System A (Linux filesystem)
# Part 3: 512 MB System B (Linux filesystem)
# Part 4: Remaining — Data (Linux filesystem)
sfdisk "$IMG_OUTPUT" << EOF
label: dos
unit: sectors
label: gpt
# Boot partition: 256 MB, bootable
start=2048, size=524288, type=83, bootable
# Data partition: remaining space
start=526336, type=83
# EFI/Boot partition: 256 MB
start=2048, size=524288, type=C12A7328-F81F-11D2-BA4B-00A0C93EC93B, name="EFI"
# System A partition: 512 MB
size=1048576, type=0FC63DAF-8483-4772-8E79-3D69D8477DE4, name="SystemA"
# System B partition: 512 MB
size=1048576, type=0FC63DAF-8483-4772-8E79-3D69D8477DE4, name="SystemB"
# Data partition: remaining
type=0FC63DAF-8483-4772-8E79-3D69D8477DE4, name="Data"
EOF
# Set up loop device
LOOP=$(losetup --show -fP "$IMG_OUTPUT")
echo "==> Loop device: $LOOP"
MNT_EFI=$(mktemp -d)
MNT_SYSA=$(mktemp -d)
MNT_SYSB=$(mktemp -d)
MNT_DATA=$(mktemp -d)
cleanup() {
umount "${LOOP}p1" 2>/dev/null || true
umount "${LOOP}p2" 2>/dev/null || true
umount "$MNT_EFI" 2>/dev/null || true
umount "$MNT_SYSA" 2>/dev/null || true
umount "$MNT_SYSB" 2>/dev/null || true
umount "$MNT_DATA" 2>/dev/null || true
losetup -d "$LOOP" 2>/dev/null || true
rm -rf "$MNT_BOOT" "$MNT_DATA" 2>/dev/null || true
rm -rf "$MNT_EFI" "$MNT_SYSA" "$MNT_SYSB" "$MNT_DATA" 2>/dev/null || true
}
trap cleanup EXIT
# Format partitions
mkfs.ext4 -q -L KSOLOBOOT "${LOOP}p1"
mkfs.ext4 -q -L KSOLODATA "${LOOP}p2"
mkfs.vfat -F 32 -n KSOLOEFI "${LOOP}p1"
mkfs.ext4 -q -L KSOLOA "${LOOP}p2"
mkfs.ext4 -q -L KSOLOB "${LOOP}p3"
mkfs.ext4 -q -L KSOLODATA "${LOOP}p4"
# Mount and populate boot partition
MNT_BOOT=$(mktemp -d)
MNT_DATA=$(mktemp -d)
# Mount all partitions
mount "${LOOP}p1" "$MNT_EFI"
mount "${LOOP}p2" "$MNT_SYSA"
mount "${LOOP}p3" "$MNT_SYSB"
mount "${LOOP}p4" "$MNT_DATA"
mount "${LOOP}p1" "$MNT_BOOT"
mount "${LOOP}p2" "$MNT_DATA"
# --- EFI/Boot Partition ---
echo " Installing GRUB..."
mkdir -p "$MNT_EFI/EFI/BOOT"
mkdir -p "$MNT_EFI/boot/grub"
# Install syslinux + kernel + initramfs to boot partition
mkdir -p "$MNT_BOOT/boot/syslinux"
cp "$VMLINUZ" "$MNT_BOOT/boot/vmlinuz"
cp "$INITRAMFS" "$MNT_BOOT/boot/kubesolo-os.gz"
# Copy GRUB config
cp "$GRUB_CFG" "$MNT_EFI/boot/grub/grub.cfg"
# Syslinux config for disk boot (extlinux)
cat > "$MNT_BOOT/boot/syslinux/syslinux.cfg" << 'EOF'
DEFAULT kubesolo
TIMEOUT 30
PROMPT 0
# Create GRUB environment file from defaults
if command -v grub-editenv >/dev/null 2>&1; then
GRUB_EDITENV=grub-editenv
elif command -v grub2-editenv >/dev/null 2>&1; then
GRUB_EDITENV=grub2-editenv
else
GRUB_EDITENV=""
fi
LABEL kubesolo
KERNEL /boot/vmlinuz
INITRD /boot/kubesolo-os.gz
APPEND quiet kubesolo.data=LABEL=KSOLODATA
GRUBENV_FILE="$MNT_EFI/boot/grub/grubenv"
LABEL kubesolo-debug
KERNEL /boot/vmlinuz
INITRD /boot/kubesolo-os.gz
APPEND kubesolo.data=LABEL=KSOLODATA kubesolo.debug console=ttyS0,115200n8
#######################################
# Hand-write a GRUB environment block when grub-editenv is unavailable.
# The block is the signature line, one key=value line per default, and '#'
# padding up to exactly 1024 bytes. The padding must be '#' characters:
# NUL padding (as produced by `truncate`) gives a block of the right size
# that GRUB cannot rewrite with save_env.
# Arguments:
#   $1 - defaults file (key=value lines; '#' comments and blank lines skipped)
#   $2 - path of the grubenv file to create
# Returns:
#   0 on success; 1 if the arguments are unusable or the variables do not
#   fit into the 1024-byte block.
#######################################
write_grubenv_manual() {
    local defaults_file=${1:-}
    local out_file=${2:-}
    local -r block_size=1024
    local body key value body_len pad_len padding

    if [ ! -f "$defaults_file" ] || [ -z "$out_file" ]; then
        echo "ERROR: write_grubenv_manual: need a defaults file and an output path" >&2
        return 1
    fi

    body=$'# GRUB Environment Block\n'
    # `|| [ -n "$key" ]` keeps a final line that lacks a trailing newline
    while IFS='=' read -r key value || [ -n "$key" ]; do
        case "$key" in
            '#'*|'') continue ;;
        esac
        body+="$key=$value"$'\n'
    done < "$defaults_file"

    # Count bytes (not characters) so the block size is exact in any locale
    body_len=$(printf '%s' "$body" | wc -c)
    pad_len=$(( block_size - body_len ))
    if [ "$pad_len" -lt 0 ]; then
        echo "ERROR: GRUB environment exceeds $block_size bytes" >&2
        return 1
    fi

    # Build pad_len spaces, then turn every space into '#'
    printf -v padding '%*s' "$pad_len" ''
    printf '%s%s' "$body" "${padding// /#}" > "$out_file.tmp"
    mv "$out_file.tmp" "$out_file"
}

if [ -n "$GRUB_EDITENV" ]; then
    # Create grubenv with defaults
    "$GRUB_EDITENV" "$GRUBENV_FILE" create
    while IFS='=' read -r key value || [ -n "$key" ]; do
        # Skip comments and empty lines
        case "$key" in
            '#'*|'') continue ;;
        esac
        "$GRUB_EDITENV" "$GRUBENV_FILE" set "$key=$value"
    done < "$GRUB_ENV_DEFAULTS"
    echo " GRUB environment created with grub-editenv"
else
    # Fallback: write grubenv file manually (1024 bytes, padded with '#')
    echo " WARN: grub-editenv not found — writing grubenv manually"
    write_grubenv_manual "$GRUB_ENV_DEFAULTS" "$GRUBENV_FILE"
fi
LABEL kubesolo-shell
KERNEL /boot/vmlinuz
INITRD /boot/kubesolo-os.gz
APPEND kubesolo.shell console=ttyS0,115200n8
EOF
# Build the standalone GRUB EFI image. Distros ship the tool either as
# grub-mkimage or grub2-mkimage; use whichever is found first. A failed or
# missing tool only produces a warning — the image is still created, it just
# is not directly EFI-bootable.
mkimage_bin=""
for mkimage_candidate in grub-mkimage grub2-mkimage; do
    if command -v "$mkimage_candidate" >/dev/null 2>&1; then
        mkimage_bin="$mkimage_candidate"
        break
    fi
done

if [ -z "$mkimage_bin" ]; then
    echo " WARN: grub-mkimage not found — EFI boot image not created"
    echo " Install grub2-tools or use QEMU -kernel/-initrd flags"
else
    # Modules embedded into bootx64.efi
    efi_modules=(
        part_gpt ext2 fat normal linux echo all_video test search
        search_fs_uuid search_label configfile loadenv
    )
    "$mkimage_bin" -O x86_64-efi -o "$MNT_EFI/EFI/BOOT/bootx64.efi" \
        -p /boot/grub \
        "${efi_modules[@]}" \
        2>/dev/null || echo " WARN: $mkimage_bin failed — use QEMU -bios flag"
fi
# Install extlinux bootloader
if command -v extlinux >/dev/null 2>&1; then
extlinux --install "$MNT_BOOT/boot/syslinux" 2>/dev/null || {
echo "WARN: extlinux install failed — image may not be directly bootable"
echo " Use with QEMU -kernel/-initrd flags instead"
# For BIOS boot: install GRUB i386-pc modules if available
if command -v grub-install >/dev/null 2>&1; then
grub-install --target=i386-pc --boot-directory="$MNT_EFI/boot" \
--no-floppy "$LOOP" 2>/dev/null || {
echo " WARN: BIOS GRUB install failed — EFI-only or use QEMU -kernel"
}
elif command -v grub2-install >/dev/null 2>&1; then
grub2-install --target=i386-pc --boot-directory="$MNT_EFI/boot" \
--no-floppy "$LOOP" 2>/dev/null || {
echo " WARN: BIOS GRUB install failed — EFI-only or use QEMU -kernel"
}
fi
# Prepare data partition structure
for dir in kubesolo containerd etc-kubesolo log usr-local network; do
# --- System A Partition (active) ---
echo " Populating System A (active)..."
cp "$VMLINUZ" "$MNT_SYSA/vmlinuz"
cp "$INITRAMFS" "$MNT_SYSA/kubesolo-os.gz"
echo "$VERSION" > "$MNT_SYSA/version"
# --- System B Partition (passive, initially same as A) ---
echo " Populating System B (passive)..."
cp "$VMLINUZ" "$MNT_SYSB/vmlinuz"
cp "$INITRAMFS" "$MNT_SYSB/kubesolo-os.gz"
echo "$VERSION" > "$MNT_SYSB/version"
# --- Data Partition ---
echo " Preparing data partition..."
for dir in kubesolo containerd etc-kubesolo log usr-local network images; do
mkdir -p "$MNT_DATA/$dir"
done
@@ -106,5 +183,8 @@ sync
echo ""
echo "==> Disk image created: $IMG_OUTPUT"
echo " Size: $(du -h "$IMG_OUTPUT" | cut -f1)"
echo " Boot partition (KSOLOBOOT): kernel + initramfs"
echo " Data partition (KSOLODATA): persistent K8s state"
echo " Part 1 (KSOLOEFI): GRUB + A/B boot config"
echo " Part 2 (KSOLOA): System A — kernel + initramfs (active)"
echo " Part 3 (KSOLOB): System B — kernel + initramfs (passive)"
echo " Part 4 (KSOLODATA): Persistent K8s state"
echo ""

View File

@@ -73,6 +73,16 @@ else
echo " WARN: Cloud-init binary not found (run 'make build-cloudinit' to build)"
fi
# Update agent binary (Go, built separately).
# Optional component: when the prebuilt binary is absent from the cache the
# step is skipped with a warning rather than failing.
UPDATE_BIN="$CACHE_DIR/kubesolo-update"
if [ ! -f "$UPDATE_BIN" ]; then
    echo " WARN: Update agent not found (run 'make build-update-agent' to build)"
else
    update_agent_dest="$ROOTFS/usr/lib/kubesolo-os/kubesolo-update"
    cp "$UPDATE_BIN" "$update_agent_dest"
    chmod +x "$update_agent_dest"
    echo " Installed update agent ($(du -h "$UPDATE_BIN" | cut -f1))"
fi
# --- 3. Kernel modules list ---
cp "$PROJECT_ROOT/build/config/modules.list" "$ROOTFS/usr/lib/kubesolo-os/modules.list"