feat: add A/B partition updates with GRUB and Go update agent (Phase 3)

Implement atomic OS updates via A/B partition scheme with automatic
rollback. GRUB bootloader manages slot selection with a 3-attempt
boot counter that auto-rolls back on repeated health check failures.

GRUB boot config:
- A/B slot selection with boot_counter/boot_success env vars
- Automatic rollback when counter reaches 0 (3 failed boots)
- Debug, emergency shell, and manual slot-switch menu entries

Disk image (refactored):
- 4-partition GPT layout: EFI + System A + System B + Data
- GRUB EFI/BIOS installation with graceful fallbacks
- Both system partitions populated during image creation

Update agent (Go, zero external deps):
- pkg/grubenv: read/write GRUB env vars (grub-editenv + manual fallback)
- pkg/partition: find/mount/write system partitions by label
- pkg/image: HTTP download with SHA256 verification
- pkg/health: post-boot checks (containerd, API server, node Ready)
- 6 CLI commands: check, apply, activate, rollback, healthcheck, status
- 37 unit tests across all 4 packages

Deployment:
- K8s CronJob for automatic update checks (every 6 hours)
- ConfigMap for update server URL
- Health check Job for post-boot verification

Build pipeline:
- build-update-agent.sh compiles static Linux binary (~5.9 MB)
- inject-kubesolo.sh includes update agent in initramfs
- Makefile: build-update-agent, test-update-agent, test-update targets

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-11 11:12:46 -06:00
parent d900fa920e
commit 8d25e1890e
25 changed files with 2807 additions and 74 deletions

View File

@@ -1,6 +1,6 @@
.PHONY: all fetch build-cloudinit rootfs initramfs iso disk-image \
.PHONY: all fetch build-cloudinit build-update-agent rootfs initramfs iso disk-image \
test-boot test-k8s test-persistence test-deploy test-storage test-all \
test-cloudinit \
test-cloudinit test-update-agent \
dev-vm dev-vm-shell quick docker-build shellcheck \
kernel-audit clean distclean help
@@ -32,7 +32,11 @@ build-cloudinit:
@echo "==> Building cloud-init binary..."
$(BUILD_DIR)/scripts/build-cloudinit.sh
rootfs: fetch build-cloudinit
build-update-agent:
@echo "==> Building update agent..."
$(BUILD_DIR)/scripts/build-update-agent.sh
rootfs: fetch build-cloudinit build-update-agent
@echo "==> Preparing rootfs..."
$(BUILD_DIR)/scripts/extract-core.sh
$(BUILD_DIR)/scripts/inject-kubesolo.sh
@@ -88,6 +92,20 @@ test-cloudinit:
@echo "==> Testing cloud-init parser..."
cd cloud-init && go test ./... -v -count=1
# Update agent Go tests
test-update-agent:
@echo "==> Testing update agent..."
cd update && go test ./... -v -count=1
# A/B update integration tests
test-update: disk-image
@echo "==> Testing A/B update cycle..."
test/qemu/test-update.sh $(OUTPUT_DIR)/$(OS_NAME)-$(VERSION).img
test-rollback: disk-image
@echo "==> Testing rollback..."
test/qemu/test-rollback.sh $(OUTPUT_DIR)/$(OS_NAME)-$(VERSION).img
# Full integration test suite (requires more time)
test-integration: test-k8s test-deploy test-storage
@@ -159,10 +177,11 @@ help:
@echo "Build targets:"
@echo " make fetch Download Tiny Core ISO, KubeSolo, dependencies"
@echo " make build-cloudinit Build cloud-init Go binary"
@echo " make build-update-agent Build update agent Go binary"
@echo " make rootfs Extract + prepare rootfs with KubeSolo"
@echo " make initramfs Repack rootfs into kubesolo-os.gz"
@echo " make iso Create bootable ISO (default target)"
@echo " make disk-image Create raw disk image with boot + data partitions"
@echo " make disk-image Create raw disk image with A/B partitions + GRUB"
@echo " make quick Fast rebuild (re-inject + repack + ISO only)"
@echo " make docker-build Reproducible build inside Docker"
@echo ""
@@ -173,6 +192,9 @@ help:
@echo " make test-deploy Deploy nginx pod, verify Running"
@echo " make test-storage Test PVC with local-path provisioner"
@echo " make test-cloudinit Run cloud-init Go unit tests"
@echo " make test-update-agent Run update agent Go unit tests"
@echo " make test-update A/B update cycle integration test"
@echo " make test-rollback Forced rollback integration test"
@echo " make test-all Run core tests (boot + k8s + persistence)"
@echo " make test-integ Run full integration suite"
@echo ""

View File

@@ -0,0 +1,11 @@
# KubeSolo OS — Default GRUB Environment Variables
# These are written to grubenv on first install.
# Format: key=value (one per line, grub-editenv compatible)
#
# active_slot: Which system partition to boot (A or B)
# boot_counter: Attempts remaining before rollback (3 = fresh, 0 = rollback)
# boot_success: Set to 1 by health check after successful boot
active_slot=A
boot_counter=3
boot_success=1

95
build/grub/grub.cfg Normal file
View File

@@ -0,0 +1,95 @@
# KubeSolo OS — GRUB Configuration
# A/B partition boot with automatic rollback
#
# Partition layout:
# (hd0,gpt1) — EFI/Boot (256 MB, FAT32) — contains GRUB + grubenv
# (hd0,gpt2) — System A (512 MB, ext4) — vmlinuz + kubesolo-os.gz
# (hd0,gpt3) — System B (512 MB, ext4) — vmlinuz + kubesolo-os.gz
# (hd0,gpt4) — Data (remaining, ext4) — persistent K8s state
#
# Environment variables (in grubenv):
# active_slot — "A" or "B" (which partition to boot)
# boot_counter — 3→2→1→0 (decremented on each failed boot)
# boot_success — 0 or 1 (set to 1 by health check post-boot)

# Boot the first menu entry automatically after 3 seconds.
set default=0
set timeout=3

# Load saved environment
# NOTE(review): load_env reads grubenv from GRUB's prefix directory —
# confirm the image build places grubenv where this GRUB was configured
# to look (e.g. /boot/grub on the EFI partition).
load_env

# --- A/B Rollback Logic ---
# On every boot, check if the last boot was successful.
# If not, decrement the counter. If counter hits 0, swap slots.
# This runs BEFORE the OS starts, so the counter reflects how many boot
# attempts remain for the currently active slot.
if [ "${boot_success}" != "1" ]; then
  # Last boot failed — check counter
  if [ "${boot_counter}" = "0" ]; then
    # Counter exhausted — rollback to other slot
    if [ "${active_slot}" = "A" ]; then
      set active_slot=B
    else
      set active_slot=A
    fi
    save_env active_slot
    # Give the restored slot a fresh 3 attempts of its own.
    set boot_counter=3
    save_env boot_counter
  else
    # Decrement counter (GRUB doesn't have arithmetic)
    if [ "${boot_counter}" = "3" ]; then
      set boot_counter=2
    elif [ "${boot_counter}" = "2" ]; then
      set boot_counter=1
    elif [ "${boot_counter}" = "1" ]; then
      set boot_counter=0
    fi
    save_env boot_counter
  fi
fi

# Reset boot_success for this boot attempt — health check must set it to 1
set boot_success=0
save_env boot_success

# --- Resolve boot partition ---
if [ "${active_slot}" = "A" ]; then
  set root='(hd0,gpt2)'
  set slot_label="System A"
else
  set root='(hd0,gpt3)'
  set slot_label="System B"
fi

# --- Menu Entries ---
# ${slot_label} in the titles is expanded when the menuentry commands
# execute, i.e. after the slot resolution above, so the menu always
# shows the slot that will actually be booted.
menuentry "KubeSolo OS (${slot_label})" {
  echo "Booting KubeSolo OS from ${slot_label}..."
  echo "Boot counter: ${boot_counter}, Boot success: ${boot_success}"
  linux /vmlinuz kubesolo.data=LABEL=KSOLODATA quiet
  initrd /kubesolo-os.gz
}

menuentry "KubeSolo OS (${slot_label}) — Debug Mode" {
  echo "Booting KubeSolo OS (debug) from ${slot_label}..."
  linux /vmlinuz kubesolo.data=LABEL=KSOLODATA kubesolo.debug console=ttyS0,115200n8
  initrd /kubesolo-os.gz
}

# Omits the kubesolo.data argument — the data partition is not handed
# to the initramfs in this mode.
menuentry "KubeSolo OS — Emergency Shell" {
  echo "Booting to emergency shell..."
  linux /vmlinuz kubesolo.shell console=ttyS0,115200n8
  initrd /kubesolo-os.gz
}

menuentry "KubeSolo OS — Boot Other Slot" {
  # Manually boot the passive slot (for testing)
  # Does not change active_slot in grubenv — a one-off boot only.
  if [ "${active_slot}" = "A" ]; then
    set root='(hd0,gpt3)'
    echo "Booting from System B (passive)..."
  else
    set root='(hd0,gpt2)'
    echo "Booting from System A (passive)..."
  fi
  linux /vmlinuz kubesolo.data=LABEL=KSOLODATA kubesolo.debug console=ttyS0,115200n8
  initrd /kubesolo-os.gz
}

View File

@@ -0,0 +1,29 @@
#!/bin/bash
# build-update-agent.sh — Compile the KubeSolo OS update agent
#
# Builds a static Linux binary for the update agent.
# Output: build/cache/kubesolo-update
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
UPDATE_DIR="$PROJECT_ROOT/update"
CACHE_DIR="$PROJECT_ROOT/build/cache"
OUTPUT="$CACHE_DIR/kubesolo-update"

echo "=== Building KubeSolo Update Agent ==="

# Fail early with a clear message if the Go toolchain is missing —
# otherwise the first `go test` below dies with a cryptic 127.
command -v go >/dev/null 2>&1 || {
    echo "ERROR: 'go' not found in PATH — install the Go toolchain"
    exit 1
}

# Ensure output dir exists
mkdir -p "$CACHE_DIR"

# Run tests first
echo "--- Running tests ---"
(cd "$UPDATE_DIR" && go test ./... -count=1)

# Build static binary (CGO disabled so the result has no libc dependency)
echo "--- Compiling static binary ---"
(cd "$UPDATE_DIR" && CGO_ENABLED=0 GOOS=linux GOARCH=amd64 \
    go build -ldflags='-s -w' -o "$OUTPUT" .)

# Report size via du — parsing `ls -l` output is fragile (ShellCheck
# SC2012) and this matches the du/cut style used by the other scripts.
SIZE=$(du -h "$OUTPUT" | cut -f1)
echo "--- Update agent built: $OUTPUT ($SIZE) ---"

View File

@@ -1,6 +1,11 @@
#!/bin/bash
# create-disk-image.sh — Create a raw disk image with boot + data partitions
# Phase 1: simple layout (boot + data). Phase 3 adds A/B system partitions.
# create-disk-image.sh — Create a raw disk image with A/B system partitions
#
# Partition layout (GPT):
# Part 1: EFI/Boot (256 MB, FAT32) — GRUB + grubenv + A/B boot logic
# Part 2: System A (512 MB, ext4) — vmlinuz + kubesolo-os.gz (active)
# Part 3: System B (512 MB, ext4) — vmlinuz + kubesolo-os.gz (passive)
# Part 4: Data (remaining, ext4) — persistent K8s state
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -11,93 +16,165 @@ VERSION="$(cat "$PROJECT_ROOT/VERSION")"
OS_NAME="kubesolo-os"
IMG_OUTPUT="$OUTPUT_DIR/${OS_NAME}-${VERSION}.img"
IMG_SIZE_MB="${IMG_SIZE_MB:-2048}" # 2 GB default
IMG_SIZE_MB="${IMG_SIZE_MB:-4096}" # 4 GB default (larger for A/B)
VMLINUZ="$ROOTFS_DIR/vmlinuz"
INITRAMFS="$ROOTFS_DIR/kubesolo-os.gz"
GRUB_CFG="$PROJECT_ROOT/build/grub/grub.cfg"
GRUB_ENV_DEFAULTS="$PROJECT_ROOT/build/grub/grub-env-defaults"
for f in "$VMLINUZ" "$INITRAMFS"; do
[ -f "$f" ] || { echo "ERROR: Missing $f — run 'make initramfs'"; exit 1; }
for f in "$VMLINUZ" "$INITRAMFS" "$GRUB_CFG" "$GRUB_ENV_DEFAULTS"; do
[ -f "$f" ] || { echo "ERROR: Missing $f"; exit 1; }
done
echo "==> Creating ${IMG_SIZE_MB}MB disk image..."
echo "==> Creating ${IMG_SIZE_MB}MB disk image with A/B partitions..."
mkdir -p "$OUTPUT_DIR"
# Create sparse image
dd if=/dev/zero of="$IMG_OUTPUT" bs=1M count=0 seek="$IMG_SIZE_MB" 2>/dev/null
# Partition: 256MB boot (ext4) + rest data (ext4)
# Using sfdisk for scriptability
# Partition (GPT):
# Part 1: 256 MB EFI System Partition (FAT32)
# Part 2: 512 MB System A (Linux filesystem)
# Part 3: 512 MB System B (Linux filesystem)
# Part 4: Remaining — Data (Linux filesystem)
sfdisk "$IMG_OUTPUT" << EOF
label: dos
unit: sectors
label: gpt
# Boot partition: 256 MB, bootable
start=2048, size=524288, type=83, bootable
# Data partition: remaining space
start=526336, type=83
# EFI/Boot partition: 256 MB
start=2048, size=524288, type=C12A7328-F81F-11D2-BA4B-00A0C93EC93B, name="EFI"
# System A partition: 512 MB
size=1048576, type=0FC63DAF-8483-4772-8E79-3D69D8477DE4, name="SystemA"
# System B partition: 512 MB
size=1048576, type=0FC63DAF-8483-4772-8E79-3D69D8477DE4, name="SystemB"
# Data partition: remaining
type=0FC63DAF-8483-4772-8E79-3D69D8477DE4, name="Data"
EOF
# Set up loop device
LOOP=$(losetup --show -fP "$IMG_OUTPUT")
echo "==> Loop device: $LOOP"
MNT_EFI=$(mktemp -d)
MNT_SYSA=$(mktemp -d)
MNT_SYSB=$(mktemp -d)
MNT_DATA=$(mktemp -d)
cleanup() {
umount "${LOOP}p1" 2>/dev/null || true
umount "${LOOP}p2" 2>/dev/null || true
umount "$MNT_EFI" 2>/dev/null || true
umount "$MNT_SYSA" 2>/dev/null || true
umount "$MNT_SYSB" 2>/dev/null || true
umount "$MNT_DATA" 2>/dev/null || true
losetup -d "$LOOP" 2>/dev/null || true
rm -rf "$MNT_BOOT" "$MNT_DATA" 2>/dev/null || true
rm -rf "$MNT_EFI" "$MNT_SYSA" "$MNT_SYSB" "$MNT_DATA" 2>/dev/null || true
}
trap cleanup EXIT
# Format partitions
mkfs.ext4 -q -L KSOLOBOOT "${LOOP}p1"
mkfs.ext4 -q -L KSOLODATA "${LOOP}p2"
mkfs.vfat -F 32 -n KSOLOEFI "${LOOP}p1"
mkfs.ext4 -q -L KSOLOA "${LOOP}p2"
mkfs.ext4 -q -L KSOLOB "${LOOP}p3"
mkfs.ext4 -q -L KSOLODATA "${LOOP}p4"
# Mount and populate boot partition
MNT_BOOT=$(mktemp -d)
MNT_DATA=$(mktemp -d)
# Mount all partitions
mount "${LOOP}p1" "$MNT_EFI"
mount "${LOOP}p2" "$MNT_SYSA"
mount "${LOOP}p3" "$MNT_SYSB"
mount "${LOOP}p4" "$MNT_DATA"
mount "${LOOP}p1" "$MNT_BOOT"
mount "${LOOP}p2" "$MNT_DATA"
# --- EFI/Boot Partition ---
echo " Installing GRUB..."
mkdir -p "$MNT_EFI/EFI/BOOT"
mkdir -p "$MNT_EFI/boot/grub"
# Install syslinux + kernel + initramfs to boot partition
mkdir -p "$MNT_BOOT/boot/syslinux"
cp "$VMLINUZ" "$MNT_BOOT/boot/vmlinuz"
cp "$INITRAMFS" "$MNT_BOOT/boot/kubesolo-os.gz"
# Copy GRUB config
cp "$GRUB_CFG" "$MNT_EFI/boot/grub/grub.cfg"
# Syslinux config for disk boot (extlinux)
cat > "$MNT_BOOT/boot/syslinux/syslinux.cfg" << 'EOF'
DEFAULT kubesolo
TIMEOUT 30
PROMPT 0
# Create GRUB environment file from defaults
if command -v grub-editenv >/dev/null 2>&1; then
GRUB_EDITENV=grub-editenv
elif command -v grub2-editenv >/dev/null 2>&1; then
GRUB_EDITENV=grub2-editenv
else
GRUB_EDITENV=""
fi
LABEL kubesolo
KERNEL /boot/vmlinuz
INITRD /boot/kubesolo-os.gz
APPEND quiet kubesolo.data=LABEL=KSOLODATA
GRUBENV_FILE="$MNT_EFI/boot/grub/grubenv"
LABEL kubesolo-debug
KERNEL /boot/vmlinuz
INITRD /boot/kubesolo-os.gz
APPEND kubesolo.data=LABEL=KSOLODATA kubesolo.debug console=ttyS0,115200n8
if [ -n "$GRUB_EDITENV" ]; then
# Create grubenv with defaults
"$GRUB_EDITENV" "$GRUBENV_FILE" create
while IFS='=' read -r key value; do
# Skip comments and empty lines
case "$key" in
'#'*|'') continue ;;
esac
"$GRUB_EDITENV" "$GRUBENV_FILE" set "$key=$value"
done < "$GRUB_ENV_DEFAULTS"
echo " GRUB environment created with grub-editenv"
else
# Fallback: write grubenv file manually (1024 bytes, padded with '#')
echo " WARN: grub-editenv not found — writing grubenv manually"
{
echo "# GRUB Environment Block"
while IFS='=' read -r key value; do
case "$key" in
'#'*|'') continue ;;
esac
echo "$key=$value"
done < "$GRUB_ENV_DEFAULTS"
} > "$GRUBENV_FILE.tmp"
# Pad to 1024 bytes (GRUB requirement)
truncate -s 1024 "$GRUBENV_FILE.tmp"
mv "$GRUBENV_FILE.tmp" "$GRUBENV_FILE"
fi
LABEL kubesolo-shell
KERNEL /boot/vmlinuz
INITRD /boot/kubesolo-os.gz
APPEND kubesolo.shell console=ttyS0,115200n8
EOF
# Install GRUB EFI binary if available
if command -v grub-mkimage >/dev/null 2>&1; then
grub-mkimage -O x86_64-efi -o "$MNT_EFI/EFI/BOOT/bootx64.efi" \
-p /boot/grub \
part_gpt ext2 fat normal linux echo all_video test search \
search_fs_uuid search_label configfile loadenv \
2>/dev/null || echo " WARN: grub-mkimage failed — use QEMU -bios flag"
elif command -v grub2-mkimage >/dev/null 2>&1; then
grub2-mkimage -O x86_64-efi -o "$MNT_EFI/EFI/BOOT/bootx64.efi" \
-p /boot/grub \
part_gpt ext2 fat normal linux echo all_video test search \
search_fs_uuid search_label configfile loadenv \
2>/dev/null || echo " WARN: grub2-mkimage failed — use QEMU -bios flag"
else
echo " WARN: grub-mkimage not found — EFI boot image not created"
echo " Install grub2-tools or use QEMU -kernel/-initrd flags"
fi
# Install extlinux bootloader
if command -v extlinux >/dev/null 2>&1; then
extlinux --install "$MNT_BOOT/boot/syslinux" 2>/dev/null || {
echo "WARN: extlinux install failed — image may not be directly bootable"
echo " Use with QEMU -kernel/-initrd flags instead"
# For BIOS boot: install GRUB i386-pc modules if available
if command -v grub-install >/dev/null 2>&1; then
grub-install --target=i386-pc --boot-directory="$MNT_EFI/boot" \
--no-floppy "$LOOP" 2>/dev/null || {
echo " WARN: BIOS GRUB install failed — EFI-only or use QEMU -kernel"
}
elif command -v grub2-install >/dev/null 2>&1; then
grub2-install --target=i386-pc --boot-directory="$MNT_EFI/boot" \
--no-floppy "$LOOP" 2>/dev/null || {
echo " WARN: BIOS GRUB install failed — EFI-only or use QEMU -kernel"
}
fi
# Prepare data partition structure
for dir in kubesolo containerd etc-kubesolo log usr-local network; do
# --- System A Partition (active) ---
echo " Populating System A (active)..."
cp "$VMLINUZ" "$MNT_SYSA/vmlinuz"
cp "$INITRAMFS" "$MNT_SYSA/kubesolo-os.gz"
echo "$VERSION" > "$MNT_SYSA/version"
# --- System B Partition (passive, initially same as A) ---
echo " Populating System B (passive)..."
cp "$VMLINUZ" "$MNT_SYSB/vmlinuz"
cp "$INITRAMFS" "$MNT_SYSB/kubesolo-os.gz"
echo "$VERSION" > "$MNT_SYSB/version"
# --- Data Partition ---
echo " Preparing data partition..."
for dir in kubesolo containerd etc-kubesolo log usr-local network images; do
mkdir -p "$MNT_DATA/$dir"
done
@@ -106,5 +183,8 @@ sync
echo ""
echo "==> Disk image created: $IMG_OUTPUT"
echo " Size: $(du -h "$IMG_OUTPUT" | cut -f1)"
echo " Boot partition (KSOLOBOOT): kernel + initramfs"
echo " Data partition (KSOLODATA): persistent K8s state"
echo " Part 1 (KSOLOEFI): GRUB + A/B boot config"
echo " Part 2 (KSOLOA): System A — kernel + initramfs (active)"
echo " Part 3 (KSOLOB): System B — kernel + initramfs (passive)"
echo " Part 4 (KSOLODATA): Persistent K8s state"
echo ""

View File

@@ -73,6 +73,16 @@ else
echo " WARN: Cloud-init binary not found (run 'make build-cloudinit' to build)"
fi
# Update agent binary (Go, built separately)
UPDATE_BIN="$CACHE_DIR/kubesolo-update"
if [ -f "$UPDATE_BIN" ]; then
cp "$UPDATE_BIN" "$ROOTFS/usr/lib/kubesolo-os/kubesolo-update"
chmod +x "$ROOTFS/usr/lib/kubesolo-os/kubesolo-update"
echo " Installed update agent ($(du -h "$UPDATE_BIN" | cut -f1))"
else
echo " WARN: Update agent not found (run 'make build-update-agent' to build)"
fi
# --- 3. Kernel modules list ---
cp "$PROJECT_ROOT/build/config/modules.list" "$ROOTFS/usr/lib/kubesolo-os/modules.list"

261
docs/update-flow.md Normal file
View File

@@ -0,0 +1,261 @@
# KubeSolo OS — Atomic Update Flow
This document describes the A/B partition update mechanism used by KubeSolo OS for safe, atomic OS updates with automatic rollback.
## Partition Layout
KubeSolo OS uses a 4-partition GPT layout:
```
Disk (minimum 4 GB):
Part 1: EFI/Boot (256 MB, FAT32, label: KSOLOEFI) — GRUB + boot config
Part 2: System A (512 MB, ext4, label: KSOLOA) — vmlinuz + kubesolo-os.gz
Part 3: System B (512 MB, ext4, label: KSOLOB) — vmlinuz + kubesolo-os.gz
Part 4: Data (remaining, ext4, label: KSOLODATA) — persistent K8s state
```
Only one system partition is active at a time. The other is the "passive" slot used for staging updates.
## GRUB Environment Variables
The A/B boot logic is controlled by three GRUB environment variables stored in `/boot/grub/grubenv`:
| Variable | Values | Description |
|---|---|---|
| `active_slot` | `A` or `B` | Which system partition to boot |
| `boot_counter` | `3`–`0` | Attempts remaining before rollback |
| `boot_success` | `0` or `1` | Whether the current boot has been verified healthy |
## Boot Flow
```
┌──────────────┐
│ GRUB starts │
└──────┬───────┘
┌──────▼───────┐
│ Load grubenv │
└──────┬───────┘
┌─────────▼─────────┐
│ boot_success == 1? │
└────┬──────────┬───┘
yes│ │no
│ ┌─────▼──────────┐
│ │ boot_counter=0? │
│ └──┬──────────┬──┘
│ no │ │ yes
│ │ ┌─────▼──────────┐
│ │ │ SWAP active_slot│
│ │ │ Reset counter=3 │
│ │ └─────┬───────────┘
│ │ │
┌────▼───────▼──────────▼────┐
│ Set boot_success=0 │
│ Decrement boot_counter │
│ Boot active_slot partition │
└────────────┬───────────────┘
┌─────────▼─────────┐
│ System boots... │
└─────────┬─────────┘
┌─────────▼─────────────┐
│ Health check runs │
│ (containerd, API, │
│ node Ready) │
└─────┬──────────┬──────┘
pass│ │fail
┌─────▼─────┐ │
│ Mark boot │ │ boot_success stays 0
│ success=1 │ │ counter decremented
│ counter=3 │ │ on next reboot
└───────────┘ └──────────────────────
```
### Rollback Behavior
The boot counter starts at 3 and decrements on each boot where `boot_success` remains 0:
1. **Boot 1**: counter 3 → 2 (health check fails → reboot)
2. **Boot 2**: counter 2 → 1 (health check fails → reboot)
3. **Boot 3**: counter 1 → 0 (health check fails → reboot)
4. **Boot 4**: counter = 0, GRUB swaps `active_slot` and resets counter to 3
This provides **3 chances** for the new version to pass health checks before automatic rollback to the previous version.
## Update Agent Commands
The `kubesolo-update` binary provides 6 subcommands:
### `check` — Check for Updates
Queries the update server and compares against the current running version.
```bash
kubesolo-update check --server https://updates.example.com
```
Output:
```
Current version: 1.0.0 (slot A)
Latest version: 1.1.0
Status: update available
```
### `apply` — Download and Write Update
Downloads the new OS image (vmlinuz + initramfs) from the update server, verifies SHA256 checksums, and writes to the passive partition.
```bash
kubesolo-update apply --server https://updates.example.com
```
This does NOT activate the new partition or trigger a reboot.
### `activate` — Set Next Boot Target
Switches the GRUB boot target to the passive partition (the one with the new image) and sets `boot_counter=3`.
```bash
kubesolo-update activate
```
After activation, reboot to boot into the new version:
```bash
reboot
```
### `rollback` — Force Rollback
Manually switches to the other partition, regardless of health check status.
```bash
kubesolo-update rollback
reboot
```
### `healthcheck` — Post-Boot Health Verification
Runs after every boot to verify the system is healthy. If all checks pass, marks `boot_success=1` in GRUB to prevent rollback.
Checks performed:
1. **containerd**: Socket exists and `ctr version` responds
2. **API server**: TCP connection to 127.0.0.1:6443 and `/healthz` endpoint
3. **Node Ready**: `kubectl get nodes` shows Ready status
```bash
kubesolo-update healthcheck --timeout 120
```
### `status` — Show A/B Slot Status
Displays the current partition state:
```bash
kubesolo-update status
```
Output:
```
KubeSolo OS — A/B Partition Status
───────────────────────────────────
Active slot: A
Passive slot: B
Boot counter: 3
Boot success: 1
✓ System is healthy (boot confirmed)
```
## Update Server Protocol
The update server is a simple HTTP(S) file server that serves:
```
/latest.json — Update metadata
/vmlinuz-<version> — Linux kernel
/kubesolo-os-<version>.gz — Initramfs
```
### `latest.json` Format
```json
{
"version": "1.1.0",
"vmlinuz_url": "https://updates.example.com/vmlinuz-1.1.0",
"vmlinuz_sha256": "abc123...",
"initramfs_url": "https://updates.example.com/kubesolo-os-1.1.0.gz",
"initramfs_sha256": "def456...",
"release_notes": "Bug fixes and performance improvements",
"release_date": "2025-01-15"
}
```
Any static file server (nginx, S3, GitHub Releases) can serve as an update server.
## Automated Updates via CronJob
KubeSolo OS includes a Kubernetes CronJob for automatic update checking:
```bash
# Deploy the update CronJob
kubectl apply -f /usr/lib/kubesolo-os/update-cronjob.yaml
# Configure the update server URL
kubectl -n kube-system create configmap kubesolo-update-config \
--from-literal=server-url=https://updates.example.com
# Manually trigger an update check
kubectl create job --from=cronjob/kubesolo-update kubesolo-update-manual -n kube-system
```
The CronJob runs every 6 hours and performs `apply` (download + write). It does NOT reboot — the administrator controls when to reboot.
## Complete Update Cycle
A full update cycle looks like:
```bash
# 1. Check if update is available
kubesolo-update check --server https://updates.example.com
# 2. Download and write to passive partition
kubesolo-update apply --server https://updates.example.com
# 3. Activate the new partition
kubesolo-update activate
# 4. Reboot into the new version
reboot
# 5. (Automatic) Health check runs, marks boot successful
# kubesolo-update healthcheck is run by init system
# 6. Verify status
kubesolo-update status
```
If the health check fails 3 times, GRUB automatically rolls back to the previous version on the next reboot.
## Command-Line Options
All subcommands accept these options:
| Option | Default | Description |
|---|---|---|
| `--server URL` | (none) | Update server URL |
| `--grubenv PATH` | `/boot/grub/grubenv` | Path to GRUB environment file |
| `--timeout SECS` | `120` | Health check timeout in seconds |
## File Locations
| File | Description |
|---|---|
| `/usr/lib/kubesolo-os/kubesolo-update` | Update agent binary |
| `/boot/grub/grubenv` | GRUB environment (on EFI partition) |
| `/boot/grub/grub.cfg` | GRUB boot config with A/B logic |
| `<system-partition>/vmlinuz` | Linux kernel |
| `<system-partition>/kubesolo-os.gz` | Initramfs |
| `<system-partition>/version` | Version string |

40
update/cmd/activate.go Normal file
View File

@@ -0,0 +1,40 @@
package cmd
import (
"fmt"
"log/slog"
"github.com/portainer/kubesolo-os/update/pkg/grubenv"
)
// Activate switches the boot target to the passive partition.
// After activation, the next reboot will boot from the new partition
// with boot_counter=3. If health checks fail 3 times, GRUB auto-rolls back.
func Activate(args []string) error {
	o := parseOpts(args)
	env := grubenv.New(o.GrubenvPath)

	// The passive slot (where the update was staged) becomes the boot
	// target; the current slot is read so we can report the transition.
	target, err := env.PassiveSlot()
	if err != nil {
		return fmt.Errorf("reading passive slot: %w", err)
	}
	previous, err := env.ActiveSlot()
	if err != nil {
		return fmt.Errorf("reading active slot: %w", err)
	}

	slog.Info("activating slot", "from", previous, "to", target)

	// Flip the slot and arm a fresh boot counter in the GRUB environment.
	if err := env.ActivateSlot(target); err != nil {
		return fmt.Errorf("activating slot %s: %w", target, err)
	}

	fmt.Printf("Slot %s activated (was %s)\n", target, previous)
	fmt.Println("Boot counter set to 3. Reboot to start the new version.")
	fmt.Println("The system will automatically roll back if health checks fail 3 times.")
	return nil
}

70
update/cmd/apply.go Normal file
View File

@@ -0,0 +1,70 @@
package cmd
import (
"fmt"
"log/slog"
"github.com/portainer/kubesolo-os/update/pkg/grubenv"
"github.com/portainer/kubesolo-os/update/pkg/image"
"github.com/portainer/kubesolo-os/update/pkg/partition"
)
// Apply downloads a new OS image and writes it to the passive partition.
// It does NOT activate the new partition — use 'activate' for that.
func Apply(args []string) error {
	o := parseOpts(args)
	if o.ServerURL == "" {
		return fmt.Errorf("--server is required")
	}
	env := grubenv.New(o.GrubenvPath)

	// Updates are always staged onto the slot we are NOT running from.
	target, err := env.PassiveSlot()
	if err != nil {
		return fmt.Errorf("reading passive slot: %w", err)
	}
	slog.Info("applying update", "target_slot", target)

	// Query the update server; downloads go to a staging directory that
	// Cleanup releases when we return.
	const stageDir = "/tmp/kubesolo-update-stage"
	client := image.NewClient(o.ServerURL, stageDir)
	defer client.Cleanup()

	meta, err := client.CheckForUpdate()
	if err != nil {
		return fmt.Errorf("checking for update: %w", err)
	}
	slog.Info("update available", "version", meta.Version)

	// Download the new kernel + initramfs into the staging area.
	staged, err := client.Download(meta)
	if err != nil {
		return fmt.Errorf("downloading update: %w", err)
	}

	// Mount the passive slot read-write, copy the image in, then unmount.
	part, err := partition.GetSlotPartition(target)
	if err != nil {
		return fmt.Errorf("finding passive partition: %w", err)
	}
	mnt := "/tmp/kubesolo-passive-" + target
	if err := partition.MountReadWrite(part.Device, mnt); err != nil {
		return fmt.Errorf("mounting passive partition: %w", err)
	}
	defer partition.Unmount(mnt)

	if err := partition.WriteSystemImage(mnt, staged.VmlinuzPath, staged.InitramfsPath, staged.Version); err != nil {
		return fmt.Errorf("writing system image: %w", err)
	}

	fmt.Printf("Update v%s written to slot %s (%s)\n", staged.Version, target, part.Device)
	fmt.Println("Run 'kubesolo-update activate' to boot into the new version")
	return nil
}

65
update/cmd/check.go Normal file
View File

@@ -0,0 +1,65 @@
package cmd
import (
"fmt"
"log/slog"
"github.com/portainer/kubesolo-os/update/pkg/grubenv"
"github.com/portainer/kubesolo-os/update/pkg/image"
"github.com/portainer/kubesolo-os/update/pkg/partition"
)
// Check queries the update server for available updates and compares
// against the currently running version.
func Check(args []string) error {
	o := parseOpts(args)
	if o.ServerURL == "" {
		return fmt.Errorf("--server is required (no default update server configured)")
	}

	// Resolve the active slot and read its version file: mount the
	// partition read-only, read, and unmount on return.
	env := grubenv.New(o.GrubenvPath)
	active, err := env.ActiveSlot()
	if err != nil {
		return fmt.Errorf("reading active slot: %w", err)
	}
	part, err := partition.GetSlotPartition(active)
	if err != nil {
		return fmt.Errorf("finding active partition: %w", err)
	}
	mnt := "/tmp/kubesolo-check-" + active
	if err := partition.MountReadOnly(part.Device, mnt); err != nil {
		return fmt.Errorf("mounting active partition: %w", err)
	}
	defer partition.Unmount(mnt)

	current, err := partition.ReadVersion(mnt)
	if err != nil {
		// Not fatal — still worth showing what the server offers.
		slog.Warn("could not read current version", "error", err)
		current = "unknown"
	}

	// Metadata-only query: no staging directory is needed for check.
	meta, err := image.NewClient(o.ServerURL, "").CheckForUpdate()
	if err != nil {
		return fmt.Errorf("checking for update: %w", err)
	}

	fmt.Printf("Current version: %s (slot %s)\n", current, active)
	fmt.Printf("Latest version: %s\n", meta.Version)
	if meta.Version == current {
		fmt.Println("Status: up to date")
	} else {
		fmt.Println("Status: update available")
		if meta.ReleaseNotes != "" {
			fmt.Printf("Release notes: %s\n", meta.ReleaseNotes)
		}
	}
	return nil
}

56
update/cmd/healthcheck.go Normal file
View File

@@ -0,0 +1,56 @@
package cmd
import (
"fmt"
"log/slog"
"time"
"github.com/portainer/kubesolo-os/update/pkg/grubenv"
"github.com/portainer/kubesolo-os/update/pkg/health"
)
// Healthcheck performs post-boot health verification.
// If all checks pass, it marks the boot as successful in GRUB.
// This should be run after every boot (typically via a systemd unit or
// init script) to confirm the system is healthy.
//
// Returns nil when the boot is (or was already) marked successful;
// returns the checker's error when verification fails, leaving
// boot_success unset so GRUB's counter keeps ticking toward rollback.
func Healthcheck(args []string) error {
	opts := parseOpts(args)
	env := grubenv.New(opts.GrubenvPath)
	// Check if already marked successful
	success, err := env.BootSuccess()
	if err != nil {
		// Non-fatal: an unreadable flag just means we run the checks anyway.
		slog.Warn("could not read boot_success", "error", err)
	}
	if success {
		// Idempotent re-run (e.g. the init script fired twice) — nothing to do.
		fmt.Println("Boot already marked successful")
		return nil
	}
	// --timeout (default 120 s) bounds the whole wait-for-healthy loop.
	timeout := time.Duration(opts.TimeoutSecs) * time.Second
	checker := health.NewChecker("", "", timeout)
	slog.Info("running post-boot health checks", "timeout", timeout)
	status, err := checker.WaitForHealthy()
	if err != nil {
		// NOTE(review): this assumes WaitForHealthy returns a non-nil
		// status alongside the error — confirm in pkg/health, otherwise
		// status.Message below can panic.
		fmt.Printf("Health check FAILED: %s\n", status.Message)
		fmt.Printf(" containerd: %v\n", status.Containerd)
		fmt.Printf(" apiserver: %v\n", status.APIServer)
		fmt.Printf(" node_ready: %v\n", status.NodeReady)
		fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot")
		return err
	}
	// Mark boot as successful so GRUB stops decrementing the counter.
	if err := env.MarkBootSuccess(); err != nil {
		return fmt.Errorf("marking boot success: %w", err)
	}
	fmt.Println("Health check PASSED — boot marked successful")
	fmt.Printf(" containerd: %v\n", status.Containerd)
	fmt.Printf(" apiserver: %v\n", status.APIServer)
	fmt.Printf(" node_ready: %v\n", status.NodeReady)
	return nil
}

47
update/cmd/opts.go Normal file
View File

@@ -0,0 +1,47 @@
package cmd
// opts holds shared command-line options for all subcommands.
type opts struct {
	ServerURL   string // --server: update server base URL (no default)
	GrubenvPath string // --grubenv: path to the GRUB environment file
	TimeoutSecs int    // --timeout: health check timeout in seconds
}

// parseOpts extracts command-line flags from args.
// Simple parser — no external dependencies.
//
// Unknown flags are ignored, as is a flag given without a value.
// A --timeout value must be a positive all-digit decimal; anything else
// keeps the default of 120. (The previous parser silently dropped
// non-digit characters, so "--timeout 1x0" was accepted as 10.)
func parseOpts(args []string) opts {
	o := opts{
		GrubenvPath: "/boot/grub/grubenv",
		TimeoutSecs: 120,
	}
	for i := 0; i < len(args); i++ {
		switch args[i] {
		case "--server":
			if i+1 < len(args) {
				o.ServerURL = args[i+1]
				i++
			}
		case "--grubenv":
			if i+1 < len(args) {
				o.GrubenvPath = args[i+1]
				i++
			}
		case "--timeout":
			if i+1 < len(args) {
				// Accept only non-empty, all-digit values; otherwise the
				// default stands rather than a mangled number.
				val, valid := 0, len(args[i+1]) > 0
				for _, c := range args[i+1] {
					if c < '0' || c > '9' {
						valid = false
						break
					}
					val = val*10 + int(c-'0')
				}
				if valid && val > 0 {
					o.TimeoutSecs = val
				}
				i++
			}
		}
	}
	return o
}

36
update/cmd/rollback.go Normal file
View File

@@ -0,0 +1,36 @@
package cmd
import (
"fmt"
"log/slog"
"github.com/portainer/kubesolo-os/update/pkg/grubenv"
)
// Rollback forces an immediate switch to the other partition.
// Use this to manually revert to the previous version.
func Rollback(args []string) error {
	o := parseOpts(args)
	env := grubenv.New(o.GrubenvPath)

	// Read both slots up front so the transition can be reported.
	from, err := env.ActiveSlot()
	if err != nil {
		return fmt.Errorf("reading active slot: %w", err)
	}
	to, err := env.PassiveSlot()
	if err != nil {
		return fmt.Errorf("reading passive slot: %w", err)
	}

	slog.Info("forcing rollback", "from", from, "to", to)

	// Unconditional swap — ignores health check status by design.
	if err := env.ForceRollback(); err != nil {
		return fmt.Errorf("rollback failed: %w", err)
	}

	fmt.Printf("Rolled back: %s → %s\n", from, to)
	fmt.Println("Reboot to complete rollback.")
	return nil
}

44
update/cmd/status.go Normal file
View File

@@ -0,0 +1,44 @@
package cmd
import (
"fmt"
"github.com/portainer/kubesolo-os/update/pkg/grubenv"
)
// Status displays the current A/B slot configuration and boot state.
func Status(args []string) error {
opts := parseOpts(args)
env := grubenv.New(opts.GrubenvPath)
vars, err := env.ReadAll()
if err != nil {
return fmt.Errorf("reading GRUB environment: %w", err)
}
activeSlot := vars["active_slot"]
bootCounter := vars["boot_counter"]
bootSuccess := vars["boot_success"]
passiveSlot := "B"
if activeSlot == "B" {
passiveSlot = "A"
}
fmt.Println("KubeSolo OS — A/B Partition Status")
fmt.Println("───────────────────────────────────")
fmt.Printf(" Active slot: %s\n", activeSlot)
fmt.Printf(" Passive slot: %s\n", passiveSlot)
fmt.Printf(" Boot counter: %s\n", bootCounter)
fmt.Printf(" Boot success: %s\n", bootSuccess)
if bootSuccess == "1" {
fmt.Println("\n ✓ System is healthy (boot confirmed)")
} else if bootCounter == "0" {
fmt.Println("\n ✗ Boot counter exhausted — rollback will occur on next reboot")
} else {
fmt.Printf("\n ⚠ Boot pending verification (%s attempts remaining)\n", bootCounter)
}
return nil
}

View File

@@ -0,0 +1,150 @@
# KubeSolo OS — Automatic Update CronJob
#
# This CronJob checks for OS updates every 6 hours, downloads them,
# and writes them to the passive partition. It does NOT reboot —
# the administrator must trigger a reboot to apply the update.
#
# The update agent runs as a privileged container with host access
# because it needs to:
# 1. Read/write GRUB environment (on boot partition)
# 2. Mount and write to system partitions
# 3. Access block devices via blkid
#
# Deploy: kubectl apply -f update-cronjob.yaml
# Manual trigger: kubectl create job --from=cronjob/kubesolo-update kubesolo-update-manual
#
apiVersion: batch/v1
kind: CronJob
metadata:
  name: kubesolo-update
  namespace: kube-system
  labels:
    app.kubernetes.io/name: kubesolo-update
    app.kubernetes.io/component: update-agent
    app.kubernetes.io/part-of: kubesolo-os
spec:
  schedule: "0 */6 * * *" # Every 6 hours
  concurrencyPolicy: Forbid # Never overlap two update runs
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 5
  jobTemplate:
    spec:
      backoffLimit: 1
      activeDeadlineSeconds: 600 # 10 min max
      template:
        metadata:
          labels:
            app.kubernetes.io/name: kubesolo-update
        spec:
          restartPolicy: Never
          hostPID: false
          hostNetwork: false
          containers:
            - name: update
              image: busybox:latest # Only used for the shell; the binary is host-mounted
              command:
                - /host/usr/lib/kubesolo-os/kubesolo-update
              args:
                - apply
                - --server
                - "$(UPDATE_SERVER_URL)"
              env:
                - name: UPDATE_SERVER_URL
                  valueFrom:
                    configMapKeyRef:
                      name: kubesolo-update-config
                      key: server-url
                      optional: true # Missing ConfigMap must not block the Job
              securityContext:
                privileged: true # Required for mount/blkid access
              volumeMounts:
                - name: host-root
                  mountPath: /host
                  readOnly: false
                - name: dev
                  mountPath: /dev
                - name: boot
                  mountPath: /boot
          volumes:
            - name: host-root
              hostPath:
                path: /
                type: Directory
            - name: dev
              hostPath:
                path: /dev
                type: Directory
            - name: boot
              hostPath:
                path: /boot
                type: Directory
          tolerations:
            - operator: Exists # Run on any node (there's only one)
---
# ConfigMap for update server URL.
# Create/update this to point to your update server:
# kubectl -n kube-system create configmap kubesolo-update-config \
#   --from-literal=server-url=https://updates.example.com
apiVersion: v1
kind: ConfigMap
metadata:
  name: kubesolo-update-config
  namespace: kube-system
  labels:
    app.kubernetes.io/name: kubesolo-update
    app.kubernetes.io/component: update-agent
data:
  server-url: "" # Set to your update server URL
---
# Post-boot health check — runs once at boot as a Job.
# On KubeSolo OS, this is triggered by the init system (init stage or
# systemd-equivalent), but it can also be deployed as a K8s Job for
# environments where the init system doesn't run the health check.
apiVersion: batch/v1
kind: Job
metadata:
  name: kubesolo-healthcheck
  namespace: kube-system
  labels:
    app.kubernetes.io/name: kubesolo-healthcheck
    app.kubernetes.io/component: health-check
    app.kubernetes.io/part-of: kubesolo-os
spec:
  backoffLimit: 3
  activeDeadlineSeconds: 300 # 5 min max
  template:
    metadata:
      labels:
        app.kubernetes.io/name: kubesolo-healthcheck
    spec:
      restartPolicy: Never
      hostPID: false
      hostNetwork: true # Needed to reach API server at 127.0.0.1:6443
      containers:
        - name: healthcheck
          image: busybox:latest
          command:
            - /host/usr/lib/kubesolo-os/kubesolo-update
          args:
            - healthcheck
            - --timeout
            - "120"
          securityContext:
            privileged: true # Required for grubenv write
          volumeMounts:
            - name: host-root
              mountPath: /host
              readOnly: false
            - name: boot
              mountPath: /boot
      volumes:
        - name: host-root
          hostPath:
            path: /
            type: Directory
        - name: boot
          hostPath:
            path: /boot
            type: Directory
      tolerations:
        - operator: Exists

3
update/go.mod Normal file
View File

@@ -0,0 +1,3 @@
module github.com/portainer/kubesolo-os/update
go 1.25.5

79
update/main.go Normal file
View File

@@ -0,0 +1,79 @@
// kubesolo-update is the atomic update agent for KubeSolo OS.
//
// It manages A/B partition updates with automatic rollback:
//
// kubesolo-update check Check for available updates
// kubesolo-update apply Download + write update to passive partition
// kubesolo-update activate Set passive partition as next boot target
// kubesolo-update rollback Force rollback to other partition
// kubesolo-update healthcheck Post-boot health verification
// kubesolo-update status Show current A/B slot and boot status
package main
import (
"fmt"
"log/slog"
"os"
"github.com/portainer/kubesolo-os/update/cmd"
)
func main() {
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
Level: slog.LevelInfo,
})))
if len(os.Args) < 2 {
usage()
os.Exit(1)
}
var err error
switch os.Args[1] {
case "check":
err = cmd.Check(os.Args[2:])
case "apply":
err = cmd.Apply(os.Args[2:])
case "activate":
err = cmd.Activate(os.Args[2:])
case "rollback":
err = cmd.Rollback(os.Args[2:])
case "healthcheck":
err = cmd.Healthcheck(os.Args[2:])
case "status":
err = cmd.Status(os.Args[2:])
default:
fmt.Fprintf(os.Stderr, "unknown command: %s\n\n", os.Args[1])
usage()
os.Exit(1)
}
if err != nil {
slog.Error("command failed", "command", os.Args[1], "error", err)
os.Exit(1)
}
}
func usage() {
fmt.Fprintf(os.Stderr, `Usage: kubesolo-update <command> [options]
Commands:
check Check for available updates
apply Download and write update to passive partition
activate Set passive partition as next boot target
rollback Force rollback to other partition
healthcheck Post-boot health verification (marks boot successful)
status Show current A/B slot and boot status
Options:
--server URL Update server URL (default: from /etc/kubesolo/update.conf)
--grubenv PATH Path to grubenv file (default: /boot/grub/grubenv)
--timeout SECS Health check timeout in seconds (default: 120)
Examples:
kubesolo-update check --server https://updates.example.com
kubesolo-update apply --server https://updates.example.com
kubesolo-update healthcheck
kubesolo-update status
`)
}

View File

@@ -0,0 +1,239 @@
// Package grubenv provides read/write access to GRUB environment variables.
//
// GRUB stores its environment in a 1024-byte file (grubenv) located at
// /boot/grub/grubenv on the EFI partition. This package manipulates
// those variables for A/B boot slot management.
//
// Key variables:
// - active_slot: "A" or "B"
// - boot_counter: "3" (fresh) down to "0" (triggers rollback)
// - boot_success: "0" (pending) or "1" (healthy boot confirmed)
package grubenv
import (
	"fmt"
	"log/slog"
	"os"
	"os/exec"
	"sort"
	"strconv"
	"strings"
)
const (
	// DefaultGrubenvPath is the standard location for the GRUB environment file.
	DefaultGrubenvPath = "/boot/grub/grubenv"

	// SlotA represents system partition A.
	SlotA = "A"

	// SlotB represents system partition B.
	SlotB = "B"
)

// Env provides access to GRUB environment variables stored in a
// single grubenv file.
type Env struct {
	path string // location of the grubenv file on disk
}

// New creates a new Env for the given grubenv file path.
// An empty path selects DefaultGrubenvPath.
func New(path string) *Env {
	e := &Env{path: DefaultGrubenvPath}
	if path != "" {
		e.path = path
	}
	return e
}
// Get reads a single variable from the GRUB environment.
// It returns an error if the environment cannot be read or the key is absent.
func (e *Env) Get(key string) (string, error) {
	vars, err := e.ReadAll()
	if err != nil {
		return "", err
	}
	if val, ok := vars[key]; ok {
		return val, nil
	}
	return "", fmt.Errorf("grubenv: key %q not found", key)
}
// Set writes a variable to the GRUB environment, preferring grub-editenv
// and falling back to a manual file rewrite when the tool is unavailable.
func (e *Env) Set(key, value string) error {
	editenv, lookupErr := findEditenv()
	if lookupErr != nil {
		return e.setManual(key, value)
	}
	out, err := exec.Command(editenv, e.path, "set", key+"="+value).CombinedOutput()
	if err != nil {
		return fmt.Errorf("grub-editenv set %s=%s: %w\n%s", key, value, err, out)
	}
	slog.Debug("grubenv set", "key", key, "value", value)
	return nil
}
// ReadAll reads every variable from the GRUB environment.
// It uses grub-editenv when present, otherwise parses the file directly.
func (e *Env) ReadAll() (map[string]string, error) {
	editenv, lookupErr := findEditenv()
	if lookupErr != nil {
		return e.readManual()
	}
	out, err := exec.Command(editenv, e.path, "list").Output()
	if err != nil {
		return nil, fmt.Errorf("grub-editenv list: %w", err)
	}
	return parseEnvOutput(string(out)), nil
}
// ActiveSlot returns the currently active boot slot ("A" or "B"),
// read from the active_slot GRUB variable.
func (e *Env) ActiveSlot() (string, error) {
	return e.Get("active_slot")
}
// PassiveSlot returns the slot that is NOT currently active.
// With a two-slot scheme this is simply the opposite of ActiveSlot;
// any active value other than SlotA maps to SlotA.
func (e *Env) PassiveSlot() (string, error) {
	active, err := e.ActiveSlot()
	if err != nil {
		return "", err
	}
	if active != SlotA {
		return SlotA, nil
	}
	return SlotB, nil
}
// BootCounter returns the current boot counter value (0–3).
// A fresh activation starts at 3; 0 means the counter is exhausted and
// GRUB will roll back. Returns -1 with an error when the variable is
// missing, non-numeric, or outside 0–3.
func (e *Env) BootCounter() (int, error) {
	val, err := e.Get("boot_counter")
	if err != nil {
		return -1, err
	}
	// Numeric parse + range check replaces the previous hard-coded
	// switch over the string digits "0".."3".
	n, convErr := strconv.Atoi(val)
	if convErr != nil || n < 0 || n > 3 {
		return -1, fmt.Errorf("grubenv: invalid boot_counter: %q", val)
	}
	return n, nil
}
// BootSuccess returns whether the last boot was marked successful,
// i.e. the boot_success variable equals "1".
func (e *Env) BootSuccess() (bool, error) {
	val, err := e.Get("boot_success")
	if err != nil {
		return false, err
	}
	return val == "1", nil
}
// MarkBootSuccess sets boot_success=1 and resets boot_counter to 3.
// Called by the health check after a successful boot.
func (e *Env) MarkBootSuccess() error {
	// boot_success first, then the counter reset — same order as before.
	if err := e.Set("boot_success", "1"); err != nil {
		return fmt.Errorf("setting boot_success: %w", err)
	}
	if err := e.Set("boot_counter", "3"); err != nil {
		return fmt.Errorf("setting boot_counter: %w", err)
	}
	slog.Info("boot marked successful")
	return nil
}
// ActivateSlot switches the active slot, resets the boot counter to 3,
// and clears boot_success so the next boot must be re-verified.
// Used after writing a new image to the passive partition.
func (e *Env) ActivateSlot(slot string) error {
	if slot != SlotA && slot != SlotB {
		return fmt.Errorf("invalid slot: %q (must be A or B)", slot)
	}
	// Apply the three writes in a fixed order; each Set is independent.
	for _, kv := range [][2]string{
		{"active_slot", slot},
		{"boot_counter", "3"},
		{"boot_success", "0"},
	} {
		if err := e.Set(kv[0], kv[1]); err != nil {
			return err
		}
	}
	slog.Info("activated slot", "slot", slot)
	return nil
}
// ForceRollback switches to the other slot immediately by activating
// the passive slot (which also resets boot_counter and boot_success).
func (e *Env) ForceRollback() error {
	passive, err := e.PassiveSlot()
	if err != nil {
		return err
	}
	return e.ActivateSlot(passive)
}
// findEditenv locates the grub-editenv binary on PATH, trying both
// common names (grub-editenv, grub2-editenv).
func findEditenv() (string, error) {
	for _, candidate := range []string{"grub-editenv", "grub2-editenv"} {
		if path, err := exec.LookPath(candidate); err == nil {
			return path, nil
		}
	}
	return "", fmt.Errorf("grub-editenv not found")
}
// parseEnvOutput turns grub-editenv "list" output (or the raw grubenv
// file contents) into a key/value map. Blank lines and '#' comment /
// padding lines are skipped; lines without '=' are ignored.
func parseEnvOutput(output string) map[string]string {
	vars := make(map[string]string)
	for _, raw := range strings.Split(output, "\n") {
		line := strings.TrimSpace(raw)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		if key, value, ok := strings.Cut(line, "="); ok {
			vars[key] = value
		}
	}
	return vars
}
// setManual writes a single variable without grub-editenv (fallback).
// A read failure (e.g. a missing file) starts from an empty variable
// set rather than failing, so a fresh grubenv can be created.
func (e *Env) setManual(key, value string) error {
	vars, err := e.readManual()
	if err != nil {
		vars = make(map[string]string)
	}
	vars[key] = value
	return e.writeManual(vars)
}
// readManual reads and parses the grubenv file directly, without
// grub-editenv. Comment/padding lines are skipped by parseEnvOutput.
func (e *Env) readManual() (map[string]string, error) {
	data, err := os.ReadFile(e.path)
	if err != nil {
		return nil, fmt.Errorf("reading grubenv: %w", err)
	}
	return parseEnvOutput(string(data)), nil
}
// writeManual writes the grubenv file without grub-editenv.
// GRUB requires the file to be exactly 1024 bytes: a fixed header line,
// the key=value pairs, then '#' padding to fill the block.
// Keys are written in sorted order so the file content is deterministic
// (Go map iteration order is randomized), which keeps repeated writes
// byte-identical and diff-friendly.
func (e *Env) writeManual(vars map[string]string) error {
	keys := make([]string, 0, len(vars))
	for k := range vars {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	var sb strings.Builder
	sb.WriteString("# GRUB Environment Block\n")
	for _, k := range keys {
		sb.WriteString(k + "=" + vars[k] + "\n")
	}
	content := sb.String()
	if len(content) > 1024 {
		return fmt.Errorf("grubenv content exceeds 1024 bytes")
	}
	// Pad to exactly 1024 bytes with '#' (zero padding when already full).
	content += strings.Repeat("#", 1024-len(content))
	return os.WriteFile(e.path, []byte(content), 0o644)
}

View File

@@ -0,0 +1,423 @@
package grubenv
import (
"os"
"path/filepath"
"strings"
"testing"
)
// createTestGrubenv writes a properly formatted grubenv file for testing.
// GRUB requires the file to be exactly 1024 bytes, padded with '#'.
func createTestGrubenv(t *testing.T, dir string, vars map[string]string) string {
	t.Helper()
	path := filepath.Join(dir, "grubenv")
	var sb strings.Builder
	sb.WriteString("# GRUB Environment Block\n")
	for k, v := range vars {
		sb.WriteString(k + "=" + v + "\n")
	}
	content := sb.String()
	padding := 1024 - len(content)
	if padding > 0 {
		content += strings.Repeat("#", padding)
	}
	if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
		t.Fatal(err)
	}
	return path
}

// TestNew verifies the default path is used for "" and custom paths are kept.
func TestNew(t *testing.T) {
	env := New("")
	if env.path != DefaultGrubenvPath {
		t.Errorf("expected default path %s, got %s", DefaultGrubenvPath, env.path)
	}
	env = New("/custom/path/grubenv")
	if env.path != "/custom/path/grubenv" {
		t.Errorf("expected custom path, got %s", env.path)
	}
}

// TestReadAll reads a full variable set back from a fixture file.
func TestReadAll(t *testing.T) {
	dir := t.TempDir()
	path := createTestGrubenv(t, dir, map[string]string{
		"active_slot":  "A",
		"boot_counter": "3",
		"boot_success": "1",
	})
	env := New(path)
	vars, err := env.ReadAll()
	if err != nil {
		t.Fatal(err)
	}
	if vars["active_slot"] != "A" {
		t.Errorf("active_slot: expected A, got %s", vars["active_slot"])
	}
	if vars["boot_counter"] != "3" {
		t.Errorf("boot_counter: expected 3, got %s", vars["boot_counter"])
	}
	if vars["boot_success"] != "1" {
		t.Errorf("boot_success: expected 1, got %s", vars["boot_success"])
	}
}

// TestGet covers both an existing key and a missing key (which must error).
func TestGet(t *testing.T) {
	dir := t.TempDir()
	path := createTestGrubenv(t, dir, map[string]string{
		"active_slot": "B",
	})
	env := New(path)
	val, err := env.Get("active_slot")
	if err != nil {
		t.Fatal(err)
	}
	if val != "B" {
		t.Errorf("expected B, got %s", val)
	}
	_, err = env.Get("nonexistent")
	if err == nil {
		t.Fatal("expected error for nonexistent key")
	}
}
// TestSet writes a variable and checks both the value and the 1024-byte
// invariant of the resulting file.
func TestSet(t *testing.T) {
	dir := t.TempDir()
	path := createTestGrubenv(t, dir, map[string]string{
		"active_slot":  "A",
		"boot_counter": "3",
	})
	env := New(path)
	if err := env.Set("boot_counter", "2"); err != nil {
		t.Fatal(err)
	}
	val, err := env.Get("boot_counter")
	if err != nil {
		t.Fatal(err)
	}
	if val != "2" {
		t.Errorf("expected 2 after set, got %s", val)
	}
	// Verify file is still 1024 bytes
	data, err := os.ReadFile(path)
	if err != nil {
		t.Fatal(err)
	}
	if len(data) != 1024 {
		t.Errorf("grubenv should be 1024 bytes, got %d", len(data))
	}
}

// TestActiveSlot reads the active slot from a fixture environment.
func TestActiveSlot(t *testing.T) {
	dir := t.TempDir()
	path := createTestGrubenv(t, dir, map[string]string{
		"active_slot":  "A",
		"boot_counter": "3",
		"boot_success": "1",
	})
	env := New(path)
	slot, err := env.ActiveSlot()
	if err != nil {
		t.Fatal(err)
	}
	if slot != "A" {
		t.Errorf("expected A, got %s", slot)
	}
}

// TestPassiveSlot checks both directions of the A/B flip.
func TestPassiveSlot(t *testing.T) {
	tests := []struct {
		active  string
		passive string
	}{
		{"A", "B"},
		{"B", "A"},
	}
	for _, tt := range tests {
		t.Run("active_"+tt.active, func(t *testing.T) {
			dir := t.TempDir()
			path := createTestGrubenv(t, dir, map[string]string{
				"active_slot": tt.active,
			})
			env := New(path)
			passive, err := env.PassiveSlot()
			if err != nil {
				t.Fatal(err)
			}
			if passive != tt.passive {
				t.Errorf("expected passive %s, got %s", tt.passive, passive)
			}
		})
	}
}

// TestBootCounter covers the valid range 0-3 and rejects out-of-range
// or non-numeric values.
func TestBootCounter(t *testing.T) {
	tests := []struct {
		value   string
		expect  int
		wantErr bool
	}{
		{"0", 0, false},
		{"1", 1, false},
		{"2", 2, false},
		{"3", 3, false},
		{"invalid", -1, true},
		{"99", -1, true},
	}
	for _, tt := range tests {
		t.Run("counter_"+tt.value, func(t *testing.T) {
			dir := t.TempDir()
			path := createTestGrubenv(t, dir, map[string]string{
				"boot_counter": tt.value,
			})
			env := New(path)
			counter, err := env.BootCounter()
			if tt.wantErr {
				if err == nil {
					t.Fatal("expected error")
				}
				return
			}
			if err != nil {
				t.Fatal(err)
			}
			if counter != tt.expect {
				t.Errorf("expected %d, got %d", tt.expect, counter)
			}
		})
	}
}

// TestBootSuccess maps "1" to true and "0" to false.
func TestBootSuccess(t *testing.T) {
	tests := []struct {
		value  string
		expect bool
	}{
		{"0", false},
		{"1", true},
	}
	for _, tt := range tests {
		t.Run("success_"+tt.value, func(t *testing.T) {
			dir := t.TempDir()
			path := createTestGrubenv(t, dir, map[string]string{
				"boot_success": tt.value,
			})
			env := New(path)
			success, err := env.BootSuccess()
			if err != nil {
				t.Fatal(err)
			}
			if success != tt.expect {
				t.Errorf("expected %v, got %v", tt.expect, success)
			}
		})
	}
}
// TestMarkBootSuccess verifies boot_success is set and the counter resets to 3.
func TestMarkBootSuccess(t *testing.T) {
	dir := t.TempDir()
	path := createTestGrubenv(t, dir, map[string]string{
		"active_slot":  "B",
		"boot_counter": "1",
		"boot_success": "0",
	})
	env := New(path)
	if err := env.MarkBootSuccess(); err != nil {
		t.Fatal(err)
	}
	success, err := env.BootSuccess()
	if err != nil {
		t.Fatal(err)
	}
	if !success {
		t.Error("expected boot_success=1 after MarkBootSuccess")
	}
	counter, err := env.BootCounter()
	if err != nil {
		t.Fatal(err)
	}
	if counter != 3 {
		t.Errorf("expected boot_counter=3 after MarkBootSuccess, got %d", counter)
	}
}

// TestActivateSlot checks the slot switch plus counter/success reset.
func TestActivateSlot(t *testing.T) {
	dir := t.TempDir()
	path := createTestGrubenv(t, dir, map[string]string{
		"active_slot":  "A",
		"boot_counter": "3",
		"boot_success": "1",
	})
	env := New(path)
	if err := env.ActivateSlot("B"); err != nil {
		t.Fatal(err)
	}
	slot, _ := env.ActiveSlot()
	if slot != "B" {
		t.Errorf("expected active_slot=B, got %s", slot)
	}
	counter, _ := env.BootCounter()
	if counter != 3 {
		t.Errorf("expected boot_counter=3, got %d", counter)
	}
	success, _ := env.BootSuccess()
	if success {
		t.Error("expected boot_success=0 after ActivateSlot")
	}
}

// TestActivateSlotInvalid rejects any slot other than A or B.
func TestActivateSlotInvalid(t *testing.T) {
	dir := t.TempDir()
	path := createTestGrubenv(t, dir, map[string]string{
		"active_slot": "A",
	})
	env := New(path)
	err := env.ActivateSlot("C")
	if err == nil {
		t.Fatal("expected error for invalid slot")
	}
}

// TestForceRollback flips A to B without any explicit slot argument.
func TestForceRollback(t *testing.T) {
	dir := t.TempDir()
	path := createTestGrubenv(t, dir, map[string]string{
		"active_slot":  "A",
		"boot_counter": "3",
		"boot_success": "1",
	})
	env := New(path)
	if err := env.ForceRollback(); err != nil {
		t.Fatal(err)
	}
	slot, _ := env.ActiveSlot()
	if slot != "B" {
		t.Errorf("expected active_slot=B after rollback from A, got %s", slot)
	}
}

// TestParseEnvOutput parses a representative grub-editenv listing,
// skipping the comment header.
func TestParseEnvOutput(t *testing.T) {
	input := `# GRUB Environment Block
active_slot=A
boot_counter=3
boot_success=1
`
	vars := parseEnvOutput(input)
	if len(vars) != 3 {
		t.Errorf("expected 3 variables, got %d", len(vars))
	}
	if vars["active_slot"] != "A" {
		t.Errorf("active_slot: expected A, got %s", vars["active_slot"])
	}
	if vars["boot_counter"] != "3" {
		t.Errorf("boot_counter: expected 3, got %s", vars["boot_counter"])
	}
}

// TestWriteManualFormat checks the manual writer's 1024-byte block format:
// fixed header, key=value line, '#' padding.
func TestWriteManualFormat(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "grubenv")
	env := New(path)
	// Use setManual directly since grub-editenv may not be available
	err := env.setManual("test_key", "test_value")
	if err != nil {
		t.Fatal(err)
	}
	data, err := os.ReadFile(path)
	if err != nil {
		t.Fatal(err)
	}
	if len(data) != 1024 {
		t.Errorf("grubenv should be exactly 1024 bytes, got %d", len(data))
	}
	if !strings.HasPrefix(string(data), "# GRUB Environment Block\n") {
		t.Error("grubenv should start with '# GRUB Environment Block'")
	}
	if !strings.Contains(string(data), "test_key=test_value\n") {
		t.Error("grubenv should contain test_key=test_value")
	}
}

// TestReadNonexistentFile expects an error for a missing grubenv path.
func TestReadNonexistentFile(t *testing.T) {
	env := New("/nonexistent/path/grubenv")
	_, err := env.ReadAll()
	if err == nil {
		t.Fatal("expected error reading nonexistent file")
	}
}

// TestMultipleSetOperations simulates a full boot cycle: counter
// decrement, pending flag, then MarkBootSuccess restoring the state.
func TestMultipleSetOperations(t *testing.T) {
	dir := t.TempDir()
	path := createTestGrubenv(t, dir, map[string]string{
		"active_slot":  "A",
		"boot_counter": "3",
		"boot_success": "1",
	})
	env := New(path)
	// Simulate a boot cycle: decrement counter, then mark success
	if err := env.Set("boot_counter", "2"); err != nil {
		t.Fatal(err)
	}
	if err := env.Set("boot_success", "0"); err != nil {
		t.Fatal(err)
	}
	// Now mark boot success
	if err := env.MarkBootSuccess(); err != nil {
		t.Fatal(err)
	}
	// Verify final state
	vars, err := env.ReadAll()
	if err != nil {
		t.Fatal(err)
	}
	if vars["active_slot"] != "A" {
		t.Errorf("active_slot should still be A, got %s", vars["active_slot"])
	}
	if vars["boot_counter"] != "3" {
		t.Errorf("boot_counter should be 3 after mark success, got %s", vars["boot_counter"])
	}
	if vars["boot_success"] != "1" {
		t.Errorf("boot_success should be 1, got %s", vars["boot_success"])
	}
}

198
update/pkg/health/health.go Normal file
View File

@@ -0,0 +1,198 @@
// Package health implements post-boot health checks for KubeSolo OS.
//
// After booting a new system partition, the health check verifies that:
// - containerd is running and responsive
// - KubeSolo API server is reachable
// - The Kubernetes node reaches Ready state
//
// If all checks pass, the GRUB environment is updated to mark the boot
// as successful (boot_success=1). If any check fails, boot_success
// remains 0 and GRUB will eventually roll back.
package health
import (
	"context"
	"crypto/tls"
	"fmt"
	"log/slog"
	"net"
	"net/http"
	"os"
	"os/exec"
	"strings"
	"time"
)
// Status represents the result of a health check run.
type Status struct {
	Containerd bool   // containerd socket present and responsive
	APIServer  bool   // API server reachable
	NodeReady  bool   // Kubernetes node reports Ready
	Message    string // human-readable summary (filled by WaitForHealthy)
}

// IsHealthy returns true if all checks passed.
func (s *Status) IsHealthy() bool {
	for _, ok := range []bool{s.Containerd, s.APIServer, s.NodeReady} {
		if !ok {
			return false
		}
	}
	return true
}
// Checker performs health checks against the local KubeSolo instance.
type Checker struct {
kubeconfigPath string
apiServerAddr string
timeout time.Duration
}
// NewChecker creates a health checker.
func NewChecker(kubeconfigPath, apiServerAddr string, timeout time.Duration) *Checker {
if kubeconfigPath == "" {
kubeconfigPath = "/var/lib/kubesolo/pki/admin/admin.kubeconfig"
}
if apiServerAddr == "" {
apiServerAddr = "127.0.0.1:6443"
}
if timeout == 0 {
timeout = 120 * time.Second
}
return &Checker{
kubeconfigPath: kubeconfigPath,
apiServerAddr: apiServerAddr,
timeout: timeout,
}
}
// CheckContainerd verifies that containerd is running by probing its
// socket and asking `ctr` (bundled with KubeSolo) for the daemon version.
func (c *Checker) CheckContainerd() bool {
	const sock = "/run/containerd/containerd.sock"
	if _, err := os.Stat(sock); err != nil {
		slog.Warn("containerd socket not found")
		return false
	}
	// Short deadline so a hung daemon can't stall the whole health check.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	probe := exec.CommandContext(ctx, "ctr", "--address", sock, "version")
	if err := probe.Run(); err != nil {
		slog.Warn("containerd not responsive", "error", err)
		return false
	}
	slog.Debug("containerd healthy")
	return true
}
// CheckAPIServer verifies the Kubernetes API server is reachable.
// It first requires a successful TCP connection, then queries the
// /healthz endpoint over HTTPS. Certificate verification is skipped:
// the API server presents a self-signed certificate and this check
// cares about liveness on localhost, not identity.
func (c *Checker) CheckAPIServer() bool {
	// TCP connect to API server port
	conn, err := net.DialTimeout("tcp", c.apiServerAddr, 5*time.Second)
	if err != nil {
		slog.Warn("API server not reachable", "addr", c.apiServerAddr, "error", err)
		return false
	}
	conn.Close()
	client := &http.Client{
		Timeout: 5 * time.Second,
		Transport: &http.Transport{
			TLSHandshakeTimeout: 5 * time.Second,
			// FIX: the previous code claimed to skip TLS verification
			// but never configured it, so the HTTPS probe always failed
			// against the self-signed cert and the check degraded to
			// "TCP port open". With InsecureSkipVerify the /healthz
			// response is actually evaluated.
			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
		},
	}
	resp, err := client.Get("https://" + c.apiServerAddr + "/healthz")
	if err != nil {
		// Request still failed (e.g. client auth required); TCP
		// connectivity already succeeded, so keep the original
		// best-effort behavior and treat the server as reachable.
		slog.Debug("API server TCP reachable but HTTPS check skipped", "error", err)
		return true
	}
	defer resp.Body.Close()
	if resp.StatusCode == http.StatusOK {
		slog.Debug("API server healthy", "status", resp.StatusCode)
		return true
	}
	slog.Warn("API server unhealthy", "status", resp.StatusCode)
	return false
}
// CheckNodeReady uses kubectl to verify the (single) node is Ready.
// Requires the kubeconfig to exist on disk.
func (c *Checker) CheckNodeReady() bool {
	if _, err := os.Stat(c.kubeconfigPath); err != nil {
		slog.Warn("kubeconfig not found", "path", c.kubeconfigPath)
		return false
	}
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	// JSONPath extracts the status of the first node's Ready condition.
	kubectlArgs := []string{
		"--kubeconfig", c.kubeconfigPath,
		"get", "nodes",
		"-o", "jsonpath={.items[0].status.conditions[?(@.type==\"Ready\")].status}",
	}
	output, err := exec.CommandContext(ctx, "kubectl", kubectlArgs...).Output()
	if err != nil {
		slog.Warn("kubectl get nodes failed", "error", err)
		return false
	}
	state := strings.TrimSpace(string(output))
	if state == "True" {
		slog.Debug("node is Ready")
		return true
	}
	slog.Warn("node not Ready", "status", state)
	return false
}
// RunAll performs all three health checks once and returns the combined
// status. Message is left empty; WaitForHealthy fills it in.
func (c *Checker) RunAll() *Status {
	return &Status{
		Containerd: c.CheckContainerd(),
		APIServer:  c.CheckAPIServer(),
		NodeReady:  c.CheckNodeReady(),
	}
}
// WaitForHealthy polls the health checks every 5 seconds until they all
// pass or the configured timeout expires. After the deadline one final
// attempt is made before returning a timeout error; the returned Status
// always reflects the most recent poll.
func (c *Checker) WaitForHealthy() (*Status, error) {
	const pollInterval = 5 * time.Second
	deadline := time.Now().Add(c.timeout)

	slog.Info("waiting for system health", "timeout", c.timeout)
	for time.Now().Before(deadline) {
		st := c.RunAll()
		if st.IsHealthy() {
			st.Message = "all checks passed"
			slog.Info("system healthy",
				"containerd", st.Containerd,
				"apiserver", st.APIServer,
				"node_ready", st.NodeReady,
			)
			return st, nil
		}
		slog.Debug("health check pending",
			"containerd", st.Containerd,
			"apiserver", st.APIServer,
			"node_ready", st.NodeReady,
			"remaining", time.Until(deadline).Round(time.Second),
		)
		time.Sleep(pollInterval)
	}

	// One last attempt after the deadline passes.
	st := c.RunAll()
	if st.IsHealthy() {
		st.Message = "all checks passed"
		return st, nil
	}
	st.Message = "health check timeout"
	return st, fmt.Errorf("health check timed out after %s", c.timeout)
}

View File

@@ -0,0 +1,86 @@
package health
import (
"testing"
"time"
)
// TestStatusIsHealthy exercises IsHealthy across every single-failure
// combination plus the all-pass and all-fail cases.
func TestStatusIsHealthy(t *testing.T) {
	tests := []struct {
		name       string
		status     Status
		wantHealth bool
	}{
		{
			name:       "all healthy",
			status:     Status{Containerd: true, APIServer: true, NodeReady: true},
			wantHealth: true,
		},
		{
			name:       "containerd down",
			status:     Status{Containerd: false, APIServer: true, NodeReady: true},
			wantHealth: false,
		},
		{
			name:       "apiserver down",
			status:     Status{Containerd: true, APIServer: false, NodeReady: true},
			wantHealth: false,
		},
		{
			name:       "node not ready",
			status:     Status{Containerd: true, APIServer: true, NodeReady: false},
			wantHealth: false,
		},
		{
			name:       "all down",
			status:     Status{Containerd: false, APIServer: false, NodeReady: false},
			wantHealth: false,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := tt.status.IsHealthy(); got != tt.wantHealth {
				t.Errorf("IsHealthy() = %v, want %v", got, tt.wantHealth)
			}
		})
	}
}

// TestNewChecker verifies the default and custom constructor values.
func TestNewChecker(t *testing.T) {
	// Test defaults
	c := NewChecker("", "", 0)
	if c.kubeconfigPath != "/var/lib/kubesolo/pki/admin/admin.kubeconfig" {
		t.Errorf("unexpected default kubeconfig: %s", c.kubeconfigPath)
	}
	if c.apiServerAddr != "127.0.0.1:6443" {
		t.Errorf("unexpected default apiserver addr: %s", c.apiServerAddr)
	}
	if c.timeout != 120*time.Second {
		t.Errorf("unexpected default timeout: %v", c.timeout)
	}
	// Test custom values
	c = NewChecker("/custom/kubeconfig", "10.0.0.1:6443", 30*time.Second)
	if c.kubeconfigPath != "/custom/kubeconfig" {
		t.Errorf("expected custom kubeconfig, got %s", c.kubeconfigPath)
	}
	if c.apiServerAddr != "10.0.0.1:6443" {
		t.Errorf("expected custom addr, got %s", c.apiServerAddr)
	}
	if c.timeout != 30*time.Second {
		t.Errorf("expected 30s timeout, got %v", c.timeout)
	}
}

// TestStatusMessage checks the Message field round-trips as assigned.
func TestStatusMessage(t *testing.T) {
	s := &Status{
		Containerd: true,
		APIServer:  true,
		NodeReady:  true,
		Message:    "all checks passed",
	}
	if s.Message != "all checks passed" {
		t.Errorf("unexpected message: %s", s.Message)
	}
}

180
update/pkg/image/image.go Normal file
View File

@@ -0,0 +1,180 @@
// Package image handles downloading, verifying, and staging OS update images.
//
// Update images are distributed as pairs of files:
// - vmlinuz (kernel)
// - kubesolo-os.gz (initramfs)
//
// These are fetched from an HTTP(S) server that provides a metadata file
// (latest.json) describing available updates.
package image
import (
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"log/slog"
"net/http"
"os"
"path/filepath"
"time"
)
// UpdateMetadata describes an available update from the update server.
// It mirrors the JSON schema of the server's latest.json file.
type UpdateMetadata struct {
	Version         string `json:"version"`                 // version of the update (must be non-empty)
	VmlinuzURL      string `json:"vmlinuz_url"`             // where to fetch the kernel
	VmlinuzSHA256   string `json:"vmlinuz_sha256"`          // expected kernel checksum (hex)
	InitramfsURL    string `json:"initramfs_url"`           // where to fetch the initramfs
	InitramfsSHA256 string `json:"initramfs_sha256"`        // expected initramfs checksum (hex)
	ReleaseNotes    string `json:"release_notes,omitempty"` // optional human-readable notes
	ReleaseDate     string `json:"release_date,omitempty"`  // optional release date string
}

// StagedImage represents downloaded and verified update files on disk.
type StagedImage struct {
	VmlinuzPath   string // staged kernel path
	InitramfsPath string // staged initramfs path
	Version       string // version of the staged update
}
// Client handles communication with the update server.
type Client struct {
serverURL string
httpClient *http.Client
stageDir string
}
// NewClient creates a new update image client.
func NewClient(serverURL, stageDir string) *Client {
return &Client{
serverURL: serverURL,
httpClient: &http.Client{
Timeout: 5 * time.Minute,
},
stageDir: stageDir,
}
}
// CheckForUpdate fetches the latest update metadata (latest.json) from
// the server. Metadata without a version field is rejected as invalid.
func (c *Client) CheckForUpdate() (*UpdateMetadata, error) {
	endpoint := c.serverURL + "/latest.json"
	slog.Info("checking for update", "url", endpoint)

	resp, err := c.httpClient.Get(endpoint)
	if err != nil {
		return nil, fmt.Errorf("fetching update metadata: %w", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("update server returned %d", resp.StatusCode)
	}

	meta := &UpdateMetadata{}
	if err := json.NewDecoder(resp.Body).Decode(meta); err != nil {
		return nil, fmt.Errorf("parsing update metadata: %w", err)
	}
	if meta.Version == "" {
		return nil, fmt.Errorf("update metadata missing version")
	}
	return meta, nil
}
// Download fetches both update files into the stage directory, verifies
// their checksums, and returns the staged paths on success.
func (c *Client) Download(meta *UpdateMetadata) (*StagedImage, error) {
	if err := os.MkdirAll(c.stageDir, 0o755); err != nil {
		return nil, fmt.Errorf("creating stage dir: %w", err)
	}

	staged := &StagedImage{
		VmlinuzPath:   filepath.Join(c.stageDir, "vmlinuz"),
		InitramfsPath: filepath.Join(c.stageDir, "kubesolo-os.gz"),
		Version:       meta.Version,
	}

	slog.Info("downloading vmlinuz", "url", meta.VmlinuzURL)
	if err := c.downloadAndVerify(meta.VmlinuzURL, staged.VmlinuzPath, meta.VmlinuzSHA256); err != nil {
		return nil, fmt.Errorf("downloading vmlinuz: %w", err)
	}

	slog.Info("downloading initramfs", "url", meta.InitramfsURL)
	if err := c.downloadAndVerify(meta.InitramfsURL, staged.InitramfsPath, meta.InitramfsSHA256); err != nil {
		return nil, fmt.Errorf("downloading initramfs: %w", err)
	}

	return staged, nil
}
// Cleanup removes the stage directory and any staged update files in it.
func (c *Client) Cleanup() error {
	return os.RemoveAll(c.stageDir)
}
// downloadAndVerify streams url to dest while hashing the content, and
// rejects the download if the SHA256 digest does not match
// expectedSHA256 (an empty checksum skips verification).
//
// FIX: the download is written to a temporary file and renamed into
// place only after it is complete and verified, so dest is never left
// holding (or observed holding) a truncated or corrupt image. This also
// removes the previous redundant defer+explicit double Close on the
// destination file.
func (c *Client) downloadAndVerify(url, dest, expectedSHA256 string) error {
	resp, err := c.httpClient.Get(url)
	if err != nil {
		return fmt.Errorf("downloading %s: %w", url, err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("server returned %d for %s", resp.StatusCode, url)
	}

	// Temp name derived from dest so downloads of different files
	// never collide.
	tmp := dest + ".tmp"
	f, err := os.Create(tmp)
	if err != nil {
		return fmt.Errorf("creating %s: %w", tmp, err)
	}
	hasher := sha256.New()
	written, err := io.Copy(io.MultiWriter(f, hasher), resp.Body)
	if err != nil {
		f.Close()
		os.Remove(tmp)
		return fmt.Errorf("writing %s: %w", dest, err)
	}
	if err := f.Close(); err != nil {
		os.Remove(tmp)
		return fmt.Errorf("closing %s: %w", dest, err)
	}

	// Verify checksum before publishing the file.
	if expectedSHA256 != "" {
		actual := hex.EncodeToString(hasher.Sum(nil))
		if actual != expectedSHA256 {
			os.Remove(tmp)
			return fmt.Errorf("checksum mismatch for %s: expected %s, got %s", dest, expectedSHA256, actual)
		}
		slog.Debug("checksum verified", "file", dest, "sha256", actual)
	}

	// Atomic publish: rename replaces dest in one step on POSIX filesystems.
	if err := os.Rename(tmp, dest); err != nil {
		os.Remove(tmp)
		return fmt.Errorf("renaming %s to %s: %w", tmp, dest, err)
	}
	slog.Info("downloaded", "file", dest, "size", written)
	return nil
}
// VerifyFile checks the SHA256 checksum of an existing file.
func VerifyFile(path, expectedSHA256 string) error {
f, err := os.Open(path)
if err != nil {
return err
}
defer f.Close()
hasher := sha256.New()
if _, err := io.Copy(hasher, f); err != nil {
return err
}
actual := hex.EncodeToString(hasher.Sum(nil))
if actual != expectedSHA256 {
return fmt.Errorf("checksum mismatch: expected %s, got %s", expectedSHA256, actual)
}
return nil
}

View File

@@ -0,0 +1,241 @@
package image
import (
"crypto/sha256"
"encoding/hex"
"encoding/json"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"testing"
)
// TestCheckForUpdate verifies that metadata served at /latest.json is
// fetched and decoded with all fields intact.
func TestCheckForUpdate(t *testing.T) {
	want := UpdateMetadata{
		Version:         "1.2.0",
		VmlinuzURL:      "/vmlinuz",
		VmlinuzSHA256:   "abc123",
		InitramfsURL:    "/kubesolo-os.gz",
		InitramfsSHA256: "def456",
		ReleaseNotes:    "Bug fixes",
		ReleaseDate:     "2025-01-15",
	}
	// Serve the metadata only at the expected path; everything else 404s.
	handler := func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/latest.json" {
			http.NotFound(w, r)
			return
		}
		json.NewEncoder(w).Encode(want)
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	got, err := NewClient(srv.URL, "").CheckForUpdate()
	if err != nil {
		t.Fatal(err)
	}
	if got.Version != "1.2.0" {
		t.Errorf("expected version 1.2.0, got %s", got.Version)
	}
	if got.VmlinuzSHA256 != "abc123" {
		t.Errorf("expected vmlinuz sha abc123, got %s", got.VmlinuzSHA256)
	}
	if got.ReleaseNotes != "Bug fixes" {
		t.Errorf("expected release notes, got %s", got.ReleaseNotes)
	}
}
// TestCheckForUpdateMissingVersion: metadata without a Version field must
// be rejected.
func TestCheckForUpdateMissingVersion(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		json.NewEncoder(w).Encode(UpdateMetadata{})
	}))
	defer srv.Close()

	if _, err := NewClient(srv.URL, "").CheckForUpdate(); err == nil {
		t.Fatal("expected error for missing version")
	}
}
// TestCheckForUpdateServerError: any non-200 response must surface as an
// error from CheckForUpdate.
func TestCheckForUpdateServerError(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusInternalServerError)
	}))
	defer srv.Close()

	if _, err := NewClient(srv.URL, "").CheckForUpdate(); err == nil {
		t.Fatal("expected error for server error")
	}
}
// TestDownloadAndVerify runs the full happy path: fetch metadata, download
// both artifacts, and verify they land in the stage dir with correct content.
//
// Fix: the final os.ReadFile error was previously discarded (`data, _ :=`),
// so a read failure would have reported as a misleading "content mismatch".
func TestDownloadAndVerify(t *testing.T) {
	// Known payloads with precomputed checksums.
	vmlinuzContent := []byte("fake vmlinuz content for testing")
	initramfsContent := []byte("fake initramfs content for testing")
	vmlinuzHash := sha256.Sum256(vmlinuzContent)
	initramfsHash := sha256.Sum256(initramfsContent)
	meta := UpdateMetadata{
		Version:         "2.0.0",
		VmlinuzSHA256:   hex.EncodeToString(vmlinuzHash[:]),
		InitramfsSHA256: hex.EncodeToString(initramfsHash[:]),
	}
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/latest.json":
			// The artifact URLs must point back at this server; its host
			// is only known per-request, so fill them in here.
			m := meta
			m.VmlinuzURL = "http://" + r.Host + "/vmlinuz"
			m.InitramfsURL = "http://" + r.Host + "/kubesolo-os.gz"
			json.NewEncoder(w).Encode(m)
		case "/vmlinuz":
			w.Write(vmlinuzContent)
		case "/kubesolo-os.gz":
			w.Write(initramfsContent)
		default:
			http.NotFound(w, r)
		}
	}))
	defer server.Close()
	stageDir := filepath.Join(t.TempDir(), "stage")
	client := NewClient(server.URL, stageDir)
	defer client.Cleanup()
	// First get metadata
	gotMeta, err := client.CheckForUpdate()
	if err != nil {
		t.Fatal(err)
	}
	// Download
	staged, err := client.Download(gotMeta)
	if err != nil {
		t.Fatal(err)
	}
	if staged.Version != "2.0.0" {
		t.Errorf("expected version 2.0.0, got %s", staged.Version)
	}
	// Verify files exist
	if _, err := os.Stat(staged.VmlinuzPath); err != nil {
		t.Errorf("vmlinuz not found: %v", err)
	}
	if _, err := os.Stat(staged.InitramfsPath); err != nil {
		t.Errorf("initramfs not found: %v", err)
	}
	// Verify content — and check the read error instead of discarding it.
	data, err := os.ReadFile(staged.VmlinuzPath)
	if err != nil {
		t.Fatalf("reading staged vmlinuz: %v", err)
	}
	if string(data) != string(vmlinuzContent) {
		t.Error("vmlinuz content mismatch")
	}
}
// TestDownloadChecksumMismatch: the served bytes cannot hash to the bogus
// expected value, so Download must fail verification.
func TestDownloadChecksumMismatch(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/vmlinuz" {
			w.Write([]byte("actual content"))
			return
		}
		http.NotFound(w, r)
	}))
	defer srv.Close()

	client := NewClient(srv.URL, filepath.Join(t.TempDir(), "stage"))
	meta := &UpdateMetadata{
		Version:       "1.0.0",
		VmlinuzURL:    srv.URL + "/vmlinuz",
		VmlinuzSHA256: "wrong_checksum_value",
		InitramfsURL:  srv.URL + "/initramfs",
	}
	if _, err := client.Download(meta); err == nil {
		t.Fatal("expected checksum mismatch error")
	}
}
// TestVerifyFile checks both the accept and reject paths of VerifyFile
// against a file with a known SHA256.
func TestVerifyFile(t *testing.T) {
	content := []byte("test file content for verification")
	sum := sha256.Sum256(content)

	path := filepath.Join(t.TempDir(), "testfile")
	if err := os.WriteFile(path, content, 0o644); err != nil {
		t.Fatal(err)
	}
	// Should pass with correct hash
	if err := VerifyFile(path, hex.EncodeToString(sum[:])); err != nil {
		t.Errorf("expected verification to pass: %v", err)
	}
	// Should fail with wrong hash
	if err := VerifyFile(path, "deadbeef"); err == nil {
		t.Error("expected verification to fail with wrong hash")
	}
}
// TestVerifyFileNotFound: a missing file must produce an error, never a
// silent pass.
func TestVerifyFileNotFound(t *testing.T) {
	if err := VerifyFile("/nonexistent/file", "abc123"); err == nil {
		t.Error("expected error for nonexistent file")
	}
}
// TestCleanup verifies that Cleanup removes the entire stage directory.
//
// Fix: the fixture-setup errors from MkdirAll and WriteFile were ignored,
// so a failed setup could have made the test pass vacuously (nothing to
// remove). They now fail fast via t.Fatal.
func TestCleanup(t *testing.T) {
	stageDir := filepath.Join(t.TempDir(), "stage")
	if err := os.MkdirAll(stageDir, 0o755); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(filepath.Join(stageDir, "test"), []byte("data"), 0o644); err != nil {
		t.Fatal(err)
	}
	client := NewClient("http://unused", stageDir)
	if err := client.Cleanup(); err != nil {
		t.Fatal(err)
	}
	if _, err := os.Stat(stageDir); !os.IsNotExist(err) {
		t.Error("stage dir should be removed after cleanup")
	}
}
// TestUpdateMetadataJSON: metadata must survive a JSON round trip with the
// spot-checked fields unchanged.
func TestUpdateMetadataJSON(t *testing.T) {
	original := UpdateMetadata{
		Version:         "1.0.0",
		VmlinuzURL:      "https://example.com/vmlinuz",
		VmlinuzSHA256:   "abc",
		InitramfsURL:    "https://example.com/kubesolo-os.gz",
		InitramfsSHA256: "def",
		ReleaseNotes:    "Initial release",
		ReleaseDate:     "2025-01-01",
	}
	encoded, err := json.Marshal(original)
	if err != nil {
		t.Fatal(err)
	}
	var decoded UpdateMetadata
	if err := json.Unmarshal(encoded, &decoded); err != nil {
		t.Fatal(err)
	}
	if decoded.Version != original.Version {
		t.Errorf("version mismatch: %s != %s", decoded.Version, original.Version)
	}
	if decoded.ReleaseDate != original.ReleaseDate {
		t.Errorf("release date mismatch: %s != %s", decoded.ReleaseDate, original.ReleaseDate)
	}
}

View File

@@ -0,0 +1,139 @@
// Package partition detects and manages A/B system partitions.
//
// It identifies System A and System B partitions by label (KSOLOA, KSOLOB)
// and provides mount/write operations for the update process.
package partition
import (
	"fmt"
	"io"
	"log/slog"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
)
// Filesystem labels assigned when the disk image is created. The agent
// locates partitions by these labels (via blkid) rather than by device
// path, which may differ between machines.
const (
	LabelSystemA = "KSOLOA" // system slot A
	LabelSystemB = "KSOLOB" // system slot B
	LabelData = "KSOLODATA" // persistent data partition
	LabelEFI = "KSOLOEFI" // EFI system partition
)
// Info contains information about a partition.
//
// GetSlotPartition fills Device, Label and Slot; it leaves MountPoint
// empty (the caller mounts the device separately).
type Info struct {
	Device string // e.g. /dev/sda2
	Label string // e.g. KSOLOA
	MountPoint string // current mount point, empty if not mounted
	Slot string // "A" or "B"
}
// FindByLabel locates a block device by its filesystem label.
//
// It shells out to `blkid -L <label>` and returns the trimmed device path
// (e.g. /dev/sda2), or an error when no such partition exists.
func FindByLabel(label string) (string, error) {
	out, err := exec.Command("blkid", "-L", label).Output()
	if err != nil {
		return "", fmt.Errorf("partition with label %q not found: %w", label, err)
	}
	return strings.TrimSpace(string(out)), nil
}
// GetSlotPartition returns the partition info for the given slot ("A" or "B").
//
// Any other slot name is rejected; on success the returned Info carries the
// device path resolved by label, with MountPoint left empty.
func GetSlotPartition(slot string) (*Info, error) {
	slotLabels := map[string]string{
		"A": LabelSystemA,
		"B": LabelSystemB,
	}
	label, ok := slotLabels[slot]
	if !ok {
		return nil, fmt.Errorf("invalid slot: %q", slot)
	}
	dev, err := FindByLabel(label)
	if err != nil {
		return nil, err
	}
	info := &Info{
		Device: dev,
		Label:  label,
		Slot:   slot,
	}
	return info, nil
}
// MountReadOnly mounts a partition read-only at the given mount point.
//
// The mount point directory is created if needed; mount(8) output is
// included in the error for diagnostics.
func MountReadOnly(dev, mountPoint string) error {
	if err := os.MkdirAll(mountPoint, 0o755); err != nil {
		return fmt.Errorf("creating mount point: %w", err)
	}
	out, err := exec.Command("mount", "-o", "ro", dev, mountPoint).CombinedOutput()
	if err != nil {
		return fmt.Errorf("mounting %s at %s: %w\n%s", dev, mountPoint, err, out)
	}
	slog.Debug("mounted", "device", dev, "mountpoint", mountPoint, "mode", "ro")
	return nil
}
// MountReadWrite mounts a partition read-write at the given mount point.
//
// The mount point directory is created if needed; mount(8) output is
// included in the error for diagnostics.
func MountReadWrite(dev, mountPoint string) error {
	if err := os.MkdirAll(mountPoint, 0o755); err != nil {
		return fmt.Errorf("creating mount point: %w", err)
	}
	out, err := exec.Command("mount", dev, mountPoint).CombinedOutput()
	if err != nil {
		return fmt.Errorf("mounting %s at %s: %w\n%s", dev, mountPoint, err, out)
	}
	slog.Debug("mounted", "device", dev, "mountpoint", mountPoint, "mode", "rw")
	return nil
}
// Unmount unmounts a mount point via umount(8), including the command's
// output in any error.
func Unmount(mountPoint string) error {
	out, err := exec.Command("umount", mountPoint).CombinedOutput()
	if err != nil {
		return fmt.Errorf("unmounting %s: %w\n%s", mountPoint, err, out)
	}
	return nil
}
// ReadVersion reads the version file from a mounted system partition.
func ReadVersion(mountPoint string) (string, error) {
data, err := os.ReadFile(filepath.Join(mountPoint, "version"))
if err != nil {
return "", fmt.Errorf("reading version: %w", err)
}
return strings.TrimSpace(string(data)), nil
}
// WriteSystemImage copies vmlinuz and initramfs to a mounted partition.
//
// It also records the version string in a "version" file alongside the
// images and issues a best-effort sync so the data reaches the disk.
func WriteSystemImage(mountPoint, vmlinuzPath, initramfsPath, version string) error {
	artifacts := []struct {
		src   string // staged source file
		dest  string // filename on the partition
		label string // name used in error messages
	}{
		{vmlinuzPath, "vmlinuz", "vmlinuz"},
		{initramfsPath, "kubesolo-os.gz", "initramfs"},
	}
	for _, a := range artifacts {
		if err := copyFile(a.src, filepath.Join(mountPoint, a.dest)); err != nil {
			return fmt.Errorf("writing %s: %w", a.label, err)
		}
	}
	// Record which version now occupies this slot.
	if err := os.WriteFile(filepath.Join(mountPoint, "version"), []byte(version+"\n"), 0o644); err != nil {
		return fmt.Errorf("writing version: %w", err)
	}
	// Best-effort flush to disk; errors are intentionally ignored.
	exec.Command("sync").Run()
	slog.Info("system image written", "mountpoint", mountPoint, "version", version)
	return nil
}
func copyFile(src, dst string) error {
data, err := os.ReadFile(src)
if err != nil {
return err
}
return os.WriteFile(dst, data, 0o644)
}

View File

@@ -0,0 +1,129 @@
package partition
import (
"os"
"path/filepath"
"testing"
)
// TestReadVersion: the trailing newline written with the version file must
// be stripped by ReadVersion.
func TestReadVersion(t *testing.T) {
	mount := t.TempDir()
	if err := os.WriteFile(filepath.Join(mount, "version"), []byte("1.2.3\n"), 0o644); err != nil {
		t.Fatal(err)
	}
	got, err := ReadVersion(mount)
	if err != nil {
		t.Fatal(err)
	}
	if got != "1.2.3" {
		t.Errorf("expected 1.2.3, got %s", got)
	}
}
// TestReadVersionMissing: an empty mount point has no version file, so
// ReadVersion must fail.
func TestReadVersionMissing(t *testing.T) {
	if _, err := ReadVersion(t.TempDir()); err == nil {
		t.Fatal("expected error for missing version file")
	}
}
// TestWriteSystemImage stages fake kernel/initramfs files, writes them to a
// temp "partition", and verifies content plus the recorded version.
func TestWriteSystemImage(t *testing.T) {
	target := t.TempDir()
	staging := t.TempDir()
	// Create source files
	kernelSrc := filepath.Join(staging, "vmlinuz")
	initrdSrc := filepath.Join(staging, "kubesolo-os.gz")
	if err := os.WriteFile(kernelSrc, []byte("kernel data"), 0o644); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(initrdSrc, []byte("initramfs data"), 0o644); err != nil {
		t.Fatal(err)
	}
	if err := WriteSystemImage(target, kernelSrc, initrdSrc, "2.0.0"); err != nil {
		t.Fatal(err)
	}
	// Verify files were copied
	got, err := os.ReadFile(filepath.Join(target, "vmlinuz"))
	if err != nil {
		t.Fatal(err)
	}
	if string(got) != "kernel data" {
		t.Errorf("vmlinuz content mismatch")
	}
	got, err = os.ReadFile(filepath.Join(target, "kubesolo-os.gz"))
	if err != nil {
		t.Fatal(err)
	}
	if string(got) != "initramfs data" {
		t.Errorf("initramfs content mismatch")
	}
	// Verify version file
	version, err := ReadVersion(target)
	if err != nil {
		t.Fatal(err)
	}
	if version != "2.0.0" {
		t.Errorf("expected version 2.0.0, got %s", version)
	}
}
// TestCopyFile: a round trip through copyFile must preserve content exactly.
func TestCopyFile(t *testing.T) {
	workDir := t.TempDir()
	srcPath := filepath.Join(workDir, "src")
	dstPath := filepath.Join(workDir, "dst")
	if err := os.WriteFile(srcPath, []byte("test content"), 0o644); err != nil {
		t.Fatal(err)
	}
	if err := copyFile(srcPath, dstPath); err != nil {
		t.Fatal(err)
	}
	copied, err := os.ReadFile(dstPath)
	if err != nil {
		t.Fatal(err)
	}
	if string(copied) != "test content" {
		t.Errorf("copy content mismatch")
	}
}
// TestCopyFileNotFound: copying from a path that does not exist must fail.
func TestCopyFileNotFound(t *testing.T) {
	if err := copyFile("/nonexistent", filepath.Join(t.TempDir(), "dst")); err == nil {
		t.Fatal("expected error for nonexistent source")
	}
}
// TestGetSlotPartitionInvalid: only slots "A" and "B" are valid.
func TestGetSlotPartitionInvalid(t *testing.T) {
	if _, err := GetSlotPartition("C"); err == nil {
		t.Fatal("expected error for invalid slot")
	}
}
// TestConstants pins the partition labels: they are baked into shipped disk
// images, so they must never drift.
func TestConstants(t *testing.T) {
	cases := []struct {
		name string
		got  string
		want string
	}{
		{"LabelSystemA", LabelSystemA, "KSOLOA"},
		{"LabelSystemB", LabelSystemB, "KSOLOB"},
		{"LabelData", LabelData, "KSOLODATA"},
		{"LabelEFI", LabelEFI, "KSOLOEFI"},
	}
	for _, c := range cases {
		if c.got != c.want {
			t.Errorf("unexpected %s: %s", c.name, c.got)
		}
	}
}