feat: add A/B partition updates with GRUB and Go update agent (Phase 3)

Implement atomic OS updates via an A/B partition scheme with automatic
rollback. The GRUB bootloader manages slot selection using a 3-attempt
boot counter and rolls back automatically after repeated health check failures.

GRUB boot config:
- A/B slot selection with boot_counter/boot_success env vars
- Automatic rollback when counter reaches 0 (3 failed boots)
- Debug, emergency shell, and manual slot-switch menu entries

Disk image (refactored):
- 4-partition GPT layout: EFI + System A + System B + Data
- GRUB EFI/BIOS installation with graceful fallbacks
- Both system partitions populated during image creation

Update agent (Go, zero external deps):
- pkg/grubenv: read/write GRUB env vars (grub-editenv + manual fallback)
- pkg/partition: find/mount/write system partitions by label
- pkg/image: HTTP download with SHA256 verification
- pkg/health: post-boot checks (containerd, API server, node Ready)
- 6 CLI commands: check, apply, activate, rollback, healthcheck, status
- 37 unit tests across all 4 packages

Deployment:
- K8s CronJob for automatic update checks (every 6 hours)
- ConfigMap for update server URL
- Health check Job for post-boot verification

Build pipeline:
- build-update-agent.sh compiles static Linux binary (~5.9 MB)
- inject-kubesolo.sh includes update agent in initramfs
- Makefile: build-update-agent, test-update-agent, test-update targets

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-11 11:12:46 -06:00
parent d900fa920e
commit 8d25e1890e
25 changed files with 2807 additions and 74 deletions

40
update/cmd/activate.go Normal file
View File

@@ -0,0 +1,40 @@
package cmd
import (
"fmt"
"log/slog"
"github.com/portainer/kubesolo-os/update/pkg/grubenv"
)
// Activate makes the passive slot the boot target for the next reboot.
//
// It resolves the passive and active slot names from the GRUB environment
// file selected by the parsed options, asks grubenv to activate the passive
// slot, and prints a short summary. The summary tells the operator that the
// boot counter is set to 3 and that the system rolls back automatically if
// health checks fail 3 times.
//
// It returns a wrapped error if either slot cannot be read or if the
// activation itself fails.
func Activate(args []string) error {
	cfg := parseOpts(args)
	grub := grubenv.New(cfg.GrubenvPath)

	// The passive slot is the one we are about to boot into.
	target, err := grub.PassiveSlot()
	if err != nil {
		return fmt.Errorf("reading passive slot: %w", err)
	}

	// The active slot is only needed for logging and the summary line.
	current, err := grub.ActiveSlot()
	if err != nil {
		return fmt.Errorf("reading active slot: %w", err)
	}

	slog.Info("activating slot", "from", current, "to", target)

	// Hand the target slot to grubenv, which records it as the new boot target.
	err = grub.ActivateSlot(target)
	if err != nil {
		return fmt.Errorf("activating slot %s: %w", target, err)
	}

	fmt.Printf("Slot %s activated (was %s)\n", target, current)
	fmt.Println("Boot counter set to 3. Reboot to start the new version.")
	fmt.Println("The system will automatically roll back if health checks fail 3 times.")
	return nil
}

70
update/cmd/apply.go Normal file
View File

@@ -0,0 +1,70 @@
package cmd
import (
"fmt"
"log/slog"
"github.com/portainer/kubesolo-os/update/pkg/grubenv"
"github.com/portainer/kubesolo-os/update/pkg/image"
"github.com/portainer/kubesolo-os/update/pkg/partition"
)
// Apply fetches the latest OS image from the update server and writes it to
// the passive partition. It does NOT activate the new partition — use
// 'activate' for that.
//
// Steps, in order: resolve the passive slot, ask the server for update
// metadata, download the image into a staging directory, mount the passive
// partition read-write, and write the kernel and initramfs to it. The staging
// client is cleaned up and the partition unmounted when the function returns.
//
// It returns an error if --server was not supplied or if any step fails.
func Apply(args []string) error {
	cfg := parseOpts(args)
	if cfg.ServerURL == "" {
		return fmt.Errorf("--server is required")
	}

	// Downloads are staged here before being copied to the partition.
	const stageDir = "/tmp/kubesolo-update-stage"

	grub := grubenv.New(cfg.GrubenvPath)

	// The passive slot is the write target; the running slot is left untouched.
	target, err := grub.PassiveSlot()
	if err != nil {
		return fmt.Errorf("reading passive slot: %w", err)
	}
	slog.Info("applying update", "target_slot", target)

	// Query the update server.
	updater := image.NewClient(cfg.ServerURL, stageDir)
	defer updater.Cleanup()

	latest, err := updater.CheckForUpdate()
	if err != nil {
		return fmt.Errorf("checking for update: %w", err)
	}
	slog.Info("update available", "version", latest.Version)

	// Fetch the image described by the metadata.
	staged, err := updater.Download(latest)
	if err != nil {
		return fmt.Errorf("downloading update: %w", err)
	}

	// Locate and mount the passive partition.
	part, err := partition.GetSlotPartition(target)
	if err != nil {
		return fmt.Errorf("finding passive partition: %w", err)
	}

	mnt := "/tmp/kubesolo-passive-" + target
	err = partition.MountReadWrite(part.Device, mnt)
	if err != nil {
		return fmt.Errorf("mounting passive partition: %w", err)
	}
	defer partition.Unmount(mnt)

	// Copy the staged kernel and initramfs onto the mounted partition.
	err = partition.WriteSystemImage(mnt, staged.VmlinuzPath, staged.InitramfsPath, staged.Version)
	if err != nil {
		return fmt.Errorf("writing system image: %w", err)
	}

	fmt.Printf("Update v%s written to slot %s (%s)\n", staged.Version, target, part.Device)
	fmt.Println("Run 'kubesolo-update activate' to boot into the new version")
	return nil
}

65
update/cmd/check.go Normal file
View File

@@ -0,0 +1,65 @@
package cmd
import (
"fmt"
"log/slog"
"github.com/portainer/kubesolo-os/update/pkg/grubenv"
"github.com/portainer/kubesolo-os/update/pkg/image"
"github.com/portainer/kubesolo-os/update/pkg/partition"
)
// Check reports whether the update server offers a version different from
// the one installed on the active partition.
//
// It mounts the active partition read-only to read the installed version;
// if that read fails, a warning is logged and the version is reported as
// "unknown". It then fetches the latest metadata from the server and prints
// both versions, a status line, and release notes when the server supplies
// them and the versions differ.
//
// It returns an error if --server was not supplied, if the active slot or
// its partition cannot be resolved or mounted, or if the server query fails.
func Check(args []string) error {
	cfg := parseOpts(args)
	if cfg.ServerURL == "" {
		return fmt.Errorf("--server is required (no default update server configured)")
	}

	// Resolve and mount the active partition to learn the installed version.
	grub := grubenv.New(cfg.GrubenvPath)
	slot, err := grub.ActiveSlot()
	if err != nil {
		return fmt.Errorf("reading active slot: %w", err)
	}

	part, err := partition.GetSlotPartition(slot)
	if err != nil {
		return fmt.Errorf("finding active partition: %w", err)
	}

	mnt := "/tmp/kubesolo-check-" + slot
	err = partition.MountReadOnly(part.Device, mnt)
	if err != nil {
		return fmt.Errorf("mounting active partition: %w", err)
	}
	defer partition.Unmount(mnt)

	// A missing version is not fatal: we can still report what the server offers.
	installed, err := partition.ReadVersion(mnt)
	if err != nil {
		slog.Warn("could not read current version", "error", err)
		installed = "unknown"
	}

	// Ask the update server for the latest release metadata.
	updater := image.NewClient(cfg.ServerURL, "")
	latest, err := updater.CheckForUpdate()
	if err != nil {
		return fmt.Errorf("checking for update: %w", err)
	}

	fmt.Printf("Current version: %s (slot %s)\n", installed, slot)
	fmt.Printf("Latest version: %s\n", latest.Version)

	if latest.Version == installed {
		fmt.Println("Status: up to date")
		return nil
	}

	fmt.Println("Status: update available")
	if latest.ReleaseNotes != "" {
		fmt.Printf("Release notes: %s\n", latest.ReleaseNotes)
	}
	return nil
}

56
update/cmd/healthcheck.go Normal file
View File

@@ -0,0 +1,56 @@
package cmd
import (
"fmt"
"log/slog"
"time"
"github.com/portainer/kubesolo-os/update/pkg/grubenv"
"github.com/portainer/kubesolo-os/update/pkg/health"
)
// Healthcheck runs the post-boot health checks and, when they pass, marks
// the current boot as successful in the GRUB environment.
//
// If the boot is already marked successful, it prints a notice and returns
// without running any checks. A failure to read the boot_success flag is
// logged as a warning and does not stop the checks. The checker is given the
// --timeout value converted to a time.Duration in seconds.
//
// It is intended to be run after every boot (typically from a systemd unit
// or init script). It returns the checker's error when the system is not
// healthy, or a wrapped error if the success marker cannot be written.
func Healthcheck(args []string) error {
	cfg := parseOpts(args)
	grub := grubenv.New(cfg.GrubenvPath)

	// Skip the checks entirely when this boot has already been confirmed.
	alreadyOK, err := grub.BootSuccess()
	if err != nil {
		slog.Warn("could not read boot_success", "error", err)
	}
	if alreadyOK {
		fmt.Println("Boot already marked successful")
		return nil
	}

	limit := time.Duration(cfg.TimeoutSecs) * time.Second
	// NOTE(review): the two empty strings presumably select the checker's
	// defaults — confirm against pkg/health.
	checker := health.NewChecker("", "", limit)

	slog.Info("running post-boot health checks", "timeout", limit)
	status, err := checker.WaitForHealthy()

	// printComponents lists the individual check results; the same lines are
	// shown on both the failure and the success path.
	printComponents := func() {
		fmt.Printf(" containerd: %v\n", status.Containerd)
		fmt.Printf(" apiserver: %v\n", status.APIServer)
		fmt.Printf(" node_ready: %v\n", status.NodeReady)
	}

	if err != nil {
		fmt.Printf("Health check FAILED: %s\n", status.Message)
		printComponents()
		fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot")
		return err
	}

	// Record the success before reporting it.
	if markErr := grub.MarkBootSuccess(); markErr != nil {
		return fmt.Errorf("marking boot success: %w", markErr)
	}

	fmt.Println("Health check PASSED — boot marked successful")
	printComponents()
	return nil
}

47
update/cmd/opts.go Normal file
View File

@@ -0,0 +1,47 @@
package cmd
// opts holds the command-line options shared by all subcommands.
type opts struct {
	ServerURL   string // value of --server; empty when the flag is not given
	GrubenvPath string // value of --grubenv; defaults to /boot/grub/grubenv
	TimeoutSecs int    // value of --timeout in seconds; defaults to 120
}

// parseOpts extracts the shared command-line flags from args.
//
// Recognized flags are --server, --grubenv and --timeout, each followed by
// its value as a separate argument. Unrecognized arguments are skipped, and a
// flag that appears last with no value after it is ignored.
//
// --timeout must be a positive base-10 integer made only of ASCII digits.
// Anything else (a sign, a unit suffix such as "30s", a decimal point, zero,
// or a number too large for int) leaves the default in place rather than
// being partially interpreted.
//
// Simple parser — no external dependencies.
func parseOpts(args []string) opts {
	o := opts{
		GrubenvPath: "/boot/grub/grubenv",
		TimeoutSecs: 120,
	}

	for i := 0; i < len(args); i++ {
		// Every recognized flag consumes the following argument as its value.
		hasValue := i+1 < len(args)

		switch args[i] {
		case "--server":
			if hasValue {
				o.ServerURL = args[i+1]
				i++
			}
		case "--grubenv":
			if hasValue {
				o.GrubenvPath = args[i+1]
				i++
			}
		case "--timeout":
			if hasValue {
				if secs, ok := parsePositiveInt(args[i+1]); ok {
					o.TimeoutSecs = secs
				}
				// The value is consumed even when invalid so it is not
				// re-examined as a flag on the next iteration.
				i++
			}
		}
	}

	return o
}

// parsePositiveInt parses s as a base-10 integer consisting solely of ASCII
// digits. It reports false for empty input, any non-digit character
// (including a leading sign), a value of zero, or a value that does not fit
// in an int.
func parsePositiveInt(s string) (int, bool) {
	const maxInt = int(^uint(0) >> 1)

	if s == "" {
		return 0, false
	}

	n := 0
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c < '0' || c > '9' {
			return 0, false
		}
		d := int(c - '0')
		// Reject before multiplying so that n*10+d can never wrap around.
		if n > (maxInt-d)/10 {
			return 0, false
		}
		n = n*10 + d
	}

	if n == 0 {
		return 0, false
	}
	return n, true
}

36
update/cmd/rollback.go Normal file
View File

@@ -0,0 +1,36 @@
package cmd
import (
"fmt"
"log/slog"
"github.com/portainer/kubesolo-os/update/pkg/grubenv"
)
// Rollback manually switches the boot target to the other partition so the
// previous version starts on the next reboot.
//
// It reads the active and passive slot names for the log line and summary,
// then calls grubenv's ForceRollback to perform the switch. It returns a
// wrapped error if either slot cannot be read or if the rollback fails.
func Rollback(args []string) error {
	cfg := parseOpts(args)
	grub := grubenv.New(cfg.GrubenvPath)

	// Slot names are captured before the switch so the output shows the
	// direction of the change.
	from, err := grub.ActiveSlot()
	if err != nil {
		return fmt.Errorf("reading active slot: %w", err)
	}

	to, err := grub.PassiveSlot()
	if err != nil {
		return fmt.Errorf("reading passive slot: %w", err)
	}

	slog.Info("forcing rollback", "from", from, "to", to)

	err = grub.ForceRollback()
	if err != nil {
		return fmt.Errorf("rollback failed: %w", err)
	}

	fmt.Printf("Rolled back: %s → %s\n", from, to)
	fmt.Println("Reboot to complete rollback.")
	return nil
}

44
update/cmd/status.go Normal file
View File

@@ -0,0 +1,44 @@
package cmd
import (
"fmt"
"github.com/portainer/kubesolo-os/update/pkg/grubenv"
)
// Status prints the A/B slot configuration and boot state stored in the
// GRUB environment.
//
// It reads every variable from the environment file and shows active_slot,
// boot_counter and boot_success, plus a passive slot derived from the active
// one ("A" when the active slot is "B", otherwise "B"). A closing line
// summarises the state: confirmed healthy when boot_success is "1",
// rollback pending when boot_counter is "0", and awaiting verification
// otherwise.
//
// It returns a wrapped error if the environment cannot be read.
func Status(args []string) error {
	cfg := parseOpts(args)
	grub := grubenv.New(cfg.GrubenvPath)

	all, err := grub.ReadAll()
	if err != nil {
		return fmt.Errorf("reading GRUB environment: %w", err)
	}

	active := all["active_slot"]
	counter := all["boot_counter"]
	success := all["boot_success"]

	// The passive slot is not stored; it is whichever slot is not active.
	var passive string
	if active == "B" {
		passive = "A"
	} else {
		passive = "B"
	}

	fmt.Println("KubeSolo OS — A/B Partition Status")
	fmt.Println("───────────────────────────────────")
	fmt.Printf(" Active slot: %s\n", active)
	fmt.Printf(" Passive slot: %s\n", passive)
	fmt.Printf(" Boot counter: %s\n", counter)
	fmt.Printf(" Boot success: %s\n", success)

	// A confirmed boot takes precedence over the counter value.
	switch {
	case success == "1":
		fmt.Println("\n ✓ System is healthy (boot confirmed)")
	case counter == "0":
		fmt.Println("\n ✗ Boot counter exhausted — rollback will occur on next reboot")
	default:
		fmt.Printf("\n ⚠ Boot pending verification (%s attempts remaining)\n", counter)
	}

	return nil
}