feat: add A/B partition updates with GRUB and Go update agent (Phase 3)

Implement atomic OS updates via A/B partition scheme with automatic rollback. GRUB bootloader manages slot selection with a 3-attempt boot counter that auto-rolls back on repeated health check failures. GRUB boot config: - A/B slot selection with boot_counter/boot_success env vars - Automatic rollback when counter reaches 0 (3 failed boots) - Debug, emergency shell, and manual slot-switch menu entries Disk image (refactored): - 4-partition GPT layout: EFI + System A + System B + Data - GRUB EFI/BIOS installation with graceful fallbacks - Both system partitions populated during image creation Update agent (Go, zero external deps): - pkg/grubenv: read/write GRUB env vars (grub-editenv + manual fallback) - pkg/partition: find/mount/write system partitions by label - pkg/image: HTTP download with SHA256 verification - pkg/health: post-boot checks (containerd, API server, node Ready) - 6 CLI commands: check, apply, activate, rollback, healthcheck, status - 37 unit tests across all 4 packages Deployment: - K8s CronJob for automatic update checks (every 6 hours) - ConfigMap for update server URL - Health check Job for post-boot verification Build pipeline: - build-update-agent.sh compiles static Linux binary (~5.9 MB) - inject-kubesolo.sh includes update agent in initramfs - Makefile: build-update-agent, test-update-agent, test-update targets Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 11:12:46 -06:00
parent d900fa920e
commit 8d25e1890e
25 changed files with 2807 additions and 74 deletions
--- a/update/pkg/grubenv/grubenv.go
+++ b/update/pkg/grubenv/grubenv.go
@@ -0,0 +1,239 @@
+// Package grubenv provides read/write access to GRUB environment variables.
+//
+// GRUB stores its environment in a 1024-byte file (grubenv) located at
+// /boot/grub/grubenv on the EFI partition. This package manipulates
+// those variables for A/B boot slot management.
+//
+// Key variables:
+//   - active_slot:  "A" or "B"
+//   - boot_counter: "3" (fresh) down to "0" (triggers rollback)
+//   - boot_success: "0" (pending) or "1" (healthy boot confirmed)
+package grubenv
+
+import (
+	"fmt"
+	"log/slog"
+	"os"
+	"os/exec"
+	"strings"
+)
+
+const (
+	// DefaultGrubenvPath is the standard location for the GRUB environment file.
+	// New falls back to it when called with an empty path.
+	DefaultGrubenvPath = "/boot/grub/grubenv"
+
+	// SlotA represents system partition A (the "A" value of active_slot).
+	SlotA = "A"
+	// SlotB represents system partition B (the "B" value of active_slot).
+	SlotB = "B"
+)
+
+// Env provides access to the GRUB environment variables stored in a single
+// grubenv file. Create one with New.
+type Env struct {
+	// path is the location of the grubenv file that is read and written.
+	path string
+}
+
+// New returns an Env bound to the grubenv file at path.
+// An empty path selects DefaultGrubenvPath.
+func New(path string) *Env {
+	env := &Env{path: DefaultGrubenvPath}
+	if path != "" {
+		env.path = path
+	}
+	return env
+}
+
+// Get looks up a single variable in the GRUB environment.
+// It returns an error when the environment cannot be read or when key is
+// not present.
+func (e *Env) Get(key string) (string, error) {
+	all, err := e.ReadAll()
+	if err != nil {
+		return "", err
+	}
+	if value, found := all[key]; found {
+		return value, nil
+	}
+	return "", fmt.Errorf("grubenv: key %q not found", key)
+}
+
+// Set stores key=value in the GRUB environment.
+// It runs grub-editenv when the tool can be found on PATH and otherwise
+// rewrites the file directly through setManual.
+func (e *Env) Set(key, value string) error {
+	tool, lookupErr := findEditenv()
+	if lookupErr != nil {
+		// No grub-editenv available: use the pure-Go writer instead.
+		return e.setManual(key, value)
+	}
+
+	out, runErr := exec.Command(tool, e.path, "set", key+"="+value).CombinedOutput()
+	if runErr != nil {
+		return fmt.Errorf("grub-editenv set %s=%s: %w\n%s", key, value, runErr, out)
+	}
+
+	slog.Debug("grubenv set", "key", key, "value", value)
+	return nil
+}
+
+// ReadAll returns every variable in the GRUB environment as a map.
+// It parses the output of "grub-editenv <path> list" when the tool is
+// available and otherwise reads the file directly through readManual.
+func (e *Env) ReadAll() (map[string]string, error) {
+	tool, lookupErr := findEditenv()
+	if lookupErr != nil {
+		return e.readManual()
+	}
+
+	listing, runErr := exec.Command(tool, e.path, "list").Output()
+	if runErr != nil {
+		return nil, fmt.Errorf("grub-editenv list: %w", runErr)
+	}
+	return parseEnvOutput(string(listing)), nil
+}
+
+// ActiveSlot returns the currently active boot slot ("A" or "B").
+func (e *Env) ActiveSlot() (string, error) {
+	return e.Get("active_slot")
+}
+
+// PassiveSlot returns the slot that is not currently active.
+// It yields SlotB when the active slot is SlotA and SlotA for any other
+// active value.
+func (e *Env) PassiveSlot() (string, error) {
+	active, err := e.ActiveSlot()
+	switch {
+	case err != nil:
+		return "", err
+	case active == SlotA:
+		return SlotB, nil
+	default:
+		// Anything other than "A" (including "B") maps to slot A.
+		return SlotA, nil
+	}
+}
+
+// BootCounter returns boot_counter as an integer in the range 0..3.
+// It returns -1 and an error when the variable is missing or holds any
+// other text.
+func (e *Env) BootCounter() (int, error) {
+	raw, err := e.Get("boot_counter")
+	if err != nil {
+		return -1, err
+	}
+	// Only the single digits "0" through "3" are accepted.
+	if len(raw) == 1 && raw[0] >= '0' && raw[0] <= '3' {
+		return int(raw[0] - '0'), nil
+	}
+	return -1, fmt.Errorf("grubenv: invalid boot_counter: %q", raw)
+}
+
+// BootSuccess reports whether boot_success is exactly "1".
+// Any other stored value yields false; a missing variable yields an error.
+func (e *Env) BootSuccess() (bool, error) {
+	flag, err := e.Get("boot_success")
+	if err != nil {
+		return false, err
+	}
+	healthy := flag == "1"
+	return healthy, nil
+}
+
+// MarkBootSuccess records a healthy boot by writing boot_success=1 and then
+// boot_counter=3. Called by the health check after a successful boot.
+// The first failing write is returned, wrapped with the variable name.
+func (e *Env) MarkBootSuccess() error {
+	err := e.Set("boot_success", "1")
+	if err != nil {
+		return fmt.Errorf("setting boot_success: %w", err)
+	}
+
+	err = e.Set("boot_counter", "3")
+	if err != nil {
+		return fmt.Errorf("setting boot_counter: %w", err)
+	}
+
+	slog.Info("boot marked successful")
+	return nil
+}
+
+// ActivateSlot makes slot the active boot slot and resets the trial-boot
+// state: boot_success is cleared to 0 and boot_counter is reset to 3.
+// Used after writing a new image to the passive partition.
+//
+// The three variables are written one at a time, so the update is not
+// atomic. boot_success and boot_counter are written before active_slot so
+// that an interruption part-way through cannot leave the new slot selected
+// together with a stale boot_success=1 or a depleted counter.
+//
+// It returns an error if slot is neither SlotA nor SlotB, or if any write
+// fails; write errors are wrapped with the name of the variable involved.
+func (e *Env) ActivateSlot(slot string) error {
+	if slot != SlotA && slot != SlotB {
+		return fmt.Errorf("invalid slot: %q (must be A or B)", slot)
+	}
+	if err := e.Set("boot_success", "0"); err != nil {
+		return fmt.Errorf("setting boot_success: %w", err)
+	}
+	if err := e.Set("boot_counter", "3"); err != nil {
+		return fmt.Errorf("setting boot_counter: %w", err)
+	}
+	// Switch slots last, once the trial-boot state for the new slot is in place.
+	if err := e.Set("active_slot", slot); err != nil {
+		return fmt.Errorf("setting active_slot: %w", err)
+	}
+	slog.Info("activated slot", "slot", slot)
+	return nil
+}
+
+// ForceRollback activates whichever slot PassiveSlot currently reports,
+// going through ActivateSlot so the boot counter and success flag are reset
+// as well.
+func (e *Env) ForceRollback() error {
+	target, err := e.PassiveSlot()
+	if err == nil {
+		err = e.ActivateSlot(target)
+	}
+	return err
+}
+
+// findEditenv locates the grub-editenv binary on PATH, trying the name
+// "grub-editenv" first and "grub2-editenv" second. It returns an error when
+// neither is found.
+func findEditenv() (string, error) {
+	for _, name := range []string{"grub-editenv", "grub2-editenv"} {
+		if resolved, err := exec.LookPath(name); err == nil {
+			return resolved, nil
+		}
+	}
+	return "", fmt.Errorf("grub-editenv not found")
+}
+
+// parseEnvOutput converts newline-separated "key=value" text into a map.
+// Each line is trimmed of surrounding whitespace; blank lines, lines that
+// start with '#', and lines without '=' are ignored. Only the first '='
+// splits a line, so values may themselves contain '='.
+func parseEnvOutput(output string) map[string]string {
+	result := make(map[string]string)
+	for _, raw := range strings.Split(output, "\n") {
+		entry := strings.TrimSpace(raw)
+		if entry == "" || entry[0] == '#' {
+			continue
+		}
+		if name, value, ok := strings.Cut(entry, "="); ok {
+			result[name] = value
+		}
+	}
+	return result
+}
+
+// setManual writes to grubenv without grub-editenv (fallback).
+func (e *Env) setManual(key, value string) error {
+	vars, err := e.readManual()
+	if err != nil {
+		vars = make(map[string]string)
+	}
+	vars[key] = value
+	return e.writeManual(vars)
+}
+
+// readManual loads the grubenv file from disk and parses it with
+// parseEnvOutput, without invoking grub-editenv.
+func (e *Env) readManual() (map[string]string, error) {
+	raw, err := os.ReadFile(e.path)
+	if err == nil {
+		return parseEnvOutput(string(raw)), nil
+	}
+	return nil, fmt.Errorf("reading grubenv: %w", err)
+}
+
+// writeManual writes grubenv without grub-editenv.
+// GRUB requires the file to be exactly 1024 bytes, padded with '#'.
+//
+// The output is the header line, one "key=value" line per entry of vars
+// (in map iteration order, which is unspecified), then '#' padding up to
+// 1024 bytes. It returns an error, without touching the file, when:
+//   - a key contains '=' or a newline, or a value contains a newline, since
+//     such an entry would read back as different variables;
+//   - the header plus entries exceed 1024 bytes.
+func (e *Env) writeManual(vars map[string]string) error {
+	const blockSize = 1024
+
+	var sb strings.Builder
+	sb.Grow(blockSize)
+	sb.WriteString("# GRUB Environment Block\n")
+	for k, v := range vars {
+		if strings.ContainsAny(k, "=\n") {
+			return fmt.Errorf("grubenv: invalid key %q", k)
+		}
+		if strings.Contains(v, "\n") {
+			return fmt.Errorf("grubenv: value for %q contains a newline", k)
+		}
+		sb.WriteString(k + "=" + v + "\n")
+	}
+
+	if sb.Len() > blockSize {
+		return fmt.Errorf("grubenv content exceeds 1024 bytes")
+	}
+
+	// Pad to 1024 bytes with '#'
+	sb.WriteString(strings.Repeat("#", blockSize-sb.Len()))
+
+	return os.WriteFile(e.path, []byte(sb.String()), 0o644)
+}
--- a/update/pkg/grubenv/grubenv_test.go
+++ b/update/pkg/grubenv/grubenv_test.go
@@ -0,0 +1,423 @@
+package grubenv
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+// createTestGrubenv writes a grubenv file named "grubenv" into dir and
+// returns its path. The file holds the header line, one "key=value" line per
+// entry of vars, and '#' padding up to 1024 bytes (the size GRUB requires).
+// Any write failure aborts the calling test.
+func createTestGrubenv(t *testing.T, dir string, vars map[string]string) string {
+	t.Helper()
+
+	var buf strings.Builder
+	buf.WriteString("# GRUB Environment Block\n")
+	for name, value := range vars {
+		buf.WriteString(name)
+		buf.WriteString("=")
+		buf.WriteString(value)
+		buf.WriteString("\n")
+	}
+	if fill := 1024 - buf.Len(); fill > 0 {
+		buf.WriteString(strings.Repeat("#", fill))
+	}
+
+	target := filepath.Join(dir, "grubenv")
+	if err := os.WriteFile(target, []byte(buf.String()), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	return target
+}
+
+func TestNew(t *testing.T) {
+	env := New("")
+	if env.path != DefaultGrubenvPath {
+		t.Errorf("expected default path %s, got %s", DefaultGrubenvPath, env.path)
+	}
+
+	env = New("/custom/path/grubenv")
+	if env.path != "/custom/path/grubenv" {
+		t.Errorf("expected custom path, got %s", env.path)
+	}
+}
+
+// TestReadAll checks that ReadAll returns every variable written to the
+// fixture file.
+func TestReadAll(t *testing.T) {
+	env := New(createTestGrubenv(t, t.TempDir(), map[string]string{
+		"active_slot":  "A",
+		"boot_counter": "3",
+		"boot_success": "1",
+	}))
+
+	got, err := env.ReadAll()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if slot := got["active_slot"]; slot != "A" {
+		t.Errorf("active_slot: expected A, got %s", slot)
+	}
+	if counter := got["boot_counter"]; counter != "3" {
+		t.Errorf("boot_counter: expected 3, got %s", counter)
+	}
+	if success := got["boot_success"]; success != "1" {
+		t.Errorf("boot_success: expected 1, got %s", success)
+	}
+}
+
+// TestGet checks that Get returns a stored value and reports an error for a
+// key that is not present.
+func TestGet(t *testing.T) {
+	env := New(createTestGrubenv(t, t.TempDir(), map[string]string{
+		"active_slot": "B",
+	}))
+
+	got, err := env.Get("active_slot")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got != "B" {
+		t.Errorf("expected B, got %s", got)
+	}
+
+	if _, err := env.Get("nonexistent"); err == nil {
+		t.Fatal("expected error for nonexistent key")
+	}
+}
+
+// TestSet checks that Set updates a variable and leaves the file at exactly
+// 1024 bytes.
+func TestSet(t *testing.T) {
+	file := createTestGrubenv(t, t.TempDir(), map[string]string{
+		"active_slot":  "A",
+		"boot_counter": "3",
+	})
+	env := New(file)
+
+	if err := env.Set("boot_counter", "2"); err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := env.Get("boot_counter")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got != "2" {
+		t.Errorf("expected 2 after set, got %s", got)
+	}
+
+	// The block must keep its fixed size after the rewrite.
+	raw, err := os.ReadFile(file)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if size := len(raw); size != 1024 {
+		t.Errorf("grubenv should be 1024 bytes, got %d", size)
+	}
+}
+
+// TestActiveSlot checks that ActiveSlot returns the stored active_slot value.
+func TestActiveSlot(t *testing.T) {
+	env := New(createTestGrubenv(t, t.TempDir(), map[string]string{
+		"active_slot":  "A",
+		"boot_counter": "3",
+		"boot_success": "1",
+	}))
+
+	got, err := env.ActiveSlot()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got != "A" {
+		t.Errorf("expected A, got %s", got)
+	}
+}
+
+// TestPassiveSlot checks that PassiveSlot returns the opposite of the active
+// slot for both A and B.
+func TestPassiveSlot(t *testing.T) {
+	cases := []struct{ active, passive string }{
+		{active: "A", passive: "B"},
+		{active: "B", passive: "A"},
+	}
+
+	for _, tc := range cases {
+		t.Run("active_"+tc.active, func(t *testing.T) {
+			env := New(createTestGrubenv(t, t.TempDir(), map[string]string{
+				"active_slot": tc.active,
+			}))
+
+			got, err := env.PassiveSlot()
+			if err != nil {
+				t.Fatal(err)
+			}
+			if got != tc.passive {
+				t.Errorf("expected passive %s, got %s", tc.passive, got)
+			}
+		})
+	}
+}
+
+// TestBootCounter checks that BootCounter accepts "0" through "3" and
+// rejects everything else.
+func TestBootCounter(t *testing.T) {
+	cases := []struct {
+		raw     string
+		want    int
+		wantErr bool
+	}{
+		{raw: "0", want: 0},
+		{raw: "1", want: 1},
+		{raw: "2", want: 2},
+		{raw: "3", want: 3},
+		{raw: "invalid", want: -1, wantErr: true},
+		{raw: "99", want: -1, wantErr: true},
+	}
+
+	for _, tc := range cases {
+		t.Run("counter_"+tc.raw, func(t *testing.T) {
+			env := New(createTestGrubenv(t, t.TempDir(), map[string]string{
+				"boot_counter": tc.raw,
+			}))
+
+			got, err := env.BootCounter()
+			switch {
+			case tc.wantErr && err == nil:
+				t.Fatal("expected error")
+			case tc.wantErr:
+				// The error was expected; nothing more to check.
+				return
+			case err != nil:
+				t.Fatal(err)
+			}
+			if got != tc.want {
+				t.Errorf("expected %d, got %d", tc.want, got)
+			}
+		})
+	}
+}
+
+// TestBootSuccess checks that BootSuccess maps "0" to false and "1" to true.
+func TestBootSuccess(t *testing.T) {
+	cases := []struct {
+		raw  string
+		want bool
+	}{
+		{raw: "0", want: false},
+		{raw: "1", want: true},
+	}
+
+	for _, tc := range cases {
+		t.Run("success_"+tc.raw, func(t *testing.T) {
+			env := New(createTestGrubenv(t, t.TempDir(), map[string]string{
+				"boot_success": tc.raw,
+			}))
+
+			got, err := env.BootSuccess()
+			if err != nil {
+				t.Fatal(err)
+			}
+			if got != tc.want {
+				t.Errorf("expected %v, got %v", tc.want, got)
+			}
+		})
+	}
+}
+
+// TestMarkBootSuccess checks that MarkBootSuccess leaves boot_success set
+// and boot_counter back at 3.
+func TestMarkBootSuccess(t *testing.T) {
+	env := New(createTestGrubenv(t, t.TempDir(), map[string]string{
+		"active_slot":  "B",
+		"boot_counter": "1",
+		"boot_success": "0",
+	}))
+
+	if err := env.MarkBootSuccess(); err != nil {
+		t.Fatal(err)
+	}
+
+	ok, err := env.BootSuccess()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !ok {
+		t.Error("expected boot_success=1 after MarkBootSuccess")
+	}
+
+	remaining, err := env.BootCounter()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if remaining != 3 {
+		t.Errorf("expected boot_counter=3 after MarkBootSuccess, got %d", remaining)
+	}
+}
+
+// TestActivateSlot checks that ActivateSlot switches active_slot, resets
+// boot_counter to 3 and clears boot_success. Getter errors are checked so
+// that a failed read cannot pass for the expected zero value.
+func TestActivateSlot(t *testing.T) {
+	dir := t.TempDir()
+	path := createTestGrubenv(t, dir, map[string]string{
+		"active_slot":  "A",
+		"boot_counter": "3",
+		"boot_success": "1",
+	})
+
+	env := New(path)
+	if err := env.ActivateSlot("B"); err != nil {
+		t.Fatal(err)
+	}
+
+	slot, err := env.ActiveSlot()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if slot != "B" {
+		t.Errorf("expected active_slot=B, got %s", slot)
+	}
+
+	counter, err := env.BootCounter()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if counter != 3 {
+		t.Errorf("expected boot_counter=3, got %d", counter)
+	}
+
+	success, err := env.BootSuccess()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if success {
+		t.Error("expected boot_success=0 after ActivateSlot")
+	}
+}
+
+// TestActivateSlotInvalid checks that ActivateSlot rejects a slot name
+// other than A or B.
+func TestActivateSlotInvalid(t *testing.T) {
+	env := New(createTestGrubenv(t, t.TempDir(), map[string]string{
+		"active_slot": "A",
+	}))
+
+	if err := env.ActivateSlot("C"); err == nil {
+		t.Fatal("expected error for invalid slot")
+	}
+}
+
+// TestForceRollback checks that ForceRollback switches from slot A to slot B
+// and, because it goes through ActivateSlot, also resets boot_counter to 3
+// and clears boot_success.
+func TestForceRollback(t *testing.T) {
+	dir := t.TempDir()
+	path := createTestGrubenv(t, dir, map[string]string{
+		"active_slot":  "A",
+		"boot_counter": "3",
+		"boot_success": "1",
+	})
+
+	env := New(path)
+	if err := env.ForceRollback(); err != nil {
+		t.Fatal(err)
+	}
+
+	slot, err := env.ActiveSlot()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if slot != "B" {
+		t.Errorf("expected active_slot=B after rollback from A, got %s", slot)
+	}
+
+	counter, err := env.BootCounter()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if counter != 3 {
+		t.Errorf("expected boot_counter=3 after rollback, got %d", counter)
+	}
+
+	success, err := env.BootSuccess()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if success {
+		t.Error("expected boot_success=0 after rollback")
+	}
+}
+
+// TestParseEnvOutput checks that parseEnvOutput skips the comment header and
+// blank lines and returns all three key=value pairs.
+func TestParseEnvOutput(t *testing.T) {
+	input := `# GRUB Environment Block
+active_slot=A
+boot_counter=3
+boot_success=1
+
+`
+	vars := parseEnvOutput(input)
+
+	if len(vars) != 3 {
+		t.Errorf("expected 3 variables, got %d", len(vars))
+	}
+	if vars["active_slot"] != "A" {
+		t.Errorf("active_slot: expected A, got %s", vars["active_slot"])
+	}
+	if vars["boot_counter"] != "3" {
+		t.Errorf("boot_counter: expected 3, got %s", vars["boot_counter"])
+	}
+	if vars["boot_success"] != "1" {
+		t.Errorf("boot_success: expected 1, got %s", vars["boot_success"])
+	}
+}
+
+// TestWriteManualFormat checks the on-disk layout produced by the manual
+// writer: exactly 1024 bytes, the header line first, and the variable on its
+// own line.
+func TestWriteManualFormat(t *testing.T) {
+	file := filepath.Join(t.TempDir(), "grubenv")
+
+	// Call setManual directly since grub-editenv may not be available.
+	if err := New(file).setManual("test_key", "test_value"); err != nil {
+		t.Fatal(err)
+	}
+
+	raw, err := os.ReadFile(file)
+	if err != nil {
+		t.Fatal(err)
+	}
+	text := string(raw)
+
+	if size := len(raw); size != 1024 {
+		t.Errorf("grubenv should be exactly 1024 bytes, got %d", size)
+	}
+	if !strings.HasPrefix(text, "# GRUB Environment Block\n") {
+		t.Error("grubenv should start with '# GRUB Environment Block'")
+	}
+	if !strings.Contains(text, "test_key=test_value\n") {
+		t.Error("grubenv should contain test_key=test_value")
+	}
+}
+
+// TestReadNonexistentFile checks that ReadAll fails when the grubenv file
+// does not exist.
+func TestReadNonexistentFile(t *testing.T) {
+	if _, err := New("/nonexistent/path/grubenv").ReadAll(); err == nil {
+		t.Fatal("expected error reading nonexistent file")
+	}
+}
+
+// TestMultipleSetOperations runs several writes in a row (a decremented
+// counter, a cleared success flag, then MarkBootSuccess) and checks the
+// final state of all three variables.
+func TestMultipleSetOperations(t *testing.T) {
+	env := New(createTestGrubenv(t, t.TempDir(), map[string]string{
+		"active_slot":  "A",
+		"boot_counter": "3",
+		"boot_success": "1",
+	}))
+
+	// Simulate a boot in progress: counter decremented, success not yet confirmed.
+	if err := env.Set("boot_counter", "2"); err != nil {
+		t.Fatal(err)
+	}
+	if err := env.Set("boot_success", "0"); err != nil {
+		t.Fatal(err)
+	}
+
+	// The health check passes.
+	if err := env.MarkBootSuccess(); err != nil {
+		t.Fatal(err)
+	}
+
+	final, err := env.ReadAll()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if slot := final["active_slot"]; slot != "A" {
+		t.Errorf("active_slot should still be A, got %s", slot)
+	}
+	if counter := final["boot_counter"]; counter != "3" {
+		t.Errorf("boot_counter should be 3 after mark success, got %s", counter)
+	}
+	if success := final["boot_success"]; success != "1" {
+		t.Errorf("boot_success should be 1, got %s", success)
+	}
+}