feat: add A/B partition updates with GRUB and Go update agent (Phase 3)

Implement atomic OS updates via A/B partition scheme with automatic
rollback. GRUB bootloader manages slot selection with a 3-attempt
boot counter that auto-rolls back on repeated health check failures.

GRUB boot config:
- A/B slot selection with boot_counter/boot_success env vars
- Automatic rollback when counter reaches 0 (3 failed boots)
- Debug, emergency shell, and manual slot-switch menu entries

Disk image (refactored):
- 4-partition GPT layout: EFI + System A + System B + Data
- GRUB EFI/BIOS installation with graceful fallbacks
- Both system partitions populated during image creation

Update agent (Go, zero external deps):
- pkg/grubenv: read/write GRUB env vars (grub-editenv + manual fallback)
- pkg/partition: find/mount/write system partitions by label
- pkg/image: HTTP download with SHA256 verification
- pkg/health: post-boot checks (containerd, API server, node Ready)
- 6 CLI commands: check, apply, activate, rollback, healthcheck, status
- 37 unit tests across all 4 packages

Deployment:
- K8s CronJob for automatic update checks (every 6 hours)
- ConfigMap for update server URL
- Health check Job for post-boot verification

Build pipeline:
- build-update-agent.sh compiles static Linux binary (~5.9 MB)
- inject-kubesolo.sh includes update agent in initramfs
- Makefile: build-update-agent, test-update-agent, test-update targets

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-11 11:12:46 -06:00
parent d900fa920e
commit 8d25e1890e
25 changed files with 2807 additions and 74 deletions

View File

@@ -0,0 +1,239 @@
// Package grubenv provides read/write access to GRUB environment variables.
//
// GRUB stores its environment in a 1024-byte file (grubenv) located at
// /boot/grub/grubenv on the EFI partition. This package manipulates
// those variables for A/B boot slot management.
//
// Key variables:
// - active_slot: "A" or "B"
// - boot_counter: "3" (fresh) down to "0" (triggers rollback)
// - boot_success: "0" (pending) or "1" (healthy boot confirmed)
package grubenv
import (
	"errors"
	"fmt"
	"log/slog"
	"os"
	"os/exec"
	"slices"
	"strings"
)
// DefaultGrubenvPath is the grubenv location used when no explicit path is
// supplied.
const DefaultGrubenvPath = "/boot/grub/grubenv"

// Boot slot identifiers, as stored in the active_slot variable.
const (
	// SlotA names system partition A.
	SlotA = "A"
	// SlotB names system partition B.
	SlotB = "B"
)
// Env provides access to GRUB environment variables.
//
// It holds only the location of the grubenv file; no variable values are
// kept in memory between method calls.
type Env struct {
	path string // filesystem path of the grubenv file
}
// New returns an Env bound to the grubenv file at path. An empty path
// selects DefaultGrubenvPath.
func New(path string) *Env {
	env := &Env{path: DefaultGrubenvPath}
	if path != "" {
		env.path = path
	}
	return env
}
// Get looks up a single variable in the GRUB environment. It returns an
// error when the environment cannot be read or when key is absent.
func (e *Env) Get(key string) (string, error) {
	all, err := e.ReadAll()
	if err != nil {
		return "", err
	}
	if value, found := all[key]; found {
		return value, nil
	}
	return "", fmt.Errorf("grubenv: key %q not found", key)
}
// Set stores key=value in the GRUB environment. The grub-editenv tool is
// invoked when it can be located; otherwise the file is rewritten directly.
func (e *Env) Set(key, value string) error {
	tool, lookupErr := findEditenv()
	if lookupErr != nil {
		// No editenv binary available: use the built-in writer instead.
		return e.setManual(key, value)
	}

	assignment := key + "=" + value
	out, runErr := exec.Command(tool, e.path, "set", assignment).CombinedOutput()
	if runErr != nil {
		return fmt.Errorf("grub-editenv set %s=%s: %w\n%s", key, value, runErr, out)
	}

	slog.Debug("grubenv set", "key", key, "value", value)
	return nil
}
// ReadAll returns every variable in the GRUB environment as a map. It runs
// the editenv tool's "list" command when the tool is available and parses
// the file directly otherwise.
func (e *Env) ReadAll() (map[string]string, error) {
	tool, lookupErr := findEditenv()
	if lookupErr != nil {
		return e.readManual()
	}

	listing, runErr := exec.Command(tool, e.path, "list").Output()
	if runErr != nil {
		return nil, fmt.Errorf("grub-editenv list: %w", runErr)
	}
	return parseEnvOutput(string(listing)), nil
}
// ActiveSlot reports the value of the active_slot variable, which names the
// boot slot currently in use ("A" or "B").
func (e *Env) ActiveSlot() (string, error) {
	const key = "active_slot"
	return e.Get(key)
}
// PassiveSlot returns the boot slot that is not currently active.
//
// It returns an error if active_slot cannot be read or holds anything other
// than SlotA or SlotB. An unrecognized value is rejected rather than being
// treated as "B is active": ForceRollback activates whatever this returns,
// so guessing on a corrupt value could select the wrong slot.
func (e *Env) PassiveSlot() (string, error) {
	active, err := e.ActiveSlot()
	if err != nil {
		return "", err
	}
	switch active {
	case SlotA:
		return SlotB, nil
	case SlotB:
		return SlotA, nil
	default:
		return "", fmt.Errorf("grubenv: invalid active_slot: %q", active)
	}
}
// BootCounter returns the boot_counter variable as an integer in the range
// 0 through 3. Any other stored value yields -1 and an error.
func (e *Env) BootCounter() (int, error) {
	raw, err := e.Get("boot_counter")
	if err != nil {
		return -1, err
	}
	// Only the single digits "0" through "3" are accepted.
	if len(raw) == 1 && raw[0] >= '0' && raw[0] <= '3' {
		return int(raw[0] - '0'), nil
	}
	return -1, fmt.Errorf("grubenv: invalid boot_counter: %q", raw)
}
// BootSuccess reports whether boot_success is set to "1". When the variable
// cannot be read it returns false together with the error.
func (e *Env) BootSuccess() (bool, error) {
	flag, err := e.Get("boot_success")
	healthy := err == nil && flag == "1"
	return healthy, err
}
// MarkBootSuccess records a healthy boot: boot_success becomes "1" and
// boot_counter is restored to "3". It is meant to be called by the health
// check after a successful boot.
func (e *Env) MarkBootSuccess() error {
	successErr := e.Set("boot_success", "1")
	if successErr != nil {
		return fmt.Errorf("setting boot_success: %w", successErr)
	}

	counterErr := e.Set("boot_counter", "3")
	if counterErr != nil {
		return fmt.Errorf("setting boot_counter: %w", counterErr)
	}

	slog.Info("boot marked successful")
	return nil
}
// ActivateSlot makes slot the active boot slot and resets the boot counter:
// active_slot is set to slot, boot_counter to "3" and boot_success to "0".
// It is used after a new image has been written to the passive partition.
//
// slot must be SlotA or SlotB; any other value is rejected before anything
// is written. The three variables are written one after another, so a
// failure part-way through leaves the earlier writes in place. The returned
// error names the variable that could not be written and wraps the
// underlying cause.
func (e *Env) ActivateSlot(slot string) error {
	if slot != SlotA && slot != SlotB {
		return fmt.Errorf("invalid slot: %q (must be A or B)", slot)
	}
	if err := e.Set("active_slot", slot); err != nil {
		return fmt.Errorf("setting active_slot: %w", err)
	}
	if err := e.Set("boot_counter", "3"); err != nil {
		return fmt.Errorf("setting boot_counter: %w", err)
	}
	if err := e.Set("boot_success", "0"); err != nil {
		return fmt.Errorf("setting boot_success: %w", err)
	}
	slog.Info("activated slot", "slot", slot)
	return nil
}
// ForceRollback activates the currently passive slot right away, without
// waiting for the boot counter to run out.
func (e *Env) ForceRollback() error {
	target, err := e.PassiveSlot()
	if err == nil {
		err = e.ActivateSlot(target)
	}
	return err
}
// findEditenv locates the GRUB environment editor on PATH, trying
// "grub-editenv" first and "grub2-editenv" second. It returns an error when
// neither is found.
func findEditenv() (string, error) {
	for _, name := range []string{"grub-editenv", "grub2-editenv"} {
		if found, err := exec.LookPath(name); err == nil {
			return found, nil
		}
	}
	return "", fmt.Errorf("grub-editenv not found")
}
// parseEnvOutput turns "key=value" lines into a map. Each line is trimmed of
// surrounding whitespace; blank lines, lines starting with '#', and lines
// without an '=' are ignored. Only the first '=' separates key from value.
func parseEnvOutput(output string) map[string]string {
	parsed := map[string]string{}
	for _, raw := range strings.Split(output, "\n") {
		entry := strings.TrimSpace(raw)
		switch {
		case entry == "":
			continue
		case strings.HasPrefix(entry, "#"):
			continue
		}
		if name, value, ok := strings.Cut(entry, "="); ok {
			parsed[name] = value
		}
	}
	return parsed
}
// setManual updates a single variable by rewriting the grubenv file
// directly; Set uses it as the fallback when grub-editenv is unavailable.
//
// A missing file is treated as an empty environment so the first write can
// create it. Any other read failure is returned instead: continuing with an
// empty map would rewrite the file with only the new key and drop every
// variable already stored there.
func (e *Env) setManual(key, value string) error {
	vars, err := e.readManual()
	if err != nil {
		if !errors.Is(err, os.ErrNotExist) {
			return err
		}
		vars = make(map[string]string)
	}
	vars[key] = value
	return e.writeManual(vars)
}
// readManual loads the grubenv file from disk and parses it without the
// help of grub-editenv.
func (e *Env) readManual() (map[string]string, error) {
	raw, readErr := os.ReadFile(e.path)
	if readErr != nil {
		return nil, fmt.Errorf("reading grubenv: %w", readErr)
	}
	vars := parseEnvOutput(string(raw))
	return vars, nil
}
// writeManual writes grubenv without grub-editenv.
//
// GRUB requires the file to be exactly 1024 bytes: a header line, one
// "key=value" line per variable, then '#' padding up to the full size.
// Variables are emitted in sorted key order so that the same set of
// variables always produces the same file contents. An error is returned if
// the header and variables alone exceed 1024 bytes.
func (e *Env) writeManual(vars map[string]string) error {
	const envSize = 1024

	// Map iteration order is random; sort the keys for stable output.
	keys := make([]string, 0, len(vars))
	for k := range vars {
		keys = append(keys, k)
	}
	slices.Sort(keys)

	var sb strings.Builder
	sb.WriteString("# GRUB Environment Block\n")
	for _, k := range keys {
		sb.WriteString(k)
		sb.WriteByte('=')
		sb.WriteString(vars[k])
		sb.WriteByte('\n')
	}
	if sb.Len() > envSize {
		return fmt.Errorf("grubenv content exceeds 1024 bytes")
	}

	// Pad to the fixed block size with '#'.
	sb.WriteString(strings.Repeat("#", envSize-sb.Len()))
	return os.WriteFile(e.path, []byte(sb.String()), 0o644)
}

View File

@@ -0,0 +1,423 @@
package grubenv
import (
"os"
"path/filepath"
"strings"
"testing"
)
// createTestGrubenv writes a grubenv fixture named "grubenv" into dir and
// returns its path. The file starts with the GRUB header line, lists vars as
// "key=value" lines, and is padded with '#' up to 1024 bytes when shorter.
func createTestGrubenv(t *testing.T, dir string, vars map[string]string) string {
	t.Helper()

	const blockSize = 1024
	body := []byte("# GRUB Environment Block\n")
	for name, value := range vars {
		body = append(body, name+"="+value+"\n"...)
	}
	if short := blockSize - len(body); short > 0 {
		body = append(body, strings.Repeat("#", short)...)
	}

	target := filepath.Join(dir, "grubenv")
	if err := os.WriteFile(target, body, 0o644); err != nil {
		t.Fatal(err)
	}
	return target
}
// TestNew checks that New falls back to the default path for an empty
// argument and keeps an explicit path as given.
func TestNew(t *testing.T) {
	if got := New("").path; got != DefaultGrubenvPath {
		t.Errorf("expected default path %s, got %s", DefaultGrubenvPath, got)
	}

	const custom = "/custom/path/grubenv"
	if got := New(custom).path; got != custom {
		t.Errorf("expected custom path, got %s", got)
	}
}
// TestReadAll checks that all three variables of a fixture are returned.
func TestReadAll(t *testing.T) {
	grubenv := createTestGrubenv(t, t.TempDir(), map[string]string{
		"active_slot":  "A",
		"boot_counter": "3",
		"boot_success": "1",
	})

	got, err := New(grubenv).ReadAll()
	if err != nil {
		t.Fatal(err)
	}

	if slot := got["active_slot"]; slot != "A" {
		t.Errorf("active_slot: expected A, got %s", slot)
	}
	if counter := got["boot_counter"]; counter != "3" {
		t.Errorf("boot_counter: expected 3, got %s", counter)
	}
	if success := got["boot_success"]; success != "1" {
		t.Errorf("boot_success: expected 1, got %s", success)
	}
}
// TestGet checks that a stored key is returned and a missing key is an error.
func TestGet(t *testing.T) {
	grubenv := createTestGrubenv(t, t.TempDir(), map[string]string{
		"active_slot": "B",
	})
	env := New(grubenv)

	got, err := env.Get("active_slot")
	if err != nil {
		t.Fatal(err)
	}
	if got != "B" {
		t.Errorf("expected B, got %s", got)
	}

	if _, missingErr := env.Get("nonexistent"); missingErr == nil {
		t.Fatal("expected error for nonexistent key")
	}
}
// TestSet checks that Set changes the stored value and leaves the file at
// its fixed 1024-byte size.
func TestSet(t *testing.T) {
	grubenv := createTestGrubenv(t, t.TempDir(), map[string]string{
		"active_slot":  "A",
		"boot_counter": "3",
	})
	env := New(grubenv)

	if err := env.Set("boot_counter", "2"); err != nil {
		t.Fatal(err)
	}

	got, err := env.Get("boot_counter")
	if err != nil {
		t.Fatal(err)
	}
	if got != "2" {
		t.Errorf("expected 2 after set, got %s", got)
	}

	// The environment block must keep its size after a write.
	raw, err := os.ReadFile(grubenv)
	if err != nil {
		t.Fatal(err)
	}
	if size := len(raw); size != 1024 {
		t.Errorf("grubenv should be 1024 bytes, got %d", size)
	}
}
// TestActiveSlot checks that ActiveSlot returns the stored active_slot value.
func TestActiveSlot(t *testing.T) {
	grubenv := createTestGrubenv(t, t.TempDir(), map[string]string{
		"active_slot":  "A",
		"boot_counter": "3",
		"boot_success": "1",
	})

	got, err := New(grubenv).ActiveSlot()
	if err != nil {
		t.Fatal(err)
	}
	if got != "A" {
		t.Errorf("expected A, got %s", got)
	}
}
// TestPassiveSlot checks that each slot maps to the other one as passive.
func TestPassiveSlot(t *testing.T) {
	type slotCase struct {
		active      string
		wantPassive string
	}
	cases := []slotCase{
		{active: "A", wantPassive: "B"},
		{active: "B", wantPassive: "A"},
	}

	for _, tc := range cases {
		t.Run("active_"+tc.active, func(t *testing.T) {
			grubenv := createTestGrubenv(t, t.TempDir(), map[string]string{
				"active_slot": tc.active,
			})

			got, err := New(grubenv).PassiveSlot()
			if err != nil {
				t.Fatal(err)
			}
			if got != tc.wantPassive {
				t.Errorf("expected passive %s, got %s", tc.wantPassive, got)
			}
		})
	}
}
// TestBootCounter checks that "0" through "3" parse to their integer value
// and that any other stored value is reported as an error.
func TestBootCounter(t *testing.T) {
	type counterCase struct {
		stored  string
		want    int
		invalid bool
	}
	cases := []counterCase{
		{stored: "0", want: 0},
		{stored: "1", want: 1},
		{stored: "2", want: 2},
		{stored: "3", want: 3},
		{stored: "invalid", want: -1, invalid: true},
		{stored: "99", want: -1, invalid: true},
	}

	for _, tc := range cases {
		t.Run("counter_"+tc.stored, func(t *testing.T) {
			grubenv := createTestGrubenv(t, t.TempDir(), map[string]string{
				"boot_counter": tc.stored,
			})

			got, err := New(grubenv).BootCounter()
			switch {
			case tc.invalid && err == nil:
				t.Fatal("expected error")
			case tc.invalid:
				// Error received as expected; nothing more to check.
				return
			case err != nil:
				t.Fatal(err)
			case got != tc.want:
				t.Errorf("expected %d, got %d", tc.want, got)
			}
		})
	}
}
// TestBootSuccess checks that "1" reads as true and "0" as false.
func TestBootSuccess(t *testing.T) {
	cases := []struct {
		stored string
		want   bool
	}{
		{stored: "0", want: false},
		{stored: "1", want: true},
	}

	for _, tc := range cases {
		t.Run("success_"+tc.stored, func(t *testing.T) {
			grubenv := createTestGrubenv(t, t.TempDir(), map[string]string{
				"boot_success": tc.stored,
			})

			got, err := New(grubenv).BootSuccess()
			if err != nil {
				t.Fatal(err)
			}
			if got != tc.want {
				t.Errorf("expected %v, got %v", tc.want, got)
			}
		})
	}
}
// TestMarkBootSuccess checks that MarkBootSuccess sets boot_success and
// restores boot_counter to 3.
func TestMarkBootSuccess(t *testing.T) {
	grubenv := createTestGrubenv(t, t.TempDir(), map[string]string{
		"active_slot":  "B",
		"boot_counter": "1",
		"boot_success": "0",
	})
	env := New(grubenv)

	if err := env.MarkBootSuccess(); err != nil {
		t.Fatal(err)
	}

	healthy, err := env.BootSuccess()
	if err != nil {
		t.Fatal(err)
	}
	if !healthy {
		t.Error("expected boot_success=1 after MarkBootSuccess")
	}

	remaining, err := env.BootCounter()
	if err != nil {
		t.Fatal(err)
	}
	if remaining != 3 {
		t.Errorf("expected boot_counter=3 after MarkBootSuccess, got %d", remaining)
	}
}
// TestActivateSlot checks that ActivateSlot switches active_slot, resets
// boot_counter to 3 and clears boot_success.
//
// Every read-back error is checked: BootSuccess returns false on a failed
// read, so ignoring its error would let the boot_success assertion pass even
// when the variable could not be read at all.
func TestActivateSlot(t *testing.T) {
	dir := t.TempDir()
	path := createTestGrubenv(t, dir, map[string]string{
		"active_slot":  "A",
		"boot_counter": "3",
		"boot_success": "1",
	})
	env := New(path)
	if err := env.ActivateSlot("B"); err != nil {
		t.Fatal(err)
	}

	slot, err := env.ActiveSlot()
	if err != nil {
		t.Fatal(err)
	}
	if slot != "B" {
		t.Errorf("expected active_slot=B, got %s", slot)
	}

	counter, err := env.BootCounter()
	if err != nil {
		t.Fatal(err)
	}
	if counter != 3 {
		t.Errorf("expected boot_counter=3, got %d", counter)
	}

	success, err := env.BootSuccess()
	if err != nil {
		t.Fatal(err)
	}
	if success {
		t.Error("expected boot_success=0 after ActivateSlot")
	}
}
// TestActivateSlotInvalid checks that a slot other than A or B is rejected.
func TestActivateSlotInvalid(t *testing.T) {
	grubenv := createTestGrubenv(t, t.TempDir(), map[string]string{
		"active_slot": "A",
	})

	if err := New(grubenv).ActivateSlot("C"); err == nil {
		t.Fatal("expected error for invalid slot")
	}
}
// TestForceRollback checks that rolling back from slot A activates slot B
// and clears boot_success, since ForceRollback goes through ActivateSlot.
// Read-back errors are checked so a failed read is reported as such rather
// than as a wrong value.
func TestForceRollback(t *testing.T) {
	dir := t.TempDir()
	path := createTestGrubenv(t, dir, map[string]string{
		"active_slot":  "A",
		"boot_counter": "3",
		"boot_success": "1",
	})
	env := New(path)
	if err := env.ForceRollback(); err != nil {
		t.Fatal(err)
	}

	slot, err := env.ActiveSlot()
	if err != nil {
		t.Fatal(err)
	}
	if slot != "B" {
		t.Errorf("expected active_slot=B after rollback from A, got %s", slot)
	}

	success, err := env.BootSuccess()
	if err != nil {
		t.Fatal(err)
	}
	if success {
		t.Error("expected boot_success=0 after rollback")
	}
}
// TestParseEnvOutput checks that the header comment is skipped and the three
// key=value lines are parsed.
func TestParseEnvOutput(t *testing.T) {
	const listing = "# GRUB Environment Block\n" +
		"active_slot=A\n" +
		"boot_counter=3\n" +
		"boot_success=1\n"

	got := parseEnvOutput(listing)

	if count := len(got); count != 3 {
		t.Errorf("expected 3 variables, got %d", count)
	}
	if slot := got["active_slot"]; slot != "A" {
		t.Errorf("active_slot: expected A, got %s", slot)
	}
	if counter := got["boot_counter"]; counter != "3" {
		t.Errorf("boot_counter: expected 3, got %s", counter)
	}
}
// TestWriteManualFormat checks the on-disk layout produced by the manual
// writer: 1024 bytes, the GRUB header first, and the variable present.
func TestWriteManualFormat(t *testing.T) {
	grubenv := filepath.Join(t.TempDir(), "grubenv")
	env := New(grubenv)

	// Call setManual directly because grub-editenv may not be installed.
	if err := env.setManual("test_key", "test_value"); err != nil {
		t.Fatal(err)
	}

	raw, err := os.ReadFile(grubenv)
	if err != nil {
		t.Fatal(err)
	}
	written := string(raw)

	if size := len(raw); size != 1024 {
		t.Errorf("grubenv should be exactly 1024 bytes, got %d", size)
	}
	if !strings.HasPrefix(written, "# GRUB Environment Block\n") {
		t.Error("grubenv should start with '# GRUB Environment Block'")
	}
	if !strings.Contains(written, "test_key=test_value\n") {
		t.Error("grubenv should contain test_key=test_value")
	}
}
// TestReadNonexistentFile checks that reading a missing grubenv fails.
func TestReadNonexistentFile(t *testing.T) {
	missing := New("/nonexistent/path/grubenv")
	if _, err := missing.ReadAll(); err == nil {
		t.Fatal("expected error reading nonexistent file")
	}
}
// TestMultipleSetOperations runs several writes in sequence and checks the
// final state of all three variables.
func TestMultipleSetOperations(t *testing.T) {
	grubenv := createTestGrubenv(t, t.TempDir(), map[string]string{
		"active_slot":  "A",
		"boot_counter": "3",
		"boot_success": "1",
	})
	env := New(grubenv)

	// Simulate a boot in progress: counter decremented, success cleared.
	if err := env.Set("boot_counter", "2"); err != nil {
		t.Fatal(err)
	}
	if err := env.Set("boot_success", "0"); err != nil {
		t.Fatal(err)
	}

	// The health check then confirms the boot.
	if err := env.MarkBootSuccess(); err != nil {
		t.Fatal(err)
	}

	// Inspect the resulting environment.
	final, err := env.ReadAll()
	if err != nil {
		t.Fatal(err)
	}
	if slot := final["active_slot"]; slot != "A" {
		t.Errorf("active_slot should still be A, got %s", slot)
	}
	if counter := final["boot_counter"]; counter != "3" {
		t.Errorf("boot_counter should be 3 after mark success, got %s", counter)
	}
	if success := final["boot_success"]; success != "1" {
		t.Errorf("boot_success should be 1, got %s", success)
	}
}