feat(update): persistent state machine + lifecycle metrics

Phase 5 of v0.3. Adds an explicit, on-disk state machine to the update agent so the lifecycle of an attempt is observable end-to-end, instead of being inferred from logs and side effects.

New package update/pkg/state:
- Phase enum (idle, checking, downloading, staged, activated, verifying, success, rolled_back, failed)
- UpdateState struct persisted to /var/lib/kubesolo/update/state.json (overridable via --state). Atomic write (.tmp + rename). Survives reboots and slot switches because the file lives on the data partition.
- Transition helper that bumps AttemptCount when an attempt starts, resets it when the target version changes, sets/clears LastError on failed/success transitions, and stamps StartedAt + UpdatedAt.
- 13 unit tests cover the lifecycle, atomic write, version-change reset, error recording, idempotent SetFromVersion, and garbage-file handling.

Wired into the existing commands:
- apply.go transitions Idle -> Checking -> Downloading -> Staged, with RecordError on any step failure. Reads the active slot's version file to populate FromVersion.
- activate.go transitions to Activated.
- healthcheck.go transitions Activated -> Verifying -> Success on pass, or to Failed on fail. Skips transitions if state isn't post-activation (a manual healthcheck on a stable system shouldn't churn the state).
- rollback.go transitions to RolledBack with LastError="manual rollback".
- check.go intentionally untouched — checks are passive queries, not attempts; they shouldn't reset AttemptCount.

status.go gains a --json mode that emits the full state report (A/B slots, boot counter, full UpdateState) for orchestration tooling. Human-readable mode also prints an Update Lifecycle section when state.phase != idle.
pkg/metrics gains three new series, derived from state.json at scrape time:
- kubesolo_update_phase{phase="..."} — 1 for the current phase, 0 for all others; all nine phase values are always emitted so dashboards see complete series
- kubesolo_update_attempts_total
- kubesolo_update_last_attempt_timestamp_seconds

Server.SetStatePath() configures the file location; it defaults to unset, in which case Idle defaults are emitted. Three new tests cover the absent / active / all-phases-emitted cases.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 18:11:47 -06:00
parent 0c6e200585
commit bce565e2f7
12 changed files with 726 additions and 0 deletions
--- a/update/pkg/state/state_test.go
+++ b/update/pkg/state/state_test.go
@@ -0,0 +1,197 @@
+package state
+
+import (
+	"errors"
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// statePath builds the location of a "state.json" file for the calling test.
+// The path sits inside a directory obtained from t.TempDir(); the file itself
+// is not created here.
+func statePath(t *testing.T) string {
+	t.Helper()
+	dir := t.TempDir()
+	return filepath.Join(dir, "state.json")
+}
+
+// TestLoadMissingReturnsIdle checks that Load on a path that does not exist
+// returns no error and a state whose phase is PhaseIdle.
+func TestLoadMissingReturnsIdle(t *testing.T) {
+	missing := filepath.Join(t.TempDir(), "does-not-exist.json")
+	loaded, err := Load(missing)
+	if err != nil {
+		t.Fatalf("unexpected error loading missing state: %v", err)
+	}
+	if got := loaded.Phase; got != PhaseIdle {
+		t.Errorf("missing file: phase=%q, want %q", got, PhaseIdle)
+	}
+}
+
+// TestSaveLoadRoundTrip saves a populated state, loads it back from the same
+// path, and compares the fields that were set before saving.
+func TestSaveLoadRoundTrip(t *testing.T) {
+	file := statePath(t)
+	want := &UpdateState{
+		Phase:        PhaseStaged,
+		FromVersion:  "v0.2.0",
+		ToVersion:    "v0.3.0",
+		AttemptCount: 1,
+	}
+	if saveErr := want.Save(file); saveErr != nil {
+		t.Fatalf("save: %v", saveErr)
+	}
+	got, loadErr := Load(file)
+	if loadErr != nil {
+		t.Fatalf("load: %v", loadErr)
+	}
+	// Each field is reported separately so one failing run lists every mismatch.
+	if got.Phase != want.Phase {
+		t.Errorf("phase: got %q, want %q", got.Phase, want.Phase)
+	}
+	if got.FromVersion != want.FromVersion {
+		t.Errorf("from_version: got %q, want %q", got.FromVersion, want.FromVersion)
+	}
+	if got.ToVersion != want.ToVersion {
+		t.Errorf("to_version: got %q, want %q", got.ToVersion, want.ToVersion)
+	}
+	if got.AttemptCount != want.AttemptCount {
+		t.Errorf("attempt_count: got %d, want %d", got.AttemptCount, want.AttemptCount)
+	}
+	// The test never assigns UpdatedAt; it expects Save to have stamped it.
+	if got.UpdatedAt.IsZero() {
+		t.Error("UpdatedAt should be set by Save")
+	}
+}
+
+// TestSaveRejectsInvalidPhase expects Save to return an error when the state
+// carries a phase value ("bogus") that is not one of the package's phases.
+func TestSaveRejectsInvalidPhase(t *testing.T) {
+	invalid := &UpdateState{Phase: Phase("bogus")}
+	if err := invalid.Save(statePath(t)); err == nil {
+		t.Fatal("expected error saving invalid phase, got nil")
+	}
+}
+
+// TestSaveIsAtomic checks the observable result of the write-then-rename
+// scheme: after a successful Save the state file exists at path and no
+// "<path>.tmp" file is left behind.
+func TestSaveIsAtomic(t *testing.T) {
+	path := statePath(t)
+	s := New()
+	if err := s.Save(path); err != nil {
+		t.Fatalf("save: %v", err)
+	}
+	// An absent tmp file only proves a rename happened if the final file exists.
+	if _, err := os.Stat(path); err != nil {
+		t.Errorf("state file not present after Save: %v", err)
+	}
+	// Separate "tmp file exists" (err == nil) from an unexpected Stat failure
+	// so the failure message names what actually went wrong.
+	tmp := path + ".tmp"
+	switch _, err := os.Stat(tmp); {
+	case err == nil:
+		t.Errorf("tmp file still present after Save: %s", tmp)
+	case !errors.Is(err, os.ErrNotExist):
+		t.Errorf("stat tmp file: %v", err)
+	}
+}
+
+// TestSaveCreatesDirectory saves into a nested directory that does not exist
+// yet (the first-ever-boot situation) and expects Save to create it and write
+// the state file.
+func TestSaveCreatesDirectory(t *testing.T) {
+	target := filepath.Join(t.TempDir(), "fresh", "subdir", "state.json")
+	fresh := New()
+	if err := fresh.Save(target); err != nil {
+		t.Fatalf("save into nonexistent dir: %v", err)
+	}
+	_, statErr := os.Stat(target)
+	if statErr != nil {
+		t.Errorf("state file not present after Save: %v", statErr)
+	}
+}
+
+// TestTransitionIdleToChecking moves a fresh state to PhaseChecking for
+// v0.3.0 and checks the phase, target version, attempt counter and start
+// timestamp that result.
+func TestTransitionIdleToChecking(t *testing.T) {
+	file := statePath(t)
+	st := New()
+	err := st.Transition(file, PhaseChecking, "v0.3.0", "")
+	if err != nil {
+		t.Fatalf("transition: %v", err)
+	}
+	if got := st.Phase; got != PhaseChecking {
+		t.Errorf("phase: got %q, want %q", got, PhaseChecking)
+	}
+	if got := st.ToVersion; got != "v0.3.0" {
+		t.Errorf("to_version: got %q, want v0.3.0", got)
+	}
+	if got := st.AttemptCount; got != 1 {
+		t.Errorf("attempt_count: got %d, want 1 (first attempt after Idle)", got)
+	}
+	if st.StartedAt.IsZero() {
+		t.Error("StartedAt should be set when leaving Idle")
+	}
+}
+
+// TestTransitionRetainsAttemptCountWithinAttempt walks one attempt at v0.3.0
+// through Checking -> Downloading -> Staged and expects AttemptCount to stay
+// at 1 throughout.
+func TestTransitionRetainsAttemptCountWithinAttempt(t *testing.T) {
+	path := statePath(t)
+	s := New()
+	// Fail fast on a broken step: otherwise the counter assertion below
+	// would report a misleading value instead of the real error.
+	if err := s.Transition(path, PhaseChecking, "v0.3.0", ""); err != nil {
+		t.Fatalf("transition to checking: %v", err)
+	}
+	if err := s.Transition(path, PhaseDownloading, "v0.3.0", ""); err != nil {
+		t.Fatalf("transition to downloading: %v", err)
+	}
+	if err := s.Transition(path, PhaseStaged, "v0.3.0", ""); err != nil {
+		t.Fatalf("transition to staged: %v", err)
+	}
+	if s.AttemptCount != 1 {
+		t.Errorf("attempt_count after staying on same version: got %d, want 1", s.AttemptCount)
+	}
+}
+
+// TestTransitionResetsAttemptCountOnNewVersion starts an attempt at v0.3.0,
+// then re-targets to v0.4.0, and expects ToVersion to follow the new target
+// and AttemptCount to be reset to 0.
+func TestTransitionResetsAttemptCountOnNewVersion(t *testing.T) {
+	path := statePath(t)
+	s := New()
+	// Setup errors are fatal so a failed transition cannot be mistaken for a
+	// wrong counter value.
+	if err := s.Transition(path, PhaseChecking, "v0.3.0", ""); err != nil {
+		t.Fatalf("transition to checking v0.3.0: %v", err)
+	}
+	// Now an attempt at a NEW version starts. AttemptCount should reset.
+	if err := s.Transition(path, PhaseChecking, "v0.4.0", ""); err != nil {
+		t.Fatalf("transition to checking v0.4.0: %v", err)
+	}
+	if s.ToVersion != "v0.4.0" {
+		t.Errorf("to_version: got %q, want v0.4.0", s.ToVersion)
+	}
+	if s.AttemptCount != 0 {
+		t.Errorf("attempt_count after new ToVersion: got %d, want 0 (reset)", s.AttemptCount)
+	}
+}
+
+// TestTransitionFailedRecordsError transitions a downloading attempt to
+// PhaseFailed with an error message and expects both the phase and LastError
+// to reflect the failure.
+func TestTransitionFailedRecordsError(t *testing.T) {
+	path := statePath(t)
+	s := New()
+	// Transition errors are checked so a persistence problem is reported as
+	// such rather than as a wrong phase or message.
+	if err := s.Transition(path, PhaseDownloading, "v0.3.0", ""); err != nil {
+		t.Fatalf("transition to downloading: %v", err)
+	}
+	if err := s.Transition(path, PhaseFailed, "v0.3.0", "checksum mismatch"); err != nil {
+		t.Fatalf("transition to failed: %v", err)
+	}
+	if s.Phase != PhaseFailed {
+		t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed)
+	}
+	if s.LastError != "checksum mismatch" {
+		t.Errorf("last_error: got %q, want %q", s.LastError, "checksum mismatch")
+	}
+}
+
+// TestTransitionSuccessClearsError puts the state into PhaseFailed with a
+// recorded error, then transitions to PhaseSuccess and expects LastError to
+// be emptied.
+func TestTransitionSuccessClearsError(t *testing.T) {
+	path := statePath(t)
+	s := New()
+	if err := s.Transition(path, PhaseFailed, "v0.3.0", "boom"); err != nil {
+		t.Fatalf("setup: transition to failed: %v", err)
+	}
+	// Guard the precondition: the clearing check below is meaningless if no
+	// error was recorded in the first place.
+	if s.LastError == "" {
+		t.Fatal("setup: LastError should be non-empty before success")
+	}
+	if err := s.Transition(path, PhaseSuccess, "v0.3.0", ""); err != nil {
+		t.Fatalf("transition to success: %v", err)
+	}
+	if s.LastError != "" {
+		t.Errorf("last_error after success: got %q, want empty", s.LastError)
+	}
+}
+
+// TestRecordError calls RecordError on a fresh state and expects the phase to
+// become PhaseFailed with LastError holding the error's message.
+func TestRecordError(t *testing.T) {
+	file := statePath(t)
+	st := New()
+	cause := errors.New("network down")
+	if err := st.RecordError(file, cause); err != nil {
+		t.Fatalf("RecordError: %v", err)
+	}
+	if got := st.Phase; got != PhaseFailed {
+		t.Errorf("phase: got %q, want %q", got, PhaseFailed)
+	}
+	if got := st.LastError; got != "network down" {
+		t.Errorf("last_error: got %q, want %q", got, "network down")
+	}
+}
+
+// TestSetFromVersionIdempotent expects the first SetFromVersion call to set
+// FromVersion and a later call with a different value to leave it unchanged.
+func TestSetFromVersionIdempotent(t *testing.T) {
+	st := New()
+	st.SetFromVersion("v0.2.0")
+	if got := st.FromVersion; got != "v0.2.0" {
+		t.Errorf("from_version: got %q, want v0.2.0", got)
+	}
+	// A second call must be a no-op once a value is recorded.
+	st.SetFromVersion("v0.1.0")
+	if got := st.FromVersion; got != "v0.2.0" {
+		t.Errorf("from_version after second SetFromVersion: got %q, want v0.2.0 (immutable)", got)
+	}
+}
+
+// TestLoadHandlesGarbageFile seeds the state path with bytes that are not
+// JSON and expects Load to return an error.
+func TestLoadHandlesGarbageFile(t *testing.T) {
+	file := statePath(t)
+	garbage := []byte("not json")
+	if err := os.WriteFile(file, garbage, 0o644); err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+	if _, loadErr := Load(file); loadErr == nil {
+		t.Error("expected error loading garbage, got nil")
+	}
+}