Files
kubesolo-os/update/pkg/state/state_test.go
Adolfo Delorenzo bce565e2f7
Some checks failed
ARM64 Build / Build generic ARM64 disk image (push) Failing after 4s
CI / Go Tests (push) Successful in 1m31s
CI / Shellcheck (push) Successful in 47s
CI / Build Go Binaries (amd64, linux, linux-amd64) (push) Failing after 10s
CI / Build Go Binaries (arm64, linux, linux-arm64) (push) Failing after 16s
feat(update): persistent state machine + lifecycle metrics
Phase 5 of v0.3. Adds an explicit, on-disk state machine to the update agent
so the lifecycle of an attempt is observable end-to-end, instead of being
inferred from logs and side effects.

New package update/pkg/state:
- Phase enum (idle, checking, downloading, staged, activated, verifying,
  success, rolled_back, failed)
- UpdateState struct persisted to /var/lib/kubesolo/update/state.json
  (overridable via --state). Atomic write (.tmp + rename). Survives reboots
  and slot switches because the file lives on the data partition.
- Transition helper that bumps AttemptCount when an attempt starts, resets
  it when the target version changes, sets/clears LastError on
  failed/success transitions, and stamps StartedAt + UpdatedAt.
- 13 unit tests cover the lifecycle, atomic write, version-change reset,
  error recording, idempotent SetFromVersion, garbage-file handling.

Wired into the existing commands:
- apply.go transitions Idle -> Checking -> Downloading -> Staged, with
  RecordError on any step failure. Reads the active slot's version file to
  populate FromVersion.
- activate.go transitions to Activated.
- healthcheck.go transitions Activated -> Verifying -> Success on pass,
  or to Failed on fail. Skips transitions if state isn't post-activation
  (manual healthcheck on a stable system shouldn't churn the state).
- rollback.go transitions to RolledBack with LastError="manual rollback".
- check.go intentionally untouched — checks are passive queries, not
  attempts; they shouldn't reset AttemptCount.

status.go gains a --json mode that emits the full state report (A/B slots,
boot counter, full UpdateState) for orchestration tooling. Human-readable
mode also prints an Update Lifecycle section when state.phase != idle.

pkg/metrics gains three new series, derived from state.json at scrape time:
- kubesolo_update_phase{phase="..."} — 1 for current, 0 for all others;
  all nine phase values always emitted so dashboards see complete series
- kubesolo_update_attempts_total
- kubesolo_update_last_attempt_timestamp_seconds
Server.SetStatePath() configures the file location; when no path is set,
the series are emitted with Idle defaults. Three new tests cover the
absent / active / all-phases-emitted cases.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 18:11:47 -06:00

198 lines
5.5 KiB
Go

package state
import (
"errors"
"os"
"path/filepath"
"testing"
)
// statePath builds the path of a file named "state.json" located in a
// directory obtained from t.TempDir(). It is marked as a test helper.
func statePath(t *testing.T) string {
	t.Helper()
	dir := t.TempDir()
	return filepath.Join(dir, "state.json")
}
// TestLoadMissingReturnsIdle calls Load on a path where no file has been
// created and expects a nil error together with a state in PhaseIdle.
func TestLoadMissingReturnsIdle(t *testing.T) {
	missing := filepath.Join(t.TempDir(), "does-not-exist.json")

	loaded, err := Load(missing)
	if err != nil {
		t.Fatalf("unexpected error loading missing state: %v", err)
	}

	if phase := loaded.Phase; phase != PhaseIdle {
		t.Errorf("missing file: phase=%q, want %q", phase, PhaseIdle)
	}
}
// TestSaveLoadRoundTrip saves a populated UpdateState, loads it back from the
// same path, and compares Phase, FromVersion, ToVersion and AttemptCount. It
// also requires the loaded copy to carry a non-zero UpdatedAt.
func TestSaveLoadRoundTrip(t *testing.T) {
	path := statePath(t)

	want := &UpdateState{
		Phase:        PhaseStaged,
		FromVersion:  "v0.2.0",
		ToVersion:    "v0.3.0",
		AttemptCount: 1,
	}
	err := want.Save(path)
	if err != nil {
		t.Fatalf("save: %v", err)
	}

	got, err := Load(path)
	if err != nil {
		t.Fatalf("load: %v", err)
	}

	// Field-by-field comparison; each mismatch is reported independently.
	if a, b := got.Phase, want.Phase; a != b {
		t.Errorf("phase: got %q, want %q", a, b)
	}
	if a, b := got.FromVersion, want.FromVersion; a != b {
		t.Errorf("from_version: got %q, want %q", a, b)
	}
	if a, b := got.ToVersion, want.ToVersion; a != b {
		t.Errorf("to_version: got %q, want %q", a, b)
	}
	if a, b := got.AttemptCount, want.AttemptCount; a != b {
		t.Errorf("attempt_count: got %d, want %d", a, b)
	}

	// The literal above never sets UpdatedAt, so a non-zero value here must
	// have been filled in along the save/load path.
	if got.UpdatedAt.IsZero() {
		t.Error("UpdatedAt should be set by Save")
	}
}
// TestSaveRejectsInvalidPhase expects Save to return a non-nil error when the
// state's Phase holds the unrecognised value "bogus".
func TestSaveRejectsInvalidPhase(t *testing.T) {
	invalid := &UpdateState{Phase: Phase("bogus")}

	if err := invalid.Save(statePath(t)); err == nil {
		t.Fatal("expected error saving invalid phase, got nil")
	}
}
// TestSaveIsAtomic checks the visible outcome of Save's temp-file-then-rename
// write: after a successful Save the state file exists at path and no
// "<path>.tmp" file is left beside it.
//
// The test assumes the temporary file is named path + ".tmp".
func TestSaveIsAtomic(t *testing.T) {
	path := statePath(t)
	s := New()
	if err := s.Save(path); err != nil {
		t.Fatalf("save: %v", err)
	}

	// The rename must have produced the final file.
	if _, err := os.Stat(path); err != nil {
		t.Errorf("state file not present after Save: %v", err)
	}

	// Distinguish "tmp file exists" (Stat succeeds) from "Stat itself failed
	// for some other reason" so the failure message is never a bare <nil>
	// and never blames a leftover file that is not actually there.
	tmp := path + ".tmp"
	_, err := os.Stat(tmp)
	switch {
	case err == nil:
		t.Errorf("tmp file still present after Save: %s", tmp)
	case !errors.Is(err, os.ErrNotExist):
		t.Errorf("stat tmp file: %v", err)
	}
}
// TestSaveCreatesDirectory saves into a path whose parent directories
// ("fresh/subdir" under a temp dir) have not been created, and expects Save
// to succeed and the state file to exist afterwards.
func TestSaveCreatesDirectory(t *testing.T) {
	// Neither "fresh" nor "subdir" exists at this point.
	target := filepath.Join(t.TempDir(), "fresh", "subdir", "state.json")

	err := New().Save(target)
	if err != nil {
		t.Fatalf("save into nonexistent dir: %v", err)
	}

	_, statErr := os.Stat(target)
	if statErr != nil {
		t.Errorf("state file not present after Save: %v", statErr)
	}
}
// TestTransitionIdleToChecking moves a state created by New() to
// PhaseChecking for target "v0.3.0" and verifies the resulting Phase,
// ToVersion, AttemptCount (expected 1) and a non-zero StartedAt.
func TestTransitionIdleToChecking(t *testing.T) {
	path := statePath(t)
	st := New()

	err := st.Transition(path, PhaseChecking, "v0.3.0", "")
	if err != nil {
		t.Fatalf("transition: %v", err)
	}

	if phase := st.Phase; phase != PhaseChecking {
		t.Errorf("phase: got %q, want %q", phase, PhaseChecking)
	}
	if target := st.ToVersion; target != "v0.3.0" {
		t.Errorf("to_version: got %q, want v0.3.0", target)
	}
	if attempts := st.AttemptCount; attempts != 1 {
		t.Errorf("attempt_count: got %d, want 1 (first attempt after Idle)", attempts)
	}
	if st.StartedAt.IsZero() {
		t.Error("StartedAt should be set when leaving Idle")
	}
}
// TestTransitionRetainsAttemptCountWithinAttempt walks one state through
// Checking -> Downloading -> Staged, all for target "v0.3.0", and expects
// AttemptCount to still be 1 at the end.
func TestTransitionRetainsAttemptCountWithinAttempt(t *testing.T) {
	path := statePath(t)
	s := New()

	// Every step must succeed; a silently failed transition would make the
	// final AttemptCount assertion meaningless.
	for _, phase := range []Phase{PhaseChecking, PhaseDownloading, PhaseStaged} {
		if err := s.Transition(path, phase, "v0.3.0", ""); err != nil {
			t.Fatalf("transition to %q: %v", phase, err)
		}
	}

	if s.AttemptCount != 1 {
		t.Errorf("attempt_count after staying on same version: got %d, want 1", s.AttemptCount)
	}
}
// TestTransitionResetsAttemptCountOnNewVersion transitions to PhaseChecking
// for "v0.3.0" and then again to PhaseChecking for "v0.4.0". It expects
// ToVersion to follow the new target and AttemptCount to read 0 afterwards.
func TestTransitionResetsAttemptCountOnNewVersion(t *testing.T) {
	path := statePath(t)
	s := New()

	// Setup: an attempt at the first version. Fail fast if it cannot be
	// recorded, otherwise the reset below is not actually being exercised.
	if err := s.Transition(path, PhaseChecking, "v0.3.0", ""); err != nil {
		t.Fatalf("transition to first version: %v", err)
	}

	// Now an attempt at a NEW version starts. AttemptCount should reset.
	if err := s.Transition(path, PhaseChecking, "v0.4.0", ""); err != nil {
		t.Fatalf("transition to new version: %v", err)
	}

	if s.ToVersion != "v0.4.0" {
		t.Errorf("to_version: got %q, want v0.4.0", s.ToVersion)
	}
	if s.AttemptCount != 0 {
		t.Errorf("attempt_count after new ToVersion: got %d, want 0 (reset)", s.AttemptCount)
	}
}
// TestTransitionFailedRecordsError transitions to PhaseDownloading and then
// to PhaseFailed with the message "checksum mismatch", and expects the state
// to end in PhaseFailed with LastError holding that message.
func TestTransitionFailedRecordsError(t *testing.T) {
	path := statePath(t)
	s := New()

	// Setup step: surface a failure here rather than in the assertions below.
	if err := s.Transition(path, PhaseDownloading, "v0.3.0", ""); err != nil {
		t.Fatalf("transition to downloading: %v", err)
	}
	// Step under test. A non-nil return means the transition itself could
	// not be completed, which is distinct from the failure it records.
	if err := s.Transition(path, PhaseFailed, "v0.3.0", "checksum mismatch"); err != nil {
		t.Fatalf("transition to failed: %v", err)
	}

	if s.Phase != PhaseFailed {
		t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed)
	}
	if s.LastError != "checksum mismatch" {
		t.Errorf("last_error: got %q, want %q", s.LastError, "checksum mismatch")
	}
}
// TestTransitionSuccessClearsError first transitions to PhaseFailed with the
// message "boom" so that LastError is populated, then transitions to
// PhaseSuccess and expects LastError to be empty.
func TestTransitionSuccessClearsError(t *testing.T) {
	path := statePath(t)
	s := New()

	// Setup: put an error on record.
	if err := s.Transition(path, PhaseFailed, "v0.3.0", "boom"); err != nil {
		t.Fatalf("setup: transition to failed: %v", err)
	}
	if s.LastError == "" {
		t.Fatal("setup: LastError should be non-empty before success")
	}

	// Step under test.
	if err := s.Transition(path, PhaseSuccess, "v0.3.0", ""); err != nil {
		t.Fatalf("transition to success: %v", err)
	}
	if s.LastError != "" {
		t.Errorf("last_error after success: got %q, want empty", s.LastError)
	}
}
// TestRecordError calls RecordError with a "network down" error on a state
// created by New() and expects the state to be in PhaseFailed with LastError
// equal to the error's message.
func TestRecordError(t *testing.T) {
	const msg = "network down"

	path := statePath(t)
	st := New()

	err := st.RecordError(path, errors.New(msg))
	if err != nil {
		t.Fatalf("RecordError: %v", err)
	}

	if phase := st.Phase; phase != PhaseFailed {
		t.Errorf("phase: got %q, want %q", phase, PhaseFailed)
	}
	if recorded := st.LastError; recorded != msg {
		t.Errorf("last_error: got %q, want %q", recorded, msg)
	}
}
// TestSetFromVersionIdempotent calls SetFromVersion twice with different
// values and expects FromVersion to keep the value from the first call.
func TestSetFromVersionIdempotent(t *testing.T) {
	st := New()

	st.SetFromVersion("v0.2.0")
	if got := st.FromVersion; got != "v0.2.0" {
		t.Errorf("from_version: got %q, want v0.2.0", got)
	}

	// A later call with another version must leave the first value in place.
	st.SetFromVersion("v0.1.0")
	if got := st.FromVersion; got != "v0.2.0" {
		t.Errorf("from_version after second SetFromVersion: got %q, want v0.2.0 (immutable)", got)
	}
}
// TestLoadHandlesGarbageFile writes the bytes "not json" to the state path
// and expects Load to return a non-nil error for that file.
func TestLoadHandlesGarbageFile(t *testing.T) {
	path := statePath(t)

	// Seed the file with content that is not valid JSON.
	seedErr := os.WriteFile(path, []byte("not json"), 0o644)
	if seedErr != nil {
		t.Fatalf("seed: %v", seedErr)
	}

	if _, err := Load(path); err == nil {
		t.Error("expected error loading garbage, got nil")
	}
}