feat(update): persistent state machine + lifecycle metrics
Some checks failed
ARM64 Build / Build generic ARM64 disk image (push) Failing after 4s
CI / Go Tests (push) Successful in 1m31s
CI / Shellcheck (push) Successful in 47s
CI / Build Go Binaries (amd64, linux, linux-amd64) (push) Failing after 10s
CI / Build Go Binaries (arm64, linux, linux-arm64) (push) Failing after 16s
Some checks failed
ARM64 Build / Build generic ARM64 disk image (push) Failing after 4s
CI / Go Tests (push) Successful in 1m31s
CI / Shellcheck (push) Successful in 47s
CI / Build Go Binaries (amd64, linux, linux-amd64) (push) Failing after 10s
CI / Build Go Binaries (arm64, linux, linux-arm64) (push) Failing after 16s
Phase 5 of v0.3. Adds an explicit, on-disk state machine to the update agent
so the lifecycle of an attempt is observable end-to-end, instead of being
inferred from logs and side effects.
New package update/pkg/state:
- Phase enum (idle, checking, downloading, staged, activated, verifying,
success, rolled_back, failed)
- UpdateState struct persisted to /var/lib/kubesolo/update/state.json
(overridable via --state). Atomic write (.tmp + rename). Survives reboots
and slot switches because the file lives on the data partition.
- Transition helper that bumps AttemptCount when an attempt starts, resets
it when the target version changes, sets/clears LastError on
failed/success transitions, and stamps StartedAt + UpdatedAt.
- 13 unit tests cover the lifecycle, atomic write, version-change reset,
error recording, idempotent SetFromVersion, garbage-file handling.
Wired into the existing commands:
- apply.go transitions Idle -> Checking -> Downloading -> Staged, with
RecordError on any step failure. Reads the active slot's version file to
populate FromVersion.
- activate.go transitions to Activated.
- healthcheck.go transitions Activated -> Verifying -> Success on pass,
or to Failed on fail. Skips transitions if state isn't post-activation
(manual healthcheck on a stable system shouldn't churn the state).
- rollback.go transitions to RolledBack with LastError="manual rollback".
- check.go intentionally untouched — checks are passive queries, not
attempts; they shouldn't reset AttemptCount.
status.go gains a --json mode that emits the full state report (A/B slots,
boot counter, full UpdateState) for orchestration tooling. Human-readable
mode also prints an Update Lifecycle section when state.phase != idle.
pkg/metrics gains three new series, derived from state.json at scrape time:
- kubesolo_update_phase{phase="..."} — 1 for current, 0 for all others;
all nine phase values always emitted so dashboards see complete series
- kubesolo_update_attempts_total
- kubesolo_update_last_attempt_timestamp_seconds
Server.SetStatePath() configures the file location; it is unset by default,
in which case the Idle defaults are emitted. Three new tests cover the
absent / active / all-phases-emitted cases.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
197
update/pkg/state/state_test.go
Normal file
197
update/pkg/state/state_test.go
Normal file
@@ -0,0 +1,197 @@
|
||||
package state
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// statePath builds the location of a state file for the calling test. The
// path points at "state.json" inside a directory obtained from t.TempDir();
// the file itself is not created here.
func statePath(t *testing.T) string {
	t.Helper()
	dir := t.TempDir()
	return filepath.Join(dir, "state.json")
}
|
||||
|
||||
func TestLoadMissingReturnsIdle(t *testing.T) {
|
||||
s, err := Load(filepath.Join(t.TempDir(), "does-not-exist.json"))
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error loading missing state: %v", err)
|
||||
}
|
||||
if s.Phase != PhaseIdle {
|
||||
t.Errorf("missing file: phase=%q, want %q", s.Phase, PhaseIdle)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSaveLoadRoundTrip(t *testing.T) {
|
||||
path := statePath(t)
|
||||
in := &UpdateState{
|
||||
Phase: PhaseStaged,
|
||||
FromVersion: "v0.2.0",
|
||||
ToVersion: "v0.3.0",
|
||||
AttemptCount: 1,
|
||||
}
|
||||
if err := in.Save(path); err != nil {
|
||||
t.Fatalf("save: %v", err)
|
||||
}
|
||||
out, err := Load(path)
|
||||
if err != nil {
|
||||
t.Fatalf("load: %v", err)
|
||||
}
|
||||
if out.Phase != in.Phase {
|
||||
t.Errorf("phase: got %q, want %q", out.Phase, in.Phase)
|
||||
}
|
||||
if out.FromVersion != in.FromVersion {
|
||||
t.Errorf("from_version: got %q, want %q", out.FromVersion, in.FromVersion)
|
||||
}
|
||||
if out.ToVersion != in.ToVersion {
|
||||
t.Errorf("to_version: got %q, want %q", out.ToVersion, in.ToVersion)
|
||||
}
|
||||
if out.AttemptCount != in.AttemptCount {
|
||||
t.Errorf("attempt_count: got %d, want %d", out.AttemptCount, in.AttemptCount)
|
||||
}
|
||||
if out.UpdatedAt.IsZero() {
|
||||
t.Error("UpdatedAt should be set by Save")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSaveRejectsInvalidPhase(t *testing.T) {
|
||||
s := &UpdateState{Phase: Phase("bogus")}
|
||||
err := s.Save(statePath(t))
|
||||
if err == nil {
|
||||
t.Fatal("expected error saving invalid phase, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSaveIsAtomic(t *testing.T) {
|
||||
// After Save, the .tmp file should NOT exist — confirming we renamed it.
|
||||
path := statePath(t)
|
||||
s := New()
|
||||
if err := s.Save(path); err != nil {
|
||||
t.Fatalf("save: %v", err)
|
||||
}
|
||||
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||
t.Errorf("tmp file still present after Save: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSaveCreatesDirectory(t *testing.T) {
|
||||
// State directory may not exist yet (first-ever boot). Save() should mkdir.
|
||||
dir := filepath.Join(t.TempDir(), "fresh", "subdir")
|
||||
path := filepath.Join(dir, "state.json")
|
||||
if err := New().Save(path); err != nil {
|
||||
t.Fatalf("save into nonexistent dir: %v", err)
|
||||
}
|
||||
if _, err := os.Stat(path); err != nil {
|
||||
t.Errorf("state file not present after Save: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTransitionIdleToChecking(t *testing.T) {
|
||||
path := statePath(t)
|
||||
s := New()
|
||||
if err := s.Transition(path, PhaseChecking, "v0.3.0", ""); err != nil {
|
||||
t.Fatalf("transition: %v", err)
|
||||
}
|
||||
if s.Phase != PhaseChecking {
|
||||
t.Errorf("phase: got %q, want %q", s.Phase, PhaseChecking)
|
||||
}
|
||||
if s.ToVersion != "v0.3.0" {
|
||||
t.Errorf("to_version: got %q, want v0.3.0", s.ToVersion)
|
||||
}
|
||||
if s.AttemptCount != 1 {
|
||||
t.Errorf("attempt_count: got %d, want 1 (first attempt after Idle)", s.AttemptCount)
|
||||
}
|
||||
if s.StartedAt.IsZero() {
|
||||
t.Error("StartedAt should be set when leaving Idle")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTransitionRetainsAttemptCountWithinAttempt(t *testing.T) {
|
||||
path := statePath(t)
|
||||
s := New()
|
||||
_ = s.Transition(path, PhaseChecking, "v0.3.0", "")
|
||||
_ = s.Transition(path, PhaseDownloading, "v0.3.0", "")
|
||||
_ = s.Transition(path, PhaseStaged, "v0.3.0", "")
|
||||
if s.AttemptCount != 1 {
|
||||
t.Errorf("attempt_count after staying on same version: got %d, want 1", s.AttemptCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTransitionResetsAttemptCountOnNewVersion(t *testing.T) {
|
||||
path := statePath(t)
|
||||
s := New()
|
||||
_ = s.Transition(path, PhaseChecking, "v0.3.0", "")
|
||||
// Now an attempt at a NEW version starts. AttemptCount should reset.
|
||||
_ = s.Transition(path, PhaseChecking, "v0.4.0", "")
|
||||
if s.ToVersion != "v0.4.0" {
|
||||
t.Errorf("to_version: got %q, want v0.4.0", s.ToVersion)
|
||||
}
|
||||
if s.AttemptCount != 0 {
|
||||
t.Errorf("attempt_count after new ToVersion: got %d, want 0 (reset)", s.AttemptCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTransitionFailedRecordsError(t *testing.T) {
|
||||
path := statePath(t)
|
||||
s := New()
|
||||
_ = s.Transition(path, PhaseDownloading, "v0.3.0", "")
|
||||
_ = s.Transition(path, PhaseFailed, "v0.3.0", "checksum mismatch")
|
||||
if s.Phase != PhaseFailed {
|
||||
t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed)
|
||||
}
|
||||
if s.LastError != "checksum mismatch" {
|
||||
t.Errorf("last_error: got %q, want %q", s.LastError, "checksum mismatch")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTransitionSuccessClearsError(t *testing.T) {
|
||||
path := statePath(t)
|
||||
s := New()
|
||||
_ = s.Transition(path, PhaseFailed, "v0.3.0", "boom")
|
||||
if s.LastError == "" {
|
||||
t.Fatal("setup: LastError should be non-empty before success")
|
||||
}
|
||||
_ = s.Transition(path, PhaseSuccess, "v0.3.0", "")
|
||||
if s.LastError != "" {
|
||||
t.Errorf("last_error after success: got %q, want empty", s.LastError)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecordError(t *testing.T) {
|
||||
path := statePath(t)
|
||||
s := New()
|
||||
if err := s.RecordError(path, errors.New("network down")); err != nil {
|
||||
t.Fatalf("RecordError: %v", err)
|
||||
}
|
||||
if s.Phase != PhaseFailed {
|
||||
t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed)
|
||||
}
|
||||
if s.LastError != "network down" {
|
||||
t.Errorf("last_error: got %q, want %q", s.LastError, "network down")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSetFromVersionIdempotent(t *testing.T) {
|
||||
s := New()
|
||||
s.SetFromVersion("v0.2.0")
|
||||
if s.FromVersion != "v0.2.0" {
|
||||
t.Errorf("from_version: got %q, want v0.2.0", s.FromVersion)
|
||||
}
|
||||
// Second call should not overwrite.
|
||||
s.SetFromVersion("v0.1.0")
|
||||
if s.FromVersion != "v0.2.0" {
|
||||
t.Errorf("from_version after second SetFromVersion: got %q, want v0.2.0 (immutable)", s.FromVersion)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadHandlesGarbageFile(t *testing.T) {
|
||||
path := statePath(t)
|
||||
if err := os.WriteFile(path, []byte("not json"), 0o644); err != nil {
|
||||
t.Fatalf("seed: %v", err)
|
||||
}
|
||||
_, err := Load(path)
|
||||
if err == nil {
|
||||
t.Error("expected error loading garbage, got nil")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user