feat(update): persistent state machine + lifecycle metrics
Some checks failed
ARM64 Build / Build generic ARM64 disk image (push) Failing after 4s
CI / Go Tests (push) Successful in 1m31s
CI / Shellcheck (push) Successful in 47s
CI / Build Go Binaries (amd64, linux, linux-amd64) (push) Failing after 10s
CI / Build Go Binaries (arm64, linux, linux-arm64) (push) Failing after 16s

Phase 5 of v0.3. Adds an explicit, on-disk state machine to the update agent
so the lifecycle of an attempt is observable end-to-end, instead of being
inferred from logs and side effects.

New package update/pkg/state:
- Phase enum (idle, checking, downloading, staged, activated, verifying,
  success, rolled_back, failed)
- UpdateState struct persisted to /var/lib/kubesolo/update/state.json
  (overridable via --state). Atomic write (.tmp + rename). Survives reboots
  and slot switches because the file lives on the data partition.
- Transition helper that bumps AttemptCount when an attempt starts, resets
  it when the target version changes, sets/clears LastError on
  failed/success transitions, and stamps StartedAt + UpdatedAt.
- 13 unit tests cover the lifecycle, atomic write, version-change reset,
  error recording, idempotent SetFromVersion, garbage-file handling.

Wired into the existing commands:
- apply.go transitions Idle -> Checking -> Downloading -> Staged, with
  RecordError on any step failure. Reads the active slot's version file to
  populate FromVersion.
- activate.go transitions to Activated.
- healthcheck.go transitions Activated -> Verifying -> Success on pass,
  or to Failed on fail. Skips transitions if state isn't post-activation
  (manual healthcheck on a stable system shouldn't churn the state).
- rollback.go transitions to RolledBack with LastError="manual rollback".
- check.go intentionally untouched — checks are passive queries, not
  attempts; they shouldn't reset AttemptCount.

status.go gains a --json mode that emits the full state report (A/B slots,
boot counter, full UpdateState) for orchestration tooling. Human-readable
mode also prints an Update Lifecycle section when state.phase != idle.

pkg/metrics gains three new series, derived from state.json at scrape time:
- kubesolo_update_phase{phase="..."} — 1 for current, 0 for all others;
  all nine phase values always emitted so dashboards see complete series
- kubesolo_update_attempts_total
- kubesolo_update_last_attempt_timestamp_seconds
Server.SetStatePath() configures the file location; defaults to absent
which emits Idle defaults. Three new tests cover the absent / active /
all-phases-emitted cases.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-14 18:11:47 -06:00
parent 0c6e200585
commit bce565e2f7
12 changed files with 726 additions and 0 deletions

View File

@@ -11,6 +11,9 @@
// kubesolo_os_update_last_check_timestamp_seconds unix timestamp (gauge)
// kubesolo_os_memory_total_bytes total RAM (gauge)
// kubesolo_os_memory_available_bytes available RAM (gauge)
// kubesolo_update_phase{phase} 1 for current phase, 0 for others
// kubesolo_update_attempts_total counter — attempts at current ToVersion
// kubesolo_update_last_attempt_timestamp_seconds unix timestamp of last state update
//
// This is a zero-dependency implementation — no Prometheus client library needed.
// It serves metrics in the Prometheus text exposition format.
@@ -25,11 +28,14 @@ import (
"strings"
"sync"
"time"
"github.com/portainer/kubesolo-os/update/pkg/state"
)
// Server is a lightweight Prometheus metrics HTTP server.
type Server struct {
grubenvPath string
statePath string
listenAddr string
startTime time.Time
@@ -47,6 +53,27 @@ func NewServer(listenAddr, grubenvPath string) *Server {
}
}
// SetStatePath sets the location of the update state.json file. If empty or
// unset, state-derived metrics are emitted with the Idle defaults.
func (s *Server) SetStatePath(p string) {
s.statePath = p
}
// allPhases lists every Phase value we emit as a kubesolo_update_phase
// time-series, so consumers see all label values (with value 0 for non-current
// phases). Mirror of validPhases in pkg/state.
var allPhases = []state.Phase{
state.PhaseIdle,
state.PhaseChecking,
state.PhaseDownloading,
state.PhaseStaged,
state.PhaseActivated,
state.PhaseVerifying,
state.PhaseSuccess,
state.PhaseRolledBack,
state.PhaseFailed,
}
// SetUpdateAvailable records whether an update is available.
func (s *Server) SetUpdateAvailable(available bool) {
s.mu.Lock()
@@ -125,9 +152,49 @@ func (s *Server) handleMetrics(w http.ResponseWriter, r *http.Request) {
sb.WriteString("# TYPE kubesolo_os_memory_available_bytes gauge\n")
sb.WriteString(fmt.Sprintf("kubesolo_os_memory_available_bytes %d\n", memAvail))
// Update lifecycle (from state.json)
s.writeUpdateStateMetrics(&sb)
fmt.Fprint(w, sb.String())
}
// writeUpdateStateMetrics appends update-lifecycle metrics derived from the
// state.json file. If the file is missing or unreadable, emits the Idle
// defaults so the metric series exists at all times.
func (s *Server) writeUpdateStateMetrics(sb *strings.Builder) {
current := state.PhaseIdle
var attempts int
var lastTS float64
if s.statePath != "" {
if st, err := state.Load(s.statePath); err == nil && st != nil {
current = st.Phase
attempts = st.AttemptCount
if !st.UpdatedAt.IsZero() {
lastTS = float64(st.UpdatedAt.Unix())
}
}
}
sb.WriteString("# HELP kubesolo_update_phase Current update lifecycle phase (1 for active, 0 otherwise).\n")
sb.WriteString("# TYPE kubesolo_update_phase gauge\n")
for _, p := range allPhases {
v := 0
if p == current {
v = 1
}
sb.WriteString(fmt.Sprintf("kubesolo_update_phase{phase=%q} %d\n", string(p), v))
}
sb.WriteString("# HELP kubesolo_update_attempts_total Number of update attempts at the current target version.\n")
sb.WriteString("# TYPE kubesolo_update_attempts_total counter\n")
sb.WriteString(fmt.Sprintf("kubesolo_update_attempts_total %d\n", attempts))
sb.WriteString("# HELP kubesolo_update_last_attempt_timestamp_seconds Unix timestamp of the last state transition.\n")
sb.WriteString("# TYPE kubesolo_update_last_attempt_timestamp_seconds gauge\n")
sb.WriteString(fmt.Sprintf("kubesolo_update_last_attempt_timestamp_seconds %.0f\n", lastTS))
}
// readGrubenvVar reads a single variable from grubenv using simple file parse.
func (s *Server) readGrubenvVar(key string) string {
data, err := os.ReadFile(s.grubenvPath)

View File

@@ -8,6 +8,8 @@ import (
"path/filepath"
"strings"
"testing"
"github.com/portainer/kubesolo-os/update/pkg/state"
)
func TestNewServer(t *testing.T) {
@@ -247,6 +249,86 @@ func TestSafeInt(t *testing.T) {
}
}
func TestUpdateStateMetricsAbsentStateFile(t *testing.T) {
// No state path set — should emit Idle defaults so the metric series
// exists from first boot.
s := NewServer(":9100", "/tmp/nonexistent")
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
w := httptest.NewRecorder()
s.handleMetrics(w, req)
body, _ := io.ReadAll(w.Result().Body)
output := string(body)
if !strings.Contains(output, `kubesolo_update_phase{phase="idle"} 1`) {
t.Errorf("expected idle=1 with no state file, got:\n%s", output)
}
if !strings.Contains(output, `kubesolo_update_phase{phase="checking"} 0`) {
t.Errorf("expected checking=0 with no state file, got:\n%s", output)
}
if !strings.Contains(output, "kubesolo_update_attempts_total 0") {
t.Errorf("expected attempts=0 with no state file, got:\n%s", output)
}
}
func TestUpdateStateMetricsActivePhase(t *testing.T) {
dir := t.TempDir()
statePath := filepath.Join(dir, "state.json")
st := state.New()
if err := st.Transition(statePath, state.PhaseDownloading, "v0.3.0", ""); err != nil {
t.Fatalf("seed state: %v", err)
}
s := NewServer(":9100", "/tmp/nonexistent")
s.SetStatePath(statePath)
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
w := httptest.NewRecorder()
s.handleMetrics(w, req)
body, _ := io.ReadAll(w.Result().Body)
output := string(body)
if !strings.Contains(output, `kubesolo_update_phase{phase="downloading"} 1`) {
t.Errorf("expected downloading=1, got:\n%s", output)
}
if !strings.Contains(output, `kubesolo_update_phase{phase="idle"} 0`) {
t.Errorf("expected idle=0 when downloading is active, got:\n%s", output)
}
if !strings.Contains(output, "kubesolo_update_attempts_total 1") {
t.Errorf("expected attempts=1 after first Transition, got:\n%s", output)
}
if strings.Contains(output, "kubesolo_update_last_attempt_timestamp_seconds 0\n") {
t.Errorf("expected non-zero timestamp after state write, got:\n%s", output)
}
}
func TestUpdateStateMetricsAllPhasesEmitted(t *testing.T) {
// Every phase value should appear in the output, so dashboards can graph
// the series cleanly.
s := NewServer(":9100", "/tmp/nonexistent")
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
w := httptest.NewRecorder()
s.handleMetrics(w, req)
body, _ := io.ReadAll(w.Result().Body)
output := string(body)
for _, p := range []state.Phase{
state.PhaseIdle, state.PhaseChecking, state.PhaseDownloading,
state.PhaseStaged, state.PhaseActivated, state.PhaseVerifying,
state.PhaseSuccess, state.PhaseRolledBack, state.PhaseFailed,
} {
needle := `kubesolo_update_phase{phase="` + string(p) + `"}`
if !strings.Contains(output, needle) {
t.Errorf("phase %q not present in metrics output", p)
}
}
}
func TestReadFileString(t *testing.T) {
dir := t.TempDir()

200
update/pkg/state/state.go Normal file
View File

@@ -0,0 +1,200 @@
// Package state tracks the lifecycle of an OS update on disk.
//
// The state file (default /var/lib/kubesolo/update/state.json) records which
// phase the agent is in, what versions are involved, when the attempt started,
// any error from the last operation, and how many attempts have been made.
// Updates are atomic via tmp+rename, so a crash mid-write doesn't corrupt the
// state.
//
// Consumers:
// - cmd/check, cmd/apply, cmd/activate, cmd/healthcheck, cmd/rollback —
// transition the phase as they enter / leave their operations.
// - cmd/status --json — emits the raw state for orchestration tooling.
// - pkg/metrics — reads the state at scrape time to expose phase and
// attempt-count gauges.
package state
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"time"
)
// DefaultPath is where state.json lives on a live system. The directory is on
// the persistent data partition so the file survives A/B slot switches.
const DefaultPath = "/var/lib/kubesolo/update/state.json"
// Phase represents the current step in the update lifecycle.
//
// Terminal phases (Success, RolledBack, Failed) describe the outcome of the
// most recent attempt; transient phases (Checking, Downloading, Staged,
// Activated, Verifying) describe in-progress work. Idle means no update has
// been attempted yet, or the previous attempt has been acknowledged.
type Phase string
const (
// PhaseIdle — no update in progress.
PhaseIdle Phase = "idle"
// PhaseChecking — querying the update server for new versions.
PhaseChecking Phase = "checking"
// PhaseDownloading — pulling artifacts from the server.
PhaseDownloading Phase = "downloading"
// PhaseStaged — artifacts written to the passive partition; not yet active.
PhaseStaged Phase = "staged"
// PhaseActivated — passive slot promoted; next boot will use the new version.
PhaseActivated Phase = "activated"
// PhaseVerifying — post-boot healthcheck in progress on the new version.
PhaseVerifying Phase = "verifying"
// PhaseSuccess — last attempt completed and verified.
PhaseSuccess Phase = "success"
// PhaseRolledBack — last attempt failed verification; reverted to prior slot.
PhaseRolledBack Phase = "rolled_back"
// PhaseFailed — last attempt failed before reaching activation (download,
// checksum, signature, etc.). System still on the original slot.
PhaseFailed Phase = "failed"
)
// validPhases lists every legal Phase value. Anything not in this set is
// rejected by Save() to catch typos.
var validPhases = map[Phase]struct{}{
PhaseIdle: {},
PhaseChecking: {},
PhaseDownloading: {},
PhaseStaged: {},
PhaseActivated: {},
PhaseVerifying: {},
PhaseSuccess: {},
PhaseRolledBack: {},
PhaseFailed: {},
}
// UpdateState is the on-disk representation. Fields use JSON tags so the
// file format is forward-compatible (extra fields ignored, missing fields
// default).
type UpdateState struct {
// Phase is the current lifecycle position.
Phase Phase `json:"phase"`
// FromVersion is the version the system was running before the attempt.
// Empty when no attempt has run.
FromVersion string `json:"from_version,omitempty"`
// ToVersion is the version the attempt is targeting.
// Empty when no attempt has run.
ToVersion string `json:"to_version,omitempty"`
// StartedAt is when the current attempt entered a non-Idle phase.
StartedAt time.Time `json:"started_at,omitempty"`
// UpdatedAt is the last time the file was written. Always set on Save().
UpdatedAt time.Time `json:"updated_at"`
// LastError carries the most recent operation error, populated when
// transitioning to PhaseFailed or PhaseRolledBack. Cleared on Success/Idle.
LastError string `json:"last_error,omitempty"`
// AttemptCount counts attempts at the current ToVersion. Reset when
// ToVersion changes or on successful completion.
AttemptCount int `json:"attempt_count"`
}
// New returns a fresh Idle state with UpdatedAt set to now.
func New() *UpdateState {
return &UpdateState{
Phase: PhaseIdle,
UpdatedAt: time.Now().UTC(),
}
}
// Load reads the state from disk. If the file does not exist, returns a fresh
// Idle state — this is the normal first-run case, not an error.
func Load(path string) (*UpdateState, error) {
data, err := os.ReadFile(path)
if err != nil {
if os.IsNotExist(err) {
return New(), nil
}
return nil, fmt.Errorf("read state %s: %w", path, err)
}
var s UpdateState
if err := json.Unmarshal(data, &s); err != nil {
return nil, fmt.Errorf("parse state %s: %w", path, err)
}
return &s, nil
}
// Save writes the state to disk atomically (tmp file + rename), so an
// interrupted write never leaves a partial file at `path`.
func (s *UpdateState) Save(path string) error {
if _, ok := validPhases[s.Phase]; !ok {
return fmt.Errorf("invalid phase %q", s.Phase)
}
s.UpdatedAt = time.Now().UTC()
if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
return fmt.Errorf("creating state dir: %w", err)
}
data, err := json.MarshalIndent(s, "", " ")
if err != nil {
return fmt.Errorf("marshal state: %w", err)
}
data = append(data, '\n')
tmp := path + ".tmp"
if err := os.WriteFile(tmp, data, 0o644); err != nil {
return fmt.Errorf("write tmp state: %w", err)
}
if err := os.Rename(tmp, path); err != nil {
_ = os.Remove(tmp)
return fmt.Errorf("rename state: %w", err)
}
return nil
}
// Transition moves the state to phase `next` and persists it. If `next`
// targets a new ToVersion (different from the current one), AttemptCount is
// reset to 1; otherwise it is left untouched. StartedAt is set when
// transitioning out of Idle. LastError is cleared unless `next` is Failed or
// RolledBack.
func (s *UpdateState) Transition(path string, next Phase, toVersion, errMsg string) error {
now := time.Now().UTC()
// Reset attempt counter when targeting a new version.
if toVersion != "" && toVersion != s.ToVersion {
s.ToVersion = toVersion
s.AttemptCount = 0
}
// First non-Idle phase of an attempt: record start time and bump count.
if s.Phase == PhaseIdle && next != PhaseIdle {
s.StartedAt = now
s.AttemptCount++
}
s.Phase = next
switch next {
case PhaseFailed, PhaseRolledBack:
if errMsg != "" {
s.LastError = errMsg
}
case PhaseSuccess, PhaseIdle:
s.LastError = ""
}
return s.Save(path)
}
// RecordError marks the state as failed with the given error and saves.
// Convenience wrapper around Transition for the most common failure path.
func (s *UpdateState) RecordError(path string, err error) error {
msg := ""
if err != nil {
msg = err.Error()
}
return s.Transition(path, PhaseFailed, "", msg)
}
// SetFromVersion records the version the system was running when an attempt
// started. Idempotent; only takes effect when From is empty.
func (s *UpdateState) SetFromVersion(v string) {
if s.FromVersion == "" {
s.FromVersion = v
}
}

View File

@@ -0,0 +1,197 @@
package state
import (
"errors"
"os"
"path/filepath"
"testing"
)
// statePath returns a per-test state file path inside t.TempDir().
func statePath(t *testing.T) string {
t.Helper()
return filepath.Join(t.TempDir(), "state.json")
}
func TestLoadMissingReturnsIdle(t *testing.T) {
s, err := Load(filepath.Join(t.TempDir(), "does-not-exist.json"))
if err != nil {
t.Fatalf("unexpected error loading missing state: %v", err)
}
if s.Phase != PhaseIdle {
t.Errorf("missing file: phase=%q, want %q", s.Phase, PhaseIdle)
}
}
func TestSaveLoadRoundTrip(t *testing.T) {
path := statePath(t)
in := &UpdateState{
Phase: PhaseStaged,
FromVersion: "v0.2.0",
ToVersion: "v0.3.0",
AttemptCount: 1,
}
if err := in.Save(path); err != nil {
t.Fatalf("save: %v", err)
}
out, err := Load(path)
if err != nil {
t.Fatalf("load: %v", err)
}
if out.Phase != in.Phase {
t.Errorf("phase: got %q, want %q", out.Phase, in.Phase)
}
if out.FromVersion != in.FromVersion {
t.Errorf("from_version: got %q, want %q", out.FromVersion, in.FromVersion)
}
if out.ToVersion != in.ToVersion {
t.Errorf("to_version: got %q, want %q", out.ToVersion, in.ToVersion)
}
if out.AttemptCount != in.AttemptCount {
t.Errorf("attempt_count: got %d, want %d", out.AttemptCount, in.AttemptCount)
}
if out.UpdatedAt.IsZero() {
t.Error("UpdatedAt should be set by Save")
}
}
func TestSaveRejectsInvalidPhase(t *testing.T) {
s := &UpdateState{Phase: Phase("bogus")}
err := s.Save(statePath(t))
if err == nil {
t.Fatal("expected error saving invalid phase, got nil")
}
}
func TestSaveIsAtomic(t *testing.T) {
// After Save, the .tmp file should NOT exist — confirming we renamed it.
path := statePath(t)
s := New()
if err := s.Save(path); err != nil {
t.Fatalf("save: %v", err)
}
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
t.Errorf("tmp file still present after Save: %v", err)
}
}
func TestSaveCreatesDirectory(t *testing.T) {
// State directory may not exist yet (first-ever boot). Save() should mkdir.
dir := filepath.Join(t.TempDir(), "fresh", "subdir")
path := filepath.Join(dir, "state.json")
if err := New().Save(path); err != nil {
t.Fatalf("save into nonexistent dir: %v", err)
}
if _, err := os.Stat(path); err != nil {
t.Errorf("state file not present after Save: %v", err)
}
}
func TestTransitionIdleToChecking(t *testing.T) {
path := statePath(t)
s := New()
if err := s.Transition(path, PhaseChecking, "v0.3.0", ""); err != nil {
t.Fatalf("transition: %v", err)
}
if s.Phase != PhaseChecking {
t.Errorf("phase: got %q, want %q", s.Phase, PhaseChecking)
}
if s.ToVersion != "v0.3.0" {
t.Errorf("to_version: got %q, want v0.3.0", s.ToVersion)
}
if s.AttemptCount != 1 {
t.Errorf("attempt_count: got %d, want 1 (first attempt after Idle)", s.AttemptCount)
}
if s.StartedAt.IsZero() {
t.Error("StartedAt should be set when leaving Idle")
}
}
func TestTransitionRetainsAttemptCountWithinAttempt(t *testing.T) {
path := statePath(t)
s := New()
_ = s.Transition(path, PhaseChecking, "v0.3.0", "")
_ = s.Transition(path, PhaseDownloading, "v0.3.0", "")
_ = s.Transition(path, PhaseStaged, "v0.3.0", "")
if s.AttemptCount != 1 {
t.Errorf("attempt_count after staying on same version: got %d, want 1", s.AttemptCount)
}
}
func TestTransitionResetsAttemptCountOnNewVersion(t *testing.T) {
path := statePath(t)
s := New()
_ = s.Transition(path, PhaseChecking, "v0.3.0", "")
// Now an attempt at a NEW version starts. AttemptCount should reset.
_ = s.Transition(path, PhaseChecking, "v0.4.0", "")
if s.ToVersion != "v0.4.0" {
t.Errorf("to_version: got %q, want v0.4.0", s.ToVersion)
}
if s.AttemptCount != 0 {
t.Errorf("attempt_count after new ToVersion: got %d, want 0 (reset)", s.AttemptCount)
}
}
func TestTransitionFailedRecordsError(t *testing.T) {
path := statePath(t)
s := New()
_ = s.Transition(path, PhaseDownloading, "v0.3.0", "")
_ = s.Transition(path, PhaseFailed, "v0.3.0", "checksum mismatch")
if s.Phase != PhaseFailed {
t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed)
}
if s.LastError != "checksum mismatch" {
t.Errorf("last_error: got %q, want %q", s.LastError, "checksum mismatch")
}
}
func TestTransitionSuccessClearsError(t *testing.T) {
path := statePath(t)
s := New()
_ = s.Transition(path, PhaseFailed, "v0.3.0", "boom")
if s.LastError == "" {
t.Fatal("setup: LastError should be non-empty before success")
}
_ = s.Transition(path, PhaseSuccess, "v0.3.0", "")
if s.LastError != "" {
t.Errorf("last_error after success: got %q, want empty", s.LastError)
}
}
func TestRecordError(t *testing.T) {
path := statePath(t)
s := New()
if err := s.RecordError(path, errors.New("network down")); err != nil {
t.Fatalf("RecordError: %v", err)
}
if s.Phase != PhaseFailed {
t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed)
}
if s.LastError != "network down" {
t.Errorf("last_error: got %q, want %q", s.LastError, "network down")
}
}
func TestSetFromVersionIdempotent(t *testing.T) {
s := New()
s.SetFromVersion("v0.2.0")
if s.FromVersion != "v0.2.0" {
t.Errorf("from_version: got %q, want v0.2.0", s.FromVersion)
}
// Second call should not overwrite.
s.SetFromVersion("v0.1.0")
if s.FromVersion != "v0.2.0" {
t.Errorf("from_version after second SetFromVersion: got %q, want v0.2.0 (immutable)", s.FromVersion)
}
}
func TestLoadHandlesGarbageFile(t *testing.T) {
path := statePath(t)
if err := os.WriteFile(path, []byte("not json"), 0o644); err != nil {
t.Fatalf("seed: %v", err)
}
_, err := Load(path)
if err == nil {
t.Error("expected error loading garbage, got nil")
}
}