// Package state tracks the lifecycle of an OS update on disk.
//
// The state file (default /var/lib/kubesolo/update/state.json) records which
// phase the agent is in, what versions are involved, when the attempt started,
// any error from the last operation, and how many attempts have been made.
// Updates are atomic via tmp+rename, so a crash mid-write doesn't corrupt the
// state.
//
// Consumers:
//   - cmd/check, cmd/apply, cmd/activate, cmd/healthcheck, cmd/rollback —
//     transition the phase as they enter / leave their operations.
//   - cmd/status --json — emits the raw state for orchestration tooling.
//   - pkg/metrics — reads the state at scrape time to expose phase and
//     attempt-count gauges.
package state

import (
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"time"
)

// DefaultPath is where state.json lives on a live system. The directory is on
// the persistent data partition so the file survives A/B slot switches.
const DefaultPath = "/var/lib/kubesolo/update/state.json"

// Phase represents the current step in the update lifecycle.
//
// Terminal phases (Success, RolledBack, Failed) describe the outcome of the
// most recent attempt; transient phases (Checking, Downloading, Staged,
// Activated, Verifying) describe in-progress work. Idle means no update has
// been attempted yet, or the previous attempt has been acknowledged.
//
// The string value of each constant is what gets written to the "phase" key
// of the state file.
type Phase string

const (
	// PhaseIdle — no update in progress.
	PhaseIdle Phase = "idle"

	// PhaseChecking — querying the update server for new versions.
	PhaseChecking Phase = "checking"

	// PhaseDownloading — pulling artifacts from the server.
	PhaseDownloading Phase = "downloading"

	// PhaseStaged — artifacts written to the passive partition; not yet active.
	PhaseStaged Phase = "staged"

	// PhaseActivated — passive slot promoted; next boot will use the new version.
	PhaseActivated Phase = "activated"

	// PhaseVerifying — post-boot healthcheck in progress on the new version.
	PhaseVerifying Phase = "verifying"

	// PhaseSuccess — last attempt completed and verified.
	PhaseSuccess Phase = "success"

	// PhaseRolledBack — last attempt failed verification; reverted to prior slot.
	PhaseRolledBack Phase = "rolled_back"

	// PhaseFailed — last attempt failed before reaching activation (download,
	// checksum, signature, etc.). System still on the original slot.
	PhaseFailed Phase = "failed"
)

// validPhases lists every legal Phase value. Anything not in this set is
// rejected by Save() to catch typos. The zero value "" is deliberately absent,
// so an UpdateState whose Phase was never set cannot be saved.
var validPhases = map[Phase]struct{}{
	PhaseIdle:        {},
	PhaseChecking:    {},
	PhaseDownloading: {},
	PhaseStaged:      {},
	PhaseActivated:   {},
	PhaseVerifying:   {},
	PhaseSuccess:     {},
	PhaseRolledBack:  {},
	PhaseFailed:      {},
}

// UpdateState is the on-disk representation. Fields use JSON tags so the
// file format is forward-compatible (extra fields ignored, missing fields
// default to their zero value).
type UpdateState struct {
	// Phase is the current lifecycle position.
	Phase Phase `json:"phase"`

	// FromVersion is the version the system was running before the attempt.
	// Empty when no attempt has run.
	FromVersion string `json:"from_version,omitempty"`

	// ToVersion is the version the attempt is targeting.
	// Empty when no attempt has run.
	ToVersion string `json:"to_version,omitempty"`

	// StartedAt is when the current attempt entered a non-Idle phase.
	//
	// NOTE: encoding/json's omitempty never treats a struct value as empty,
	// so a zero StartedAt is still written (as "0001-01-01T00:00:00Z").
	StartedAt time.Time `json:"started_at,omitempty"`

	// UpdatedAt is the last time the file was written. Always set on Save().
	UpdatedAt time.Time `json:"updated_at"`

	// LastError carries the most recent operation error, populated when
	// transitioning to PhaseFailed or PhaseRolledBack. Cleared on Success/Idle.
	LastError string `json:"last_error,omitempty"`

	// AttemptCount counts attempts at the current ToVersion. Transition
	// resets it to zero when it is handed a different ToVersion and
	// increments it whenever an attempt leaves PhaseIdle.
	//
	// NOTE(review): nothing in this file resets the counter on successful
	// completion; if that is required, the caller has to do it — confirm.
	AttemptCount int `json:"attempt_count"`
}

func New() *UpdateState { return &UpdateState{ Phase: PhaseIdle, UpdatedAt: time.Now().UTC(), } } // Load reads the state from disk. If the file does not exist, returns a fresh // Idle state — this is the normal first-run case, not an error. func Load(path string) (*UpdateState, error) { data, err := os.ReadFile(path) if err != nil { if os.IsNotExist(err) { return New(), nil } return nil, fmt.Errorf("read state %s: %w", path, err) } var s UpdateState if err := json.Unmarshal(data, &s); err != nil { return nil, fmt.Errorf("parse state %s: %w", path, err) } return &s, nil } // Save writes the state to disk atomically (tmp file + rename), so an // interrupted write never leaves a partial file at `path`. func (s *UpdateState) Save(path string) error { if _, ok := validPhases[s.Phase]; !ok { return fmt.Errorf("invalid phase %q", s.Phase) } s.UpdatedAt = time.Now().UTC() if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { return fmt.Errorf("creating state dir: %w", err) } data, err := json.MarshalIndent(s, "", " ") if err != nil { return fmt.Errorf("marshal state: %w", err) } data = append(data, '\n') tmp := path + ".tmp" if err := os.WriteFile(tmp, data, 0o644); err != nil { return fmt.Errorf("write tmp state: %w", err) } if err := os.Rename(tmp, path); err != nil { _ = os.Remove(tmp) return fmt.Errorf("rename state: %w", err) } return nil } // Transition moves the state to phase `next` and persists it. If `next` // targets a new ToVersion (different from the current one), AttemptCount is // reset to 1; otherwise it is left untouched. StartedAt is set when // transitioning out of Idle. LastError is cleared unless `next` is Failed or // RolledBack. func (s *UpdateState) Transition(path string, next Phase, toVersion, errMsg string) error { now := time.Now().UTC() // Reset attempt counter when targeting a new version. 
if toVersion != "" && toVersion != s.ToVersion { s.ToVersion = toVersion s.AttemptCount = 0 } // First non-Idle phase of an attempt: record start time and bump count. if s.Phase == PhaseIdle && next != PhaseIdle { s.StartedAt = now s.AttemptCount++ } s.Phase = next switch next { case PhaseFailed, PhaseRolledBack: if errMsg != "" { s.LastError = errMsg } case PhaseSuccess, PhaseIdle: s.LastError = "" } return s.Save(path) } // RecordError marks the state as failed with the given error and saves. // Convenience wrapper around Transition for the most common failure path. func (s *UpdateState) RecordError(path string, err error) error { msg := "" if err != nil { msg = err.Error() } return s.Transition(path, PhaseFailed, "", msg) } // SetFromVersion records the version the system was running when an attempt // started. Idempotent; only takes effect when From is empty. func (s *UpdateState) SetFromVersion(v string) { if s.FromVersion == "" { s.FromVersion = v } }