diff --git a/update/cmd/activate.go b/update/cmd/activate.go index e2dc134..420750b 100644 --- a/update/cmd/activate.go +++ b/update/cmd/activate.go @@ -3,23 +3,35 @@ package cmd import ( "fmt" "log/slog" + + "github.com/portainer/kubesolo-os/update/pkg/state" ) // Activate switches the boot target to the passive partition. // After activation, the next reboot will boot from the new partition // with boot_counter=3. If health checks fail 3 times, GRUB auto-rolls back. +// +// State transition: Staged → Activated. On failure → Failed. func Activate(args []string) error { opts := parseOpts(args) env := opts.NewBootEnv() + st, err := state.Load(opts.StatePath) + if err != nil { + slog.Warn("state file unreadable, starting fresh", "error", err) + st = state.New() + } + // Get passive slot (the one we want to boot into) passiveSlot, err := env.PassiveSlot() if err != nil { + _ = st.RecordError(opts.StatePath, fmt.Errorf("reading passive slot: %w", err)) return fmt.Errorf("reading passive slot: %w", err) } activeSlot, err := env.ActiveSlot() if err != nil { + _ = st.RecordError(opts.StatePath, fmt.Errorf("reading active slot: %w", err)) return fmt.Errorf("reading active slot: %w", err) } @@ -27,9 +39,14 @@ func Activate(args []string) error { // Set the passive slot as active with fresh boot counter if err := env.ActivateSlot(passiveSlot); err != nil { + _ = st.RecordError(opts.StatePath, fmt.Errorf("activating slot %s: %w", passiveSlot, err)) return fmt.Errorf("activating slot %s: %w", passiveSlot, err) } + if err := st.Transition(opts.StatePath, state.PhaseActivated, "", ""); err != nil { + slog.Warn("state transition failed", "phase", state.PhaseActivated, "error", err) + } + fmt.Printf("Slot %s activated (was %s)\n", passiveSlot, activeSlot) fmt.Println("Boot counter set to 3. Reboot to start the new version.") fmt.Println("The system will automatically roll back if health checks fail 3 times.") diff --git a/update/cmd/apply.go b/update/cmd/apply.go index b1a0b8b..9f19bc4 100644 --- a/update/cmd/apply.go +++ b/update/cmd/apply.go @@ -6,10 +6,14 @@ import ( "github.com/portainer/kubesolo-os/update/pkg/image" "github.com/portainer/kubesolo-os/update/pkg/partition" + "github.com/portainer/kubesolo-os/update/pkg/state" ) // Apply downloads a new OS image and writes it to the passive partition. // It does NOT activate the new partition — use 'activate' for that. +// +// State transitions: Idle/Success/Failed → Checking → Downloading → Staged. +// On any error the state moves to Failed with LastError set. func Apply(args []string) error { opts := parseOpts(args) @@ -17,11 +21,34 @@ func Apply(args []string) error { return fmt.Errorf("--server is required") } + st, err := state.Load(opts.StatePath) + if err != nil { + // Don't block the operation on a corrupt state file. Log + recover. + slog.Warn("state file unreadable, starting fresh", "error", err) + st = state.New() + } + env := opts.NewBootEnv() + // Record the current running version as the "from" reference. The active + // slot's version file is the most reliable source. + activeSlot, slotErr := env.ActiveSlot() + if slotErr == nil { + if partInfo, perr := partition.GetSlotPartition(activeSlot); perr == nil { + mp := "/tmp/kubesolo-active-" + activeSlot + if merr := partition.MountReadOnly(partInfo.Device, mp); merr == nil { + if v, rerr := partition.ReadVersion(mp); rerr == nil { + st.SetFromVersion(v) + } + partition.Unmount(mp) + } + } + } + // Determine passive slot passiveSlot, err := env.PassiveSlot() if err != nil { + _ = st.RecordError(opts.StatePath, fmt.Errorf("reading passive slot: %w", err)) return fmt.Errorf("reading passive slot: %w", err) } @@ -38,36 +65,55 @@ func Apply(args []string) error { slog.Info("signature verification enabled", "pubkey", opts.PubKeyPath) } + if err := st.Transition(opts.StatePath, state.PhaseChecking, "", ""); err != nil { + slog.Warn("state transition failed", "phase", state.PhaseChecking, "error", err) + } + meta, err := client.CheckForUpdate() if err != nil { + _ = st.RecordError(opts.StatePath, fmt.Errorf("checking for update: %w", err)) return fmt.Errorf("checking for update: %w", err) } slog.Info("update available", "version", meta.Version) + // Now we know the target version — record it (resets attempt count if it + // differs from the previous attempt's ToVersion). + if err := st.Transition(opts.StatePath, state.PhaseDownloading, meta.Version, ""); err != nil { + slog.Warn("state transition failed", "phase", state.PhaseDownloading, "error", err) + } + // Download and verify staged, err := client.Download(meta) if err != nil { + _ = st.RecordError(opts.StatePath, fmt.Errorf("downloading update: %w", err)) return fmt.Errorf("downloading update: %w", err) } // Mount passive partition partInfo, err := partition.GetSlotPartition(passiveSlot) if err != nil { + _ = st.RecordError(opts.StatePath, fmt.Errorf("finding passive partition: %w", err)) return fmt.Errorf("finding passive partition: %w", err) } mountPoint := "/tmp/kubesolo-passive-" + passiveSlot if err := partition.MountReadWrite(partInfo.Device, mountPoint); err != nil { + _ = st.RecordError(opts.StatePath, fmt.Errorf("mounting passive partition: %w", err)) return fmt.Errorf("mounting passive partition: %w", err) } defer partition.Unmount(mountPoint) // Write image to passive partition if err := partition.WriteSystemImage(mountPoint, staged.VmlinuzPath, staged.InitramfsPath, staged.Version); err != nil { + _ = st.RecordError(opts.StatePath, fmt.Errorf("writing system image: %w", err)) return fmt.Errorf("writing system image: %w", err) } + if err := st.Transition(opts.StatePath, state.PhaseStaged, staged.Version, ""); err != nil { + slog.Warn("state transition failed", "phase", state.PhaseStaged, "error", err) + } + fmt.Printf("Update v%s written to slot %s (%s)\n", staged.Version, passiveSlot, partInfo.Device) fmt.Println("Run 'kubesolo-update activate' to boot into the new version") diff --git a/update/cmd/healthcheck.go b/update/cmd/healthcheck.go index b9a46af..2ad04a8 100644 --- a/update/cmd/healthcheck.go +++ b/update/cmd/healthcheck.go @@ -6,16 +6,27 @@ import ( "time" "github.com/portainer/kubesolo-os/update/pkg/health" + "github.com/portainer/kubesolo-os/update/pkg/state" ) // Healthcheck performs post-boot health verification. // If all checks pass, it marks the boot as successful in GRUB. // This should be run after every boot (typically via a systemd unit or // init script) to confirm the system is healthy. +// +// State transition: Activated → Verifying → Success on pass, → Failed on fail. +// If state isn't in Activated (e.g. manual run on a long-stable system), the +// state file is left alone — healthcheck still does its job. func Healthcheck(args []string) error { opts := parseOpts(args) env := opts.NewBootEnv() + st, err := state.Load(opts.StatePath) + if err != nil { + slog.Warn("state file unreadable, starting fresh", "error", err) + st = state.New() + } + // Check if already marked successful success, err := env.BootSuccess() if err != nil { @@ -26,6 +37,15 @@ func Healthcheck(args []string) error { return nil } + // Only transition state if we're post-activation. Manual healthcheck on a + // long-stable system shouldn't reset Idle → Verifying. + postActivation := st.Phase == state.PhaseActivated + if postActivation { + if err := st.Transition(opts.StatePath, state.PhaseVerifying, "", ""); err != nil { + slog.Warn("state transition failed", "phase", state.PhaseVerifying, "error", err) + } + } + timeout := time.Duration(opts.TimeoutSecs) * time.Second checker := health.NewChecker("", "", timeout) @@ -38,14 +58,26 @@ func Healthcheck(args []string) error { fmt.Printf(" apiserver: %v\n", status.APIServer) fmt.Printf(" node_ready: %v\n", status.NodeReady) fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot") + if postActivation { + _ = st.RecordError(opts.StatePath, fmt.Errorf("post-boot health check failed: %s", status.Message)) + } return err } // Mark boot as successful if err := env.MarkBootSuccess(); err != nil { + if postActivation { + _ = st.RecordError(opts.StatePath, fmt.Errorf("marking boot success: %w", err)) + } return fmt.Errorf("marking boot success: %w", err) } + if postActivation { + if err := st.Transition(opts.StatePath, state.PhaseSuccess, "", ""); err != nil { + slog.Warn("state transition failed", "phase", state.PhaseSuccess, "error", err) + } + } + fmt.Println("Health check PASSED — boot marked successful") fmt.Printf(" containerd: %v\n", status.Containerd) fmt.Printf(" apiserver: %v\n", status.APIServer) diff --git a/update/cmd/metrics.go b/update/cmd/metrics.go index d8823e5..a571af6 100644 --- a/update/cmd/metrics.go +++ b/update/cmd/metrics.go @@ -5,6 +5,7 @@ import ( "fmt" "github.com/portainer/kubesolo-os/update/pkg/metrics" + "github.com/portainer/kubesolo-os/update/pkg/state" ) // Metrics starts the Prometheus-compatible metrics HTTP server. @@ -12,10 +13,12 @@ func Metrics(args []string) error { fs := flag.NewFlagSet("metrics", flag.ExitOnError) listenAddr := fs.String("listen", ":9100", "Metrics HTTP listen address") grubenvPath := fs.String("grubenv", "/boot/grub/grubenv", "Path to grubenv file") + statePath := fs.String("state", state.DefaultPath, "Path to update state.json") if err := fs.Parse(args); err != nil { return fmt.Errorf("parse flags: %w", err) } srv := metrics.NewServer(*listenAddr, *grubenvPath) + srv.SetStatePath(*statePath) return srv.ListenAndServe() } diff --git a/update/cmd/opts.go b/update/cmd/opts.go index 94b8824..3ac64a3 100644 --- a/update/cmd/opts.go +++ b/update/cmd/opts.go @@ -2,6 +2,7 @@ package cmd import ( "github.com/portainer/kubesolo-os/update/pkg/bootenv" + "github.com/portainer/kubesolo-os/update/pkg/state" ) // opts holds shared command-line options for all subcommands. @@ -12,6 +13,8 @@ type opts struct { PubKeyPath string BootEnvType string // "grub" or "rpi" BootEnvPath string // path for RPi boot control dir + StatePath string // location of state.json (default: state.DefaultPath) + JSON bool // status: emit JSON instead of human-readable } // NewBootEnv creates a BootEnv from the parsed options. @@ -31,10 +34,18 @@ func parseOpts(args []string) opts { GrubenvPath: "/boot/grub/grubenv", TimeoutSecs: 120, BootEnvType: "grub", + StatePath: state.DefaultPath, } for i := 0; i < len(args); i++ { switch args[i] { + case "--state": + if i+1 < len(args) { + o.StatePath = args[i+1] + i++ + } + case "--json": + o.JSON = true case "--server": if i+1 < len(args) { o.ServerURL = args[i+1] diff --git a/update/cmd/rollback.go b/update/cmd/rollback.go index 1c580a1..3d239c4 100644 --- a/update/cmd/rollback.go +++ b/update/cmd/rollback.go @@ -3,14 +3,24 @@ package cmd import ( "fmt" "log/slog" + + "github.com/portainer/kubesolo-os/update/pkg/state" ) // Rollback forces an immediate switch to the other partition. // Use this to manually revert to the previous version. +// +// State transition: any → RolledBack with LastError="manual rollback". func Rollback(args []string) error { opts := parseOpts(args) env := opts.NewBootEnv() + st, err := state.Load(opts.StatePath) + if err != nil { + slog.Warn("state file unreadable, starting fresh", "error", err) + st = state.New() + } + activeSlot, err := env.ActiveSlot() if err != nil { return fmt.Errorf("reading active slot: %w", err) @@ -24,9 +34,14 @@ func Rollback(args []string) error { slog.Info("forcing rollback", "from", activeSlot, "to", passiveSlot) if err := env.ForceRollback(); err != nil { + _ = st.RecordError(opts.StatePath, fmt.Errorf("rollback failed: %w", err)) return fmt.Errorf("rollback failed: %w", err) } + if err := st.Transition(opts.StatePath, state.PhaseRolledBack, "", "manual rollback"); err != nil { + slog.Warn("state transition failed", "phase", state.PhaseRolledBack, "error", err) + } + fmt.Printf("Rolled back: %s → %s\n", activeSlot, passiveSlot) fmt.Println("Reboot to complete rollback.") diff --git a/update/cmd/status.go b/update/cmd/status.go index b74fcb8..8077624 100644 --- a/update/cmd/status.go +++ b/update/cmd/status.go @@ -1,10 +1,26 @@ package cmd import ( + "encoding/json" "fmt" + "os" + + "github.com/portainer/kubesolo-os/update/pkg/state" ) +// statusReport is the JSON-emitted shape of `kubesolo-update status --json`. +// Combines the bootloader-level A/B view with the update-agent state machine. +type statusReport struct { + ActiveSlot string `json:"active_slot"` + PassiveSlot string `json:"passive_slot"` + BootCounter int `json:"boot_counter"` + BootSuccess bool `json:"boot_success"` + State *state.UpdateState `json:"state"` +} + // Status displays the current A/B slot configuration and boot state. +// With --json, emits the full state report to stdout for orchestration +// tooling. func Status(args []string) error { opts := parseOpts(args) env := opts.NewBootEnv() @@ -29,6 +45,23 @@ func Status(args []string) error { return fmt.Errorf("reading boot success: %w", err) } + // State file is non-fatal: present means we have an update lifecycle + // recorded; absent means no update has run yet. + st, _ := state.Load(opts.StatePath) + + if opts.JSON { + report := statusReport{ + ActiveSlot: activeSlot, + PassiveSlot: passiveSlot, + BootCounter: bootCounter, + BootSuccess: bootSuccess, + State: st, + } + enc := json.NewEncoder(os.Stdout) + enc.SetIndent("", " ") + return enc.Encode(report) + } + fmt.Println("KubeSolo OS — A/B Partition Status") fmt.Println("───────────────────────────────────") fmt.Printf(" Active slot: %s\n", activeSlot) @@ -48,5 +81,25 @@ func Status(args []string) error { fmt.Printf("\n ⚠ Boot pending verification (%d attempts remaining)\n", bootCounter) } + if st != nil && st.Phase != state.PhaseIdle { + fmt.Println("\nUpdate Lifecycle") + fmt.Println("───────────────────────────────────") + fmt.Printf(" Phase: %s\n", st.Phase) + if st.FromVersion != "" { + fmt.Printf(" From version: %s\n", st.FromVersion) + } + if st.ToVersion != "" { + fmt.Printf(" To version: %s\n", st.ToVersion) + } + if !st.StartedAt.IsZero() { + fmt.Printf(" Started: %s\n", st.StartedAt.Format("2006-01-02 15:04:05 MST")) + } + fmt.Printf(" Updated: %s\n", st.UpdatedAt.Format("2006-01-02 15:04:05 MST")) + fmt.Printf(" Attempts: %d\n", st.AttemptCount) + if st.LastError != "" { + fmt.Printf(" Last error: %s\n", st.LastError) + } + } + return nil } diff --git a/update/main.go b/update/main.go index b2238b2..09e5267 100644 --- a/update/main.go +++ b/update/main.go @@ -80,13 +80,16 @@ Commands: Options: --server URL Update server URL (default: from /etc/kubesolo/update.conf) --grubenv PATH Path to grubenv file (default: /boot/grub/grubenv) + --state PATH Update state file (default: /var/lib/kubesolo/update/state.json) --timeout SECS Health check timeout in seconds (default: 120) --pubkey PATH Ed25519 public key for signature verification (optional) + --json For 'status': emit JSON instead of human-readable output Examples: kubesolo-update check --server https://updates.example.com kubesolo-update apply --server https://updates.example.com --pubkey /etc/kubesolo/update-pubkey.hex kubesolo-update healthcheck kubesolo-update status + kubesolo-update status --json `) } diff --git a/update/pkg/metrics/metrics.go b/update/pkg/metrics/metrics.go index b99f40d..225c2c8 100644 --- a/update/pkg/metrics/metrics.go +++ b/update/pkg/metrics/metrics.go @@ -11,6 +11,9 @@ // kubesolo_os_update_last_check_timestamp_seconds unix timestamp (gauge) // kubesolo_os_memory_total_bytes total RAM (gauge) // kubesolo_os_memory_available_bytes available RAM (gauge) +// kubesolo_update_phase{phase} 1 for current phase, 0 for others +// kubesolo_update_attempts_total counter — attempts at current ToVersion +// kubesolo_update_last_attempt_timestamp_seconds unix timestamp of last state update // // This is a zero-dependency implementation — no Prometheus client library needed. // It serves metrics in the Prometheus text exposition format. @@ -25,11 +28,14 @@ import ( "strings" "sync" "time" + + "github.com/portainer/kubesolo-os/update/pkg/state" ) // Server is a lightweight Prometheus metrics HTTP server. type Server struct { grubenvPath string + statePath string listenAddr string startTime time.Time @@ -47,6 +53,27 @@ func NewServer(listenAddr, grubenvPath string) *Server { } } +// SetStatePath sets the location of the update state.json file. If empty or +// unset, state-derived metrics are emitted with the Idle defaults. +func (s *Server) SetStatePath(p string) { + s.statePath = p +} + +// allPhases lists every Phase value we emit as a kubesolo_update_phase +// time-series, so consumers see all label values (with value 0 for non-current +// phases). Mirror of validPhases in pkg/state. +var allPhases = []state.Phase{ + state.PhaseIdle, + state.PhaseChecking, + state.PhaseDownloading, + state.PhaseStaged, + state.PhaseActivated, + state.PhaseVerifying, + state.PhaseSuccess, + state.PhaseRolledBack, + state.PhaseFailed, +} + // SetUpdateAvailable records whether an update is available. func (s *Server) SetUpdateAvailable(available bool) { s.mu.Lock() @@ -125,9 +152,49 @@ func (s *Server) handleMetrics(w http.ResponseWriter, r *http.Request) { sb.WriteString("# TYPE kubesolo_os_memory_available_bytes gauge\n") sb.WriteString(fmt.Sprintf("kubesolo_os_memory_available_bytes %d\n", memAvail)) + // Update lifecycle (from state.json) + s.writeUpdateStateMetrics(&sb) + fmt.Fprint(w, sb.String()) } +// writeUpdateStateMetrics appends update-lifecycle metrics derived from the +// state.json file. If the file is missing or unreadable, emits the Idle +// defaults so the metric series exists at all times. +func (s *Server) writeUpdateStateMetrics(sb *strings.Builder) { + current := state.PhaseIdle + var attempts int + var lastTS float64 + + if s.statePath != "" { + if st, err := state.Load(s.statePath); err == nil && st != nil { + current = st.Phase + attempts = st.AttemptCount + if !st.UpdatedAt.IsZero() { + lastTS = float64(st.UpdatedAt.Unix()) + } + } + } + + sb.WriteString("# HELP kubesolo_update_phase Current update lifecycle phase (1 for active, 0 otherwise).\n") + sb.WriteString("# TYPE kubesolo_update_phase gauge\n") + for _, p := range allPhases { + v := 0 + if p == current { + v = 1 + } + sb.WriteString(fmt.Sprintf("kubesolo_update_phase{phase=%q} %d\n", string(p), v)) + } + + sb.WriteString("# HELP kubesolo_update_attempts_total Number of update attempts at the current target version.\n") + sb.WriteString("# TYPE kubesolo_update_attempts_total counter\n") + sb.WriteString(fmt.Sprintf("kubesolo_update_attempts_total %d\n", attempts)) + + sb.WriteString("# HELP kubesolo_update_last_attempt_timestamp_seconds Unix timestamp of the last state transition.\n") + sb.WriteString("# TYPE kubesolo_update_last_attempt_timestamp_seconds gauge\n") + sb.WriteString(fmt.Sprintf("kubesolo_update_last_attempt_timestamp_seconds %.0f\n", lastTS)) +} + // readGrubenvVar reads a single variable from grubenv using simple file parse. func (s *Server) readGrubenvVar(key string) string { data, err := os.ReadFile(s.grubenvPath) diff --git a/update/pkg/metrics/metrics_test.go b/update/pkg/metrics/metrics_test.go index 18bd8d1..c38fa90 100644 --- a/update/pkg/metrics/metrics_test.go +++ b/update/pkg/metrics/metrics_test.go @@ -8,6 +8,8 @@ import ( "path/filepath" "strings" "testing" + + "github.com/portainer/kubesolo-os/update/pkg/state" ) func TestNewServer(t *testing.T) { @@ -247,6 +249,86 @@ func TestSafeInt(t *testing.T) { } } +func TestUpdateStateMetricsAbsentStateFile(t *testing.T) { + // No state path set — should emit Idle defaults so the metric series + // exists from first boot. + s := NewServer(":9100", "/tmp/nonexistent") + + req := httptest.NewRequest(http.MethodGet, "/metrics", nil) + w := httptest.NewRecorder() + s.handleMetrics(w, req) + + body, _ := io.ReadAll(w.Result().Body) + output := string(body) + + if !strings.Contains(output, `kubesolo_update_phase{phase="idle"} 1`) { + t.Errorf("expected idle=1 with no state file, got:\n%s", output) + } + if !strings.Contains(output, `kubesolo_update_phase{phase="checking"} 0`) { + t.Errorf("expected checking=0 with no state file, got:\n%s", output) + } + if !strings.Contains(output, "kubesolo_update_attempts_total 0") { + t.Errorf("expected attempts=0 with no state file, got:\n%s", output) + } +} + +func TestUpdateStateMetricsActivePhase(t *testing.T) { + dir := t.TempDir() + statePath := filepath.Join(dir, "state.json") + + st := state.New() + if err := st.Transition(statePath, state.PhaseDownloading, "v0.3.0", ""); err != nil { + t.Fatalf("seed state: %v", err) + } + + s := NewServer(":9100", "/tmp/nonexistent") + s.SetStatePath(statePath) + + req := httptest.NewRequest(http.MethodGet, "/metrics", nil) + w := httptest.NewRecorder() + s.handleMetrics(w, req) + + body, _ := io.ReadAll(w.Result().Body) + output := string(body) + + if !strings.Contains(output, `kubesolo_update_phase{phase="downloading"} 1`) { + t.Errorf("expected downloading=1, got:\n%s", output) + } + if !strings.Contains(output, `kubesolo_update_phase{phase="idle"} 0`) { + t.Errorf("expected idle=0 when downloading is active, got:\n%s", output) + } + if !strings.Contains(output, "kubesolo_update_attempts_total 1") { + t.Errorf("expected attempts=1 after first Transition, got:\n%s", output) + } + if strings.Contains(output, "kubesolo_update_last_attempt_timestamp_seconds 0\n") { + t.Errorf("expected non-zero timestamp after state write, got:\n%s", output) + } +} + +func TestUpdateStateMetricsAllPhasesEmitted(t *testing.T) { + // Every phase value should appear in the output, so dashboards can graph + // the series cleanly. + s := NewServer(":9100", "/tmp/nonexistent") + + req := httptest.NewRequest(http.MethodGet, "/metrics", nil) + w := httptest.NewRecorder() + s.handleMetrics(w, req) + + body, _ := io.ReadAll(w.Result().Body) + output := string(body) + + for _, p := range []state.Phase{ + state.PhaseIdle, state.PhaseChecking, state.PhaseDownloading, + state.PhaseStaged, state.PhaseActivated, state.PhaseVerifying, + state.PhaseSuccess, state.PhaseRolledBack, state.PhaseFailed, + } { + needle := `kubesolo_update_phase{phase="` + string(p) + `"}` + if !strings.Contains(output, needle) { + t.Errorf("phase %q not present in metrics output", p) + } + } +} + func TestReadFileString(t *testing.T) { dir := t.TempDir() diff --git a/update/pkg/state/state.go b/update/pkg/state/state.go new file mode 100644 index 0000000..f63afdd --- /dev/null +++ b/update/pkg/state/state.go @@ -0,0 +1,200 @@ +// Package state tracks the lifecycle of an OS update on disk. +// +// The state file (default /var/lib/kubesolo/update/state.json) records which +// phase the agent is in, what versions are involved, when the attempt started, +// any error from the last operation, and how many attempts have been made. +// Updates are atomic via tmp+rename, so a crash mid-write doesn't corrupt the +// state. +// +// Consumers: +// - cmd/check, cmd/apply, cmd/activate, cmd/healthcheck, cmd/rollback — +// transition the phase as they enter / leave their operations. +// - cmd/status --json — emits the raw state for orchestration tooling. +// - pkg/metrics — reads the state at scrape time to expose phase and +// attempt-count gauges. +package state + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "time" +) + +// DefaultPath is where state.json lives on a live system. The directory is on +// the persistent data partition so the file survives A/B slot switches. +const DefaultPath = "/var/lib/kubesolo/update/state.json" + +// Phase represents the current step in the update lifecycle. +// +// Terminal phases (Success, RolledBack, Failed) describe the outcome of the +// most recent attempt; transient phases (Checking, Downloading, Staged, +// Activated, Verifying) describe in-progress work. Idle means no update has +// been attempted yet, or the previous attempt has been acknowledged. +type Phase string + +const ( + // PhaseIdle — no update in progress. + PhaseIdle Phase = "idle" + // PhaseChecking — querying the update server for new versions. + PhaseChecking Phase = "checking" + // PhaseDownloading — pulling artifacts from the server. + PhaseDownloading Phase = "downloading" + // PhaseStaged — artifacts written to the passive partition; not yet active. + PhaseStaged Phase = "staged" + // PhaseActivated — passive slot promoted; next boot will use the new version. + PhaseActivated Phase = "activated" + // PhaseVerifying — post-boot healthcheck in progress on the new version. + PhaseVerifying Phase = "verifying" + // PhaseSuccess — last attempt completed and verified. + PhaseSuccess Phase = "success" + // PhaseRolledBack — last attempt failed verification; reverted to prior slot. + PhaseRolledBack Phase = "rolled_back" + // PhaseFailed — last attempt failed before reaching activation (download, + // checksum, signature, etc.). System still on the original slot. + PhaseFailed Phase = "failed" +) + +// validPhases lists every legal Phase value. Anything not in this set is +// rejected by Save() to catch typos. +var validPhases = map[Phase]struct{}{ + PhaseIdle: {}, + PhaseChecking: {}, + PhaseDownloading: {}, + PhaseStaged: {}, + PhaseActivated: {}, + PhaseVerifying: {}, + PhaseSuccess: {}, + PhaseRolledBack: {}, + PhaseFailed: {}, +} + +// UpdateState is the on-disk representation. Fields use JSON tags so the +// file format is forward-compatible (extra fields ignored, missing fields +// default). +type UpdateState struct { + // Phase is the current lifecycle position. + Phase Phase `json:"phase"` + // FromVersion is the version the system was running before the attempt. + // Empty when no attempt has run. + FromVersion string `json:"from_version,omitempty"` + // ToVersion is the version the attempt is targeting. + // Empty when no attempt has run. + ToVersion string `json:"to_version,omitempty"` + // StartedAt is when the current attempt entered a non-Idle phase. + StartedAt time.Time `json:"started_at,omitempty"` + // UpdatedAt is the last time the file was written. Always set on Save(). + UpdatedAt time.Time `json:"updated_at"` + // LastError carries the most recent operation error, populated when + // transitioning to PhaseFailed or PhaseRolledBack. Cleared on Success/Idle. + LastError string `json:"last_error,omitempty"` + // AttemptCount counts attempts at the current ToVersion. Reset when + // ToVersion changes or on successful completion. + AttemptCount int `json:"attempt_count"` +} + +// New returns a fresh Idle state with UpdatedAt set to now. +func New() *UpdateState { + return &UpdateState{ + Phase: PhaseIdle, + UpdatedAt: time.Now().UTC(), + } +} + +// Load reads the state from disk. If the file does not exist, returns a fresh +// Idle state — this is the normal first-run case, not an error. +func Load(path string) (*UpdateState, error) { + data, err := os.ReadFile(path) + if err != nil { + if os.IsNotExist(err) { + return New(), nil + } + return nil, fmt.Errorf("read state %s: %w", path, err) + } + var s UpdateState + if err := json.Unmarshal(data, &s); err != nil { + return nil, fmt.Errorf("parse state %s: %w", path, err) + } + return &s, nil +} + +// Save writes the state to disk atomically (tmp file + rename), so an +// interrupted write never leaves a partial file at `path`. +func (s *UpdateState) Save(path string) error { + if _, ok := validPhases[s.Phase]; !ok { + return fmt.Errorf("invalid phase %q", s.Phase) + } + s.UpdatedAt = time.Now().UTC() + + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return fmt.Errorf("creating state dir: %w", err) + } + + data, err := json.MarshalIndent(s, "", " ") + if err != nil { + return fmt.Errorf("marshal state: %w", err) + } + data = append(data, '\n') + + tmp := path + ".tmp" + if err := os.WriteFile(tmp, data, 0o644); err != nil { + return fmt.Errorf("write tmp state: %w", err) + } + if err := os.Rename(tmp, path); err != nil { + _ = os.Remove(tmp) + return fmt.Errorf("rename state: %w", err) + } + return nil +} + +// Transition moves the state to phase `next` and persists it. If `next` +// targets a new ToVersion (different from the current one), AttemptCount is +// reset to 1; otherwise it is left untouched. StartedAt is set when +// transitioning out of Idle. LastError is cleared unless `next` is Failed or +// RolledBack. +func (s *UpdateState) Transition(path string, next Phase, toVersion, errMsg string) error { + now := time.Now().UTC() + + // Reset attempt counter when targeting a new version. + if toVersion != "" && toVersion != s.ToVersion { + s.ToVersion = toVersion + s.AttemptCount = 0 + } + + // First non-Idle phase of an attempt: record start time and bump count. + if s.Phase == PhaseIdle && next != PhaseIdle { + s.StartedAt = now + s.AttemptCount++ + } + + s.Phase = next + switch next { + case PhaseFailed, PhaseRolledBack: + if errMsg != "" { + s.LastError = errMsg + } + case PhaseSuccess, PhaseIdle: + s.LastError = "" + } + + return s.Save(path) +} + +// RecordError marks the state as failed with the given error and saves. +// Convenience wrapper around Transition for the most common failure path. +func (s *UpdateState) RecordError(path string, err error) error { + msg := "" + if err != nil { + msg = err.Error() + } + return s.Transition(path, PhaseFailed, "", msg) +} + +// SetFromVersion records the version the system was running when an attempt +// started. Idempotent; only takes effect when From is empty. +func (s *UpdateState) SetFromVersion(v string) { + if s.FromVersion == "" { + s.FromVersion = v + } +} diff --git a/update/pkg/state/state_test.go b/update/pkg/state/state_test.go new file mode 100644 index 0000000..91cf996 --- /dev/null +++ b/update/pkg/state/state_test.go @@ -0,0 +1,197 @@ +package state + +import ( + "errors" + "os" + "path/filepath" + "testing" +) + +// statePath returns a per-test state file path inside t.TempDir(). +func statePath(t *testing.T) string { + t.Helper() + return filepath.Join(t.TempDir(), "state.json") +} + +func TestLoadMissingReturnsIdle(t *testing.T) { + s, err := Load(filepath.Join(t.TempDir(), "does-not-exist.json")) + if err != nil { + t.Fatalf("unexpected error loading missing state: %v", err) + } + if s.Phase != PhaseIdle { + t.Errorf("missing file: phase=%q, want %q", s.Phase, PhaseIdle) + } +} + +func TestSaveLoadRoundTrip(t *testing.T) { + path := statePath(t) + in := &UpdateState{ + Phase: PhaseStaged, + FromVersion: "v0.2.0", + ToVersion: "v0.3.0", + AttemptCount: 1, + } + if err := in.Save(path); err != nil { + t.Fatalf("save: %v", err) + } + out, err := Load(path) + if err != nil { + t.Fatalf("load: %v", err) + } + if out.Phase != in.Phase { + t.Errorf("phase: got %q, want %q", out.Phase, in.Phase) + } + if out.FromVersion != in.FromVersion { + t.Errorf("from_version: got %q, want %q", out.FromVersion, in.FromVersion) + } + if out.ToVersion != in.ToVersion { + t.Errorf("to_version: got %q, want %q", out.ToVersion, in.ToVersion) + } + if out.AttemptCount != in.AttemptCount { + t.Errorf("attempt_count: got %d, want %d", out.AttemptCount, in.AttemptCount) + } + if out.UpdatedAt.IsZero() { + t.Error("UpdatedAt should be set by Save") + } +} + +func TestSaveRejectsInvalidPhase(t *testing.T) { + s := &UpdateState{Phase: Phase("bogus")} + err := s.Save(statePath(t)) + if err == nil { + t.Fatal("expected error saving invalid phase, got nil") + } +} + +func TestSaveIsAtomic(t *testing.T) { + // After Save, the .tmp file should NOT exist — confirming we renamed it. + path := statePath(t) + s := New() + if err := s.Save(path); err != nil { + t.Fatalf("save: %v", err) + } + if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) { + t.Errorf("tmp file still present after Save: %v", err) + } +} + +func TestSaveCreatesDirectory(t *testing.T) { + // State directory may not exist yet (first-ever boot). Save() should mkdir. + dir := filepath.Join(t.TempDir(), "fresh", "subdir") + path := filepath.Join(dir, "state.json") + if err := New().Save(path); err != nil { + t.Fatalf("save into nonexistent dir: %v", err) + } + if _, err := os.Stat(path); err != nil { + t.Errorf("state file not present after Save: %v", err) + } +} + +func TestTransitionIdleToChecking(t *testing.T) { + path := statePath(t) + s := New() + if err := s.Transition(path, PhaseChecking, "v0.3.0", ""); err != nil { + t.Fatalf("transition: %v", err) + } + if s.Phase != PhaseChecking { + t.Errorf("phase: got %q, want %q", s.Phase, PhaseChecking) + } + if s.ToVersion != "v0.3.0" { + t.Errorf("to_version: got %q, want v0.3.0", s.ToVersion) + } + if s.AttemptCount != 1 { + t.Errorf("attempt_count: got %d, want 1 (first attempt after Idle)", s.AttemptCount) + } + if s.StartedAt.IsZero() { + t.Error("StartedAt should be set when leaving Idle") + } +} + +func TestTransitionRetainsAttemptCountWithinAttempt(t *testing.T) { + path := statePath(t) + s := New() + _ = s.Transition(path, PhaseChecking, "v0.3.0", "") + _ = s.Transition(path, PhaseDownloading, "v0.3.0", "") + _ = s.Transition(path, PhaseStaged, "v0.3.0", "") + if s.AttemptCount != 1 { + t.Errorf("attempt_count after staying on same version: got %d, want 1", s.AttemptCount) + } +} + +func TestTransitionResetsAttemptCountOnNewVersion(t *testing.T) { + path := statePath(t) + s := New() + _ = s.Transition(path, PhaseChecking, "v0.3.0", "") + // Now an attempt at a NEW version starts. AttemptCount should reset. + _ = s.Transition(path, PhaseChecking, "v0.4.0", "") + if s.ToVersion != "v0.4.0" { + t.Errorf("to_version: got %q, want v0.4.0", s.ToVersion) + } + if s.AttemptCount != 0 { + t.Errorf("attempt_count after new ToVersion: got %d, want 0 (reset)", s.AttemptCount) + } +} + +func TestTransitionFailedRecordsError(t *testing.T) { + path := statePath(t) + s := New() + _ = s.Transition(path, PhaseDownloading, "v0.3.0", "") + _ = s.Transition(path, PhaseFailed, "v0.3.0", "checksum mismatch") + if s.Phase != PhaseFailed { + t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed) + } + if s.LastError != "checksum mismatch" { + t.Errorf("last_error: got %q, want %q", s.LastError, "checksum mismatch") + } +} + +func TestTransitionSuccessClearsError(t *testing.T) { + path := statePath(t) + s := New() + _ = s.Transition(path, PhaseFailed, "v0.3.0", "boom") + if s.LastError == "" { + t.Fatal("setup: LastError should be non-empty before success") + } + _ = s.Transition(path, PhaseSuccess, "v0.3.0", "") + if s.LastError != "" { + t.Errorf("last_error after success: got %q, want empty", s.LastError) + } +} + +func TestRecordError(t *testing.T) { + path := statePath(t) + s := New() + if err := s.RecordError(path, errors.New("network down")); err != nil { + t.Fatalf("RecordError: %v", err) + } + if s.Phase != PhaseFailed { + t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed) + } + if s.LastError != "network down" { + t.Errorf("last_error: got %q, want %q", s.LastError, "network down") + } +} + +func TestSetFromVersionIdempotent(t *testing.T) { + s := New() + s.SetFromVersion("v0.2.0") + if s.FromVersion != "v0.2.0" { + t.Errorf("from_version: got %q, want v0.2.0", s.FromVersion) + } + // Second call should not overwrite. + s.SetFromVersion("v0.1.0") + if s.FromVersion != "v0.2.0" { + t.Errorf("from_version after second SetFromVersion: got %q, want v0.2.0 (immutable)", s.FromVersion) + } +} + +func TestLoadHandlesGarbageFile(t *testing.T) { + path := statePath(t) + if err := os.WriteFile(path, []byte("not json"), 0o644); err != nil { + t.Fatalf("seed: %v", err) + } + _, err := Load(path) + if err == nil { + t.Error("expected error loading garbage, got nil") + } +}