feat(update): persistent state machine + lifecycle metrics
Some checks failed
ARM64 Build / Build generic ARM64 disk image (push) Failing after 4s
CI / Go Tests (push) Successful in 1m31s
CI / Shellcheck (push) Successful in 47s
CI / Build Go Binaries (amd64, linux, linux-amd64) (push) Failing after 10s
CI / Build Go Binaries (arm64, linux, linux-arm64) (push) Failing after 16s

Phase 5 of v0.3. Adds an explicit, on-disk state machine to the update agent
so the lifecycle of an attempt is observable end-to-end, instead of being
inferred from logs and side effects.

New package update/pkg/state:
- Phase enum (idle, checking, downloading, staged, activated, verifying,
  success, rolled_back, failed)
- UpdateState struct persisted to /var/lib/kubesolo/update/state.json
  (overridable via --state). Atomic write (.tmp + rename). Survives reboots
  and slot switches because the file lives on the data partition.
- Transition helper that bumps AttemptCount when an attempt starts, resets
  it when the target version changes, sets/clears LastError on
  failed/success transitions, and stamps StartedAt + UpdatedAt.
- 13 unit tests cover the lifecycle, atomic write, version-change reset,
  error recording, idempotent SetFromVersion, garbage-file handling.

Wired into the existing commands:
- apply.go transitions Idle -> Checking -> Downloading -> Staged, with
  RecordError on any step failure. Reads the active slot's version file to
  populate FromVersion.
- activate.go transitions to Activated.
- healthcheck.go transitions Activated -> Verifying -> Success on pass,
  or to Failed on fail. Skips transitions if state isn't post-activation
  (manual healthcheck on a stable system shouldn't churn the state).
- rollback.go transitions to RolledBack with LastError="manual rollback".
- check.go intentionally untouched — checks are passive queries, not
  attempts; they shouldn't reset AttemptCount.

status.go gains a --json mode that emits the full state report (A/B slots,
boot counter, full UpdateState) for orchestration tooling. Human-readable
mode also prints an Update Lifecycle section when state.phase != idle.

pkg/metrics gains three new series, derived from state.json at scrape time:
- kubesolo_update_phase{phase="..."} — 1 for current, 0 for all others;
  all nine phase values always emitted so dashboards see complete series
- kubesolo_update_attempts_total
- kubesolo_update_last_attempt_timestamp_seconds
Server.SetStatePath() configures the file location; defaults to absent
which emits Idle defaults. Three new tests cover the absent / active /
all-phases-emitted cases.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-14 18:11:47 -06:00
parent 0c6e200585
commit bce565e2f7
12 changed files with 726 additions and 0 deletions

View File

@@ -3,23 +3,35 @@ package cmd
import ( import (
"fmt" "fmt"
"log/slog" "log/slog"
"github.com/portainer/kubesolo-os/update/pkg/state"
) )
// Activate switches the boot target to the passive partition. // Activate switches the boot target to the passive partition.
// After activation, the next reboot will boot from the new partition // After activation, the next reboot will boot from the new partition
// with boot_counter=3. If health checks fail 3 times, GRUB auto-rolls back. // with boot_counter=3. If health checks fail 3 times, GRUB auto-rolls back.
//
// State transition: Staged → Activated. On failure → Failed.
func Activate(args []string) error { func Activate(args []string) error {
opts := parseOpts(args) opts := parseOpts(args)
env := opts.NewBootEnv() env := opts.NewBootEnv()
st, err := state.Load(opts.StatePath)
if err != nil {
slog.Warn("state file unreadable, starting fresh", "error", err)
st = state.New()
}
// Get passive slot (the one we want to boot into) // Get passive slot (the one we want to boot into)
passiveSlot, err := env.PassiveSlot() passiveSlot, err := env.PassiveSlot()
if err != nil { if err != nil {
_ = st.RecordError(opts.StatePath, fmt.Errorf("reading passive slot: %w", err))
return fmt.Errorf("reading passive slot: %w", err) return fmt.Errorf("reading passive slot: %w", err)
} }
activeSlot, err := env.ActiveSlot() activeSlot, err := env.ActiveSlot()
if err != nil { if err != nil {
_ = st.RecordError(opts.StatePath, fmt.Errorf("reading active slot: %w", err))
return fmt.Errorf("reading active slot: %w", err) return fmt.Errorf("reading active slot: %w", err)
} }
@@ -27,9 +39,14 @@ func Activate(args []string) error {
// Set the passive slot as active with fresh boot counter // Set the passive slot as active with fresh boot counter
if err := env.ActivateSlot(passiveSlot); err != nil { if err := env.ActivateSlot(passiveSlot); err != nil {
_ = st.RecordError(opts.StatePath, fmt.Errorf("activating slot %s: %w", passiveSlot, err))
return fmt.Errorf("activating slot %s: %w", passiveSlot, err) return fmt.Errorf("activating slot %s: %w", passiveSlot, err)
} }
if err := st.Transition(opts.StatePath, state.PhaseActivated, "", ""); err != nil {
slog.Warn("state transition failed", "phase", state.PhaseActivated, "error", err)
}
fmt.Printf("Slot %s activated (was %s)\n", passiveSlot, activeSlot) fmt.Printf("Slot %s activated (was %s)\n", passiveSlot, activeSlot)
fmt.Println("Boot counter set to 3. Reboot to start the new version.") fmt.Println("Boot counter set to 3. Reboot to start the new version.")
fmt.Println("The system will automatically roll back if health checks fail 3 times.") fmt.Println("The system will automatically roll back if health checks fail 3 times.")

View File

@@ -6,10 +6,14 @@ import (
"github.com/portainer/kubesolo-os/update/pkg/image" "github.com/portainer/kubesolo-os/update/pkg/image"
"github.com/portainer/kubesolo-os/update/pkg/partition" "github.com/portainer/kubesolo-os/update/pkg/partition"
"github.com/portainer/kubesolo-os/update/pkg/state"
) )
// Apply downloads a new OS image and writes it to the passive partition. // Apply downloads a new OS image and writes it to the passive partition.
// It does NOT activate the new partition — use 'activate' for that. // It does NOT activate the new partition — use 'activate' for that.
//
// State transitions: Idle/Success/Failed → Checking → Downloading → Staged.
// On any error the state moves to Failed with LastError set.
func Apply(args []string) error { func Apply(args []string) error {
opts := parseOpts(args) opts := parseOpts(args)
@@ -17,11 +21,34 @@ func Apply(args []string) error {
return fmt.Errorf("--server is required") return fmt.Errorf("--server is required")
} }
st, err := state.Load(opts.StatePath)
if err != nil {
// Don't block the operation on a corrupt state file. Log + recover.
slog.Warn("state file unreadable, starting fresh", "error", err)
st = state.New()
}
env := opts.NewBootEnv() env := opts.NewBootEnv()
// Record the current running version as the "from" reference. The active
// slot's version file is the most reliable source.
activeSlot, slotErr := env.ActiveSlot()
if slotErr == nil {
if partInfo, perr := partition.GetSlotPartition(activeSlot); perr == nil {
mp := "/tmp/kubesolo-active-" + activeSlot
if merr := partition.MountReadOnly(partInfo.Device, mp); merr == nil {
if v, rerr := partition.ReadVersion(mp); rerr == nil {
st.SetFromVersion(v)
}
partition.Unmount(mp)
}
}
}
// Determine passive slot // Determine passive slot
passiveSlot, err := env.PassiveSlot() passiveSlot, err := env.PassiveSlot()
if err != nil { if err != nil {
_ = st.RecordError(opts.StatePath, fmt.Errorf("reading passive slot: %w", err))
return fmt.Errorf("reading passive slot: %w", err) return fmt.Errorf("reading passive slot: %w", err)
} }
@@ -38,36 +65,55 @@ func Apply(args []string) error {
slog.Info("signature verification enabled", "pubkey", opts.PubKeyPath) slog.Info("signature verification enabled", "pubkey", opts.PubKeyPath)
} }
if err := st.Transition(opts.StatePath, state.PhaseChecking, "", ""); err != nil {
slog.Warn("state transition failed", "phase", state.PhaseChecking, "error", err)
}
meta, err := client.CheckForUpdate() meta, err := client.CheckForUpdate()
if err != nil { if err != nil {
_ = st.RecordError(opts.StatePath, fmt.Errorf("checking for update: %w", err))
return fmt.Errorf("checking for update: %w", err) return fmt.Errorf("checking for update: %w", err)
} }
slog.Info("update available", "version", meta.Version) slog.Info("update available", "version", meta.Version)
// Now we know the target version — record it (resets attempt count if it
// differs from the previous attempt's ToVersion).
if err := st.Transition(opts.StatePath, state.PhaseDownloading, meta.Version, ""); err != nil {
slog.Warn("state transition failed", "phase", state.PhaseDownloading, "error", err)
}
// Download and verify // Download and verify
staged, err := client.Download(meta) staged, err := client.Download(meta)
if err != nil { if err != nil {
_ = st.RecordError(opts.StatePath, fmt.Errorf("downloading update: %w", err))
return fmt.Errorf("downloading update: %w", err) return fmt.Errorf("downloading update: %w", err)
} }
// Mount passive partition // Mount passive partition
partInfo, err := partition.GetSlotPartition(passiveSlot) partInfo, err := partition.GetSlotPartition(passiveSlot)
if err != nil { if err != nil {
_ = st.RecordError(opts.StatePath, fmt.Errorf("finding passive partition: %w", err))
return fmt.Errorf("finding passive partition: %w", err) return fmt.Errorf("finding passive partition: %w", err)
} }
mountPoint := "/tmp/kubesolo-passive-" + passiveSlot mountPoint := "/tmp/kubesolo-passive-" + passiveSlot
if err := partition.MountReadWrite(partInfo.Device, mountPoint); err != nil { if err := partition.MountReadWrite(partInfo.Device, mountPoint); err != nil {
_ = st.RecordError(opts.StatePath, fmt.Errorf("mounting passive partition: %w", err))
return fmt.Errorf("mounting passive partition: %w", err) return fmt.Errorf("mounting passive partition: %w", err)
} }
defer partition.Unmount(mountPoint) defer partition.Unmount(mountPoint)
// Write image to passive partition // Write image to passive partition
if err := partition.WriteSystemImage(mountPoint, staged.VmlinuzPath, staged.InitramfsPath, staged.Version); err != nil { if err := partition.WriteSystemImage(mountPoint, staged.VmlinuzPath, staged.InitramfsPath, staged.Version); err != nil {
_ = st.RecordError(opts.StatePath, fmt.Errorf("writing system image: %w", err))
return fmt.Errorf("writing system image: %w", err) return fmt.Errorf("writing system image: %w", err)
} }
if err := st.Transition(opts.StatePath, state.PhaseStaged, staged.Version, ""); err != nil {
slog.Warn("state transition failed", "phase", state.PhaseStaged, "error", err)
}
fmt.Printf("Update v%s written to slot %s (%s)\n", staged.Version, passiveSlot, partInfo.Device) fmt.Printf("Update v%s written to slot %s (%s)\n", staged.Version, passiveSlot, partInfo.Device)
fmt.Println("Run 'kubesolo-update activate' to boot into the new version") fmt.Println("Run 'kubesolo-update activate' to boot into the new version")

View File

@@ -6,16 +6,27 @@ import (
"time" "time"
"github.com/portainer/kubesolo-os/update/pkg/health" "github.com/portainer/kubesolo-os/update/pkg/health"
"github.com/portainer/kubesolo-os/update/pkg/state"
) )
// Healthcheck performs post-boot health verification. // Healthcheck performs post-boot health verification.
// If all checks pass, it marks the boot as successful in GRUB. // If all checks pass, it marks the boot as successful in GRUB.
// This should be run after every boot (typically via a systemd unit or // This should be run after every boot (typically via a systemd unit or
// init script) to confirm the system is healthy. // init script) to confirm the system is healthy.
//
// State transition: Activated → Verifying → Success on pass, → Failed on fail.
// If state isn't in Activated (e.g. manual run on a long-stable system), the
// state file is left alone — healthcheck still does its job.
func Healthcheck(args []string) error { func Healthcheck(args []string) error {
opts := parseOpts(args) opts := parseOpts(args)
env := opts.NewBootEnv() env := opts.NewBootEnv()
st, err := state.Load(opts.StatePath)
if err != nil {
slog.Warn("state file unreadable, starting fresh", "error", err)
st = state.New()
}
// Check if already marked successful // Check if already marked successful
success, err := env.BootSuccess() success, err := env.BootSuccess()
if err != nil { if err != nil {
@@ -26,6 +37,15 @@ func Healthcheck(args []string) error {
return nil return nil
} }
// Only transition state if we're post-activation. Manual healthcheck on a
// long-stable system shouldn't reset Idle → Verifying.
postActivation := st.Phase == state.PhaseActivated
if postActivation {
if err := st.Transition(opts.StatePath, state.PhaseVerifying, "", ""); err != nil {
slog.Warn("state transition failed", "phase", state.PhaseVerifying, "error", err)
}
}
timeout := time.Duration(opts.TimeoutSecs) * time.Second timeout := time.Duration(opts.TimeoutSecs) * time.Second
checker := health.NewChecker("", "", timeout) checker := health.NewChecker("", "", timeout)
@@ -38,14 +58,26 @@ func Healthcheck(args []string) error {
fmt.Printf(" apiserver: %v\n", status.APIServer) fmt.Printf(" apiserver: %v\n", status.APIServer)
fmt.Printf(" node_ready: %v\n", status.NodeReady) fmt.Printf(" node_ready: %v\n", status.NodeReady)
fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot") fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot")
if postActivation {
_ = st.RecordError(opts.StatePath, fmt.Errorf("post-boot health check failed: %s", status.Message))
}
return err return err
} }
// Mark boot as successful // Mark boot as successful
if err := env.MarkBootSuccess(); err != nil { if err := env.MarkBootSuccess(); err != nil {
if postActivation {
_ = st.RecordError(opts.StatePath, fmt.Errorf("marking boot success: %w", err))
}
return fmt.Errorf("marking boot success: %w", err) return fmt.Errorf("marking boot success: %w", err)
} }
if postActivation {
if err := st.Transition(opts.StatePath, state.PhaseSuccess, "", ""); err != nil {
slog.Warn("state transition failed", "phase", state.PhaseSuccess, "error", err)
}
}
fmt.Println("Health check PASSED — boot marked successful") fmt.Println("Health check PASSED — boot marked successful")
fmt.Printf(" containerd: %v\n", status.Containerd) fmt.Printf(" containerd: %v\n", status.Containerd)
fmt.Printf(" apiserver: %v\n", status.APIServer) fmt.Printf(" apiserver: %v\n", status.APIServer)

View File

@@ -5,6 +5,7 @@ import (
"fmt" "fmt"
"github.com/portainer/kubesolo-os/update/pkg/metrics" "github.com/portainer/kubesolo-os/update/pkg/metrics"
"github.com/portainer/kubesolo-os/update/pkg/state"
) )
// Metrics starts the Prometheus-compatible metrics HTTP server. // Metrics starts the Prometheus-compatible metrics HTTP server.
@@ -12,10 +13,12 @@ func Metrics(args []string) error {
fs := flag.NewFlagSet("metrics", flag.ExitOnError) fs := flag.NewFlagSet("metrics", flag.ExitOnError)
listenAddr := fs.String("listen", ":9100", "Metrics HTTP listen address") listenAddr := fs.String("listen", ":9100", "Metrics HTTP listen address")
grubenvPath := fs.String("grubenv", "/boot/grub/grubenv", "Path to grubenv file") grubenvPath := fs.String("grubenv", "/boot/grub/grubenv", "Path to grubenv file")
statePath := fs.String("state", state.DefaultPath, "Path to update state.json")
if err := fs.Parse(args); err != nil { if err := fs.Parse(args); err != nil {
return fmt.Errorf("parse flags: %w", err) return fmt.Errorf("parse flags: %w", err)
} }
srv := metrics.NewServer(*listenAddr, *grubenvPath) srv := metrics.NewServer(*listenAddr, *grubenvPath)
srv.SetStatePath(*statePath)
return srv.ListenAndServe() return srv.ListenAndServe()
} }

View File

@@ -2,6 +2,7 @@ package cmd
import ( import (
"github.com/portainer/kubesolo-os/update/pkg/bootenv" "github.com/portainer/kubesolo-os/update/pkg/bootenv"
"github.com/portainer/kubesolo-os/update/pkg/state"
) )
// opts holds shared command-line options for all subcommands. // opts holds shared command-line options for all subcommands.
@@ -12,6 +13,8 @@ type opts struct {
PubKeyPath string PubKeyPath string
BootEnvType string // "grub" or "rpi" BootEnvType string // "grub" or "rpi"
BootEnvPath string // path for RPi boot control dir BootEnvPath string // path for RPi boot control dir
StatePath string // location of state.json (default: state.DefaultPath)
JSON bool // status: emit JSON instead of human-readable
} }
// NewBootEnv creates a BootEnv from the parsed options. // NewBootEnv creates a BootEnv from the parsed options.
@@ -31,10 +34,18 @@ func parseOpts(args []string) opts {
GrubenvPath: "/boot/grub/grubenv", GrubenvPath: "/boot/grub/grubenv",
TimeoutSecs: 120, TimeoutSecs: 120,
BootEnvType: "grub", BootEnvType: "grub",
StatePath: state.DefaultPath,
} }
for i := 0; i < len(args); i++ { for i := 0; i < len(args); i++ {
switch args[i] { switch args[i] {
case "--state":
if i+1 < len(args) {
o.StatePath = args[i+1]
i++
}
case "--json":
o.JSON = true
case "--server": case "--server":
if i+1 < len(args) { if i+1 < len(args) {
o.ServerURL = args[i+1] o.ServerURL = args[i+1]

View File

@@ -3,14 +3,24 @@ package cmd
import ( import (
"fmt" "fmt"
"log/slog" "log/slog"
"github.com/portainer/kubesolo-os/update/pkg/state"
) )
// Rollback forces an immediate switch to the other partition. // Rollback forces an immediate switch to the other partition.
// Use this to manually revert to the previous version. // Use this to manually revert to the previous version.
//
// State transition: any → RolledBack with LastError="manual rollback".
func Rollback(args []string) error { func Rollback(args []string) error {
opts := parseOpts(args) opts := parseOpts(args)
env := opts.NewBootEnv() env := opts.NewBootEnv()
st, err := state.Load(opts.StatePath)
if err != nil {
slog.Warn("state file unreadable, starting fresh", "error", err)
st = state.New()
}
activeSlot, err := env.ActiveSlot() activeSlot, err := env.ActiveSlot()
if err != nil { if err != nil {
return fmt.Errorf("reading active slot: %w", err) return fmt.Errorf("reading active slot: %w", err)
@@ -24,9 +34,14 @@ func Rollback(args []string) error {
slog.Info("forcing rollback", "from", activeSlot, "to", passiveSlot) slog.Info("forcing rollback", "from", activeSlot, "to", passiveSlot)
if err := env.ForceRollback(); err != nil { if err := env.ForceRollback(); err != nil {
_ = st.RecordError(opts.StatePath, fmt.Errorf("rollback failed: %w", err))
return fmt.Errorf("rollback failed: %w", err) return fmt.Errorf("rollback failed: %w", err)
} }
if err := st.Transition(opts.StatePath, state.PhaseRolledBack, "", "manual rollback"); err != nil {
slog.Warn("state transition failed", "phase", state.PhaseRolledBack, "error", err)
}
fmt.Printf("Rolled back: %s → %s\n", activeSlot, passiveSlot) fmt.Printf("Rolled back: %s → %s\n", activeSlot, passiveSlot)
fmt.Println("Reboot to complete rollback.") fmt.Println("Reboot to complete rollback.")

View File

@@ -1,10 +1,26 @@
package cmd package cmd
import ( import (
"encoding/json"
"fmt" "fmt"
"os"
"github.com/portainer/kubesolo-os/update/pkg/state"
) )
// statusReport is the JSON-emitted shape of `kubesolo-update status --json`.
// Combines the bootloader-level A/B view with the update-agent state machine.
type statusReport struct {
ActiveSlot string `json:"active_slot"`
PassiveSlot string `json:"passive_slot"`
BootCounter int `json:"boot_counter"`
BootSuccess bool `json:"boot_success"`
State *state.UpdateState `json:"state"`
}
// Status displays the current A/B slot configuration and boot state. // Status displays the current A/B slot configuration and boot state.
// With --json, emits the full state report to stdout for orchestration
// tooling.
func Status(args []string) error { func Status(args []string) error {
opts := parseOpts(args) opts := parseOpts(args)
env := opts.NewBootEnv() env := opts.NewBootEnv()
@@ -29,6 +45,23 @@ func Status(args []string) error {
return fmt.Errorf("reading boot success: %w", err) return fmt.Errorf("reading boot success: %w", err)
} }
// State file is non-fatal: present means we have an update lifecycle
// recorded; absent means no update has run yet.
st, _ := state.Load(opts.StatePath)
if opts.JSON {
report := statusReport{
ActiveSlot: activeSlot,
PassiveSlot: passiveSlot,
BootCounter: bootCounter,
BootSuccess: bootSuccess,
State: st,
}
enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
return enc.Encode(report)
}
fmt.Println("KubeSolo OS — A/B Partition Status") fmt.Println("KubeSolo OS — A/B Partition Status")
fmt.Println("───────────────────────────────────") fmt.Println("───────────────────────────────────")
fmt.Printf(" Active slot: %s\n", activeSlot) fmt.Printf(" Active slot: %s\n", activeSlot)
@@ -48,5 +81,25 @@ func Status(args []string) error {
fmt.Printf("\n ⚠ Boot pending verification (%d attempts remaining)\n", bootCounter) fmt.Printf("\n ⚠ Boot pending verification (%d attempts remaining)\n", bootCounter)
} }
if st != nil && st.Phase != state.PhaseIdle {
fmt.Println("\nUpdate Lifecycle")
fmt.Println("───────────────────────────────────")
fmt.Printf(" Phase: %s\n", st.Phase)
if st.FromVersion != "" {
fmt.Printf(" From version: %s\n", st.FromVersion)
}
if st.ToVersion != "" {
fmt.Printf(" To version: %s\n", st.ToVersion)
}
if !st.StartedAt.IsZero() {
fmt.Printf(" Started: %s\n", st.StartedAt.Format("2006-01-02 15:04:05 MST"))
}
fmt.Printf(" Updated: %s\n", st.UpdatedAt.Format("2006-01-02 15:04:05 MST"))
fmt.Printf(" Attempts: %d\n", st.AttemptCount)
if st.LastError != "" {
fmt.Printf(" Last error: %s\n", st.LastError)
}
}
return nil return nil
} }

View File

@@ -80,13 +80,16 @@ Commands:
Options: Options:
--server URL Update server URL (default: from /etc/kubesolo/update.conf) --server URL Update server URL (default: from /etc/kubesolo/update.conf)
--grubenv PATH Path to grubenv file (default: /boot/grub/grubenv) --grubenv PATH Path to grubenv file (default: /boot/grub/grubenv)
--state PATH Update state file (default: /var/lib/kubesolo/update/state.json)
--timeout SECS Health check timeout in seconds (default: 120) --timeout SECS Health check timeout in seconds (default: 120)
--pubkey PATH Ed25519 public key for signature verification (optional) --pubkey PATH Ed25519 public key for signature verification (optional)
--json For 'status': emit JSON instead of human-readable output
Examples: Examples:
kubesolo-update check --server https://updates.example.com kubesolo-update check --server https://updates.example.com
kubesolo-update apply --server https://updates.example.com --pubkey /etc/kubesolo/update-pubkey.hex kubesolo-update apply --server https://updates.example.com --pubkey /etc/kubesolo/update-pubkey.hex
kubesolo-update healthcheck kubesolo-update healthcheck
kubesolo-update status kubesolo-update status
kubesolo-update status --json
`) `)
} }

View File

@@ -11,6 +11,9 @@
// kubesolo_os_update_last_check_timestamp_seconds unix timestamp (gauge) // kubesolo_os_update_last_check_timestamp_seconds unix timestamp (gauge)
// kubesolo_os_memory_total_bytes total RAM (gauge) // kubesolo_os_memory_total_bytes total RAM (gauge)
// kubesolo_os_memory_available_bytes available RAM (gauge) // kubesolo_os_memory_available_bytes available RAM (gauge)
// kubesolo_update_phase{phase} 1 for current phase, 0 for others
// kubesolo_update_attempts_total counter — attempts at current ToVersion
// kubesolo_update_last_attempt_timestamp_seconds unix timestamp of last state update
// //
// This is a zero-dependency implementation — no Prometheus client library needed. // This is a zero-dependency implementation — no Prometheus client library needed.
// It serves metrics in the Prometheus text exposition format. // It serves metrics in the Prometheus text exposition format.
@@ -25,11 +28,14 @@ import (
"strings" "strings"
"sync" "sync"
"time" "time"
"github.com/portainer/kubesolo-os/update/pkg/state"
) )
// Server is a lightweight Prometheus metrics HTTP server. // Server is a lightweight Prometheus metrics HTTP server.
type Server struct { type Server struct {
grubenvPath string grubenvPath string
statePath string
listenAddr string listenAddr string
startTime time.Time startTime time.Time
@@ -47,6 +53,27 @@ func NewServer(listenAddr, grubenvPath string) *Server {
} }
} }
// SetStatePath sets the location of the update state.json file. If empty or
// unset, state-derived metrics are emitted with the Idle defaults.
func (s *Server) SetStatePath(p string) {
s.statePath = p
}
// allPhases lists every Phase value we emit as a kubesolo_update_phase
// time-series, so consumers see all label values (with value 0 for non-current
// phases). Mirror of validPhases in pkg/state.
var allPhases = []state.Phase{
state.PhaseIdle,
state.PhaseChecking,
state.PhaseDownloading,
state.PhaseStaged,
state.PhaseActivated,
state.PhaseVerifying,
state.PhaseSuccess,
state.PhaseRolledBack,
state.PhaseFailed,
}
// SetUpdateAvailable records whether an update is available. // SetUpdateAvailable records whether an update is available.
func (s *Server) SetUpdateAvailable(available bool) { func (s *Server) SetUpdateAvailable(available bool) {
s.mu.Lock() s.mu.Lock()
@@ -125,9 +152,49 @@ func (s *Server) handleMetrics(w http.ResponseWriter, r *http.Request) {
sb.WriteString("# TYPE kubesolo_os_memory_available_bytes gauge\n") sb.WriteString("# TYPE kubesolo_os_memory_available_bytes gauge\n")
sb.WriteString(fmt.Sprintf("kubesolo_os_memory_available_bytes %d\n", memAvail)) sb.WriteString(fmt.Sprintf("kubesolo_os_memory_available_bytes %d\n", memAvail))
// Update lifecycle (from state.json)
s.writeUpdateStateMetrics(&sb)
fmt.Fprint(w, sb.String()) fmt.Fprint(w, sb.String())
} }
// writeUpdateStateMetrics appends update-lifecycle metrics derived from the
// state.json file. If the file is missing or unreadable, emits the Idle
// defaults so the metric series exists at all times.
func (s *Server) writeUpdateStateMetrics(sb *strings.Builder) {
current := state.PhaseIdle
var attempts int
var lastTS float64
if s.statePath != "" {
if st, err := state.Load(s.statePath); err == nil && st != nil {
current = st.Phase
attempts = st.AttemptCount
if !st.UpdatedAt.IsZero() {
lastTS = float64(st.UpdatedAt.Unix())
}
}
}
sb.WriteString("# HELP kubesolo_update_phase Current update lifecycle phase (1 for active, 0 otherwise).\n")
sb.WriteString("# TYPE kubesolo_update_phase gauge\n")
for _, p := range allPhases {
v := 0
if p == current {
v = 1
}
sb.WriteString(fmt.Sprintf("kubesolo_update_phase{phase=%q} %d\n", string(p), v))
}
sb.WriteString("# HELP kubesolo_update_attempts_total Number of update attempts at the current target version.\n")
sb.WriteString("# TYPE kubesolo_update_attempts_total counter\n")
sb.WriteString(fmt.Sprintf("kubesolo_update_attempts_total %d\n", attempts))
sb.WriteString("# HELP kubesolo_update_last_attempt_timestamp_seconds Unix timestamp of the last state transition.\n")
sb.WriteString("# TYPE kubesolo_update_last_attempt_timestamp_seconds gauge\n")
sb.WriteString(fmt.Sprintf("kubesolo_update_last_attempt_timestamp_seconds %.0f\n", lastTS))
}
// readGrubenvVar reads a single variable from grubenv using simple file parse. // readGrubenvVar reads a single variable from grubenv using simple file parse.
func (s *Server) readGrubenvVar(key string) string { func (s *Server) readGrubenvVar(key string) string {
data, err := os.ReadFile(s.grubenvPath) data, err := os.ReadFile(s.grubenvPath)

View File

@@ -8,6 +8,8 @@ import (
"path/filepath" "path/filepath"
"strings" "strings"
"testing" "testing"
"github.com/portainer/kubesolo-os/update/pkg/state"
) )
func TestNewServer(t *testing.T) { func TestNewServer(t *testing.T) {
@@ -247,6 +249,86 @@ func TestSafeInt(t *testing.T) {
} }
} }
func TestUpdateStateMetricsAbsentStateFile(t *testing.T) {
// No state path set — should emit Idle defaults so the metric series
// exists from first boot.
s := NewServer(":9100", "/tmp/nonexistent")
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
w := httptest.NewRecorder()
s.handleMetrics(w, req)
body, _ := io.ReadAll(w.Result().Body)
output := string(body)
if !strings.Contains(output, `kubesolo_update_phase{phase="idle"} 1`) {
t.Errorf("expected idle=1 with no state file, got:\n%s", output)
}
if !strings.Contains(output, `kubesolo_update_phase{phase="checking"} 0`) {
t.Errorf("expected checking=0 with no state file, got:\n%s", output)
}
if !strings.Contains(output, "kubesolo_update_attempts_total 0") {
t.Errorf("expected attempts=0 with no state file, got:\n%s", output)
}
}
func TestUpdateStateMetricsActivePhase(t *testing.T) {
dir := t.TempDir()
statePath := filepath.Join(dir, "state.json")
st := state.New()
if err := st.Transition(statePath, state.PhaseDownloading, "v0.3.0", ""); err != nil {
t.Fatalf("seed state: %v", err)
}
s := NewServer(":9100", "/tmp/nonexistent")
s.SetStatePath(statePath)
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
w := httptest.NewRecorder()
s.handleMetrics(w, req)
body, _ := io.ReadAll(w.Result().Body)
output := string(body)
if !strings.Contains(output, `kubesolo_update_phase{phase="downloading"} 1`) {
t.Errorf("expected downloading=1, got:\n%s", output)
}
if !strings.Contains(output, `kubesolo_update_phase{phase="idle"} 0`) {
t.Errorf("expected idle=0 when downloading is active, got:\n%s", output)
}
if !strings.Contains(output, "kubesolo_update_attempts_total 1") {
t.Errorf("expected attempts=1 after first Transition, got:\n%s", output)
}
if strings.Contains(output, "kubesolo_update_last_attempt_timestamp_seconds 0\n") {
t.Errorf("expected non-zero timestamp after state write, got:\n%s", output)
}
}
func TestUpdateStateMetricsAllPhasesEmitted(t *testing.T) {
// Every phase value should appear in the output, so dashboards can graph
// the series cleanly.
s := NewServer(":9100", "/tmp/nonexistent")
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
w := httptest.NewRecorder()
s.handleMetrics(w, req)
body, _ := io.ReadAll(w.Result().Body)
output := string(body)
for _, p := range []state.Phase{
state.PhaseIdle, state.PhaseChecking, state.PhaseDownloading,
state.PhaseStaged, state.PhaseActivated, state.PhaseVerifying,
state.PhaseSuccess, state.PhaseRolledBack, state.PhaseFailed,
} {
needle := `kubesolo_update_phase{phase="` + string(p) + `"}`
if !strings.Contains(output, needle) {
t.Errorf("phase %q not present in metrics output", p)
}
}
}
func TestReadFileString(t *testing.T) { func TestReadFileString(t *testing.T) {
dir := t.TempDir() dir := t.TempDir()

200
update/pkg/state/state.go Normal file
View File

@@ -0,0 +1,200 @@
// Package state tracks the lifecycle of an OS update on disk.
//
// The state file (default /var/lib/kubesolo/update/state.json) records which
// phase the agent is in, what versions are involved, when the attempt started,
// any error from the last operation, and how many attempts have been made.
// Updates are atomic via tmp+rename, so a crash mid-write doesn't corrupt the
// state.
//
// Consumers:
// - cmd/check, cmd/apply, cmd/activate, cmd/healthcheck, cmd/rollback —
// transition the phase as they enter / leave their operations.
// - cmd/status --json — emits the raw state for orchestration tooling.
// - pkg/metrics — reads the state at scrape time to expose phase and
// attempt-count gauges.
package state
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"time"
)
// DefaultPath is where state.json lives on a live system. The directory is on
// the persistent data partition so the file survives A/B slot switches.
const DefaultPath = "/var/lib/kubesolo/update/state.json"
// Phase represents the current step in the update lifecycle.
//
// Terminal phases (Success, RolledBack, Failed) describe the outcome of the
// most recent attempt; transient phases (Checking, Downloading, Staged,
// Activated, Verifying) describe in-progress work. Idle means no update has
// been attempted yet, or the previous attempt has been acknowledged.
type Phase string
const (
// PhaseIdle — no update in progress.
PhaseIdle Phase = "idle"
// PhaseChecking — querying the update server for new versions.
PhaseChecking Phase = "checking"
// PhaseDownloading — pulling artifacts from the server.
PhaseDownloading Phase = "downloading"
// PhaseStaged — artifacts written to the passive partition; not yet active.
PhaseStaged Phase = "staged"
// PhaseActivated — passive slot promoted; next boot will use the new version.
PhaseActivated Phase = "activated"
// PhaseVerifying — post-boot healthcheck in progress on the new version.
PhaseVerifying Phase = "verifying"
// PhaseSuccess — last attempt completed and verified.
PhaseSuccess Phase = "success"
// PhaseRolledBack — last attempt failed verification; reverted to prior slot.
PhaseRolledBack Phase = "rolled_back"
// PhaseFailed — last attempt failed before reaching activation (download,
// checksum, signature, etc.). System still on the original slot.
PhaseFailed Phase = "failed"
)
// validPhases lists every legal Phase value. Anything not in this set is
// rejected by Save() to catch typos.
var validPhases = map[Phase]struct{}{
PhaseIdle: {},
PhaseChecking: {},
PhaseDownloading: {},
PhaseStaged: {},
PhaseActivated: {},
PhaseVerifying: {},
PhaseSuccess: {},
PhaseRolledBack: {},
PhaseFailed: {},
}
// UpdateState is the on-disk representation. Fields use JSON tags so the
// file format is forward-compatible (extra fields ignored, missing fields
// default).
type UpdateState struct {
// Phase is the current lifecycle position.
Phase Phase `json:"phase"`
// FromVersion is the version the system was running before the attempt.
// Empty when no attempt has run.
FromVersion string `json:"from_version,omitempty"`
// ToVersion is the version the attempt is targeting.
// Empty when no attempt has run.
ToVersion string `json:"to_version,omitempty"`
// StartedAt is when the current attempt entered a non-Idle phase.
StartedAt time.Time `json:"started_at,omitempty"`
// UpdatedAt is the last time the file was written. Always set on Save().
UpdatedAt time.Time `json:"updated_at"`
// LastError carries the most recent operation error, populated when
// transitioning to PhaseFailed or PhaseRolledBack. Cleared on Success/Idle.
LastError string `json:"last_error,omitempty"`
// AttemptCount counts attempts at the current ToVersion. Reset when
// ToVersion changes or on successful completion.
AttemptCount int `json:"attempt_count"`
}
// New returns a fresh Idle state with UpdatedAt set to now.
func New() *UpdateState {
return &UpdateState{
Phase: PhaseIdle,
UpdatedAt: time.Now().UTC(),
}
}
// Load reads the state from disk. If the file does not exist, returns a fresh
// Idle state — this is the normal first-run case, not an error.
func Load(path string) (*UpdateState, error) {
data, err := os.ReadFile(path)
if err != nil {
if os.IsNotExist(err) {
return New(), nil
}
return nil, fmt.Errorf("read state %s: %w", path, err)
}
var s UpdateState
if err := json.Unmarshal(data, &s); err != nil {
return nil, fmt.Errorf("parse state %s: %w", path, err)
}
return &s, nil
}
// Save writes the state to disk atomically (tmp file + rename), so an
// interrupted write never leaves a partial file at `path`.
func (s *UpdateState) Save(path string) error {
if _, ok := validPhases[s.Phase]; !ok {
return fmt.Errorf("invalid phase %q", s.Phase)
}
s.UpdatedAt = time.Now().UTC()
if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
return fmt.Errorf("creating state dir: %w", err)
}
data, err := json.MarshalIndent(s, "", " ")
if err != nil {
return fmt.Errorf("marshal state: %w", err)
}
data = append(data, '\n')
tmp := path + ".tmp"
if err := os.WriteFile(tmp, data, 0o644); err != nil {
return fmt.Errorf("write tmp state: %w", err)
}
if err := os.Rename(tmp, path); err != nil {
_ = os.Remove(tmp)
return fmt.Errorf("rename state: %w", err)
}
return nil
}
// Transition moves the state to phase `next` and persists it. If `next`
// targets a new ToVersion (different from the current one), AttemptCount is
// reset to 1; otherwise it is left untouched. StartedAt is set when
// transitioning out of Idle. LastError is cleared unless `next` is Failed or
// RolledBack.
func (s *UpdateState) Transition(path string, next Phase, toVersion, errMsg string) error {
now := time.Now().UTC()
// Reset attempt counter when targeting a new version.
if toVersion != "" && toVersion != s.ToVersion {
s.ToVersion = toVersion
s.AttemptCount = 0
}
// First non-Idle phase of an attempt: record start time and bump count.
if s.Phase == PhaseIdle && next != PhaseIdle {
s.StartedAt = now
s.AttemptCount++
}
s.Phase = next
switch next {
case PhaseFailed, PhaseRolledBack:
if errMsg != "" {
s.LastError = errMsg
}
case PhaseSuccess, PhaseIdle:
s.LastError = ""
}
return s.Save(path)
}
// RecordError marks the state as failed with the given error and saves.
// Convenience wrapper around Transition for the most common failure path.
func (s *UpdateState) RecordError(path string, err error) error {
msg := ""
if err != nil {
msg = err.Error()
}
return s.Transition(path, PhaseFailed, "", msg)
}
// SetFromVersion records the version the system was running when an attempt
// started. Idempotent; only takes effect when From is empty.
func (s *UpdateState) SetFromVersion(v string) {
if s.FromVersion == "" {
s.FromVersion = v
}
}

View File

@@ -0,0 +1,197 @@
package state
import (
"errors"
"os"
"path/filepath"
"testing"
)
// statePath returns a per-test state file path inside t.TempDir().
func statePath(t *testing.T) string {
t.Helper()
return filepath.Join(t.TempDir(), "state.json")
}
func TestLoadMissingReturnsIdle(t *testing.T) {
s, err := Load(filepath.Join(t.TempDir(), "does-not-exist.json"))
if err != nil {
t.Fatalf("unexpected error loading missing state: %v", err)
}
if s.Phase != PhaseIdle {
t.Errorf("missing file: phase=%q, want %q", s.Phase, PhaseIdle)
}
}
func TestSaveLoadRoundTrip(t *testing.T) {
path := statePath(t)
in := &UpdateState{
Phase: PhaseStaged,
FromVersion: "v0.2.0",
ToVersion: "v0.3.0",
AttemptCount: 1,
}
if err := in.Save(path); err != nil {
t.Fatalf("save: %v", err)
}
out, err := Load(path)
if err != nil {
t.Fatalf("load: %v", err)
}
if out.Phase != in.Phase {
t.Errorf("phase: got %q, want %q", out.Phase, in.Phase)
}
if out.FromVersion != in.FromVersion {
t.Errorf("from_version: got %q, want %q", out.FromVersion, in.FromVersion)
}
if out.ToVersion != in.ToVersion {
t.Errorf("to_version: got %q, want %q", out.ToVersion, in.ToVersion)
}
if out.AttemptCount != in.AttemptCount {
t.Errorf("attempt_count: got %d, want %d", out.AttemptCount, in.AttemptCount)
}
if out.UpdatedAt.IsZero() {
t.Error("UpdatedAt should be set by Save")
}
}
func TestSaveRejectsInvalidPhase(t *testing.T) {
s := &UpdateState{Phase: Phase("bogus")}
err := s.Save(statePath(t))
if err == nil {
t.Fatal("expected error saving invalid phase, got nil")
}
}
func TestSaveIsAtomic(t *testing.T) {
// After Save, the .tmp file should NOT exist — confirming we renamed it.
path := statePath(t)
s := New()
if err := s.Save(path); err != nil {
t.Fatalf("save: %v", err)
}
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
t.Errorf("tmp file still present after Save: %v", err)
}
}
func TestSaveCreatesDirectory(t *testing.T) {
// State directory may not exist yet (first-ever boot). Save() should mkdir.
dir := filepath.Join(t.TempDir(), "fresh", "subdir")
path := filepath.Join(dir, "state.json")
if err := New().Save(path); err != nil {
t.Fatalf("save into nonexistent dir: %v", err)
}
if _, err := os.Stat(path); err != nil {
t.Errorf("state file not present after Save: %v", err)
}
}
func TestTransitionIdleToChecking(t *testing.T) {
path := statePath(t)
s := New()
if err := s.Transition(path, PhaseChecking, "v0.3.0", ""); err != nil {
t.Fatalf("transition: %v", err)
}
if s.Phase != PhaseChecking {
t.Errorf("phase: got %q, want %q", s.Phase, PhaseChecking)
}
if s.ToVersion != "v0.3.0" {
t.Errorf("to_version: got %q, want v0.3.0", s.ToVersion)
}
if s.AttemptCount != 1 {
t.Errorf("attempt_count: got %d, want 1 (first attempt after Idle)", s.AttemptCount)
}
if s.StartedAt.IsZero() {
t.Error("StartedAt should be set when leaving Idle")
}
}
func TestTransitionRetainsAttemptCountWithinAttempt(t *testing.T) {
path := statePath(t)
s := New()
_ = s.Transition(path, PhaseChecking, "v0.3.0", "")
_ = s.Transition(path, PhaseDownloading, "v0.3.0", "")
_ = s.Transition(path, PhaseStaged, "v0.3.0", "")
if s.AttemptCount != 1 {
t.Errorf("attempt_count after staying on same version: got %d, want 1", s.AttemptCount)
}
}
func TestTransitionResetsAttemptCountOnNewVersion(t *testing.T) {
path := statePath(t)
s := New()
_ = s.Transition(path, PhaseChecking, "v0.3.0", "")
// Now an attempt at a NEW version starts. AttemptCount should reset.
_ = s.Transition(path, PhaseChecking, "v0.4.0", "")
if s.ToVersion != "v0.4.0" {
t.Errorf("to_version: got %q, want v0.4.0", s.ToVersion)
}
if s.AttemptCount != 0 {
t.Errorf("attempt_count after new ToVersion: got %d, want 0 (reset)", s.AttemptCount)
}
}
func TestTransitionFailedRecordsError(t *testing.T) {
path := statePath(t)
s := New()
_ = s.Transition(path, PhaseDownloading, "v0.3.0", "")
_ = s.Transition(path, PhaseFailed, "v0.3.0", "checksum mismatch")
if s.Phase != PhaseFailed {
t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed)
}
if s.LastError != "checksum mismatch" {
t.Errorf("last_error: got %q, want %q", s.LastError, "checksum mismatch")
}
}
func TestTransitionSuccessClearsError(t *testing.T) {
path := statePath(t)
s := New()
_ = s.Transition(path, PhaseFailed, "v0.3.0", "boom")
if s.LastError == "" {
t.Fatal("setup: LastError should be non-empty before success")
}
_ = s.Transition(path, PhaseSuccess, "v0.3.0", "")
if s.LastError != "" {
t.Errorf("last_error after success: got %q, want empty", s.LastError)
}
}
func TestRecordError(t *testing.T) {
path := statePath(t)
s := New()
if err := s.RecordError(path, errors.New("network down")); err != nil {
t.Fatalf("RecordError: %v", err)
}
if s.Phase != PhaseFailed {
t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed)
}
if s.LastError != "network down" {
t.Errorf("last_error: got %q, want %q", s.LastError, "network down")
}
}
func TestSetFromVersionIdempotent(t *testing.T) {
s := New()
s.SetFromVersion("v0.2.0")
if s.FromVersion != "v0.2.0" {
t.Errorf("from_version: got %q, want v0.2.0", s.FromVersion)
}
// Second call should not overwrite.
s.SetFromVersion("v0.1.0")
if s.FromVersion != "v0.2.0" {
t.Errorf("from_version after second SetFromVersion: got %q, want v0.2.0 (immutable)", s.FromVersion)
}
}
func TestLoadHandlesGarbageFile(t *testing.T) {
path := statePath(t)
if err := os.WriteFile(path, []byte("not json"), 0o644); err != nil {
t.Fatalf("seed: %v", err)
}
_, err := Load(path)
if err == nil {
t.Error("expected error loading garbage, got nil")
}
}