feat(update): persistent state machine + lifecycle metrics
Some checks failed
ARM64 Build / Build generic ARM64 disk image (push) Failing after 4s
CI / Go Tests (push) Successful in 1m31s
CI / Shellcheck (push) Successful in 47s
CI / Build Go Binaries (amd64, linux, linux-amd64) (push) Failing after 10s
CI / Build Go Binaries (arm64, linux, linux-arm64) (push) Failing after 16s
Some checks failed
ARM64 Build / Build generic ARM64 disk image (push) Failing after 4s
CI / Go Tests (push) Successful in 1m31s
CI / Shellcheck (push) Successful in 47s
CI / Build Go Binaries (amd64, linux, linux-amd64) (push) Failing after 10s
CI / Build Go Binaries (arm64, linux, linux-arm64) (push) Failing after 16s
Phase 5 of v0.3. Adds an explicit, on-disk state machine to the update agent
so the lifecycle of an attempt is observable end-to-end, instead of being
inferred from logs and side effects.
New package update/pkg/state:
- Phase enum (idle, checking, downloading, staged, activated, verifying,
success, rolled_back, failed)
- UpdateState struct persisted to /var/lib/kubesolo/update/state.json
(overridable via --state). Atomic write (.tmp + rename). Survives reboots
and slot switches because the file lives on the data partition.
- Transition helper that bumps AttemptCount when an attempt starts, resets
it when the target version changes, sets/clears LastError on
failed/success transitions, and stamps StartedAt + UpdatedAt.
- 13 unit tests cover the lifecycle, atomic write, version-change reset,
error recording, idempotent SetFromVersion, garbage-file handling.
Wired into the existing commands:
- apply.go transitions Idle -> Checking -> Downloading -> Staged, with
RecordError on any step failure. Reads the active slot's version file to
populate FromVersion.
- activate.go transitions to Activated.
- healthcheck.go transitions Activated -> Verifying -> Success on pass,
or to Failed on fail. Skips transitions if state isn't post-activation
(manual healthcheck on a stable system shouldn't churn the state).
- rollback.go transitions to RolledBack with LastError="manual rollback".
- check.go intentionally untouched — checks are passive queries, not
attempts; they shouldn't reset AttemptCount.
status.go gains a --json mode that emits the full state report (A/B slots,
boot counter, full UpdateState) for orchestration tooling. Human-readable
mode also prints an Update Lifecycle section when state.phase != idle.
pkg/metrics gains three new series, derived from state.json at scrape time:
- kubesolo_update_phase{phase="..."} — 1 for current, 0 for all others;
all nine phase values always emitted so dashboards see complete series
- kubesolo_update_attempts_total
- kubesolo_update_last_attempt_timestamp_seconds
Server.SetStatePath() configures the file location; when it is absent (the
default), the metrics are emitted with Idle defaults. Three new tests cover
the absent / active / all-phases-emitted cases.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3,23 +3,35 @@ package cmd
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
|
||||
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||
)
|
||||
|
||||
// Activate switches the boot target to the passive partition.
|
||||
// After activation, the next reboot will boot from the new partition
|
||||
// with boot_counter=3. If health checks fail 3 times, GRUB auto-rolls back.
|
||||
//
|
||||
// State transition: Staged → Activated. On failure → Failed.
|
||||
func Activate(args []string) error {
|
||||
opts := parseOpts(args)
|
||||
env := opts.NewBootEnv()
|
||||
|
||||
st, err := state.Load(opts.StatePath)
|
||||
if err != nil {
|
||||
slog.Warn("state file unreadable, starting fresh", "error", err)
|
||||
st = state.New()
|
||||
}
|
||||
|
||||
// Get passive slot (the one we want to boot into)
|
||||
passiveSlot, err := env.PassiveSlot()
|
||||
if err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("reading passive slot: %w", err))
|
||||
return fmt.Errorf("reading passive slot: %w", err)
|
||||
}
|
||||
|
||||
activeSlot, err := env.ActiveSlot()
|
||||
if err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("reading active slot: %w", err))
|
||||
return fmt.Errorf("reading active slot: %w", err)
|
||||
}
|
||||
|
||||
@@ -27,9 +39,14 @@ func Activate(args []string) error {
|
||||
|
||||
// Set the passive slot as active with fresh boot counter
|
||||
if err := env.ActivateSlot(passiveSlot); err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("activating slot %s: %w", passiveSlot, err))
|
||||
return fmt.Errorf("activating slot %s: %w", passiveSlot, err)
|
||||
}
|
||||
|
||||
if err := st.Transition(opts.StatePath, state.PhaseActivated, "", ""); err != nil {
|
||||
slog.Warn("state transition failed", "phase", state.PhaseActivated, "error", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Slot %s activated (was %s)\n", passiveSlot, activeSlot)
|
||||
fmt.Println("Boot counter set to 3. Reboot to start the new version.")
|
||||
fmt.Println("The system will automatically roll back if health checks fail 3 times.")
|
||||
|
||||
@@ -6,10 +6,14 @@ import (
|
||||
|
||||
"github.com/portainer/kubesolo-os/update/pkg/image"
|
||||
"github.com/portainer/kubesolo-os/update/pkg/partition"
|
||||
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||
)
|
||||
|
||||
// Apply downloads a new OS image and writes it to the passive partition.
|
||||
// It does NOT activate the new partition — use 'activate' for that.
|
||||
//
|
||||
// State transitions: Idle/Success/Failed → Checking → Downloading → Staged.
|
||||
// On any error the state moves to Failed with LastError set.
|
||||
func Apply(args []string) error {
|
||||
opts := parseOpts(args)
|
||||
|
||||
@@ -17,11 +21,34 @@ func Apply(args []string) error {
|
||||
return fmt.Errorf("--server is required")
|
||||
}
|
||||
|
||||
st, err := state.Load(opts.StatePath)
|
||||
if err != nil {
|
||||
// Don't block the operation on a corrupt state file. Log + recover.
|
||||
slog.Warn("state file unreadable, starting fresh", "error", err)
|
||||
st = state.New()
|
||||
}
|
||||
|
||||
env := opts.NewBootEnv()
|
||||
|
||||
// Record the current running version as the "from" reference. The active
|
||||
// slot's version file is the most reliable source.
|
||||
activeSlot, slotErr := env.ActiveSlot()
|
||||
if slotErr == nil {
|
||||
if partInfo, perr := partition.GetSlotPartition(activeSlot); perr == nil {
|
||||
mp := "/tmp/kubesolo-active-" + activeSlot
|
||||
if merr := partition.MountReadOnly(partInfo.Device, mp); merr == nil {
|
||||
if v, rerr := partition.ReadVersion(mp); rerr == nil {
|
||||
st.SetFromVersion(v)
|
||||
}
|
||||
partition.Unmount(mp)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Determine passive slot
|
||||
passiveSlot, err := env.PassiveSlot()
|
||||
if err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("reading passive slot: %w", err))
|
||||
return fmt.Errorf("reading passive slot: %w", err)
|
||||
}
|
||||
|
||||
@@ -38,36 +65,55 @@ func Apply(args []string) error {
|
||||
slog.Info("signature verification enabled", "pubkey", opts.PubKeyPath)
|
||||
}
|
||||
|
||||
if err := st.Transition(opts.StatePath, state.PhaseChecking, "", ""); err != nil {
|
||||
slog.Warn("state transition failed", "phase", state.PhaseChecking, "error", err)
|
||||
}
|
||||
|
||||
meta, err := client.CheckForUpdate()
|
||||
if err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("checking for update: %w", err))
|
||||
return fmt.Errorf("checking for update: %w", err)
|
||||
}
|
||||
|
||||
slog.Info("update available", "version", meta.Version)
|
||||
|
||||
// Now we know the target version — record it (resets attempt count if it
|
||||
// differs from the previous attempt's ToVersion).
|
||||
if err := st.Transition(opts.StatePath, state.PhaseDownloading, meta.Version, ""); err != nil {
|
||||
slog.Warn("state transition failed", "phase", state.PhaseDownloading, "error", err)
|
||||
}
|
||||
|
||||
// Download and verify
|
||||
staged, err := client.Download(meta)
|
||||
if err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("downloading update: %w", err))
|
||||
return fmt.Errorf("downloading update: %w", err)
|
||||
}
|
||||
|
||||
// Mount passive partition
|
||||
partInfo, err := partition.GetSlotPartition(passiveSlot)
|
||||
if err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("finding passive partition: %w", err))
|
||||
return fmt.Errorf("finding passive partition: %w", err)
|
||||
}
|
||||
|
||||
mountPoint := "/tmp/kubesolo-passive-" + passiveSlot
|
||||
if err := partition.MountReadWrite(partInfo.Device, mountPoint); err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("mounting passive partition: %w", err))
|
||||
return fmt.Errorf("mounting passive partition: %w", err)
|
||||
}
|
||||
defer partition.Unmount(mountPoint)
|
||||
|
||||
// Write image to passive partition
|
||||
if err := partition.WriteSystemImage(mountPoint, staged.VmlinuzPath, staged.InitramfsPath, staged.Version); err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("writing system image: %w", err))
|
||||
return fmt.Errorf("writing system image: %w", err)
|
||||
}
|
||||
|
||||
if err := st.Transition(opts.StatePath, state.PhaseStaged, staged.Version, ""); err != nil {
|
||||
slog.Warn("state transition failed", "phase", state.PhaseStaged, "error", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Update v%s written to slot %s (%s)\n", staged.Version, passiveSlot, partInfo.Device)
|
||||
fmt.Println("Run 'kubesolo-update activate' to boot into the new version")
|
||||
|
||||
|
||||
@@ -6,16 +6,27 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/portainer/kubesolo-os/update/pkg/health"
|
||||
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||
)
|
||||
|
||||
// Healthcheck performs post-boot health verification.
|
||||
// If all checks pass, it marks the boot as successful in GRUB.
|
||||
// This should be run after every boot (typically via a systemd unit or
|
||||
// init script) to confirm the system is healthy.
|
||||
//
|
||||
// State transition: Activated → Verifying → Success on pass, → Failed on fail.
|
||||
// If the state isn't Activated (e.g. manual run on a long-stable system), the
|
||||
// state file is left alone — healthcheck still does its job.
|
||||
func Healthcheck(args []string) error {
|
||||
opts := parseOpts(args)
|
||||
env := opts.NewBootEnv()
|
||||
|
||||
st, err := state.Load(opts.StatePath)
|
||||
if err != nil {
|
||||
slog.Warn("state file unreadable, starting fresh", "error", err)
|
||||
st = state.New()
|
||||
}
|
||||
|
||||
// Check if already marked successful
|
||||
success, err := env.BootSuccess()
|
||||
if err != nil {
|
||||
@@ -26,6 +37,15 @@ func Healthcheck(args []string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Only transition state if we're post-activation. Manual healthcheck on a
|
||||
// long-stable system shouldn't reset Idle → Verifying.
|
||||
postActivation := st.Phase == state.PhaseActivated
|
||||
if postActivation {
|
||||
if err := st.Transition(opts.StatePath, state.PhaseVerifying, "", ""); err != nil {
|
||||
slog.Warn("state transition failed", "phase", state.PhaseVerifying, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
timeout := time.Duration(opts.TimeoutSecs) * time.Second
|
||||
checker := health.NewChecker("", "", timeout)
|
||||
|
||||
@@ -38,14 +58,26 @@ func Healthcheck(args []string) error {
|
||||
fmt.Printf(" apiserver: %v\n", status.APIServer)
|
||||
fmt.Printf(" node_ready: %v\n", status.NodeReady)
|
||||
fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot")
|
||||
if postActivation {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("post-boot health check failed: %s", status.Message))
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// Mark boot as successful
|
||||
if err := env.MarkBootSuccess(); err != nil {
|
||||
if postActivation {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("marking boot success: %w", err))
|
||||
}
|
||||
return fmt.Errorf("marking boot success: %w", err)
|
||||
}
|
||||
|
||||
if postActivation {
|
||||
if err := st.Transition(opts.StatePath, state.PhaseSuccess, "", ""); err != nil {
|
||||
slog.Warn("state transition failed", "phase", state.PhaseSuccess, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Println("Health check PASSED — boot marked successful")
|
||||
fmt.Printf(" containerd: %v\n", status.Containerd)
|
||||
fmt.Printf(" apiserver: %v\n", status.APIServer)
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"fmt"
|
||||
|
||||
"github.com/portainer/kubesolo-os/update/pkg/metrics"
|
||||
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||
)
|
||||
|
||||
// Metrics starts the Prometheus-compatible metrics HTTP server.
|
||||
@@ -12,10 +13,12 @@ func Metrics(args []string) error {
|
||||
fs := flag.NewFlagSet("metrics", flag.ExitOnError)
|
||||
listenAddr := fs.String("listen", ":9100", "Metrics HTTP listen address")
|
||||
grubenvPath := fs.String("grubenv", "/boot/grub/grubenv", "Path to grubenv file")
|
||||
statePath := fs.String("state", state.DefaultPath, "Path to update state.json")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return fmt.Errorf("parse flags: %w", err)
|
||||
}
|
||||
|
||||
srv := metrics.NewServer(*listenAddr, *grubenvPath)
|
||||
srv.SetStatePath(*statePath)
|
||||
return srv.ListenAndServe()
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ package cmd
|
||||
|
||||
import (
|
||||
"github.com/portainer/kubesolo-os/update/pkg/bootenv"
|
||||
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||
)
|
||||
|
||||
// opts holds shared command-line options for all subcommands.
|
||||
@@ -12,6 +13,8 @@ type opts struct {
|
||||
PubKeyPath string
|
||||
BootEnvType string // "grub" or "rpi"
|
||||
BootEnvPath string // path for RPi boot control dir
|
||||
StatePath string // location of state.json (default: state.DefaultPath)
|
||||
JSON bool // status: emit JSON instead of human-readable
|
||||
}
|
||||
|
||||
// NewBootEnv creates a BootEnv from the parsed options.
|
||||
@@ -31,10 +34,18 @@ func parseOpts(args []string) opts {
|
||||
GrubenvPath: "/boot/grub/grubenv",
|
||||
TimeoutSecs: 120,
|
||||
BootEnvType: "grub",
|
||||
StatePath: state.DefaultPath,
|
||||
}
|
||||
|
||||
for i := 0; i < len(args); i++ {
|
||||
switch args[i] {
|
||||
case "--state":
|
||||
if i+1 < len(args) {
|
||||
o.StatePath = args[i+1]
|
||||
i++
|
||||
}
|
||||
case "--json":
|
||||
o.JSON = true
|
||||
case "--server":
|
||||
if i+1 < len(args) {
|
||||
o.ServerURL = args[i+1]
|
||||
|
||||
@@ -3,14 +3,24 @@ package cmd
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
|
||||
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||
)
|
||||
|
||||
// Rollback forces an immediate switch to the other partition.
|
||||
// Use this to manually revert to the previous version.
|
||||
//
|
||||
// State transition: any → RolledBack with LastError="manual rollback".
|
||||
func Rollback(args []string) error {
|
||||
opts := parseOpts(args)
|
||||
env := opts.NewBootEnv()
|
||||
|
||||
st, err := state.Load(opts.StatePath)
|
||||
if err != nil {
|
||||
slog.Warn("state file unreadable, starting fresh", "error", err)
|
||||
st = state.New()
|
||||
}
|
||||
|
||||
activeSlot, err := env.ActiveSlot()
|
||||
if err != nil {
|
||||
return fmt.Errorf("reading active slot: %w", err)
|
||||
@@ -24,9 +34,14 @@ func Rollback(args []string) error {
|
||||
slog.Info("forcing rollback", "from", activeSlot, "to", passiveSlot)
|
||||
|
||||
if err := env.ForceRollback(); err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("rollback failed: %w", err))
|
||||
return fmt.Errorf("rollback failed: %w", err)
|
||||
}
|
||||
|
||||
if err := st.Transition(opts.StatePath, state.PhaseRolledBack, "", "manual rollback"); err != nil {
|
||||
slog.Warn("state transition failed", "phase", state.PhaseRolledBack, "error", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Rolled back: %s → %s\n", activeSlot, passiveSlot)
|
||||
fmt.Println("Reboot to complete rollback.")
|
||||
|
||||
|
||||
@@ -1,10 +1,26 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||
)
|
||||
|
||||
// statusReport is the JSON-emitted shape of `kubesolo-update status --json`.
|
||||
// Combines the bootloader-level A/B view with the update-agent state machine.
|
||||
type statusReport struct {
|
||||
ActiveSlot string `json:"active_slot"`
|
||||
PassiveSlot string `json:"passive_slot"`
|
||||
BootCounter int `json:"boot_counter"`
|
||||
BootSuccess bool `json:"boot_success"`
|
||||
State *state.UpdateState `json:"state"`
|
||||
}
|
||||
|
||||
// Status displays the current A/B slot configuration and boot state.
|
||||
// With --json, emits the full state report to stdout for orchestration
|
||||
// tooling.
|
||||
func Status(args []string) error {
|
||||
opts := parseOpts(args)
|
||||
env := opts.NewBootEnv()
|
||||
@@ -29,6 +45,23 @@ func Status(args []string) error {
|
||||
return fmt.Errorf("reading boot success: %w", err)
|
||||
}
|
||||
|
||||
// State file is non-fatal: present means we have an update lifecycle
|
||||
// recorded; absent means no update has run yet.
|
||||
st, _ := state.Load(opts.StatePath)
|
||||
|
||||
if opts.JSON {
|
||||
report := statusReport{
|
||||
ActiveSlot: activeSlot,
|
||||
PassiveSlot: passiveSlot,
|
||||
BootCounter: bootCounter,
|
||||
BootSuccess: bootSuccess,
|
||||
State: st,
|
||||
}
|
||||
enc := json.NewEncoder(os.Stdout)
|
||||
enc.SetIndent("", " ")
|
||||
return enc.Encode(report)
|
||||
}
|
||||
|
||||
fmt.Println("KubeSolo OS — A/B Partition Status")
|
||||
fmt.Println("───────────────────────────────────")
|
||||
fmt.Printf(" Active slot: %s\n", activeSlot)
|
||||
@@ -48,5 +81,25 @@ func Status(args []string) error {
|
||||
fmt.Printf("\n ⚠ Boot pending verification (%d attempts remaining)\n", bootCounter)
|
||||
}
|
||||
|
||||
if st != nil && st.Phase != state.PhaseIdle {
|
||||
fmt.Println("\nUpdate Lifecycle")
|
||||
fmt.Println("───────────────────────────────────")
|
||||
fmt.Printf(" Phase: %s\n", st.Phase)
|
||||
if st.FromVersion != "" {
|
||||
fmt.Printf(" From version: %s\n", st.FromVersion)
|
||||
}
|
||||
if st.ToVersion != "" {
|
||||
fmt.Printf(" To version: %s\n", st.ToVersion)
|
||||
}
|
||||
if !st.StartedAt.IsZero() {
|
||||
fmt.Printf(" Started: %s\n", st.StartedAt.Format("2006-01-02 15:04:05 MST"))
|
||||
}
|
||||
fmt.Printf(" Updated: %s\n", st.UpdatedAt.Format("2006-01-02 15:04:05 MST"))
|
||||
fmt.Printf(" Attempts: %d\n", st.AttemptCount)
|
||||
if st.LastError != "" {
|
||||
fmt.Printf(" Last error: %s\n", st.LastError)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user