feat(update): persistent state machine + lifecycle metrics
Some checks failed
ARM64 Build / Build generic ARM64 disk image (push) Failing after 4s
CI / Go Tests (push) Successful in 1m31s
CI / Shellcheck (push) Successful in 47s
CI / Build Go Binaries (amd64, linux, linux-amd64) (push) Failing after 10s
CI / Build Go Binaries (arm64, linux, linux-arm64) (push) Failing after 16s
Some checks failed
ARM64 Build / Build generic ARM64 disk image (push) Failing after 4s
CI / Go Tests (push) Successful in 1m31s
CI / Shellcheck (push) Successful in 47s
CI / Build Go Binaries (amd64, linux, linux-amd64) (push) Failing after 10s
CI / Build Go Binaries (arm64, linux, linux-arm64) (push) Failing after 16s
Phase 5 of v0.3. Adds an explicit, on-disk state machine to the update agent
so the lifecycle of an attempt is observable end-to-end, instead of being
inferred from logs and side effects.
New package update/pkg/state:
- Phase enum (idle, checking, downloading, staged, activated, verifying,
success, rolled_back, failed)
- UpdateState struct persisted to /var/lib/kubesolo/update/state.json
(overridable via --state). Atomic write (.tmp + rename). Survives reboots
and slot switches because the file lives on the data partition.
- Transition helper that bumps AttemptCount when an attempt starts, resets
it when the target version changes, sets/clears LastError on
failed/success transitions, and stamps StartedAt + UpdatedAt.
- 13 unit tests cover the lifecycle, atomic write, version-change reset,
error recording, idempotent SetFromVersion, garbage-file handling.
Wired into the existing commands:
- apply.go transitions Idle -> Checking -> Downloading -> Staged, with
RecordError on any step failure. Reads the active slot's version file to
populate FromVersion.
- activate.go transitions to Activated.
- healthcheck.go transitions Activated -> Verifying -> Success on pass,
or to Failed on fail. Skips transitions if state isn't post-activation
(manual healthcheck on a stable system shouldn't churn the state).
- rollback.go transitions to RolledBack with LastError="manual rollback".
- check.go intentionally untouched — checks are passive queries, not
attempts; they shouldn't reset AttemptCount.
status.go gains a --json mode that emits the full state report (A/B slots,
boot counter, full UpdateState) for orchestration tooling. Human-readable
mode also prints an Update Lifecycle section when state.phase != idle.
pkg/metrics gains three new series, derived from state.json at scrape time:
- kubesolo_update_phase{phase="..."} — 1 for current, 0 for all others;
all nine phase values always emitted so dashboards see complete series
- kubesolo_update_attempts_total
- kubesolo_update_last_attempt_timestamp_seconds
Server.SetStatePath() configures the file location; defaults to absent
which emits Idle defaults. Three new tests cover the absent / active /
all-phases-emitted cases.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3,23 +3,35 @@ package cmd
|
|||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
|
||||||
|
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Activate switches the boot target to the passive partition.
|
// Activate switches the boot target to the passive partition.
|
||||||
// After activation, the next reboot will boot from the new partition
|
// After activation, the next reboot will boot from the new partition
|
||||||
// with boot_counter=3. If health checks fail 3 times, GRUB auto-rolls back.
|
// with boot_counter=3. If health checks fail 3 times, GRUB auto-rolls back.
|
||||||
|
//
|
||||||
|
// State transition: Staged → Activated. On failure → Failed.
|
||||||
func Activate(args []string) error {
|
func Activate(args []string) error {
|
||||||
opts := parseOpts(args)
|
opts := parseOpts(args)
|
||||||
env := opts.NewBootEnv()
|
env := opts.NewBootEnv()
|
||||||
|
|
||||||
|
st, err := state.Load(opts.StatePath)
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("state file unreadable, starting fresh", "error", err)
|
||||||
|
st = state.New()
|
||||||
|
}
|
||||||
|
|
||||||
// Get passive slot (the one we want to boot into)
|
// Get passive slot (the one we want to boot into)
|
||||||
passiveSlot, err := env.PassiveSlot()
|
passiveSlot, err := env.PassiveSlot()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
_ = st.RecordError(opts.StatePath, fmt.Errorf("reading passive slot: %w", err))
|
||||||
return fmt.Errorf("reading passive slot: %w", err)
|
return fmt.Errorf("reading passive slot: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
activeSlot, err := env.ActiveSlot()
|
activeSlot, err := env.ActiveSlot()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
_ = st.RecordError(opts.StatePath, fmt.Errorf("reading active slot: %w", err))
|
||||||
return fmt.Errorf("reading active slot: %w", err)
|
return fmt.Errorf("reading active slot: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -27,9 +39,14 @@ func Activate(args []string) error {
|
|||||||
|
|
||||||
// Set the passive slot as active with fresh boot counter
|
// Set the passive slot as active with fresh boot counter
|
||||||
if err := env.ActivateSlot(passiveSlot); err != nil {
|
if err := env.ActivateSlot(passiveSlot); err != nil {
|
||||||
|
_ = st.RecordError(opts.StatePath, fmt.Errorf("activating slot %s: %w", passiveSlot, err))
|
||||||
return fmt.Errorf("activating slot %s: %w", passiveSlot, err)
|
return fmt.Errorf("activating slot %s: %w", passiveSlot, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if err := st.Transition(opts.StatePath, state.PhaseActivated, "", ""); err != nil {
|
||||||
|
slog.Warn("state transition failed", "phase", state.PhaseActivated, "error", err)
|
||||||
|
}
|
||||||
|
|
||||||
fmt.Printf("Slot %s activated (was %s)\n", passiveSlot, activeSlot)
|
fmt.Printf("Slot %s activated (was %s)\n", passiveSlot, activeSlot)
|
||||||
fmt.Println("Boot counter set to 3. Reboot to start the new version.")
|
fmt.Println("Boot counter set to 3. Reboot to start the new version.")
|
||||||
fmt.Println("The system will automatically roll back if health checks fail 3 times.")
|
fmt.Println("The system will automatically roll back if health checks fail 3 times.")
|
||||||
|
|||||||
@@ -6,10 +6,14 @@ import (
|
|||||||
|
|
||||||
"github.com/portainer/kubesolo-os/update/pkg/image"
|
"github.com/portainer/kubesolo-os/update/pkg/image"
|
||||||
"github.com/portainer/kubesolo-os/update/pkg/partition"
|
"github.com/portainer/kubesolo-os/update/pkg/partition"
|
||||||
|
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Apply downloads a new OS image and writes it to the passive partition.
|
// Apply downloads a new OS image and writes it to the passive partition.
|
||||||
// It does NOT activate the new partition — use 'activate' for that.
|
// It does NOT activate the new partition — use 'activate' for that.
|
||||||
|
//
|
||||||
|
// State transitions: Idle/Success/Failed → Checking → Downloading → Staged.
|
||||||
|
// On any error the state moves to Failed with LastError set.
|
||||||
func Apply(args []string) error {
|
func Apply(args []string) error {
|
||||||
opts := parseOpts(args)
|
opts := parseOpts(args)
|
||||||
|
|
||||||
@@ -17,11 +21,34 @@ func Apply(args []string) error {
|
|||||||
return fmt.Errorf("--server is required")
|
return fmt.Errorf("--server is required")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
st, err := state.Load(opts.StatePath)
|
||||||
|
if err != nil {
|
||||||
|
// Don't block the operation on a corrupt state file. Log + recover.
|
||||||
|
slog.Warn("state file unreadable, starting fresh", "error", err)
|
||||||
|
st = state.New()
|
||||||
|
}
|
||||||
|
|
||||||
env := opts.NewBootEnv()
|
env := opts.NewBootEnv()
|
||||||
|
|
||||||
|
// Record the current running version as the "from" reference. The active
|
||||||
|
// slot's version file is the most reliable source.
|
||||||
|
activeSlot, slotErr := env.ActiveSlot()
|
||||||
|
if slotErr == nil {
|
||||||
|
if partInfo, perr := partition.GetSlotPartition(activeSlot); perr == nil {
|
||||||
|
mp := "/tmp/kubesolo-active-" + activeSlot
|
||||||
|
if merr := partition.MountReadOnly(partInfo.Device, mp); merr == nil {
|
||||||
|
if v, rerr := partition.ReadVersion(mp); rerr == nil {
|
||||||
|
st.SetFromVersion(v)
|
||||||
|
}
|
||||||
|
partition.Unmount(mp)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Determine passive slot
|
// Determine passive slot
|
||||||
passiveSlot, err := env.PassiveSlot()
|
passiveSlot, err := env.PassiveSlot()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
_ = st.RecordError(opts.StatePath, fmt.Errorf("reading passive slot: %w", err))
|
||||||
return fmt.Errorf("reading passive slot: %w", err)
|
return fmt.Errorf("reading passive slot: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -38,36 +65,55 @@ func Apply(args []string) error {
|
|||||||
slog.Info("signature verification enabled", "pubkey", opts.PubKeyPath)
|
slog.Info("signature verification enabled", "pubkey", opts.PubKeyPath)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if err := st.Transition(opts.StatePath, state.PhaseChecking, "", ""); err != nil {
|
||||||
|
slog.Warn("state transition failed", "phase", state.PhaseChecking, "error", err)
|
||||||
|
}
|
||||||
|
|
||||||
meta, err := client.CheckForUpdate()
|
meta, err := client.CheckForUpdate()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
_ = st.RecordError(opts.StatePath, fmt.Errorf("checking for update: %w", err))
|
||||||
return fmt.Errorf("checking for update: %w", err)
|
return fmt.Errorf("checking for update: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Info("update available", "version", meta.Version)
|
slog.Info("update available", "version", meta.Version)
|
||||||
|
|
||||||
|
// Now we know the target version — record it (resets attempt count if it
|
||||||
|
// differs from the previous attempt's ToVersion).
|
||||||
|
if err := st.Transition(opts.StatePath, state.PhaseDownloading, meta.Version, ""); err != nil {
|
||||||
|
slog.Warn("state transition failed", "phase", state.PhaseDownloading, "error", err)
|
||||||
|
}
|
||||||
|
|
||||||
// Download and verify
|
// Download and verify
|
||||||
staged, err := client.Download(meta)
|
staged, err := client.Download(meta)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
_ = st.RecordError(opts.StatePath, fmt.Errorf("downloading update: %w", err))
|
||||||
return fmt.Errorf("downloading update: %w", err)
|
return fmt.Errorf("downloading update: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Mount passive partition
|
// Mount passive partition
|
||||||
partInfo, err := partition.GetSlotPartition(passiveSlot)
|
partInfo, err := partition.GetSlotPartition(passiveSlot)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
_ = st.RecordError(opts.StatePath, fmt.Errorf("finding passive partition: %w", err))
|
||||||
return fmt.Errorf("finding passive partition: %w", err)
|
return fmt.Errorf("finding passive partition: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
mountPoint := "/tmp/kubesolo-passive-" + passiveSlot
|
mountPoint := "/tmp/kubesolo-passive-" + passiveSlot
|
||||||
if err := partition.MountReadWrite(partInfo.Device, mountPoint); err != nil {
|
if err := partition.MountReadWrite(partInfo.Device, mountPoint); err != nil {
|
||||||
|
_ = st.RecordError(opts.StatePath, fmt.Errorf("mounting passive partition: %w", err))
|
||||||
return fmt.Errorf("mounting passive partition: %w", err)
|
return fmt.Errorf("mounting passive partition: %w", err)
|
||||||
}
|
}
|
||||||
defer partition.Unmount(mountPoint)
|
defer partition.Unmount(mountPoint)
|
||||||
|
|
||||||
// Write image to passive partition
|
// Write image to passive partition
|
||||||
if err := partition.WriteSystemImage(mountPoint, staged.VmlinuzPath, staged.InitramfsPath, staged.Version); err != nil {
|
if err := partition.WriteSystemImage(mountPoint, staged.VmlinuzPath, staged.InitramfsPath, staged.Version); err != nil {
|
||||||
|
_ = st.RecordError(opts.StatePath, fmt.Errorf("writing system image: %w", err))
|
||||||
return fmt.Errorf("writing system image: %w", err)
|
return fmt.Errorf("writing system image: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if err := st.Transition(opts.StatePath, state.PhaseStaged, staged.Version, ""); err != nil {
|
||||||
|
slog.Warn("state transition failed", "phase", state.PhaseStaged, "error", err)
|
||||||
|
}
|
||||||
|
|
||||||
fmt.Printf("Update v%s written to slot %s (%s)\n", staged.Version, passiveSlot, partInfo.Device)
|
fmt.Printf("Update v%s written to slot %s (%s)\n", staged.Version, passiveSlot, partInfo.Device)
|
||||||
fmt.Println("Run 'kubesolo-update activate' to boot into the new version")
|
fmt.Println("Run 'kubesolo-update activate' to boot into the new version")
|
||||||
|
|
||||||
|
|||||||
@@ -6,16 +6,27 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/portainer/kubesolo-os/update/pkg/health"
|
"github.com/portainer/kubesolo-os/update/pkg/health"
|
||||||
|
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Healthcheck performs post-boot health verification.
|
// Healthcheck performs post-boot health verification.
|
||||||
// If all checks pass, it marks the boot as successful in GRUB.
|
// If all checks pass, it marks the boot as successful in GRUB.
|
||||||
// This should be run after every boot (typically via a systemd unit or
|
// This should be run after every boot (typically via a systemd unit or
|
||||||
// init script) to confirm the system is healthy.
|
// init script) to confirm the system is healthy.
|
||||||
|
//
|
||||||
|
// State transition: Activated → Verifying → Success on pass, → Failed on fail.
|
||||||
|
// If state isn't in Activated (e.g. manual run on a long-stable system), the
|
||||||
|
// state file is left alone — healthcheck still does its job.
|
||||||
func Healthcheck(args []string) error {
|
func Healthcheck(args []string) error {
|
||||||
opts := parseOpts(args)
|
opts := parseOpts(args)
|
||||||
env := opts.NewBootEnv()
|
env := opts.NewBootEnv()
|
||||||
|
|
||||||
|
st, err := state.Load(opts.StatePath)
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("state file unreadable, starting fresh", "error", err)
|
||||||
|
st = state.New()
|
||||||
|
}
|
||||||
|
|
||||||
// Check if already marked successful
|
// Check if already marked successful
|
||||||
success, err := env.BootSuccess()
|
success, err := env.BootSuccess()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -26,6 +37,15 @@ func Healthcheck(args []string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Only transition state if we're post-activation. Manual healthcheck on a
|
||||||
|
// long-stable system shouldn't reset Idle → Verifying.
|
||||||
|
postActivation := st.Phase == state.PhaseActivated
|
||||||
|
if postActivation {
|
||||||
|
if err := st.Transition(opts.StatePath, state.PhaseVerifying, "", ""); err != nil {
|
||||||
|
slog.Warn("state transition failed", "phase", state.PhaseVerifying, "error", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
timeout := time.Duration(opts.TimeoutSecs) * time.Second
|
timeout := time.Duration(opts.TimeoutSecs) * time.Second
|
||||||
checker := health.NewChecker("", "", timeout)
|
checker := health.NewChecker("", "", timeout)
|
||||||
|
|
||||||
@@ -38,14 +58,26 @@ func Healthcheck(args []string) error {
|
|||||||
fmt.Printf(" apiserver: %v\n", status.APIServer)
|
fmt.Printf(" apiserver: %v\n", status.APIServer)
|
||||||
fmt.Printf(" node_ready: %v\n", status.NodeReady)
|
fmt.Printf(" node_ready: %v\n", status.NodeReady)
|
||||||
fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot")
|
fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot")
|
||||||
|
if postActivation {
|
||||||
|
_ = st.RecordError(opts.StatePath, fmt.Errorf("post-boot health check failed: %s", status.Message))
|
||||||
|
}
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Mark boot as successful
|
// Mark boot as successful
|
||||||
if err := env.MarkBootSuccess(); err != nil {
|
if err := env.MarkBootSuccess(); err != nil {
|
||||||
|
if postActivation {
|
||||||
|
_ = st.RecordError(opts.StatePath, fmt.Errorf("marking boot success: %w", err))
|
||||||
|
}
|
||||||
return fmt.Errorf("marking boot success: %w", err)
|
return fmt.Errorf("marking boot success: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if postActivation {
|
||||||
|
if err := st.Transition(opts.StatePath, state.PhaseSuccess, "", ""); err != nil {
|
||||||
|
slog.Warn("state transition failed", "phase", state.PhaseSuccess, "error", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fmt.Println("Health check PASSED — boot marked successful")
|
fmt.Println("Health check PASSED — boot marked successful")
|
||||||
fmt.Printf(" containerd: %v\n", status.Containerd)
|
fmt.Printf(" containerd: %v\n", status.Containerd)
|
||||||
fmt.Printf(" apiserver: %v\n", status.APIServer)
|
fmt.Printf(" apiserver: %v\n", status.APIServer)
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
|
|
||||||
"github.com/portainer/kubesolo-os/update/pkg/metrics"
|
"github.com/portainer/kubesolo-os/update/pkg/metrics"
|
||||||
|
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Metrics starts the Prometheus-compatible metrics HTTP server.
|
// Metrics starts the Prometheus-compatible metrics HTTP server.
|
||||||
@@ -12,10 +13,12 @@ func Metrics(args []string) error {
|
|||||||
fs := flag.NewFlagSet("metrics", flag.ExitOnError)
|
fs := flag.NewFlagSet("metrics", flag.ExitOnError)
|
||||||
listenAddr := fs.String("listen", ":9100", "Metrics HTTP listen address")
|
listenAddr := fs.String("listen", ":9100", "Metrics HTTP listen address")
|
||||||
grubenvPath := fs.String("grubenv", "/boot/grub/grubenv", "Path to grubenv file")
|
grubenvPath := fs.String("grubenv", "/boot/grub/grubenv", "Path to grubenv file")
|
||||||
|
statePath := fs.String("state", state.DefaultPath, "Path to update state.json")
|
||||||
if err := fs.Parse(args); err != nil {
|
if err := fs.Parse(args); err != nil {
|
||||||
return fmt.Errorf("parse flags: %w", err)
|
return fmt.Errorf("parse flags: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
srv := metrics.NewServer(*listenAddr, *grubenvPath)
|
srv := metrics.NewServer(*listenAddr, *grubenvPath)
|
||||||
|
srv.SetStatePath(*statePath)
|
||||||
return srv.ListenAndServe()
|
return srv.ListenAndServe()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package cmd
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"github.com/portainer/kubesolo-os/update/pkg/bootenv"
|
"github.com/portainer/kubesolo-os/update/pkg/bootenv"
|
||||||
|
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
// opts holds shared command-line options for all subcommands.
|
// opts holds shared command-line options for all subcommands.
|
||||||
@@ -12,6 +13,8 @@ type opts struct {
|
|||||||
PubKeyPath string
|
PubKeyPath string
|
||||||
BootEnvType string // "grub" or "rpi"
|
BootEnvType string // "grub" or "rpi"
|
||||||
BootEnvPath string // path for RPi boot control dir
|
BootEnvPath string // path for RPi boot control dir
|
||||||
|
StatePath string // location of state.json (default: state.DefaultPath)
|
||||||
|
JSON bool // status: emit JSON instead of human-readable
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewBootEnv creates a BootEnv from the parsed options.
|
// NewBootEnv creates a BootEnv from the parsed options.
|
||||||
@@ -31,10 +34,18 @@ func parseOpts(args []string) opts {
|
|||||||
GrubenvPath: "/boot/grub/grubenv",
|
GrubenvPath: "/boot/grub/grubenv",
|
||||||
TimeoutSecs: 120,
|
TimeoutSecs: 120,
|
||||||
BootEnvType: "grub",
|
BootEnvType: "grub",
|
||||||
|
StatePath: state.DefaultPath,
|
||||||
}
|
}
|
||||||
|
|
||||||
for i := 0; i < len(args); i++ {
|
for i := 0; i < len(args); i++ {
|
||||||
switch args[i] {
|
switch args[i] {
|
||||||
|
case "--state":
|
||||||
|
if i+1 < len(args) {
|
||||||
|
o.StatePath = args[i+1]
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
case "--json":
|
||||||
|
o.JSON = true
|
||||||
case "--server":
|
case "--server":
|
||||||
if i+1 < len(args) {
|
if i+1 < len(args) {
|
||||||
o.ServerURL = args[i+1]
|
o.ServerURL = args[i+1]
|
||||||
|
|||||||
@@ -3,14 +3,24 @@ package cmd
|
|||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
|
||||||
|
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Rollback forces an immediate switch to the other partition.
|
// Rollback forces an immediate switch to the other partition.
|
||||||
// Use this to manually revert to the previous version.
|
// Use this to manually revert to the previous version.
|
||||||
|
//
|
||||||
|
// State transition: any → RolledBack with LastError="manual rollback".
|
||||||
func Rollback(args []string) error {
|
func Rollback(args []string) error {
|
||||||
opts := parseOpts(args)
|
opts := parseOpts(args)
|
||||||
env := opts.NewBootEnv()
|
env := opts.NewBootEnv()
|
||||||
|
|
||||||
|
st, err := state.Load(opts.StatePath)
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("state file unreadable, starting fresh", "error", err)
|
||||||
|
st = state.New()
|
||||||
|
}
|
||||||
|
|
||||||
activeSlot, err := env.ActiveSlot()
|
activeSlot, err := env.ActiveSlot()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("reading active slot: %w", err)
|
return fmt.Errorf("reading active slot: %w", err)
|
||||||
@@ -24,9 +34,14 @@ func Rollback(args []string) error {
|
|||||||
slog.Info("forcing rollback", "from", activeSlot, "to", passiveSlot)
|
slog.Info("forcing rollback", "from", activeSlot, "to", passiveSlot)
|
||||||
|
|
||||||
if err := env.ForceRollback(); err != nil {
|
if err := env.ForceRollback(); err != nil {
|
||||||
|
_ = st.RecordError(opts.StatePath, fmt.Errorf("rollback failed: %w", err))
|
||||||
return fmt.Errorf("rollback failed: %w", err)
|
return fmt.Errorf("rollback failed: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if err := st.Transition(opts.StatePath, state.PhaseRolledBack, "", "manual rollback"); err != nil {
|
||||||
|
slog.Warn("state transition failed", "phase", state.PhaseRolledBack, "error", err)
|
||||||
|
}
|
||||||
|
|
||||||
fmt.Printf("Rolled back: %s → %s\n", activeSlot, passiveSlot)
|
fmt.Printf("Rolled back: %s → %s\n", activeSlot, passiveSlot)
|
||||||
fmt.Println("Reboot to complete rollback.")
|
fmt.Println("Reboot to complete rollback.")
|
||||||
|
|
||||||
|
|||||||
@@ -1,10 +1,26 @@
|
|||||||
package cmd
|
package cmd
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"os"
|
||||||
|
|
||||||
|
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// statusReport is the JSON-emitted shape of `kubesolo-update status --json`.
|
||||||
|
// Combines the bootloader-level A/B view with the update-agent state machine.
|
||||||
|
type statusReport struct {
|
||||||
|
ActiveSlot string `json:"active_slot"`
|
||||||
|
PassiveSlot string `json:"passive_slot"`
|
||||||
|
BootCounter int `json:"boot_counter"`
|
||||||
|
BootSuccess bool `json:"boot_success"`
|
||||||
|
State *state.UpdateState `json:"state"`
|
||||||
|
}
|
||||||
|
|
||||||
// Status displays the current A/B slot configuration and boot state.
|
// Status displays the current A/B slot configuration and boot state.
|
||||||
|
// With --json, emits the full state report to stdout for orchestration
|
||||||
|
// tooling.
|
||||||
func Status(args []string) error {
|
func Status(args []string) error {
|
||||||
opts := parseOpts(args)
|
opts := parseOpts(args)
|
||||||
env := opts.NewBootEnv()
|
env := opts.NewBootEnv()
|
||||||
@@ -29,6 +45,23 @@ func Status(args []string) error {
|
|||||||
return fmt.Errorf("reading boot success: %w", err)
|
return fmt.Errorf("reading boot success: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// State file is non-fatal: present means we have an update lifecycle
|
||||||
|
// recorded; absent means no update has run yet.
|
||||||
|
st, _ := state.Load(opts.StatePath)
|
||||||
|
|
||||||
|
if opts.JSON {
|
||||||
|
report := statusReport{
|
||||||
|
ActiveSlot: activeSlot,
|
||||||
|
PassiveSlot: passiveSlot,
|
||||||
|
BootCounter: bootCounter,
|
||||||
|
BootSuccess: bootSuccess,
|
||||||
|
State: st,
|
||||||
|
}
|
||||||
|
enc := json.NewEncoder(os.Stdout)
|
||||||
|
enc.SetIndent("", " ")
|
||||||
|
return enc.Encode(report)
|
||||||
|
}
|
||||||
|
|
||||||
fmt.Println("KubeSolo OS — A/B Partition Status")
|
fmt.Println("KubeSolo OS — A/B Partition Status")
|
||||||
fmt.Println("───────────────────────────────────")
|
fmt.Println("───────────────────────────────────")
|
||||||
fmt.Printf(" Active slot: %s\n", activeSlot)
|
fmt.Printf(" Active slot: %s\n", activeSlot)
|
||||||
@@ -48,5 +81,25 @@ func Status(args []string) error {
|
|||||||
fmt.Printf("\n ⚠ Boot pending verification (%d attempts remaining)\n", bootCounter)
|
fmt.Printf("\n ⚠ Boot pending verification (%d attempts remaining)\n", bootCounter)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if st != nil && st.Phase != state.PhaseIdle {
|
||||||
|
fmt.Println("\nUpdate Lifecycle")
|
||||||
|
fmt.Println("───────────────────────────────────")
|
||||||
|
fmt.Printf(" Phase: %s\n", st.Phase)
|
||||||
|
if st.FromVersion != "" {
|
||||||
|
fmt.Printf(" From version: %s\n", st.FromVersion)
|
||||||
|
}
|
||||||
|
if st.ToVersion != "" {
|
||||||
|
fmt.Printf(" To version: %s\n", st.ToVersion)
|
||||||
|
}
|
||||||
|
if !st.StartedAt.IsZero() {
|
||||||
|
fmt.Printf(" Started: %s\n", st.StartedAt.Format("2006-01-02 15:04:05 MST"))
|
||||||
|
}
|
||||||
|
fmt.Printf(" Updated: %s\n", st.UpdatedAt.Format("2006-01-02 15:04:05 MST"))
|
||||||
|
fmt.Printf(" Attempts: %d\n", st.AttemptCount)
|
||||||
|
if st.LastError != "" {
|
||||||
|
fmt.Printf(" Last error: %s\n", st.LastError)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -80,13 +80,16 @@ Commands:
|
|||||||
Options:
|
Options:
|
||||||
--server URL Update server URL (default: from /etc/kubesolo/update.conf)
|
--server URL Update server URL (default: from /etc/kubesolo/update.conf)
|
||||||
--grubenv PATH Path to grubenv file (default: /boot/grub/grubenv)
|
--grubenv PATH Path to grubenv file (default: /boot/grub/grubenv)
|
||||||
|
--state PATH Update state file (default: /var/lib/kubesolo/update/state.json)
|
||||||
--timeout SECS Health check timeout in seconds (default: 120)
|
--timeout SECS Health check timeout in seconds (default: 120)
|
||||||
--pubkey PATH Ed25519 public key for signature verification (optional)
|
--pubkey PATH Ed25519 public key for signature verification (optional)
|
||||||
|
--json For 'status': emit JSON instead of human-readable output
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
kubesolo-update check --server https://updates.example.com
|
kubesolo-update check --server https://updates.example.com
|
||||||
kubesolo-update apply --server https://updates.example.com --pubkey /etc/kubesolo/update-pubkey.hex
|
kubesolo-update apply --server https://updates.example.com --pubkey /etc/kubesolo/update-pubkey.hex
|
||||||
kubesolo-update healthcheck
|
kubesolo-update healthcheck
|
||||||
kubesolo-update status
|
kubesolo-update status
|
||||||
|
kubesolo-update status --json
|
||||||
`)
|
`)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,6 +11,9 @@
|
|||||||
// kubesolo_os_update_last_check_timestamp_seconds unix timestamp (gauge)
|
// kubesolo_os_update_last_check_timestamp_seconds unix timestamp (gauge)
|
||||||
// kubesolo_os_memory_total_bytes total RAM (gauge)
|
// kubesolo_os_memory_total_bytes total RAM (gauge)
|
||||||
// kubesolo_os_memory_available_bytes available RAM (gauge)
|
// kubesolo_os_memory_available_bytes available RAM (gauge)
|
||||||
|
// kubesolo_update_phase{phase} 1 for current phase, 0 for others
|
||||||
|
// kubesolo_update_attempts_total counter — attempts at current ToVersion
|
||||||
|
// kubesolo_update_last_attempt_timestamp_seconds unix timestamp of last state update
|
||||||
//
|
//
|
||||||
// This is a zero-dependency implementation — no Prometheus client library needed.
|
// This is a zero-dependency implementation — no Prometheus client library needed.
|
||||||
// It serves metrics in the Prometheus text exposition format.
|
// It serves metrics in the Prometheus text exposition format.
|
||||||
@@ -25,11 +28,14 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Server is a lightweight Prometheus metrics HTTP server.
|
// Server is a lightweight Prometheus metrics HTTP server.
|
||||||
type Server struct {
|
type Server struct {
|
||||||
grubenvPath string
|
grubenvPath string
|
||||||
|
statePath string
|
||||||
listenAddr string
|
listenAddr string
|
||||||
startTime time.Time
|
startTime time.Time
|
||||||
|
|
||||||
@@ -47,6 +53,27 @@ func NewServer(listenAddr, grubenvPath string) *Server {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SetStatePath sets the location of the update state.json file. If empty or
|
||||||
|
// unset, state-derived metrics are emitted with the Idle defaults.
|
||||||
|
func (s *Server) SetStatePath(p string) {
|
||||||
|
s.statePath = p
|
||||||
|
}
|
||||||
|
|
||||||
|
// allPhases lists every Phase value we emit as a kubesolo_update_phase
|
||||||
|
// time-series, so consumers see all label values (with value 0 for non-current
|
||||||
|
// phases). Mirror of validPhases in pkg/state.
|
||||||
|
var allPhases = []state.Phase{
|
||||||
|
state.PhaseIdle,
|
||||||
|
state.PhaseChecking,
|
||||||
|
state.PhaseDownloading,
|
||||||
|
state.PhaseStaged,
|
||||||
|
state.PhaseActivated,
|
||||||
|
state.PhaseVerifying,
|
||||||
|
state.PhaseSuccess,
|
||||||
|
state.PhaseRolledBack,
|
||||||
|
state.PhaseFailed,
|
||||||
|
}
|
||||||
|
|
||||||
// SetUpdateAvailable records whether an update is available.
|
// SetUpdateAvailable records whether an update is available.
|
||||||
func (s *Server) SetUpdateAvailable(available bool) {
|
func (s *Server) SetUpdateAvailable(available bool) {
|
||||||
s.mu.Lock()
|
s.mu.Lock()
|
||||||
@@ -125,9 +152,49 @@ func (s *Server) handleMetrics(w http.ResponseWriter, r *http.Request) {
|
|||||||
sb.WriteString("# TYPE kubesolo_os_memory_available_bytes gauge\n")
|
sb.WriteString("# TYPE kubesolo_os_memory_available_bytes gauge\n")
|
||||||
sb.WriteString(fmt.Sprintf("kubesolo_os_memory_available_bytes %d\n", memAvail))
|
sb.WriteString(fmt.Sprintf("kubesolo_os_memory_available_bytes %d\n", memAvail))
|
||||||
|
|
||||||
|
// Update lifecycle (from state.json)
|
||||||
|
s.writeUpdateStateMetrics(&sb)
|
||||||
|
|
||||||
fmt.Fprint(w, sb.String())
|
fmt.Fprint(w, sb.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// writeUpdateStateMetrics appends update-lifecycle metrics derived from the
|
||||||
|
// state.json file. If the file is missing or unreadable, emits the Idle
|
||||||
|
// defaults so the metric series exists at all times.
|
||||||
|
func (s *Server) writeUpdateStateMetrics(sb *strings.Builder) {
|
||||||
|
current := state.PhaseIdle
|
||||||
|
var attempts int
|
||||||
|
var lastTS float64
|
||||||
|
|
||||||
|
if s.statePath != "" {
|
||||||
|
if st, err := state.Load(s.statePath); err == nil && st != nil {
|
||||||
|
current = st.Phase
|
||||||
|
attempts = st.AttemptCount
|
||||||
|
if !st.UpdatedAt.IsZero() {
|
||||||
|
lastTS = float64(st.UpdatedAt.Unix())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sb.WriteString("# HELP kubesolo_update_phase Current update lifecycle phase (1 for active, 0 otherwise).\n")
|
||||||
|
sb.WriteString("# TYPE kubesolo_update_phase gauge\n")
|
||||||
|
for _, p := range allPhases {
|
||||||
|
v := 0
|
||||||
|
if p == current {
|
||||||
|
v = 1
|
||||||
|
}
|
||||||
|
sb.WriteString(fmt.Sprintf("kubesolo_update_phase{phase=%q} %d\n", string(p), v))
|
||||||
|
}
|
||||||
|
|
||||||
|
sb.WriteString("# HELP kubesolo_update_attempts_total Number of update attempts at the current target version.\n")
|
||||||
|
sb.WriteString("# TYPE kubesolo_update_attempts_total counter\n")
|
||||||
|
sb.WriteString(fmt.Sprintf("kubesolo_update_attempts_total %d\n", attempts))
|
||||||
|
|
||||||
|
sb.WriteString("# HELP kubesolo_update_last_attempt_timestamp_seconds Unix timestamp of the last state transition.\n")
|
||||||
|
sb.WriteString("# TYPE kubesolo_update_last_attempt_timestamp_seconds gauge\n")
|
||||||
|
sb.WriteString(fmt.Sprintf("kubesolo_update_last_attempt_timestamp_seconds %.0f\n", lastTS))
|
||||||
|
}
|
||||||
|
|
||||||
// readGrubenvVar reads a single variable from grubenv using simple file parse.
|
// readGrubenvVar reads a single variable from grubenv using simple file parse.
|
||||||
func (s *Server) readGrubenvVar(key string) string {
|
func (s *Server) readGrubenvVar(key string) string {
|
||||||
data, err := os.ReadFile(s.grubenvPath)
|
data, err := os.ReadFile(s.grubenvPath)
|
||||||
|
|||||||
@@ -8,6 +8,8 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestNewServer(t *testing.T) {
|
func TestNewServer(t *testing.T) {
|
||||||
@@ -247,6 +249,86 @@ func TestSafeInt(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestUpdateStateMetricsAbsentStateFile(t *testing.T) {
|
||||||
|
// No state path set — should emit Idle defaults so the metric series
|
||||||
|
// exists from first boot.
|
||||||
|
s := NewServer(":9100", "/tmp/nonexistent")
|
||||||
|
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
s.handleMetrics(w, req)
|
||||||
|
|
||||||
|
body, _ := io.ReadAll(w.Result().Body)
|
||||||
|
output := string(body)
|
||||||
|
|
||||||
|
if !strings.Contains(output, `kubesolo_update_phase{phase="idle"} 1`) {
|
||||||
|
t.Errorf("expected idle=1 with no state file, got:\n%s", output)
|
||||||
|
}
|
||||||
|
if !strings.Contains(output, `kubesolo_update_phase{phase="checking"} 0`) {
|
||||||
|
t.Errorf("expected checking=0 with no state file, got:\n%s", output)
|
||||||
|
}
|
||||||
|
if !strings.Contains(output, "kubesolo_update_attempts_total 0") {
|
||||||
|
t.Errorf("expected attempts=0 with no state file, got:\n%s", output)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestUpdateStateMetricsActivePhase(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
statePath := filepath.Join(dir, "state.json")
|
||||||
|
|
||||||
|
st := state.New()
|
||||||
|
if err := st.Transition(statePath, state.PhaseDownloading, "v0.3.0", ""); err != nil {
|
||||||
|
t.Fatalf("seed state: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
s := NewServer(":9100", "/tmp/nonexistent")
|
||||||
|
s.SetStatePath(statePath)
|
||||||
|
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
s.handleMetrics(w, req)
|
||||||
|
|
||||||
|
body, _ := io.ReadAll(w.Result().Body)
|
||||||
|
output := string(body)
|
||||||
|
|
||||||
|
if !strings.Contains(output, `kubesolo_update_phase{phase="downloading"} 1`) {
|
||||||
|
t.Errorf("expected downloading=1, got:\n%s", output)
|
||||||
|
}
|
||||||
|
if !strings.Contains(output, `kubesolo_update_phase{phase="idle"} 0`) {
|
||||||
|
t.Errorf("expected idle=0 when downloading is active, got:\n%s", output)
|
||||||
|
}
|
||||||
|
if !strings.Contains(output, "kubesolo_update_attempts_total 1") {
|
||||||
|
t.Errorf("expected attempts=1 after first Transition, got:\n%s", output)
|
||||||
|
}
|
||||||
|
if strings.Contains(output, "kubesolo_update_last_attempt_timestamp_seconds 0\n") {
|
||||||
|
t.Errorf("expected non-zero timestamp after state write, got:\n%s", output)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestUpdateStateMetricsAllPhasesEmitted(t *testing.T) {
|
||||||
|
// Every phase value should appear in the output, so dashboards can graph
|
||||||
|
// the series cleanly.
|
||||||
|
s := NewServer(":9100", "/tmp/nonexistent")
|
||||||
|
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
s.handleMetrics(w, req)
|
||||||
|
|
||||||
|
body, _ := io.ReadAll(w.Result().Body)
|
||||||
|
output := string(body)
|
||||||
|
|
||||||
|
for _, p := range []state.Phase{
|
||||||
|
state.PhaseIdle, state.PhaseChecking, state.PhaseDownloading,
|
||||||
|
state.PhaseStaged, state.PhaseActivated, state.PhaseVerifying,
|
||||||
|
state.PhaseSuccess, state.PhaseRolledBack, state.PhaseFailed,
|
||||||
|
} {
|
||||||
|
needle := `kubesolo_update_phase{phase="` + string(p) + `"}`
|
||||||
|
if !strings.Contains(output, needle) {
|
||||||
|
t.Errorf("phase %q not present in metrics output", p)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestReadFileString(t *testing.T) {
|
func TestReadFileString(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
|
|
||||||
|
|||||||
200
update/pkg/state/state.go
Normal file
200
update/pkg/state/state.go
Normal file
@@ -0,0 +1,200 @@
|
|||||||
|
// Package state tracks the lifecycle of an OS update on disk.
|
||||||
|
//
|
||||||
|
// The state file (default /var/lib/kubesolo/update/state.json) records which
|
||||||
|
// phase the agent is in, what versions are involved, when the attempt started,
|
||||||
|
// any error from the last operation, and how many attempts have been made.
|
||||||
|
// Updates are atomic via tmp+rename, so a crash mid-write doesn't corrupt the
|
||||||
|
// state.
|
||||||
|
//
|
||||||
|
// Consumers:
|
||||||
|
// - cmd/check, cmd/apply, cmd/activate, cmd/healthcheck, cmd/rollback —
|
||||||
|
// transition the phase as they enter / leave their operations.
|
||||||
|
// - cmd/status --json — emits the raw state for orchestration tooling.
|
||||||
|
// - pkg/metrics — reads the state at scrape time to expose phase and
|
||||||
|
// attempt-count gauges.
|
||||||
|
package state
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// DefaultPath is where state.json lives on a live system. The directory is on
|
||||||
|
// the persistent data partition so the file survives A/B slot switches.
|
||||||
|
const DefaultPath = "/var/lib/kubesolo/update/state.json"
|
||||||
|
|
||||||
|
// Phase represents the current step in the update lifecycle.
|
||||||
|
//
|
||||||
|
// Terminal phases (Success, RolledBack, Failed) describe the outcome of the
|
||||||
|
// most recent attempt; transient phases (Checking, Downloading, Staged,
|
||||||
|
// Activated, Verifying) describe in-progress work. Idle means no update has
|
||||||
|
// been attempted yet, or the previous attempt has been acknowledged.
|
||||||
|
type Phase string
|
||||||
|
|
||||||
|
const (
|
||||||
|
// PhaseIdle — no update in progress.
|
||||||
|
PhaseIdle Phase = "idle"
|
||||||
|
// PhaseChecking — querying the update server for new versions.
|
||||||
|
PhaseChecking Phase = "checking"
|
||||||
|
// PhaseDownloading — pulling artifacts from the server.
|
||||||
|
PhaseDownloading Phase = "downloading"
|
||||||
|
// PhaseStaged — artifacts written to the passive partition; not yet active.
|
||||||
|
PhaseStaged Phase = "staged"
|
||||||
|
// PhaseActivated — passive slot promoted; next boot will use the new version.
|
||||||
|
PhaseActivated Phase = "activated"
|
||||||
|
// PhaseVerifying — post-boot healthcheck in progress on the new version.
|
||||||
|
PhaseVerifying Phase = "verifying"
|
||||||
|
// PhaseSuccess — last attempt completed and verified.
|
||||||
|
PhaseSuccess Phase = "success"
|
||||||
|
// PhaseRolledBack — last attempt failed verification; reverted to prior slot.
|
||||||
|
PhaseRolledBack Phase = "rolled_back"
|
||||||
|
// PhaseFailed — last attempt failed before reaching activation (download,
|
||||||
|
// checksum, signature, etc.). System still on the original slot.
|
||||||
|
PhaseFailed Phase = "failed"
|
||||||
|
)
|
||||||
|
|
||||||
|
// validPhases lists every legal Phase value. Anything not in this set is
|
||||||
|
// rejected by Save() to catch typos.
|
||||||
|
var validPhases = map[Phase]struct{}{
|
||||||
|
PhaseIdle: {},
|
||||||
|
PhaseChecking: {},
|
||||||
|
PhaseDownloading: {},
|
||||||
|
PhaseStaged: {},
|
||||||
|
PhaseActivated: {},
|
||||||
|
PhaseVerifying: {},
|
||||||
|
PhaseSuccess: {},
|
||||||
|
PhaseRolledBack: {},
|
||||||
|
PhaseFailed: {},
|
||||||
|
}
|
||||||
|
|
||||||
|
// UpdateState is the on-disk representation. Fields use JSON tags so the
|
||||||
|
// file format is forward-compatible (extra fields ignored, missing fields
|
||||||
|
// default).
|
||||||
|
type UpdateState struct {
|
||||||
|
// Phase is the current lifecycle position.
|
||||||
|
Phase Phase `json:"phase"`
|
||||||
|
// FromVersion is the version the system was running before the attempt.
|
||||||
|
// Empty when no attempt has run.
|
||||||
|
FromVersion string `json:"from_version,omitempty"`
|
||||||
|
// ToVersion is the version the attempt is targeting.
|
||||||
|
// Empty when no attempt has run.
|
||||||
|
ToVersion string `json:"to_version,omitempty"`
|
||||||
|
// StartedAt is when the current attempt entered a non-Idle phase.
|
||||||
|
StartedAt time.Time `json:"started_at,omitempty"`
|
||||||
|
// UpdatedAt is the last time the file was written. Always set on Save().
|
||||||
|
UpdatedAt time.Time `json:"updated_at"`
|
||||||
|
// LastError carries the most recent operation error, populated when
|
||||||
|
// transitioning to PhaseFailed or PhaseRolledBack. Cleared on Success/Idle.
|
||||||
|
LastError string `json:"last_error,omitempty"`
|
||||||
|
// AttemptCount counts attempts at the current ToVersion. Reset when
|
||||||
|
// ToVersion changes or on successful completion.
|
||||||
|
AttemptCount int `json:"attempt_count"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// New returns a fresh Idle state with UpdatedAt set to now.
|
||||||
|
func New() *UpdateState {
|
||||||
|
return &UpdateState{
|
||||||
|
Phase: PhaseIdle,
|
||||||
|
UpdatedAt: time.Now().UTC(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load reads the state from disk. If the file does not exist, returns a fresh
|
||||||
|
// Idle state — this is the normal first-run case, not an error.
|
||||||
|
func Load(path string) (*UpdateState, error) {
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
if os.IsNotExist(err) {
|
||||||
|
return New(), nil
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("read state %s: %w", path, err)
|
||||||
|
}
|
||||||
|
var s UpdateState
|
||||||
|
if err := json.Unmarshal(data, &s); err != nil {
|
||||||
|
return nil, fmt.Errorf("parse state %s: %w", path, err)
|
||||||
|
}
|
||||||
|
return &s, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save writes the state to disk atomically (tmp file + rename), so an
|
||||||
|
// interrupted write never leaves a partial file at `path`.
|
||||||
|
func (s *UpdateState) Save(path string) error {
|
||||||
|
if _, ok := validPhases[s.Phase]; !ok {
|
||||||
|
return fmt.Errorf("invalid phase %q", s.Phase)
|
||||||
|
}
|
||||||
|
s.UpdatedAt = time.Now().UTC()
|
||||||
|
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
|
||||||
|
return fmt.Errorf("creating state dir: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := json.MarshalIndent(s, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("marshal state: %w", err)
|
||||||
|
}
|
||||||
|
data = append(data, '\n')
|
||||||
|
|
||||||
|
tmp := path + ".tmp"
|
||||||
|
if err := os.WriteFile(tmp, data, 0o644); err != nil {
|
||||||
|
return fmt.Errorf("write tmp state: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.Rename(tmp, path); err != nil {
|
||||||
|
_ = os.Remove(tmp)
|
||||||
|
return fmt.Errorf("rename state: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Transition moves the state to phase `next` and persists it. If `next`
|
||||||
|
// targets a new ToVersion (different from the current one), AttemptCount is
|
||||||
|
// reset to 1; otherwise it is left untouched. StartedAt is set when
|
||||||
|
// transitioning out of Idle. LastError is cleared unless `next` is Failed or
|
||||||
|
// RolledBack.
|
||||||
|
func (s *UpdateState) Transition(path string, next Phase, toVersion, errMsg string) error {
|
||||||
|
now := time.Now().UTC()
|
||||||
|
|
||||||
|
// Reset attempt counter when targeting a new version.
|
||||||
|
if toVersion != "" && toVersion != s.ToVersion {
|
||||||
|
s.ToVersion = toVersion
|
||||||
|
s.AttemptCount = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// First non-Idle phase of an attempt: record start time and bump count.
|
||||||
|
if s.Phase == PhaseIdle && next != PhaseIdle {
|
||||||
|
s.StartedAt = now
|
||||||
|
s.AttemptCount++
|
||||||
|
}
|
||||||
|
|
||||||
|
s.Phase = next
|
||||||
|
switch next {
|
||||||
|
case PhaseFailed, PhaseRolledBack:
|
||||||
|
if errMsg != "" {
|
||||||
|
s.LastError = errMsg
|
||||||
|
}
|
||||||
|
case PhaseSuccess, PhaseIdle:
|
||||||
|
s.LastError = ""
|
||||||
|
}
|
||||||
|
|
||||||
|
return s.Save(path)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RecordError marks the state as failed with the given error and saves.
|
||||||
|
// Convenience wrapper around Transition for the most common failure path.
|
||||||
|
func (s *UpdateState) RecordError(path string, err error) error {
|
||||||
|
msg := ""
|
||||||
|
if err != nil {
|
||||||
|
msg = err.Error()
|
||||||
|
}
|
||||||
|
return s.Transition(path, PhaseFailed, "", msg)
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetFromVersion records the version the system was running when an attempt
|
||||||
|
// started. Idempotent; only takes effect when From is empty.
|
||||||
|
func (s *UpdateState) SetFromVersion(v string) {
|
||||||
|
if s.FromVersion == "" {
|
||||||
|
s.FromVersion = v
|
||||||
|
}
|
||||||
|
}
|
||||||
197
update/pkg/state/state_test.go
Normal file
197
update/pkg/state/state_test.go
Normal file
@@ -0,0 +1,197 @@
|
|||||||
|
package state
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// statePath returns a per-test state file path inside t.TempDir().
|
||||||
|
func statePath(t *testing.T) string {
|
||||||
|
t.Helper()
|
||||||
|
return filepath.Join(t.TempDir(), "state.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestLoadMissingReturnsIdle(t *testing.T) {
|
||||||
|
s, err := Load(filepath.Join(t.TempDir(), "does-not-exist.json"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error loading missing state: %v", err)
|
||||||
|
}
|
||||||
|
if s.Phase != PhaseIdle {
|
||||||
|
t.Errorf("missing file: phase=%q, want %q", s.Phase, PhaseIdle)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSaveLoadRoundTrip(t *testing.T) {
|
||||||
|
path := statePath(t)
|
||||||
|
in := &UpdateState{
|
||||||
|
Phase: PhaseStaged,
|
||||||
|
FromVersion: "v0.2.0",
|
||||||
|
ToVersion: "v0.3.0",
|
||||||
|
AttemptCount: 1,
|
||||||
|
}
|
||||||
|
if err := in.Save(path); err != nil {
|
||||||
|
t.Fatalf("save: %v", err)
|
||||||
|
}
|
||||||
|
out, err := Load(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("load: %v", err)
|
||||||
|
}
|
||||||
|
if out.Phase != in.Phase {
|
||||||
|
t.Errorf("phase: got %q, want %q", out.Phase, in.Phase)
|
||||||
|
}
|
||||||
|
if out.FromVersion != in.FromVersion {
|
||||||
|
t.Errorf("from_version: got %q, want %q", out.FromVersion, in.FromVersion)
|
||||||
|
}
|
||||||
|
if out.ToVersion != in.ToVersion {
|
||||||
|
t.Errorf("to_version: got %q, want %q", out.ToVersion, in.ToVersion)
|
||||||
|
}
|
||||||
|
if out.AttemptCount != in.AttemptCount {
|
||||||
|
t.Errorf("attempt_count: got %d, want %d", out.AttemptCount, in.AttemptCount)
|
||||||
|
}
|
||||||
|
if out.UpdatedAt.IsZero() {
|
||||||
|
t.Error("UpdatedAt should be set by Save")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSaveRejectsInvalidPhase(t *testing.T) {
|
||||||
|
s := &UpdateState{Phase: Phase("bogus")}
|
||||||
|
err := s.Save(statePath(t))
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error saving invalid phase, got nil")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSaveIsAtomic(t *testing.T) {
|
||||||
|
// After Save, the .tmp file should NOT exist — confirming we renamed it.
|
||||||
|
path := statePath(t)
|
||||||
|
s := New()
|
||||||
|
if err := s.Save(path); err != nil {
|
||||||
|
t.Fatalf("save: %v", err)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||||
|
t.Errorf("tmp file still present after Save: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSaveCreatesDirectory(t *testing.T) {
|
||||||
|
// State directory may not exist yet (first-ever boot). Save() should mkdir.
|
||||||
|
dir := filepath.Join(t.TempDir(), "fresh", "subdir")
|
||||||
|
path := filepath.Join(dir, "state.json")
|
||||||
|
if err := New().Save(path); err != nil {
|
||||||
|
t.Fatalf("save into nonexistent dir: %v", err)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(path); err != nil {
|
||||||
|
t.Errorf("state file not present after Save: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTransitionIdleToChecking(t *testing.T) {
|
||||||
|
path := statePath(t)
|
||||||
|
s := New()
|
||||||
|
if err := s.Transition(path, PhaseChecking, "v0.3.0", ""); err != nil {
|
||||||
|
t.Fatalf("transition: %v", err)
|
||||||
|
}
|
||||||
|
if s.Phase != PhaseChecking {
|
||||||
|
t.Errorf("phase: got %q, want %q", s.Phase, PhaseChecking)
|
||||||
|
}
|
||||||
|
if s.ToVersion != "v0.3.0" {
|
||||||
|
t.Errorf("to_version: got %q, want v0.3.0", s.ToVersion)
|
||||||
|
}
|
||||||
|
if s.AttemptCount != 1 {
|
||||||
|
t.Errorf("attempt_count: got %d, want 1 (first attempt after Idle)", s.AttemptCount)
|
||||||
|
}
|
||||||
|
if s.StartedAt.IsZero() {
|
||||||
|
t.Error("StartedAt should be set when leaving Idle")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTransitionRetainsAttemptCountWithinAttempt(t *testing.T) {
|
||||||
|
path := statePath(t)
|
||||||
|
s := New()
|
||||||
|
_ = s.Transition(path, PhaseChecking, "v0.3.0", "")
|
||||||
|
_ = s.Transition(path, PhaseDownloading, "v0.3.0", "")
|
||||||
|
_ = s.Transition(path, PhaseStaged, "v0.3.0", "")
|
||||||
|
if s.AttemptCount != 1 {
|
||||||
|
t.Errorf("attempt_count after staying on same version: got %d, want 1", s.AttemptCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTransitionResetsAttemptCountOnNewVersion(t *testing.T) {
|
||||||
|
path := statePath(t)
|
||||||
|
s := New()
|
||||||
|
_ = s.Transition(path, PhaseChecking, "v0.3.0", "")
|
||||||
|
// Now an attempt at a NEW version starts. AttemptCount should reset.
|
||||||
|
_ = s.Transition(path, PhaseChecking, "v0.4.0", "")
|
||||||
|
if s.ToVersion != "v0.4.0" {
|
||||||
|
t.Errorf("to_version: got %q, want v0.4.0", s.ToVersion)
|
||||||
|
}
|
||||||
|
if s.AttemptCount != 0 {
|
||||||
|
t.Errorf("attempt_count after new ToVersion: got %d, want 0 (reset)", s.AttemptCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTransitionFailedRecordsError(t *testing.T) {
|
||||||
|
path := statePath(t)
|
||||||
|
s := New()
|
||||||
|
_ = s.Transition(path, PhaseDownloading, "v0.3.0", "")
|
||||||
|
_ = s.Transition(path, PhaseFailed, "v0.3.0", "checksum mismatch")
|
||||||
|
if s.Phase != PhaseFailed {
|
||||||
|
t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed)
|
||||||
|
}
|
||||||
|
if s.LastError != "checksum mismatch" {
|
||||||
|
t.Errorf("last_error: got %q, want %q", s.LastError, "checksum mismatch")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTransitionSuccessClearsError(t *testing.T) {
|
||||||
|
path := statePath(t)
|
||||||
|
s := New()
|
||||||
|
_ = s.Transition(path, PhaseFailed, "v0.3.0", "boom")
|
||||||
|
if s.LastError == "" {
|
||||||
|
t.Fatal("setup: LastError should be non-empty before success")
|
||||||
|
}
|
||||||
|
_ = s.Transition(path, PhaseSuccess, "v0.3.0", "")
|
||||||
|
if s.LastError != "" {
|
||||||
|
t.Errorf("last_error after success: got %q, want empty", s.LastError)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRecordError(t *testing.T) {
|
||||||
|
path := statePath(t)
|
||||||
|
s := New()
|
||||||
|
if err := s.RecordError(path, errors.New("network down")); err != nil {
|
||||||
|
t.Fatalf("RecordError: %v", err)
|
||||||
|
}
|
||||||
|
if s.Phase != PhaseFailed {
|
||||||
|
t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed)
|
||||||
|
}
|
||||||
|
if s.LastError != "network down" {
|
||||||
|
t.Errorf("last_error: got %q, want %q", s.LastError, "network down")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSetFromVersionIdempotent(t *testing.T) {
|
||||||
|
s := New()
|
||||||
|
s.SetFromVersion("v0.2.0")
|
||||||
|
if s.FromVersion != "v0.2.0" {
|
||||||
|
t.Errorf("from_version: got %q, want v0.2.0", s.FromVersion)
|
||||||
|
}
|
||||||
|
// Second call should not overwrite.
|
||||||
|
s.SetFromVersion("v0.1.0")
|
||||||
|
if s.FromVersion != "v0.2.0" {
|
||||||
|
t.Errorf("from_version after second SetFromVersion: got %q, want v0.2.0 (immutable)", s.FromVersion)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestLoadHandlesGarbageFile(t *testing.T) {
|
||||||
|
path := statePath(t)
|
||||||
|
if err := os.WriteFile(path, []byte("not json"), 0o644); err != nil {
|
||||||
|
t.Fatalf("seed: %v", err)
|
||||||
|
}
|
||||||
|
_, err := Load(path)
|
||||||
|
if err == nil {
|
||||||
|
t.Error("expected error loading garbage, got nil")
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user