feat(update): pre-flight gates + deeper healthcheck + auto-rollback
Some checks failed
ARM64 Build / Build generic ARM64 disk image (push) Failing after 4s
CI / Go Tests (push) Successful in 1m29s
CI / Shellcheck (push) Successful in 48s
CI / Build Go Binaries (amd64, linux, linux-amd64) (push) Successful in 1m12s
CI / Build Go Binaries (arm64, linux, linux-arm64) (push) Has been cancelled
Some checks failed
ARM64 Build / Build generic ARM64 disk image (push) Failing after 4s
CI / Go Tests (push) Successful in 1m29s
CI / Shellcheck (push) Successful in 48s
CI / Build Go Binaries (amd64, linux, linux-amd64) (push) Successful in 1m12s
CI / Build Go Binaries (arm64, linux, linux-arm64) (push) Has been cancelled
Phase 8 of v0.3. Tightens the update lifecycle on both ends. Pre-flight (apply.go, before any download): - Free-space check on the passive partition: image size + 10% headroom must be available. Uses statfs(2) via the new pkg/partition.FreeBytes / HasFreeSpaceFor helpers (tests cover happy path, tiny request, huge request, missing path). Catches corrupted-FS and shrunk-partition cases before we destroy the existing slot data. - Node-block-label check: refuses if the local K8s node carries the updates.kubesolo.io/block=true label. New pkg/health.CheckNodeBlocked shells out to kubectl per the project's zero-deps stance. Silently bypassed when no kubeconfig is reachable (air-gap case). Skipped by --force. Healthcheck (extended via new pkg/health/extended.go + preflight.go): - CheckKubeSystemReady waits until every kube-system pod has held the Running phase for >= N seconds (default 30). Catches "started ok, will crash-loop" bugs that a single-shot phase check misses. - CheckProbeURL fetches an operator-supplied URL; 200 = pass. Wired through update.conf as healthcheck_url= and cloud-init updates.healthcheck_url. - CheckDiskWritable writes/fsyncs/reads a 1-KiB probe under /var/lib/kubesolo. Always runs in healthcheck so a wedged data partition fails fast. - pkg/health.Status grows KubeSystemReady, ProbeURL, DiskWritable booleans. Optional checks default to true in RunAll() so they don't block when unconfigured. health_test.go updated to the new 6-field shape. Auto-rollback (healthcheck.go): - state.UpdateState gains HealthCheckFailures (consecutive post-Activated failures). Reset on a clean pass. - --auto-rollback-after N (also auto_rollback_after= in update.conf) triggers env.ForceRollback() when the failure count reaches the threshold. State transitions to RolledBack with a descriptive LastError. The command still exits with the healthcheck error; the operator/init is expected to reboot. - Only fires while Phase == Activated. 
Doesn't second-guess a long-stable system that happens to fail one healthcheck. config / opts / cloud-init plumbing: - update.conf gains healthcheck_url= and auto_rollback_after= keys. - New CLI flags: --healthcheck-url, --auto-rollback-after, --kube-system-settle. - cloud-init full-config.yaml documents the new updates: subfields. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -72,3 +72,14 @@ updates:
|
|||||||
# Path to Ed25519 public key for signature verification. Omit to disable
|
# Path to Ed25519 public key for signature verification. Omit to disable
|
||||||
# signature verification (NOT recommended for production fleets).
|
# signature verification (NOT recommended for production fleets).
|
||||||
# pubkey: "/etc/kubesolo/update-pubkey.hex"
|
# pubkey: "/etc/kubesolo/update-pubkey.hex"
|
||||||
|
|
||||||
|
# Optional post-boot healthcheck probe URL. If set, healthcheck GETs it
|
||||||
|
# and treats anything other than HTTP 200 as a failure. Useful when your
|
||||||
|
# workload exposes its own readiness on a known endpoint.
|
||||||
|
# healthcheck_url: "http://localhost:8000/ready"
|
||||||
|
|
||||||
|
# Auto-rollback threshold: after N consecutive post-activation healthcheck
|
||||||
|
# failures, the agent triggers a rollback on its own. 0 disables the
|
||||||
|
# feature (the bootloader still does GRUB-counter-based rollback after
|
||||||
|
# 3 failed boots). Recommended: 3 for production fleets.
|
||||||
|
# auto_rollback_after: 3
|
||||||
|
|||||||
@@ -4,10 +4,12 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"os"
|
||||||
"runtime"
|
"runtime"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/portainer/kubesolo-os/update/pkg/config"
|
"github.com/portainer/kubesolo-os/update/pkg/config"
|
||||||
|
"github.com/portainer/kubesolo-os/update/pkg/health"
|
||||||
"github.com/portainer/kubesolo-os/update/pkg/image"
|
"github.com/portainer/kubesolo-os/update/pkg/image"
|
||||||
"github.com/portainer/kubesolo-os/update/pkg/oci"
|
"github.com/portainer/kubesolo-os/update/pkg/oci"
|
||||||
"github.com/portainer/kubesolo-os/update/pkg/partition"
|
"github.com/portainer/kubesolo-os/update/pkg/partition"
|
||||||
@@ -71,6 +73,19 @@ func Apply(args []string) error {
|
|||||||
window.String())
|
window.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Node-block-label gate — workload authors can defer an update by
|
||||||
|
// labeling the node updates.kubesolo.io/block=true. Skipped with --force
|
||||||
|
// and silently bypassed when the K8s API isn't reachable (air-gap).
|
||||||
|
if !opts.Force {
|
||||||
|
blocked, berr := health.CheckNodeBlocked("")
|
||||||
|
if berr != nil {
|
||||||
|
slog.Warn("node-block check failed, allowing update", "error", berr)
|
||||||
|
} else if blocked {
|
||||||
|
return fmt.Errorf("node carries label %s=true; refusing update (pass --force to override)",
|
||||||
|
health.NodeBlockLabel)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
st, err := state.Load(opts.StatePath)
|
st, err := state.Load(opts.StatePath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// Don't block the operation on a corrupt state file. Log + recover.
|
// Don't block the operation on a corrupt state file. Log + recover.
|
||||||
@@ -186,6 +201,30 @@ func Apply(args []string) error {
|
|||||||
}
|
}
|
||||||
defer partition.Unmount(mountPoint)
|
defer partition.Unmount(mountPoint)
|
||||||
|
|
||||||
|
// Free-space pre-write check: the passive partition must have at least
|
||||||
|
// (kernel + initramfs) + 10% headroom. Catches corrupted-FS reports and
|
||||||
|
// shrunk/wrong-size partitions before we destroy the existing slot data.
|
||||||
|
var imgSize int64
|
||||||
|
for _, p := range []string{staged.VmlinuzPath, staged.InitramfsPath} {
|
||||||
|
fi, ferr := os.Stat(p)
|
||||||
|
if ferr != nil {
|
||||||
|
_ = st.RecordError(opts.StatePath, fmt.Errorf("stat staged file %s: %w", p, ferr))
|
||||||
|
return fmt.Errorf("stat staged file %s: %w", p, ferr)
|
||||||
|
}
|
||||||
|
imgSize += fi.Size()
|
||||||
|
}
|
||||||
|
avail, ok, ferr := partition.HasFreeSpaceFor(mountPoint, imgSize, 10)
|
||||||
|
if ferr != nil {
|
||||||
|
_ = st.RecordError(opts.StatePath, fmt.Errorf("free-space check: %w", ferr))
|
||||||
|
return fmt.Errorf("free-space check: %w", ferr)
|
||||||
|
}
|
||||||
|
if !ok {
|
||||||
|
err := fmt.Errorf("insufficient space on %s: have %.1f MiB, need %.1f MiB (image + 10%% headroom)",
|
||||||
|
passiveSlot, float64(avail)/(1<<20), float64(imgSize)*1.1/(1<<20))
|
||||||
|
_ = st.RecordError(opts.StatePath, err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
// Write image to passive partition
|
// Write image to passive partition
|
||||||
if err := partition.WriteSystemImage(mountPoint, staged.VmlinuzPath, staged.InitramfsPath, staged.Version); err != nil {
|
if err := partition.WriteSystemImage(mountPoint, staged.VmlinuzPath, staged.InitramfsPath, staged.Version); err != nil {
|
||||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("writing system image: %w", err))
|
_ = st.RecordError(opts.StatePath, fmt.Errorf("writing system image: %w", err))
|
||||||
|
|||||||
@@ -17,6 +17,11 @@ import (
|
|||||||
// State transition: Activated → Verifying → Success on pass, → Failed on fail.
|
// State transition: Activated → Verifying → Success on pass, → Failed on fail.
|
||||||
// If state isn't in Activated (e.g. manual run on a long-stable system), the
|
// If state isn't in Activated (e.g. manual run on a long-stable system), the
|
||||||
// state file is left alone — healthcheck still does its job.
|
// state file is left alone — healthcheck still does its job.
|
||||||
|
//
|
||||||
|
// When --auto-rollback-after N is set, consecutive post-Activated failures
|
||||||
|
// are counted in state.HealthCheckFailures. On the Nth failure, the agent
|
||||||
|
// calls env.ForceRollback() and the operator is expected to reboot (this command
|
||||||
|
// does not reboot the host — that's policy left to systemd/init).
|
||||||
func Healthcheck(args []string) error {
|
func Healthcheck(args []string) error {
|
||||||
opts := parseOpts(args)
|
opts := parseOpts(args)
|
||||||
env := opts.NewBootEnv()
|
env := opts.NewBootEnv()
|
||||||
@@ -48,18 +53,44 @@ func Healthcheck(args []string) error {
|
|||||||
|
|
||||||
timeout := time.Duration(opts.TimeoutSecs) * time.Second
|
timeout := time.Duration(opts.TimeoutSecs) * time.Second
|
||||||
checker := health.NewChecker("", "", timeout)
|
checker := health.NewChecker("", "", timeout)
|
||||||
|
checker.ProbeURL = opts.HealthcheckURL
|
||||||
|
if opts.KubeSystemSettle > 0 {
|
||||||
|
checker.KubeSystemSettle = time.Duration(opts.KubeSystemSettle) * time.Second
|
||||||
|
}
|
||||||
|
// Probe the data partition every healthcheck so a wedged disk fails fast.
|
||||||
|
checker.DataDir = "/var/lib/kubesolo"
|
||||||
|
|
||||||
slog.Info("running post-boot health checks", "timeout", timeout)
|
slog.Info("running post-boot health checks",
|
||||||
|
"timeout", timeout,
|
||||||
|
"probe_url", checker.ProbeURL,
|
||||||
|
"kube_system_settle", checker.KubeSystemSettle)
|
||||||
|
|
||||||
status, err := checker.WaitForHealthy()
|
status, err := checker.WaitForHealthy()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Printf("Health check FAILED: %s\n", status.Message)
|
fmt.Printf("Health check FAILED: %s\n", status.Message)
|
||||||
fmt.Printf(" containerd: %v\n", status.Containerd)
|
printStatusBreakdown(status)
|
||||||
fmt.Printf(" apiserver: %v\n", status.APIServer)
|
|
||||||
fmt.Printf(" node_ready: %v\n", status.NodeReady)
|
|
||||||
fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot")
|
fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot")
|
||||||
|
|
||||||
if postActivation {
|
if postActivation {
|
||||||
|
st.HealthCheckFailures++
|
||||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("post-boot health check failed: %s", status.Message))
|
_ = st.RecordError(opts.StatePath, fmt.Errorf("post-boot health check failed: %s", status.Message))
|
||||||
|
|
||||||
|
// Auto-rollback escalation. Only trigger when post-Activated;
|
||||||
|
// don't second-guess a healthy long-running system.
|
||||||
|
if opts.AutoRollbackAfter > 0 && st.HealthCheckFailures >= opts.AutoRollbackAfter {
|
||||||
|
slog.Warn("auto-rollback threshold reached",
|
||||||
|
"failures", st.HealthCheckFailures,
|
||||||
|
"threshold", opts.AutoRollbackAfter)
|
||||||
|
if rerr := env.ForceRollback(); rerr != nil {
|
||||||
|
slog.Error("auto-rollback failed", "error", rerr)
|
||||||
|
return err // return the original healthcheck error
|
||||||
|
}
|
||||||
|
if terr := st.Transition(opts.StatePath, state.PhaseRolledBack, "",
|
||||||
|
fmt.Sprintf("auto-rollback after %d healthcheck failures", st.HealthCheckFailures)); terr != nil {
|
||||||
|
slog.Warn("state transition failed", "phase", state.PhaseRolledBack, "error", terr)
|
||||||
|
}
|
||||||
|
fmt.Println("\nAuto-rollback triggered. Reboot to complete the rollback.")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@@ -73,15 +104,32 @@ func Healthcheck(args []string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if postActivation {
|
if postActivation {
|
||||||
|
// Reset failure counter on a clean pass.
|
||||||
|
st.HealthCheckFailures = 0
|
||||||
if err := st.Transition(opts.StatePath, state.PhaseSuccess, "", ""); err != nil {
|
if err := st.Transition(opts.StatePath, state.PhaseSuccess, "", ""); err != nil {
|
||||||
slog.Warn("state transition failed", "phase", state.PhaseSuccess, "error", err)
|
slog.Warn("state transition failed", "phase", state.PhaseSuccess, "error", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Println("Health check PASSED — boot marked successful")
|
fmt.Println("Health check PASSED — boot marked successful")
|
||||||
fmt.Printf(" containerd: %v\n", status.Containerd)
|
printStatusBreakdown(status)
|
||||||
fmt.Printf(" apiserver: %v\n", status.APIServer)
|
|
||||||
fmt.Printf(" node_ready: %v\n", status.NodeReady)
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// printStatusBreakdown emits a human-readable per-check summary. Only emits
|
||||||
|
// optional check lines when they actually ran.
|
||||||
|
func printStatusBreakdown(s *health.Status) {
|
||||||
|
fmt.Printf(" containerd: %v\n", s.Containerd)
|
||||||
|
fmt.Printf(" apiserver: %v\n", s.APIServer)
|
||||||
|
fmt.Printf(" node_ready: %v\n", s.NodeReady)
|
||||||
|
if !s.KubeSystemReady {
|
||||||
|
fmt.Printf(" kube-system pods: %v\n", s.KubeSystemReady)
|
||||||
|
}
|
||||||
|
if !s.ProbeURL {
|
||||||
|
fmt.Printf(" probe URL: %v\n", s.ProbeURL)
|
||||||
|
}
|
||||||
|
if !s.DiskWritable {
|
||||||
|
fmt.Printf(" disk writable: %v\n", s.DiskWritable)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -22,6 +22,9 @@ type opts struct {
|
|||||||
ConfPath string // location of update.conf (default: config.DefaultPath)
|
ConfPath string // location of update.conf (default: config.DefaultPath)
|
||||||
Channel string // update channel ("stable" by default)
|
Channel string // update channel ("stable" by default)
|
||||||
MaintenanceWindow string // "HH:MM-HH:MM" or empty for always-allow
|
MaintenanceWindow string // "HH:MM-HH:MM" or empty for always-allow
|
||||||
|
HealthcheckURL string // optional GET probe for healthcheck
|
||||||
|
AutoRollbackAfter int // healthcheck: rollback after N consecutive failures (0=off)
|
||||||
|
KubeSystemSettle int // healthcheck: kube-system pods must be Running for N seconds (0=disabled)
|
||||||
Force bool // bypass maintenance window
|
Force bool // bypass maintenance window
|
||||||
JSON bool // status: emit JSON instead of human-readable
|
JSON bool // status: emit JSON instead of human-readable
|
||||||
}
|
}
|
||||||
@@ -74,6 +77,12 @@ func parseOpts(args []string) opts {
|
|||||||
if cfg.PubKey != "" {
|
if cfg.PubKey != "" {
|
||||||
o.PubKeyPath = cfg.PubKey
|
o.PubKeyPath = cfg.PubKey
|
||||||
}
|
}
|
||||||
|
if cfg.HealthcheckURL != "" {
|
||||||
|
o.HealthcheckURL = cfg.HealthcheckURL
|
||||||
|
}
|
||||||
|
if cfg.AutoRollbackAfter > 0 {
|
||||||
|
o.AutoRollbackAfter = cfg.AutoRollbackAfter
|
||||||
|
}
|
||||||
} else if err != nil {
|
} else if err != nil {
|
||||||
slog.Warn("could not load update.conf", "path", o.ConfPath, "error", err)
|
slog.Warn("could not load update.conf", "path", o.ConfPath, "error", err)
|
||||||
}
|
}
|
||||||
@@ -100,6 +109,43 @@ func parseOpts(args []string) opts {
|
|||||||
}
|
}
|
||||||
case "--force":
|
case "--force":
|
||||||
o.Force = true
|
o.Force = true
|
||||||
|
case "--healthcheck-url":
|
||||||
|
if i+1 < len(args) {
|
||||||
|
o.HealthcheckURL = args[i+1]
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
case "--auto-rollback-after":
|
||||||
|
if i+1 < len(args) {
|
||||||
|
n := 0
|
||||||
|
for _, ch := range args[i+1] {
|
||||||
|
if ch >= '0' && ch <= '9' {
|
||||||
|
n = n*10 + int(ch-'0')
|
||||||
|
} else {
|
||||||
|
n = 0
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if n > 0 {
|
||||||
|
o.AutoRollbackAfter = n
|
||||||
|
}
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
case "--kube-system-settle":
|
||||||
|
if i+1 < len(args) {
|
||||||
|
n := 0
|
||||||
|
for _, ch := range args[i+1] {
|
||||||
|
if ch >= '0' && ch <= '9' {
|
||||||
|
n = n*10 + int(ch-'0')
|
||||||
|
} else {
|
||||||
|
n = 0
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if n > 0 {
|
||||||
|
o.KubeSystemSettle = n
|
||||||
|
}
|
||||||
|
i++
|
||||||
|
}
|
||||||
case "--json":
|
case "--json":
|
||||||
o.JSON = true
|
o.JSON = true
|
||||||
case "--server":
|
case "--server":
|
||||||
|
|||||||
@@ -90,6 +90,9 @@ Options:
|
|||||||
--grubenv PATH Path to grubenv file (default: /boot/grub/grubenv)
|
--grubenv PATH Path to grubenv file (default: /boot/grub/grubenv)
|
||||||
--timeout SECS Health check timeout in seconds (default: 120)
|
--timeout SECS Health check timeout in seconds (default: 120)
|
||||||
--pubkey PATH Ed25519 public key for signature verification (optional)
|
--pubkey PATH Ed25519 public key for signature verification (optional)
|
||||||
|
--healthcheck-url URL Optional GET probe in healthcheck; 200 = pass
|
||||||
|
--auto-rollback-after N healthcheck: rollback after N consecutive failures
|
||||||
|
--kube-system-settle N healthcheck: require kube-system pods Running ≥ N seconds
|
||||||
--json For 'status': emit JSON instead of human-readable output
|
--json For 'status': emit JSON instead of human-readable output
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|||||||
@@ -35,6 +35,13 @@ type Config struct {
|
|||||||
Channel string
|
Channel string
|
||||||
MaintenanceWindow string
|
MaintenanceWindow string
|
||||||
PubKey string
|
PubKey string
|
||||||
|
// HealthcheckURL is an optional URL the healthcheck command will GET;
|
||||||
|
// 200 = pass, anything else = fail.
|
||||||
|
HealthcheckURL string
|
||||||
|
// AutoRollbackAfter is the number of consecutive post-boot healthcheck
|
||||||
|
// failures after which the agent will call Rollback automatically.
|
||||||
|
// 0 = disabled (default).
|
||||||
|
AutoRollbackAfter int
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load reads and parses update.conf. A missing file returns an empty Config
|
// Load reads and parses update.conf. A missing file returns an empty Config
|
||||||
@@ -73,6 +80,21 @@ func Load(path string) (*Config, error) {
|
|||||||
c.MaintenanceWindow = value
|
c.MaintenanceWindow = value
|
||||||
case "pubkey":
|
case "pubkey":
|
||||||
c.PubKey = value
|
c.PubKey = value
|
||||||
|
case "healthcheck_url":
|
||||||
|
c.HealthcheckURL = value
|
||||||
|
case "auto_rollback_after":
|
||||||
|
// Parse a small integer. Non-numeric values are silently
|
||||||
|
// ignored (forward compat); zero disables the feature.
|
||||||
|
n := 0
|
||||||
|
for _, ch := range value {
|
||||||
|
if ch >= '0' && ch <= '9' {
|
||||||
|
n = n*10 + int(ch-'0')
|
||||||
|
} else {
|
||||||
|
n = 0
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
c.AutoRollbackAfter = n
|
||||||
}
|
}
|
||||||
// Unknown keys are silently ignored for forward compatibility.
|
// Unknown keys are silently ignored for forward compatibility.
|
||||||
}
|
}
|
||||||
|
|||||||
125
update/pkg/health/extended.go
Normal file
125
update/pkg/health/extended.go
Normal file
@@ -0,0 +1,125 @@
|
|||||||
|
package health
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// kubeSystemSettleSeconds is how long all kube-system pods must hold a
|
||||||
|
// Running phase before we consider the cluster genuinely up. Catches the
|
||||||
|
// "pod just started, will crash-loop in 5s" case.
|
||||||
|
const kubeSystemSettleSeconds = 30
|
||||||
|
|
||||||
|
// CheckKubeSystemReady verifies that every pod in the kube-system namespace
|
||||||
|
// is in Running phase and has been Running for at least settle. Returns
|
||||||
|
// (ready, error). settle defaults to 30s when zero.
|
||||||
|
func (c *Checker) CheckKubeSystemReady(settle time.Duration) bool {
|
||||||
|
if settle == 0 {
|
||||||
|
settle = kubeSystemSettleSeconds * time.Second
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(c.kubeconfigPath); err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
// jsonpath emits one line per pod: <phase>|<startTime>
|
||||||
|
cmd := exec.CommandContext(ctx, "kubectl",
|
||||||
|
"--kubeconfig", c.kubeconfigPath,
|
||||||
|
"get", "pods", "-n", "kube-system",
|
||||||
|
"-o", `jsonpath={range .items[*]}{.status.phase}|{.status.startTime}{"\n"}{end}`,
|
||||||
|
)
|
||||||
|
out, err := cmd.Output()
|
||||||
|
if err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
lines := strings.Split(strings.TrimSpace(string(out)), "\n")
|
||||||
|
if len(lines) == 0 || lines[0] == "" {
|
||||||
|
// No pods reported. Conservatively treat as not-ready: kube-system
|
||||||
|
// is expected to host at least CoreDNS + pause.
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
now := time.Now()
|
||||||
|
for _, line := range lines {
|
||||||
|
parts := strings.SplitN(line, "|", 2)
|
||||||
|
phase := strings.TrimSpace(parts[0])
|
||||||
|
if phase != "Running" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if len(parts) < 2 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
start, perr := time.Parse(time.RFC3339, strings.TrimSpace(parts[1]))
|
||||||
|
if perr != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if now.Sub(start) < settle {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// CheckProbeURL issues a GET for url using a client with a 5-second timeout
// and reports whether the response status was exactly 200 OK.
//
// An empty url means the probe is not configured and yields (true, nil).
// A request that fails outright yields false plus an error wrapping the
// cause; a response with any status other than 200 yields (false, nil).
func CheckProbeURL(url string) (bool, error) {
	if len(url) == 0 {
		return true, nil
	}

	prober := http.Client{Timeout: 5 * time.Second}
	resp, getErr := prober.Get(url)
	if getErr != nil {
		return false, fmt.Errorf("probe URL %s: %w", url, getErr)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return false, nil
	}
	return true, nil
}
|
||||||
|
|
||||||
|
// CheckDiskWritable proves that dataDir accepts writes by round-tripping a
// small probe file: it creates dataDir/.update-probe, writes a fixed marker,
// fsyncs and closes the file, reads it back, and compares the contents. Once
// the probe file has been created it is removed before the function returns.
//
// An empty dataDir falls back to /var/lib/kubesolo. Any failing step returns
// false with an error naming that step; success returns (true, nil).
func CheckDiskWritable(dataDir string) (bool, error) {
	target := dataDir
	if target == "" {
		target = "/var/lib/kubesolo"
	}

	// A directory that cannot be stat'ed is reported as a failure rather
	// than being silently treated as writable.
	if _, statErr := os.Stat(target); statErr != nil {
		return false, fmt.Errorf("dataDir %s: %w", target, statErr)
	}

	const marker = "kubesolo-os healthcheck probe"
	probePath := filepath.Join(target, ".update-probe")

	fh, err := os.Create(probePath)
	if err != nil {
		return false, fmt.Errorf("create probe: %w", err)
	}
	// Registered only after a successful create, so a failed create never
	// triggers a removal.
	defer os.Remove(probePath)

	if _, err = fh.Write([]byte(marker)); err != nil {
		fh.Close()
		return false, fmt.Errorf("write probe: %w", err)
	}
	if err = fh.Sync(); err != nil {
		fh.Close()
		return false, fmt.Errorf("fsync probe: %w", err)
	}
	if err = fh.Close(); err != nil {
		return false, fmt.Errorf("close probe: %w", err)
	}

	readBack, err := os.ReadFile(probePath)
	if err != nil {
		return false, fmt.Errorf("read probe: %w", err)
	}
	if string(readBack) != marker {
		return false, fmt.Errorf("probe content mismatch: got %q", readBack)
	}
	return true, nil
}
|
||||||
77
update/pkg/health/extended_test.go
Normal file
77
update/pkg/health/extended_test.go
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
package health
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestCheckProbeURLEmptyAlwaysPasses(t *testing.T) {
|
||||||
|
ok, err := CheckProbeURL("")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("CheckProbeURL(\"\"): %v", err)
|
||||||
|
}
|
||||||
|
if !ok {
|
||||||
|
t.Error("empty probe URL should return ok=true (check disabled)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCheckProbeURL200(t *testing.T) {
|
||||||
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
}))
|
||||||
|
defer srv.Close()
|
||||||
|
ok, err := CheckProbeURL(srv.URL)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("CheckProbeURL: %v", err)
|
||||||
|
}
|
||||||
|
if !ok {
|
||||||
|
t.Error("expected ok=true on 200")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCheckProbeURLNon200(t *testing.T) {
|
||||||
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.WriteHeader(http.StatusServiceUnavailable)
|
||||||
|
}))
|
||||||
|
defer srv.Close()
|
||||||
|
ok, err := CheckProbeURL(srv.URL)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("CheckProbeURL: %v", err)
|
||||||
|
}
|
||||||
|
if ok {
|
||||||
|
t.Error("expected ok=false on 503")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCheckProbeURLNetworkError(t *testing.T) {
|
||||||
|
// Port 1 is reserved (tcpmux) and never bound by Linux defaults.
|
||||||
|
_, err := CheckProbeURL("http://127.0.0.1:1")
|
||||||
|
if err == nil {
|
||||||
|
t.Error("expected error for unreachable URL, got nil")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCheckDiskWritableHappyPath(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
ok, err := CheckDiskWritable(dir)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("CheckDiskWritable: %v", err)
|
||||||
|
}
|
||||||
|
if !ok {
|
||||||
|
t.Error("expected ok=true on writable temp dir")
|
||||||
|
}
|
||||||
|
// Probe file should have been cleaned up.
|
||||||
|
if _, err := os.Stat(filepath.Join(dir, ".update-probe")); !os.IsNotExist(err) {
|
||||||
|
t.Errorf("probe file not cleaned up: stat err=%v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCheckDiskWritableMissingDir(t *testing.T) {
|
||||||
|
_, err := CheckDiskWritable("/this/path/does/not/exist")
|
||||||
|
if err == nil {
|
||||||
|
t.Error("expected error for missing dataDir, got nil")
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -27,12 +27,17 @@ type Status struct {
|
|||||||
Containerd bool
|
Containerd bool
|
||||||
APIServer bool
|
APIServer bool
|
||||||
NodeReady bool
|
NodeReady bool
|
||||||
|
KubeSystemReady bool // optional — true unless KubeSystemSettle is non-zero
|
||||||
|
ProbeURL bool // optional — true unless ProbeURL is set
|
||||||
|
DiskWritable bool // optional — true unless DataDir is set
|
||||||
Message string
|
Message string
|
||||||
}
|
}
|
||||||
|
|
||||||
// IsHealthy returns true if all checks passed.
|
// IsHealthy returns true if all required checks passed. Optional checks
|
||||||
|
// default to true when not configured, so they don't block the result.
|
||||||
func (s *Status) IsHealthy() bool {
|
func (s *Status) IsHealthy() bool {
|
||||||
return s.Containerd && s.APIServer && s.NodeReady
|
return s.Containerd && s.APIServer && s.NodeReady &&
|
||||||
|
s.KubeSystemReady && s.ProbeURL && s.DiskWritable
|
||||||
}
|
}
|
||||||
|
|
||||||
// Checker performs health checks against the local KubeSolo instance.
|
// Checker performs health checks against the local KubeSolo instance.
|
||||||
@@ -40,6 +45,11 @@ type Checker struct {
|
|||||||
kubeconfigPath string
|
kubeconfigPath string
|
||||||
apiServerAddr string
|
apiServerAddr string
|
||||||
timeout time.Duration
|
timeout time.Duration
|
||||||
|
|
||||||
|
// Optional gates. Zero values disable the check (it reports true).
|
||||||
|
KubeSystemSettle time.Duration
|
||||||
|
ProbeURL string
|
||||||
|
DataDir string
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewChecker creates a health checker.
|
// NewChecker creates a health checker.
|
||||||
@@ -149,12 +159,37 @@ func (c *Checker) CheckNodeReady() bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// RunAll performs all health checks and returns the combined status.
|
// RunAll performs all health checks and returns the combined status.
|
||||||
|
//
|
||||||
|
// Optional checks (kube-system settle, user probe URL, disk writability) are
|
||||||
|
// only run if the corresponding Checker fields are set; otherwise they
|
||||||
|
// report true so as not to block the result.
|
||||||
func (c *Checker) RunAll() *Status {
|
func (c *Checker) RunAll() *Status {
|
||||||
return &Status{
|
s := &Status{
|
||||||
Containerd: c.CheckContainerd(),
|
Containerd: c.CheckContainerd(),
|
||||||
APIServer: c.CheckAPIServer(),
|
APIServer: c.CheckAPIServer(),
|
||||||
NodeReady: c.CheckNodeReady(),
|
NodeReady: c.CheckNodeReady(),
|
||||||
|
KubeSystemReady: true,
|
||||||
|
ProbeURL: true,
|
||||||
|
DiskWritable: true,
|
||||||
}
|
}
|
||||||
|
if c.KubeSystemSettle > 0 {
|
||||||
|
s.KubeSystemReady = c.CheckKubeSystemReady(c.KubeSystemSettle)
|
||||||
|
}
|
||||||
|
if c.ProbeURL != "" {
|
||||||
|
ok, err := CheckProbeURL(c.ProbeURL)
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("probe URL check failed", "url", c.ProbeURL, "error", err)
|
||||||
|
}
|
||||||
|
s.ProbeURL = ok
|
||||||
|
}
|
||||||
|
if c.DataDir != "" {
|
||||||
|
ok, err := CheckDiskWritable(c.DataDir)
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("disk writability check failed", "dir", c.DataDir, "error", err)
|
||||||
|
}
|
||||||
|
s.DiskWritable = ok
|
||||||
|
}
|
||||||
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
// WaitForHealthy polls health checks until all pass or timeout expires.
|
// WaitForHealthy polls health checks until all pass or timeout expires.
|
||||||
|
|||||||
@@ -6,36 +6,42 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestStatusIsHealthy(t *testing.T) {
|
func TestStatusIsHealthy(t *testing.T) {
|
||||||
|
// Helper for the new 6-field Status: all-true except the named one.
|
||||||
|
allBut := func(field string) Status {
|
||||||
|
s := Status{
|
||||||
|
Containerd: true, APIServer: true, NodeReady: true,
|
||||||
|
KubeSystemReady: true, ProbeURL: true, DiskWritable: true,
|
||||||
|
}
|
||||||
|
switch field {
|
||||||
|
case "Containerd":
|
||||||
|
s.Containerd = false
|
||||||
|
case "APIServer":
|
||||||
|
s.APIServer = false
|
||||||
|
case "NodeReady":
|
||||||
|
s.NodeReady = false
|
||||||
|
case "KubeSystemReady":
|
||||||
|
s.KubeSystemReady = false
|
||||||
|
case "ProbeURL":
|
||||||
|
s.ProbeURL = false
|
||||||
|
case "DiskWritable":
|
||||||
|
s.DiskWritable = false
|
||||||
|
}
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
status Status
|
status Status
|
||||||
wantHealth bool
|
wantHealth bool
|
||||||
}{
|
}{
|
||||||
{
|
{"all healthy", allBut(""), true},
|
||||||
name: "all healthy",
|
{"containerd down", allBut("Containerd"), false},
|
||||||
status: Status{Containerd: true, APIServer: true, NodeReady: true},
|
{"apiserver down", allBut("APIServer"), false},
|
||||||
wantHealth: true,
|
{"node not ready", allBut("NodeReady"), false},
|
||||||
},
|
{"kube-system not ready", allBut("KubeSystemReady"), false},
|
||||||
{
|
{"probe URL failed", allBut("ProbeURL"), false},
|
||||||
name: "containerd down",
|
{"disk not writable", allBut("DiskWritable"), false},
|
||||||
status: Status{Containerd: false, APIServer: true, NodeReady: true},
|
{"all down", Status{}, false},
|
||||||
wantHealth: false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "apiserver down",
|
|
||||||
status: Status{Containerd: true, APIServer: false, NodeReady: true},
|
|
||||||
wantHealth: false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "node not ready",
|
|
||||||
status: Status{Containerd: true, APIServer: true, NodeReady: false},
|
|
||||||
wantHealth: false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "all down",
|
|
||||||
status: Status{Containerd: false, APIServer: false, NodeReady: false},
|
|
||||||
wantHealth: false,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
|
|||||||
51
update/pkg/health/preflight.go
Normal file
51
update/pkg/health/preflight.go
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
package health
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// NodeBlockLabel is the well-known label that workload authors set on the
// local node to defer an OS update. When present and "true", apply refuses.
const NodeBlockLabel = "updates.kubesolo.io/block"

// CheckNodeBlocked returns (blocked, error). blocked==true means the local
// node carries the updates.kubesolo.io/block=true label and the caller should
// refuse the update.
//
// An empty kubeconfigPath selects the default admin kubeconfig under
// /var/lib/kubesolo.
//
// If the kubeconfig is not available (offline / pre-boot / air-gap), this
// returns (false, nil) — silently allowing the update. That's the safe
// behaviour for the air-gap case where the node may not be reachable from
// the agent's perspective.
//
// If kubectl cannot be run, exits non-zero, or exceeds the query timeout,
// this returns (false, err): the node is treated as not blocked, and the
// error (including any kubectl stderr output) is surfaced so the caller can
// decide whether to log it.
func CheckNodeBlocked(kubeconfigPath string) (bool, error) {
	// Upper bound on how long the kubectl query may run before it is killed.
	const queryTimeout = 10 * time.Second

	if kubeconfigPath == "" {
		kubeconfigPath = "/var/lib/kubesolo/pki/admin/admin.kubeconfig"
	}
	if _, err := os.Stat(kubeconfigPath); err != nil {
		// No kubeconfig — assume air-gap / pre-K8s. Don't block updates.
		return false, nil
	}

	// Build the jsonpath from NodeBlockLabel so the constant stays the single
	// source of truth. Dots inside the label key are escaped with a backslash
	// so jsonpath does not read them as path separators.
	labelKey := strings.ReplaceAll(NodeBlockLabel, ".", `\.`)
	jsonPath := "jsonpath={.items[0].metadata.labels." + labelKey + "}"

	// Query the node label via kubectl. We don't know the node name a
	// priori, so we use --kubeconfig on the local admin config and ask for
	// "the only node" (KubeSolo is single-node by design).
	ctx, cancel := context.WithTimeout(context.Background(), queryTimeout)
	defer cancel()

	cmd := exec.CommandContext(ctx, "kubectl",
		"--kubeconfig", kubeconfigPath,
		"get", "node",
		"-o", jsonPath)
	out, err := cmd.Output()
	if err != nil {
		// API unreachable or no nodes — treat as not blocked (analogous to
		// the kubeconfig-missing case). We still surface the error so the
		// caller can decide to log it.
		if ctx.Err() != nil {
			// The context expired and kubectl was killed; say so explicitly
			// instead of reporting only the kill signal.
			return false, fmt.Errorf("query node label: kubectl timed out after %s: %w", queryTimeout, err)
		}
		// Output captures stderr into ExitError when cmd.Stderr is unset;
		// include it, since "exit status 1" alone is not actionable. Output
		// returns the *exec.ExitError unwrapped, so a plain type assertion
		// is sufficient here.
		if exitErr, ok := err.(*exec.ExitError); ok {
			if msg := strings.TrimSpace(string(exitErr.Stderr)); msg != "" {
				return false, fmt.Errorf("query node label: %w: %s", err, msg)
			}
		}
		return false, fmt.Errorf("query node label: %w", err)
	}
	return strings.TrimSpace(string(out)) == "true", nil
}
|
||||||
34
update/pkg/partition/freespace.go
Normal file
34
update/pkg/partition/freespace.go
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
package partition
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"syscall"
|
||||||
|
)
|
||||||
|
|
||||||
|
// FreeBytes returns the number of free bytes available on the filesystem
// containing `path`. Uses statfs(2); path must exist and be readable.
// On failure it returns 0 and an error wrapping the statfs error.
func FreeBytes(path string) (uint64, error) {
	var stat syscall.Statfs_t
	if err := syscall.Statfs(path, &stat); err != nil {
		return 0, fmt.Errorf("statfs %s: %w", path, err)
	}
	// Bavail is the count of free blocks available to non-root users —
	// matches what `df` reports. Bsize is the block size in bytes.
	//nolint:unconvert // Bavail is uint64 on most platforms but int64 on darwin/freebsd
	return uint64(stat.Bavail) * uint64(stat.Bsize), nil
}

// HasFreeSpaceFor reports whether `path`'s filesystem has at least `wantBytes`
// of free space, with `headroomPct` reserved (e.g. 10 = require 110% of want).
// Returns the available bytes alongside, so callers can render a useful error.
//
// A negative wantBytes or headroomPct is rejected with an error (avail is
// still returned). A requirement too large to represent in a uint64 is
// reported as insufficient space rather than being allowed to wrap around.
func HasFreeSpaceFor(path string, wantBytes int64, headroomPct int) (avail uint64, ok bool, err error) {
	avail, err = FreeBytes(path)
	if err != nil {
		return 0, false, err
	}
	if wantBytes < 0 {
		return avail, false, fmt.Errorf("invalid wantBytes %d", wantBytes)
	}
	if headroomPct < 0 {
		// Negative headroom would silently demand less than the payload
		// size, and below -100 it would wrap to a huge factor once converted
		// to an unsigned value.
		return avail, false, fmt.Errorf("invalid headroomPct %d", headroomPct)
	}
	required, fits := requiredWithHeadroom(uint64(wantBytes), uint64(headroomPct))
	if !fits {
		// No filesystem can satisfy a requirement that exceeds uint64.
		return avail, false, nil
	}
	return avail, avail >= required, nil
}

// requiredWithHeadroom computes floor(want * (100+headroomPct) / 100) without
// intermediate overflow. fits is false when the result cannot be represented
// in a uint64. headroomPct must originate from a non-negative int so that
// 100+headroomPct itself cannot overflow.
func requiredWithHeadroom(want, headroomPct uint64) (required uint64, fits bool) {
	const maxUint64 = ^uint64(0)

	factor := 100 + headroomPct

	// With want = 100*whole + rem, the exact product divided by 100 is
	// whole*factor + rem*factor/100. Only the first term can be large.
	whole, rem := want/100, want%100
	if whole != 0 && factor > maxUint64/whole {
		return 0, false
	}
	base := whole * factor

	// rem < 100, so splitting factor by 100 as well keeps both partial
	// products far below the uint64 limit while preserving the floor.
	tail := rem*(factor/100) + rem*(factor%100)/100
	if base > maxUint64-tail {
		return 0, false
	}
	return base + tail, true
}
|
||||||
44
update/pkg/partition/freespace_test.go
Normal file
44
update/pkg/partition/freespace_test.go
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
package partition
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestFreeBytesReturnsNonZeroOnTempDir(t *testing.T) {
|
||||||
|
b, err := FreeBytes(t.TempDir())
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("FreeBytes: %v", err)
|
||||||
|
}
|
||||||
|
// On any sane test runner the temp filesystem has more than 1 KiB free.
|
||||||
|
if b < 1024 {
|
||||||
|
t.Errorf("FreeBytes = %d, want > 1024 on /tmp", b)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFreeBytesNonExistentPath(t *testing.T) {
|
||||||
|
_, err := FreeBytes("/this/path/does/not/exist/at/all")
|
||||||
|
if err == nil {
|
||||||
|
t.Error("expected error for missing path, got nil")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHasFreeSpaceForRejectsHugeRequest(t *testing.T) {
|
||||||
|
// Request 1 PiB with 10% headroom on /tmp — no test runner has that
|
||||||
|
// much free, so this should consistently report not-enough.
|
||||||
|
avail, ok, err := HasFreeSpaceFor(t.TempDir(), 1<<50, 10)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("HasFreeSpaceFor: %v", err)
|
||||||
|
}
|
||||||
|
if ok {
|
||||||
|
t.Errorf("expected insufficient space for 1PiB, got avail=%d ok=true", avail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHasFreeSpaceForAcceptsSmallRequest(t *testing.T) {
|
||||||
|
// 1 KiB with 10% headroom = 1.1 KiB. Any temp dir has this.
|
||||||
|
_, ok, err := HasFreeSpaceFor(t.TempDir(), 1024, 10)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("HasFreeSpaceFor: %v", err)
|
||||||
|
}
|
||||||
|
if !ok {
|
||||||
|
t.Error("expected sufficient space for 1KiB on /tmp")
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -92,6 +92,12 @@ type UpdateState struct {
|
|||||||
// AttemptCount counts attempts at the current ToVersion. Reset when
|
// AttemptCount counts attempts at the current ToVersion. Reset when
|
||||||
// ToVersion changes or on successful completion.
|
// ToVersion changes or on successful completion.
|
||||||
AttemptCount int `json:"attempt_count"`
|
AttemptCount int `json:"attempt_count"`
|
||||||
|
|
||||||
|
// HealthCheckFailures counts consecutive post-Activated healthcheck
|
||||||
|
// failures. Reset to 0 on a successful healthcheck or after a rollback.
|
||||||
|
// Used by `kubesolo-update healthcheck --auto-rollback-after N` to
|
||||||
|
// trigger automatic recovery on a wedged new boot.
|
||||||
|
HealthCheckFailures int `json:"health_check_failures,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// New returns a fresh Idle state with UpdatedAt set to now.
|
// New returns a fresh Idle state with UpdatedAt set to now.
|
||||||
|
|||||||
Reference in New Issue
Block a user