From 9fb894c5af8f15a11685cc26100404d5abcf0946 Mon Sep 17 00:00:00 2001 From: Adolfo Delorenzo Date: Thu, 14 May 2026 19:08:30 -0600 Subject: [PATCH] feat(update): pre-flight gates + deeper healthcheck + auto-rollback Phase 8 of v0.3. Tightens the update lifecycle on both ends. Pre-flight (apply.go, before the passive slot is written): - Free-space check on the passive partition: image size + 10% headroom must be available. Uses statfs(2) via the new pkg/partition.FreeBytes / HasFreeSpaceFor helpers (tests cover happy path, tiny request, huge request, missing path). Catches corrupted-FS and shrunk-partition cases before we destroy the existing slot data. - Node-block-label check: refuses if the local K8s node carries the updates.kubesolo.io/block=true label. New pkg/health.CheckNodeBlocked shells out to kubectl per the project's zero-deps stance. Silently bypassed when no kubeconfig is reachable (air-gap case). Skipped by --force. Healthcheck (extended via new pkg/health/extended.go + preflight.go): - CheckKubeSystemReady waits until every kube-system pod has held the Running phase for >= N seconds (default 30). Catches "started ok, will crash-loop" bugs that a single-shot phase check misses. - CheckProbeURL fetches an operator-supplied URL; 200 = pass. Wired through update.conf as healthcheck_url= and cloud-init updates.healthcheck_url. - CheckDiskWritable writes/fsyncs/reads a small probe file under /var/lib/kubesolo. Always runs in healthcheck so a wedged data partition fails fast. - pkg/health.Status grows KubeSystemReady, ProbeURL, DiskWritable booleans. Optional checks default to true in RunAll() so they don't block when unconfigured. health_test.go updated to the new 6-field shape. Auto-rollback (healthcheck.go): - state.UpdateState gains HealthCheckFailures (consecutive post-Activated failures). Reset on a clean pass. - --auto-rollback-after N (also auto_rollback_after= in update.conf) triggers env.ForceRollback() when the failure count reaches the threshold. 
State transitions to RolledBack with a descriptive LastError. The command still exits with the healthcheck error; the operator/init is expected to reboot. - Only fires while Phase == Activated. Doesn't second-guess a long-stable system that happens to fail one healthcheck. config / opts / cloud-init plumbing: - update.conf gains healthcheck_url= and auto_rollback_after= keys. - New CLI flags: --healthcheck-url, --auto-rollback-after, --kube-system-settle. - cloud-init full-config.yaml documents the new updates: subfields. Co-Authored-By: Claude Opus 4.7 (1M context) --- cloud-init/examples/full-config.yaml | 11 +++ update/cmd/apply.go | 39 ++++++++ update/cmd/healthcheck.go | 62 ++++++++++-- update/cmd/opts.go | 58 ++++++++++-- update/main.go | 3 + update/pkg/config/config.go | 22 +++++ update/pkg/health/extended.go | 125 +++++++++++++++++++++++++ update/pkg/health/extended_test.go | 77 +++++++++++++++ update/pkg/health/health.go | 55 +++++++++-- update/pkg/health/health_test.go | 56 ++++++----- update/pkg/health/preflight.go | 51 ++++++++++ update/pkg/partition/freespace.go | 34 +++++++ update/pkg/partition/freespace_test.go | 44 +++++++++ update/pkg/state/state.go | 6 ++ 14 files changed, 595 insertions(+), 48 deletions(-) create mode 100644 update/pkg/health/extended.go create mode 100644 update/pkg/health/extended_test.go create mode 100644 update/pkg/health/preflight.go create mode 100644 update/pkg/partition/freespace.go create mode 100644 update/pkg/partition/freespace_test.go diff --git a/cloud-init/examples/full-config.yaml b/cloud-init/examples/full-config.yaml index 17b9e54..711520d 100644 --- a/cloud-init/examples/full-config.yaml +++ b/cloud-init/examples/full-config.yaml @@ -72,3 +72,14 @@ updates: # Path to Ed25519 public key for signature verification. Omit to disable # signature verification (NOT recommended for production fleets). # pubkey: "/etc/kubesolo/update-pubkey.hex" + + # Optional post-boot healthcheck probe URL. 
If set, healthcheck GETs it + # and treats anything other than HTTP 200 as a failure. Useful when your + # workload exposes its own readiness on a known endpoint. + # healthcheck_url: "http://localhost:8000/ready" + + # Auto-rollback threshold: after N consecutive post-activation healthcheck + # failures, the agent triggers a rollback on its own. 0 disables the + # feature (the bootloader still does GRUB-counter-based rollback after + # 3 failed boots). Recommended: 3 for production fleets. + # auto_rollback_after: 3 diff --git a/update/cmd/apply.go b/update/cmd/apply.go index 7fc8f22..aa61abe 100644 --- a/update/cmd/apply.go +++ b/update/cmd/apply.go @@ -4,10 +4,12 @@ import ( "context" "fmt" "log/slog" + "os" "runtime" "time" "github.com/portainer/kubesolo-os/update/pkg/config" + "github.com/portainer/kubesolo-os/update/pkg/health" "github.com/portainer/kubesolo-os/update/pkg/image" "github.com/portainer/kubesolo-os/update/pkg/oci" "github.com/portainer/kubesolo-os/update/pkg/partition" @@ -71,6 +73,19 @@ func Apply(args []string) error { window.String()) } + // Node-block-label gate — workload authors can defer an update by + // labeling the node updates.kubesolo.io/block=true. Skipped with --force + // and silently bypassed when the K8s API isn't reachable (air-gap). + if !opts.Force { + blocked, berr := health.CheckNodeBlocked("") + if berr != nil { + slog.Warn("node-block check failed, allowing update", "error", berr) + } else if blocked { + return fmt.Errorf("node carries label %s=true; refusing update (pass --force to override)", + health.NodeBlockLabel) + } + } + st, err := state.Load(opts.StatePath) if err != nil { // Don't block the operation on a corrupt state file. Log + recover. @@ -186,6 +201,30 @@ func Apply(args []string) error { } defer partition.Unmount(mountPoint) + // Free-space pre-write check: the passive partition must have at least + // (kernel + initramfs) + 10% headroom. 
Catches corrupted-FS reports and + // shrunk/wrong-size partitions before we destroy the existing slot data. + var imgSize int64 + for _, p := range []string{staged.VmlinuzPath, staged.InitramfsPath} { + fi, ferr := os.Stat(p) + if ferr != nil { + _ = st.RecordError(opts.StatePath, fmt.Errorf("stat staged file %s: %w", p, ferr)) + return fmt.Errorf("stat staged file %s: %w", p, ferr) + } + imgSize += fi.Size() + } + avail, ok, ferr := partition.HasFreeSpaceFor(mountPoint, imgSize, 10) + if ferr != nil { + _ = st.RecordError(opts.StatePath, fmt.Errorf("free-space check: %w", ferr)) + return fmt.Errorf("free-space check: %w", ferr) + } + if !ok { + err := fmt.Errorf("insufficient space on %s: have %.1f MiB, need %.1f MiB (image + 10%% headroom)", + passiveSlot, float64(avail)/(1<<20), float64(imgSize)*1.1/(1<<20)) + _ = st.RecordError(opts.StatePath, err) + return err + } + // Write image to passive partition if err := partition.WriteSystemImage(mountPoint, staged.VmlinuzPath, staged.InitramfsPath, staged.Version); err != nil { _ = st.RecordError(opts.StatePath, fmt.Errorf("writing system image: %w", err)) diff --git a/update/cmd/healthcheck.go b/update/cmd/healthcheck.go index 2ad04a8..68178a1 100644 --- a/update/cmd/healthcheck.go +++ b/update/cmd/healthcheck.go @@ -17,6 +17,11 @@ import ( // State transition: Activated → Verifying → Success on pass, → Failed on fail. // If state isn't in Activated (e.g. manual run on a long-stable system), the // state file is left alone — healthcheck still does its job. +// +// When --auto-rollback-after N is set, consecutive post-Activated failures +// are counted in state.HealthCheckFailures. On the Nth failure, the agent +// calls env.ForceRollback() and the operator is expected to reboot (this command +// does not reboot the host — that's policy left to systemd/init). 
func Healthcheck(args []string) error { opts := parseOpts(args) env := opts.NewBootEnv() @@ -48,18 +53,44 @@ func Healthcheck(args []string) error { timeout := time.Duration(opts.TimeoutSecs) * time.Second checker := health.NewChecker("", "", timeout) + checker.ProbeURL = opts.HealthcheckURL + if opts.KubeSystemSettle > 0 { + checker.KubeSystemSettle = time.Duration(opts.KubeSystemSettle) * time.Second + } + // Probe the data partition every healthcheck so a wedged disk fails fast. + checker.DataDir = "/var/lib/kubesolo" - slog.Info("running post-boot health checks", "timeout", timeout) + slog.Info("running post-boot health checks", + "timeout", timeout, + "probe_url", checker.ProbeURL, + "kube_system_settle", checker.KubeSystemSettle) status, err := checker.WaitForHealthy() if err != nil { fmt.Printf("Health check FAILED: %s\n", status.Message) - fmt.Printf(" containerd: %v\n", status.Containerd) - fmt.Printf(" apiserver: %v\n", status.APIServer) - fmt.Printf(" node_ready: %v\n", status.NodeReady) + printStatusBreakdown(status) fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot") + if postActivation { + st.HealthCheckFailures++ _ = st.RecordError(opts.StatePath, fmt.Errorf("post-boot health check failed: %s", status.Message)) + + // Auto-rollback escalation. Only trigger when post-Activated; + // don't second-guess a healthy long-running system. 
+ if opts.AutoRollbackAfter > 0 && st.HealthCheckFailures >= opts.AutoRollbackAfter { + slog.Warn("auto-rollback threshold reached", + "failures", st.HealthCheckFailures, + "threshold", opts.AutoRollbackAfter) + if rerr := env.ForceRollback(); rerr != nil { + slog.Error("auto-rollback failed", "error", rerr) + return err // return the original healthcheck error + } + if terr := st.Transition(opts.StatePath, state.PhaseRolledBack, "", + fmt.Sprintf("auto-rollback after %d healthcheck failures", st.HealthCheckFailures)); terr != nil { + slog.Warn("state transition failed", "phase", state.PhaseRolledBack, "error", terr) + } + fmt.Println("\nAuto-rollback triggered. Reboot to complete the rollback.") + } } return err } @@ -73,15 +104,32 @@ func Healthcheck(args []string) error { } if postActivation { + // Reset failure counter on a clean pass. + st.HealthCheckFailures = 0 if err := st.Transition(opts.StatePath, state.PhaseSuccess, "", ""); err != nil { slog.Warn("state transition failed", "phase", state.PhaseSuccess, "error", err) } } fmt.Println("Health check PASSED — boot marked successful") - fmt.Printf(" containerd: %v\n", status.Containerd) - fmt.Printf(" apiserver: %v\n", status.APIServer) - fmt.Printf(" node_ready: %v\n", status.NodeReady) + printStatusBreakdown(status) return nil } + +// printStatusBreakdown emits a human-readable per-check summary. Optional +// check lines are printed only when that check reported false. 
+func printStatusBreakdown(s *health.Status) { + fmt.Printf(" containerd: %v\n", s.Containerd) + fmt.Printf(" apiserver: %v\n", s.APIServer) + fmt.Printf(" node_ready: %v\n", s.NodeReady) + if !s.KubeSystemReady { + fmt.Printf(" kube-system pods: %v\n", s.KubeSystemReady) + } + if !s.ProbeURL { + fmt.Printf(" probe URL: %v\n", s.ProbeURL) + } + if !s.DiskWritable { + fmt.Printf(" disk writable: %v\n", s.DiskWritable) + } +} diff --git a/update/cmd/opts.go b/update/cmd/opts.go index 9903569..c698767 100644 --- a/update/cmd/opts.go +++ b/update/cmd/opts.go @@ -18,12 +18,15 @@ type opts struct { PubKeyPath string BootEnvType string // "grub" or "rpi" BootEnvPath string // path for RPi boot control dir - StatePath string // location of state.json (default: state.DefaultPath) - ConfPath string // location of update.conf (default: config.DefaultPath) - Channel string // update channel ("stable" by default) - MaintenanceWindow string // "HH:MM-HH:MM" or empty for always-allow - Force bool // bypass maintenance window - JSON bool // status: emit JSON instead of human-readable + StatePath string // location of state.json (default: state.DefaultPath) + ConfPath string // location of update.conf (default: config.DefaultPath) + Channel string // update channel ("stable" by default) + MaintenanceWindow string // "HH:MM-HH:MM" or empty for always-allow + HealthcheckURL string // optional GET probe for healthcheck + AutoRollbackAfter int // healthcheck: rollback after N consecutive failures (0=off) + KubeSystemSettle int // healthcheck: kube-system pods must be Running for N seconds (0=disabled) + Force bool // bypass maintenance window + JSON bool // status: emit JSON instead of human-readable } // NewBootEnv creates a BootEnv from the parsed options. 
@@ -74,6 +77,12 @@ func parseOpts(args []string) opts { if cfg.PubKey != "" { o.PubKeyPath = cfg.PubKey } + if cfg.HealthcheckURL != "" { + o.HealthcheckURL = cfg.HealthcheckURL + } + if cfg.AutoRollbackAfter > 0 { + o.AutoRollbackAfter = cfg.AutoRollbackAfter + } } else if err != nil { slog.Warn("could not load update.conf", "path", o.ConfPath, "error", err) } @@ -100,6 +109,43 @@ func parseOpts(args []string) opts { } case "--force": o.Force = true + case "--healthcheck-url": + if i+1 < len(args) { + o.HealthcheckURL = args[i+1] + i++ + } + case "--auto-rollback-after": + if i+1 < len(args) { + n := 0 + for _, ch := range args[i+1] { + if ch >= '0' && ch <= '9' { + n = n*10 + int(ch-'0') + } else { + n = 0 + break + } + } + if n > 0 { + o.AutoRollbackAfter = n + } + i++ + } + case "--kube-system-settle": + if i+1 < len(args) { + n := 0 + for _, ch := range args[i+1] { + if ch >= '0' && ch <= '9' { + n = n*10 + int(ch-'0') + } else { + n = 0 + break + } + } + if n > 0 { + o.KubeSystemSettle = n + } + i++ + } case "--json": o.JSON = true case "--server": diff --git a/update/main.go b/update/main.go index e614145..07e8d50 100644 --- a/update/main.go +++ b/update/main.go @@ -90,6 +90,9 @@ Options: --grubenv PATH Path to grubenv file (default: /boot/grub/grubenv) --timeout SECS Health check timeout in seconds (default: 120) --pubkey PATH Ed25519 public key for signature verification (optional) + --healthcheck-url URL Optional GET probe in healthcheck; 200 = pass + --auto-rollback-after N healthcheck: rollback after N consecutive failures + --kube-system-settle N healthcheck: require kube-system pods Running ≥ N seconds --json For 'status': emit JSON instead of human-readable output Examples: diff --git a/update/pkg/config/config.go b/update/pkg/config/config.go index bcd2978..a69e860 100644 --- a/update/pkg/config/config.go +++ b/update/pkg/config/config.go @@ -35,6 +35,13 @@ type Config struct { Channel string MaintenanceWindow string PubKey string + // 
HealthcheckURL is an optional URL the healthcheck command will GET; + // 200 = pass, anything else = fail. + HealthcheckURL string + // AutoRollbackAfter is the number of consecutive post-boot healthcheck + // failures after which the agent will call Rollback automatically. + // 0 = disabled (default). + AutoRollbackAfter int } // Load reads and parses update.conf. A missing file returns an empty Config @@ -73,6 +80,21 @@ func Load(path string) (*Config, error) { c.MaintenanceWindow = value case "pubkey": c.PubKey = value + case "healthcheck_url": + c.HealthcheckURL = value + case "auto_rollback_after": + // Parse a small integer. Non-numeric values are silently + // ignored (forward compat); zero disables the feature. + n := 0 + for _, ch := range value { + if ch >= '0' && ch <= '9' { + n = n*10 + int(ch-'0') + } else { + n = 0 + break + } + } + c.AutoRollbackAfter = n } // Unknown keys are silently ignored for forward compatibility. } diff --git a/update/pkg/health/extended.go b/update/pkg/health/extended.go new file mode 100644 index 0000000..eedd81f --- /dev/null +++ b/update/pkg/health/extended.go @@ -0,0 +1,125 @@ +package health + +import ( + "context" + "fmt" + "net/http" + "os" + "os/exec" + "path/filepath" + "strings" + "time" +) + +// kubeSystemSettleSeconds is how long all kube-system pods must hold a +// Running phase before we consider the cluster genuinely up. Catches the +// "pod just started, will crash-loop in 5s" case. +const kubeSystemSettleSeconds = 30 + +// CheckKubeSystemReady verifies that every pod in the kube-system namespace +// is in Running phase and has been Running for at least settle. Returns +// a single bool; any lookup or parse failure reports false. settle defaults to 30s when zero. 
+func (c *Checker) CheckKubeSystemReady(settle time.Duration) bool { + if settle == 0 { + settle = kubeSystemSettleSeconds * time.Second + } + if _, err := os.Stat(c.kubeconfigPath); err != nil { + return false + } + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + // jsonpath emits one line per pod: | + cmd := exec.CommandContext(ctx, "kubectl", + "--kubeconfig", c.kubeconfigPath, + "get", "pods", "-n", "kube-system", + "-o", `jsonpath={range .items[*]}{.status.phase}|{.status.startTime}{"\n"}{end}`, + ) + out, err := cmd.Output() + if err != nil { + return false + } + lines := strings.Split(strings.TrimSpace(string(out)), "\n") + if len(lines) == 0 || lines[0] == "" { + // No pods reported. Conservatively treat as not-ready: kube-system + // is expected to host at least CoreDNS + pause. + return false + } + now := time.Now() + for _, line := range lines { + parts := strings.SplitN(line, "|", 2) + phase := strings.TrimSpace(parts[0]) + if phase != "Running" { + return false + } + if len(parts) < 2 { + return false + } + start, perr := time.Parse(time.RFC3339, strings.TrimSpace(parts[1])) + if perr != nil { + return false + } + if now.Sub(start) < settle { + return false + } + } + return true +} + +// CheckProbeURL fetches the given URL and reports whether it returned 200. +// Empty url returns (true, nil) — the check is opt-in. +func CheckProbeURL(url string) (bool, error) { + if url == "" { + return true, nil + } + client := &http.Client{Timeout: 5 * time.Second} + resp, err := client.Get(url) + if err != nil { + return false, fmt.Errorf("probe URL %s: %w", url, err) + } + defer resp.Body.Close() + return resp.StatusCode == http.StatusOK, nil +} + +// CheckDiskWritable writes a small file under dataDir, fsyncs, reads it back, +// and removes it. Confirms the data partition is mounted read-write and the +// underlying disk is responsive. Empty dataDir defaults to /var/lib/kubesolo. 
+func CheckDiskWritable(dataDir string) (bool, error) { + if dataDir == "" { + dataDir = "/var/lib/kubesolo" + } + if _, err := os.Stat(dataDir); err != nil { + // Data partition not mounted? That's catastrophic but we shouldn't + // claim the disk is fine. + return false, fmt.Errorf("dataDir %s: %w", dataDir, err) + } + probe := filepath.Join(dataDir, ".update-probe") + want := []byte("kubesolo-os healthcheck probe") + + f, err := os.Create(probe) + if err != nil { + return false, fmt.Errorf("create probe: %w", err) + } + defer os.Remove(probe) + + if _, err := f.Write(want); err != nil { + f.Close() + return false, fmt.Errorf("write probe: %w", err) + } + if err := f.Sync(); err != nil { + f.Close() + return false, fmt.Errorf("fsync probe: %w", err) + } + if err := f.Close(); err != nil { + return false, fmt.Errorf("close probe: %w", err) + } + + got, err := os.ReadFile(probe) + if err != nil { + return false, fmt.Errorf("read probe: %w", err) + } + if string(got) != string(want) { + return false, fmt.Errorf("probe content mismatch: got %q", got) + } + return true, nil +} diff --git a/update/pkg/health/extended_test.go b/update/pkg/health/extended_test.go new file mode 100644 index 0000000..f322d6b --- /dev/null +++ b/update/pkg/health/extended_test.go @@ -0,0 +1,77 @@ +package health + +import ( + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "testing" +) + +func TestCheckProbeURLEmptyAlwaysPasses(t *testing.T) { + ok, err := CheckProbeURL("") + if err != nil { + t.Fatalf("CheckProbeURL(\"\"): %v", err) + } + if !ok { + t.Error("empty probe URL should return ok=true (check disabled)") + } +} + +func TestCheckProbeURL200(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + ok, err := CheckProbeURL(srv.URL) + if err != nil { + t.Fatalf("CheckProbeURL: %v", err) + } + if !ok { + t.Error("expected ok=true on 200") + } +} + +func 
TestCheckProbeURLNon200(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusServiceUnavailable) + })) + defer srv.Close() + ok, err := CheckProbeURL(srv.URL) + if err != nil { + t.Fatalf("CheckProbeURL: %v", err) + } + if ok { + t.Error("expected ok=false on 503") + } +} + +func TestCheckProbeURLNetworkError(t *testing.T) { + // Port 1 is reserved (tcpmux) and never bound by Linux defaults. + _, err := CheckProbeURL("http://127.0.0.1:1") + if err == nil { + t.Error("expected error for unreachable URL, got nil") + } +} + +func TestCheckDiskWritableHappyPath(t *testing.T) { + dir := t.TempDir() + ok, err := CheckDiskWritable(dir) + if err != nil { + t.Fatalf("CheckDiskWritable: %v", err) + } + if !ok { + t.Error("expected ok=true on writable temp dir") + } + // Probe file should have been cleaned up. + if _, err := os.Stat(filepath.Join(dir, ".update-probe")); !os.IsNotExist(err) { + t.Errorf("probe file not cleaned up: stat err=%v", err) + } +} + +func TestCheckDiskWritableMissingDir(t *testing.T) { + _, err := CheckDiskWritable("/this/path/does/not/exist") + if err == nil { + t.Error("expected error for missing dataDir, got nil") + } +} diff --git a/update/pkg/health/health.go b/update/pkg/health/health.go index 90c397c..c3a4578 100644 --- a/update/pkg/health/health.go +++ b/update/pkg/health/health.go @@ -24,15 +24,20 @@ import ( // Status represents the result of a health check. type Status struct { - Containerd bool - APIServer bool - NodeReady bool - Message string + Containerd bool + APIServer bool + NodeReady bool + KubeSystemReady bool // optional — RunAll leaves this true when KubeSystemSettle is zero + ProbeURL bool // optional — RunAll leaves this true when Checker.ProbeURL is empty + DiskWritable bool // optional — RunAll leaves this true when DataDir is empty + Message string } -// IsHealthy returns true if all checks passed. +// IsHealthy returns true if all required checks passed. 
Optional checks +// default to true when not configured, so they don't block the result. func (s *Status) IsHealthy() bool { - return s.Containerd && s.APIServer && s.NodeReady + return s.Containerd && s.APIServer && s.NodeReady && + s.KubeSystemReady && s.ProbeURL && s.DiskWritable } // Checker performs health checks against the local KubeSolo instance. @@ -40,6 +45,11 @@ type Checker struct { kubeconfigPath string apiServerAddr string timeout time.Duration + + // Optional gates. Zero values disable the check (it reports true). + KubeSystemSettle time.Duration + ProbeURL string + DataDir string } // NewChecker creates a health checker. @@ -149,12 +159,37 @@ func (c *Checker) CheckNodeReady() bool { } // RunAll performs all health checks and returns the combined status. +// +// Optional checks (kube-system settle, user probe URL, disk writability) are +// only run if the corresponding Checker fields are set; otherwise they +// report true so as not to block the result. func (c *Checker) RunAll() *Status { - return &Status{ - Containerd: c.CheckContainerd(), - APIServer: c.CheckAPIServer(), - NodeReady: c.CheckNodeReady(), + s := &Status{ + Containerd: c.CheckContainerd(), + APIServer: c.CheckAPIServer(), + NodeReady: c.CheckNodeReady(), + KubeSystemReady: true, + ProbeURL: true, + DiskWritable: true, } + if c.KubeSystemSettle > 0 { + s.KubeSystemReady = c.CheckKubeSystemReady(c.KubeSystemSettle) + } + if c.ProbeURL != "" { + ok, err := CheckProbeURL(c.ProbeURL) + if err != nil { + slog.Warn("probe URL check failed", "url", c.ProbeURL, "error", err) + } + s.ProbeURL = ok + } + if c.DataDir != "" { + ok, err := CheckDiskWritable(c.DataDir) + if err != nil { + slog.Warn("disk writability check failed", "dir", c.DataDir, "error", err) + } + s.DiskWritable = ok + } + return s } // WaitForHealthy polls health checks until all pass or timeout expires. 
diff --git a/update/pkg/health/health_test.go b/update/pkg/health/health_test.go index 91a4ee0..d192ec0 100644 --- a/update/pkg/health/health_test.go +++ b/update/pkg/health/health_test.go @@ -6,36 +6,42 @@ import ( ) func TestStatusIsHealthy(t *testing.T) { + // Helper for the new 6-field Status: all-true except the named one. + allBut := func(field string) Status { + s := Status{ + Containerd: true, APIServer: true, NodeReady: true, + KubeSystemReady: true, ProbeURL: true, DiskWritable: true, + } + switch field { + case "Containerd": + s.Containerd = false + case "APIServer": + s.APIServer = false + case "NodeReady": + s.NodeReady = false + case "KubeSystemReady": + s.KubeSystemReady = false + case "ProbeURL": + s.ProbeURL = false + case "DiskWritable": + s.DiskWritable = false + } + return s + } + tests := []struct { name string status Status wantHealth bool }{ - { - name: "all healthy", - status: Status{Containerd: true, APIServer: true, NodeReady: true}, - wantHealth: true, - }, - { - name: "containerd down", - status: Status{Containerd: false, APIServer: true, NodeReady: true}, - wantHealth: false, - }, - { - name: "apiserver down", - status: Status{Containerd: true, APIServer: false, NodeReady: true}, - wantHealth: false, - }, - { - name: "node not ready", - status: Status{Containerd: true, APIServer: true, NodeReady: false}, - wantHealth: false, - }, - { - name: "all down", - status: Status{Containerd: false, APIServer: false, NodeReady: false}, - wantHealth: false, - }, + {"all healthy", allBut(""), true}, + {"containerd down", allBut("Containerd"), false}, + {"apiserver down", allBut("APIServer"), false}, + {"node not ready", allBut("NodeReady"), false}, + {"kube-system not ready", allBut("KubeSystemReady"), false}, + {"probe URL failed", allBut("ProbeURL"), false}, + {"disk not writable", allBut("DiskWritable"), false}, + {"all down", Status{}, false}, } for _, tt := range tests { diff --git a/update/pkg/health/preflight.go 
b/update/pkg/health/preflight.go new file mode 100644 index 0000000..396d1ad --- /dev/null +++ b/update/pkg/health/preflight.go @@ -0,0 +1,51 @@ +package health + +import ( + "context" + "fmt" + "os" + "os/exec" + "strings" + "time" +) + +// NodeBlockLabel is the well-known label that workload authors set on the +// local node to defer an OS update. When present and "true", apply refuses. +const NodeBlockLabel = "updates.kubesolo.io/block" + +// CheckNodeBlocked returns (blocked, error). blocked==true means the local +// node carries the updates.kubesolo.io/block=true label and the caller should +// refuse the update. +// +// If the kubeconfig is not available (offline / pre-boot / air-gap), this +// returns (false, nil) — silently allowing the update. That's the safe +// behaviour for the air-gap case where the node may not be reachable from +// the agent's perspective. +func CheckNodeBlocked(kubeconfigPath string) (bool, error) { + if kubeconfigPath == "" { + kubeconfigPath = "/var/lib/kubesolo/pki/admin/admin.kubeconfig" + } + if _, err := os.Stat(kubeconfigPath); err != nil { + // No kubeconfig — assume air-gap / pre-K8s. Don't block updates. + return false, nil + } + + // Query the node label via kubectl. We don't know the node name a + // priori, so we use --kubeconfig on the local admin config and ask for + // "the only node" (KubeSolo is single-node by design). + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "kubectl", + "--kubeconfig", kubeconfigPath, + "get", "node", + "-o", `jsonpath={.items[0].metadata.labels.updates\.kubesolo\.io/block}`) + out, err := cmd.Output() + if err != nil { + // API unreachable or no nodes — treat as not blocked (analogous to + // the kubeconfig-missing case). We still surface the error so the + // caller can decide to log it. 
+ return false, fmt.Errorf("query node label: %w", err) + } + return strings.TrimSpace(string(out)) == "true", nil +} diff --git a/update/pkg/partition/freespace.go b/update/pkg/partition/freespace.go new file mode 100644 index 0000000..a738f55 --- /dev/null +++ b/update/pkg/partition/freespace.go @@ -0,0 +1,34 @@ +package partition + +import ( + "fmt" + "syscall" +) + +// FreeBytes returns the number of free bytes available on the filesystem +// containing `path`. Uses statfs(2); path must exist and be readable. +func FreeBytes(path string) (uint64, error) { + var stat syscall.Statfs_t + if err := syscall.Statfs(path, &stat); err != nil { + return 0, fmt.Errorf("statfs %s: %w", path, err) + } + // Bavail is the count of free blocks available to non-root users — + // matches what `df` reports. Bsize is the block size in bytes. + //nolint:unconvert // Bavail is uint64 on most platforms but int64 on darwin/freebsd + return uint64(stat.Bavail) * uint64(stat.Bsize), nil +} + +// HasFreeSpaceFor reports whether `path`'s filesystem has at least `wantBytes` +// of free space, with `headroomPct` reserved (e.g. 10 = require 110% of want). +// Returns the available bytes alongside, so callers can render a useful error. 
+func HasFreeSpaceFor(path string, wantBytes int64, headroomPct int) (avail uint64, ok bool, err error) { + avail, err = FreeBytes(path) + if err != nil { + return 0, false, err + } + if wantBytes < 0 { + return avail, false, fmt.Errorf("invalid wantBytes %d", wantBytes) + } + required := uint64(wantBytes) * uint64(100+headroomPct) / 100 + return avail, avail >= required, nil +} diff --git a/update/pkg/partition/freespace_test.go b/update/pkg/partition/freespace_test.go new file mode 100644 index 0000000..fb9fa32 --- /dev/null +++ b/update/pkg/partition/freespace_test.go @@ -0,0 +1,44 @@ +package partition + +import "testing" + +func TestFreeBytesReturnsNonZeroOnTempDir(t *testing.T) { + b, err := FreeBytes(t.TempDir()) + if err != nil { + t.Fatalf("FreeBytes: %v", err) + } + // On any sane test runner the temp filesystem has more than 1 KiB free. + if b < 1024 { + t.Errorf("FreeBytes = %d, want > 1024 on /tmp", b) + } +} + +func TestFreeBytesNonExistentPath(t *testing.T) { + _, err := FreeBytes("/this/path/does/not/exist/at/all") + if err == nil { + t.Error("expected error for missing path, got nil") + } +} + +func TestHasFreeSpaceForRejectsHugeRequest(t *testing.T) { + // Request 1 PiB with 10% headroom on /tmp — no test runner has that + // much free, so this should consistently report not-enough. + avail, ok, err := HasFreeSpaceFor(t.TempDir(), 1<<50, 10) + if err != nil { + t.Fatalf("HasFreeSpaceFor: %v", err) + } + if ok { + t.Errorf("expected insufficient space for 1PiB, got avail=%d ok=true", avail) + } +} + +func TestHasFreeSpaceForAcceptsSmallRequest(t *testing.T) { + // 1 KiB with 10% headroom = 1.1 KiB. Any temp dir has this. 
+ _, ok, err := HasFreeSpaceFor(t.TempDir(), 1024, 10) + if err != nil { + t.Fatalf("HasFreeSpaceFor: %v", err) + } + if !ok { + t.Error("expected sufficient space for 1KiB on /tmp") + } +} diff --git a/update/pkg/state/state.go b/update/pkg/state/state.go index f63afdd..c1e9517 100644 --- a/update/pkg/state/state.go +++ b/update/pkg/state/state.go @@ -92,6 +92,12 @@ type UpdateState struct { // AttemptCount counts attempts at the current ToVersion. Reset when // ToVersion changes or on successful completion. AttemptCount int `json:"attempt_count"` + + // HealthCheckFailures counts consecutive post-Activated healthcheck + // failures. Reset to 0 on a successful healthcheck or after a rollback. + // Used by `kubesolo-update healthcheck --auto-rollback-after N` to + // trigger automatic recovery on a wedged new boot. + HealthCheckFailures int `json:"health_check_failures,omitempty"` } // New returns a fresh Idle state with UpdatedAt set to now.