feat(update): pre-flight gates + deeper healthcheck + auto-rollback
Some checks failed
ARM64 Build / Build generic ARM64 disk image (push) Failing after 4s
CI / Go Tests (push) Successful in 1m29s
CI / Shellcheck (push) Successful in 48s
CI / Build Go Binaries (amd64, linux, linux-amd64) (push) Successful in 1m12s
CI / Build Go Binaries (arm64, linux, linux-arm64) (push) Has been cancelled

Phase 8 of v0.3. Tightens the update lifecycle on both ends.

Pre-flight (apply.go, before anything is written to the passive slot):
- Free-space check on the passive partition: image size + 10% headroom must
  be available. Uses statfs(2) via the new pkg/partition.FreeBytes /
  HasFreeSpaceFor helpers (tests cover happy path, tiny request, huge
  request, missing path). Catches corrupted-FS and shrunk-partition cases
  before we destroy the existing slot data.
- Node-block-label check: refuses if the local K8s node carries the
  updates.kubesolo.io/block=true label. New pkg/health.CheckNodeBlocked
  shells out to kubectl per the project's zero-deps stance. Silently bypassed
  when no kubeconfig is reachable (air-gap case). Skipped by --force.

Healthcheck (extended via new pkg/health/extended.go + preflight.go):
- CheckKubeSystemReady waits until every kube-system pod has held the Running
  phase for >= N seconds (default 30). Catches "started ok, will crash-loop"
  bugs that a single-shot phase check misses.
- CheckProbeURL fetches an operator-supplied URL; 200 = pass. Wired through
  update.conf as healthcheck_url= and cloud-init updates.healthcheck_url.
- CheckDiskWritable writes/fsyncs/reads a small probe file under /var/lib/kubesolo.
  Always runs in healthcheck so a wedged data partition fails fast.
- pkg/health.Status grows KubeSystemReady, ProbeURL, DiskWritable booleans.
  Optional checks default to true in RunAll() so they don't block when
  unconfigured. health_test.go updated to the new 6-field shape.

Auto-rollback (healthcheck.go):
- state.UpdateState gains HealthCheckFailures (consecutive post-Activated
  failures). Reset on a clean pass.
- --auto-rollback-after N (also auto_rollback_after= in update.conf) triggers
  env.ForceRollback() when the failure count reaches the threshold. State
  transitions to RolledBack with a descriptive LastError. The command still
  exits with the healthcheck error; the operator/init is expected to reboot.
- Only fires while Phase == Activated. Doesn't second-guess a long-stable
  system that happens to fail one healthcheck.

config / opts / cloud-init plumbing:
- update.conf gains healthcheck_url= and auto_rollback_after= keys.
- New CLI flags: --healthcheck-url, --auto-rollback-after, --kube-system-settle.
- cloud-init full-config.yaml documents the new updates: subfields.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-14 19:08:30 -06:00
parent 28de656b97
commit 9fb894c5af
14 changed files with 595 additions and 48 deletions

View File

@@ -72,3 +72,14 @@ updates:
# Path to Ed25519 public key for signature verification. Omit to disable
# signature verification (NOT recommended for production fleets).
# pubkey: "/etc/kubesolo/update-pubkey.hex"
# Optional post-boot healthcheck probe URL. If set, healthcheck GETs it
# and treats anything other than HTTP 200 as a failure. Useful when your
# workload exposes its own readiness on a known endpoint.
# healthcheck_url: "http://localhost:8000/ready"
# Auto-rollback threshold: after N consecutive post-activation healthcheck
# failures, the agent triggers a rollback on its own. 0 disables the
# feature (the bootloader still does GRUB-counter-based rollback after
# 3 failed boots). Recommended: 3 for production fleets.
# auto_rollback_after: 3

View File

@@ -4,10 +4,12 @@ import (
"context"
"fmt"
"log/slog"
"os"
"runtime"
"time"
"github.com/portainer/kubesolo-os/update/pkg/config"
"github.com/portainer/kubesolo-os/update/pkg/health"
"github.com/portainer/kubesolo-os/update/pkg/image"
"github.com/portainer/kubesolo-os/update/pkg/oci"
"github.com/portainer/kubesolo-os/update/pkg/partition"
@@ -71,6 +73,19 @@ func Apply(args []string) error {
window.String())
}
// Node-block-label gate — workload authors can defer an update by
// labeling the node updates.kubesolo.io/block=true. Skipped with --force
// and silently bypassed when the K8s API isn't reachable (air-gap).
if !opts.Force {
blocked, berr := health.CheckNodeBlocked("")
if berr != nil {
slog.Warn("node-block check failed, allowing update", "error", berr)
} else if blocked {
return fmt.Errorf("node carries label %s=true; refusing update (pass --force to override)",
health.NodeBlockLabel)
}
}
st, err := state.Load(opts.StatePath)
if err != nil {
// Don't block the operation on a corrupt state file. Log + recover.
@@ -186,6 +201,30 @@ func Apply(args []string) error {
}
defer partition.Unmount(mountPoint)
// Free-space pre-write check: the passive partition must have at least
// (kernel + initramfs) + 10% headroom. Catches corrupted-FS reports and
// shrunk/wrong-size partitions before we destroy the existing slot data.
var imgSize int64
for _, p := range []string{staged.VmlinuzPath, staged.InitramfsPath} {
fi, ferr := os.Stat(p)
if ferr != nil {
_ = st.RecordError(opts.StatePath, fmt.Errorf("stat staged file %s: %w", p, ferr))
return fmt.Errorf("stat staged file %s: %w", p, ferr)
}
imgSize += fi.Size()
}
avail, ok, ferr := partition.HasFreeSpaceFor(mountPoint, imgSize, 10)
if ferr != nil {
_ = st.RecordError(opts.StatePath, fmt.Errorf("free-space check: %w", ferr))
return fmt.Errorf("free-space check: %w", ferr)
}
if !ok {
err := fmt.Errorf("insufficient space on %s: have %.1f MiB, need %.1f MiB (image + 10%% headroom)",
passiveSlot, float64(avail)/(1<<20), float64(imgSize)*1.1/(1<<20))
_ = st.RecordError(opts.StatePath, err)
return err
}
// Write image to passive partition
if err := partition.WriteSystemImage(mountPoint, staged.VmlinuzPath, staged.InitramfsPath, staged.Version); err != nil {
_ = st.RecordError(opts.StatePath, fmt.Errorf("writing system image: %w", err))

View File

@@ -17,6 +17,11 @@ import (
// State transition: Activated → Verifying → Success on pass, → Failed on fail.
// If state isn't in Activated (e.g. manual run on a long-stable system), the
// state file is left alone — healthcheck still does its job.
//
// When --auto-rollback-after N is set, consecutive post-Activated failures
// are counted in state.HealthCheckFailures. Once the count reaches N, the
// agent calls env.ForceRollback() and the operator is expected to reboot
// (this command does not reboot the host — that's policy left to systemd/init).
func Healthcheck(args []string) error {
opts := parseOpts(args)
env := opts.NewBootEnv()
@@ -48,18 +53,44 @@ func Healthcheck(args []string) error {
timeout := time.Duration(opts.TimeoutSecs) * time.Second
checker := health.NewChecker("", "", timeout)
checker.ProbeURL = opts.HealthcheckURL
if opts.KubeSystemSettle > 0 {
checker.KubeSystemSettle = time.Duration(opts.KubeSystemSettle) * time.Second
}
// Probe the data partition every healthcheck so a wedged disk fails fast.
checker.DataDir = "/var/lib/kubesolo"
slog.Info("running post-boot health checks", "timeout", timeout)
slog.Info("running post-boot health checks",
"timeout", timeout,
"probe_url", checker.ProbeURL,
"kube_system_settle", checker.KubeSystemSettle)
status, err := checker.WaitForHealthy()
if err != nil {
fmt.Printf("Health check FAILED: %s\n", status.Message)
fmt.Printf(" containerd: %v\n", status.Containerd)
fmt.Printf(" apiserver: %v\n", status.APIServer)
fmt.Printf(" node_ready: %v\n", status.NodeReady)
printStatusBreakdown(status)
fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot")
if postActivation {
st.HealthCheckFailures++
_ = st.RecordError(opts.StatePath, fmt.Errorf("post-boot health check failed: %s", status.Message))
// Auto-rollback escalation. Only trigger when post-Activated;
// don't second-guess a healthy long-running system.
if opts.AutoRollbackAfter > 0 && st.HealthCheckFailures >= opts.AutoRollbackAfter {
slog.Warn("auto-rollback threshold reached",
"failures", st.HealthCheckFailures,
"threshold", opts.AutoRollbackAfter)
if rerr := env.ForceRollback(); rerr != nil {
slog.Error("auto-rollback failed", "error", rerr)
return err // return the original healthcheck error
}
if terr := st.Transition(opts.StatePath, state.PhaseRolledBack, "",
fmt.Sprintf("auto-rollback after %d healthcheck failures", st.HealthCheckFailures)); terr != nil {
slog.Warn("state transition failed", "phase", state.PhaseRolledBack, "error", terr)
}
fmt.Println("\nAuto-rollback triggered. Reboot to complete the rollback.")
}
}
return err
}
@@ -73,15 +104,32 @@ func Healthcheck(args []string) error {
}
if postActivation {
// Reset failure counter on a clean pass.
st.HealthCheckFailures = 0
if err := st.Transition(opts.StatePath, state.PhaseSuccess, "", ""); err != nil {
slog.Warn("state transition failed", "phase", state.PhaseSuccess, "error", err)
}
}
fmt.Println("Health check PASSED — boot marked successful")
fmt.Printf(" containerd: %v\n", status.Containerd)
fmt.Printf(" apiserver: %v\n", status.APIServer)
fmt.Printf(" node_ready: %v\n", status.NodeReady)
printStatusBreakdown(status)
return nil
}
// printStatusBreakdown emits a human-readable per-check summary on stdout.
//
// The three core checks (containerd, apiserver, node_ready) are always
// printed. The optional checks (kube-system pods, probe URL, disk writable)
// are printed only when they report false: health.Status does not record
// whether an optional check actually ran, and an unconfigured optional check
// reports true, so only a failure is worth a line.
func printStatusBreakdown(s *health.Status) {
	fmt.Printf(" containerd: %v\n", s.Containerd)
	fmt.Printf(" apiserver: %v\n", s.APIServer)
	fmt.Printf(" node_ready: %v\n", s.NodeReady)

	optional := []struct {
		label  string
		passed bool
	}{
		{"kube-system pods", s.KubeSystemReady},
		{"probe URL", s.ProbeURL},
		{"disk writable", s.DiskWritable},
	}
	for _, check := range optional {
		if check.passed {
			// Either not configured or it passed; nothing to report.
			continue
		}
		fmt.Printf(" %s: %v\n", check.label, check.passed)
	}
}

View File

@@ -18,12 +18,15 @@ type opts struct {
PubKeyPath string
BootEnvType string // "grub" or "rpi"
BootEnvPath string // path for RPi boot control dir
StatePath string // location of state.json (default: state.DefaultPath)
ConfPath string // location of update.conf (default: config.DefaultPath)
Channel string // update channel ("stable" by default)
MaintenanceWindow string // "HH:MM-HH:MM" or empty for always-allow
Force bool // bypass maintenance window
JSON bool // status: emit JSON instead of human-readable
StatePath string // location of state.json (default: state.DefaultPath)
ConfPath string // location of update.conf (default: config.DefaultPath)
Channel string // update channel ("stable" by default)
MaintenanceWindow string // "HH:MM-HH:MM" or empty for always-allow
HealthcheckURL string // optional GET probe for healthcheck
AutoRollbackAfter int // healthcheck: rollback after N consecutive failures (0=off)
KubeSystemSettle int // healthcheck: kube-system pods must be Running for N seconds (0=disabled)
Force bool // bypass maintenance window
JSON bool // status: emit JSON instead of human-readable
}
// NewBootEnv creates a BootEnv from the parsed options.
@@ -74,6 +77,12 @@ func parseOpts(args []string) opts {
if cfg.PubKey != "" {
o.PubKeyPath = cfg.PubKey
}
if cfg.HealthcheckURL != "" {
o.HealthcheckURL = cfg.HealthcheckURL
}
if cfg.AutoRollbackAfter > 0 {
o.AutoRollbackAfter = cfg.AutoRollbackAfter
}
} else if err != nil {
slog.Warn("could not load update.conf", "path", o.ConfPath, "error", err)
}
@@ -100,6 +109,43 @@ func parseOpts(args []string) opts {
}
case "--force":
o.Force = true
case "--healthcheck-url":
if i+1 < len(args) {
o.HealthcheckURL = args[i+1]
i++
}
case "--auto-rollback-after":
if i+1 < len(args) {
n := 0
for _, ch := range args[i+1] {
if ch >= '0' && ch <= '9' {
n = n*10 + int(ch-'0')
} else {
n = 0
break
}
}
if n > 0 {
o.AutoRollbackAfter = n
}
i++
}
case "--kube-system-settle":
if i+1 < len(args) {
n := 0
for _, ch := range args[i+1] {
if ch >= '0' && ch <= '9' {
n = n*10 + int(ch-'0')
} else {
n = 0
break
}
}
if n > 0 {
o.KubeSystemSettle = n
}
i++
}
case "--json":
o.JSON = true
case "--server":

View File

@@ -90,6 +90,9 @@ Options:
--grubenv PATH Path to grubenv file (default: /boot/grub/grubenv)
--timeout SECS Health check timeout in seconds (default: 120)
--pubkey PATH Ed25519 public key for signature verification (optional)
--healthcheck-url URL Optional GET probe in healthcheck; 200 = pass
--auto-rollback-after N healthcheck: rollback after N consecutive failures
--kube-system-settle N healthcheck: require kube-system pods Running ≥ N seconds
--json For 'status': emit JSON instead of human-readable output
Examples:

View File

@@ -35,6 +35,13 @@ type Config struct {
Channel string
MaintenanceWindow string
PubKey string
// HealthcheckURL is an optional URL the healthcheck command will GET;
// 200 = pass, anything else = fail.
HealthcheckURL string
// AutoRollbackAfter is the number of consecutive post-boot healthcheck
// failures after which the agent will call Rollback automatically.
// 0 = disabled (default).
AutoRollbackAfter int
}
// Load reads and parses update.conf. A missing file returns an empty Config
@@ -73,6 +80,21 @@ func Load(path string) (*Config, error) {
c.MaintenanceWindow = value
case "pubkey":
c.PubKey = value
case "healthcheck_url":
c.HealthcheckURL = value
case "auto_rollback_after":
// Parse a small integer. Non-numeric values are silently
// ignored (forward compat); zero disables the feature.
n := 0
for _, ch := range value {
if ch >= '0' && ch <= '9' {
n = n*10 + int(ch-'0')
} else {
n = 0
break
}
}
c.AutoRollbackAfter = n
}
// Unknown keys are silently ignored for forward compatibility.
}

View File

@@ -0,0 +1,125 @@
package health
import (
"context"
"fmt"
"net/http"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
)
// kubeSystemSettleSeconds is how long all kube-system pods must hold a
// Running phase before we consider the cluster genuinely up. Catches the
// "pod just started, will crash-loop in 5s" case.
const kubeSystemSettleSeconds = 30

// CheckKubeSystemReady reports whether every pod in the kube-system namespace
// is in the Running phase and has a status.startTime at least settle in the
// past. A zero settle falls back to kubeSystemSettleSeconds.
//
// The result is a plain bool; no error is returned. It is false when:
//   - the checker's kubeconfig cannot be stat'ed,
//   - kubectl fails or does not finish within 10 seconds,
//   - kubectl lists no pods at all,
//   - any pod is not Running, has no parseable RFC 3339 start time, or
//     started less than settle ago.
//
// Pod age is measured from status.startTime as printed by kubectl, compared
// against the local clock.
func (c *Checker) CheckKubeSystemReady(settle time.Duration) bool {
	if settle == 0 {
		settle = kubeSystemSettleSeconds * time.Second
	}
	if _, err := os.Stat(c.kubeconfigPath); err != nil {
		return false
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// jsonpath emits one line per pod: <phase>|<startTime>
	out, err := exec.CommandContext(ctx, "kubectl",
		"--kubeconfig", c.kubeconfigPath,
		"get", "pods", "-n", "kube-system",
		"-o", `jsonpath={range .items[*]}{.status.phase}|{.status.startTime}{"\n"}{end}`,
	).Output()
	if err != nil {
		return false
	}

	report := strings.TrimSpace(string(out))
	if report == "" {
		// No pods reported. Conservatively treat as not-ready: kube-system
		// is expected to host at least CoreDNS + pause.
		return false
	}

	now := time.Now()
	for _, line := range strings.Split(report, "\n") {
		phase, started, hasStart := strings.Cut(line, "|")
		if strings.TrimSpace(phase) != "Running" || !hasStart {
			return false
		}
		start, err := time.Parse(time.RFC3339, strings.TrimSpace(started))
		if err != nil {
			return false
		}
		if now.Sub(start) < settle {
			return false
		}
	}
	return true
}
// CheckProbeURL issues a GET against url with a 5-second client timeout and
// reports whether the response status was exactly 200 OK.
//
// The probe is opt-in: an empty url yields (true, nil) without making a
// request. If the request itself fails, the result is (false, err) with the
// URL included in the wrapped error; any non-200 response is (false, nil).
func CheckProbeURL(url string) (bool, error) {
	if url == "" {
		return true, nil
	}

	prober := http.Client{Timeout: 5 * time.Second}
	resp, getErr := prober.Get(url)
	if getErr != nil {
		return false, fmt.Errorf("probe URL %s: %w", url, getErr)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return false, nil
	}
	return true, nil
}
// CheckDiskWritable proves that dataDir can take a write: it creates a
// ".update-probe" file there, writes a short marker, fsyncs, closes, reads
// the file back and compares the content. The probe file is removed before
// returning once it has been created. An empty dataDir means
// /var/lib/kubesolo.
//
// It returns (true, nil) only when every step succeeds; otherwise it returns
// false together with an error naming the step that failed.
func CheckDiskWritable(dataDir string) (bool, error) {
	dir := dataDir
	if dir == "" {
		dir = "/var/lib/kubesolo"
	}
	if _, statErr := os.Stat(dir); statErr != nil {
		// A missing directory most likely means the data partition is not
		// mounted; never report that as a healthy disk.
		return false, fmt.Errorf("dataDir %s: %w", dir, statErr)
	}

	probePath := filepath.Join(dir, ".update-probe")
	payload := []byte("kubesolo-os healthcheck probe")

	fh, createErr := os.Create(probePath)
	if createErr != nil {
		return false, fmt.Errorf("create probe: %w", createErr)
	}
	defer os.Remove(probePath)

	// Write then fsync; on either failure close the handle before bailing.
	var stageErr error
	if _, err := fh.Write(payload); err != nil {
		stageErr = fmt.Errorf("write probe: %w", err)
	} else if err := fh.Sync(); err != nil {
		stageErr = fmt.Errorf("fsync probe: %w", err)
	}
	if stageErr != nil {
		fh.Close()
		return false, stageErr
	}
	if closeErr := fh.Close(); closeErr != nil {
		return false, fmt.Errorf("close probe: %w", closeErr)
	}

	readBack, readErr := os.ReadFile(probePath)
	switch {
	case readErr != nil:
		return false, fmt.Errorf("read probe: %w", readErr)
	case string(readBack) != string(payload):
		return false, fmt.Errorf("probe content mismatch: got %q", readBack)
	}
	return true, nil
}

View File

@@ -0,0 +1,77 @@
package health
import (
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"testing"
)
// TestCheckProbeURLEmptyAlwaysPasses verifies that an empty URL is treated
// as "probe disabled": no error and a passing result.
func TestCheckProbeURLEmptyAlwaysPasses(t *testing.T) {
	passed, probeErr := CheckProbeURL("")
	if probeErr != nil {
		t.Fatalf("CheckProbeURL(\"\"): %v", probeErr)
	}
	if passed {
		return
	}
	t.Error("empty probe URL should return ok=true (check disabled)")
}
// TestCheckProbeURL200 verifies that a 200 response from the probed server
// is reported as a pass without an error.
func TestCheckProbeURL200(t *testing.T) {
	alwaysOK := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
	}
	srv := httptest.NewServer(http.HandlerFunc(alwaysOK))
	defer srv.Close()

	passed, probeErr := CheckProbeURL(srv.URL)
	if probeErr != nil {
		t.Fatalf("CheckProbeURL: %v", probeErr)
	}
	if passed {
		return
	}
	t.Error("expected ok=true on 200")
}
// TestCheckProbeURLNon200 verifies that a reachable server answering with a
// non-200 status (503 here) is a failed probe but not an error.
func TestCheckProbeURLNon200(t *testing.T) {
	unavailable := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusServiceUnavailable)
	}
	srv := httptest.NewServer(http.HandlerFunc(unavailable))
	defer srv.Close()

	passed, probeErr := CheckProbeURL(srv.URL)
	if probeErr != nil {
		t.Fatalf("CheckProbeURL: %v", probeErr)
	}
	if !passed {
		return
	}
	t.Error("expected ok=false on 503")
}
// TestCheckProbeURLNetworkError verifies that a connection failure surfaces
// as an error from CheckProbeURL.
func TestCheckProbeURLNetworkError(t *testing.T) {
	// Port 1 is reserved (tcpmux) and never bound by Linux defaults.
	const unreachable = "http://127.0.0.1:1"

	if _, probeErr := CheckProbeURL(unreachable); probeErr == nil {
		t.Error("expected error for unreachable URL, got nil")
	}
}
// TestCheckDiskWritableHappyPath verifies that a fresh temp directory is
// reported writable and that the probe file does not outlive the check.
func TestCheckDiskWritableHappyPath(t *testing.T) {
	scratch := t.TempDir()

	writable, checkErr := CheckDiskWritable(scratch)
	if checkErr != nil {
		t.Fatalf("CheckDiskWritable: %v", checkErr)
	}
	if !writable {
		t.Error("expected ok=true on writable temp dir")
	}

	// Probe file should have been cleaned up.
	leftover := filepath.Join(scratch, ".update-probe")
	_, statErr := os.Stat(leftover)
	if !os.IsNotExist(statErr) {
		t.Errorf("probe file not cleaned up: stat err=%v", statErr)
	}
}
// TestCheckDiskWritableMissingDir verifies that a non-existent data
// directory is reported as an error.
func TestCheckDiskWritableMissingDir(t *testing.T) {
	const absent = "/this/path/does/not/exist"

	if _, checkErr := CheckDiskWritable(absent); checkErr == nil {
		t.Error("expected error for missing dataDir, got nil")
	}
}

View File

@@ -24,15 +24,20 @@ import (
// Status represents the result of a health check.
type Status struct {
Containerd bool
APIServer bool
NodeReady bool
Message string
Containerd bool
APIServer bool
NodeReady bool
KubeSystemReady bool // optional — true unless KubeSystemSettle is non-zero
ProbeURL bool // optional — true unless ProbeURL is set
DiskWritable bool // optional — true unless DataDir is set
Message string
}
// IsHealthy returns true if all checks passed.
// IsHealthy returns true if all required checks passed. Optional checks
// default to true when not configured, so they don't block the result.
func (s *Status) IsHealthy() bool {
return s.Containerd && s.APIServer && s.NodeReady
return s.Containerd && s.APIServer && s.NodeReady &&
s.KubeSystemReady && s.ProbeURL && s.DiskWritable
}
// Checker performs health checks against the local KubeSolo instance.
@@ -40,6 +45,11 @@ type Checker struct {
kubeconfigPath string
apiServerAddr string
timeout time.Duration
// Optional gates. Zero values disable the check (it reports true).
KubeSystemSettle time.Duration
ProbeURL string
DataDir string
}
// NewChecker creates a health checker.
@@ -149,12 +159,37 @@ func (c *Checker) CheckNodeReady() bool {
}
// RunAll performs all health checks and returns the combined status.
//
// Optional checks (kube-system settle, user probe URL, disk writability) are
// only run if the corresponding Checker fields are set; otherwise they
// report true so as not to block the result.
func (c *Checker) RunAll() *Status {
return &Status{
Containerd: c.CheckContainerd(),
APIServer: c.CheckAPIServer(),
NodeReady: c.CheckNodeReady(),
s := &Status{
Containerd: c.CheckContainerd(),
APIServer: c.CheckAPIServer(),
NodeReady: c.CheckNodeReady(),
KubeSystemReady: true,
ProbeURL: true,
DiskWritable: true,
}
if c.KubeSystemSettle > 0 {
s.KubeSystemReady = c.CheckKubeSystemReady(c.KubeSystemSettle)
}
if c.ProbeURL != "" {
ok, err := CheckProbeURL(c.ProbeURL)
if err != nil {
slog.Warn("probe URL check failed", "url", c.ProbeURL, "error", err)
}
s.ProbeURL = ok
}
if c.DataDir != "" {
ok, err := CheckDiskWritable(c.DataDir)
if err != nil {
slog.Warn("disk writability check failed", "dir", c.DataDir, "error", err)
}
s.DiskWritable = ok
}
return s
}
// WaitForHealthy polls health checks until all pass or timeout expires.

View File

@@ -6,36 +6,42 @@ import (
)
func TestStatusIsHealthy(t *testing.T) {
// Helper for the new 6-field Status: all-true except the named one.
allBut := func(field string) Status {
s := Status{
Containerd: true, APIServer: true, NodeReady: true,
KubeSystemReady: true, ProbeURL: true, DiskWritable: true,
}
switch field {
case "Containerd":
s.Containerd = false
case "APIServer":
s.APIServer = false
case "NodeReady":
s.NodeReady = false
case "KubeSystemReady":
s.KubeSystemReady = false
case "ProbeURL":
s.ProbeURL = false
case "DiskWritable":
s.DiskWritable = false
}
return s
}
tests := []struct {
name string
status Status
wantHealth bool
}{
{
name: "all healthy",
status: Status{Containerd: true, APIServer: true, NodeReady: true},
wantHealth: true,
},
{
name: "containerd down",
status: Status{Containerd: false, APIServer: true, NodeReady: true},
wantHealth: false,
},
{
name: "apiserver down",
status: Status{Containerd: true, APIServer: false, NodeReady: true},
wantHealth: false,
},
{
name: "node not ready",
status: Status{Containerd: true, APIServer: true, NodeReady: false},
wantHealth: false,
},
{
name: "all down",
status: Status{Containerd: false, APIServer: false, NodeReady: false},
wantHealth: false,
},
{"all healthy", allBut(""), true},
{"containerd down", allBut("Containerd"), false},
{"apiserver down", allBut("APIServer"), false},
{"node not ready", allBut("NodeReady"), false},
{"kube-system not ready", allBut("KubeSystemReady"), false},
{"probe URL failed", allBut("ProbeURL"), false},
{"disk not writable", allBut("DiskWritable"), false},
{"all down", Status{}, false},
}
for _, tt := range tests {

View File

@@ -0,0 +1,51 @@
package health
import (
"context"
"fmt"
"os"
"os/exec"
"strings"
"time"
)
// NodeBlockLabel is the well-known label that workload authors set on the
// local node to defer an OS update. When present and "true", apply refuses.
const NodeBlockLabel = "updates.kubesolo.io/block"

// CheckNodeBlocked returns (blocked, error). blocked==true means the local
// node carries the NodeBlockLabel label with value "true" and the caller
// should refuse the update.
//
// An empty kubeconfigPath selects
// /var/lib/kubesolo/pki/admin/admin.kubeconfig.
//
// If the kubeconfig cannot be stat'ed (offline / pre-boot / air-gap), this
// returns (false, nil) — silently allowing the update. That's the safe
// behaviour for the air-gap case where the node may not be reachable from
// the agent's perspective.
//
// If kubectl fails or does not finish within 10 seconds, this returns
// (false, err): the node is treated as not blocked, and the error is
// surfaced so the caller can log it.
func CheckNodeBlocked(kubeconfigPath string) (bool, error) {
	if kubeconfigPath == "" {
		kubeconfigPath = "/var/lib/kubesolo/pki/admin/admin.kubeconfig"
	}
	if _, err := os.Stat(kubeconfigPath); err != nil {
		// No kubeconfig — assume air-gap / pre-K8s. Don't block updates.
		return false, nil
	}

	// Build the query from NodeBlockLabel so the label that is checked can
	// never drift from the label that is documented and reported. Dots in
	// the key are escaped so jsonpath treats them as part of the key name
	// rather than as path separators.
	labelQuery := "jsonpath={.items[0].metadata.labels." +
		strings.ReplaceAll(NodeBlockLabel, ".", `\.`) + "}"

	// We don't know the node name a priori, so we use --kubeconfig on the
	// local admin config and ask for the first (and, KubeSolo being
	// single-node by design, only) node.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	cmd := exec.CommandContext(ctx, "kubectl",
		"--kubeconfig", kubeconfigPath,
		"get", "node",
		"-o", labelQuery)
	out, err := cmd.Output()
	if err != nil {
		// API unreachable or no nodes — treat as not blocked (analogous to
		// the kubeconfig-missing case). We still surface the error so the
		// caller can decide to log it.
		return false, fmt.Errorf("query node label: %w", err)
	}
	return strings.TrimSpace(string(out)) == "true", nil
}

View File

@@ -0,0 +1,34 @@
package partition
import (
	"fmt"
	"math/bits"
	"syscall"
)
// FreeBytes returns the number of free bytes available on the filesystem
// containing `path`. Uses statfs(2); a failing statfs call (for example a
// non-existent path) is returned as a wrapped error with a zero byte count.
func FreeBytes(path string) (uint64, error) {
	var stat syscall.Statfs_t
	if err := syscall.Statfs(path, &stat); err != nil {
		return 0, fmt.Errorf("statfs %s: %w", path, err)
	}
	// Bavail is the count of free blocks available to non-root users —
	// matches what `df` reports. Bsize is the block size in bytes.
	//nolint:unconvert // Bavail is uint64 on most platforms but int64 on darwin/freebsd
	return uint64(stat.Bavail) * uint64(stat.Bsize), nil
}

// HasFreeSpaceFor reports whether `path`'s filesystem has at least `wantBytes`
// of free space, with `headroomPct` reserved (e.g. 10 = require 110% of want).
// Returns the available bytes alongside, so callers can render a useful error.
//
// The requirement is floor(wantBytes * (100+headroomPct) / 100), computed
// with a 128-bit intermediate product so that a very large request cannot
// wrap around to a small number and be accepted by mistake. A requirement
// that does not fit in 64 bits is reported as (avail, false, nil).
//
// Errors: a statfs failure yields (0, false, err); a negative wantBytes or
// headroomPct yields (avail, false, err).
func HasFreeSpaceFor(path string, wantBytes int64, headroomPct int) (avail uint64, ok bool, err error) {
	avail, err = FreeBytes(path)
	if err != nil {
		return 0, false, err
	}
	if wantBytes < 0 {
		return avail, false, fmt.Errorf("invalid wantBytes %d", wantBytes)
	}
	if headroomPct < 0 {
		// Converted to uint64 a negative headroom would become an enormous
		// multiplier; reject it like a negative size.
		return avail, false, fmt.Errorf("invalid headroomPct %d", headroomPct)
	}

	hi, lo := bits.Mul64(uint64(wantBytes), 100+uint64(headroomPct))
	if hi >= 100 {
		// The quotient would exceed 64 bits (and bits.Div64 would panic):
		// no filesystem can satisfy such a request.
		return avail, false, nil
	}
	required, _ := bits.Div64(hi, lo, 100)
	return avail, avail >= required, nil
}

View File

@@ -0,0 +1,44 @@
package partition
import "testing"
// TestFreeBytesReturnsNonZeroOnTempDir verifies that FreeBytes succeeds on a
// temp directory and reports a plausible amount of free space.
func TestFreeBytesReturnsNonZeroOnTempDir(t *testing.T) {
	free, statErr := FreeBytes(t.TempDir())
	if statErr != nil {
		t.Fatalf("FreeBytes: %v", statErr)
	}
	// On any sane test runner the temp filesystem has more than 1 KiB free.
	const floor = 1024
	if free >= floor {
		return
	}
	t.Errorf("FreeBytes = %d, want > 1024 on /tmp", free)
}
// TestFreeBytesNonExistentPath verifies that FreeBytes reports an error for
// a path that does not exist.
func TestFreeBytesNonExistentPath(t *testing.T) {
	const absent = "/this/path/does/not/exist/at/all"

	if _, statErr := FreeBytes(absent); statErr == nil {
		t.Error("expected error for missing path, got nil")
	}
}
// TestHasFreeSpaceForRejectsHugeRequest verifies that an absurdly large
// request is reported as not fitting, without an error.
func TestHasFreeSpaceForRejectsHugeRequest(t *testing.T) {
	// Request 1 PiB with 10% headroom on /tmp — no test runner has that
	// much free, so this should consistently report not-enough.
	const onePiB = int64(1) << 50

	free, fits, checkErr := HasFreeSpaceFor(t.TempDir(), onePiB, 10)
	if checkErr != nil {
		t.Fatalf("HasFreeSpaceFor: %v", checkErr)
	}
	if !fits {
		return
	}
	t.Errorf("expected insufficient space for 1PiB, got avail=%d ok=true", free)
}
// TestHasFreeSpaceForAcceptsSmallRequest verifies that a tiny request is
// reported as fitting on a temp directory.
func TestHasFreeSpaceForAcceptsSmallRequest(t *testing.T) {
	// 1 KiB with 10% headroom = 1.1 KiB. Any temp dir has this.
	const oneKiB = int64(1024)

	_, fits, checkErr := HasFreeSpaceFor(t.TempDir(), oneKiB, 10)
	if checkErr != nil {
		t.Fatalf("HasFreeSpaceFor: %v", checkErr)
	}
	if fits {
		return
	}
	t.Error("expected sufficient space for 1KiB on /tmp")
}

View File

@@ -92,6 +92,12 @@ type UpdateState struct {
// AttemptCount counts attempts at the current ToVersion. Reset when
// ToVersion changes or on successful completion.
AttemptCount int `json:"attempt_count"`
// HealthCheckFailures counts consecutive post-Activated healthcheck
// failures. Reset to 0 on a successful healthcheck or after a rollback.
// Used by `kubesolo-update healthcheck --auto-rollback-after N` to
// trigger automatic recovery on a wedged new boot.
HealthCheckFailures int `json:"health_check_failures,omitempty"`
}
// New returns a fresh Idle state with UpdatedAt set to now.