feat(update): pre-flight gates + deeper healthcheck + auto-rollback
Some checks failed
ARM64 Build / Build generic ARM64 disk image (push) Failing after 4s
CI / Go Tests (push) Successful in 1m29s
CI / Shellcheck (push) Successful in 48s
CI / Build Go Binaries (amd64, linux, linux-amd64) (push) Successful in 1m12s
CI / Build Go Binaries (arm64, linux, linux-arm64) (push) Has been cancelled

Phase 8 of v0.3. Tightens the update lifecycle on both ends.

Pre-flight (apply.go, before any download):
- Free-space check on the passive partition: image size + 10% headroom must
  be available. Uses statfs(2) via the new pkg/partition.FreeBytes /
  HasFreeSpaceFor helpers (tests cover happy path, tiny request, huge
  request, missing path). Catches corrupted-FS and shrunk-partition cases
  before we destroy the existing slot data.
- Node-block-label check: refuses if the local K8s node carries the
  updates.kubesolo.io/block=true label. New pkg/health.CheckNodeBlocked
  shells out to kubectl per the project's zero-deps stance. Silently bypassed
  when no kubeconfig is reachable (air-gap case). Skipped by --force.

Healthcheck (extended via new pkg/health/extended.go + preflight.go):
- CheckKubeSystemReady waits until every kube-system pod has held the Running
  phase for >= N seconds (default 30). Catches "started ok, will crash-loop"
  bugs that a single-shot phase check misses.
- CheckProbeURL fetches an operator-supplied URL; 200 = pass. Wired through
  update.conf as healthcheck_url= and cloud-init updates.healthcheck_url.
- CheckDiskWritable writes/fsyncs/reads a small (29-byte) probe file under the data directory (default /var/lib/kubesolo).
  Always runs in healthcheck so a wedged data partition fails fast.
- pkg/health.Status grows KubeSystemReady, ProbeURL, DiskWritable booleans.
  Optional checks default to true in RunAll() so they don't block when
  unconfigured. health_test.go updated to the new 6-field shape.

Auto-rollback (healthcheck.go):
- state.UpdateState gains HealthCheckFailures (consecutive post-Activated
  failures). Reset on a clean pass.
- --auto-rollback-after N (also auto_rollback_after= in update.conf) triggers
  env.ForceRollback() when the failure count reaches the threshold. State
  transitions to RolledBack with a descriptive LastError. The command still
  exits with the healthcheck error; the operator/init is expected to reboot.
- Only fires while Phase == Activated. Doesn't second-guess a long-stable
  system that happens to fail one healthcheck.

config / opts / cloud-init plumbing:
- update.conf gains healthcheck_url= and auto_rollback_after= keys.
- New CLI flags: --healthcheck-url, --auto-rollback-after, --kube-system-settle.
- cloud-init full-config.yaml documents the new updates: subfields.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-14 19:08:30 -06:00
parent 28de656b97
commit 9fb894c5af
14 changed files with 595 additions and 48 deletions

View File

@@ -0,0 +1,125 @@
package health
import (
"context"
"fmt"
"net/http"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
)
// kubeSystemSettleSeconds is the default settle window, in seconds: how long
// every kube-system pod must have been Running before the cluster is
// considered genuinely up. CheckKubeSystemReady falls back to this value when
// it is called with a zero settle duration. The window exists to catch the
// "pod just started, will crash-loop in 5s" case.
const kubeSystemSettleSeconds = 30
// CheckKubeSystemReady reports whether every pod in the kube-system namespace
// is in the Running phase and has a start time at least settle in the past.
// A zero settle selects the package default (kubeSystemSettleSeconds).
//
// The result is false when the kubeconfig cannot be stat'ed, when kubectl
// fails or exceeds the 10s timeout, when no pods are listed, or when any pod
// line cannot be parsed. Only the boolean is returned; the underlying cause
// is not surfaced to the caller.
//
// NOTE(review): the age is computed from .status.startTime, which presumably
// is not reset when a container restarts — confirm this catches crash-loops
// as intended.
func (c *Checker) CheckKubeSystemReady(settle time.Duration) bool {
	if settle == 0 {
		settle = kubeSystemSettleSeconds * time.Second
	}

	if _, statErr := os.Stat(c.kubeconfigPath); statErr != nil {
		return false
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// kubectl prints one line per pod, shaped as <phase>|<startTime>.
	const podFormat = `jsonpath={range .items[*]}{.status.phase}|{.status.startTime}{"\n"}{end}`
	raw, runErr := exec.CommandContext(ctx, "kubectl",
		"--kubeconfig", c.kubeconfigPath,
		"get", "pods", "-n", "kube-system",
		"-o", podFormat,
	).Output()
	if runErr != nil {
		return false
	}

	listing := strings.TrimSpace(string(raw))
	if listing == "" {
		// Nothing listed. Be conservative and report not-ready: an empty
		// kube-system namespace is not what a healthy node looks like.
		return false
	}

	now := time.Now()
	// podSettled says whether a single "<phase>|<startTime>" entry describes
	// a Running pod that started at least settle ago.
	podSettled := func(entry string) bool {
		phase, started, found := strings.Cut(entry, "|")
		if !found || strings.TrimSpace(phase) != "Running" {
			return false
		}
		startedAt, parseErr := time.Parse(time.RFC3339, strings.TrimSpace(started))
		return parseErr == nil && now.Sub(startedAt) >= settle
	}

	for _, entry := range strings.Split(listing, "\n") {
		if !podSettled(entry) {
			return false
		}
	}
	return true
}
// CheckProbeURL issues a GET against url (5s client timeout) and reports
// whether the response status was exactly 200 OK.
//
// An empty url means the probe is not configured and yields (true, nil).
// A request that cannot be built or completed yields (false, err) with the
// URL included in the error. Any non-200 response yields (false, nil).
func CheckProbeURL(url string) (bool, error) {
	if len(url) == 0 {
		return true, nil
	}

	probeClient := http.Client{Timeout: 5 * time.Second}
	res, getErr := probeClient.Get(url)
	if getErr != nil {
		return false, fmt.Errorf("probe URL %s: %w", url, getErr)
	}
	defer res.Body.Close()

	passed := res.StatusCode == http.StatusOK
	return passed, nil
}
// CheckDiskWritable proves that dataDir accepts writes by creating a probe
// file named ".update-probe" in it, writing a short payload, fsyncing,
// closing, and reading the payload back. The probe file is removed before
// returning. An empty dataDir selects /var/lib/kubesolo.
//
// It returns (true, nil) only when every step succeeds and the bytes read
// back match what was written. Otherwise it returns false together with an
// error naming the step that failed.
func CheckDiskWritable(dataDir string) (bool, error) {
	if dataDir == "" {
		dataDir = "/var/lib/kubesolo"
	}
	// A directory that cannot even be stat'ed (e.g. the data partition is
	// not mounted) must never be reported as healthy.
	if _, statErr := os.Stat(dataDir); statErr != nil {
		return false, fmt.Errorf("dataDir %s: %w", dataDir, statErr)
	}

	const payload = "kubesolo-os healthcheck probe"
	probePath := filepath.Join(dataDir, ".update-probe")

	file, createErr := os.Create(probePath)
	if createErr != nil {
		return false, fmt.Errorf("create probe: %w", createErr)
	}
	// Best-effort cleanup on every exit path once the file exists.
	defer os.Remove(probePath)

	if _, writeErr := file.WriteString(payload); writeErr != nil {
		file.Close()
		return false, fmt.Errorf("write probe: %w", writeErr)
	}
	if syncErr := file.Sync(); syncErr != nil {
		file.Close()
		return false, fmt.Errorf("fsync probe: %w", syncErr)
	}
	if closeErr := file.Close(); closeErr != nil {
		return false, fmt.Errorf("close probe: %w", closeErr)
	}

	readBack, readErr := os.ReadFile(probePath)
	switch {
	case readErr != nil:
		return false, fmt.Errorf("read probe: %w", readErr)
	case string(readBack) != payload:
		return false, fmt.Errorf("probe content mismatch: got %q", readBack)
	}
	return true, nil
}

View File

@@ -0,0 +1,77 @@
package health
import (
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"testing"
)
// TestCheckProbeURLEmptyAlwaysPasses checks that an empty probe URL is
// treated as "check disabled": no error and ok=true.
func TestCheckProbeURLEmptyAlwaysPasses(t *testing.T) {
	passed, probeErr := CheckProbeURL("")
	switch {
	case probeErr != nil:
		t.Fatalf("CheckProbeURL(\"\"): %v", probeErr)
	case !passed:
		t.Error("empty probe URL should return ok=true (check disabled)")
	}
}
// TestCheckProbeURL200 checks that a probe against a local test server
// answering 200 OK is reported as passing with no error.
func TestCheckProbeURL200(t *testing.T) {
	alwaysOK := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
	}
	server := httptest.NewServer(http.HandlerFunc(alwaysOK))
	defer server.Close()

	passed, probeErr := CheckProbeURL(server.URL)
	if probeErr != nil {
		t.Fatalf("CheckProbeURL: %v", probeErr)
	}
	if !passed {
		t.Error("expected ok=true on 200")
	}
}
// TestCheckProbeURLNon200 checks that a reachable endpoint answering 503 is
// reported as a failed probe (ok=false) without producing an error.
func TestCheckProbeURLNon200(t *testing.T) {
	alwaysUnavailable := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusServiceUnavailable)
	}
	server := httptest.NewServer(http.HandlerFunc(alwaysUnavailable))
	defer server.Close()

	passed, probeErr := CheckProbeURL(server.URL)
	if probeErr != nil {
		t.Fatalf("CheckProbeURL: %v", probeErr)
	}
	if passed {
		t.Error("expected ok=false on 503")
	}
}
// TestCheckProbeURLNetworkError checks that a transport-level failure is
// surfaced as an error.
func TestCheckProbeURLNetworkError(t *testing.T) {
	// TCP port 1 (tcpmux) is reserved and is assumed to have no listener on
	// the test host, so the connection attempt should fail.
	const unreachable = "http://127.0.0.1:1"
	if _, probeErr := CheckProbeURL(unreachable); probeErr == nil {
		t.Error("expected error for unreachable URL, got nil")
	}
}
// TestCheckDiskWritableHappyPath checks that a fresh temporary directory is
// reported writable and that no ".update-probe" file is left behind.
func TestCheckDiskWritableHappyPath(t *testing.T) {
	tmp := t.TempDir()

	writable, checkErr := CheckDiskWritable(tmp)
	if checkErr != nil {
		t.Fatalf("CheckDiskWritable: %v", checkErr)
	}
	if !writable {
		t.Error("expected ok=true on writable temp dir")
	}

	// The probe must not outlive the check.
	leftover := filepath.Join(tmp, ".update-probe")
	_, statErr := os.Stat(leftover)
	if !os.IsNotExist(statErr) {
		t.Errorf("probe file not cleaned up: stat err=%v", statErr)
	}
}
// TestCheckDiskWritableMissingDir checks that a data directory that does not
// exist produces an error.
func TestCheckDiskWritableMissingDir(t *testing.T) {
	const absent = "/this/path/does/not/exist"
	if _, checkErr := CheckDiskWritable(absent); checkErr == nil {
		t.Error("expected error for missing dataDir, got nil")
	}
}

View File

@@ -24,15 +24,20 @@ import (
// Status represents the result of a health check.
type Status struct {
Containerd bool
APIServer bool
NodeReady bool
Message string
Containerd bool
APIServer bool
NodeReady bool
KubeSystemReady bool // optional — true unless KubeSystemSettle is non-zero
ProbeURL bool // optional — true unless ProbeURL is set
DiskWritable bool // optional — true unless DataDir is set
Message string
}
// IsHealthy returns true if all checks passed.
// IsHealthy returns true if all required checks passed. Optional checks
// default to true when not configured, so they don't block the result.
func (s *Status) IsHealthy() bool {
return s.Containerd && s.APIServer && s.NodeReady
return s.Containerd && s.APIServer && s.NodeReady &&
s.KubeSystemReady && s.ProbeURL && s.DiskWritable
}
// Checker performs health checks against the local KubeSolo instance.
@@ -40,6 +45,11 @@ type Checker struct {
kubeconfigPath string
apiServerAddr string
timeout time.Duration
// Optional gates. Zero values disable the check (it reports true).
KubeSystemSettle time.Duration
ProbeURL string
DataDir string
}
// NewChecker creates a health checker.
@@ -149,12 +159,37 @@ func (c *Checker) CheckNodeReady() bool {
}
// RunAll performs all health checks and returns the combined status.
//
// Optional checks (kube-system settle, user probe URL, disk writability) are
// only run if the corresponding Checker fields are set; otherwise they
// report true so as not to block the result.
func (c *Checker) RunAll() *Status {
return &Status{
Containerd: c.CheckContainerd(),
APIServer: c.CheckAPIServer(),
NodeReady: c.CheckNodeReady(),
s := &Status{
Containerd: c.CheckContainerd(),
APIServer: c.CheckAPIServer(),
NodeReady: c.CheckNodeReady(),
KubeSystemReady: true,
ProbeURL: true,
DiskWritable: true,
}
if c.KubeSystemSettle > 0 {
s.KubeSystemReady = c.CheckKubeSystemReady(c.KubeSystemSettle)
}
if c.ProbeURL != "" {
ok, err := CheckProbeURL(c.ProbeURL)
if err != nil {
slog.Warn("probe URL check failed", "url", c.ProbeURL, "error", err)
}
s.ProbeURL = ok
}
if c.DataDir != "" {
ok, err := CheckDiskWritable(c.DataDir)
if err != nil {
slog.Warn("disk writability check failed", "dir", c.DataDir, "error", err)
}
s.DiskWritable = ok
}
return s
}
// WaitForHealthy polls health checks until all pass or timeout expires.

View File

@@ -6,36 +6,42 @@ import (
)
func TestStatusIsHealthy(t *testing.T) {
// Helper for the new 6-field Status: all-true except the named one.
allBut := func(field string) Status {
s := Status{
Containerd: true, APIServer: true, NodeReady: true,
KubeSystemReady: true, ProbeURL: true, DiskWritable: true,
}
switch field {
case "Containerd":
s.Containerd = false
case "APIServer":
s.APIServer = false
case "NodeReady":
s.NodeReady = false
case "KubeSystemReady":
s.KubeSystemReady = false
case "ProbeURL":
s.ProbeURL = false
case "DiskWritable":
s.DiskWritable = false
}
return s
}
tests := []struct {
name string
status Status
wantHealth bool
}{
{
name: "all healthy",
status: Status{Containerd: true, APIServer: true, NodeReady: true},
wantHealth: true,
},
{
name: "containerd down",
status: Status{Containerd: false, APIServer: true, NodeReady: true},
wantHealth: false,
},
{
name: "apiserver down",
status: Status{Containerd: true, APIServer: false, NodeReady: true},
wantHealth: false,
},
{
name: "node not ready",
status: Status{Containerd: true, APIServer: true, NodeReady: false},
wantHealth: false,
},
{
name: "all down",
status: Status{Containerd: false, APIServer: false, NodeReady: false},
wantHealth: false,
},
{"all healthy", allBut(""), true},
{"containerd down", allBut("Containerd"), false},
{"apiserver down", allBut("APIServer"), false},
{"node not ready", allBut("NodeReady"), false},
{"kube-system not ready", allBut("KubeSystemReady"), false},
{"probe URL failed", allBut("ProbeURL"), false},
{"disk not writable", allBut("DiskWritable"), false},
{"all down", Status{}, false},
}
for _, tt := range tests {

View File

@@ -0,0 +1,51 @@
package health
import (
"context"
"fmt"
"os"
"os/exec"
"strings"
"time"
)
// NodeBlockLabel is the well-known label key that workload authors set on the
// local node to defer an OS update. When the label is present with the value
// "true", apply refuses the update.
//
// NOTE(review): CheckNodeBlocked spells this key out again (with escaped
// dots) inside its jsonpath expression rather than referencing this constant;
// keep the two in sync if the key ever changes.
const NodeBlockLabel = "updates.kubesolo.io/block"
// CheckNodeBlocked reports whether the local node carries the
// updates.kubesolo.io/block label with the value "true", in which case the
// caller should refuse the update.
//
// An empty kubeconfigPath selects the KubeSolo admin kubeconfig under
// /var/lib/kubesolo. If that file cannot be stat'ed (offline, pre-boot or
// air-gapped systems) the result is (false, nil): the update is allowed
// without complaint. If kubectl itself fails (API unreachable, no nodes, or
// the 10s timeout expires) the result is (false, err): still not blocked,
// but the error is handed back so the caller can decide whether to log it.
func CheckNodeBlocked(kubeconfigPath string) (bool, error) {
	cfg := kubeconfigPath
	if cfg == "" {
		cfg = "/var/lib/kubesolo/pki/admin/admin.kubeconfig"
	}
	if _, statErr := os.Stat(cfg); statErr != nil {
		// No usable kubeconfig: assume there is no cluster to consult and
		// do not stand in the way of the update.
		return false, nil
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// The node name is not known up front. KubeSolo is single-node by design,
	// so the first (only) item of the node list is the local node.
	const labelQuery = `jsonpath={.items[0].metadata.labels.updates\.kubesolo\.io/block}`
	raw, runErr := exec.CommandContext(ctx, "kubectl",
		"--kubeconfig", cfg,
		"get", "node",
		"-o", labelQuery,
	).Output()
	if runErr != nil {
		return false, fmt.Errorf("query node label: %w", runErr)
	}

	blocked := strings.TrimSpace(string(raw)) == "true"
	return blocked, nil
}