Some checks failed
ARM64 Build / Build generic ARM64 disk image (push) Failing after 4s
CI / Go Tests (push) Successful in 1m29s
CI / Shellcheck (push) Successful in 48s
CI / Build Go Binaries (amd64, linux, linux-amd64) (push) Successful in 1m12s
CI / Build Go Binaries (arm64, linux, linux-arm64) (push) Has been cancelled
Phase 8 of v0.3. Tightens the update lifecycle on both ends.

Pre-flight (apply.go, before any download):
- Free-space check on the passive partition: image size + 10% headroom must be available. Uses statfs(2) via the new pkg/partition.FreeBytes / HasFreeSpaceFor helpers (tests cover happy path, tiny request, huge request, missing path). Catches corrupted-FS and shrunk-partition cases before we destroy the existing slot data.
- Node-block-label check: refuses if the local K8s node carries the updates.kubesolo.io/block=true label. New pkg/health.CheckNodeBlocked shells out to kubectl per the project's zero-deps stance. Silently bypassed when no kubeconfig is reachable (air-gap case). Skipped by --force.

Healthcheck (extended via new pkg/health/extended.go + preflight.go):
- CheckKubeSystemReady waits until every kube-system pod has held the Running phase for >= N seconds (default 30). Catches "started ok, will crash-loop" bugs that a single-shot phase check misses.
- CheckProbeURL fetches an operator-supplied URL; 200 = pass. Wired through update.conf as healthcheck_url= and cloud-init updates.healthcheck_url.
- CheckDiskWritable writes/fsyncs/reads a 1-KiB probe under /var/lib/kubesolo. Always runs in healthcheck so a wedged data partition fails fast.
- pkg/health.Status grows KubeSystemReady, ProbeURL, DiskWritable booleans. Optional checks default to true in RunAll() so they don't block when unconfigured. health_test.go updated to the new 6-field shape.

Auto-rollback (healthcheck.go):
- state.UpdateState gains HealthCheckFailures (consecutive post-Activated failures). Reset on a clean pass.
- --auto-rollback-after N (also auto_rollback_after= in update.conf) triggers env.ForceRollback() when the failure count reaches the threshold. State transitions to RolledBack with a descriptive LastError. The command still exits with the healthcheck error; the operator/init is expected to reboot.
- Only fires while Phase == Activated. Doesn't second-guess a long-stable system that happens to fail one healthcheck.

config / opts / cloud-init plumbing:
- update.conf gains healthcheck_url= and auto_rollback_after= keys.
- New CLI flags: --healthcheck-url, --auto-rollback-after, --kube-system-settle.
- cloud-init full-config.yaml documents the new updates: subfields.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
234 lines
6.4 KiB
Go
234 lines
6.4 KiB
Go
// Package health implements post-boot health checks for KubeSolo OS.
|
|
//
|
|
// After booting a new system partition, the health check verifies that:
|
|
// - containerd is running and responsive
|
|
// - KubeSolo API server is reachable
|
|
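// - The Kubernetes node reaches Ready state
// - any optional gates configured on the Checker (kube-system settle,
//   probe URL, data-directory writability) pass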
|
|
//
|
|
// If all checks pass, the GRUB environment is updated to mark the boot
|
|
// as successful (boot_success=1). If any check fails, boot_success
|
|
// remains 0 and GRUB will eventually roll back.
|
|
package health
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"net"
|
|
"net/http"
|
|
"os"
|
|
"os/exec"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// Status is the outcome of one round of health checks. Every boolean holds
// the result of a single check; Message is a short human-readable summary
// that WaitForHealthy fills in before returning.
type Status struct {
	// Required checks.
	Containerd bool
	APIServer  bool
	NodeReady  bool

	// Optional checks. RunAll reports each of these as true when the
	// matching Checker field (KubeSystemSettle, ProbeURL and DataDir
	// respectively) is left at its zero value.
	KubeSystemReady bool
	ProbeURL        bool
	DiskWritable    bool

	Message string
}

// IsHealthy reports whether every check recorded in s passed. Optional
// checks that were not configured are stored as true by RunAll, so they
// never turn a healthy result into an unhealthy one.
func (s *Status) IsHealthy() bool {
	results := [...]bool{
		s.Containerd,
		s.APIServer,
		s.NodeReady,
		s.KubeSystemReady,
		s.ProbeURL,
		s.DiskWritable,
	}
	for _, passed := range results {
		if !passed {
			return false
		}
	}
	return true
}
|
|
|
|
// Checker runs health checks against the local KubeSolo instance.
type Checker struct {
	kubeconfigPath string        // kubeconfig passed to kubectl by CheckNodeReady
	apiServerAddr  string        // host:port probed by CheckAPIServer
	timeout        time.Duration // overall budget used by WaitForHealthy

	// Optional gates. Leaving a field at its zero value disables the
	// corresponding check, which RunAll then reports as true.
	KubeSystemSettle time.Duration
	ProbeURL         string
	DataDir          string
}

// NewChecker builds a Checker, substituting a default for every argument
// that is left at its zero value:
//
//	kubeconfigPath  "/var/lib/kubesolo/pki/admin/admin.kubeconfig"
//	apiServerAddr   "127.0.0.1:6443"
//	timeout         120 seconds
//
// The optional gates (KubeSystemSettle, ProbeURL, DataDir) start out
// disabled; callers set them on the returned Checker.
func NewChecker(kubeconfigPath, apiServerAddr string, timeout time.Duration) *Checker {
	// Start from the defaults and overwrite only what the caller supplied.
	chk := &Checker{
		kubeconfigPath: "/var/lib/kubesolo/pki/admin/admin.kubeconfig",
		apiServerAddr:  "127.0.0.1:6443",
		timeout:        120 * time.Second,
	}
	if kubeconfigPath != "" {
		chk.kubeconfigPath = kubeconfigPath
	}
	if apiServerAddr != "" {
		chk.apiServerAddr = apiServerAddr
	}
	if timeout != 0 {
		chk.timeout = timeout
	}
	return chk
}
|
|
|
|
// CheckContainerd verifies that containerd is running.
|
|
func (c *Checker) CheckContainerd() bool {
|
|
// Check if containerd socket exists
|
|
if _, err := os.Stat("/run/containerd/containerd.sock"); err != nil {
|
|
slog.Warn("containerd socket not found")
|
|
return false
|
|
}
|
|
|
|
// Try ctr version (bundled with KubeSolo)
|
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
|
defer cancel()
|
|
|
|
cmd := exec.CommandContext(ctx, "ctr", "--address", "/run/containerd/containerd.sock", "version")
|
|
if err := cmd.Run(); err != nil {
|
|
slog.Warn("containerd not responsive", "error", err)
|
|
return false
|
|
}
|
|
|
|
slog.Debug("containerd healthy")
|
|
return true
|
|
}
|
|
|
|
// CheckAPIServer verifies the Kubernetes API server is reachable.
|
|
func (c *Checker) CheckAPIServer() bool {
|
|
// TCP connect to API server port
|
|
conn, err := net.DialTimeout("tcp", c.apiServerAddr, 5*time.Second)
|
|
if err != nil {
|
|
slog.Warn("API server not reachable", "addr", c.apiServerAddr, "error", err)
|
|
return false
|
|
}
|
|
conn.Close()
|
|
|
|
// Try HTTPS health endpoint (skip TLS verify for localhost)
|
|
client := &http.Client{
|
|
Timeout: 5 * time.Second,
|
|
Transport: &http.Transport{
|
|
TLSHandshakeTimeout: 5 * time.Second,
|
|
},
|
|
}
|
|
|
|
resp, err := client.Get("https://" + c.apiServerAddr + "/healthz")
|
|
if err != nil {
|
|
// TLS error is expected without proper CA, but TCP connect succeeded
|
|
slog.Debug("API server TCP reachable but HTTPS check skipped", "error", err)
|
|
return true
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode == http.StatusOK {
|
|
slog.Debug("API server healthy", "status", resp.StatusCode)
|
|
return true
|
|
}
|
|
|
|
slog.Warn("API server unhealthy", "status", resp.StatusCode)
|
|
return false
|
|
}
|
|
|
|
// CheckNodeReady uses kubectl to verify the node is in Ready state.
|
|
func (c *Checker) CheckNodeReady() bool {
|
|
if _, err := os.Stat(c.kubeconfigPath); err != nil {
|
|
slog.Warn("kubeconfig not found", "path", c.kubeconfigPath)
|
|
return false
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer cancel()
|
|
|
|
cmd := exec.CommandContext(ctx, "kubectl",
|
|
"--kubeconfig", c.kubeconfigPath,
|
|
"get", "nodes",
|
|
"-o", "jsonpath={.items[0].status.conditions[?(@.type==\"Ready\")].status}",
|
|
)
|
|
output, err := cmd.Output()
|
|
if err != nil {
|
|
slog.Warn("kubectl get nodes failed", "error", err)
|
|
return false
|
|
}
|
|
|
|
status := strings.TrimSpace(string(output))
|
|
if status == "True" {
|
|
slog.Debug("node is Ready")
|
|
return true
|
|
}
|
|
|
|
slog.Warn("node not Ready", "status", status)
|
|
return false
|
|
}
|
|
|
|
// RunAll performs all health checks and returns the combined status.
|
|
//
|
|
// Optional checks (kube-system settle, user probe URL, disk writability) are
|
|
// only run if the corresponding Checker fields are set; otherwise they
|
|
// report true so as not to block the result.
|
|
func (c *Checker) RunAll() *Status {
|
|
s := &Status{
|
|
Containerd: c.CheckContainerd(),
|
|
APIServer: c.CheckAPIServer(),
|
|
NodeReady: c.CheckNodeReady(),
|
|
KubeSystemReady: true,
|
|
ProbeURL: true,
|
|
DiskWritable: true,
|
|
}
|
|
if c.KubeSystemSettle > 0 {
|
|
s.KubeSystemReady = c.CheckKubeSystemReady(c.KubeSystemSettle)
|
|
}
|
|
if c.ProbeURL != "" {
|
|
ok, err := CheckProbeURL(c.ProbeURL)
|
|
if err != nil {
|
|
slog.Warn("probe URL check failed", "url", c.ProbeURL, "error", err)
|
|
}
|
|
s.ProbeURL = ok
|
|
}
|
|
if c.DataDir != "" {
|
|
ok, err := CheckDiskWritable(c.DataDir)
|
|
if err != nil {
|
|
slog.Warn("disk writability check failed", "dir", c.DataDir, "error", err)
|
|
}
|
|
s.DiskWritable = ok
|
|
}
|
|
return s
|
|
}
|
|
|
|
// WaitForHealthy polls health checks until all pass or timeout expires.
|
|
func (c *Checker) WaitForHealthy() (*Status, error) {
|
|
deadline := time.Now().Add(c.timeout)
|
|
interval := 5 * time.Second
|
|
|
|
slog.Info("waiting for system health", "timeout", c.timeout)
|
|
|
|
for time.Now().Before(deadline) {
|
|
status := c.RunAll()
|
|
if status.IsHealthy() {
|
|
status.Message = "all checks passed"
|
|
slog.Info("system healthy",
|
|
"containerd", status.Containerd,
|
|
"apiserver", status.APIServer,
|
|
"node_ready", status.NodeReady,
|
|
)
|
|
return status, nil
|
|
}
|
|
|
|
slog.Debug("health check pending",
|
|
"containerd", status.Containerd,
|
|
"apiserver", status.APIServer,
|
|
"node_ready", status.NodeReady,
|
|
"remaining", time.Until(deadline).Round(time.Second),
|
|
)
|
|
|
|
time.Sleep(interval)
|
|
}
|
|
|
|
// Final check
|
|
status := c.RunAll()
|
|
if status.IsHealthy() {
|
|
status.Message = "all checks passed"
|
|
return status, nil
|
|
}
|
|
|
|
status.Message = "health check timeout"
|
|
return status, fmt.Errorf("health check timed out after %s", c.timeout)
|
|
}
|