// Package health implements post-boot health checks for KubeSolo OS. // // After booting a new system partition, the health check verifies that: // - containerd is running and responsive // - KubeSolo API server is reachable // - The Kubernetes node reaches Ready state // // If all checks pass, the GRUB environment is updated to mark the boot // as successful (boot_success=1). If any check fails, boot_success // remains 0 and GRUB will eventually roll back. package health import ( "context" "fmt" "log/slog" "net" "net/http" "os" "os/exec" "strings" "time" ) // Status represents the result of a health check. type Status struct { Containerd bool APIServer bool NodeReady bool Message string } // IsHealthy returns true if all checks passed. func (s *Status) IsHealthy() bool { return s.Containerd && s.APIServer && s.NodeReady } // Checker performs health checks against the local KubeSolo instance. type Checker struct { kubeconfigPath string apiServerAddr string timeout time.Duration } // NewChecker creates a health checker. func NewChecker(kubeconfigPath, apiServerAddr string, timeout time.Duration) *Checker { if kubeconfigPath == "" { kubeconfigPath = "/var/lib/kubesolo/pki/admin/admin.kubeconfig" } if apiServerAddr == "" { apiServerAddr = "127.0.0.1:6443" } if timeout == 0 { timeout = 120 * time.Second } return &Checker{ kubeconfigPath: kubeconfigPath, apiServerAddr: apiServerAddr, timeout: timeout, } } // CheckContainerd verifies that containerd is running. func (c *Checker) CheckContainerd() bool { // Check if containerd socket exists if _, err := os.Stat("/run/containerd/containerd.sock"); err != nil { slog.Warn("containerd socket not found") return false } // Try ctr version (bundled with KubeSolo) ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() cmd := exec.CommandContext(ctx, "ctr", "--address", "/run/containerd/containerd.sock", "version") if err := cmd.Run(); err != nil { slog.Warn("containerd not responsive", "error", err) return false } slog.Debug("containerd healthy") return true } // CheckAPIServer verifies the Kubernetes API server is reachable. func (c *Checker) CheckAPIServer() bool { // TCP connect to API server port conn, err := net.DialTimeout("tcp", c.apiServerAddr, 5*time.Second) if err != nil { slog.Warn("API server not reachable", "addr", c.apiServerAddr, "error", err) return false } conn.Close() // Try HTTPS health endpoint (skip TLS verify for localhost) client := &http.Client{ Timeout: 5 * time.Second, Transport: &http.Transport{ TLSHandshakeTimeout: 5 * time.Second, }, } resp, err := client.Get("https://" + c.apiServerAddr + "/healthz") if err != nil { // TLS error is expected without proper CA, but TCP connect succeeded slog.Debug("API server TCP reachable but HTTPS check skipped", "error", err) return true } defer resp.Body.Close() if resp.StatusCode == http.StatusOK { slog.Debug("API server healthy", "status", resp.StatusCode) return true } slog.Warn("API server unhealthy", "status", resp.StatusCode) return false } // CheckNodeReady uses kubectl to verify the node is in Ready state. func (c *Checker) CheckNodeReady() bool { if _, err := os.Stat(c.kubeconfigPath); err != nil { slog.Warn("kubeconfig not found", "path", c.kubeconfigPath) return false } ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() cmd := exec.CommandContext(ctx, "kubectl", "--kubeconfig", c.kubeconfigPath, "get", "nodes", "-o", "jsonpath={.items[0].status.conditions[?(@.type==\"Ready\")].status}", ) output, err := cmd.Output() if err != nil { slog.Warn("kubectl get nodes failed", "error", err) return false } status := strings.TrimSpace(string(output)) if status == "True" { slog.Debug("node is Ready") return true } slog.Warn("node not Ready", "status", status) return false } // RunAll performs all health checks and returns the combined status. func (c *Checker) RunAll() *Status { return &Status{ Containerd: c.CheckContainerd(), APIServer: c.CheckAPIServer(), NodeReady: c.CheckNodeReady(), } } // WaitForHealthy polls health checks until all pass or timeout expires. func (c *Checker) WaitForHealthy() (*Status, error) { deadline := time.Now().Add(c.timeout) interval := 5 * time.Second slog.Info("waiting for system health", "timeout", c.timeout) for time.Now().Before(deadline) { status := c.RunAll() if status.IsHealthy() { status.Message = "all checks passed" slog.Info("system healthy", "containerd", status.Containerd, "apiserver", status.APIServer, "node_ready", status.NodeReady, ) return status, nil } slog.Debug("health check pending", "containerd", status.Containerd, "apiserver", status.APIServer, "node_ready", status.NodeReady, "remaining", time.Until(deadline).Round(time.Second), ) time.Sleep(interval) } // Final check status := c.RunAll() if status.IsHealthy() { status.Message = "all checks passed" return status, nil } status.Message = "health check timeout" return status, fmt.Errorf("health check timed out after %s", c.timeout) }