kubesolo-os/update/pkg/health/health.go

// Package health implements post-boot health checks for KubeSolo OS.
//
// After booting a new system partition, the health check verifies that:
//   - containerd is running and responsive
//   - KubeSolo API server is reachable
//   - The Kubernetes node reaches Ready state
//
// If all checks pass, the GRUB environment is updated to mark the boot
// as successful (boot_success=1). If any check fails, boot_success
// remains 0 and GRUB will eventually roll back.
package health

import (
	"context"
	"fmt"
	"log/slog"
	"net"
	"net/http"
	"os"
	"os/exec"
	"strings"
	"time"
)

// Status represents the result of a health check.
type Status struct {
	Containerd bool
	APIServer  bool
	NodeReady  bool
	Message    string
}

// IsHealthy returns true if all checks passed.
func (s *Status) IsHealthy() bool {
	return s.Containerd && s.APIServer && s.NodeReady
}

// Checker performs health checks against the local KubeSolo instance.
type Checker struct {
	kubeconfigPath string
	apiServerAddr  string
	timeout        time.Duration
}

// NewChecker creates a health checker.
func NewChecker(kubeconfigPath, apiServerAddr string, timeout time.Duration) *Checker {
	if kubeconfigPath == "" {
		kubeconfigPath = "/var/lib/kubesolo/pki/admin/admin.kubeconfig"
	}
	if apiServerAddr == "" {
		apiServerAddr = "127.0.0.1:6443"
	}
	if timeout == 0 {
		timeout = 120 * time.Second
	}
	return &Checker{
		kubeconfigPath: kubeconfigPath,
		apiServerAddr:  apiServerAddr,
		timeout:        timeout,
	}
}

// CheckContainerd verifies that containerd is running.
func (c *Checker) CheckContainerd() bool {
	// Check if containerd socket exists
	if _, err := os.Stat("/run/containerd/containerd.sock"); err != nil {
		slog.Warn("containerd socket not found")
		return false
	}

	// Try ctr version (bundled with KubeSolo)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	cmd := exec.CommandContext(ctx, "ctr", "--address", "/run/containerd/containerd.sock", "version")
	if err := cmd.Run(); err != nil {
		slog.Warn("containerd not responsive", "error", err)
		return false
	}

	slog.Debug("containerd healthy")
	return true
}

// CheckAPIServer verifies the Kubernetes API server is reachable.
func (c *Checker) CheckAPIServer() bool {
	// TCP connect to API server port
	conn, err := net.DialTimeout("tcp", c.apiServerAddr, 5*time.Second)
	if err != nil {
		slog.Warn("API server not reachable", "addr", c.apiServerAddr, "error", err)
		return false
	}
	conn.Close()

	// Try HTTPS health endpoint (skip TLS verify for localhost)
	client := &http.Client{
		Timeout: 5 * time.Second,
		Transport: &http.Transport{
			TLSHandshakeTimeout: 5 * time.Second,
		},
	}

	resp, err := client.Get("https://" + c.apiServerAddr + "/healthz")
	if err != nil {
		// TLS error is expected without proper CA, but TCP connect succeeded
		slog.Debug("API server TCP reachable but HTTPS check skipped", "error", err)
		return true
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusOK {
		slog.Debug("API server healthy", "status", resp.StatusCode)
		return true
	}

	slog.Warn("API server unhealthy", "status", resp.StatusCode)
	return false
}

// CheckNodeReady uses kubectl to verify the node is in Ready state.
func (c *Checker) CheckNodeReady() bool {
	if _, err := os.Stat(c.kubeconfigPath); err != nil {
		slog.Warn("kubeconfig not found", "path", c.kubeconfigPath)
		return false
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	cmd := exec.CommandContext(ctx, "kubectl",
		"--kubeconfig", c.kubeconfigPath,
		"get", "nodes",
		"-o", "jsonpath={.items[0].status.conditions[?(@.type==\"Ready\")].status}",
	)
	output, err := cmd.Output()
	if err != nil {
		slog.Warn("kubectl get nodes failed", "error", err)
		return false
	}

	status := strings.TrimSpace(string(output))
	if status == "True" {
		slog.Debug("node is Ready")
		return true
	}

	slog.Warn("node not Ready", "status", status)
	return false
}

// RunAll performs all health checks and returns the combined status.
func (c *Checker) RunAll() *Status {
	return &Status{
		Containerd: c.CheckContainerd(),
		APIServer:  c.CheckAPIServer(),
		NodeReady:  c.CheckNodeReady(),
	}
}

// WaitForHealthy polls health checks until all pass or timeout expires.
func (c *Checker) WaitForHealthy() (*Status, error) {
	deadline := time.Now().Add(c.timeout)
	interval := 5 * time.Second

	slog.Info("waiting for system health", "timeout", c.timeout)

	for time.Now().Before(deadline) {
		status := c.RunAll()
		if status.IsHealthy() {
			status.Message = "all checks passed"
			slog.Info("system healthy",
				"containerd", status.Containerd,
				"apiserver", status.APIServer,
				"node_ready", status.NodeReady,
			)
			return status, nil
		}

		slog.Debug("health check pending",
			"containerd", status.Containerd,
			"apiserver", status.APIServer,
			"node_ready", status.NodeReady,
			"remaining", time.Until(deadline).Round(time.Second),
		)

		time.Sleep(interval)
	}

	// Final check
	status := c.RunAll()
	if status.IsHealthy() {
		status.Message = "all checks passed"
		return status, nil
	}

	status.Message = "health check timeout"
	return status, fmt.Errorf("health check timed out after %s", c.timeout)
}