package health

import (
	"context"
	"fmt"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"time"
)

// kubeSystemSettleSeconds is how long all kube-system pods must hold a
// Running phase before we consider the cluster genuinely up. Catches the
// "pod just started, will crash-loop in 5s" case.
const kubeSystemSettleSeconds = 30

// CheckKubeSystemReady reports whether every pod in the kube-system namespace
// is in the Running phase and has a start time at least settle in the past.
// settle defaults to 30s when zero.
//
// The result is a plain bool: every failure mode (kubeconfig missing, kubectl
// failing or exceeding its 10s timeout, no pods listed, unparsable output) is
// reported as false rather than as an error.
func (c *Checker) CheckKubeSystemReady(settle time.Duration) bool {
	if settle == 0 {
		settle = kubeSystemSettleSeconds * time.Second
	}
	// Without a kubeconfig on disk kubectl cannot reach the API server.
	if _, err := os.Stat(c.kubeconfigPath); err != nil {
		return false
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// jsonpath emits one line per pod: <phase>|<startTime>
	cmd := exec.CommandContext(ctx, "kubectl",
		"--kubeconfig", c.kubeconfigPath,
		"get", "pods", "-n", "kube-system",
		"-o", `jsonpath={range .items[*]}{.status.phase}|{.status.startTime}{"\n"}{end}`,
	)
	out, err := cmd.Output()
	if err != nil {
		return false
	}
	return kubeSystemPodsSettled(string(out), time.Now(), settle)
}

// kubeSystemPodsSettled interprets kubectl output consisting of one
// "<phase>|<startTime>" line per pod. It returns true only when at least one
// pod is listed and every pod is Running with an RFC 3339 start time that is
// at least settle before now.
func kubeSystemPodsSettled(out string, now time.Time, settle time.Duration) bool {
	trimmed := strings.TrimSpace(out)
	if trimmed == "" {
		// No pods reported. Conservatively treat as not-ready: kube-system
		// is expected to host at least CoreDNS + pause.
		return false
	}

	for _, line := range strings.Split(trimmed, "\n") {
		parts := strings.SplitN(line, "|", 2)
		if len(parts) < 2 {
			return false
		}
		if strings.TrimSpace(parts[0]) != "Running" {
			return false
		}
		start, err := time.Parse(time.RFC3339, strings.TrimSpace(parts[1]))
		if err != nil {
			return false
		}
		// The pod has not yet held its place for the full settle window.
		if now.Sub(start) < settle {
			return false
		}
	}
	return true
}

// CheckProbeURL fetches the given URL and reports whether it returned 200.
// Empty url returns (true, nil) — the check is opt-in.
func CheckProbeURL(url string) (bool, error) {
	// No URL configured: the probe is opt-in, so there is nothing to fail.
	if url == "" {
		return true, nil
	}

	prober := &http.Client{Timeout: 5 * time.Second}
	resp, getErr := prober.Get(url)
	if getErr != nil {
		return false, fmt.Errorf("probe URL %s: %w", url, getErr)
	}
	defer resp.Body.Close()

	// Only an exact 200 counts as healthy; any other status is a clean
	// "not healthy" result rather than an error.
	healthy := resp.StatusCode == http.StatusOK
	return healthy, nil
}

// CheckDiskWritable exercises the data directory end to end: it creates a
// small probe file under dataDir, writes a known payload, fsyncs and closes
// it, reads the payload back, and finally removes the file. An empty dataDir
// means /var/lib/kubesolo.
//
// It returns (true, nil) when the payload round-trips intact. Any failing
// step yields (false, err) with the step named in the error.
func CheckDiskWritable(dataDir string) (bool, error) {
	const (
		defaultDataDir = "/var/lib/kubesolo"
		probeName      = ".update-probe"
		payload        = "kubesolo-os healthcheck probe"
	)

	if dataDir == "" {
		dataDir = defaultDataDir
	}
	// A missing directory most likely means the data partition is not
	// mounted; that must never be reported as a healthy disk.
	if _, statErr := os.Stat(dataDir); statErr != nil {
		return false, fmt.Errorf("dataDir %s: %w", dataDir, statErr)
	}

	probePath := filepath.Join(dataDir, probeName)
	file, createErr := os.Create(probePath)
	if createErr != nil {
		return false, fmt.Errorf("create probe: %w", createErr)
	}
	// Best-effort cleanup once the file exists, whatever happens next.
	defer os.Remove(probePath)

	if _, writeErr := file.WriteString(payload); writeErr != nil {
		file.Close()
		return false, fmt.Errorf("write probe: %w", writeErr)
	}
	if syncErr := file.Sync(); syncErr != nil {
		file.Close()
		return false, fmt.Errorf("fsync probe: %w", syncErr)
	}
	if closeErr := file.Close(); closeErr != nil {
		return false, fmt.Errorf("close probe: %w", closeErr)
	}

	got, readErr := os.ReadFile(probePath)
	if readErr != nil {
		return false, fmt.Errorf("read probe: %w", readErr)
	}
	if string(got) != payload {
		return false, fmt.Errorf("probe content mismatch: got %q", got)
	}
	return true, nil
}