package cmd

import (
	"fmt"
	"log/slog"
	"time"

	"github.com/portainer/kubesolo-os/update/pkg/health"
	"github.com/portainer/kubesolo-os/update/pkg/state"
)

// Healthcheck performs post-boot health verification.
// If all checks pass, it marks the boot as successful in GRUB.
// This should be run after every boot (typically via a systemd unit or
// init script) to confirm the system is healthy.
//
// State transition: Activated → Verifying → Success on pass, → Failed on fail.
// If state isn't in Activated (e.g. manual run on a long-stable system), the
// state file is left alone — healthcheck still does its job.
//
// When --auto-rollback-after N is set, consecutive post-Activated failures
// are counted in state.HealthCheckFailures. On the Nth failure, the agent
// calls ForceRollback on the boot environment and the operator is expected
// to reboot (this command does not reboot the host — that's policy left to
// systemd/init).
//
// Failures to persist state are logged and never mask the health result:
// the returned error is the health check error, or the error from marking
// the boot successful. It returns nil when the boot was already marked
// successful or when the checks pass and the boot is marked.
func Healthcheck(args []string) error {
	opts := parseOpts(args)
	env := opts.NewBootEnv()

	st, err := state.Load(opts.StatePath)
	if err != nil {
		slog.Warn("state file unreadable, starting fresh", "error", err)
		st = state.New()
	}

	// Check if already marked successful. A read error is only logged; the
	// checks below still run in that case.
	success, err := env.BootSuccess()
	if err != nil {
		slog.Warn("could not read boot_success", "error", err)
	}
	if success {
		fmt.Println("Boot already marked successful")
		return nil
	}

	// Only transition state if we're post-activation. Manual healthcheck on a
	// long-stable system shouldn't reset Idle → Verifying.
	postActivation := st.Phase == state.PhaseActivated
	if postActivation {
		if err := st.Transition(opts.StatePath, state.PhaseVerifying, "", ""); err != nil {
			slog.Warn("state transition failed", "phase", state.PhaseVerifying, "error", err)
		}
	}

	timeout := time.Duration(opts.TimeoutSecs) * time.Second
	checker := health.NewChecker("", "", timeout)
	checker.ProbeURL = opts.HealthcheckURL
	if opts.KubeSystemSettle > 0 {
		checker.KubeSystemSettle = time.Duration(opts.KubeSystemSettle) * time.Second
	}
	// Probe the data partition every healthcheck so a wedged disk fails fast.
	checker.DataDir = "/var/lib/kubesolo"

	slog.Info("running post-boot health checks",
		"timeout", timeout,
		"probe_url", checker.ProbeURL,
		"kube_system_settle", checker.KubeSystemSettle)

	status, err := checker.WaitForHealthy()
	if err != nil {
		// Guard against a nil status on the error path so reporting the
		// failure can never panic; fall back to the error text.
		failMsg := err.Error()
		if status != nil {
			failMsg = status.Message
		}

		fmt.Printf("Health check FAILED: %s\n", failMsg)
		printStatusBreakdown(status)
		fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot")

		if postActivation {
			// Bump the counter before RecordError so it is part of the
			// state that gets recorded.
			st.HealthCheckFailures++
			if rerr := st.RecordError(opts.StatePath, fmt.Errorf("post-boot health check failed: %s", failMsg)); rerr != nil {
				// Best-effort: a state write failure must not hide the
				// health check error, but it must not go unnoticed either.
				slog.Warn("recording health check failure failed", "error", rerr)
			}

			// Auto-rollback escalation. Only trigger when post-Activated;
			// don't second-guess a healthy long-running system.
			if opts.AutoRollbackAfter > 0 && st.HealthCheckFailures >= opts.AutoRollbackAfter {
				slog.Warn("auto-rollback threshold reached",
					"failures", st.HealthCheckFailures,
					"threshold", opts.AutoRollbackAfter)
				if rerr := env.ForceRollback(); rerr != nil {
					slog.Error("auto-rollback failed", "error", rerr)
					return err // return the original healthcheck error
				}
				reason := fmt.Sprintf("auto-rollback after %d healthcheck failures", st.HealthCheckFailures)
				if terr := st.Transition(opts.StatePath, state.PhaseRolledBack, "", reason); terr != nil {
					slog.Warn("state transition failed", "phase", state.PhaseRolledBack, "error", terr)
				}
				fmt.Println("\nAuto-rollback triggered. Reboot to complete the rollback.")
			}
		}
		return err
	}

	// Mark boot as successful
	if err := env.MarkBootSuccess(); err != nil {
		markErr := fmt.Errorf("marking boot success: %w", err)
		if postActivation {
			if rerr := st.RecordError(opts.StatePath, markErr); rerr != nil {
				slog.Warn("recording boot success failure failed", "error", rerr)
			}
		}
		return markErr
	}

	if postActivation {
		// Reset failure counter on a clean pass.
		st.HealthCheckFailures = 0
		if err := st.Transition(opts.StatePath, state.PhaseSuccess, "", ""); err != nil {
			slog.Warn("state transition failed", "phase", state.PhaseSuccess, "error", err)
		}
	}

	fmt.Println("Health check PASSED — boot marked successful")
	printStatusBreakdown(status)
	return nil
}

// printStatusBreakdown emits a human-readable per-check summary on stdout.
//
// The containerd, apiserver and node_ready lines are always printed. The
// kube-system pods, probe URL and disk writable lines are printed only when
// the corresponding field is false, so a passing run stays short.
// NOTE(review): presumably an optional check that was skipped reports true,
// which is why a true value is not printed — confirm against health.Checker.
//
// A nil status prints nothing.
func printStatusBreakdown(s *health.Status) {
	if s == nil {
		return
	}

	fmt.Printf(" containerd: %v\n", s.Containerd)
	fmt.Printf(" apiserver: %v\n", s.APIServer)
	fmt.Printf(" node_ready: %v\n", s.NodeReady)
	if !s.KubeSystemReady {
		fmt.Printf(" kube-system pods: %v\n", s.KubeSystemReady)
	}
	if !s.ProbeURL {
		fmt.Printf(" probe URL: %v\n", s.ProbeURL)
	}
	if !s.DiskWritable {
		fmt.Printf(" disk writable: %v\n", s.DiskWritable)
	}
}