package cmd

import (
	"fmt"
	"log/slog"
	"time"

	"github.com/portainer/kubesolo-os/update/pkg/health"
	"github.com/portainer/kubesolo-os/update/pkg/state"
)

// Healthcheck performs post-boot health verification.
// If all checks pass, it marks the boot as successful in GRUB.
// This should be run after every boot (typically via a systemd unit or
// init script) to confirm the system is healthy.
//
// State transition: Activated → Verifying → Success on pass, → Failed on fail.
// If state isn't in Activated (e.g. manual run on a long-stable system), the
// state file is left alone — healthcheck still does its job.
//
// State-file bookkeeping is best-effort: a failed transition or a failed
// error record is logged as a warning and never changes the returned error.
//
// args are the command-line arguments handed to parseOpts.
//
// It returns nil when the boot was already marked successful or when the
// checks pass and the boot is marked successful, the error from
// WaitForHealthy when the checks fail, and a wrapped error when marking the
// boot successful fails.
func Healthcheck(args []string) error {
	opts := parseOpts(args)
	env := opts.NewBootEnv()

	st, err := state.Load(opts.StatePath)
	if err != nil {
		slog.Warn("state file unreadable, starting fresh", "error", err)
		st = state.New()
	}

	// Check if already marked successful. A read failure is logged and not
	// treated as fatal.
	success, err := env.BootSuccess()
	if err != nil {
		slog.Warn("could not read boot_success", "error", err)
	}
	if success {
		fmt.Println("Boot already marked successful")
		return nil
	}

	// Only transition state if we're post-activation. Manual healthcheck on a
	// long-stable system shouldn't reset Idle → Verifying.
	postActivation := st.Phase == state.PhaseActivated

	// recordFailure passes cause to st.RecordError when running
	// post-activation. A failure to record is logged instead of being
	// silently dropped, so it cannot mask the error the caller is returning.
	recordFailure := func(cause error) {
		if !postActivation {
			return
		}
		if recErr := st.RecordError(opts.StatePath, cause); recErr != nil {
			slog.Warn("recording error in state file failed", "error", recErr)
		}
	}

	if postActivation {
		if err := st.Transition(opts.StatePath, state.PhaseVerifying, "", ""); err != nil {
			slog.Warn("state transition failed", "phase", state.PhaseVerifying, "error", err)
		}
	}

	timeout := time.Duration(opts.TimeoutSecs) * time.Second
	checker := health.NewChecker("", "", timeout)

	slog.Info("running post-boot health checks", "timeout", timeout)
	status, err := checker.WaitForHealthy()

	// printStatus reports the per-component results; it is shared by the pass
	// and fail paths so both print the same breakdown.
	printStatus := func() {
		fmt.Printf(" containerd: %v\n", status.Containerd)
		fmt.Printf(" apiserver: %v\n", status.APIServer)
		fmt.Printf(" node_ready: %v\n", status.NodeReady)
	}

	if err != nil {
		fmt.Printf("Health check FAILED: %s\n", status.Message)
		printStatus()
		fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot")
		recordFailure(fmt.Errorf("post-boot health check failed: %s", status.Message))
		return err
	}

	// Mark boot as successful
	if err := env.MarkBootSuccess(); err != nil {
		recordFailure(fmt.Errorf("marking boot success: %w", err))
		return fmt.Errorf("marking boot success: %w", err)
	}

	if postActivation {
		if err := st.Transition(opts.StatePath, state.PhaseSuccess, "", ""); err != nil {
			slog.Warn("state transition failed", "phase", state.PhaseSuccess, "error", err)
		}
	}

	fmt.Println("Health check PASSED — boot marked successful")
	printStatus()
	return nil
}