// kubesolo-os/update/cmd/healthcheck.go

package cmd

import (
	"fmt"
	"log/slog"
	"time"

	"github.com/portainer/kubesolo-os/update/pkg/health"
	"github.com/portainer/kubesolo-os/update/pkg/state"
)

// Healthcheck performs post-boot health verification.
// If all checks pass, it marks the boot as successful in the boot
// environment (env.MarkBootSuccess). This should be run after every boot
// (typically via a systemd unit or init script) to confirm the system is
// healthy.
//
// State transition: Activated → Verifying → Success on pass; on failure the
// error is recorded via st.RecordError. If state isn't in Activated (e.g.
// manual run on a long-stable system), the state file is left alone —
// healthcheck still does its job.
//
// When --auto-rollback-after N is set, post-Activated failures are counted
// in state.HealthCheckFailures. Once the count reaches N, the agent calls
// env.ForceRollback() and the operator is expected to reboot (this command
// does not reboot the host — that's policy left to systemd/init).
//
// args are the raw command-line arguments, parsed by parseOpts.
//
// Returns nil when the boot was already marked successful or when all checks
// pass and the boot is marked successful. Returns the health-check error when
// checks fail (also when auto-rollback itself fails), or a wrapped error when
// marking the boot successful fails. State-file persistence problems are
// logged as warnings and never change the return value.
func Healthcheck(args []string) error {
	opts := parseOpts(args)
	env := opts.NewBootEnv()

	st, err := state.Load(opts.StatePath)
	if err != nil {
		slog.Warn("state file unreadable, starting fresh", "error", err)
		st = state.New()
	}

	// Check if already marked successful. A read error is only logged; the
	// health checks below still run in that case.
	success, err := env.BootSuccess()
	if err != nil {
		slog.Warn("could not read boot_success", "error", err)
	}
	if success {
		fmt.Println("Boot already marked successful")
		return nil
	}

	// Only transition state if we're post-activation. Manual healthcheck on a
	// long-stable system shouldn't reset Idle → Verifying.
	//
	// NOTE(review): once the transition below succeeds the phase is no longer
	// Activated, so a later run will presumably see postActivation == false
	// and never increment HealthCheckFailures past 1 — confirm against the
	// state package that thresholds > 1 are actually reachable.
	postActivation := st.Phase == state.PhaseActivated
	if postActivation {
		if err := st.Transition(opts.StatePath, state.PhaseVerifying, "", ""); err != nil {
			slog.Warn("state transition failed", "phase", state.PhaseVerifying, "error", err)
		}
	}

	timeout := time.Duration(opts.TimeoutSecs) * time.Second
	checker := health.NewChecker("", "", timeout)
	checker.ProbeURL = opts.HealthcheckURL
	if opts.KubeSystemSettle > 0 {
		checker.KubeSystemSettle = time.Duration(opts.KubeSystemSettle) * time.Second
	}
	// Probe the data partition every healthcheck so a wedged disk fails fast.
	checker.DataDir = "/var/lib/kubesolo"

	slog.Info("running post-boot health checks",
		"timeout", timeout,
		"probe_url", checker.ProbeURL,
		"kube_system_settle", checker.KubeSystemSettle)

	status, err := checker.WaitForHealthy()
	if err != nil {
		// A result returned alongside a non-nil error may be nil; fall back
		// to the error text rather than dereferencing it.
		reason := err.Error()
		if status != nil {
			reason = status.Message
		}

		fmt.Printf("Health check FAILED: %s\n", reason)
		if status != nil {
			printStatusBreakdown(status)
		}
		fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot")

		if postActivation {
			st.HealthCheckFailures++
			// Best-effort bookkeeping: a state write failure is logged but
			// must not mask the health-check error returned below.
			if serr := st.RecordError(opts.StatePath, fmt.Errorf("post-boot health check failed: %s", reason)); serr != nil {
				slog.Warn("recording health check failure in state failed", "error", serr)
			}

			// Auto-rollback escalation. Only trigger when post-Activated;
			// don't second-guess a healthy long-running system.
			if opts.AutoRollbackAfter > 0 && st.HealthCheckFailures >= opts.AutoRollbackAfter {
				slog.Warn("auto-rollback threshold reached",
					"failures", st.HealthCheckFailures,
					"threshold", opts.AutoRollbackAfter)
				if rerr := env.ForceRollback(); rerr != nil {
					slog.Error("auto-rollback failed", "error", rerr)
					return err // return the original healthcheck error
				}
				if terr := st.Transition(opts.StatePath, state.PhaseRolledBack, "",
					fmt.Sprintf("auto-rollback after %d healthcheck failures", st.HealthCheckFailures)); terr != nil {
					slog.Warn("state transition failed", "phase", state.PhaseRolledBack, "error", terr)
				}
				fmt.Println("\nAuto-rollback triggered. Reboot to complete the rollback.")
			}
		}
		return err
	}

	// Mark boot as successful
	if err := env.MarkBootSuccess(); err != nil {
		if postActivation {
			// Best-effort: log a state write failure, still return the
			// MarkBootSuccess error to the caller.
			if serr := st.RecordError(opts.StatePath, fmt.Errorf("marking boot success: %w", err)); serr != nil {
				slog.Warn("recording boot-success failure in state failed", "error", serr)
			}
		}
		return fmt.Errorf("marking boot success: %w", err)
	}

	if postActivation {
		// Reset failure counter on a clean pass.
		st.HealthCheckFailures = 0
		if err := st.Transition(opts.StatePath, state.PhaseSuccess, "", ""); err != nil {
			slog.Warn("state transition failed", "phase", state.PhaseSuccess, "error", err)
		}
	}

	fmt.Println("Health check PASSED — boot marked successful")
	printStatusBreakdown(status)

	return nil
}

// printStatusBreakdown emits a human-readable per-check summary to stdout.
//
// The containerd, apiserver and node_ready lines are always printed. The
// kube-system pods, probe URL and disk writable lines are printed only when
// the corresponding field is false, so a passing (or, presumably, skipped —
// confirm against the health package) optional check produces no line.
//
// A nil status prints nothing; this keeps the helper safe on error paths
// where the checker may not have produced a result.
func printStatusBreakdown(s *health.Status) {
	if s == nil {
		return
	}

	fmt.Printf("  containerd:        %v\n", s.Containerd)
	fmt.Printf("  apiserver:         %v\n", s.APIServer)
	fmt.Printf("  node_ready:        %v\n", s.NodeReady)

	// Optional checks: only surfaced when not true.
	if !s.KubeSystemReady {
		fmt.Printf("  kube-system pods:  %v\n", s.KubeSystemReady)
	}
	if !s.ProbeURL {
		fmt.Printf("  probe URL:         %v\n", s.ProbeURL)
	}
	if !s.DiskWritable {
		fmt.Printf("  disk writable:     %v\n", s.DiskWritable)
	}
}