release: v0.3.0

Promote VERSION from 0.3.0-dev to 0.3.0. Finalise CHANGELOG entry with phases 5-8 work (state machine + metrics, channels + maintenance windows, OCI multi-arch distribution, pre-flight gates + deeper healthcheck + auto-rollback). Refresh README quick-start to show both x86_64 and generic ARM64 paths; update the roadmap status table to mark all v0.3 phases complete and explicitly track the v0.3.1 follow-ups (OCI cosign, LABEL=KSOLODATA on ARM64, real-hardware validation). Add docs/release-notes-0.3.0.md as the operator-facing summary, including a v0.2.x -> v0.3.0 migration section (non-breaking on live systems) and the known-limitations list copied from CHANGELOG. All tests green: cloud-init module, all 10 update-module packages, shellcheck across init / build / test / hack scripts under the v0.3 severity policy. Tagging is intentionally NOT done from this commit — that's a manual step so the operator can decide when v0.3.0 is final. After tagging: git tag -a v0.3.0 -m "KubeSolo OS v0.3.0" git push origin v0.3.0 The push triggers .gitea/workflows/build-arm64.yaml which runs the full ARM64 build on the Odroid runner. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
feat(update): pre-flight gates + deeper healthcheck + auto-rollback
2026-05-14 19:13:09 -06:00 · 2026-05-14 19:08:30 -06:00 · 2026-05-14 18:58:38 -06:00 · 2026-05-14 18:21:46 -06:00 · 2026-05-14 18:11:47 -06:00 · 2026-05-14 18:04:10 -06:00
65 changed files with 4997 additions and 374 deletions
--- a/.gitea/workflows/build-arm64.yaml
+++ b/.gitea/workflows/build-arm64.yaml
@@ -0,0 +1,73 @@
 name: ARM64 Build
 # Triggers on push to main and on tags. Skipped on PRs to keep PR feedback fast;
 # manual via Gitea UI ("Run workflow") if needed.
 on:
  push:
    branches: [main]
    tags: ['v*']
  workflow_dispatch:
 jobs:
  build-arm64-generic:
    name: Build generic ARM64 disk image
    # Routes to the Odroid self-hosted runner via the arm64-linux label.
    # See docs/ci-runners.md for runner setup.
    runs-on: arm64-linux
    steps:
      - uses: actions/checkout@v4
      - name: Show host info
        run: |
          uname -a
          nproc
          free -h
          df -h /home /tmp || df -h /
      - name: Verify build prerequisites
        run: |
          # The Odroid runner ships these via apt; this is a sanity check.
          which gcc make bc bison flex cpio gzip xz wget curl mkfs.ext4 mkfs.vfat \
                 sfdisk losetup kpartx grub-mkimage qemu-system-aarch64 git busybox
          ls -la /bin/busybox
          file /bin/busybox | grep -q 'statically linked' || {
              echo "ERROR: /bin/busybox is not statically linked — install busybox-static"
              exit 1
          }
      - name: Build mainline ARM64 kernel
        # Cached in build/cache/kernel-arm64-generic between runs (persistent
        # working dir on the host runner). First run takes 30-60 min; reruns
        # exit immediately once the .config + Image match.
        run: |
          time make kernel-arm64
      - name: Build cross-arch Go binaries
        run: make build-cross
      - name: Prepare generic ARM64 rootfs
        run: sudo make rootfs-arm64
      - name: Build ARM64 UEFI disk image
        run: sudo make disk-image-arm64
      - name: Show output artifact
        run: |
          ls -lh output/
          file output/*.arm64.img
      - name: Boot smoke test (best-effort)
        # KubeSolo's image import deadline can fire under QEMU TCG on the
        # Odroid; the boot itself succeeds through stage 90 every time, but
        # the final "KubeSolo started" health check is timing-sensitive.
        # We mark this continue-on-error until we have KVM or real hardware.
        continue-on-error: true
        run: sudo make test-boot-arm64-disk
      - name: Upload disk image
        if: startsWith(github.ref, 'refs/tags/v')
        uses: actions/upload-artifact@v4
        with:
          name: kubesolo-os-arm64-${{ github.ref_name }}
          path: output/kubesolo-os-*.arm64.img
          retention-days: 90
--- a/.gitea/workflows/ci.yaml
+++ b/.gitea/workflows/ci.yaml
@@ -62,7 +62,8 @@ jobs:
        working-directory: update
      - name: Upload binaries
-        uses: actions/upload-artifact@v4
+        # @v4 not yet fully supported by Gitea Actions runner; @v3 works.
        uses: actions/upload-artifact@v3
        with:
          name: binaries-${{ matrix.suffix }}
          path: |
@@ -78,14 +79,39 @@ jobs:
      - name: Install shellcheck
        run: sudo apt-get update && sudo apt-get install -y shellcheck
      # --severity=error filters out style/info/warning findings. Several of
      # those are unavoidable in init-style scripts that source other files
      # dynamically (SC1090/SC1091/SC2034). Exclude them explicitly so they
      # don't fire even at warning level if we lift severity later.
      # Codes excluded:
      #   SC1090 — non-constant source path (we source by stage name)
      #   SC1091 — source target not specified as input (we reference relative paths)
      #   SC2034 — var "unused" (false positive: used via sourced scripts)
      #   SC2002 — useless cat (style only, very common pattern in our scripts)
      #   SC2015 — A && B || C (deliberate idiom)
      #   SC2012 — use find not ls (style only)
      #   SC2013 — read words not lines (style only, applies to /proc parsing)
      - name: Lint init scripts (POSIX sh)
-        run: shellcheck -s sh init/init.sh init/lib/*.sh init/emergency-shell.sh
+        run: |
          shellcheck -s sh --severity=error \
            -e SC1090,SC1091,SC2034,SC2002,SC2015,SC2012,SC2013 \
            init/init.sh init/lib/*.sh init/emergency-shell.sh
      - name: Lint build scripts (bash)
-        run: shellcheck -s bash build/scripts/*.sh build/config/kernel-audit.sh
+        run: |
          shellcheck -s bash --severity=error \
            -e SC1090,SC1091,SC2034,SC2002,SC2015,SC2012,SC2013 \
            build/scripts/*.sh build/config/kernel-audit.sh
      - name: Lint test scripts (bash)
-        run: shellcheck -s bash test/qemu/*.sh test/integration/*.sh test/kernel/*.sh || true
+        run: |
          shellcheck -s bash --severity=error \
            -e SC1090,SC1091,SC2034,SC2002,SC2015,SC2012,SC2013 \
            test/qemu/*.sh test/integration/*.sh test/kernel/*.sh
      - name: Lint hack scripts (bash)
-        run: shellcheck -s bash hack/*.sh || true
+        run: |
          shellcheck -s bash --severity=error \
            -e SC1090,SC1091,SC2034,SC2002,SC2015,SC2012,SC2013 \
            hack/*.sh
--- a/.gitignore
+++ b/.gitignore
@@ -18,8 +18,19 @@ build/rootfs-work/
 # OS
 .DS_Store
 ._*
 Thumbs.db
 # Photos / screenshots — keep documentation images under docs/ instead
 *.PNG
 *.png
 *.JPG
 *.jpg
 *.JPEG
 *.jpeg
 *.HEIC
 *.heic
 # Go
 update/update-agent
 cloud-init/cloud-init-parser
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,164 @@ All notable changes to KubeSolo OS are documented in this file.
 Format based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 versioning follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 ## [0.3.0] - 2026-05-14
 The main themes: generic ARM64 (not just Raspberry Pi), an honest update
 lifecycle with state file + metrics, OCI multi-arch distribution via ghcr.io,
 and policy gates (channels, maintenance windows, version stepping-stones,
 pre-flight checks, auto-rollback).
 ### Added
 - Generic ARM64 build track distinct from Raspberry Pi:
  - `make kernel-arm64` builds a mainline kernel.org LTS kernel (6.12.10 by
    default) from `arm64 defconfig` + shared `kernel-container.fragment` +
    arm64 virt-host enables (VIRTIO_*, EFI_STUB, NVMe).
  - `make disk-image-arm64` produces a UEFI-bootable raw GPT image with A/B
    system partitions and GRUB-EFI ARM64. Targets QEMU virt, Graviton, Ampere,
    or any UEFI ARM64 host.
  - `hack/dev-vm-arm64.sh --disk` boots the built image through QEMU UEFI for
    end-to-end testing.
  - `test/qemu/test-boot-arm64-disk.sh` automated boot smoke test.
 - Bumped KubeSolo to v1.1.5 (was v1.1.0). New cloud-init flags surfaced:
  - `kubesolo.full` (v1.1.4+) — disable edge-optimised overrides
  - `kubesolo.disable-ipv6` (v1.1.5+)
  - `kubesolo.db-wal-repair` (v1.1.5+) — recover from unclean shutdowns
 - Per-arch supply-chain verification: `KUBESOLO_SHA256_AMD64` and
  `KUBESOLO_SHA256_ARM64` in `versions.env`, applied to the tarball before
  extract.
 - `docs/arm64-architecture.md` — defines the generic-vs-RPi two-track layout.
 - `docs/arm64-status.md` — Phase 3 status snapshot, known limitations, what's
  needed to ship.
 - `docs/ci-runners.md` — Gitea Actions runner setup (Odroid arm64-linux).
 - Update agent state machine and observability (`update/pkg/state`):
  - Persistent on-disk `state.json` at `/var/lib/kubesolo/update/state.json`
    (atomic write via tmp + rename). Records Phase (Idle / Checking /
    Downloading / Staged / Activated / Verifying / Success / RolledBack /
    Failed), FromVersion, ToVersion, StartedAt, UpdatedAt, LastError,
    AttemptCount, HealthCheckFailures.
  - `apply`, `activate`, `healthcheck`, `rollback` all transition state
    explicitly on entry / exit / failure. Errors land in LastError so
    `status` can show why.
  - `kubesolo-update status --json` emits the full state for
    orchestration tooling. Human-readable mode adds an "Update Lifecycle"
    section when not idle.
  - New Prometheus metrics: `kubesolo_update_phase{phase="..."}` (all 9
    phase labels always emitted), `kubesolo_update_attempts_total`,
    `kubesolo_update_last_attempt_timestamp_seconds`.
 - Channels, maintenance windows, version policy (`update/pkg/config`):
  - `/etc/kubesolo/update.conf` (key=value, comments, missing-OK) configures
    server, channel, maintenance_window, pubkey, healthcheck_url,
    auto_rollback_after.
  - `cloud-init` top-level `updates:` block writes `update.conf` on first
    boot. Empty block leaves any existing file alone.
  - `apply` enforces four gates before download: maintenance window,
    channel match, runtime architecture match, min_compatible_version
    stepping-stone. All gate failures land in the state machine as Failed
    with a clear LastError. `--force` bypasses window + node-block-label.
  - `UpdateMetadata` JSON gains `channel`, `min_compatible_version`,
    `architecture` (all optional, omitempty).
 - OCI registry distribution (`update/pkg/oci`, ~280 LOC, 9 tests):
  - `kubesolo-update apply --registry ghcr.io/<org>/kubesolo-os --tag stable`
    pulls update artifacts from any OCI-compliant registry. Multi-arch
    indexes resolve to the runtime.GOARCH-matching manifest automatically.
  - Custom media types: `application/vnd.kubesolo.os.kernel.v1+octet-stream`
    and `application/vnd.kubesolo.os.initramfs.v1+gzip`. Annotations:
    `io.kubesolo.os.{version,channel,architecture,min_compatible_version,
    release_notes,release_date}`.
  - End-to-end digest verification from manifest to blobs via oras-go/v2.
  - `build/scripts/push-oci-artifact.sh` publishes per-arch artifacts via
    `oras`. Multi-arch index composition documented inline.
  - Dependencies added (update module only): oras.land/oras-go/v2 and
    transitive opencontainers/{go-digest,image-spec} + golang.org/x/sync.
 - Pre-flight gates and deeper healthcheck (`update/pkg/health` extended,
  `update/pkg/partition` extended):
  - Free-space pre-flight on the passive partition (image + 10% headroom)
    via `partition.FreeBytes` / `HasFreeSpaceFor`.
  - Node-block-label pre-flight: refuses if the local K8s node carries
    `updates.kubesolo.io/block=true`. Silently allowed when no kubeconfig
    (air-gap). Skipped by `--force`.
  - `CheckKubeSystemReady` waits until every kube-system pod has held
    Running for ≥ N seconds (configurable via
    `--kube-system-settle`).
  - `CheckProbeURL` GETs an operator-supplied URL; 200 = pass. Configurable
    via `--healthcheck-url` or `healthcheck_url=` in update.conf.
  - `CheckDiskWritable` writes / fsyncs / reads / deletes a probe file
    under `/var/lib/kubesolo` to catch a wedged data partition.
  - `--auto-rollback-after N` (also `auto_rollback_after=` in update.conf):
    after N consecutive post-activation healthcheck failures, the agent
    calls `ForceRollback()` and the operator/init reboots. Reset to 0 on
    a clean pass.
 - `.gitea/workflows/build-arm64.yaml` — full ARM64 build on the Odroid
  self-hosted runner. Triggers on push to main, tags, and workflow_dispatch.
  Boot smoke test marked continue-on-error pending KVM or real-hardware
  validation.
 ### Changed
 - `build/scripts/build-kernel-arm64.sh` is now the **generic ARM64** kernel
  build (mainline kernel.org LTS, generic UEFI/virtio).
 - Renamed `build/scripts/build-kernel-rpi.sh` (was `build-kernel-arm64.sh`).
  RPi kernel build (raspberrypi/linux fork, bcm2711_defconfig) lives here now.
 - Renamed `build/config/kernel-container.fragment` (was
  `rpi-kernel-config.fragment`). Misnomer: contents are arch-agnostic and now
  shared across x86, ARM64-generic, and RPi kernels.
 - `build/scripts/build-kernel.sh` (x86) refactored to consume the shared
  fragment via a generic `apply_fragment` function. ~50 lines of duplication
  killed.
 - `KUBESOLO_VERSION` moved out of `fetch-components.sh` defaults into
  `versions.env`. Bumping is now a one-line PR.
 ### Fixed
 - Native ARM64 build hosts (e.g. an Odroid runner) no longer require the x86
  cross-compiler. Both `build-kernel-arm64.sh` and `build-kernel-rpi.sh` detect
  `uname -m` and use the host's gcc directly when arch matches.
 - ARM64 grub.cfg console ordering: `ttyAMA0` is now the primary console
  (`console=ttyS0,... console=ttyAMA0,...`). Init output is now visible on
  QEMU virt and most ARM64 SBCs without further configuration.
 - ARM64 boot: replaced piCore64's `/init` with our staged init at `/init` and
  `/sbin/init`. Previously the kernel ran piCore's TCE handler which
  segfaulted in our environment.
 - ARM64 boot: replaced piCore64's broken dynamic BusyBox with the build
  host's `busybox-static`. piCore's binary triggered EL0 instruction-abort
  panics on QEMU virt under both `-cpu cortex-a72` and `-cpu max`.
 - POSIX-character-class portability: `tr -d '[:space:]'` in
  `30-kernel-modules.sh` and `40-sysctl.sh` replaced with explicit
  `' \t\r\n'`. Ubuntu's busybox-static 1.30.1 doesn't parse `[:space:]` and
  instead deletes the literal characters `[ : s p a c e ]`, which truncated
  module names (`virtio_net` → `virtio_nt`, etc.) and sysctl keys.
 - `inject-kubesolo.sh` no longer copies `init/lib/functions.sh` into
  `init.d/`. Previously the main init loop tried to run it as a stage after
  stage 90 and panicked with "Init completed without exec'ing KubeSolo".
 - ARM64 disk image: `TARGET_ARCH=arm64 create-disk-image.sh` produces
  `BOOTAA64.EFI` via `grub-mkimage -O arm64-efi` (not `bootx64.efi`). Skips
  the BIOS-only `grub-install --target=i386-pc` step.
 - `build/Dockerfile.builder`: added `grub-efi-amd64-bin`, `grub-efi-arm64-bin`,
  `grub-pc-bin`, `grub-common`, `grub2-common`, and `busybox-static` so the
  Docker-based build flow can produce ARM64 disk images and gets the same
  BusyBox swap behaviour as native builds.
 ### Known limitations (deferred to follow-up)
 - **ARM64 LABEL= resolution** doesn't work yet — piCore's `blkid`/`findfs`
  crash in QEMU and our static busybox lacks the applets. Hardcoded
  `/dev/vda4` as a workaround in `build/grub/grub-arm64.cfg`. Production
  fix: ship static `blkid`/`findfs` or replace LABEL resolution with a
  sysfs walk.
 - **AppArmor profile load fails on ARM64** (apparmor_parser ABI mismatch).
  Init reports it; boot continues without enforcement.
 - **OCI signature verification** is deferred. The HTTP transport still
  honours `--pubkey` for `.sig` files; the OCI transport is digest-verified
  end-to-end via oras-go but does not yet consume cosign-style referrer
  attestations. Targeted for v0.3.1.
 - **Real-hardware validation** of the generic ARM64 image is still
  pending. Builds and boots end-to-end under QEMU virt; production
  certification waits on a Graviton / Ampere run.
 - **QEMU TCG performance** can trigger KubeSolo's first-boot image-import
  deadline. Not a defect in the OS itself; real hardware and KVM-accelerated
  QEMU complete the import in seconds.
 ## [0.2.0] - 2026-02-12
 ### Added
--- a/60
+++ b/60
@@ -1,8 +1,8 @@
 .PHONY: all fetch kernel build-cloudinit build-update-agent build-cross rootfs initramfs \
-       iso disk-image oci-image rpi-image \
+       iso disk-image disk-image-arm64 oci-image rpi-image \
-       kernel-arm64 rootfs-arm64 \
+       kernel-arm64 kernel-rpi rootfs-arm64 rootfs-arm64-rpi \
       test-boot test-k8s test-persistence test-deploy test-storage test-security test-all \
-       test-boot-arm64 test-cloudinit test-update-agent \
+       test-boot-arm64 test-boot-arm64-disk test-cloudinit test-update-agent \
       bench-boot bench-resources \
       dev-vm dev-vm-shell dev-vm-arm64 quick docker-build shellcheck \
       kernel-audit clean distclean help
@@ -73,21 +73,43 @@ build-cross:
 	$(BUILD_DIR)/scripts/build-cross.sh
 # =============================================================================
-# ARM64 Raspberry Pi targets
+# ARM64 generic targets (mainline kernel, UEFI, virtio — for cloud / SBCs)
 # =============================================================================
 kernel-arm64:
-	@echo "==> Building ARM64 kernel for Raspberry Pi..."
+	@echo "==> Building generic ARM64 kernel (mainline LTS)..."
 	$(BUILD_DIR)/scripts/build-kernel-arm64.sh
 # Generic ARM64 rootfs consumes the mainline kernel modules.
 rootfs-arm64: build-cross
-	@echo "==> Preparing ARM64 rootfs..."
+	@echo "==> Preparing generic ARM64 rootfs..."
 	TARGET_ARCH=arm64 $(BUILD_DIR)/scripts/fetch-components.sh
 	TARGET_ARCH=arm64 $(BUILD_DIR)/scripts/extract-core.sh
-	TARGET_ARCH=arm64 $(BUILD_DIR)/scripts/inject-kubesolo.sh
+	TARGET_ARCH=arm64 TARGET_VARIANT=generic $(BUILD_DIR)/scripts/inject-kubesolo.sh
-	@echo "==> Packing ARM64 initramfs..."
+	@echo "==> Packing generic ARM64 initramfs..."
 	$(BUILD_DIR)/scripts/pack-initramfs.sh
-rpi-image: rootfs-arm64 kernel-arm64
+disk-image-arm64: rootfs-arm64 kernel-arm64
 	@echo "==> Creating generic ARM64 disk image (UEFI + GRUB A/B)..."
 	TARGET_ARCH=arm64 $(BUILD_DIR)/scripts/create-disk-image.sh
 	@echo "==> Built: $(OUTPUT_DIR)/$(OS_NAME)-$(VERSION).arm64.img"
 # =============================================================================
 # ARM64 Raspberry Pi targets (RPi-patched kernel, firmware blobs, SD card)
 # =============================================================================
 kernel-rpi:
 	@echo "==> Building RPi kernel (raspberrypi/linux)..."
 	$(BUILD_DIR)/scripts/build-kernel-rpi.sh
 # RPi-flavoured rootfs consumes the RPi kernel modules.
 rootfs-arm64-rpi: build-cross
 	@echo "==> Preparing RPi ARM64 rootfs..."
 	TARGET_ARCH=arm64 $(BUILD_DIR)/scripts/fetch-components.sh
 	TARGET_ARCH=arm64 $(BUILD_DIR)/scripts/extract-core.sh
 	TARGET_ARCH=arm64 TARGET_VARIANT=rpi $(BUILD_DIR)/scripts/inject-kubesolo.sh
 	@echo "==> Packing RPi ARM64 initramfs..."
 	$(BUILD_DIR)/scripts/pack-initramfs.sh
 rpi-image: rootfs-arm64-rpi kernel-rpi
 	@echo "==> Creating Raspberry Pi SD card image..."
 	$(BUILD_DIR)/scripts/create-rpi-image.sh
 	@echo "==> Built: $(OUTPUT_DIR)/$(OS_NAME)-$(VERSION).rpi.img"
@@ -127,9 +149,13 @@ test-security: iso
 	test/integration/test-security-hardening.sh $(OUTPUT_DIR)/$(OS_NAME)-$(VERSION).iso
 test-boot-arm64:
-	@echo "==> Testing ARM64 boot in QEMU..."
+	@echo "==> Testing ARM64 boot in QEMU (direct kernel)..."
 	test/qemu/test-boot-arm64.sh
 test-boot-arm64-disk: disk-image-arm64
 	@echo "==> Testing ARM64 UEFI disk boot in QEMU..."
 	test/qemu/test-boot-arm64-disk.sh $(OUTPUT_DIR)/$(OS_NAME)-$(VERSION).arm64.img
 test-all: test-boot test-k8s test-persistence
 # Cloud-init Go tests
@@ -246,10 +272,15 @@ help:
 	@echo "  make quick              Fast rebuild (re-inject + repack + ISO only)"
 	@echo "  make docker-build       Reproducible build inside Docker"
 	@echo ""
 	@echo "Build targets (ARM64 generic — UEFI / cloud / SBCs):"
 	@echo "  make kernel-arm64       Build mainline ARM64 kernel from kernel.org LTS"
 	@echo "  make rootfs-arm64       Prepare generic ARM64 rootfs (mainline kernel modules)"
 	@echo "  make disk-image-arm64   Create UEFI-bootable A/B GPT disk image (.arm64.img)"
 	@echo ""
 	@echo "Build targets (ARM64 Raspberry Pi):"
-	@echo "  make kernel-arm64       Build ARM64 kernel from raspberrypi/linux"
+	@echo "  make kernel-rpi         Build RPi kernel from raspberrypi/linux"
-	@echo "  make rootfs-arm64       Extract + prepare ARM64 rootfs from piCore64"
+	@echo "  make rootfs-arm64-rpi   Prepare RPi-flavoured rootfs (RPi kernel modules)"
-	@echo "  make rpi-image          Create Raspberry Pi SD card image with A/B partitions"
+	@echo "  make rpi-image          Create Raspberry Pi SD card image with A/B autoboot"
 	@echo ""
 	@echo "Test targets:"
 	@echo "  make test-boot          Boot ISO in QEMU, verify boot success"
@@ -262,7 +293,8 @@ help:
 	@echo "  make test-update-agent  Run update agent Go unit tests"
 	@echo "  make test-update        A/B update cycle integration test"
 	@echo "  make test-rollback      Forced rollback integration test"
-	@echo "  make test-boot-arm64    ARM64 boot test in QEMU aarch64"
+	@echo "  make test-boot-arm64    ARM64 boot test (direct kernel, fast)"
 	@echo "  make test-boot-arm64-disk  ARM64 full UEFI disk-boot test"
 	@echo "  make test-all           Run core tests (boot + k8s + persistence)"
 	@echo "  make test-integ         Run full integration suite"
 	@echo "  make bench-boot         Benchmark boot performance (3 runs)"
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 An immutable, bootable Linux distribution purpose-built for [KubeSolo](https://github.com/portainer/kubesolo) — Portainer's ultra-lightweight single-node Kubernetes.
-> **Status:** All 6 phases complete. Boots and runs K8s workloads. Portainer Edge Agent tested and connected.
+> **Status (v0.3.0):** x86_64 and generic ARM64 (UEFI / virtio / mainline kernel) both build and boot end-to-end. Update agent has an explicit state machine, OCI registry distribution alongside HTTP, channel + maintenance-window + version-stepping-stone gates, and auto-rollback. ARM64 Raspberry Pi support remains paused pending physical hardware. See [docs/release-notes-0.3.0.md](docs/release-notes-0.3.0.md) for the full v0.3.0 changelog.
 ## What is this?
@@ -24,23 +24,34 @@ KubeSolo OS combines **Tiny Core Linux** (~11 MB) with **KubeSolo** (single-bina
 ## Quick Start
 ### x86_64 ISO
 ```bash
-# Fetch Tiny Core ISO + KubeSolo binary
+make fetch          # Tiny Core ISO + KubeSolo binary
-make fetch
+make kernel         # Custom kernel (first time only, ~25 min, cached)
 # Build custom kernel (first time only, ~25 min, cached)
 make kernel
 # Build Go binaries
 make build-cloudinit build-update-agent
 # Build bootable ISO
 make rootfs initramfs iso
 # Test in QEMU
 make dev-vm
 ```
 ### Generic ARM64 disk image (v0.3.0+)
 For Graviton / Ampere / generic UEFI ARM64 hosts:
 ```bash
 make kernel-arm64       # Mainline 6.12 LTS kernel (first time only, ~30-60 min)
 make rootfs-arm64       # Mainline kernel modules + KubeSolo arm64
 make disk-image-arm64   # UEFI-bootable A/B GPT image
 make test-boot-arm64-disk  # boot smoke test under qemu-system-aarch64
 ```
 ### Raspberry Pi (work in progress)
 Build path lives at `make kernel-rpi` / `make rpi-image`; needs physical
 hardware to validate the firmware + autoboot.txt path. See
 [docs/arm64-architecture.md](docs/arm64-architecture.md) for the two-track
 build layout.
 Or build everything at once inside Docker:
 ```bash
@@ -227,13 +238,19 @@ Metrics include: `kubesolo_os_info`, `boot_success`, `boot_counter`, `uptime_sec
 | Phase | Scope | Status |
 |-------|-------|--------|
-| 1 | PoC: boot Tiny Core + KubeSolo, verify K8s | Complete |
+| 1 | PoC: boot Tiny Core + KubeSolo, verify K8s | Complete (x86_64) |
 | 2 | Cloud-init Go parser, network, hostname | Complete |
-| 3 | A/B atomic updates, GRUB, rollback agent | Complete |
+| 3 | A/B atomic updates, GRUB, rollback agent | Complete (x86_64) |
 | 4 | Ed25519 signing, Portainer Edge, SSH extension | Complete |
-| 5 | CI/CD, OCI distribution, Prometheus metrics, ARM64 | Complete |
+| 5 | CI/CD, OCI distribution, Prometheus metrics, ARM64 cross-compile | Complete |
-| 6 | Security hardening, AppArmor, ARM64 RPi support | Complete |
+| 6 | Security hardening, AppArmor | Complete |
-| - | Custom kernel build for container runtime fixes | Complete |
+| - | Custom kernel build for container runtime fixes | Complete (x86_64) |
 | 7 | ARM64 generic (mainline kernel, UEFI, virtio) | Complete (v0.3.0, QEMU validated) |
 | 8 | Update engine v2 (state machine, channels, OCI, pre-flight gates) | Complete (v0.3.0) |
 | - | ARM64 Raspberry Pi (custom kernel, firmware, SD card image) | Paused — needs hardware |
 | - | OCI cosign signature verification | Planned for v0.3.1 |
 | - | LABEL=KSOLODATA on ARM64 (replace blkid/findfs path) | Planned for v0.3.1 |
 | - | Real-hardware ARM64 validation (Graviton / Ampere) | Planned for v0.3.1 |
 ## License
--- a/2
+++ b/2
@@ -1 +1 @@
-0.2.0
+0.3.0
--- a/build/Dockerfile.builder
+++ b/build/Dockerfile.builder
@@ -18,6 +18,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    file \
    flex \
    genisoimage \
    grub-common \
    grub-efi-amd64-bin \
    grub-efi-arm64-bin \
    grub-pc-bin \
    grub2-common \
    gzip \
    isolinux \
    iptables \
@@ -35,6 +40,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    apparmor-utils \
    gcc-aarch64-linux-gnu \
    binutils-aarch64-linux-gnu \
    busybox-static \
    git \
    kpartx \
    unzip \
@@ -49,6 +55,13 @@ RUN curl -fsSL "https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz" \
    | tar -C /usr/local -xzf -
 ENV PATH="/usr/local/go/bin:${PATH}"
 # Install oras (OCI artifact CLI) for push-oci-artifact.sh.
 # Bump ORAS_VERSION when pushing breaks or when oras gains useful flags.
 ARG ORAS_VERSION=1.2.3
 RUN curl -fsSL "https://github.com/oras-project/oras/releases/download/v${ORAS_VERSION}/oras_${ORAS_VERSION}_linux_amd64.tar.gz" \
    | tar -C /usr/local/bin -xzf - oras \
    && chmod +x /usr/local/bin/oras
 WORKDIR /build
 COPY . /build
--- a/build/config/rpi-kernel-config.fragment
+++ b/build/config/rpi-kernel-config.fragment
@@ -1,6 +1,15 @@
-# KubeSolo OS — Raspberry Pi kernel config overrides
+# KubeSolo OS — Shared kernel config fragment for container workloads
-# Applied on top of bcm2711_defconfig (Pi 4) or bcm2712_defconfig (Pi 5)
+#
-# These ensure container runtime support is enabled.
+# Applied on top of:
 #   - Tiny Core stock config (x86_64)        via build-kernel.sh
 #   - mainline kernel.org arm64 defconfig    via build-kernel-arm64.sh
 #   - bcm2711_defconfig / bcm2712_defconfig  via build-kernel-rpi.sh
 #
 # All entries here are architecture-agnostic.
 # Apply this fragment twice with `make olddefconfig` between passes — TC's stock
 # config has CONFIG_SECURITY disabled, which causes a single-pass olddefconfig
 # to strip the security subtree before its dependencies (SYSFS, MULTIUSER) are
 # resolved.
 # cgroup v2 (mandatory for containerd/runc)
 CONFIG_CGROUPS=y
@@ -52,6 +61,7 @@ CONFIG_SECURITYFS=y
 CONFIG_SECURITY_NETWORK=y
 CONFIG_SECURITY_APPARMOR=y
 CONFIG_DEFAULT_SECURITY_APPARMOR=y
 CONFIG_LSM=lockdown,yama,apparmor
 # Security: seccomp
 CONFIG_SECCOMP=y
@@ -60,10 +70,21 @@ CONFIG_SECCOMP_FILTER=y
 # Crypto (image verification)
 CONFIG_CRYPTO_SHA256=y
-# Disable unnecessary subsystems for edge appliance
+# Disable unnecessary subsystems for headless edge appliance
 # CONFIG_SOUND is not set
 # CONFIG_DRM is not set
 # CONFIG_KVM is not set
 # CONFIG_MEDIA_SUPPORT is not set
 # CONFIG_WIRELESS is not set
 # CONFIG_WLAN is not set
 # CONFIG_CFG80211 is not set
 # CONFIG_BT is not set
 # CONFIG_NFC is not set
 # CONFIG_INFINIBAND is not set
 # CONFIG_PCMCIA is not set
 # CONFIG_HAMRADIO is not set
 # CONFIG_ISDN is not set
 # CONFIG_ATM is not set
 # CONFIG_INPUT_JOYSTICK is not set
 # CONFIG_INPUT_TABLET is not set
 # CONFIG_FPGA is not set
--- a/build/config/versions.env
+++ b/build/config/versions.env
@@ -9,7 +9,13 @@ TINYCORE_ISO=CorePure64-${TINYCORE_VERSION}.iso
 TINYCORE_ISO_URL=${TINYCORE_MIRROR}/${TINYCORE_VERSION%%.*}.x/${TINYCORE_ARCH}/release/${TINYCORE_ISO}
 # KubeSolo
 # Pinned release tag from https://github.com/portainer/kubesolo/releases.
 # Bump here and re-run `make fetch` to pull a new version.
 KUBESOLO_VERSION=v1.1.5
 KUBESOLO_INSTALL_URL=https://get.kubesolo.io
 # Per-arch SHA256 of the musl tarball (verified at fetch time when non-empty).
 KUBESOLO_SHA256_AMD64=565bd5fd98fc8ce09160e646b55de3493c782d74c0e0c46ccf130ff4bcabab81
 KUBESOLO_SHA256_ARM64=db865a5e9b2617d595f9c2b7d011272edc94587621a9690e2de0f47cc94f0748
 # Build tools (used inside builder container)
 GRUB_VERSION=2.12
@@ -19,7 +25,6 @@ SYSLINUX_VERSION=6.03
 # Populate by running: sha256sum build/cache/<file>
 # Leave empty to skip verification (useful for first fetch)
 TINYCORE_ISO_SHA256=""
 KUBESOLO_SHA256=""
 NETFILTER_TCZ_SHA256=""
 NET_BRIDGING_TCZ_SHA256=""
 IPTABLES_TCZ_SHA256=""
@@ -38,5 +43,13 @@ RPI_FIRMWARE_URL=https://github.com/raspberrypi/firmware/archive/refs/tags/${RPI
 RPI_KERNEL_BRANCH=rpi-6.6.y
 RPI_KERNEL_REPO=https://github.com/raspberrypi/linux
 # Mainline Linux kernel (for generic ARM64 — kernel.org LTS)
 # Bump within the 6.12 LTS series as patch levels release.
 # 6.12 LTS is supported until Dec 2029.
 MAINLINE_KERNEL_VERSION=6.12.10
 MAINLINE_KERNEL_MAJOR=v6.x
 MAINLINE_KERNEL_URL=https://cdn.kernel.org/pub/linux/kernel/${MAINLINE_KERNEL_MAJOR}/linux-${MAINLINE_KERNEL_VERSION}.tar.xz
 MAINLINE_KERNEL_SHA256=""
 # Output naming
 OS_NAME=kubesolo-os
--- a/build/grub/grub-arm64.cfg
+++ b/build/grub/grub-arm64.cfg
@@ -0,0 +1,93 @@
 # KubeSolo OS — GRUB Configuration (ARM64)
 # A/B partition boot with automatic rollback.
 #
 # Same A/B logic as build/grub/grub.cfg; only the console parameters differ
 # (ARM64 PL011 / 16550-compat UART rather than x86 ttyS0).
 #
 # Partition layout:
 #   (hd0,gpt1) — EFI/Boot  (256 MB, FAT32) — contains GRUB + grubenv
 #   (hd0,gpt2) — System A  (512 MB, ext4)  — vmlinuz + kubesolo-os.gz
 #   (hd0,gpt3) — System B  (512 MB, ext4)  — vmlinuz + kubesolo-os.gz
 #   (hd0,gpt4) — Data      (remaining, ext4) — persistent K8s state
 set default=0
 set timeout=3
 load_env
 # --- A/B Rollback Logic (identical to amd64 grub.cfg) ---
 if [ "${boot_success}" != "1" ]; then
    if [ "${boot_counter}" = "0" ]; then
        if [ "${active_slot}" = "A" ]; then
            set active_slot=B
        else
            set active_slot=A
        fi
        save_env active_slot
        set boot_counter=3
        save_env boot_counter
    else
        if [ "${boot_counter}" = "3" ]; then
            set boot_counter=2
        elif [ "${boot_counter}" = "2" ]; then
            set boot_counter=1
        elif [ "${boot_counter}" = "1" ]; then
            set boot_counter=0
        fi
        save_env boot_counter
    fi
 fi
 set boot_success=0
 save_env boot_success
 if [ "${active_slot}" = "A" ]; then
    set root='(hd0,gpt2)'
    set slot_label="System A"
 else
    set root='(hd0,gpt3)'
    set slot_label="System B"
 fi
 # --- ARM64 console string ---
 # Order matters: the LAST `console=` is the primary system console (where /dev/console
 # points and where init's stdout/stderr land). Earlier `console=` entries get mirrored
 # kernel output but don't carry process I/O.
 #
 # Covers Graviton/16550 (ttyS0) as secondary and QEMU virt / PL011 / Ampere (ttyAMA0)
 # as primary. ttyAMA0 must be last for `-nographic` QEMU + most ARM64 SBCs.
 #
 # `quiet` is intentionally omitted from the default entry while we stabilise the
 # generic ARM64 boot path. Add back once boots are reliable.
 menuentry "KubeSolo OS (${slot_label})" {
    echo "Booting KubeSolo OS from ${slot_label}..."
    echo "Boot counter: ${boot_counter}, Boot success: ${boot_success}"
    linux /vmlinuz init=/sbin/init kubesolo.data=/dev/vda4 console=ttyS0,115200 console=ttyAMA0,115200
    initrd /kubesolo-os.gz
 }
 menuentry "KubeSolo OS (${slot_label}) — Debug Mode" {
    echo "Booting KubeSolo OS (debug) from ${slot_label}..."
    linux /vmlinuz kubesolo.data=/dev/vda4 kubesolo.debug console=ttyS0,115200 console=ttyAMA0,115200
    initrd /kubesolo-os.gz
 }
 menuentry "KubeSolo OS — Emergency Shell" {
    echo "Booting to emergency shell..."
    linux /vmlinuz init=/sbin/init kubesolo.shell console=ttyS0,115200 console=ttyAMA0,115200
    initrd /kubesolo-os.gz
 }
 menuentry "KubeSolo OS — Boot Other Slot" {
    if [ "${active_slot}" = "A" ]; then
        set root='(hd0,gpt3)'
        echo "Booting from System B (passive)..."
    else
        set root='(hd0,gpt2)'
        echo "Booting from System A (passive)..."
    fi
    linux /vmlinuz kubesolo.data=/dev/vda4 kubesolo.debug console=ttyS0,115200 console=ttyAMA0,115200
    initrd /kubesolo-os.gz
 }
--- a/build/scripts/build-kernel-arm64.sh
+++ b/build/scripts/build-kernel-arm64.sh
@@ -1,14 +1,20 @@
 #!/bin/bash
-# build-kernel-arm64.sh — Build ARM64 kernel for Raspberry Pi 4/5
+# build-kernel-arm64.sh — Build generic ARM64 kernel (mainline LTS)
 #
-# Uses the official raspberrypi/linux kernel fork with bcm2711_defconfig
+# Builds a Linux kernel from kernel.org mainline LTS source, suitable for:
-# as the base, overlaid with container-critical config options.
+#   - qemu-system-aarch64 -machine virt
 #   - UEFI ARM64 hosts (Ampere, Graviton, generic ARM64 servers)
 #   - Future ARM64 SBCs with UEFI/u-boot generic-distro support
 #
-# Output is cached in $CACHE_DIR/custom-kernel-arm64/ and reused across builds.
+# This is the GENERIC ARM64 build track. For Raspberry Pi specifically
 # (raspberrypi/linux fork, RPi firmware boot path, custom DTBs), see
 # build/scripts/build-kernel-rpi.sh.
 #
 # Output is cached in $CACHE_DIR/kernel-arm64-generic/ and reused across builds.
 #
 # Requirements:
 #   - gcc-aarch64-linux-gnu (cross-compiler)
-#   - Standard kernel build deps (bc, bison, flex, etc.)
+#   - Standard kernel build deps (bc, bison, flex, libelf-dev, libssl-dev)
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -18,94 +24,165 @@ CACHE_DIR="${CACHE_DIR:-$PROJECT_ROOT/build/cache}"
 # shellcheck source=../config/versions.env
 . "$SCRIPT_DIR/../config/versions.env"
-CUSTOM_KERNEL_DIR="$CACHE_DIR/custom-kernel-arm64"
+KVER="$MAINLINE_KERNEL_VERSION"
 CUSTOM_KERNEL_DIR="$CACHE_DIR/kernel-arm64-generic"
 CUSTOM_IMAGE="$CUSTOM_KERNEL_DIR/Image"
 CUSTOM_MODULES="$CUSTOM_KERNEL_DIR/modules"
 CUSTOM_DTBS="$CUSTOM_KERNEL_DIR/dtbs"
 mkdir -p "$CACHE_DIR" "$CUSTOM_KERNEL_DIR"
 # --- Skip if already built ---
-if [ -f "$CUSTOM_IMAGE" ] && [ -d "$CUSTOM_MODULES" ]; then
+if [ -f "$CUSTOM_IMAGE" ] && [ -d "$CUSTOM_MODULES/lib/modules/$KVER" ]; then
-    echo "==> ARM64 kernel already built (cached)"
+    echo "==> Generic ARM64 kernel already built (cached)"
-    echo "    Image: $CUSTOM_IMAGE ($(du -h "$CUSTOM_IMAGE" | cut -f1))"
+    echo "    Image:   $CUSTOM_IMAGE ($(du -h "$CUSTOM_IMAGE" | cut -f1))"
    echo "    Kernel:  $KVER"
    exit 0
 fi
-# --- Verify cross-compiler ---
+# --- Toolchain selection: native on arm64 hosts, cross-compile elsewhere ---
-if ! command -v aarch64-linux-gnu-gcc >/dev/null 2>&1; then
+HOST_ARCH="$(uname -m)"
-    echo "ERROR: aarch64-linux-gnu-gcc not found"
+if [ "$HOST_ARCH" = "aarch64" ] || [ "$HOST_ARCH" = "arm64" ]; then
-    echo "Install: apt-get install gcc-aarch64-linux-gnu"
+    # Native build — use the host's gcc
    if ! command -v gcc >/dev/null 2>&1; then
        echo "ERROR: gcc not found"
        echo "Install: apt-get install build-essential"
        exit 1
    fi
    CROSS_COMPILE=""
    echo "==> Native ARM64 build (host arch: $HOST_ARCH)"
 else
    # Cross-build from x86 — use aarch64 cross-compiler
    if ! command -v aarch64-linux-gnu-gcc >/dev/null 2>&1; then
        echo "ERROR: aarch64-linux-gnu-gcc not found"
        echo "Install: apt-get install gcc-aarch64-linux-gnu"
        exit 1
    fi
    CROSS_COMPILE="aarch64-linux-gnu-"
    echo "==> Cross-building ARM64 kernel from $HOST_ARCH"
 fi
 echo "==> Building generic ARM64 kernel (mainline $KVER)..."
 echo "    Source: $MAINLINE_KERNEL_URL"
 # --- Download mainline kernel source ---
 KERNEL_SRC_ARCHIVE="$CACHE_DIR/linux-${KVER}.tar.xz"
 if [ ! -f "$KERNEL_SRC_ARCHIVE" ]; then
    echo "==> Downloading mainline kernel source (~140 MB)..."
    wget -q --show-progress -O "$KERNEL_SRC_ARCHIVE" "$MAINLINE_KERNEL_URL" 2>/dev/null || \
        curl -fSL "$MAINLINE_KERNEL_URL" -o "$KERNEL_SRC_ARCHIVE"
    echo "    Downloaded: $(du -h "$KERNEL_SRC_ARCHIVE" | cut -f1)"
 else
    echo "==> Kernel source already cached: $(du -h "$KERNEL_SRC_ARCHIVE" | cut -f1)"
 fi
 # --- Verify checksum if pinned ---
 if [ -n "${MAINLINE_KERNEL_SHA256:-}" ]; then
    actual=$(sha256sum "$KERNEL_SRC_ARCHIVE" | awk '{print $1}')
    if [ "$actual" != "$MAINLINE_KERNEL_SHA256" ]; then
        echo "ERROR: Kernel source checksum mismatch"
        echo "  Expected: $MAINLINE_KERNEL_SHA256"
        echo "  Got:      $actual"
        exit 1
    fi
    echo "    Checksum OK"
 fi
 # --- Extract to case-sensitive fs ---
 # The kernel source has files differing only by case (xt_mark.h vs xt_MARK.h).
 # Build in /tmp (ext4 on Linux runners, case-sensitive).
 KERNEL_BUILD_DIR="/tmp/kernel-build-arm64-generic"
 rm -rf "$KERNEL_BUILD_DIR"
 mkdir -p "$KERNEL_BUILD_DIR"
 echo "==> Extracting kernel source..."
 tar -xf "$KERNEL_SRC_ARCHIVE" -C "$KERNEL_BUILD_DIR"
 KERNEL_SRC_DIR=$(find "$KERNEL_BUILD_DIR" -maxdepth 1 -type d -name 'linux-*' | head -1)
 if [ -z "$KERNEL_SRC_DIR" ]; then
    echo "ERROR: Could not find extracted source directory"
    ls -la "$KERNEL_BUILD_DIR"/
    exit 1
 fi
-echo "==> Building ARM64 kernel for Raspberry Pi..."
+cd "$KERNEL_SRC_DIR"
 echo "    Branch: $RPI_KERNEL_BRANCH"
 echo "    Repo:   $RPI_KERNEL_REPO"
-# --- Download kernel source ---
+# --- Base config: arm64 defconfig (generic ARMv8) ---
-KERNEL_SRC_DIR="$CACHE_DIR/rpi-linux-${RPI_KERNEL_BRANCH}"
+echo "==> Applying arm64 defconfig..."
-if [ ! -d "$KERNEL_SRC_DIR" ]; then
+make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" defconfig
-    echo "==> Downloading RPi kernel source (shallow clone)..."
+
-    git clone --depth 1 --branch "$RPI_KERNEL_BRANCH" \
+# --- Apply shared container fragment ---
-        "$RPI_KERNEL_REPO" "$KERNEL_SRC_DIR"
+CONFIG_FRAGMENT="$PROJECT_ROOT/build/config/kernel-container.fragment"
-else
+if [ ! -f "$CONFIG_FRAGMENT" ]; then
-    echo "==> Kernel source already cached"
+    echo "ERROR: Config fragment not found: $CONFIG_FRAGMENT"
    exit 1
 fi
-# --- Build in /tmp for case-sensitivity ---
+apply_fragment() {
-KERNEL_BUILD_DIR="/tmp/kernel-build-arm64"
+    local fragment="$1"
 rm -rf "$KERNEL_BUILD_DIR"
 cp -a "$KERNEL_SRC_DIR" "$KERNEL_BUILD_DIR"
 cd "$KERNEL_BUILD_DIR"
 # --- Apply base config (Pi 4 = bcm2711) ---
 echo "==> Applying bcm2711_defconfig..."
 make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- bcm2711_defconfig
 # --- Apply container config overrides ---
 CONFIG_FRAGMENT="$PROJECT_ROOT/build/config/rpi-kernel-config.fragment"
 if [ -f "$CONFIG_FRAGMENT" ]; then
    echo "==> Applying KubeSolo config overrides..."
    while IFS= read -r line; do
        # Skip comments and empty lines
        case "$line" in \#*|"") continue ;; esac
        key="${line%%=*}"
        value="${line#*=}"
        case "$value" in
            y)   ./scripts/config --enable "$key" ;;
            m)   ./scripts/config --module "$key" ;;
            n)   ./scripts/config --disable "${key#CONFIG_}" ;;
            *)   ./scripts/config --set-str "$key" "$value" ;;
        esac
    done < "$CONFIG_FRAGMENT"
 fi
 # Handle "is not set" comments as disables
 if [ -f "$CONFIG_FRAGMENT" ]; then
    while IFS= read -r line; do
        case "$line" in
            "# CONFIG_"*" is not set")
-                key=$(echo "$line" | sed -n 's/^# \(CONFIG_[A-Z_]*\) is not set$/\1/p')
+                key=$(echo "$line" | sed -n 's/^# \(CONFIG_[A-Z0-9_]*\) is not set$/\1/p')
                [ -n "$key" ] && ./scripts/config --disable "${key#CONFIG_}"
                continue
                ;;
            \#*|"") continue ;;
        esac
-    done < "$CONFIG_FRAGMENT"
+        key="${line%%=*}"
-fi
+        value="${line#*=}"
        case "$value" in
            y) ./scripts/config --enable "$key" ;;
            m) ./scripts/config --module "$key" ;;
            n) ./scripts/config --disable "${key#CONFIG_}" ;;
            *) ./scripts/config --set-str "$key" "$value" ;;
        esac
    done < "$fragment"
 }
-# Resolve dependencies
+echo "==> Applying kernel-container.fragment (pass 1)..."
-make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- olddefconfig
+apply_fragment "$CONFIG_FRAGMENT"
 make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" olddefconfig
-# --- Build kernel + modules + DTBs ---
+echo "==> Applying kernel-container.fragment (pass 2)..."
 apply_fragment "$CONFIG_FRAGMENT"
 make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" olddefconfig
 # --- ARM64 virt-host specific enables ---
 # These are needed for the generic UEFI/virtio boot path but are arch-specific
 # so they live in this script rather than the shared fragment.
 echo "==> Enabling ARM64 virt-host configs..."
 ./scripts/config --enable CONFIG_EFI
 ./scripts/config --enable CONFIG_EFI_STUB
 ./scripts/config --enable CONFIG_VIRTIO
 ./scripts/config --enable CONFIG_VIRTIO_PCI
 ./scripts/config --enable CONFIG_VIRTIO_BLK
 ./scripts/config --enable CONFIG_VIRTIO_NET
 ./scripts/config --enable CONFIG_VIRTIO_CONSOLE
 ./scripts/config --enable CONFIG_VIRTIO_MMIO
 ./scripts/config --enable CONFIG_HW_RANDOM_VIRTIO
 # NVMe for cloud / bare-metal ARM64 hosts that don't use virtio
 ./scripts/config --enable CONFIG_BLK_DEV_NVME
 make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" olddefconfig
 # --- Verify critical configs ---
 echo "==> Verifying critical configs..."
 for cfg in CGROUP_BPF SECURITY_APPARMOR AUDIT VIRTIO_BLK EFI_STUB; do
    if ! grep -q "CONFIG_${cfg}=y" .config; then
        echo "ERROR: CONFIG_${cfg} not set after olddefconfig"
        grep "CONFIG_${cfg}" .config || echo "    (not found)"
        exit 1
    fi
    echo "    CONFIG_${cfg}=y confirmed"
 done
 # --- Build kernel + modules (no DTBs — UEFI hosts use ACPI/virtio) ---
 NPROC=$(nproc 2>/dev/null || echo 4)
 echo ""
 echo "==> Building ARM64 kernel (${NPROC} parallel jobs)..."
-echo "    This may take 20-30 minutes..."
+echo "    This may take 20-40 minutes on a 6-core Odroid..."
-make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -j"$NPROC" Image modules dtbs 2>&1
+make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" -j"$NPROC" Image modules 2>&1
-echo "==> ARM64 kernel build complete"
+echo "==> Kernel build complete"
 # --- Install to staging ---
 echo "==> Installing Image..."
@@ -114,31 +191,16 @@ cp arch/arm64/boot/Image "$CUSTOM_IMAGE"
 echo "==> Installing modules (stripped)..."
 rm -rf "$CUSTOM_MODULES"
 mkdir -p "$CUSTOM_MODULES"
-make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- \
+make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" \
    INSTALL_MOD_STRIP=1 modules_install INSTALL_MOD_PATH="$CUSTOM_MODULES"
-# Remove build/source symlinks
+# Pick up actual kernel version (e.g. 6.12.10 if KVER differs from package suffix)
-KVER=$(ls "$CUSTOM_MODULES/lib/modules/" | head -1)
+ACTUAL_KVER=$(ls "$CUSTOM_MODULES/lib/modules/" | head -1)
-rm -f "$CUSTOM_MODULES/lib/modules/$KVER/build"
+rm -f "$CUSTOM_MODULES/lib/modules/$ACTUAL_KVER/build"
-rm -f "$CUSTOM_MODULES/lib/modules/$KVER/source"
+rm -f "$CUSTOM_MODULES/lib/modules/$ACTUAL_KVER/source"
-# Run depmod
+depmod -a -b "$CUSTOM_MODULES" "$ACTUAL_KVER" 2>/dev/null || true
 depmod -a -b "$CUSTOM_MODULES" "$KVER" 2>/dev/null || true
 echo "==> Installing Device Tree Blobs..."
 rm -rf "$CUSTOM_DTBS"
 mkdir -p "$CUSTOM_DTBS/overlays"
 # Pi 4 DTBs
 cp arch/arm64/boot/dts/broadcom/bcm2711*.dtb "$CUSTOM_DTBS/" 2>/dev/null || true
 # Pi 5 DTBs
 cp arch/arm64/boot/dts/broadcom/bcm2712*.dtb "$CUSTOM_DTBS/" 2>/dev/null || true
 # Overlays we need
 for overlay in disable-wifi disable-bt; do
    [ -f "arch/arm64/boot/dts/overlays/${overlay}.dtbo" ] && \
        cp "arch/arm64/boot/dts/overlays/${overlay}.dtbo" "$CUSTOM_DTBS/overlays/"
 done
 # Save config for reference
 cp .config "$CUSTOM_KERNEL_DIR/.config"
 # --- Clean up ---
@@ -148,11 +210,10 @@ rm -rf "$KERNEL_BUILD_DIR"
 # --- Summary ---
 echo ""
-echo "==> ARM64 kernel build complete:"
+echo "==> Generic ARM64 kernel build complete:"
 echo "    Image:        $CUSTOM_IMAGE ($(du -h "$CUSTOM_IMAGE" | cut -f1))"
-echo "    Kernel ver:   $KVER"
+echo "    Kernel ver:   $ACTUAL_KVER"
-MOD_COUNT=$(find "$CUSTOM_MODULES/lib/modules/$KVER" -name '*.ko*' 2>/dev/null | wc -l)
+MOD_COUNT=$(find "$CUSTOM_MODULES/lib/modules/$ACTUAL_KVER" -name '*.ko*' 2>/dev/null | wc -l)
 echo "    Modules:      $MOD_COUNT"
-echo "    Modules size: $(du -sh "$CUSTOM_MODULES/lib/modules/$KVER" 2>/dev/null | cut -f1)"
+echo "    Modules size: $(du -sh "$CUSTOM_MODULES/lib/modules/$ACTUAL_KVER" 2>/dev/null | cut -f1)"
 echo "    DTBs:         $(ls "$CUSTOM_DTBS"/*.dtb 2>/dev/null | wc -l)"
 echo ""
--- a/build/scripts/build-kernel-rpi.sh
+++ b/build/scripts/build-kernel-rpi.sh
@@ -0,0 +1,174 @@
 #!/bin/bash
 # build-kernel-rpi.sh — Build kernel for Raspberry Pi 4/5 (ARM64)
 #
 # Uses the official raspberrypi/linux kernel fork with bcm2711_defconfig as the
 # base, overlaid with the shared container-config fragment.
 #
 # This is the RPi-specific build track. For generic ARM64 (UEFI / virtio /
 # kernel.org mainline) see build/scripts/build-kernel-arm64.sh.
 #
 # Output is cached in $CACHE_DIR/custom-kernel-rpi/ and reused across builds.
 #
 # Requirements:
 #   - gcc-aarch64-linux-gnu (cross-compiler)
 #   - Standard kernel build deps (bc, bison, flex, etc.)
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 CACHE_DIR="${CACHE_DIR:-$PROJECT_ROOT/build/cache}"
 # shellcheck source=../config/versions.env
 . "$SCRIPT_DIR/../config/versions.env"
 CUSTOM_KERNEL_DIR="$CACHE_DIR/custom-kernel-rpi"
 CUSTOM_IMAGE="$CUSTOM_KERNEL_DIR/Image"
 CUSTOM_MODULES="$CUSTOM_KERNEL_DIR/modules"
 CUSTOM_DTBS="$CUSTOM_KERNEL_DIR/dtbs"
 mkdir -p "$CACHE_DIR" "$CUSTOM_KERNEL_DIR"
 # --- Skip if already built ---
 if [ -f "$CUSTOM_IMAGE" ] && [ -d "$CUSTOM_MODULES" ]; then
    echo "==> RPi kernel already built (cached)"
    echo "    Image: $CUSTOM_IMAGE ($(du -h "$CUSTOM_IMAGE" | cut -f1))"
    exit 0
 fi
 # --- Toolchain selection: native on arm64 hosts, cross-compile elsewhere ---
 HOST_ARCH="$(uname -m)"
 if [ "$HOST_ARCH" = "aarch64" ] || [ "$HOST_ARCH" = "arm64" ]; then
    if ! command -v gcc >/dev/null 2>&1; then
        echo "ERROR: gcc not found"
        echo "Install: apt-get install build-essential"
        exit 1
    fi
    CROSS_COMPILE=""
    echo "==> Native ARM64 build (host arch: $HOST_ARCH)"
 else
    if ! command -v aarch64-linux-gnu-gcc >/dev/null 2>&1; then
        echo "ERROR: aarch64-linux-gnu-gcc not found"
        echo "Install: apt-get install gcc-aarch64-linux-gnu"
        exit 1
    fi
    CROSS_COMPILE="aarch64-linux-gnu-"
    echo "==> Cross-building RPi kernel from $HOST_ARCH"
 fi
 echo "==> Building RPi kernel (raspberrypi/linux)..."
 echo "    Branch: $RPI_KERNEL_BRANCH"
 echo "    Repo:   $RPI_KERNEL_REPO"
 # --- Download kernel source ---
 KERNEL_SRC_DIR="$CACHE_DIR/rpi-linux-${RPI_KERNEL_BRANCH}"
 if [ ! -d "$KERNEL_SRC_DIR" ]; then
    echo "==> Downloading RPi kernel source (shallow clone)..."
    git clone --depth 1 --branch "$RPI_KERNEL_BRANCH" \
        "$RPI_KERNEL_REPO" "$KERNEL_SRC_DIR"
 else
    echo "==> Kernel source already cached"
 fi
 # --- Build in /tmp for case-sensitivity ---
 KERNEL_BUILD_DIR="/tmp/kernel-build-arm64"
 rm -rf "$KERNEL_BUILD_DIR"
 cp -a "$KERNEL_SRC_DIR" "$KERNEL_BUILD_DIR"
 cd "$KERNEL_BUILD_DIR"
 # --- Apply base config (Pi 4 = bcm2711) ---
 echo "==> Applying bcm2711_defconfig..."
 make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" bcm2711_defconfig
 # --- Apply container config overrides ---
 CONFIG_FRAGMENT="$PROJECT_ROOT/build/config/kernel-container.fragment"
 if [ -f "$CONFIG_FRAGMENT" ]; then
    echo "==> Applying KubeSolo config overrides..."
    while IFS= read -r line; do
        # Skip comments and empty lines
        case "$line" in \#*|"") continue ;; esac
        key="${line%%=*}"
        value="${line#*=}"
        case "$value" in
            y)   ./scripts/config --enable "$key" ;;
            m)   ./scripts/config --module "$key" ;;
            n)   ./scripts/config --disable "${key#CONFIG_}" ;;
            *)   ./scripts/config --set-str "$key" "$value" ;;
        esac
    done < "$CONFIG_FRAGMENT"
 fi
 # Handle "is not set" comments as disables
 if [ -f "$CONFIG_FRAGMENT" ]; then
    while IFS= read -r line; do
        case "$line" in
            "# CONFIG_"*" is not set")
                key=$(echo "$line" | sed -n 's/^# \(CONFIG_[A-Z_]*\) is not set$/\1/p')
                [ -n "$key" ] && ./scripts/config --disable "${key#CONFIG_}"
                ;;
        esac
    done < "$CONFIG_FRAGMENT"
 fi
 # Resolve dependencies
 make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" olddefconfig
 # --- Build kernel + modules + DTBs ---
 NPROC=$(nproc 2>/dev/null || echo 4)
 echo ""
 echo "==> Building RPi kernel (${NPROC} parallel jobs)..."
 echo "    This may take 20-30 minutes..."
 make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" -j"$NPROC" Image modules dtbs 2>&1
 echo "==> RPi kernel build complete"
 # --- Install to staging ---
 echo "==> Installing Image..."
 cp arch/arm64/boot/Image "$CUSTOM_IMAGE"
 echo "==> Installing modules (stripped)..."
 rm -rf "$CUSTOM_MODULES"
 mkdir -p "$CUSTOM_MODULES"
 make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" \
    INSTALL_MOD_STRIP=1 modules_install INSTALL_MOD_PATH="$CUSTOM_MODULES"
 # Remove build/source symlinks
 KVER=$(ls "$CUSTOM_MODULES/lib/modules/" | head -1)
 rm -f "$CUSTOM_MODULES/lib/modules/$KVER/build"
 rm -f "$CUSTOM_MODULES/lib/modules/$KVER/source"
 # Run depmod
 depmod -a -b "$CUSTOM_MODULES" "$KVER" 2>/dev/null || true
 echo "==> Installing Device Tree Blobs..."
 rm -rf "$CUSTOM_DTBS"
 mkdir -p "$CUSTOM_DTBS/overlays"
 # Pi 4 DTBs
 cp arch/arm64/boot/dts/broadcom/bcm2711*.dtb "$CUSTOM_DTBS/" 2>/dev/null || true
 # Pi 5 DTBs
 cp arch/arm64/boot/dts/broadcom/bcm2712*.dtb "$CUSTOM_DTBS/" 2>/dev/null || true
 # Overlays we need
 for overlay in disable-wifi disable-bt; do
    [ -f "arch/arm64/boot/dts/overlays/${overlay}.dtbo" ] && \
        cp "arch/arm64/boot/dts/overlays/${overlay}.dtbo" "$CUSTOM_DTBS/overlays/"
 done
 # Save config for reference
 cp .config "$CUSTOM_KERNEL_DIR/.config"
 # --- Clean up ---
 echo "==> Cleaning kernel build directory..."
 cd /
 rm -rf "$KERNEL_BUILD_DIR"
 # --- Summary ---
 echo ""
 echo "==> RPi kernel build complete:"
 echo "    Image:        $CUSTOM_IMAGE ($(du -h "$CUSTOM_IMAGE" | cut -f1))"
 echo "    Kernel ver:   $KVER"
 MOD_COUNT=$(find "$CUSTOM_MODULES/lib/modules/$KVER" -name '*.ko*' 2>/dev/null | wc -l)
 echo "    Modules:      $MOD_COUNT"
 echo "    Modules size: $(du -sh "$CUSTOM_MODULES/lib/modules/$KVER" 2>/dev/null | cut -f1)"
 echo "    DTBs:         $(ls "$CUSTOM_DTBS"/*.dtb 2>/dev/null | wc -l)"
 echo ""
--- a/build/scripts/build-kernel.sh
+++ b/build/scripts/build-kernel.sh
@@ -85,85 +85,49 @@ echo "    Source dir: $(basename "$KERNEL_SRC_DIR")"
 cd "$KERNEL_SRC_DIR"
-# --- Apply stock config + enable CONFIG_CGROUP_BPF ---
+# --- Apply stock config + shared container-config fragment ---
 echo "==> Applying stock Tiny Core config..."
 cp "$KERNEL_CFG" .config
-echo "==> Enabling required kernel configs..."
+CONFIG_FRAGMENT="$PROJECT_ROOT/build/config/kernel-container.fragment"
-./scripts/config --enable CONFIG_CGROUP_BPF
+if [ ! -f "$CONFIG_FRAGMENT" ]; then
-./scripts/config --enable CONFIG_DEVTMPFS
+    echo "ERROR: Config fragment not found: $CONFIG_FRAGMENT"
-./scripts/config --enable CONFIG_DEVTMPFS_MOUNT
+    exit 1
-./scripts/config --enable CONFIG_MEMCG
+fi
 ./scripts/config --enable CONFIG_CFS_BANDWIDTH
-# --- Strip unnecessary subsystems for smallest footprint ---
+# Apply the fragment: each "CONFIG_X=v" line becomes the right scripts/config
-# This is a headless K8s edge appliance — no sound, GPU, wireless, etc.
+# invocation; "# CONFIG_X is not set" comments become --disable.
-echo "==> Disabling unnecessary subsystems for minimal footprint..."
+apply_fragment() {
    local fragment="$1"
    while IFS= read -r line; do
        case "$line" in
            "# CONFIG_"*" is not set")
                key=$(echo "$line" | sed -n 's/^# \(CONFIG_[A-Z0-9_]*\) is not set$/\1/p')
                [ -n "$key" ] && ./scripts/config --disable "${key#CONFIG_}"
                continue
                ;;
            \#*|"") continue ;;
        esac
        key="${line%%=*}"
        value="${line#*=}"
        case "$value" in
            y) ./scripts/config --enable "$key" ;;
            m) ./scripts/config --module "$key" ;;
            n) ./scripts/config --disable "${key#CONFIG_}" ;;
            *) ./scripts/config --set-str "$key" "$value" ;;
        esac
    done < "$fragment"
 }
-# Sound subsystem (not needed on headless appliance)
+# Two-pass apply: TC's stock config has CONFIG_SECURITY disabled, so olddefconfig
-./scripts/config --disable SOUND
+# strips the security subtree before its dependencies resolve. Re-applying the
-
+# fragment after the first olddefconfig restores those entries.
-# GPU/DRM (serial console only, no display)
+echo "==> Applying kernel-container.fragment (pass 1)..."
-./scripts/config --disable DRM
+apply_fragment "$CONFIG_FRAGMENT"
 # KVM hypervisor (this IS the guest/bare metal, not a hypervisor)
 ./scripts/config --disable KVM
 # Media/camera/TV/radio (not needed)
 ./scripts/config --disable MEDIA_SUPPORT
 # Wireless networking (wired edge device)
 ./scripts/config --disable WIRELESS
 ./scripts/config --disable WLAN
 ./scripts/config --disable CFG80211
 # Bluetooth (not needed)
 ./scripts/config --disable BT
 # NFC (not needed)
 ./scripts/config --disable NFC
 # Infiniband (not needed on edge)
 ./scripts/config --disable INFINIBAND
 # PCMCIA (legacy, not needed)
 ./scripts/config --disable PCMCIA
 # Amateur radio (not needed)
 ./scripts/config --disable HAMRADIO
 # ISDN (not needed)
 ./scripts/config --disable ISDN
 # ATM networking (not needed)
 ./scripts/config --disable ATM
 # Joystick/gamepad (not needed)
 ./scripts/config --disable INPUT_JOYSTICK
 ./scripts/config --disable INPUT_TABLET
 # FPGA (not needed)
 ./scripts/config --disable FPGA
 # First pass: resolve base dependencies before adding security configs.
 # The stock TC config has "# CONFIG_SECURITY is not set" which causes
 # olddefconfig to strip security-related options if applied in a single pass.
 make olddefconfig
-# Security: AppArmor LSM + Audit subsystem
+echo "==> Applying kernel-container.fragment (pass 2)..."
-# Applied AFTER first olddefconfig to ensure CONFIG_SECURITY dependencies
+apply_fragment "$CONFIG_FRAGMENT"
 # (SYSFS, MULTIUSER) are resolved before enabling the security subtree.
 echo "==> Enabling AppArmor + Audit kernel configs..."
 ./scripts/config --enable CONFIG_AUDIT
 ./scripts/config --enable CONFIG_AUDITSYSCALL
 ./scripts/config --enable CONFIG_SECURITY
 ./scripts/config --enable CONFIG_SECURITYFS
 ./scripts/config --enable CONFIG_SECURITY_NETWORK
 ./scripts/config --enable CONFIG_SECURITY_APPARMOR
 ./scripts/config --set-str CONFIG_LSM "lockdown,yama,apparmor"
 ./scripts/config --set-str CONFIG_DEFAULT_SECURITY "apparmor"
 # Second pass: resolve security config dependencies
 make olddefconfig
 # Verify critical configs are set
--- a/build/scripts/create-disk-image.sh
+++ b/build/scripts/create-disk-image.sh
@@ -6,28 +6,61 @@
 #   Part 2: System A    (512 MB, ext4)   — vmlinuz + kubesolo-os.gz (active)
 #   Part 3: System B    (512 MB, ext4)   — vmlinuz + kubesolo-os.gz (passive)
 #   Part 4: Data        (remaining, ext4) — persistent K8s state
 #
 # Supports both x86_64 (default) and ARM64 generic UEFI targets. ARM64 RPi
 # uses a different image format — see build/scripts/create-rpi-image.sh.
 #
 # Environment:
 #   TARGET_ARCH  amd64 (default) or arm64
 #   IMG_SIZE_MB  Image size in MB (default 4096)
 #   CACHE_DIR    Build cache (default <project>/build/cache)
 #   ROOTFS_DIR   Rootfs work dir (default <project>/build/rootfs-work)
 #   OUTPUT_DIR   Output dir (default <project>/output)
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 ROOTFS_DIR="${ROOTFS_DIR:-$PROJECT_ROOT/build/rootfs-work}"
 CACHE_DIR="${CACHE_DIR:-$PROJECT_ROOT/build/cache}"
 OUTPUT_DIR="${OUTPUT_DIR:-$PROJECT_ROOT/output}"
 VERSION="$(cat "$PROJECT_ROOT/VERSION")"
 OS_NAME="kubesolo-os"
 TARGET_ARCH="${TARGET_ARCH:-amd64}"
 IMG_OUTPUT="$OUTPUT_DIR/${OS_NAME}-${VERSION}.img"
 IMG_SIZE_MB="${IMG_SIZE_MB:-4096}"  # 4 GB default (larger for A/B)
-VMLINUZ="$ROOTFS_DIR/vmlinuz"
+# --- Arch-specific paths ---
 case "$TARGET_ARCH" in
    amd64)
        IMG_OUTPUT="$OUTPUT_DIR/${OS_NAME}-${VERSION}.img"
        VMLINUZ="$ROOTFS_DIR/vmlinuz"
        GRUB_CFG="$PROJECT_ROOT/build/grub/grub.cfg"
        GRUB_TARGET="x86_64-efi"
        GRUB_EFI_BIN="bootx64.efi"
        GRUB_INSTALL_BIOS=true
        ;;
    arm64)
        IMG_OUTPUT="$OUTPUT_DIR/${OS_NAME}-${VERSION}.arm64.img"
        VMLINUZ="$CACHE_DIR/kernel-arm64-generic/Image"
        GRUB_CFG="$PROJECT_ROOT/build/grub/grub-arm64.cfg"
        GRUB_TARGET="arm64-efi"
        GRUB_EFI_BIN="BOOTAA64.EFI"
        GRUB_INSTALL_BIOS=false
        ;;
    *)
        echo "ERROR: TARGET_ARCH must be 'amd64' or 'arm64' (got: $TARGET_ARCH)"
        exit 1
        ;;
 esac
 INITRAMFS="$ROOTFS_DIR/kubesolo-os.gz"
 GRUB_CFG="$PROJECT_ROOT/build/grub/grub.cfg"
 GRUB_ENV_DEFAULTS="$PROJECT_ROOT/build/grub/grub-env-defaults"
 for f in "$VMLINUZ" "$INITRAMFS" "$GRUB_CFG" "$GRUB_ENV_DEFAULTS"; do
    [ -f "$f" ] || { echo "ERROR: Missing $f"; exit 1; }
 done
-echo "==> Creating ${IMG_SIZE_MB}MB disk image with A/B partitions..."
+echo "==> Creating ${IMG_SIZE_MB}MB ${TARGET_ARCH} disk image with A/B partitions..."
 mkdir -p "$OUTPUT_DIR"
 # Create sparse image
@@ -161,35 +194,44 @@ else
    mv "$GRUBENV_FILE.tmp" "$GRUBENV_FILE"
 fi
-# Install GRUB EFI binary if available
+# Install GRUB EFI binary
-if command -v grub-mkimage >/dev/null 2>&1; then
+# Modules required: part_gpt + fat (boot partition), ext2 (system A/B),
-    grub-mkimage -O x86_64-efi -o "$MNT_EFI/EFI/BOOT/bootx64.efi" \
+# normal + linux + echo + configfile + loadenv (boot menu + grubenv),
-        -p /boot/grub \
+# search_* (locate partitions by label).
-        part_gpt ext2 fat normal linux echo all_video test search \
+# all_video + test are x86-specific (DRM init); leave them out on arm64.
-        search_fs_uuid search_label configfile loadenv \
+if [ "$TARGET_ARCH" = "arm64" ]; then
-        2>/dev/null || echo "    WARN: grub-mkimage failed — use QEMU -bios flag"
+    GRUB_MODULES="part_gpt ext2 fat normal linux echo test search search_fs_uuid search_label configfile loadenv"
 elif command -v grub2-mkimage >/dev/null 2>&1; then
    grub2-mkimage -O x86_64-efi -o "$MNT_EFI/EFI/BOOT/bootx64.efi" \
        -p /boot/grub \
        part_gpt ext2 fat normal linux echo all_video test search \
        search_fs_uuid search_label configfile loadenv \
        2>/dev/null || echo "    WARN: grub2-mkimage failed — use QEMU -bios flag"
 else
-    echo "    WARN: grub-mkimage not found — EFI boot image not created"
+    GRUB_MODULES="part_gpt ext2 fat normal linux echo all_video test search search_fs_uuid search_label configfile loadenv"
    echo "          Install grub2-tools or use QEMU -kernel/-initrd flags"
 fi
-# For BIOS boot: install GRUB i386-pc modules if available
+# shellcheck disable=SC2086  # GRUB_MODULES is intentionally word-split
-if command -v grub-install >/dev/null 2>&1; then
+if command -v grub-mkimage >/dev/null 2>&1; then
-    grub-install --target=i386-pc --boot-directory="$MNT_EFI/boot" \
+    grub-mkimage -O "$GRUB_TARGET" -o "$MNT_EFI/EFI/BOOT/$GRUB_EFI_BIN" \
-        --no-floppy "$LOOP" 2>/dev/null || {
+        -p /boot/grub $GRUB_MODULES \
-        echo "    WARN: BIOS GRUB install failed — EFI-only or use QEMU -kernel"
+        || echo "    WARN: grub-mkimage failed — use QEMU -bios flag"
-    }
+elif command -v grub2-mkimage >/dev/null 2>&1; then
-elif command -v grub2-install >/dev/null 2>&1; then
+    grub2-mkimage -O "$GRUB_TARGET" -o "$MNT_EFI/EFI/BOOT/$GRUB_EFI_BIN" \
-    grub2-install --target=i386-pc --boot-directory="$MNT_EFI/boot" \
+        -p /boot/grub $GRUB_MODULES \
-        --no-floppy "$LOOP" 2>/dev/null || {
+        || echo "    WARN: grub2-mkimage failed — use QEMU -bios flag"
-        echo "    WARN: BIOS GRUB install failed — EFI-only or use QEMU -kernel"
+else
-    }
+    echo "    WARN: grub-mkimage not found — EFI boot image not created"
    echo "          Install grub-efi-${TARGET_ARCH}-bin or use QEMU -kernel/-initrd flags"
 fi
 # For BIOS boot: install GRUB i386-pc modules (x86 only — ARM64 is UEFI-only).
 if [ "$GRUB_INSTALL_BIOS" = "true" ]; then
    if command -v grub-install >/dev/null 2>&1; then
        grub-install --target=i386-pc --boot-directory="$MNT_EFI/boot" \
            --no-floppy "$LOOP" 2>/dev/null || {
            echo "    WARN: BIOS GRUB install failed — EFI-only or use QEMU -kernel"
        }
    elif command -v grub2-install >/dev/null 2>&1; then
        grub2-install --target=i386-pc --boot-directory="$MNT_EFI/boot" \
            --no-floppy "$LOOP" 2>/dev/null || {
            echo "    WARN: BIOS GRUB install failed — EFI-only or use QEMU -kernel"
        }
    fi
 fi
 # --- System A Partition (active) ---
@@ -213,9 +255,9 @@ done
 sync
 echo ""
-echo "==> Disk image created: $IMG_OUTPUT"
+echo "==> ${TARGET_ARCH} disk image created: $IMG_OUTPUT"
 echo "    Size: $(du -h "$IMG_OUTPUT" | cut -f1)"
-echo "    Part 1 (KSOLOEFI):  GRUB + A/B boot config"
+echo "    Part 1 (KSOLOEFI):  GRUB ($GRUB_TARGET) + A/B boot config"
 echo "    Part 2 (KSOLOA):    System A — kernel + initramfs (active)"
 echo "    Part 3 (KSOLOB):    System B — kernel + initramfs (passive)"
 echo "    Part 4 (KSOLODATA): Persistent K8s state"
--- a/build/scripts/create-rpi-image.sh
+++ b/build/scripts/create-rpi-image.sh
@@ -31,12 +31,12 @@ IMG_OUTPUT="$OUTPUT_DIR/${OS_NAME}-${VERSION}.rpi.img"
 IMG_SIZE_MB="${IMG_SIZE_MB:-2048}"  # 2 GB default
 # ARM64 kernel (Image format, not bzImage)
-KERNEL="${CACHE_DIR}/custom-kernel-arm64/Image"
+KERNEL="${CACHE_DIR}/custom-kernel-rpi/Image"
 INITRAMFS="${ROOTFS_DIR}/kubesolo-os.gz"
 RPI_FIRMWARE_DIR="${CACHE_DIR}/rpi-firmware"
 # DTBs MUST come from the kernel build (not firmware repo) to match the kernel.
 # A DTB mismatch causes sdhci-iproc to silently fail — zero block devices.
-KERNEL_DTBS_DIR="${CACHE_DIR}/custom-kernel-arm64/dtbs"
+KERNEL_DTBS_DIR="${CACHE_DIR}/custom-kernel-rpi/dtbs"
 echo "==> Creating ${IMG_SIZE_MB}MB Raspberry Pi disk image..."
@@ -173,7 +173,7 @@ CFGTXT
    # cmdline.txt — kernel command line
    # Note: must be a single line
-    echo "console=serial0,115200 console=tty1 kubesolo.data=LABEL=KSOLODATA quiet" > "$MNT/cmdline.txt"
+    echo "console=serial0,115200 console=tty1 kubesolo.data=LABEL=KSOLODATA initcall_debug loglevel=7" > "$MNT/cmdline.txt"
    # Copy kernel as kernel8.img (RPi 3/4/5 ARM64 convention)
    cp "$KERNEL" "$MNT/kernel8.img"
--- a/build/scripts/fetch-components.sh
+++ b/build/scripts/fetch-components.sh
@@ -51,8 +51,7 @@ if [ "$FETCH_ARCH" = "arm64" ]; then
    echo "==> Fetching RPi firmware..."
    "$SCRIPT_DIR/fetch-rpi-firmware.sh"
-    # Download ARM64 KubeSolo binary
+    # Download ARM64 KubeSolo binary (KUBESOLO_VERSION set from versions.env)
    KUBESOLO_VERSION="${KUBESOLO_VERSION:-v1.1.0}"
    KUBESOLO_BIN_ARM64="$CACHE_DIR/kubesolo-arm64"
    if [ -f "$KUBESOLO_BIN_ARM64" ]; then
        echo "==> KubeSolo ARM64 binary already cached: $KUBESOLO_BIN_ARM64"
@@ -61,17 +60,19 @@ if [ "$FETCH_ARCH" = "arm64" ]; then
        BIN_URL="https://github.com/portainer/kubesolo/releases/download/${KUBESOLO_VERSION}/kubesolo-${KUBESOLO_VERSION}-linux-arm64-musl.tar.gz"
        BIN_URL_FALLBACK="https://github.com/portainer/kubesolo/releases/download/${KUBESOLO_VERSION}/kubesolo-${KUBESOLO_VERSION}-linux-arm64.tar.gz"
        TEMP_DIR=$(mktemp -d)
        TARBALL="$TEMP_DIR/kubesolo.tar.gz"
        echo "    URL: $BIN_URL"
-        if curl -fSL "$BIN_URL" -o "$TEMP_DIR/kubesolo.tar.gz" 2>/dev/null; then
+        if curl -fSL "$BIN_URL" -o "$TARBALL" 2>/dev/null; then
            echo "    Downloaded musl variant (arm64)"
-        elif curl -fSL "$BIN_URL_FALLBACK" -o "$TEMP_DIR/kubesolo.tar.gz" 2>/dev/null; then
+        elif curl -fSL "$BIN_URL_FALLBACK" -o "$TARBALL" 2>/dev/null; then
            echo "    Downloaded glibc variant (arm64 fallback)"
        else
            echo "ERROR: Failed to download KubeSolo ARM64 from GitHub."
            rm -rf "$TEMP_DIR"
            exit 1
        fi
-        tar -xzf "$TEMP_DIR/kubesolo.tar.gz" -C "$TEMP_DIR"
+        verify_checksum "$TARBALL" "${KUBESOLO_SHA256_ARM64:-}" "KubeSolo arm64 tarball"
        tar -xzf "$TARBALL" -C "$TEMP_DIR"
        FOUND_BIN=$(find "$TEMP_DIR" -name "kubesolo" -type f ! -name "*.tar.gz" | head -1)
        if [ -z "$FOUND_BIN" ]; then
            echo "ERROR: Could not find kubesolo binary in extracted archive"
@@ -112,7 +113,7 @@ else
 fi
 # --- KubeSolo ---
-KUBESOLO_VERSION="${KUBESOLO_VERSION:-v1.1.0}"
+# KUBESOLO_VERSION sourced from versions.env
 KUBESOLO_BIN="$CACHE_DIR/kubesolo"
 if [ -f "$KUBESOLO_BIN" ]; then
@@ -132,11 +133,12 @@ else
    TEMP_DIR=$(mktemp -d)
    trap 'rm -rf "$TEMP_DIR"' EXIT
    TARBALL="$TEMP_DIR/kubesolo.tar.gz"
    echo "    URL: $BIN_URL"
-    if curl -fSL "$BIN_URL" -o "$TEMP_DIR/kubesolo.tar.gz" 2>/dev/null; then
+    if curl -fSL "$BIN_URL" -o "$TARBALL" 2>/dev/null; then
        echo "    Downloaded musl variant"
-    elif curl -fSL "$BIN_URL_FALLBACK" -o "$TEMP_DIR/kubesolo.tar.gz" 2>/dev/null; then
+    elif curl -fSL "$BIN_URL_FALLBACK" -o "$TARBALL" 2>/dev/null; then
        echo "    Downloaded glibc variant (fallback)"
    else
        echo "ERROR: Failed to download KubeSolo from GitHub."
@@ -149,9 +151,10 @@ else
        echo "    3. Re-run: make rootfs"
        exit 1
    fi
    verify_checksum "$TARBALL" "${KUBESOLO_SHA256_AMD64:-}" "KubeSolo amd64 tarball"
    # Extract binary from tarball
-    tar -xzf "$TEMP_DIR/kubesolo.tar.gz" -C "$TEMP_DIR"
+    tar -xzf "$TARBALL" -C "$TEMP_DIR"
    # Find the kubesolo binary in extracted contents
    FOUND_BIN=$(find "$TEMP_DIR" -name "kubesolo" -type f ! -name "*.tar.gz" | head -1)
@@ -169,7 +172,6 @@ else
    rm -rf "$TEMP_DIR"
    echo "==> KubeSolo binary: $KUBESOLO_BIN ($(du -h "$KUBESOLO_BIN" | cut -f1))"
    verify_checksum "$KUBESOLO_BIN" "$KUBESOLO_SHA256" "KubeSolo binary"
 fi
 # --- Tiny Core kernel module extensions (netfilter, iptables) ---
--- a/build/scripts/inject-kubesolo.sh
+++ b/build/scripts/inject-kubesolo.sh
@@ -55,10 +55,44 @@ rm -f "$ROOTFS/sbin/init"
 cp "$PROJECT_ROOT/init/init.sh" "$ROOTFS/sbin/init"
 chmod +x "$ROOTFS/sbin/init"
-# Init stages
+# Replace the upstream /init at the rootfs root with our staged init.
 # The kernel ALWAYS runs /init when booting from an initramfs (legacy root-mount
 # fallback otherwise). piCore/TC ship their own /init; ours has to take its
 # place so the kernel runs our staged boot, not piCore's TCE handler.
 rm -f "$ROOTFS/init"
 cp "$PROJECT_ROOT/init/init.sh" "$ROOTFS/init"
 chmod +x "$ROOTFS/init"
 echo "    Installed staged init at /init and /sbin/init"
 # --- 2b. BusyBox override for ARM64 ---
 # piCore64 v15's BusyBox is dynamically linked and uses ARM instructions that
 # QEMU virt cannot emulate even with -cpu max, causing applets (mkdir, uname,
 # etc.) to SIGILL. Replace with the host's statically-linked busybox-static
 # package, which is built for generic ARMv8-A and runs anywhere.
 #
 # On x86 builds this isn't an issue (TC's BusyBox works fine on QEMU x86).
 if [ "$INJECT_ARCH" = "arm64" ] && [ -x /bin/busybox ]; then
    if file /bin/busybox 2>/dev/null | grep -q 'statically linked'; then
        cp /bin/busybox "$ROOTFS/bin/busybox"
        # busybox.suid is used by mount/su/etc. Same binary; suid bit applied
        # separately. We don't need suid for our use (init runs as PID 1 / uid 0).
        cp /bin/busybox "$ROOTFS/bin/busybox.suid"
        chmod +x "$ROOTFS/bin/busybox" "$ROOTFS/bin/busybox.suid"
        echo "    Replaced piCore BusyBox with host's static busybox ($(du -h /bin/busybox | cut -f1))"
    else
        echo "    WARN: /bin/busybox on host is not static; piCore BusyBox kept (may crash in QEMU virt)"
    fi
 fi
 # Init stages — copy NN-name.sh files only. functions.sh is a shared library
 # (sourced by init.sh proper), not a numbered stage; if it ends up in init.d
 # the main loop will try to run it as a stage and fail.
 mkdir -p "$ROOTFS/usr/lib/kubesolo-os/init.d"
 for stage in "$PROJECT_ROOT"/init/lib/*.sh; do
    [ -f "$stage" ] || continue
    case "$(basename "$stage")" in
        functions.sh) continue ;;
    esac
    cp "$stage" "$ROOTFS/usr/lib/kubesolo-os/init.d/"
    chmod +x "$ROOTFS/usr/lib/kubesolo-os/init.d/$(basename "$stage")"
 done
@@ -109,7 +143,19 @@ fi
 # If a custom kernel was built (with CONFIG_CGROUP_BPF=y), use it.
 # Otherwise fall back to TCZ-extracted modules with manual modules.dep.
 if [ "$INJECT_ARCH" = "arm64" ]; then
-    CUSTOM_KERNEL_DIR="$CACHE_DIR/custom-kernel-arm64"
+    # TARGET_VARIANT selects which ARM64 kernel to consume:
    #   rpi      -> $CACHE_DIR/custom-kernel-rpi/      (raspberrypi/linux fork)
    #   generic  -> $CACHE_DIR/kernel-arm64-generic/   (mainline kernel.org LTS)
    # Default is rpi for backwards compatibility with existing rpi-image target.
    TARGET_VARIANT="${TARGET_VARIANT:-rpi}"
    case "$TARGET_VARIANT" in
        generic) CUSTOM_KERNEL_DIR="$CACHE_DIR/kernel-arm64-generic" ;;
        rpi)     CUSTOM_KERNEL_DIR="$CACHE_DIR/custom-kernel-rpi" ;;
        *)
            echo "ERROR: TARGET_VARIANT must be 'rpi' or 'generic' (got: $TARGET_VARIANT)"
            exit 1
            ;;
    esac
    CUSTOM_VMLINUZ="$CUSTOM_KERNEL_DIR/Image"
 else
    CUSTOM_KERNEL_DIR="$CACHE_DIR/custom-kernel"
--- a/build/scripts/push-oci-artifact.sh
+++ b/build/scripts/push-oci-artifact.sh
@@ -0,0 +1,150 @@
 #!/bin/bash
 # push-oci-artifact.sh — Publish a KubeSolo OS update artifact to an OCI registry.
 #
 # Produces the artifact format consumed by `kubesolo-update --registry`:
 #
 #   <registry>/<repo>:<version>-<arch>     per-arch manifest, layers:
 #     * vmlinuz (Image on arm64)  → application/vnd.kubesolo.os.kernel.v1+octet-stream
 #     * kubesolo-os.gz            → application/vnd.kubesolo.os.initramfs.v1+gzip
 #     annotations:
 #       io.kubesolo.os.version
 #       io.kubesolo.os.channel
 #       io.kubesolo.os.architecture
 #       io.kubesolo.os.min_compatible_version (optional)
 #
 # After running this for each architecture, combine the per-arch tags into a
 # multi-arch index with `oras manifest index create` (see end of script).
 #
 # Requires: oras (>= 1.2), curl, jq.
 #
 # Usage:
 #   build/scripts/push-oci-artifact.sh \
 #       --registry ghcr.io/portainer/kubesolo-os \
 #       --arch amd64 \
 #       --channel stable \
 #       [--min-compatible-version v0.2.0]
 #
 # Authentication: oras reads ~/.docker/config.json. In CI, run
 #   `oras login ghcr.io -u USER -p TOKEN` before invoking this script
 # (or set DOCKER_CONFIG to a directory with config.json).
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 VERSION="$(cat "$PROJECT_ROOT/VERSION")"
 OUTPUT_DIR="$PROJECT_ROOT/output"
 CACHE_DIR="$PROJECT_ROOT/build/cache"
 REGISTRY=""
 ARCH=""
 CHANNEL="stable"
 MIN_COMPATIBLE_VERSION=""
 RELEASE_NOTES=""
 while [ $# -gt 0 ]; do
    case "$1" in
        --registry) REGISTRY="$2"; shift 2 ;;
        --arch) ARCH="$2"; shift 2 ;;
        --channel) CHANNEL="$2"; shift 2 ;;
        --min-compatible-version) MIN_COMPATIBLE_VERSION="$2"; shift 2 ;;
        --release-notes) RELEASE_NOTES="$2"; shift 2 ;;
        *) echo "Unknown option: $1" >&2; exit 1 ;;
    esac
 done
 if [ -z "$REGISTRY" ] || [ -z "$ARCH" ]; then
    echo "Usage: $0 --registry REGISTRY/REPO --arch (amd64|arm64) [--channel stable] [--min-compatible-version vX.Y.Z]" >&2
    exit 1
 fi
 if ! command -v oras >/dev/null 2>&1; then
    echo "ERROR: oras CLI not found. Install from https://oras.land/docs/installation/" >&2
    echo "  or apt-get install oras (Ubuntu 24.04+)" >&2
    exit 1
 fi
 # Locate the artifacts. For arm64 the kernel is "Image"; everywhere else it's
 # "vmlinuz". Initramfs is always kubesolo-os.gz.
 case "$ARCH" in
    amd64)
        KERNEL="$CACHE_DIR/custom-kernel/vmlinuz"
        [ -f "$KERNEL" ] || KERNEL="$OUTPUT_DIR/vmlinuz"
        KERNEL_BASENAME="vmlinuz"
        ;;
    arm64)
        KERNEL="$CACHE_DIR/kernel-arm64-generic/Image"
        KERNEL_BASENAME="vmlinuz"  # we publish under the vmlinuz name regardless;
                                   # the consumer looks up by media type, not filename.
        ;;
    *)
        echo "ERROR: unsupported --arch $ARCH (use amd64 or arm64)" >&2
        exit 1
        ;;
 esac
 INITRAMFS="$PROJECT_ROOT/build/rootfs-work/kubesolo-os.gz"
 if [ ! -f "$KERNEL" ]; then
    echo "ERROR: kernel not found at $KERNEL" >&2
    echo "  Run 'make kernel' (amd64) or 'make kernel-arm64' (arm64) first." >&2
    exit 1
 fi
 if [ ! -f "$INITRAMFS" ]; then
    echo "ERROR: initramfs not found at $INITRAMFS" >&2
    echo "  Run 'make initramfs' or 'make rootfs-arm64' first." >&2
    exit 1
 fi
 # Stage files in a temp dir so the basenames in the manifest are clean.
 STAGE="$(mktemp -d)"
 trap 'rm -rf "$STAGE"' EXIT
 cp "$KERNEL" "$STAGE/$KERNEL_BASENAME"
 cp "$INITRAMFS" "$STAGE/kubesolo-os.gz"
 KERNEL_MEDIA="application/vnd.kubesolo.os.kernel.v1+octet-stream"
 INITRD_MEDIA="application/vnd.kubesolo.os.initramfs.v1+gzip"
 REF="${REGISTRY}:${VERSION}-${ARCH}"
 CHANNEL_REF="${REGISTRY}:${CHANNEL}-${ARCH}"
 echo "==> Pushing ${REF}"
 echo "    kernel:     $KERNEL ($(du -h "$KERNEL" | cut -f1))"
 echo "    initramfs:  $INITRAMFS ($(du -h "$INITRAMFS" | cut -f1))"
 ORAS_ANNOTATIONS=(
    --annotation "io.kubesolo.os.version=${VERSION}"
    --annotation "io.kubesolo.os.channel=${CHANNEL}"
    --annotation "io.kubesolo.os.architecture=${ARCH}"
 )
 if [ -n "$MIN_COMPATIBLE_VERSION" ]; then
    ORAS_ANNOTATIONS+=(--annotation "io.kubesolo.os.min_compatible_version=${MIN_COMPATIBLE_VERSION}")
 fi
 if [ -n "$RELEASE_NOTES" ]; then
    ORAS_ANNOTATIONS+=(--annotation "io.kubesolo.os.release_notes=${RELEASE_NOTES}")
 fi
 ORAS_ANNOTATIONS+=(--annotation "io.kubesolo.os.release_date=$(date -u +%Y-%m-%dT%H:%M:%SZ)")
 # oras push: --artifact-type sets the manifest artifactType field;
 # file:type syntax sets per-layer media types.
 (cd "$STAGE" && oras push "$REF" \
    --artifact-type "application/vnd.kubesolo.os.update.v1+json" \
    "${ORAS_ANNOTATIONS[@]}" \
    "${KERNEL_BASENAME}:${KERNEL_MEDIA}" \
    "kubesolo-os.gz:${INITRD_MEDIA}")
 # Also tag as <channel>-<arch> so the manifest-index step can reference it
 # stably across patch releases.
 echo "==> Tagging ${CHANNEL_REF}"
 oras tag "$REF" "${CHANNEL}-${ARCH}"
 echo ""
 echo "==> Published:"
 echo "    ${REF}"
 echo "    ${CHANNEL_REF}"
 echo ""
 echo "To combine multi-arch into the channel index, run after both arches are pushed:"
 echo ""
 echo "  oras manifest index create ${REGISTRY}:${CHANNEL} \\"
 echo "      ${REGISTRY}:${CHANNEL}-amd64,platform=linux/amd64 \\"
 echo "      ${REGISTRY}:${CHANNEL}-arm64,platform=linux/arm64"
 echo ""
--- a/cloud-init/cmd/main.go
+++ b/cloud-init/cmd/main.go
@@ -97,6 +97,11 @@ func cmdApply(configPath string) error {
 		return fmt.Errorf("portainer edge agent: %w", err)
 	}
 	// 5. Write /etc/kubesolo/update.conf from updates: block (if any).
 	if err := cloudinit.ApplyUpdates(cfg, ""); err != nil {
 		return fmt.Errorf("updates: %w", err)
 	}
 	// 5. Save persistent configs for next boot
 	if err := cloudinit.SaveHostname(cfg, persistDataDir+"/etc-kubesolo"); err != nil {
 		slog.Warn("failed to save hostname", "error", err)
--- a/cloud-init/config.go
+++ b/cloud-init/config.go
@@ -12,12 +12,30 @@ package cloudinit
 // Config is the top-level cloud-init configuration.
 type Config struct {
-	Hostname string          `yaml:"hostname"`
+	Hostname  string          `yaml:"hostname"`
-	Network  NetworkConfig   `yaml:"network"`
+	Network   NetworkConfig   `yaml:"network"`
-	KubeSolo KubeSoloConfig  `yaml:"kubesolo"`
+	KubeSolo  KubeSoloConfig  `yaml:"kubesolo"`
-	NTP      NTPConfig       `yaml:"ntp"`
+	NTP       NTPConfig       `yaml:"ntp"`
-	Airgap   AirgapConfig    `yaml:"airgap"`
+	Airgap    AirgapConfig    `yaml:"airgap"`
 	Portainer PortainerConfig `yaml:"portainer"`
 	Updates   UpdatesConfig   `yaml:"updates"`
 }
 // UpdatesConfig configures the kubesolo-update agent. Written to
 // /etc/kubesolo/update.conf on first boot. See update/pkg/config.
 type UpdatesConfig struct {
 	// Server is the update server URL (HTTP or OCI registry).
 	Server string `yaml:"server"`
 	// Channel selects which channel to track ("stable", "beta", "edge").
 	// Empty = "stable".
 	Channel string `yaml:"channel"`
 	// MaintenanceWindow restricts apply to the given local time range,
 	// e.g. "03:00-05:00". Wrapping windows like "23:00-01:00" supported.
 	// Empty = no restriction.
 	MaintenanceWindow string `yaml:"maintenance_window"`
 	// PubKey is the path to the Ed25519 public key file used to verify
 	// signed update artifacts. Empty = signature verification disabled.
 	PubKey string `yaml:"pubkey"`
 }
 // NetworkConfig defines network settings.
@@ -40,6 +58,14 @@ type KubeSoloConfig struct {
 	PortainerEdgeID        string   `yaml:"portainer-edge-id"`
 	PortainerEdgeKey       string   `yaml:"portainer-edge-key"`
 	PortainerEdgeAsync     bool     `yaml:"portainer-edge-async"`
 	// v1.1.4+: skip edge-optimised overrides, use upstream k8s defaults
 	// (useful for CI and powerful machines, disabled by default).
 	Full bool `yaml:"full"`
 	// v1.1.5+: disable IPv6 in the cluster.
 	DisableIPv6 bool `yaml:"disable-ipv6"`
 	// v1.1.5+: detect SQLite WAL corruption on startup and recover from
 	// unclean shutdowns (e.g. power loss). Recommended ON for edge devices.
 	DBWALRepair bool `yaml:"db-wal-repair"`
 }
 // NTPConfig defines NTP settings.
--- a/cloud-init/examples/full-config.yaml
+++ b/cloud-init/examples/full-config.yaml
@@ -36,5 +36,50 @@ kubesolo:
  portainer-edge-key: "your-edge-key"
  portainer-edge-async: true
  # KubeSolo v1.1.4+: skip the edge-optimised overrides and use upstream
  # Kubernetes defaults. Useful for CI and high-spec machines. Default off.
  full: false
  # KubeSolo v1.1.5+: disable IPv6 throughout the cluster. Default off.
  disable-ipv6: false
  # KubeSolo v1.1.5+: detect SQLite WAL corruption at startup and recover
  # from unclean shutdowns (e.g. power loss). Recommended ON for edge
  # appliances that may lose power.
  db-wal-repair: true
  # Arbitrary extra flags passed directly to the KubeSolo binary
  # extra-flags: "--disable traefik --disable servicelb"
 # Update agent settings (written to /etc/kubesolo/update.conf on first boot).
 # Omit any subfield to leave the corresponding default in place.
 updates:
  # Update server URL — HTTPS for the JSON+blob protocol, or an OCI registry
  # reference (e.g. ghcr.io/portainer/kubesolo-os) when OCI distribution
  # lands in v0.3.
  server: "https://updates.kubesolo.example.com"
  # Channel to track. "stable" is the default; "beta"/"edge" expose
  # pre-release artifacts. The agent refuses to apply metadata whose
  # channel doesn't match.
  channel: "stable"
  # Maintenance window (local time, HH:MM-HH:MM, wrapping midnight OK).
  # `apply` refuses to run outside this window unless --force is passed.
  # Leave empty (or omit) to allow updates at any time.
  maintenance_window: "03:00-05:00"
  # Path to Ed25519 public key for signature verification. Omit to disable
  # signature verification (NOT recommended for production fleets).
  # pubkey: "/etc/kubesolo/update-pubkey.hex"
  # Optional post-boot healthcheck probe URL. If set, healthcheck GETs it
  # and treats anything other than HTTP 200 as a failure. Useful when your
  # workload exposes its own readiness on a known endpoint.
  # healthcheck_url: "http://localhost:8000/ready"
  # Auto-rollback threshold: after N consecutive post-activation healthcheck
  # failures, the agent triggers a rollback on its own. 0 disables the
  # feature (the bootloader still does GRUB-counter-based rollback after
  # 3 failed boots). Recommended: 3 for production fleets.
  # auto_rollback_after: 3
--- a/cloud-init/kubesolo.go
+++ b/cloud-init/kubesolo.go
@@ -70,6 +70,18 @@ func buildExtraFlags(cfg *Config) string {
 		parts = append(parts, "--portainer-edge-async")
 	}
 	if cfg.KubeSolo.Full {
 		parts = append(parts, "--full")
 	}
 	if cfg.KubeSolo.DisableIPv6 {
 		parts = append(parts, "--disable-ipv6")
 	}
 	if cfg.KubeSolo.DBWALRepair {
 		parts = append(parts, "--db-wal-repair")
 	}
 	return strings.Join(parts, " ")
 }
--- a/cloud-init/updates.go
+++ b/cloud-init/updates.go
@@ -0,0 +1,57 @@
 package cloudinit
 import (
 	"fmt"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"strings"
 )
 // DefaultUpdateConfPath is where the update agent expects to find its config.
 // Kept in sync with update/pkg/config.DefaultPath.
 const DefaultUpdateConfPath = "/etc/kubesolo/update.conf"
 // ApplyUpdates writes /etc/kubesolo/update.conf from the cloud-init
 // updates: block. Called once per boot; idempotent (overwrites any existing
 // file with the cloud-init values).
 //
 // If the updates: block is empty (all fields blank), the file is not
 // written — preserves any hand-edited update.conf on systems that aren't
 // managed via cloud-init.
 func ApplyUpdates(cfg *Config, confPath string) error {
 	if confPath == "" {
 		confPath = DefaultUpdateConfPath
 	}
 	u := cfg.Updates
 	if u.Server == "" && u.Channel == "" && u.MaintenanceWindow == "" && u.PubKey == "" {
 		// Nothing to write — leave any existing file alone.
 		return nil
 	}
 	if err := os.MkdirAll(filepath.Dir(confPath), 0o755); err != nil {
 		return fmt.Errorf("creating dir for %s: %w", confPath, err)
 	}
 	var sb strings.Builder
 	sb.WriteString("# Generated by KubeSolo OS cloud-init — edit this file or the\n")
 	sb.WriteString("# cloud-init source YAML; subsequent first-boots will regenerate it.\n")
 	if u.Server != "" {
 		fmt.Fprintf(&sb, "server = %s\n", u.Server)
 	}
 	if u.Channel != "" {
 		fmt.Fprintf(&sb, "channel = %s\n", u.Channel)
 	}
 	if u.MaintenanceWindow != "" {
 		fmt.Fprintf(&sb, "maintenance_window = %s\n", u.MaintenanceWindow)
 	}
 	if u.PubKey != "" {
 		fmt.Fprintf(&sb, "pubkey = %s\n", u.PubKey)
 	}
 	if err := os.WriteFile(confPath, []byte(sb.String()), 0o644); err != nil {
 		return fmt.Errorf("writing %s: %w", confPath, err)
 	}
 	slog.Info("wrote update.conf", "path", confPath)
 	return nil
 }
--- a/cloud-init/updates_test.go
+++ b/cloud-init/updates_test.go
@@ -0,0 +1,81 @@
 package cloudinit
 import (
 	"os"
 	"path/filepath"
 	"strings"
 	"testing"
 )
 func TestApplyUpdatesEmptyConfigSkipsWrite(t *testing.T) {
 	confPath := filepath.Join(t.TempDir(), "update.conf")
 	cfg := &Config{} // Updates block default-zero
 	if err := ApplyUpdates(cfg, confPath); err != nil {
 		t.Fatalf("apply: %v", err)
 	}
 	if _, err := os.Stat(confPath); !os.IsNotExist(err) {
 		t.Errorf("expected no file when cloud-init Updates is empty, got %v", err)
 	}
 }
 func TestApplyUpdatesAllFields(t *testing.T) {
 	confPath := filepath.Join(t.TempDir(), "update.conf")
 	cfg := &Config{Updates: UpdatesConfig{
 		Server:            "https://updates.example.com",
 		Channel:           "stable",
 		MaintenanceWindow: "03:00-05:00",
 		PubKey:            "/etc/kubesolo/pub.hex",
 	}}
 	if err := ApplyUpdates(cfg, confPath); err != nil {
 		t.Fatalf("apply: %v", err)
 	}
 	data, err := os.ReadFile(confPath)
 	if err != nil {
 		t.Fatalf("read: %v", err)
 	}
 	out := string(data)
 	wants := []string{
 		"server = https://updates.example.com",
 		"channel = stable",
 		"maintenance_window = 03:00-05:00",
 		"pubkey = /etc/kubesolo/pub.hex",
 	}
 	for _, w := range wants {
 		if !strings.Contains(out, w) {
 			t.Errorf("update.conf missing %q in output:\n%s", w, out)
 		}
 	}
 }
 func TestApplyUpdatesPartialFields(t *testing.T) {
 	// Only server set — others should be omitted from the file, not written
 	// as blank values.
 	confPath := filepath.Join(t.TempDir(), "update.conf")
 	cfg := &Config{Updates: UpdatesConfig{Server: "https://x.example.com"}}
 	if err := ApplyUpdates(cfg, confPath); err != nil {
 		t.Fatalf("apply: %v", err)
 	}
 	data, _ := os.ReadFile(confPath)
 	out := string(data)
 	if !strings.Contains(out, "server = https://x.example.com") {
 		t.Errorf("missing server line:\n%s", out)
 	}
 	for _, unwanted := range []string{"channel = ", "maintenance_window = ", "pubkey = "} {
 		if strings.Contains(out, unwanted) {
 			t.Errorf("unexpected empty line %q present in:\n%s", unwanted, out)
 		}
 	}
 }
 func TestApplyUpdatesCreatesParentDir(t *testing.T) {
 	// /etc/kubesolo may not exist on first boot before cloud-init runs.
 	confPath := filepath.Join(t.TempDir(), "nested", "kubesolo", "update.conf")
 	cfg := &Config{Updates: UpdatesConfig{Server: "https://x"}}
 	if err := ApplyUpdates(cfg, confPath); err != nil {
 		t.Fatalf("apply: %v", err)
 	}
 	if _, err := os.Stat(confPath); err != nil {
 		t.Errorf("file not created: %v", err)
 	}
 }
--- a/docs/arm64-architecture.md
+++ b/docs/arm64-architecture.md
@@ -0,0 +1,124 @@
 # ARM64 Build Architecture
 KubeSolo OS supports ARM64 via two distinct build tracks. This document defines the
 split, lists which files belong to each track, and identifies the shared substrate.
 ## The two tracks
 ### Generic ARM64 (UEFI / virtio / GRUB)
 **Target:** Any UEFI-compliant ARM64 host — Ampere/Graviton VMs, generic ARM64
 servers, `qemu-system-aarch64 -machine virt`, future SBCs that boot via UEFI.
 **Boot path:** UEFI firmware → GRUB-EFI → kernel + initramfs → KubeSolo init.
 **Kernel:** Mainline Linux (kernel.org LTS), built from `defconfig` + shared
 container-config fragment.
 **Storage:** virtio-blk / NVMe / SATA — detected and probed by mainline drivers.
 **Disk image format:** GPT, identical 4-partition layout to x86_64 (EFI + System A
 + System B + Data).
 ### Raspberry Pi ARM64
 **Target:** Raspberry Pi 4 and 5 specifically.
 **Boot path:** RPi EEPROM → VideoCore firmware (`start4.elf`) → `config.txt` →
 kernel + DTB + initramfs → KubeSolo init. (No UEFI, no GRUB — `autoboot.txt`
 provides the A/B selection.)
 **Kernel:** Built from `raspberrypi/linux` fork with `bcm2711_defconfig`
 (Pi 4) or `bcm2712_defconfig` (Pi 5). RPi-patched, includes BCM-specific drivers
 (sdhci-iproc, bcm2835-mmc, GPIO, mailbox).
 **Storage:** SD card via `sdhci-iproc` driver — requires kernel-built DTBs to match
 the kernel binary.
 **Disk image format:** MBR with `autoboot.txt` A/B redirect:
 - Part 1: Boot/Control (FAT32, firmware + fallback kernel)
 - Part 2: Boot A (FAT32, kernel + DTBs + initramfs)
 - Part 3: Boot B (FAT32, same as A initially)
 - Part 4: Data (ext4)
 ## File-by-file ownership
 ### Shared substrate (used by both tracks)
 | Path | Why shared |
 |------|------------|
 | `init/` (all of it) | Boot is identical post-kernel — same staged init, same persistent mount, same KubeSolo launch |
 | `cloud-init/` | Arch-agnostic Go binary |
 | `update/` | Arch-agnostic Go binary; bootenv abstraction handles GRUB vs RPi-autoboot variants |
 | `build/scripts/inject-kubesolo.sh` | Single script; switches `LIB_ARCH` / `LD_SO` based on `TARGET_ARCH` |
 | `build/scripts/extract-core.sh` | Single script; arm64 branch uses piCore64 userland (arch-agnostic BusyBox) |
 | `build/config/modules-arm64.list` | Already generic — no BCM-specific modules; works in QEMU virt, AWS Graviton, and RPi |
 | `build/config/rpi-kernel-config.fragment` | **Misnamed.** Contents (cgroup, namespaces, netfilter, AppArmor) are arch-agnostic. Will be renamed `kernel-container.fragment` in Phase 2 and applied to x86, generic-ARM64, and RPi kernels alike. |
 | `hack/dev-vm-arm64.sh` | Uses `-machine virt` + virtio — generic, not RPi-specific |
 | `test/qemu/test-boot-arm64.sh` | Same as above |
 ### Generic ARM64 only (to be created in Phases 2–3)
 | Path | Purpose |
 |------|---------|
 | `build/scripts/build-kernel-arm64.sh` *(rewritten in Phase 2)* | Build mainline kernel.org LTS from `defconfig` + shared fragment + arm64-virt enables (`VIRTIO_BLK`, `EFI_STUB`). Replaces the existing RPi-flavoured script of the same name. |
 | `build/scripts/create-disk-image-arm64.sh` *(new in Phase 3)* | Build UEFI-bootable raw disk image (GPT + System A/B + Data) using `grub-efi-arm64`. Or fold into existing `create-disk-image.sh` with an arch parameter. |
 | `build/cache/kernel-arm64-generic/` | Build output for mainline ARM64 kernel — keep separate from RPi-kernel cache. |
 ### Raspberry Pi only (to be renamed/reorganised in Phase 2)
 | Path | Purpose |
 |------|---------|
 | `build/scripts/build-kernel-rpi.sh` *(renamed from `build-kernel-arm64.sh`)* | Build kernel from `raspberrypi/linux` with `bcm2711_defconfig` + shared fragment + RPi-specific overrides. |
 | `build/scripts/create-rpi-image.sh` | Build SD card image (MBR + autoboot.txt + firmware blobs + DTBs). Already correctly scoped. |
 | `build/scripts/fetch-rpi-firmware.sh` | Download VideoCore firmware blobs from `raspberrypi/firmware`. Already correctly scoped. |
 | `build/config/rpi-kernel-overrides.fragment` *(new, Phase 2)* | Pi-specific kernel config knobs (DMA, audio off, etc.) layered on top of the shared container fragment. |
 | `build/cache/custom-kernel-rpi/` *(renamed from `custom-kernel-arm64/`)* | Build output for RPi kernel — DTBs, modules, Image. |
 | `versions.env` keys: `RPI_KERNEL_BRANCH`, `RPI_KERNEL_REPO`, `RPI_FIRMWARE_TAG`, `RPI_FIRMWARE_URL`, `PICORE_*` | Already correctly named. |
 ## Make targets
 | Target | Track |
 |--------|-------|
 | `make iso` | x86_64 |
 | `make disk-image` | x86_64 |
 | `make kernel` | x86_64 |
 | `make kernel-arm64` *(Phase 2: now builds mainline)* | Generic ARM64 |
 | `make rootfs-arm64` | Generic ARM64 (and reusable for RPi rootfs) |
 | `make disk-image-arm64` *(Phase 3: new)* | Generic ARM64 |
 | `make kernel-rpi` *(Phase 2: renamed from former kernel-arm64)* | RPi |
 | `make rpi-image` | RPi |
 ## Why two tracks, not one
 The RPi boot path is fundamentally different from generic ARM64:
 - **No UEFI.** RPi boots through a multi-stage firmware chain that ends with
  `config.txt` parsing and direct kernel load. UEFI/GRUB is not an option without
  third-party firmware (which has its own bugs).
 - **DTB required.** RPi kernel needs a device tree blob matching the kernel binary;
  generic ARM64 under UEFI uses ACPI or self-describing virtio.
 - **Custom drivers.** SD card (sdhci-iproc), GPIO, mailbox interfaces require
  RPi-patched kernel sources. Mainline support exists but lags behind the
  raspberrypi/linux fork for new boards.
 - **A/B selection mechanism.** RPi uses `autoboot.txt` + EEPROM cooperation; generic
  ARM64 uses GRUB's `boot_default`/`boot_counter` envvars (same as x86_64).
 Trying to unify into a single track would force compromises in both. Two tracks
 sharing the post-kernel substrate (init, cloud-init, update agent) gives us the best
 of both: code reuse where it makes sense, divergence only where the hardware demands
 it.
 ## Migration plan
 This document is descriptive of the **target** v0.3.0 layout. The current code
 (as of v0.2.0) has:
 - `build/scripts/build-kernel-arm64.sh` building the RPi kernel (will be renamed in
  Phase 2).
 - `build/config/rpi-kernel-config.fragment` containing generic configs (will be
  renamed in Phase 2).
 - No generic ARM64 kernel script (will be created in Phase 2).
 - No generic ARM64 disk image script (will be created in Phase 3).
 Phases 2 and 3 of the v0.3.0 plan execute the migration.
--- a/docs/arm64-status.md
+++ b/docs/arm64-status.md
@@ -0,0 +1,125 @@
 # ARM64 Generic Status (v0.3 in-progress)
 End-of-Phase-3 snapshot of the generic ARM64 build track.
 ## What works
 End-to-end boot through QEMU on an Odroid (aarch64 Ubuntu 22.04 build host):
 1. `make kernel-arm64` produces a mainline 6.12.10 LTS kernel (44 MB Image, 868
   modules)
 2. `make rootfs-arm64` extracts piCore64 userland, replaces BusyBox with
   Ubuntu's static busybox-static, injects KubeSolo + Go agents + init scripts
 3. `make disk-image-arm64` produces a UEFI-bootable 4 GB GPT image with GRUB
   A/B slots
 4. `hack/dev-vm-arm64.sh --disk` boots the image:
   - UEFI firmware loads GRUB
   - GRUB loads kernel + initramfs
   - Custom init runs all 14 stages (early-mount, parse-cmdline, persistent-mount,
     kernel-modules, apparmor, sysctl, cloud-init, network, hostname, clock,
     containerd, security-lockdown, kubesolo)
   - Data partition mounts (ext4 on vda4)
   - Network configured (DHCP on virtio eth0)
   - KubeSolo starts; containerd boots successfully; CoreDNS + pause images
     register
 ## Known limitations of the current dev setup
 These are debugging-environment issues, not production blockers:
 ### 1. QEMU TCG performance hits KubeSolo's image-import deadline
 KubeSolo bundles its essential container images and imports them into
 containerd on first boot. Under QEMU TCG (software emulation on the Odroid's
 1.8 GB / 6-core ARM64), the import takes longer than KubeSolo's internal
 deadline, so we see:
 ```
 failed to import images: ... context deadline exceeded
 shutdown requested before containerd was ready
 ```
 On real ARM64 hardware (Graviton, Ampere, RPi 5, etc.) this import completes
 in seconds. KVM acceleration on the Odroid would also fix it, but the
 Odroid's vendor kernel (4.9.337-38) doesn't ship the KVM module — fixing that
 requires a host-kernel upgrade outside this project's scope.
 ### 2. Hardcoded `/dev/vda4` data partition path
 Stage 20 currently expects `kubesolo.data=/dev/vda4` rather than
 `LABEL=KSOLODATA`. The LABEL= path is preferred (works regardless of disk
 naming on different hosts), but resolution depends on `blkid` and `findfs`,
 which:
 - piCore64 ships as dynamic util-linux binaries that crash in QEMU virt
 - Ubuntu's `busybox-static` 1.30.1 doesn't include the applets
 Production fix options (deferred to next phase):
 - Build a more comprehensive static BusyBox (Alpine's, or upstream + custom config)
 - Ship statically-linked `blkid` and `findfs` from util-linux
 - Replace LABEL resolution with a sysfs walk that reads `/sys/class/block/*/holders`
  and `/dev/<n>` device numbers
 ### 3. AppArmor profiles fail to load
 `apparmor_parser` errors on the containerd and kubelet profiles, probably
 because the parser binary or libraries copied from the build host don't
 match the rootfs's libc layout. Boot proceeds without AppArmor enforcement.
 Same fix path as #2 (better static binaries).
 ### 4. piCore64 BusyBox swap is a build-host dependency
 `inject-kubesolo.sh` replaces piCore's `/bin/busybox` with the build host's
 `/bin/busybox` (Ubuntu's busybox-static package). That binary must exist on
 the build host or in the builder Docker image. Documented; works in CI
 because the Dockerfile installs busybox-static.
 A more reproducible approach (future work): ship a known-good ARM64 BusyBox
 binary as a tracked artifact rather than depending on the host package.
 ### 5. busybox-static 1.30.1 has its own bugs
 Even after the swap, some applets misbehave inside QEMU:
 - `modprobe` triggers "stack smashing detected" abort (kernel modules still
  load via direct write to /sys/... in stage 30, so this isn't fatal)
 - `tr` doesn't parse POSIX character classes like `[:space:]` — already
  worked around by using explicit `' \t\r\n'` in our scripts
 - Missing applets: `blkid`, `findfs`, `--version`, etc.
 These won't necessarily manifest on real hardware (different CPU, different
 glibc interaction) but they confirm that 1.30.1 isn't the right long-term
 BusyBox.
 ## What's needed to ship v0.3 ARM64 as production-ready
 In order of priority:
 1. **Validate on real ARM64 hardware** — boot the image on a Graviton EC2
   instance, Ampere VPS, RPi 5 (when hardware available), or any UEFI-capable
   ARM64 board. Confirm full KubeSolo bring-up: node Ready, pods schedule.
 2. **Fix LABEL=KSOLODATA resolution** — see option list in #2 above.
 3. **Replace busybox-static with a curated build** — see #4.
 4. **Add a Gitea workflow** that runs `make kernel-arm64 + disk-image-arm64`
   on the Odroid runner and the QEMU boot-test as a smoke test (with the
   expectation that KubeSolo doesn't finish first-boot under TCG).
 ## Files exercised by the Phase 3 work
 | Path | Status |
 |------|--------|
 | `build/scripts/build-kernel-arm64.sh` | New — mainline 6.12.10 kernel build, native or cross |
 | `build/scripts/build-kernel-rpi.sh` | Renamed from old `build-kernel-arm64.sh` — RPi path |
 | `build/config/kernel-container.fragment` | Renamed from `rpi-kernel-config.fragment` |
 | `build/scripts/create-disk-image.sh` | Refactored — accepts `TARGET_ARCH=arm64` |
 | `build/grub/grub-arm64.cfg` | New — ARM64 console + `init=/sbin/init` |
 | `build/scripts/inject-kubesolo.sh` | Updated — BusyBox swap, `/init` install, variant routing |
 | `init/init.sh` | Updated — output to `/dev/console` for early-boot visibility |
 | `init/lib/30-kernel-modules.sh` | Fixed — `tr -d ' \t\r\n'` instead of `[:space:]` |
 | `init/lib/40-sysctl.sh` | Same fix |
 | `hack/dev-vm-arm64.sh` | Updated — `-cpu max`, UEFI `--disk` mode |
 | `test/qemu/test-boot-arm64-disk.sh` | New — CI test for UEFI boot |
 | `Makefile` | New targets: `kernel-arm64`, `kernel-rpi`, `disk-image-arm64`, `test-boot-arm64-disk`, `rootfs-arm64-rpi` |
 | `build/config/versions.env` | Pinned `MAINLINE_KERNEL_VERSION=6.12.10`, `KUBESOLO_VERSION=v1.1.0` |
 | `build/Dockerfile.builder` | Added `grub-efi-amd64-bin`, `grub-efi-arm64-bin`, `busybox-static` |
--- a/docs/ci-runners.md
+++ b/docs/ci-runners.md
@@ -0,0 +1,165 @@
 # CI Runners
 KubeSolo OS is built and tested on Gitea Actions runners. This document records the
 runners currently in service and how to register a new one if a host is wiped.
 ## Active runners
 | Name | Host | Arch | OS | Labels | Notes |
 |------|------|------|-----|--------|-------|
 | `odroid-arm64` | `odroid.local` | aarch64 | Ubuntu 22.04 LTS | `arm64-linux`, `ubuntu-latest`, `ubuntu-24.04`, `ubuntu-22.04` | Native ARM64 builder; 6 cores, 1.8 GB RAM + 4 GB swap; runs as systemd service `act_runner` |
 ## Workflow targeting
 ARM64-specific jobs target the Odroid via the `arm64-linux` label:
 ```yaml
 jobs:
  build-arm64:
    runs-on: arm64-linux
    steps:
      - uses: actions/checkout@v4
      - run: make rootfs-arm64
 ```
 Generic ubuntu jobs that don't care about arch fall through to whichever runner picks
 them up first; on the Odroid they run in Docker via the `ubuntu-latest` /
 `ubuntu-22.04` / `ubuntu-24.04` labels.
 ## Registering a new runner
 ### Prerequisites
 - Linux host (Ubuntu / Debian preferred; the install instructions below use Ubuntu
  22.04+ paths).
 - Outbound HTTPS to the Gitea instance.
 - Root access on the runner host (the runner needs to create loop devices and run
  `mkfs.ext4` for disk-image builds).
 - A Gitea Actions runner registration token. Get it from:
  - **Repo-scoped:** `<repo>/settings/actions/runners` → "Create new Runner"
  - **Org-scoped (preferred for this project):** `<org>/-/settings/actions/runners` →
    "Create new Runner"
  - **Site-scoped:** `/-/admin/actions/runners` → "Create new Runner"
 ### Step 1 — Add swap if the host has <4 GB RAM
 Kernel builds in later phases need ~2 GB resident; tight hosts will OOM-kill `cc1`
 without swap.
 ```bash
 sudo fallocate -l 4G /swapfile
 sudo chmod 600 /swapfile
 sudo mkswap /swapfile
 sudo swapon /swapfile
 echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab
 ```
 ### Step 2 — Install the gitea-runner binary
 Pinned to a known-good version. Check
 <https://gitea.com/gitea/runner/releases> for the current stable tag before
 bumping.
 ```bash
 sudo -i
 mkdir -p /opt/act_runner && cd /opt/act_runner
 # Bump VERSION to the current stable release as needed
 VERSION=1.0.3
 ARCH=$(uname -m | sed 's/aarch64/arm64/; s/x86_64/amd64/')
 curl -fL "https://gitea.com/gitea/runner/releases/download/v${VERSION}/gitea-runner-${VERSION}-linux-${ARCH}" \
  -o act_runner
 chmod +x act_runner
 ./act_runner --version
 ```
 > The upstream project was renamed `act_runner` → `gitea-runner` at the v1.0.0
 > release. The release asset filenames use `gitea-runner-*` even though we keep the
 > local binary named `act_runner` to match this systemd unit. The CLI surface
 > (`register`, `daemon`, `generate-config`) is unchanged.
 ### Step 3 — Register against Gitea
 ```bash
 ./act_runner register --no-interactive \
  --instance https://git.oe74.net \
  --token PASTE_TOKEN_HERE \
  --name <hostname> \
  --labels arm64-linux        # adjust label for amd64 hosts
 ```
 This creates a `.runner` file with the registration credentials.
 ### Step 4 — Generate and tune config
 ```bash
 ./act_runner generate-config > config.yaml
 ```
 In `config.yaml`, confirm the `runner.labels:` block includes the labels you want.
 The `:host` suffix routes jobs directly to the host (no Docker wrapper) — required
 for disk-image builds that need loop devices and `mkfs`.
 Example labels for an arm64 host:
 ```yaml
 runner:
  labels:
    - "arm64-linux:host"
    - "ubuntu-latest:docker://docker.gitea.com/runner-images:ubuntu-latest"
    - "ubuntu-24.04:docker://docker.gitea.com/runner-images:ubuntu-24.04"
    - "ubuntu-22.04:docker://docker.gitea.com/runner-images:ubuntu-22.04"
 ```
 ### Step 5 — Install as a systemd service
 ```bash
 cat > /etc/systemd/system/act_runner.service << 'EOF'
 [Unit]
 Description=Gitea Actions runner
 After=network-online.target
 Wants=network-online.target
 [Service]
 ExecStart=/opt/act_runner/act_runner daemon --config /opt/act_runner/config.yaml
 WorkingDirectory=/opt/act_runner
 User=root
 Restart=always
 RestartSec=5
 [Install]
 WantedBy=multi-user.target
 EOF
 systemctl daemon-reload
 systemctl enable --now act_runner
 systemctl status act_runner --no-pager
 ```
 ### Step 6 — Verify in Gitea UI
 Visit the runners page at the scope you registered against. The runner should appear
 as `Idle` with the labels you configured.
 ## Removing a runner
 On the host:
 ```bash
 systemctl disable --now act_runner
 rm -rf /opt/act_runner /etc/systemd/system/act_runner.service
 systemctl daemon-reload
 ```
 Then delete the runner entry from the Gitea Actions UI so Gitea stops trying to
 schedule against it.
 ## Operational notes
 - The runner stores in-progress job working directories under `/tmp/act_runner` by
  default. Large disk-image builds may need that path moved to a larger volume —
  edit `host.workdir_parent:` in `config.yaml`.
 - Logs are visible via `journalctl -u act_runner -f`.
 - If a job is interrupted (e.g. host reboot mid-build), the Gitea UI will mark it as
  failed/cancelled. Re-run from the Actions UI.
--- a/docs/release-notes-0.3.0.md
+++ b/docs/release-notes-0.3.0.md
@@ -0,0 +1,181 @@
 # KubeSolo OS v0.3.0 — Release Notes
 **Released:** 2026-05-14
 v0.3.0 is the second feature release after v0.2.0 and the first release that
 ships a generic ARM64 build alongside x86_64. The update agent grew up: it
 now has an explicit on-disk lifecycle, OCI registry distribution, and a
 fleet-friendly set of policy gates (channels, maintenance windows,
 version-stepping-stones, pre-flight checks, auto-rollback).
 This document is the operator-facing summary. The full per-phase changelog
 lives in [CHANGELOG.md](../CHANGELOG.md).
 ## What's new
 ### Generic ARM64 build
 The image you build with `make disk-image-arm64` now targets any UEFI-capable
 ARM64 host: AWS Graviton, Oracle Ampere, generic ARM64 servers, future SBCs
 with UEFI-compatible firmware. The kernel comes from kernel.org mainline LTS
 (6.12.10 by default, configurable via `MAINLINE_KERNEL_VERSION` in
 `build/config/versions.env`).
 This is **distinct** from the Raspberry Pi build path. RPi keeps its
 specialised kernel from `raspberrypi/linux` with bcm-defconfig + custom DTBs;
 the generic ARM64 path uses mainline + arm64-defconfig + UEFI/virtio. See
 [docs/arm64-architecture.md](arm64-architecture.md) for the file-by-file
 split.
 KubeSolo bumped to **v1.1.5** (was v1.1.0). New flags surfaced via cloud-init:
 - `kubesolo.full` — disable edge-optimised k8s overrides
 - `kubesolo.disable-ipv6` — disable IPv6 cluster-wide
 - `kubesolo.db-wal-repair` — recover from unclean shutdowns
 ### Update lifecycle is now observable
 The update agent writes a `state.json` at `/var/lib/kubesolo/update/state.json`
 recording where the current attempt is in the lifecycle:
 ```
 idle → checking → downloading → staged → activated → verifying → success
                                                              ↘ rolled_back
                                                              ↘ failed
 ```
 `kubesolo-update status --json` emits the full state for orchestration tooling.
 The Prometheus metrics endpoint gains three new series:
 - `kubesolo_update_phase{phase="..."}` — 1 for current phase, 0 for others (all 9 always emitted)
 - `kubesolo_update_attempts_total`
 - `kubesolo_update_last_attempt_timestamp_seconds`
 ### OCI registry distribution
 Update artifacts can now be pulled from any OCI-compliant registry alongside
 the existing HTTP `latest.json` protocol:
 ```bash
 # HTTP, unchanged from v0.2:
 kubesolo-update apply --server https://updates.example.com
 # New: OCI from ghcr.io (or quay.io, harbor, zot, ...)
 kubesolo-update apply --registry ghcr.io/yourorg/kubesolo-os --tag stable
 ```
 Multi-arch is handled transparently — the same `stable` tag points at a
 manifest index, the agent picks the manifest matching its `runtime.GOARCH`.
 Publish your own artifacts with `build/scripts/push-oci-artifact.sh`. See
 the script's header comment for the full publishing flow.
 ### Policy gates
 `apply` now enforces five gates before destroying the passive slot:
 1. **Maintenance window** (configurable, e.g. `03:00-05:00`; wrapping
   midnight supported)
 2. **Node-block-label** — refuses if the K8s node carries
   `updates.kubesolo.io/block=true` (workload-author kill switch)
 3. **Channel** — `stable` / `beta` / `edge` must match between the artifact
   metadata and the local channel
 4. **Architecture** — refuses cross-arch artifacts via `runtime.GOARCH` check
 5. **Min compatible version** — stepping-stone enforcement; refuses an
   upgrade that bypasses a required intermediate version
 `--force` bypasses the maintenance window and node-block label (channel /
 arch / min-version are non-negotiable). Failures are recorded in `state.json`
 with a clear `LastError` field.
 ### Healthcheck deepening + auto-rollback
 `kubesolo-update healthcheck` grew three optional probes:
 - **Kube-system pods** must hold Running for ≥ N seconds before passing
 - **Operator probe URL** — GET an operator-supplied endpoint; 200 = pass
 - **Disk smoke test** — write/fsync/read/delete a probe file under
  `/var/lib/kubesolo` to catch a wedged data partition
 Plus auto-rollback: with `--auto-rollback-after N` (or `auto_rollback_after=`
 in `update.conf`), after N consecutive post-activation failures, the agent
 calls `ForceRollback()` and the operator/init is expected to reboot. The
 counter resets on a clean pass.
 ### Persistent configuration via `/etc/kubesolo/update.conf`
 Cloud-init writes this file on first boot from a new `updates:` block; you
 can also hand-edit it. Recognised keys:
 ```
 server = https://updates.example.com         # or omit if using registry
 registry =                                   # OCI registry ref (alt to server)
 channel = stable
 maintenance_window = 03:00-05:00
 pubkey = /etc/kubesolo/update-pubkey.hex
 healthcheck_url = http://localhost:8000/ready
 auto_rollback_after = 3
 ```
 Cloud-init full reference at
 [cloud-init/examples/full-config.yaml](../cloud-init/examples/full-config.yaml).
 ## Migration from v0.2.x
 This is a non-breaking release for live systems. v0.2.x → v0.3.0 changes:
 - **`state.json` will appear** at `/var/lib/kubesolo/update/state.json` the
  first time a v0.3 agent runs `apply`. Pre-existing v0.2 deployments without
  this file are fine — the agent treats a missing file as fresh Idle state.
 - **`update.conf` is optional**. v0.2 deployments that pass everything via
  CLI flags keep working unchanged.
 - **HTTP `latest.json` protocol unchanged**. Existing update servers don't
  need a rebuild.
 - **GRUB env (boot counter, active slot)** unchanged. The bootloader's
  rollback behaviour is the same.
 - **No new mandatory kernel command-line parameters**.
 To opt into the new lifecycle, transports, and gates, drop in an
 `update.conf` (or update cloud-init) and switch to `--registry` if you want
 OCI distribution.
 ## Known limitations
 These shipped intentionally with v0.3.0 and are explicitly tracked for
 v0.3.1+:
 - **OCI signature verification** — the OCI transport is digest-verified
  end-to-end via oras-go, but does not yet consume cosign-style referrer
  attestations. The HTTP transport still honours `--pubkey` for `.sig`
  files.
 - **ARM64 LABEL=KSOLODATA** resolution doesn't work yet — piCore's
  `blkid`/`findfs` crash on QEMU virt under our mainline kernel; the
  static `busybox-static` we ship doesn't include those applets.
  `build/grub/grub-arm64.cfg` hardcodes `kubesolo.data=/dev/vda4` as a
  workaround. On real ARM64 hardware the device path may differ.
 - **Real-hardware ARM64 validation** is pending. The image builds and
  boots end-to-end under QEMU virt; production certification waits on a
  Graviton / Ampere run.
 - **AppArmor profile load fails on ARM64** (`apparmor_parser` ABI mismatch).
  Init reports the failure; boot continues without AppArmor enforcement.
 - **QEMU TCG performance** can trigger KubeSolo's first-boot image-import
  deadline. Not an OS defect; real hardware and KVM-accelerated QEMU
  complete the import in seconds.
 ## How to upgrade your build host
 ```bash
 git pull
 make distclean   # optional — drops the build cache; full rebuild takes ~30 min
 make iso         # or disk-image, or disk-image-arm64
 ```
 The Docker-based builder (`make docker-build`) regenerates its own image
 from `build/Dockerfile.builder` on next invocation; oras 1.2.3 and
 busybox-static are now included.
 ## Acknowledgements
 v0.3.0 work was driven by a single multi-week pair-programming session
 working through Phases 0–9 of the v0.3 roadmap. The Odroid self-hosted
 Gitea Actions runner (`odroid.local`, arm64-linux) carried every ARM64
 build during development.
--- a/hack/dev-vm-arm64.sh
+++ b/hack/dev-vm-arm64.sh
@@ -1,64 +1,167 @@
 #!/bin/bash
 # dev-vm-arm64.sh — Launch ARM64 QEMU VM for development
 #
-# Uses qemu-system-aarch64 with -machine virt to emulate an ARM64 system.
+# Two modes:
-# This is useful for testing ARM64/RPi builds on x86_64 hosts.
+#
 #   Default (direct kernel boot — fast iteration):
 #     qemu loads the kernel Image + initramfs directly via -kernel/-initrd.
 #     Skips bootloader, UEFI firmware, and disk image entirely.
 #     Use this for kernel and init-script changes.
 #
 #   --disk (full UEFI boot — integration testing):
 #     qemu boots the .arm64.img disk image via UEFI firmware -> GRUB -> kernel.
 #     Exercises the full boot chain. Use this when changing the disk image
 #     layout, GRUB config, or anything that touches the EFI partition.
 #
 # Usage:
-#   ./hack/dev-vm-arm64.sh                           # Use default kernel + initramfs
+#   ./hack/dev-vm-arm64.sh                       # direct kernel boot (default)
-#   ./hack/dev-vm-arm64.sh <kernel> <initramfs>      # Specify custom paths
+#   ./hack/dev-vm-arm64.sh --disk                # full UEFI boot from built image
-#   ./hack/dev-vm-arm64.sh --debug                   # Enable debug logging
+#   ./hack/dev-vm-arm64.sh --debug               # enable kubesolo.debug
-#   ./hack/dev-vm-arm64.sh --shell                   # Drop to emergency shell
+#   ./hack/dev-vm-arm64.sh --shell               # drop to emergency shell
 #   ./hack/dev-vm-arm64.sh --disk /path/to.img   # boot a specific disk image
 #   ./hack/dev-vm-arm64.sh <kernel> <initramfs>  # direct boot with custom files
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
 VERSION="$(cat "$PROJECT_ROOT/VERSION")"
 MODE="kernel"           # kernel | disk
 VMLINUZ=""
 INITRD=""
 DISK_IMAGE=""
 EXTRA_APPEND=""
-# Parse arguments
+while [ $# -gt 0 ]; do
-for arg in "$@"; do
+    case "$1" in
-    case "$arg" in
+        --shell)   EXTRA_APPEND="$EXTRA_APPEND kubesolo.shell"; shift ;;
-        --shell)   EXTRA_APPEND="$EXTRA_APPEND kubesolo.shell" ;;
+        --debug)   EXTRA_APPEND="$EXTRA_APPEND kubesolo.debug"; shift ;;
-        --debug)   EXTRA_APPEND="$EXTRA_APPEND kubesolo.debug" ;;
+        --disk)
-        *)
+            MODE="disk"
-            if [ -z "$VMLINUZ" ]; then
+            shift
-                VMLINUZ="$arg"
+            # Optional next-arg as disk image path
-            elif [ -z "$INITRD" ]; then
+            if [ $# -gt 0 ] && [ -f "$1" ]; then
-                INITRD="$arg"
+                DISK_IMAGE="$1"
                shift
            fi
            ;;
        *)
            if [ "$MODE" = "kernel" ] && [ -z "$VMLINUZ" ]; then
                VMLINUZ="$1"
            elif [ "$MODE" = "kernel" ] && [ -z "$INITRD" ]; then
                INITRD="$1"
            fi
            shift
            ;;
    esac
 done
-# Defaults
+# ---------------------------------------------------------------------------
-VMLINUZ="${VMLINUZ:-$PROJECT_ROOT/build/cache/custom-kernel-arm64/Image}"
+# UEFI firmware probe (used for --disk mode)
 # ---------------------------------------------------------------------------
 find_uefi_firmware() {
    local candidates=(
        /usr/share/qemu-efi-aarch64/QEMU_EFI.fd
        /usr/share/AAVMF/AAVMF_CODE.fd
        /usr/share/edk2/aarch64/QEMU_EFI.fd
        /usr/share/qemu/edk2-aarch64-code.fd
        /opt/homebrew/share/qemu/edk2-aarch64-code.fd
        /usr/local/share/qemu/edk2-aarch64-code.fd
    )
    for f in "${candidates[@]}"; do
        [ -f "$f" ] && echo "$f" && return 0
    done
    return 1
 }
 # ---------------------------------------------------------------------------
 # mkfs.ext4 probe (kernel mode creates a scratch data disk)
 # ---------------------------------------------------------------------------
 find_mkfs_ext4() {
    if command -v mkfs.ext4 >/dev/null 2>&1; then
        echo "mkfs.ext4"
    elif [ -x "/opt/homebrew/opt/e2fsprogs/sbin/mkfs.ext4" ]; then
        echo "/opt/homebrew/opt/e2fsprogs/sbin/mkfs.ext4"
    elif [ -x "/usr/local/opt/e2fsprogs/sbin/mkfs.ext4" ]; then
        echo "/usr/local/opt/e2fsprogs/sbin/mkfs.ext4"
    fi
 }
 # ===========================================================================
 # Disk mode: boot the built .arm64.img through UEFI firmware + GRUB
 # ===========================================================================
 if [ "$MODE" = "disk" ]; then
    DISK_IMAGE="${DISK_IMAGE:-$PROJECT_ROOT/output/kubesolo-os-${VERSION}.arm64.img}"
    if [ ! -f "$DISK_IMAGE" ]; then
        echo "ERROR: Disk image not found: $DISK_IMAGE"
        echo "  Run 'make disk-image-arm64' to build it."
        exit 1
    fi
    UEFI_FW="$(find_uefi_firmware || true)"
    if [ -z "$UEFI_FW" ]; then
        echo "ERROR: No ARM64 UEFI firmware found."
        echo "  Install one of:"
        echo "    apt install qemu-efi-aarch64        # Debian/Ubuntu"
        echo "    dnf install edk2-aarch64            # Fedora/RHEL"
        echo "    brew install qemu                   # macOS (bundled)"
        exit 1
    fi
    # Pad UEFI firmware variable store to 64 MiB if QEMU expects pflash sizing.
    # Most ARM64 EFI .fd files are 64 MB; if yours is smaller, QEMU may refuse.
    echo "==> Launching ARM64 QEMU (UEFI disk boot)..."
    echo "    Firmware: $UEFI_FW"
    echo "    Disk:     $DISK_IMAGE"
    echo ""
    echo "    K8s API: localhost:6443"
    echo "    SSH:     localhost:2222"
    echo "    Press Ctrl+A X to exit QEMU"
    echo ""
    # -cpu max enables all emulated ARMv8 features (atomics, crypto, fp16).
    # piCore64's BusyBox is built with -march=armv8-a+crypto+lse and segfaults
    # under -cpu cortex-a72 because some required extensions aren't on by
    # default in that model.
    qemu-system-aarch64 \
        -machine virt \
        -cpu max \
        -m 2048 \
        -smp 2 \
        -nographic \
        -bios "$UEFI_FW" \
        -drive "file=$DISK_IMAGE,format=raw,if=virtio,media=disk" \
        -net "nic,model=virtio" \
        -net "user,hostfwd=tcp::6443-:6443,hostfwd=tcp::2222-:22"
    exit 0
 fi
 # ===========================================================================
 # Kernel mode (default): direct -kernel / -initrd, fast iteration
 # ===========================================================================
 VMLINUZ="${VMLINUZ:-$PROJECT_ROOT/build/cache/kernel-arm64-generic/Image}"
 INITRD="${INITRD:-$PROJECT_ROOT/build/rootfs-work/kubesolo-os.gz}"
-# Verify files exist
+# Fallback: previous-generation RPi kernel cache, in case someone hasn't yet
 # rebuilt under v0.3 paths.
 if [ ! -f "$VMLINUZ" ] && [ -f "$PROJECT_ROOT/build/cache/custom-kernel-rpi/Image" ]; then
    VMLINUZ="$PROJECT_ROOT/build/cache/custom-kernel-rpi/Image"
    echo "==> Note: falling back to RPi kernel ($VMLINUZ)"
 fi
 if [ ! -f "$VMLINUZ" ]; then
    echo "ERROR: Kernel not found: $VMLINUZ"
-    echo "  Run 'make kernel-arm64' to build the ARM64 kernel."
+    echo "  Run 'make kernel-arm64' (generic) or 'make kernel-rpi' to build a kernel."
    exit 1
 fi
 if [ ! -f "$INITRD" ]; then
    echo "ERROR: Initrd not found: $INITRD"
-    echo "  Run 'make initramfs' to build the initramfs."
+    echo "  Run 'make rootfs-arm64' to build the initramfs."
    exit 1
 fi
-# Find mkfs.ext4
+MKFS_EXT4="$(find_mkfs_ext4)"
 MKFS_EXT4=""
 if command -v mkfs.ext4 >/dev/null 2>&1; then
    MKFS_EXT4="mkfs.ext4"
 elif [ -x "/opt/homebrew/opt/e2fsprogs/sbin/mkfs.ext4" ]; then
    MKFS_EXT4="/opt/homebrew/opt/e2fsprogs/sbin/mkfs.ext4"
 elif [ -x "/usr/local/opt/e2fsprogs/sbin/mkfs.ext4" ]; then
    MKFS_EXT4="/usr/local/opt/e2fsprogs/sbin/mkfs.ext4"
 fi
 if [ -z "$MKFS_EXT4" ]; then
    echo "ERROR: mkfs.ext4 not found. Install e2fsprogs:"
    if [ "$(uname)" = "Darwin" ]; then
@@ -70,13 +173,12 @@ if [ -z "$MKFS_EXT4" ]; then
    exit 1
 fi
 # Create data disk
 DATA_DISK="$(mktemp /tmp/kubesolo-arm64-data-XXXXXX).img"
 dd if=/dev/zero of="$DATA_DISK" bs=1M count=1024 2>/dev/null
 "$MKFS_EXT4" -q -L KSOLODATA "$DATA_DISK" 2>/dev/null
 trap 'rm -f "$DATA_DISK"' EXIT
-echo "==> Launching ARM64 QEMU VM..."
+echo "==> Launching ARM64 QEMU (direct kernel boot)..."
 echo "    Kernel: $VMLINUZ"
 echo "    Initrd: $INITRD"
 echo "    Data:   $DATA_DISK"
@@ -88,7 +190,7 @@ echo ""
 qemu-system-aarch64 \
    -machine virt \
-    -cpu cortex-a72 \
+    -cpu max \
    -m 2048 \
    -smp 2 \
    -nographic \
--- a/init/init.sh
+++ b/init/init.sh
@@ -14,6 +14,11 @@
 #   kubesolo.cloudinit=<path> Path to cloud-init config
 #   kubesolo.flags=<flags>    Extra flags for KubeSolo binary
 # Route early boot output to /dev/console — before switch_root the kernel may
 # not have a controlling tty, and some stages echo to stderr expecting it to
 # reach the serial console. This is a no-op once the staged init proper starts.
 exec >/dev/console 2>&1
 set -e
 # --- Switch root: escape initramfs so runc pivot_root works ---
--- a/init/lib/20-persistent-mount.sh
+++ b/init/lib/20-persistent-mount.sh
@@ -58,12 +58,46 @@ esac
 if [ ! -b "$KUBESOLO_DATA_DEV" ]; then
    log_err "Data device $KUBESOLO_DATA_DEV not found after ${WAIT_SECS}s"
-    # Show available block devices for debugging
+    # Comprehensive diagnostics for block device failure
-    log_err "Available block devices:"
+    log_err "=== Block device diagnostics ==="
-    ls -la /dev/mmc* /dev/sd* /dev/vd* 2>/dev/null | while read -r line; do
+    log_err "--- /dev block devices ---"
    ls -la /dev/mmc* /dev/sd* /dev/vd* /dev/nvme* 2>/dev/null | while read -r line; do
        log_err "  $line"
    done
-    return 1
+    log_err "--- /sys/class/block (kernel registered) ---"
    ls /sys/class/block/ 2>/dev/null | while read -r line; do
        log_err "  $line"
    done
    log_err "--- dmesg: MMC/SDHCI/emmc ---"
    dmesg 2>/dev/null | grep -i -e mmc -e sdhci -e emmc | while read -r line; do
        log_err "  $line"
    done
    log_err "--- dmesg: regulator ---"
    dmesg 2>/dev/null | grep -i regulator | while read -r line; do
        log_err "  $line"
    done
    log_err "--- dmesg: firmware/mailbox ---"
    dmesg 2>/dev/null | grep -i -e 'raspberrypi' -e 'mailbox' -e 'firmware' | while read -r line; do
        log_err "  $line"
    done
    log_err "--- dmesg: errors ---"
    dmesg 2>/dev/null | grep -i -e 'error' -e 'fail' -e 'unable' | while read -r line; do
        log_err "  $line"
    done
    log_err "--- Full dmesg (last 60 lines) ---"
    dmesg 2>/dev/null | tail -60 | while read -r line; do
        log_err "  $line"
    done
    log_err "=== End diagnostics ==="
    log_err ""
    log_err "Dropping to debug shell in 10 seconds..."
    log_err "Run 'dmesg' to see full kernel log."
    log_err "Run 'ls /sys/class/block/' to check block devices."
    log_err ""
    sleep 10
    # Drop to interactive shell instead of returning failure
    # (returning 1 with set -e causes kernel panic before emergency_shell)
    exec /bin/sh </dev/console >/dev/console 2>&1
 fi
 # Mount data partition (format on first boot if unformatted)
--- a/init/lib/30-kernel-modules.sh
+++ b/init/lib/30-kernel-modules.sh
@@ -16,7 +16,11 @@ while IFS= read -r mod; do
    case "$mod" in
        '#'*|'') continue ;;
    esac
-    mod="$(echo "$mod" | tr -d '[:space:]')"
+    # NOTE: do NOT use tr -d '[:space:]' — Ubuntu's busybox-static 1.30.1 (used
    # in the ARM64 rootfs override) doesn't parse POSIX char classes and treats
    # them as a literal set, deleting [, :, s, p, a, c, e, ]. Use explicit
    # whitespace chars instead so the same script works under any tr.
    mod="$(printf '%s' "$mod" | tr -d ' \t\r\n')"
    if modprobe "$mod" 2>/dev/null; then
        LOADED=$((LOADED + 1))
    else
--- a/init/lib/40-sysctl.sh
+++ b/init/lib/40-sysctl.sh
@@ -8,8 +8,11 @@ for conf in /etc/sysctl.d/*.conf; do
        case "$key" in
            '#'*|'') continue ;;
        esac
-        key="$(echo "$key" | tr -d '[:space:]')"
+        # NOTE: do NOT use tr -d '[:space:]' — see 30-kernel-modules.sh for the
-        value="$(echo "$value" | tr -d '[:space:]')"
+        # rationale. Use explicit whitespace chars so this works under
        # Ubuntu's busybox-static tr too.
        key="$(printf '%s' "$key" | tr -d ' \t\r\n')"
        value="$(printf '%s' "$value" | tr -d ' \t\r\n')"
        if [ -n "$key" ] && [ -n "$value" ]; then
            sysctl -w "${key}=${value}" >/dev/null 2>&1 || \
                log_warn "Failed to set sysctl: ${key}=${value}"
--- a/test/qemu/test-boot-arm64-disk.sh
+++ b/test/qemu/test-boot-arm64-disk.sh
@@ -0,0 +1,129 @@
 #!/bin/bash
 # test-boot-arm64-disk.sh — Boot the ARM64 .arm64.img via UEFI + GRUB and
 # verify the init system reaches stage 90.
 #
 # This is the full-stack integration test: UEFI firmware -> GRUB -> kernel ->
 # initramfs -> staged init. Contrast with test-boot-arm64.sh which skips the
 # bootloader and loads kernel/initramfs directly.
 #
 # Exit 0 = PASS, Exit 1 = FAIL.
 #
 # Usage: ./test/qemu/test-boot-arm64-disk.sh [disk.img]
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 VERSION="$(cat "$PROJECT_ROOT/VERSION")"
 DISK_IMAGE="${1:-$PROJECT_ROOT/output/kubesolo-os-${VERSION}.arm64.img}"
 TIMEOUT=180
 echo "==> ARM64 UEFI Disk Boot Test"
 echo "    Disk image: $DISK_IMAGE"
 echo "    Timeout:    ${TIMEOUT}s"
 if [ ! -f "$DISK_IMAGE" ]; then
    echo "ERROR: Disk image not found: $DISK_IMAGE"
    echo "  Run 'make disk-image-arm64' to build it."
    exit 1
 fi
 if ! command -v qemu-system-aarch64 >/dev/null 2>&1; then
    echo "ERROR: qemu-system-aarch64 not found."
    echo "  apt install qemu-system-arm   # Debian/Ubuntu"
    echo "  dnf install qemu-system-aarch64  # Fedora/RHEL"
    exit 1
 fi
 # --- Locate UEFI firmware ---
 UEFI_FW=""
 for candidate in \
    /usr/share/qemu-efi-aarch64/QEMU_EFI.fd \
    /usr/share/AAVMF/AAVMF_CODE.fd \
    /usr/share/edk2/aarch64/QEMU_EFI.fd \
    /usr/share/qemu/edk2-aarch64-code.fd \
    /opt/homebrew/share/qemu/edk2-aarch64-code.fd \
    /usr/local/share/qemu/edk2-aarch64-code.fd
 do
    if [ -f "$candidate" ]; then
        UEFI_FW="$candidate"
        break
    fi
 done
 if [ -z "$UEFI_FW" ]; then
    echo "ERROR: No ARM64 UEFI firmware found."
    echo "  apt install qemu-efi-aarch64"
    exit 1
 fi
 echo "    UEFI fw:    $UEFI_FW"
 # Copy disk image to a scratch file so the test doesn't mutate the source.
 # UEFI will write to grubenv on the EFI partition; we don't want to bake those
 # changes into the canonical build artifact.
 SCRATCH_DISK=$(mktemp /tmp/kubesolo-arm64-disk-test-XXXXXX.img)
 SERIAL_LOG=$(mktemp /tmp/kubesolo-arm64-disk-serial-XXXXXX.log)
 QEMU_PID=""
 cleanup() {
    [ -n "$QEMU_PID" ] && kill "$QEMU_PID" 2>/dev/null || true
    rm -f "$SCRATCH_DISK" "$SERIAL_LOG"
 }
 trap cleanup EXIT
 cp --reflink=auto "$DISK_IMAGE" "$SCRATCH_DISK" 2>/dev/null || cp "$DISK_IMAGE" "$SCRATCH_DISK"
 # --- Launch QEMU ---
 qemu-system-aarch64 \
    -machine virt \
    -cpu cortex-a72 \
    -m 2048 \
    -smp 2 \
    -nographic \
    -bios "$UEFI_FW" \
    -drive "file=$SCRATCH_DISK,format=raw,if=virtio,media=disk" \
    -net nic,model=virtio \
    -net user \
    -serial "file:$SERIAL_LOG" &
 QEMU_PID=$!
 echo "    Waiting for boot (PID $QEMU_PID)..."
 ELAPSED=0
 SUCCESS=0
 while [ "$ELAPSED" -lt "$TIMEOUT" ]; do
    if grep -q "\[kubesolo-init\] \[OK\] Stage 90-kubesolo.sh complete" "$SERIAL_LOG" 2>/dev/null; then
        SUCCESS=1
        break
    fi
    if grep -q "KubeSolo is running" "$SERIAL_LOG" 2>/dev/null; then
        SUCCESS=1
        break
    fi
    if ! kill -0 "$QEMU_PID" 2>/dev/null; then
        echo ""
        echo "==> FAIL: QEMU exited prematurely"
        echo "    Last 30 lines of serial output:"
        tail -30 "$SERIAL_LOG" 2>/dev/null || echo "    (no output)"
        exit 1
    fi
    sleep 2
    ELAPSED=$((ELAPSED + 2))
    printf "\r    Elapsed: %ds / %ds" "$ELAPSED" "$TIMEOUT"
 done
 echo ""
 kill "$QEMU_PID" 2>/dev/null || true
 wait "$QEMU_PID" 2>/dev/null || true
 QEMU_PID=""
 if [ "$SUCCESS" = "1" ]; then
    echo "==> ARM64 UEFI Disk Boot Test PASSED (${ELAPSED}s)"
    exit 0
 fi
 echo "==> ARM64 UEFI Disk Boot Test FAILED (timeout ${TIMEOUT}s)"
 echo ""
 echo "==> Last 50 lines of serial output:"
 tail -50 "$SERIAL_LOG" 2>/dev/null || echo "    (no output)"
 exit 1
--- a/update/cmd/activate.go
+++ b/update/cmd/activate.go
@@ -3,23 +3,35 @@ package cmd
 import (
 	"fmt"
 	"log/slog"
 	"github.com/portainer/kubesolo-os/update/pkg/state"
 )
 // Activate switches the boot target to the passive partition.
 // After activation, the next reboot will boot from the new partition
 // with boot_counter=3. If health checks fail 3 times, GRUB auto-rolls back.
 //
 // State transition: Staged → Activated. On failure → Failed.
 func Activate(args []string) error {
 	opts := parseOpts(args)
 	env := opts.NewBootEnv()
 	st, err := state.Load(opts.StatePath)
 	if err != nil {
 		slog.Warn("state file unreadable, starting fresh", "error", err)
 		st = state.New()
 	}
 	// Get passive slot (the one we want to boot into)
 	passiveSlot, err := env.PassiveSlot()
 	if err != nil {
 		_ = st.RecordError(opts.StatePath, fmt.Errorf("reading passive slot: %w", err))
 		return fmt.Errorf("reading passive slot: %w", err)
 	}
 	activeSlot, err := env.ActiveSlot()
 	if err != nil {
 		_ = st.RecordError(opts.StatePath, fmt.Errorf("reading active slot: %w", err))
 		return fmt.Errorf("reading active slot: %w", err)
 	}
@@ -27,9 +39,14 @@ func Activate(args []string) error {
 	// Set the passive slot as active with fresh boot counter
 	if err := env.ActivateSlot(passiveSlot); err != nil {
 		_ = st.RecordError(opts.StatePath, fmt.Errorf("activating slot %s: %w", passiveSlot, err))
 		return fmt.Errorf("activating slot %s: %w", passiveSlot, err)
 	}
 	if err := st.Transition(opts.StatePath, state.PhaseActivated, "", ""); err != nil {
 		slog.Warn("state transition failed", "phase", state.PhaseActivated, "error", err)
 	}
 	fmt.Printf("Slot %s activated (was %s)\n", passiveSlot, activeSlot)
 	fmt.Println("Boot counter set to 3. Reboot to start the new version.")
 	fmt.Println("The system will automatically roll back if health checks fail 3 times.")
--- a/update/cmd/apply.go
+++ b/update/cmd/apply.go
@@ -1,73 +1,240 @@
 package cmd
 import (
 	"context"
 	"fmt"
 	"log/slog"
 	"os"
 	"runtime"
 	"time"
 	"github.com/portainer/kubesolo-os/update/pkg/config"
 	"github.com/portainer/kubesolo-os/update/pkg/health"
 	"github.com/portainer/kubesolo-os/update/pkg/image"
 	"github.com/portainer/kubesolo-os/update/pkg/oci"
 	"github.com/portainer/kubesolo-os/update/pkg/partition"
 	"github.com/portainer/kubesolo-os/update/pkg/state"
 )
 // applyMetadataGates enforces channel / architecture / min-version policy on
 // resolved update metadata, regardless of transport (HTTP or OCI). Records
 // any failure to the state file before returning.
 func applyMetadataGates(opts opts, st *state.UpdateState, meta *image.UpdateMetadata) error {
 	if meta.Channel != "" && meta.Channel != opts.Channel {
 		err := fmt.Errorf("metadata channel %q does not match local channel %q",
 			meta.Channel, opts.Channel)
 		_ = st.RecordError(opts.StatePath, err)
 		return err
 	}
 	if meta.Architecture != "" && meta.Architecture != runtime.GOARCH {
 		err := fmt.Errorf("metadata architecture %q does not match runtime %q",
 			meta.Architecture, runtime.GOARCH)
 		_ = st.RecordError(opts.StatePath, err)
 		return err
 	}
 	if meta.MinCompatibleVersion != "" && st.FromVersion != "" {
 		cmp, cerr := config.CompareVersions(st.FromVersion, meta.MinCompatibleVersion)
 		if cerr != nil {
 			slog.Warn("min-version comparison failed", "error", cerr,
 				"from", st.FromVersion, "min", meta.MinCompatibleVersion)
 		} else if cmp < 0 {
 			err := fmt.Errorf("current version %s is below min_compatible_version %s; install %s first",
 				st.FromVersion, meta.MinCompatibleVersion, meta.MinCompatibleVersion)
 			_ = st.RecordError(opts.StatePath, err)
 			return err
 		}
 	}
 	return nil
 }
 // Apply downloads a new OS image and writes it to the passive partition.
 // It does NOT activate the new partition — use 'activate' for that.
 //
 // State transitions: Idle/Success/Failed → Checking → Downloading → Staged.
 // On any error the state moves to Failed with LastError set.
 func Apply(args []string) error {
 	opts := parseOpts(args)
-	if opts.ServerURL == "" {
+	if opts.ServerURL == "" && opts.Registry == "" {
-		return fmt.Errorf("--server is required")
+		return fmt.Errorf("--server or --registry is required (or set in /etc/kubesolo/update.conf)")
 	}
 	if opts.ServerURL != "" && opts.Registry != "" {
 		return fmt.Errorf("--server and --registry are mutually exclusive")
 	}
 	// Maintenance window gate — earliest cheap check, before any HTTP work.
 	// Skipped with --force.
 	window, werr := config.ParseWindow(opts.MaintenanceWindow)
 	if werr != nil {
 		return fmt.Errorf("parse maintenance_window: %w", werr)
 	}
 	if !opts.Force && !window.Contains(time.Now()) {
 		return fmt.Errorf("outside maintenance window (%s); pass --force to override",
 			window.String())
 	}
 	// Node-block-label gate — workload authors can defer an update by
 	// labeling the node updates.kubesolo.io/block=true. Skipped with --force
 	// and silently bypassed when the K8s API isn't reachable (air-gap).
 	if !opts.Force {
 		blocked, berr := health.CheckNodeBlocked("")
 		if berr != nil {
 			slog.Warn("node-block check failed, allowing update", "error", berr)
 		} else if blocked {
 			return fmt.Errorf("node carries label %s=true; refusing update (pass --force to override)",
 				health.NodeBlockLabel)
 		}
 	}
 	st, err := state.Load(opts.StatePath)
 	if err != nil {
 		// Don't block the operation on a corrupt state file. Log + recover.
 		slog.Warn("state file unreadable, starting fresh", "error", err)
 		st = state.New()
 	}
 	env := opts.NewBootEnv()
 	// Record the current running version as the "from" reference. The active
 	// slot's version file is the most reliable source.
 	activeSlot, slotErr := env.ActiveSlot()
 	if slotErr == nil {
 		if partInfo, perr := partition.GetSlotPartition(activeSlot); perr == nil {
 			mp := "/tmp/kubesolo-active-" + activeSlot
 			if merr := partition.MountReadOnly(partInfo.Device, mp); merr == nil {
 				if v, rerr := partition.ReadVersion(mp); rerr == nil {
 					st.SetFromVersion(v)
 				}
 				partition.Unmount(mp)
 			}
 		}
 	}
 	// Determine passive slot
 	passiveSlot, err := env.PassiveSlot()
 	if err != nil {
 		_ = st.RecordError(opts.StatePath, fmt.Errorf("reading passive slot: %w", err))
 		return fmt.Errorf("reading passive slot: %w", err)
 	}
 	slog.Info("applying update", "target_slot", passiveSlot)
 	// Check for update
 	stageDir := "/tmp/kubesolo-update-stage"
 	client := image.NewClient(opts.ServerURL, stageDir)
 	defer client.Cleanup()
-	// Enable signature verification if public key is configured
+	if err := st.Transition(opts.StatePath, state.PhaseChecking, "", ""); err != nil {
-	if opts.PubKeyPath != "" {
+		slog.Warn("state transition failed", "phase", state.PhaseChecking, "error", err)
 		client.SetPublicKeyPath(opts.PubKeyPath)
 		slog.Info("signature verification enabled", "pubkey", opts.PubKeyPath)
 	}
-	meta, err := client.CheckForUpdate()
+	// Resolve metadata via the configured transport. OCI registry mode pulls
-	if err != nil {
+	// the manifest only; HTTP mode hits latest.json.
-		return fmt.Errorf("checking for update: %w", err)
+	var (
 		meta   *image.UpdateMetadata
 		staged *image.StagedImage
 	)
 	if opts.Registry != "" {
 		ociClient, err := oci.NewClient(opts.Registry)
 		if err != nil {
 			_ = st.RecordError(opts.StatePath, fmt.Errorf("oci client: %w", err))
 			return fmt.Errorf("oci client: %w", err)
 		}
 		tag := opts.Tag
 		if tag == "" {
 			tag = opts.Channel
 		}
 		if tag == "" {
 			tag = "stable"
 		}
 		meta, err = ociClient.FetchMetadata(context.Background(), tag)
 		if err != nil {
 			_ = st.RecordError(opts.StatePath, fmt.Errorf("oci fetch metadata: %w", err))
 			return fmt.Errorf("oci fetch metadata: %w", err)
 		}
 		if err := applyMetadataGates(opts, st, meta); err != nil {
 			return err
 		}
 		if err := st.Transition(opts.StatePath, state.PhaseDownloading, meta.Version, ""); err != nil {
 			slog.Warn("state transition failed", "phase", state.PhaseDownloading, "error", err)
 		}
 		staged, _, err = ociClient.Pull(context.Background(), tag, stageDir)
 		if err != nil {
 			_ = st.RecordError(opts.StatePath, fmt.Errorf("oci pull: %w", err))
 			return fmt.Errorf("oci pull: %w", err)
 		}
 	} else {
 		client := image.NewClient(opts.ServerURL, stageDir)
 		defer client.Cleanup()
 		if opts.PubKeyPath != "" {
 			client.SetPublicKeyPath(opts.PubKeyPath)
 			slog.Info("signature verification enabled", "pubkey", opts.PubKeyPath)
 		}
 		var err error
 		meta, err = client.CheckForUpdate()
 		if err != nil {
 			_ = st.RecordError(opts.StatePath, fmt.Errorf("checking for update: %w", err))
 			return fmt.Errorf("checking for update: %w", err)
 		}
 		if err := applyMetadataGates(opts, st, meta); err != nil {
 			return err
 		}
 		if err := st.Transition(opts.StatePath, state.PhaseDownloading, meta.Version, ""); err != nil {
 			slog.Warn("state transition failed", "phase", state.PhaseDownloading, "error", err)
 		}
 		staged, err = client.Download(meta)
 		if err != nil {
 			_ = st.RecordError(opts.StatePath, fmt.Errorf("downloading update: %w", err))
 			return fmt.Errorf("downloading update: %w", err)
 		}
 	}
-	slog.Info("update available", "version", meta.Version)
+	slog.Info("update available", "version", meta.Version, "channel", meta.Channel, "arch", meta.Architecture)
 	// Download and verify
 	staged, err := client.Download(meta)
 	if err != nil {
 		return fmt.Errorf("downloading update: %w", err)
 	}
 	// Mount passive partition
 	partInfo, err := partition.GetSlotPartition(passiveSlot)
 	if err != nil {
 		_ = st.RecordError(opts.StatePath, fmt.Errorf("finding passive partition: %w", err))
 		return fmt.Errorf("finding passive partition: %w", err)
 	}
 	mountPoint := "/tmp/kubesolo-passive-" + passiveSlot
 	if err := partition.MountReadWrite(partInfo.Device, mountPoint); err != nil {
 		_ = st.RecordError(opts.StatePath, fmt.Errorf("mounting passive partition: %w", err))
 		return fmt.Errorf("mounting passive partition: %w", err)
 	}
 	defer partition.Unmount(mountPoint)
 	// Free-space pre-write check: the passive partition must have at least
 	// (kernel + initramfs) + 10% headroom. Catches corrupted-FS reports and
 	// shrunk/wrong-size partitions before we destroy the existing slot data.
 	var imgSize int64
 	for _, p := range []string{staged.VmlinuzPath, staged.InitramfsPath} {
 		fi, ferr := os.Stat(p)
 		if ferr != nil {
 			_ = st.RecordError(opts.StatePath, fmt.Errorf("stat staged file %s: %w", p, ferr))
 			return fmt.Errorf("stat staged file %s: %w", p, ferr)
 		}
 		imgSize += fi.Size()
 	}
 	avail, ok, ferr := partition.HasFreeSpaceFor(mountPoint, imgSize, 10)
 	if ferr != nil {
 		_ = st.RecordError(opts.StatePath, fmt.Errorf("free-space check: %w", ferr))
 		return fmt.Errorf("free-space check: %w", ferr)
 	}
 	if !ok {
 		err := fmt.Errorf("insufficient space on %s: have %.1f MiB, need %.1f MiB (image + 10%% headroom)",
 			passiveSlot, float64(avail)/(1<<20), float64(imgSize)*1.1/(1<<20))
 		_ = st.RecordError(opts.StatePath, err)
 		return err
 	}
 	// Write image to passive partition
 	if err := partition.WriteSystemImage(mountPoint, staged.VmlinuzPath, staged.InitramfsPath, staged.Version); err != nil {
 		_ = st.RecordError(opts.StatePath, fmt.Errorf("writing system image: %w", err))
 		return fmt.Errorf("writing system image: %w", err)
 	}
 	if err := st.Transition(opts.StatePath, state.PhaseStaged, staged.Version, ""); err != nil {
 		slog.Warn("state transition failed", "phase", state.PhaseStaged, "error", err)
 	}
 	fmt.Printf("Update v%s written to slot %s (%s)\n", staged.Version, passiveSlot, partInfo.Device)
 	fmt.Println("Run 'kubesolo-update activate' to boot into the new version")
--- a/update/cmd/healthcheck.go
+++ b/update/cmd/healthcheck.go
@@ -6,16 +6,32 @@ import (
 	"time"
 	"github.com/portainer/kubesolo-os/update/pkg/health"
 	"github.com/portainer/kubesolo-os/update/pkg/state"
 )
 // Healthcheck performs post-boot health verification.
 // If all checks pass, it marks the boot as successful in GRUB.
 // This should be run after every boot (typically via a systemd unit or
 // init script) to confirm the system is healthy.
 //
 // State transition: Activated → Verifying → Success on pass, → Failed on fail.
 // If state isn't in Activated (e.g. manual run on a long-stable system), the
 // state file is left alone — healthcheck still does its job.
 //
 // When --auto-rollback-after N is set, consecutive post-Activated failures
 // are counted in state.HealthCheckFailures. On the Nth failure, the agent
 // calls Rollback() and the operator is expected to reboot (this command
 // does not reboot the host — that's policy left to systemd/init).
 func Healthcheck(args []string) error {
 	opts := parseOpts(args)
 	env := opts.NewBootEnv()
 	st, err := state.Load(opts.StatePath)
 	if err != nil {
 		slog.Warn("state file unreadable, starting fresh", "error", err)
 		st = state.New()
 	}
 	// Check if already marked successful
 	success, err := env.BootSuccess()
 	if err != nil {
@@ -26,30 +42,94 @@ func Healthcheck(args []string) error {
 		return nil
 	}
 	// Only transition state if we're post-activation. Manual healthcheck on a
 	// long-stable system shouldn't reset Idle → Verifying.
 	postActivation := st.Phase == state.PhaseActivated
 	if postActivation {
 		if err := st.Transition(opts.StatePath, state.PhaseVerifying, "", ""); err != nil {
 			slog.Warn("state transition failed", "phase", state.PhaseVerifying, "error", err)
 		}
 	}
 	timeout := time.Duration(opts.TimeoutSecs) * time.Second
 	checker := health.NewChecker("", "", timeout)
 	checker.ProbeURL = opts.HealthcheckURL
 	if opts.KubeSystemSettle > 0 {
 		checker.KubeSystemSettle = time.Duration(opts.KubeSystemSettle) * time.Second
 	}
 	// Probe the data partition every healthcheck so a wedged disk fails fast.
 	checker.DataDir = "/var/lib/kubesolo"
-	slog.Info("running post-boot health checks", "timeout", timeout)
+	slog.Info("running post-boot health checks",
 		"timeout", timeout,
 		"probe_url", checker.ProbeURL,
 		"kube_system_settle", checker.KubeSystemSettle)
 	status, err := checker.WaitForHealthy()
 	if err != nil {
 		fmt.Printf("Health check FAILED: %s\n", status.Message)
-		fmt.Printf("  containerd: %v\n", status.Containerd)
+		printStatusBreakdown(status)
 		fmt.Printf("  apiserver:  %v\n", status.APIServer)
 		fmt.Printf("  node_ready: %v\n", status.NodeReady)
 		fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot")
 		if postActivation {
 			st.HealthCheckFailures++
 			_ = st.RecordError(opts.StatePath, fmt.Errorf("post-boot health check failed: %s", status.Message))
 			// Auto-rollback escalation. Only trigger when post-Activated;
 			// don't second-guess a healthy long-running system.
 			if opts.AutoRollbackAfter > 0 && st.HealthCheckFailures >= opts.AutoRollbackAfter {
 				slog.Warn("auto-rollback threshold reached",
 					"failures", st.HealthCheckFailures,
 					"threshold", opts.AutoRollbackAfter)
 				if rerr := env.ForceRollback(); rerr != nil {
 					slog.Error("auto-rollback failed", "error", rerr)
 					return err // return the original healthcheck error
 				}
 				if terr := st.Transition(opts.StatePath, state.PhaseRolledBack, "",
 					fmt.Sprintf("auto-rollback after %d healthcheck failures", st.HealthCheckFailures)); terr != nil {
 					slog.Warn("state transition failed", "phase", state.PhaseRolledBack, "error", terr)
 				}
 				fmt.Println("\nAuto-rollback triggered. Reboot to complete the rollback.")
 			}
 		}
 		return err
 	}
 	// Mark boot as successful
 	if err := env.MarkBootSuccess(); err != nil {
 		if postActivation {
 			_ = st.RecordError(opts.StatePath, fmt.Errorf("marking boot success: %w", err))
 		}
 		return fmt.Errorf("marking boot success: %w", err)
 	}
 	if postActivation {
 		// Reset failure counter on a clean pass.
 		st.HealthCheckFailures = 0
 		if err := st.Transition(opts.StatePath, state.PhaseSuccess, "", ""); err != nil {
 			slog.Warn("state transition failed", "phase", state.PhaseSuccess, "error", err)
 		}
 	}
 	fmt.Println("Health check PASSED — boot marked successful")
-	fmt.Printf("  containerd: %v\n", status.Containerd)
+	printStatusBreakdown(status)
 	fmt.Printf("  apiserver:  %v\n", status.APIServer)
 	fmt.Printf("  node_ready: %v\n", status.NodeReady)
 	return nil
 }
 // printStatusBreakdown emits a human-readable per-check summary. Only emits
 // optional check lines when they actually ran.
 func printStatusBreakdown(s *health.Status) {
 	fmt.Printf("  containerd:        %v\n", s.Containerd)
 	fmt.Printf("  apiserver:         %v\n", s.APIServer)
 	fmt.Printf("  node_ready:        %v\n", s.NodeReady)
 	if !s.KubeSystemReady {
 		fmt.Printf("  kube-system pods:  %v\n", s.KubeSystemReady)
 	}
 	if !s.ProbeURL {
 		fmt.Printf("  probe URL:         %v\n", s.ProbeURL)
 	}
 	if !s.DiskWritable {
 		fmt.Printf("  disk writable:     %v\n", s.DiskWritable)
 	}
 }
--- a/update/cmd/metrics.go
+++ b/update/cmd/metrics.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"github.com/portainer/kubesolo-os/update/pkg/metrics"
 	"github.com/portainer/kubesolo-os/update/pkg/state"
 )
 // Metrics starts the Prometheus-compatible metrics HTTP server.
@@ -12,10 +13,12 @@ func Metrics(args []string) error {
 	fs := flag.NewFlagSet("metrics", flag.ExitOnError)
 	listenAddr := fs.String("listen", ":9100", "Metrics HTTP listen address")
 	grubenvPath := fs.String("grubenv", "/boot/grub/grubenv", "Path to grubenv file")
 	statePath := fs.String("state", state.DefaultPath, "Path to update state.json")
 	if err := fs.Parse(args); err != nil {
 		return fmt.Errorf("parse flags: %w", err)
 	}
 	srv := metrics.NewServer(*listenAddr, *grubenvPath)
 	srv.SetStatePath(*statePath)
 	return srv.ListenAndServe()
 }
--- a/update/cmd/opts.go
+++ b/update/cmd/opts.go
@@ -1,17 +1,32 @@
 package cmd
 import (
 	"log/slog"
 	"github.com/portainer/kubesolo-os/update/pkg/bootenv"
 	"github.com/portainer/kubesolo-os/update/pkg/config"
 	"github.com/portainer/kubesolo-os/update/pkg/state"
 )
 // opts holds shared command-line options for all subcommands.
 type opts struct {
-	ServerURL   string
+	ServerURL         string
-	GrubenvPath string
+	Registry          string // OCI registry ref (e.g. ghcr.io/foo/kubesolo-os). Mutually exclusive with ServerURL.
-	TimeoutSecs int
+	Tag               string // OCI tag to pull (default: equal to Channel, falling back to "stable")
-	PubKeyPath  string
+	GrubenvPath       string
-	BootEnvType string // "grub" or "rpi"
+	TimeoutSecs       int
-	BootEnvPath string // path for RPi boot control dir
+	PubKeyPath        string
 	BootEnvType       string // "grub" or "rpi"
 	BootEnvPath       string // path for RPi boot control dir
 	StatePath          string // location of state.json (default: state.DefaultPath)
 	ConfPath           string // location of update.conf (default: config.DefaultPath)
 	Channel            string // update channel ("stable" by default)
 	MaintenanceWindow  string // "HH:MM-HH:MM" or empty for always-allow
 	HealthcheckURL     string // optional GET probe for healthcheck
 	AutoRollbackAfter  int    // healthcheck: rollback after N consecutive failures (0=off)
 	KubeSystemSettle   int    // healthcheck: kube-system pods must be Running for N seconds (0=disabled)
 	Force              bool   // bypass maintenance window
 	JSON               bool   // status: emit JSON instead of human-readable
 }
 // NewBootEnv creates a BootEnv from the parsed options.
@@ -25,21 +40,129 @@ func (o opts) NewBootEnv() bootenv.BootEnv {
 }
 // parseOpts extracts command-line flags from args.
-// Simple parser — no external dependencies.
+//
 // Precedence: explicit CLI flags > /etc/kubesolo/update.conf > package
 // defaults. The config file is loaded first so any CLI flag overrides it.
 //
 // Unknown flags are ignored (forward-compat).
 func parseOpts(args []string) opts {
 	o := opts{
 		GrubenvPath: "/boot/grub/grubenv",
 		TimeoutSecs: 120,
 		BootEnvType: "grub",
 		StatePath:   state.DefaultPath,
 		ConfPath:    config.DefaultPath,
 		Channel:     "stable",
 	}
 	// First pass: pick up --conf so it can point at a different file before
 	// we load. (Tests pass --conf <tempdir>/update.conf.)
 	for i := 0; i < len(args); i++ {
 		if args[i] == "--conf" && i+1 < len(args) {
 			o.ConfPath = args[i+1]
 		}
 	}
 	// Load config file. Missing file is fine (fresh system, no cloud-init yet).
 	if cfg, err := config.Load(o.ConfPath); err == nil && cfg != nil {
 		if cfg.Server != "" {
 			o.ServerURL = cfg.Server
 		}
 		if cfg.Channel != "" {
 			o.Channel = cfg.Channel
 		}
 		if cfg.MaintenanceWindow != "" {
 			o.MaintenanceWindow = cfg.MaintenanceWindow
 		}
 		if cfg.PubKey != "" {
 			o.PubKeyPath = cfg.PubKey
 		}
 		if cfg.HealthcheckURL != "" {
 			o.HealthcheckURL = cfg.HealthcheckURL
 		}
 		if cfg.AutoRollbackAfter > 0 {
 			o.AutoRollbackAfter = cfg.AutoRollbackAfter
 		}
 	} else if err != nil {
 		slog.Warn("could not load update.conf", "path", o.ConfPath, "error", err)
 	}
 	// Second pass: CLI overrides config file values.
 	for i := 0; i < len(args); i++ {
 		switch args[i] {
 		case "--conf":
 			i++ // already handled above
 		case "--state":
 			if i+1 < len(args) {
 				o.StatePath = args[i+1]
 				i++
 			}
 		case "--channel":
 			if i+1 < len(args) {
 				o.Channel = args[i+1]
 				i++
 			}
 		case "--maintenance-window":
 			if i+1 < len(args) {
 				o.MaintenanceWindow = args[i+1]
 				i++
 			}
 		case "--force":
 			o.Force = true
 		case "--healthcheck-url":
 			if i+1 < len(args) {
 				o.HealthcheckURL = args[i+1]
 				i++
 			}
 		case "--auto-rollback-after":
 			if i+1 < len(args) {
 				n := 0
 				for _, ch := range args[i+1] {
 					if ch >= '0' && ch <= '9' {
 						n = n*10 + int(ch-'0')
 					} else {
 						n = 0
 						break
 					}
 				}
 				if n > 0 {
 					o.AutoRollbackAfter = n
 				}
 				i++
 			}
 		case "--kube-system-settle":
 			if i+1 < len(args) {
 				n := 0
 				for _, ch := range args[i+1] {
 					if ch >= '0' && ch <= '9' {
 						n = n*10 + int(ch-'0')
 					} else {
 						n = 0
 						break
 					}
 				}
 				if n > 0 {
 					o.KubeSystemSettle = n
 				}
 				i++
 			}
 		case "--json":
 			o.JSON = true
 		case "--server":
 			if i+1 < len(args) {
 				o.ServerURL = args[i+1]
 				i++
 			}
 		case "--registry":
 			if i+1 < len(args) {
 				o.Registry = args[i+1]
 				i++
 			}
 		case "--tag":
 			if i+1 < len(args) {
 				o.Tag = args[i+1]
 				i++
 			}
 		case "--grubenv":
 			if i+1 < len(args) {
 				o.GrubenvPath = args[i+1]
--- a/update/cmd/rollback.go
+++ b/update/cmd/rollback.go
@@ -3,14 +3,24 @@ package cmd
 import (
 	"fmt"
 	"log/slog"
 	"github.com/portainer/kubesolo-os/update/pkg/state"
 )
 // Rollback forces an immediate switch to the other partition.
 // Use this to manually revert to the previous version.
 //
 // State transition: any → RolledBack with LastError="manual rollback".
 func Rollback(args []string) error {
 	opts := parseOpts(args)
 	env := opts.NewBootEnv()
 	st, err := state.Load(opts.StatePath)
 	if err != nil {
 		slog.Warn("state file unreadable, starting fresh", "error", err)
 		st = state.New()
 	}
 	activeSlot, err := env.ActiveSlot()
 	if err != nil {
 		return fmt.Errorf("reading active slot: %w", err)
@@ -24,9 +34,14 @@ func Rollback(args []string) error {
 	slog.Info("forcing rollback", "from", activeSlot, "to", passiveSlot)
 	if err := env.ForceRollback(); err != nil {
 		_ = st.RecordError(opts.StatePath, fmt.Errorf("rollback failed: %w", err))
 		return fmt.Errorf("rollback failed: %w", err)
 	}
 	if err := st.Transition(opts.StatePath, state.PhaseRolledBack, "", "manual rollback"); err != nil {
 		slog.Warn("state transition failed", "phase", state.PhaseRolledBack, "error", err)
 	}
 	fmt.Printf("Rolled back: %s → %s\n", activeSlot, passiveSlot)
 	fmt.Println("Reboot to complete rollback.")
--- a/update/cmd/status.go
+++ b/update/cmd/status.go
@@ -1,10 +1,26 @@
 package cmd
 import (
 	"encoding/json"
 	"fmt"
 	"os"
 	"github.com/portainer/kubesolo-os/update/pkg/state"
 )
 // statusReport is the JSON-emitted shape of `kubesolo-update status --json`.
 // Combines the bootloader-level A/B view with the update-agent state machine.
 type statusReport struct {
 	ActiveSlot  string             `json:"active_slot"`
 	PassiveSlot string             `json:"passive_slot"`
 	BootCounter int                `json:"boot_counter"`
 	BootSuccess bool               `json:"boot_success"`
 	State       *state.UpdateState `json:"state"`
 }
 // Status displays the current A/B slot configuration and boot state.
 // With --json, emits the full state report to stdout for orchestration
 // tooling.
 func Status(args []string) error {
 	opts := parseOpts(args)
 	env := opts.NewBootEnv()
@@ -29,6 +45,23 @@ func Status(args []string) error {
 		return fmt.Errorf("reading boot success: %w", err)
 	}
 	// State file is non-fatal: present means we have an update lifecycle
 	// recorded; absent means no update has run yet.
 	st, _ := state.Load(opts.StatePath)
 	if opts.JSON {
 		report := statusReport{
 			ActiveSlot:  activeSlot,
 			PassiveSlot: passiveSlot,
 			BootCounter: bootCounter,
 			BootSuccess: bootSuccess,
 			State:       st,
 		}
 		enc := json.NewEncoder(os.Stdout)
 		enc.SetIndent("", "  ")
 		return enc.Encode(report)
 	}
 	fmt.Println("KubeSolo OS — A/B Partition Status")
 	fmt.Println("───────────────────────────────────")
 	fmt.Printf("  Active slot:   %s\n", activeSlot)
@@ -48,5 +81,25 @@ func Status(args []string) error {
 		fmt.Printf("\n  ⚠ Boot pending verification (%d attempts remaining)\n", bootCounter)
 	}
 	if st != nil && st.Phase != state.PhaseIdle {
 		fmt.Println("\nUpdate Lifecycle")
 		fmt.Println("───────────────────────────────────")
 		fmt.Printf("  Phase:         %s\n", st.Phase)
 		if st.FromVersion != "" {
 			fmt.Printf("  From version:  %s\n", st.FromVersion)
 		}
 		if st.ToVersion != "" {
 			fmt.Printf("  To version:    %s\n", st.ToVersion)
 		}
 		if !st.StartedAt.IsZero() {
 			fmt.Printf("  Started:       %s\n", st.StartedAt.Format("2006-01-02 15:04:05 MST"))
 		}
 		fmt.Printf("  Updated:       %s\n", st.UpdatedAt.Format("2006-01-02 15:04:05 MST"))
 		fmt.Printf("  Attempts:      %d\n", st.AttemptCount)
 		if st.LastError != "" {
 			fmt.Printf("  Last error:    %s\n", st.LastError)
 		}
 	}
 	return nil
 }
--- a/update/go.mod
+++ b/update/go.mod
@@ -1,3 +1,10 @@
 module github.com/portainer/kubesolo-os/update
 go 1.25.5
 require (
 	github.com/opencontainers/go-digest v1.0.0 // indirect
 	github.com/opencontainers/image-spec v1.1.1 // indirect
 	golang.org/x/sync v0.14.0 // indirect
 	oras.land/oras-go/v2 v2.6.0 // indirect
 )
--- a/update/go.sum
+++ b/update/go.sum
@@ -0,0 +1,8 @@
 github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
 github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040=
 github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M=
 golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ=
 golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
 oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc=
 oras.land/oras-go/v2 v2.6.0/go.mod h1:magiQDfG6H1O9APp+rOsvCPcW1GD2MM7vgnKY0Y+u1o=
--- a/update/main.go
+++ b/update/main.go
@@ -78,15 +78,28 @@ Commands:
  metrics      Start Prometheus-compatible metrics HTTP server
 Options:
-  --server URL     Update server URL (default: from /etc/kubesolo/update.conf)
+  --server URL          HTTP update server (mutually exclusive with --registry)
-  --grubenv PATH   Path to grubenv file (default: /boot/grub/grubenv)
+  --registry REPO       OCI registry repository, e.g. ghcr.io/portainer/kubesolo-os
-  --timeout SECS   Health check timeout in seconds (default: 120)
+                        (mutually exclusive with --server)
-  --pubkey PATH    Ed25519 public key for signature verification (optional)
+  --tag TAG             OCI tag to pull (default: channel name, then "stable")
  --conf PATH           update.conf path (default: /etc/kubesolo/update.conf)
  --state PATH          Update state file (default: /var/lib/kubesolo/update/state.json)
  --channel NAME        Update channel (default: "stable", or value from update.conf)
  --maintenance-window  HH:MM-HH:MM local time window; apply refuses outside it
  --force               Bypass maintenance-window check
  --grubenv PATH        Path to grubenv file (default: /boot/grub/grubenv)
  --timeout SECS        Health check timeout in seconds (default: 120)
  --pubkey PATH         Ed25519 public key for signature verification (optional)
  --healthcheck-url URL Optional GET probe in healthcheck; 200 = pass
  --auto-rollback-after N  healthcheck: rollback after N consecutive failures
  --kube-system-settle N healthcheck: require kube-system pods Running ≥ N seconds
  --json                For 'status': emit JSON instead of human-readable output
 Examples:
-  kubesolo-update check --server https://updates.example.com
+  kubesolo-update apply --server https://updates.example.com
-  kubesolo-update apply --server https://updates.example.com --pubkey /etc/kubesolo/update-pubkey.hex
+  kubesolo-update apply --registry ghcr.io/portainer/kubesolo-os --tag stable
  kubesolo-update apply --force                      # uses /etc/kubesolo/update.conf
  kubesolo-update healthcheck
-  kubesolo-update status
+  kubesolo-update status --json
 `)
 }
--- a/update/pkg/config/config.go
+++ b/update/pkg/config/config.go
@@ -0,0 +1,105 @@
 // Package config parses /etc/kubesolo/update.conf — the persistent
 // configuration for the update agent. Each line is "key = value"; blank
 // lines and "#"-prefixed comments are ignored. Unknown keys are tolerated
 // (forward compatibility).
 //
 // Example:
 //
 //	# Where to look for updates
 //	server = https://updates.kubesolo.example.com
 //	channel = stable
 //
 //	# Only apply between 03:00 and 05:00 local time
 //	maintenance_window = 03:00-05:00
 //
 //	pubkey = /etc/kubesolo/update-pubkey.hex
 //
 // The file is populated on first boot by cloud-init (see the cloud-init
 // updates: block) and can be hand-edited afterwards.
 package config
 import (
 	"bufio"
 	"fmt"
 	"os"
 	"strings"
 )
 // DefaultPath is where update.conf lives on a live system.
 const DefaultPath = "/etc/kubesolo/update.conf"
 // Config holds the parsed update.conf values. Empty fields mean "not set" —
 // the caller's defaults apply.
 type Config struct {
 	Server            string
 	Channel           string
 	MaintenanceWindow string
 	PubKey            string
 	// HealthcheckURL is an optional URL the healthcheck command will GET;
 	// 200 = pass, anything else = fail.
 	HealthcheckURL string
 	// AutoRollbackAfter is the number of consecutive post-boot healthcheck
 	// failures after which the agent will call Rollback automatically.
 	// 0 = disabled (default).
 	AutoRollbackAfter int
 }
 // Load reads and parses update.conf. A missing file returns an empty Config
 // (not an error) — fresh systems before cloud-init has run.
 func Load(path string) (*Config, error) {
 	f, err := os.Open(path)
 	if err != nil {
 		if os.IsNotExist(err) {
 			return &Config{}, nil
 		}
 		return nil, fmt.Errorf("open %s: %w", path, err)
 	}
 	defer f.Close()
 	c := &Config{}
 	scanner := bufio.NewScanner(f)
 	lineNo := 0
 	for scanner.Scan() {
 		lineNo++
 		line := strings.TrimSpace(scanner.Text())
 		if line == "" || strings.HasPrefix(line, "#") {
 			continue
 		}
 		eq := strings.IndexByte(line, '=')
 		if eq < 0 {
 			return nil, fmt.Errorf("%s:%d: missing '=' in line: %q", path, lineNo, line)
 		}
 		key := strings.TrimSpace(line[:eq])
 		value := strings.TrimSpace(line[eq+1:])
 		switch key {
 		case "server":
 			c.Server = value
 		case "channel":
 			c.Channel = value
 		case "maintenance_window":
 			c.MaintenanceWindow = value
 		case "pubkey":
 			c.PubKey = value
 		case "healthcheck_url":
 			c.HealthcheckURL = value
 		case "auto_rollback_after":
 			// Parse a small integer. Non-numeric values are silently
 			// ignored (forward compat); zero disables the feature.
 			n := 0
 			for _, ch := range value {
 				if ch >= '0' && ch <= '9' {
 					n = n*10 + int(ch-'0')
 				} else {
 					n = 0
 					break
 				}
 			}
 			c.AutoRollbackAfter = n
 		}
 		// Unknown keys are silently ignored for forward compatibility.
 	}
 	if err := scanner.Err(); err != nil {
 		return nil, fmt.Errorf("read %s: %w", path, err)
 	}
 	return c, nil
 }
--- a/update/pkg/config/config_test.go
+++ b/update/pkg/config/config_test.go
@@ -0,0 +1,117 @@
 package config
 import (
 	"os"
 	"path/filepath"
 	"testing"
 )
 func writeConf(t *testing.T, content string) string {
 	t.Helper()
 	path := filepath.Join(t.TempDir(), "update.conf")
 	if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
 		t.Fatalf("seed: %v", err)
 	}
 	return path
 }
 func TestLoadMissingReturnsEmptyConfig(t *testing.T) {
 	c, err := Load(filepath.Join(t.TempDir(), "does-not-exist.conf"))
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
 	}
 	if c == nil {
 		t.Fatal("Load returned nil config")
 	}
 	if c.Server != "" || c.Channel != "" || c.MaintenanceWindow != "" || c.PubKey != "" {
 		t.Errorf("expected empty config, got %+v", c)
 	}
 }
 func TestLoadAllFields(t *testing.T) {
 	path := writeConf(t, `# comment line
 server = https://updates.example.com
 channel = stable
 maintenance_window = 03:00-05:00
 pubkey = /etc/kubesolo/pub.hex
 `)
 	c, err := Load(path)
 	if err != nil {
 		t.Fatalf("load: %v", err)
 	}
 	if c.Server != "https://updates.example.com" {
 		t.Errorf("server: got %q", c.Server)
 	}
 	if c.Channel != "stable" {
 		t.Errorf("channel: got %q", c.Channel)
 	}
 	if c.MaintenanceWindow != "03:00-05:00" {
 		t.Errorf("maintenance_window: got %q", c.MaintenanceWindow)
 	}
 	if c.PubKey != "/etc/kubesolo/pub.hex" {
 		t.Errorf("pubkey: got %q", c.PubKey)
 	}
 }
 func TestLoadIgnoresUnknownKeys(t *testing.T) {
 	// Unknown keys must not be an error — supports forward-compat config
 	// fields added by newer agent versions.
 	path := writeConf(t, `server = https://x
 future_field = whatever
 channel = beta
 `)
 	c, err := Load(path)
 	if err != nil {
 		t.Fatalf("load: %v", err)
 	}
 	if c.Server != "https://x" {
 		t.Errorf("server: got %q", c.Server)
 	}
 	if c.Channel != "beta" {
 		t.Errorf("channel: got %q", c.Channel)
 	}
 }
 func TestLoadStripsWhitespace(t *testing.T) {
 	path := writeConf(t, "   server   =   https://example   \n  channel=stable\n")
 	c, err := Load(path)
 	if err != nil {
 		t.Fatalf("load: %v", err)
 	}
 	if c.Server != "https://example" {
 		t.Errorf("server: got %q (whitespace not stripped?)", c.Server)
 	}
 	if c.Channel != "stable" {
 		t.Errorf("channel: got %q", c.Channel)
 	}
 }
 func TestLoadIgnoresBlankAndCommentLines(t *testing.T) {
 	path := writeConf(t, `
 # this is a comment
 server = https://example
   # indented comment
 channel = stable
 `)
 	c, err := Load(path)
 	if err != nil {
 		t.Fatalf("load: %v", err)
 	}
 	if c.Server != "https://example" {
 		t.Errorf("server: got %q", c.Server)
 	}
 }
 func TestLoadRejectsMissingEquals(t *testing.T) {
 	// "noEqualsHere" with no '=' is a syntax error worth surfacing — likely
 	// indicates a corrupted config file.
 	path := writeConf(t, `server = https://example
 noEqualsHere
 `)
 	_, err := Load(path)
 	if err == nil {
 		t.Error("expected error on malformed line, got nil")
 	}
 }
--- a/update/pkg/config/version.go
+++ b/update/pkg/config/version.go
@@ -0,0 +1,60 @@
 package config
 import (
 	"fmt"
 	"strconv"
 	"strings"
 )
 // CompareVersions compares two semver-ish version strings.
 //
 // Accepts "v1.2.3", "1.2.3", "v1.2.3-rc1" (suffix ignored), with missing
 // components defaulting to 0 ("v1" == "1.0.0"). Returns -1 if a < b, 0 if
 // equal, +1 if a > b. Returns an error if either argument can't be parsed
 // at all.
 //
 // Used by apply.go to enforce MinCompatibleVersion. Pre-release suffix
 // handling is deliberately simple — we ignore it, treating "v1.2.3-rc1"
 // equal to "v1.2.3". Edge case: production releases should never carry
 // a pre-release suffix, and dev releases are the consumer's responsibility.
 func CompareVersions(a, b string) (int, error) {
 	pa, err := parseVersion(a)
 	if err != nil {
 		return 0, fmt.Errorf("parse %q: %w", a, err)
 	}
 	pb, err := parseVersion(b)
 	if err != nil {
 		return 0, fmt.Errorf("parse %q: %w", b, err)
 	}
 	for i := 0; i < 3; i++ {
 		if pa[i] < pb[i] {
 			return -1, nil
 		}
 		if pa[i] > pb[i] {
 			return 1, nil
 		}
 	}
 	return 0, nil
 }
 func parseVersion(s string) ([3]int, error) {
 	var out [3]int
 	s = strings.TrimSpace(s)
 	s = strings.TrimPrefix(s, "v")
 	// Drop pre-release suffix: "1.2.3-rc1" -> "1.2.3"
 	if i := strings.IndexAny(s, "-+"); i >= 0 {
 		s = s[:i]
 	}
 	parts := strings.SplitN(s, ".", 3)
 	for i, p := range parts {
 		n, err := strconv.Atoi(p)
 		if err != nil {
 			return out, fmt.Errorf("component %q not numeric", p)
 		}
 		if n < 0 {
 			return out, fmt.Errorf("component %d negative", n)
 		}
 		out[i] = n
 	}
 	return out, nil
 }
--- a/update/pkg/config/version_test.go
+++ b/update/pkg/config/version_test.go
@@ -0,0 +1,46 @@
 package config
 import "testing"
 func TestCompareVersions(t *testing.T) {
 	tests := []struct {
 		a, b string
 		want int
 	}{
 		{"v1.0.0", "v1.0.0", 0},
 		{"1.0.0", "v1.0.0", 0}, // 'v' prefix optional
 		{"v1.0.0", "v1.0.1", -1},
 		{"v1.0.1", "v1.0.0", 1},
 		{"v1.1.0", "v1.0.99", 1},
 		{"v2.0.0", "v1.99.99", 1},
 		{"v0.3.0-dev", "v0.3.0", 0},   // pre-release suffix ignored
 		{"v0.2.5", "v0.3.0", -1},
 		{"v0.3.0", "v0.2.999", 1},
 		{"v1.2", "v1.2.0", 0}, // missing component defaults to 0
 		{"v1", "v1.0.0", 0},
 	}
 	for _, tt := range tests {
 		got, err := CompareVersions(tt.a, tt.b)
 		if err != nil {
 			t.Errorf("CompareVersions(%q, %q): %v", tt.a, tt.b, err)
 			continue
 		}
 		if got != tt.want {
 			t.Errorf("CompareVersions(%q, %q) = %d, want %d", tt.a, tt.b, got, tt.want)
 		}
 	}
 }
 func TestCompareVersionsRejectsGarbage(t *testing.T) {
 	bad := []string{
 		"not-a-version",
 		"v.1.2",
 		"vabc",
 		"",
 	}
 	for _, s := range bad {
 		if _, err := CompareVersions(s, "v1.0.0"); err == nil {
 			t.Errorf("CompareVersions(%q, ...) accepted, want error", s)
 		}
 	}
 }
--- a/update/pkg/config/window.go
+++ b/update/pkg/config/window.go
@@ -0,0 +1,95 @@
 package config
 import (
 	"fmt"
 	"strconv"
 	"strings"
 	"time"
 )
 // Window is a parsed maintenance-window expression. Times are minutes since
 // midnight in the local timezone. When End < Start, the window wraps
 // midnight (e.g. 23:00-01:00 means 23:00 today through 01:00 tomorrow).
 //
 // The zero value (Start == End == 0) means "always allowed" — used for
 // the empty-string-meaning-no-window case.
 type Window struct {
 	Start int // minutes since midnight, [0, 1440)
 	End   int // minutes since midnight, [0, 1440)
 	// alwaysOpen distinguishes "no constraint" from "midnight to midnight"
 	// (the literal 00:00-00:00 window, which is a degenerate same-instant
 	// window). Set when ParseWindow is called with an empty string.
 	alwaysOpen bool
 }
 // AlwaysOpen returns true if this window imposes no constraint (the empty
 // string was parsed).
 func (w Window) AlwaysOpen() bool { return w.alwaysOpen }
 // ParseWindow parses "HH:MM-HH:MM" into a Window. Empty input returns an
 // AlwaysOpen window (no constraint). Whitespace around the input is tolerated.
 func ParseWindow(s string) (Window, error) {
 	s = strings.TrimSpace(s)
 	if s == "" {
 		return Window{alwaysOpen: true}, nil
 	}
 	parts := strings.SplitN(s, "-", 2)
 	if len(parts) != 2 {
 		return Window{}, fmt.Errorf("maintenance window %q: expected HH:MM-HH:MM", s)
 	}
 	start, err := parseHHMM(strings.TrimSpace(parts[0]))
 	if err != nil {
 		return Window{}, fmt.Errorf("maintenance window %q: start: %w", s, err)
 	}
 	end, err := parseHHMM(strings.TrimSpace(parts[1]))
 	if err != nil {
 		return Window{}, fmt.Errorf("maintenance window %q: end: %w", s, err)
 	}
 	return Window{Start: start, End: end}, nil
 }
 func parseHHMM(s string) (int, error) {
 	parts := strings.SplitN(s, ":", 2)
 	if len(parts) != 2 {
 		return 0, fmt.Errorf("%q: expected HH:MM", s)
 	}
 	h, err := strconv.Atoi(parts[0])
 	if err != nil || h < 0 || h > 23 {
 		return 0, fmt.Errorf("%q: invalid hour", s)
 	}
 	m, err := strconv.Atoi(parts[1])
 	if err != nil || m < 0 || m > 59 {
 		return 0, fmt.Errorf("%q: invalid minute", s)
 	}
 	return h*60 + m, nil
 }
 // Contains reports whether the given local time falls inside this window.
 // AlwaysOpen windows return true for any time.
 func (w Window) Contains(t time.Time) bool {
 	if w.alwaysOpen {
 		return true
 	}
 	now := t.Hour()*60 + t.Minute()
 	if w.Start == w.End {
 		// Degenerate: zero-length window. Never matches.
 		return false
 	}
 	if w.Start < w.End {
 		// Same-day window: [Start, End)
 		return now >= w.Start && now < w.End
 	}
 	// Wrapping window: [Start, 1440) ∪ [0, End)
 	return now >= w.Start || now < w.End
 }
 // String renders the window in HH:MM-HH:MM form for display. AlwaysOpen
 // renders as "always".
 func (w Window) String() string {
 	if w.alwaysOpen {
 		return "always"
 	}
 	return fmt.Sprintf("%02d:%02d-%02d:%02d",
 		w.Start/60, w.Start%60, w.End/60, w.End%60)
 }
--- a/update/pkg/config/window_test.go
+++ b/update/pkg/config/window_test.go
@@ -0,0 +1,120 @@
 package config
 import (
 	"testing"
 	"time"
 )
 func at(hour, min int) time.Time {
 	return time.Date(2026, 1, 1, hour, min, 0, 0, time.UTC)
 }
 func TestParseWindowEmpty(t *testing.T) {
 	w, err := ParseWindow("")
 	if err != nil {
 		t.Fatalf("empty window: %v", err)
 	}
 	if !w.AlwaysOpen() {
 		t.Error("empty input should produce AlwaysOpen window")
 	}
 	if !w.Contains(at(3, 0)) {
 		t.Error("AlwaysOpen window should contain any time")
 	}
 	if !w.Contains(at(23, 59)) {
 		t.Error("AlwaysOpen window should contain end-of-day")
 	}
 }
 func TestParseWindowSameDay(t *testing.T) {
 	w, err := ParseWindow("03:00-05:00")
 	if err != nil {
 		t.Fatalf("parse: %v", err)
 	}
 	tests := []struct {
 		hour, min int
 		want      bool
 	}{
 		{2, 59, false}, // just before
 		{3, 0, true},   // start (inclusive)
 		{4, 30, true},  // middle
 		{4, 59, true},  // just before end
 		{5, 0, false},  // end (exclusive)
 		{15, 0, false}, // far outside
 	}
 	for _, tt := range tests {
 		got := w.Contains(at(tt.hour, tt.min))
 		if got != tt.want {
 			t.Errorf("Contains(%02d:%02d) = %v, want %v", tt.hour, tt.min, got, tt.want)
 		}
 	}
 }
 func TestParseWindowWrappingMidnight(t *testing.T) {
 	w, err := ParseWindow("23:00-01:00")
 	if err != nil {
 		t.Fatalf("parse: %v", err)
 	}
 	tests := []struct {
 		hour, min int
 		want      bool
 	}{
 		{22, 59, false}, // just before
 		{23, 0, true},   // start (inclusive)
 		{23, 30, true},  // night-before
 		{0, 0, true},    // midnight
 		{0, 30, true},   // early morning
 		{0, 59, true},   // just before end
 		{1, 0, false},   // end (exclusive)
 		{12, 0, false},  // far outside (noon)
 	}
 	for _, tt := range tests {
 		got := w.Contains(at(tt.hour, tt.min))
 		if got != tt.want {
 			t.Errorf("Contains(%02d:%02d) wrapping = %v, want %v", tt.hour, tt.min, got, tt.want)
 		}
 	}
 }
 func TestParseWindowDegenerateZeroLength(t *testing.T) {
 	// 05:00-05:00 is a zero-length window — should never match. Different
 	// from "always" (empty string).
 	w, err := ParseWindow("05:00-05:00")
 	if err != nil {
 		t.Fatalf("parse: %v", err)
 	}
 	if w.AlwaysOpen() {
 		t.Error("05:00-05:00 must not be AlwaysOpen")
 	}
 	if w.Contains(at(5, 0)) {
 		t.Error("zero-length window must not contain its own boundary")
 	}
 }
 func TestParseWindowRejectsBadInput(t *testing.T) {
 	bad := []string{
 		"notatime",
 		"03:00",        // no end
 		"03:00-",       // empty end
 		"03:00-05",     // missing minutes
 		"24:00-05:00",  // hour out of range
 		"03:60-05:00",  // minute out of range
 		"abc:00-05:00", // non-numeric
 	}
 	for _, s := range bad {
 		_, err := ParseWindow(s)
 		if err == nil {
 			t.Errorf("ParseWindow(%q) accepted, want error", s)
 		}
 	}
 }
 func TestWindowString(t *testing.T) {
 	w, _ := ParseWindow("03:05-05:45")
 	if w.String() != "03:05-05:45" {
 		t.Errorf("String = %q, want 03:05-05:45", w.String())
 	}
 	always, _ := ParseWindow("")
 	if always.String() != "always" {
 		t.Errorf("AlwaysOpen.String = %q, want 'always'", always.String())
 	}
 }
--- a/update/pkg/health/extended.go
+++ b/update/pkg/health/extended.go
@@ -0,0 +1,125 @@
 package health
 import (
 	"context"
 	"fmt"
 	"net/http"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
 	"time"
 )
 // kubeSystemSettleSeconds is how long all kube-system pods must hold a
 // Running phase before we consider the cluster genuinely up. Catches the
 // "pod just started, will crash-loop in 5s" case.
 const kubeSystemSettleSeconds = 30
 // CheckKubeSystemReady verifies that every pod in the kube-system namespace
 // is in Running phase and has been Running for at least settle. Returns
 // (ready, error). settle defaults to 30s when zero.
 func (c *Checker) CheckKubeSystemReady(settle time.Duration) bool {
 	if settle == 0 {
 		settle = kubeSystemSettleSeconds * time.Second
 	}
 	if _, err := os.Stat(c.kubeconfigPath); err != nil {
 		return false
 	}
 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
 	defer cancel()
 	// jsonpath emits one line per pod: <phase>|<startTime>
 	cmd := exec.CommandContext(ctx, "kubectl",
 		"--kubeconfig", c.kubeconfigPath,
 		"get", "pods", "-n", "kube-system",
 		"-o", `jsonpath={range .items[*]}{.status.phase}|{.status.startTime}{"\n"}{end}`,
 	)
 	out, err := cmd.Output()
 	if err != nil {
 		return false
 	}
 	lines := strings.Split(strings.TrimSpace(string(out)), "\n")
 	if len(lines) == 0 || lines[0] == "" {
 		// No pods reported. Conservatively treat as not-ready: kube-system
 		// is expected to host at least CoreDNS + pause.
 		return false
 	}
 	now := time.Now()
 	for _, line := range lines {
 		parts := strings.SplitN(line, "|", 2)
 		phase := strings.TrimSpace(parts[0])
 		if phase != "Running" {
 			return false
 		}
 		if len(parts) < 2 {
 			return false
 		}
 		start, perr := time.Parse(time.RFC3339, strings.TrimSpace(parts[1]))
 		if perr != nil {
 			return false
 		}
 		if now.Sub(start) < settle {
 			return false
 		}
 	}
 	return true
 }
 // CheckProbeURL fetches the given URL and reports whether it returned 200.
 // Empty url returns (true, nil) — the check is opt-in.
 func CheckProbeURL(url string) (bool, error) {
 	if url == "" {
 		return true, nil
 	}
 	client := &http.Client{Timeout: 5 * time.Second}
 	resp, err := client.Get(url)
 	if err != nil {
 		return false, fmt.Errorf("probe URL %s: %w", url, err)
 	}
 	defer resp.Body.Close()
 	return resp.StatusCode == http.StatusOK, nil
 }
 // CheckDiskWritable writes a small file under dataDir, fsyncs, reads it back,
 // and removes it. Confirms the data partition is mounted read-write and the
 // underlying disk is responsive. Empty dataDir defaults to /var/lib/kubesolo.
 func CheckDiskWritable(dataDir string) (bool, error) {
 	if dataDir == "" {
 		dataDir = "/var/lib/kubesolo"
 	}
 	if _, err := os.Stat(dataDir); err != nil {
 		// Data partition not mounted? That's catastrophic but we shouldn't
 		// claim the disk is fine.
 		return false, fmt.Errorf("dataDir %s: %w", dataDir, err)
 	}
 	probe := filepath.Join(dataDir, ".update-probe")
 	want := []byte("kubesolo-os healthcheck probe")
 	f, err := os.Create(probe)
 	if err != nil {
 		return false, fmt.Errorf("create probe: %w", err)
 	}
 	defer os.Remove(probe)
 	if _, err := f.Write(want); err != nil {
 		f.Close()
 		return false, fmt.Errorf("write probe: %w", err)
 	}
 	if err := f.Sync(); err != nil {
 		f.Close()
 		return false, fmt.Errorf("fsync probe: %w", err)
 	}
 	if err := f.Close(); err != nil {
 		return false, fmt.Errorf("close probe: %w", err)
 	}
 	got, err := os.ReadFile(probe)
 	if err != nil {
 		return false, fmt.Errorf("read probe: %w", err)
 	}
 	if string(got) != string(want) {
 		return false, fmt.Errorf("probe content mismatch: got %q", got)
 	}
 	return true, nil
 }
--- a/update/pkg/health/extended_test.go
+++ b/update/pkg/health/extended_test.go
@@ -0,0 +1,77 @@
 package health
 import (
 	"net/http"
 	"net/http/httptest"
 	"os"
 	"path/filepath"
 	"testing"
 )
 func TestCheckProbeURLEmptyAlwaysPasses(t *testing.T) {
 	ok, err := CheckProbeURL("")
 	if err != nil {
 		t.Fatalf("CheckProbeURL(\"\"): %v", err)
 	}
 	if !ok {
 		t.Error("empty probe URL should return ok=true (check disabled)")
 	}
 }
 func TestCheckProbeURL200(t *testing.T) {
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		w.WriteHeader(http.StatusOK)
 	}))
 	defer srv.Close()
 	ok, err := CheckProbeURL(srv.URL)
 	if err != nil {
 		t.Fatalf("CheckProbeURL: %v", err)
 	}
 	if !ok {
 		t.Error("expected ok=true on 200")
 	}
 }
 func TestCheckProbeURLNon200(t *testing.T) {
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		w.WriteHeader(http.StatusServiceUnavailable)
 	}))
 	defer srv.Close()
 	ok, err := CheckProbeURL(srv.URL)
 	if err != nil {
 		t.Fatalf("CheckProbeURL: %v", err)
 	}
 	if ok {
 		t.Error("expected ok=false on 503")
 	}
 }
 func TestCheckProbeURLNetworkError(t *testing.T) {
 	// Port 1 is reserved (tcpmux) and never bound by Linux defaults.
 	_, err := CheckProbeURL("http://127.0.0.1:1")
 	if err == nil {
 		t.Error("expected error for unreachable URL, got nil")
 	}
 }
 func TestCheckDiskWritableHappyPath(t *testing.T) {
 	dir := t.TempDir()
 	ok, err := CheckDiskWritable(dir)
 	if err != nil {
 		t.Fatalf("CheckDiskWritable: %v", err)
 	}
 	if !ok {
 		t.Error("expected ok=true on writable temp dir")
 	}
 	// Probe file should have been cleaned up.
 	if _, err := os.Stat(filepath.Join(dir, ".update-probe")); !os.IsNotExist(err) {
 		t.Errorf("probe file not cleaned up: stat err=%v", err)
 	}
 }
 func TestCheckDiskWritableMissingDir(t *testing.T) {
 	_, err := CheckDiskWritable("/this/path/does/not/exist")
 	if err == nil {
 		t.Error("expected error for missing dataDir, got nil")
 	}
 }
--- a/update/pkg/health/health.go
+++ b/update/pkg/health/health.go
@@ -24,15 +24,20 @@ import (
 // Status represents the result of a health check.
 type Status struct {
-	Containerd bool
+	Containerd       bool
-	APIServer  bool
+	APIServer        bool
-	NodeReady  bool
+	NodeReady        bool
-	Message    string
+	KubeSystemReady  bool // optional — true unless KubeSystemSettle is non-zero
 	ProbeURL         bool // optional — true unless ProbeURL is set
 	DiskWritable     bool // optional — true unless DataDir is set
 	Message          string
 }
-// IsHealthy returns true if all checks passed.
+// IsHealthy returns true if all required checks passed. Optional checks
 // default to true when not configured, so they don't block the result.
 func (s *Status) IsHealthy() bool {
-	return s.Containerd && s.APIServer && s.NodeReady
+	return s.Containerd && s.APIServer && s.NodeReady &&
 		s.KubeSystemReady && s.ProbeURL && s.DiskWritable
 }
 // Checker performs health checks against the local KubeSolo instance.
@@ -40,6 +45,11 @@ type Checker struct {
 	kubeconfigPath string
 	apiServerAddr  string
 	timeout        time.Duration
 	// Optional gates. Zero values disable the check (it reports true).
 	KubeSystemSettle time.Duration
 	ProbeURL         string
 	DataDir          string
 }
 // NewChecker creates a health checker.
@@ -149,12 +159,37 @@ func (c *Checker) CheckNodeReady() bool {
 }
 // RunAll performs all health checks and returns the combined status.
 //
 // Optional checks (kube-system settle, user probe URL, disk writability) are
 // only run if the corresponding Checker fields are set; otherwise they
 // report true so as not to block the result.
 func (c *Checker) RunAll() *Status {
-	return &Status{
+	s := &Status{
-		Containerd: c.CheckContainerd(),
+		Containerd:      c.CheckContainerd(),
-		APIServer:  c.CheckAPIServer(),
+		APIServer:       c.CheckAPIServer(),
-		NodeReady:  c.CheckNodeReady(),
+		NodeReady:       c.CheckNodeReady(),
 		KubeSystemReady: true,
 		ProbeURL:        true,
 		DiskWritable:    true,
 	}
 	if c.KubeSystemSettle > 0 {
 		s.KubeSystemReady = c.CheckKubeSystemReady(c.KubeSystemSettle)
 	}
 	if c.ProbeURL != "" {
 		ok, err := CheckProbeURL(c.ProbeURL)
 		if err != nil {
 			slog.Warn("probe URL check failed", "url", c.ProbeURL, "error", err)
 		}
 		s.ProbeURL = ok
 	}
 	if c.DataDir != "" {
 		ok, err := CheckDiskWritable(c.DataDir)
 		if err != nil {
 			slog.Warn("disk writability check failed", "dir", c.DataDir, "error", err)
 		}
 		s.DiskWritable = ok
 	}
 	return s
 }
 // WaitForHealthy polls health checks until all pass or timeout expires.
--- a/update/pkg/health/health_test.go
+++ b/update/pkg/health/health_test.go
@@ -6,36 +6,42 @@ import (
 )
 func TestStatusIsHealthy(t *testing.T) {
 	// Helper for the new 6-field Status: all-true except the named one.
 	allBut := func(field string) Status {
 		s := Status{
 			Containerd: true, APIServer: true, NodeReady: true,
 			KubeSystemReady: true, ProbeURL: true, DiskWritable: true,
 		}
 		switch field {
 		case "Containerd":
 			s.Containerd = false
 		case "APIServer":
 			s.APIServer = false
 		case "NodeReady":
 			s.NodeReady = false
 		case "KubeSystemReady":
 			s.KubeSystemReady = false
 		case "ProbeURL":
 			s.ProbeURL = false
 		case "DiskWritable":
 			s.DiskWritable = false
 		}
 		return s
 	}
 	tests := []struct {
 		name       string
 		status     Status
 		wantHealth bool
 	}{
-		{
+		{"all healthy", allBut(""), true},
-			name:       "all healthy",
+		{"containerd down", allBut("Containerd"), false},
-			status:     Status{Containerd: true, APIServer: true, NodeReady: true},
+		{"apiserver down", allBut("APIServer"), false},
-			wantHealth: true,
+		{"node not ready", allBut("NodeReady"), false},
-		},
+		{"kube-system not ready", allBut("KubeSystemReady"), false},
-		{
+		{"probe URL failed", allBut("ProbeURL"), false},
-			name:       "containerd down",
+		{"disk not writable", allBut("DiskWritable"), false},
-			status:     Status{Containerd: false, APIServer: true, NodeReady: true},
+		{"all down", Status{}, false},
 			wantHealth: false,
 		},
 		{
 			name:       "apiserver down",
 			status:     Status{Containerd: true, APIServer: false, NodeReady: true},
 			wantHealth: false,
 		},
 		{
 			name:       "node not ready",
 			status:     Status{Containerd: true, APIServer: true, NodeReady: false},
 			wantHealth: false,
 		},
 		{
 			name:       "all down",
 			status:     Status{Containerd: false, APIServer: false, NodeReady: false},
 			wantHealth: false,
 		},
 	}
 	for _, tt := range tests {
--- a/update/pkg/health/preflight.go
+++ b/update/pkg/health/preflight.go
@@ -0,0 +1,51 @@
 package health
 import (
 	"context"
 	"fmt"
 	"os"
 	"os/exec"
 	"strings"
 	"time"
 )
 // NodeBlockLabel is the well-known label that workload authors set on the
 // local node to defer an OS update. When present and "true", apply refuses.
 const NodeBlockLabel = "updates.kubesolo.io/block"
 // CheckNodeBlocked returns (blocked, error). blocked==true means the local
 // node carries the updates.kubesolo.io/block=true label and the caller should
 // refuse the update.
 //
 // If the kubeconfig is not available (offline / pre-boot / air-gap), this
 // returns (false, nil) — silently allowing the update. That's the safe
 // behaviour for the air-gap case where the node may not be reachable from
 // the agent's perspective.
 func CheckNodeBlocked(kubeconfigPath string) (bool, error) {
 	if kubeconfigPath == "" {
 		kubeconfigPath = "/var/lib/kubesolo/pki/admin/admin.kubeconfig"
 	}
 	if _, err := os.Stat(kubeconfigPath); err != nil {
 		// No kubeconfig — assume air-gap / pre-K8s. Don't block updates.
 		return false, nil
 	}
 	// Query the node label via kubectl. We don't know the node name a
 	// priori, so we use --kubeconfig on the local admin config and ask for
 	// "the only node" (KubeSolo is single-node by design).
 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
 	defer cancel()
 	cmd := exec.CommandContext(ctx, "kubectl",
 		"--kubeconfig", kubeconfigPath,
 		"get", "node",
 		"-o", `jsonpath={.items[0].metadata.labels.updates\.kubesolo\.io/block}`)
 	out, err := cmd.Output()
 	if err != nil {
 		// API unreachable or no nodes — treat as not blocked (analogous to
 		// the kubeconfig-missing case). We still surface the error so the
 		// caller can decide to log it.
 		return false, fmt.Errorf("query node label: %w", err)
 	}
 	return strings.TrimSpace(string(out)) == "true", nil
 }
--- a/update/pkg/image/image.go
+++ b/update/pkg/image/image.go
@@ -35,6 +35,24 @@ type UpdateMetadata struct {
 	MetadataSigURL  string `json:"metadata_sig_url,omitempty"`
 	ReleaseNotes    string `json:"release_notes,omitempty"`
 	ReleaseDate     string `json:"release_date,omitempty"`
 	// Channel labels this artifact ("stable", "beta", "edge", ...). The agent
 	// refuses metadata whose channel doesn't match the locally-configured
 	// one. Empty in metadata means "no channel constraint, accept anything".
 	Channel string `json:"channel,omitempty"`
 	// MinCompatibleVersion is the lowest version that can upgrade to this
 	// one. The agent refuses to apply if the currently-running version is
 	// below this. Used for stepping-stone migrations (e.g. 0.2.x -> 0.3.x
 	// requires 0.2.5+ to land the state-file format first). Empty means
 	// "any source version OK".
 	MinCompatibleVersion string `json:"min_compatible_version,omitempty"`
 	// Architecture restricts this artifact to a specific GOARCH ("amd64",
 	// "arm64"). Empty means the artifact is arch-agnostic — which is rare
 	// since the kernel + initramfs are arch-specific; this should normally
 	// be populated by the build pipeline.
 	Architecture string `json:"architecture,omitempty"`
 }
 // StagedImage represents downloaded and verified update files.
--- a/update/pkg/metrics/metrics.go
+++ b/update/pkg/metrics/metrics.go
@@ -11,6 +11,9 @@
 //	kubesolo_os_update_last_check_timestamp_seconds  unix timestamp (gauge)
 //	kubesolo_os_memory_total_bytes                   total RAM (gauge)
 //	kubesolo_os_memory_available_bytes               available RAM (gauge)
 //	kubesolo_update_phase{phase}                     1 for current phase, 0 for others
 //	kubesolo_update_attempts_total                   counter — attempts at current ToVersion
 //	kubesolo_update_last_attempt_timestamp_seconds   unix timestamp of last state update
 //
 // This is a zero-dependency implementation — no Prometheus client library needed.
 // It serves metrics in the Prometheus text exposition format.
@@ -25,11 +28,14 @@ import (
 	"strings"
 	"sync"
 	"time"
 	"github.com/portainer/kubesolo-os/update/pkg/state"
 )
 // Server is a lightweight Prometheus metrics HTTP server.
 type Server struct {
 	grubenvPath string
 	statePath   string
 	listenAddr  string
 	startTime   time.Time
@@ -47,6 +53,27 @@ func NewServer(listenAddr, grubenvPath string) *Server {
 	}
 }
 // SetStatePath sets the location of the update state.json file. If empty or
 // unset, state-derived metrics are emitted with the Idle defaults.
 func (s *Server) SetStatePath(p string) {
 	s.statePath = p
 }
 // allPhases lists every Phase value we emit as a kubesolo_update_phase
 // time-series, so consumers see all label values (with value 0 for non-current
 // phases). Mirror of validPhases in pkg/state.
 var allPhases = []state.Phase{
 	state.PhaseIdle,
 	state.PhaseChecking,
 	state.PhaseDownloading,
 	state.PhaseStaged,
 	state.PhaseActivated,
 	state.PhaseVerifying,
 	state.PhaseSuccess,
 	state.PhaseRolledBack,
 	state.PhaseFailed,
 }
 // SetUpdateAvailable records whether an update is available.
 func (s *Server) SetUpdateAvailable(available bool) {
 	s.mu.Lock()
@@ -125,9 +152,49 @@ func (s *Server) handleMetrics(w http.ResponseWriter, r *http.Request) {
 	sb.WriteString("# TYPE kubesolo_os_memory_available_bytes gauge\n")
 	sb.WriteString(fmt.Sprintf("kubesolo_os_memory_available_bytes %d\n", memAvail))
 	// Update lifecycle (from state.json)
 	s.writeUpdateStateMetrics(&sb)
 	fmt.Fprint(w, sb.String())
 }
 // writeUpdateStateMetrics appends update-lifecycle metrics derived from the
 // state.json file. If the file is missing or unreadable, emits the Idle
 // defaults so the metric series exists at all times.
 func (s *Server) writeUpdateStateMetrics(sb *strings.Builder) {
 	current := state.PhaseIdle
 	var attempts int
 	var lastTS float64
 	if s.statePath != "" {
 		if st, err := state.Load(s.statePath); err == nil && st != nil {
 			current = st.Phase
 			attempts = st.AttemptCount
 			if !st.UpdatedAt.IsZero() {
 				lastTS = float64(st.UpdatedAt.Unix())
 			}
 		}
 	}
 	sb.WriteString("# HELP kubesolo_update_phase Current update lifecycle phase (1 for active, 0 otherwise).\n")
 	sb.WriteString("# TYPE kubesolo_update_phase gauge\n")
 	for _, p := range allPhases {
 		v := 0
 		if p == current {
 			v = 1
 		}
 		sb.WriteString(fmt.Sprintf("kubesolo_update_phase{phase=%q} %d\n", string(p), v))
 	}
 	sb.WriteString("# HELP kubesolo_update_attempts_total Number of update attempts at the current target version.\n")
 	sb.WriteString("# TYPE kubesolo_update_attempts_total counter\n")
 	sb.WriteString(fmt.Sprintf("kubesolo_update_attempts_total %d\n", attempts))
 	sb.WriteString("# HELP kubesolo_update_last_attempt_timestamp_seconds Unix timestamp of the last state transition.\n")
 	sb.WriteString("# TYPE kubesolo_update_last_attempt_timestamp_seconds gauge\n")
 	sb.WriteString(fmt.Sprintf("kubesolo_update_last_attempt_timestamp_seconds %.0f\n", lastTS))
 }
 // readGrubenvVar reads a single variable from grubenv using simple file parse.
 func (s *Server) readGrubenvVar(key string) string {
 	data, err := os.ReadFile(s.grubenvPath)
--- a/update/pkg/metrics/metrics_test.go
+++ b/update/pkg/metrics/metrics_test.go
@@ -8,6 +8,8 @@ import (
 	"path/filepath"
 	"strings"
 	"testing"
 	"github.com/portainer/kubesolo-os/update/pkg/state"
 )
 func TestNewServer(t *testing.T) {
@@ -247,6 +249,86 @@ func TestSafeInt(t *testing.T) {
 	}
 }
 func TestUpdateStateMetricsAbsentStateFile(t *testing.T) {
 	// No state path set — should emit Idle defaults so the metric series
 	// exists from first boot.
 	s := NewServer(":9100", "/tmp/nonexistent")
 	req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
 	w := httptest.NewRecorder()
 	s.handleMetrics(w, req)
 	body, _ := io.ReadAll(w.Result().Body)
 	output := string(body)
 	if !strings.Contains(output, `kubesolo_update_phase{phase="idle"} 1`) {
 		t.Errorf("expected idle=1 with no state file, got:\n%s", output)
 	}
 	if !strings.Contains(output, `kubesolo_update_phase{phase="checking"} 0`) {
 		t.Errorf("expected checking=0 with no state file, got:\n%s", output)
 	}
 	if !strings.Contains(output, "kubesolo_update_attempts_total 0") {
 		t.Errorf("expected attempts=0 with no state file, got:\n%s", output)
 	}
 }
 func TestUpdateStateMetricsActivePhase(t *testing.T) {
 	dir := t.TempDir()
 	statePath := filepath.Join(dir, "state.json")
 	st := state.New()
 	if err := st.Transition(statePath, state.PhaseDownloading, "v0.3.0", ""); err != nil {
 		t.Fatalf("seed state: %v", err)
 	}
 	s := NewServer(":9100", "/tmp/nonexistent")
 	s.SetStatePath(statePath)
 	req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
 	w := httptest.NewRecorder()
 	s.handleMetrics(w, req)
 	body, _ := io.ReadAll(w.Result().Body)
 	output := string(body)
 	if !strings.Contains(output, `kubesolo_update_phase{phase="downloading"} 1`) {
 		t.Errorf("expected downloading=1, got:\n%s", output)
 	}
 	if !strings.Contains(output, `kubesolo_update_phase{phase="idle"} 0`) {
 		t.Errorf("expected idle=0 when downloading is active, got:\n%s", output)
 	}
 	if !strings.Contains(output, "kubesolo_update_attempts_total 1") {
 		t.Errorf("expected attempts=1 after first Transition, got:\n%s", output)
 	}
 	if strings.Contains(output, "kubesolo_update_last_attempt_timestamp_seconds 0\n") {
 		t.Errorf("expected non-zero timestamp after state write, got:\n%s", output)
 	}
 }
 func TestUpdateStateMetricsAllPhasesEmitted(t *testing.T) {
 	// Every phase value should appear in the output, so dashboards can graph
 	// the series cleanly.
 	s := NewServer(":9100", "/tmp/nonexistent")
 	req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
 	w := httptest.NewRecorder()
 	s.handleMetrics(w, req)
 	body, _ := io.ReadAll(w.Result().Body)
 	output := string(body)
 	for _, p := range []state.Phase{
 		state.PhaseIdle, state.PhaseChecking, state.PhaseDownloading,
 		state.PhaseStaged, state.PhaseActivated, state.PhaseVerifying,
 		state.PhaseSuccess, state.PhaseRolledBack, state.PhaseFailed,
 	} {
 		needle := `kubesolo_update_phase{phase="` + string(p) + `"}`
 		if !strings.Contains(output, needle) {
 			t.Errorf("phase %q not present in metrics output", p)
 		}
 	}
 }
 func TestReadFileString(t *testing.T) {
 	dir := t.TempDir()
--- a/update/pkg/oci/oci.go
+++ b/update/pkg/oci/oci.go
@@ -0,0 +1,281 @@
 // Package oci pulls KubeSolo OS update artifacts from an OCI-compliant
 // container registry (e.g. ghcr.io). It is the registry-native alternative
 // to the legacy HTTP `latest.json` protocol implemented in pkg/image.
 //
 // # Artifact layout
 //
 // An update is published as a single OCI artifact under a tag like
 // `stable` or `v0.3.0`. The tag may point at either:
 //
 //   - A manifest index (preferred) containing per-architecture manifests.
 //     The agent picks the one matching runtime.GOARCH.
 //   - A single manifest (used for arch-specific tags such as
 //     `v0.3.0-amd64`). The agent verifies architecture against the
 //     manifest's platform annotation before trusting it.
 //
 // Each per-architecture manifest carries two layers:
 //
 //	application/vnd.kubesolo.os.kernel.v1+octet-stream     // vmlinuz / Image
 //	application/vnd.kubesolo.os.initramfs.v1+gzip          // kubesolo-os.gz
 //
 // And these annotations (read into image.UpdateMetadata):
 //
 //	io.kubesolo.os.version                "v0.3.0"
 //	io.kubesolo.os.channel                "stable"
 //	io.kubesolo.os.min_compatible_version "v0.2.0"
 //	io.kubesolo.os.architecture           "amd64"
 //	io.kubesolo.os.release_notes          (optional, short)
 //	io.kubesolo.os.release_date           (optional, RFC3339)
 //
 // The agent ignores any additional layers, so the same image can also be
 // shaped as a "scratch" container if the build pipeline finds that convenient
 // for ecosystem tooling.
 package oci
 import (
 	"context"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"io"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"runtime"
 	"github.com/opencontainers/go-digest"
 	ocispec "github.com/opencontainers/image-spec/specs-go/v1"
 	"oras.land/oras-go/v2/content"
 	"oras.land/oras-go/v2/registry/remote"
 	"github.com/portainer/kubesolo-os/update/pkg/image"
 )
 // Media types used on KubeSolo OS update artifacts. Kept here (not in
 // pkg/image) so the OCI protocol surface is fully self-contained.
 const (
 	MediaKernel    = "application/vnd.kubesolo.os.kernel.v1+octet-stream"
 	MediaInitramfs = "application/vnd.kubesolo.os.initramfs.v1+gzip"
 	AnnotVersion     = "io.kubesolo.os.version"
 	AnnotChannel     = "io.kubesolo.os.channel"
 	AnnotMinVersion  = "io.kubesolo.os.min_compatible_version"
 	AnnotArch        = "io.kubesolo.os.architecture"
 	AnnotReleaseNote = "io.kubesolo.os.release_notes"
 	AnnotReleaseDate = "io.kubesolo.os.release_date"
 )
 // Client pulls artifacts from a single OCI repository (e.g.
 // `ghcr.io/portainer/kubesolo-os`).
 //
 // Anonymous (public-pull) access is supported out of the box. For private
 // repositories, configure auth via the underlying remote.Repository.Client
 // before passing it to Resolve/Pull — that hook isn't surfaced here yet
 // (deferred until we actually need it for a private fleet).
 type Client struct {
 	repo *remote.Repository
 	// Arch is the architecture string we match against manifest indexes.
 	// Defaults to runtime.GOARCH; overridable for testing.
 	Arch string
 }
 // NewClient parses a repository reference of the form `host/path` (no tag)
 // and returns a ready-to-use Client.
 func NewClient(repoRef string) (*Client, error) {
 	repo, err := remote.NewRepository(repoRef)
 	if err != nil {
 		return nil, fmt.Errorf("invalid OCI reference %q: %w", repoRef, err)
 	}
 	// remote.NewRepository defaults to HTTPS. PlainHTTP is set per-test
 	// via the WithPlainHTTP option when we hit a httptest.Server.
 	return &Client{repo: repo, Arch: runtime.GOARCH}, nil
 }
 // WithPlainHTTP toggles the underlying registry transport to HTTP. Useful for
 // httptest-driven unit tests; do not use against production registries.
 func (c *Client) WithPlainHTTP(plain bool) *Client {
 	c.repo.PlainHTTP = plain
 	return c
 }
 // FetchMetadata resolves the tag, walks index → manifest if needed, and
 // returns an image.UpdateMetadata populated from the manifest's annotations.
 // No blobs are downloaded — this is the cheap "what's available" probe.
 func (c *Client) FetchMetadata(ctx context.Context, tag string) (*image.UpdateMetadata, error) {
 	manifest, _, err := c.resolveArchManifest(ctx, tag)
 	if err != nil {
 		return nil, err
 	}
 	return metadataFromAnnotations(manifest.Annotations), nil
 }
 // Pull resolves the tag, picks the matching-architecture manifest, downloads
 // the kernel + initramfs layers to `stageDir`, verifies their digests, and
 // returns a StagedImage compatible with the existing pkg/image consumer.
 func (c *Client) Pull(ctx context.Context, tag, stageDir string) (*image.StagedImage, *image.UpdateMetadata, error) {
 	manifest, _, err := c.resolveArchManifest(ctx, tag)
 	if err != nil {
 		return nil, nil, err
 	}
 	if err := os.MkdirAll(stageDir, 0o755); err != nil {
 		return nil, nil, fmt.Errorf("create stage dir: %w", err)
 	}
 	var kernelPath, initramfsPath string
 	for _, layer := range manifest.Layers {
 		switch layer.MediaType {
 		case MediaKernel:
 			kernelPath = filepath.Join(stageDir, "vmlinuz")
 			if err := c.fetchBlobTo(ctx, layer, kernelPath); err != nil {
 				return nil, nil, fmt.Errorf("download kernel: %w", err)
 			}
 		case MediaInitramfs:
 			initramfsPath = filepath.Join(stageDir, "kubesolo-os.gz")
 			if err := c.fetchBlobTo(ctx, layer, initramfsPath); err != nil {
 				return nil, nil, fmt.Errorf("download initramfs: %w", err)
 			}
 		default:
 			slog.Debug("oci: skipping unknown layer", "media", layer.MediaType)
 		}
 	}
 	if kernelPath == "" {
 		return nil, nil, fmt.Errorf("manifest has no %s layer", MediaKernel)
 	}
 	if initramfsPath == "" {
 		return nil, nil, fmt.Errorf("manifest has no %s layer", MediaInitramfs)
 	}
 	meta := metadataFromAnnotations(manifest.Annotations)
 	staged := &image.StagedImage{
 		VmlinuzPath:   kernelPath,
 		InitramfsPath: initramfsPath,
 		Version:       meta.Version,
 	}
 	return staged, meta, nil
 }
 // resolveArchManifest fetches the descriptor at `tag`, walks an index if
 // present, and returns the platform-specific manifest matching c.Arch.
 func (c *Client) resolveArchManifest(ctx context.Context, tag string) (*ocispec.Manifest, *ocispec.Descriptor, error) {
 	desc, err := c.repo.Resolve(ctx, tag)
 	if err != nil {
 		return nil, nil, fmt.Errorf("resolve tag %q: %w", tag, err)
 	}
 	switch desc.MediaType {
 	case ocispec.MediaTypeImageIndex, "application/vnd.docker.distribution.manifest.list.v2+json":
 		index, err := fetchJSON[ocispec.Index](ctx, c.repo, desc)
 		if err != nil {
 			return nil, nil, fmt.Errorf("fetch index: %w", err)
 		}
 		var matched *ocispec.Descriptor
 		for i := range index.Manifests {
 			m := &index.Manifests[i]
 			if m.Platform != nil && m.Platform.Architecture == c.Arch {
 				matched = m
 				break
 			}
 		}
 		if matched == nil {
 			return nil, nil, fmt.Errorf("no manifest in index for architecture %q", c.Arch)
 		}
 		manifest, err := fetchJSON[ocispec.Manifest](ctx, c.repo, *matched)
 		if err != nil {
 			return nil, nil, fmt.Errorf("fetch manifest: %w", err)
 		}
 		return manifest, matched, nil
 	case ocispec.MediaTypeImageManifest, "application/vnd.docker.distribution.manifest.v2+json":
 		manifest, err := fetchJSON[ocispec.Manifest](ctx, c.repo, desc)
 		if err != nil {
 			return nil, nil, fmt.Errorf("fetch manifest: %w", err)
 		}
 		// Single-arch tag: if it declares an arch, enforce match.
 		if archAnnot := manifest.Annotations[AnnotArch]; archAnnot != "" && archAnnot != c.Arch {
 			return nil, nil, fmt.Errorf("single-arch manifest is %q, want %q", archAnnot, c.Arch)
 		}
 		return manifest, &desc, nil
 	default:
 		return nil, nil, fmt.Errorf("unsupported media type %q at tag %q", desc.MediaType, tag)
 	}
 }
 // fetchJSON pulls a small JSON document (manifest or index) and decodes it.
 func fetchJSON[T any](ctx context.Context, store content.Fetcher, desc ocispec.Descriptor) (*T, error) {
 	rc, err := store.Fetch(ctx, desc)
 	if err != nil {
 		return nil, err
 	}
 	defer rc.Close()
 	data, err := content.ReadAll(rc, desc)
 	if err != nil {
 		return nil, err
 	}
 	var out T
 	if err := json.Unmarshal(data, &out); err != nil {
 		return nil, fmt.Errorf("decode: %w", err)
 	}
 	return &out, nil
 }
 // fetchBlobTo streams a blob to disk and verifies its digest matches.
 // Cleans up the destination file on any error so we never leave a partial.
 func (c *Client) fetchBlobTo(ctx context.Context, desc ocispec.Descriptor, dest string) (retErr error) {
 	rc, err := c.repo.Fetch(ctx, desc)
 	if err != nil {
 		return fmt.Errorf("fetch blob: %w", err)
 	}
 	defer rc.Close()
 	f, err := os.Create(dest)
 	if err != nil {
 		return fmt.Errorf("create %s: %w", dest, err)
 	}
 	defer func() {
 		if cerr := f.Close(); retErr == nil && cerr != nil {
 			retErr = cerr
 		}
 		if retErr != nil {
 			_ = os.Remove(dest)
 		}
 	}()
 	verifier := desc.Digest.Algorithm().Hash()
 	mw := io.MultiWriter(f, verifier)
 	n, err := io.Copy(mw, rc)
 	if err != nil {
 		return fmt.Errorf("stream blob: %w", err)
 	}
 	if desc.Size > 0 && n != desc.Size {
 		return fmt.Errorf("blob size mismatch: got %d, want %d", n, desc.Size)
 	}
 	got := digest.NewDigest(desc.Digest.Algorithm(), verifier)
 	if got != desc.Digest {
 		return fmt.Errorf("blob digest mismatch: got %s, want %s", got, desc.Digest)
 	}
 	return nil
 }
 // metadataFromAnnotations builds an UpdateMetadata from manifest annotations.
 // Always returns a non-nil value (missing fields stay empty).
 func metadataFromAnnotations(a map[string]string) *image.UpdateMetadata {
 	if a == nil {
 		a = map[string]string{}
 	}
 	return &image.UpdateMetadata{
 		Version:              a[AnnotVersion],
 		Channel:              a[AnnotChannel],
 		MinCompatibleVersion: a[AnnotMinVersion],
 		Architecture:         a[AnnotArch],
 		ReleaseNotes:         a[AnnotReleaseNote],
 		ReleaseDate:          a[AnnotReleaseDate],
 	}
 }
 // ErrNoManifestForArch is returned from FetchMetadata/Pull when an index has
 // no entry matching the running architecture. Exposed so callers can
 // distinguish "registry unreachable" from "this build doesn't ship for us".
 var ErrNoManifestForArch = errors.New("no manifest in index for runtime architecture")
--- a/update/pkg/oci/oci_test.go
+++ b/update/pkg/oci/oci_test.go
@@ -0,0 +1,377 @@
 package oci
 import (
 	"context"
 	"crypto/sha256"
 	"encoding/json"
 	"fmt"
 	"io"
 	"net/http"
 	"net/http/httptest"
 	"net/url"
 	"os"
 	"path/filepath"
 	"strings"
 	"testing"
 	"github.com/opencontainers/go-digest"
 	specs "github.com/opencontainers/image-spec/specs-go"
 	ocispec "github.com/opencontainers/image-spec/specs-go/v1"
 )
 // fakeRegistry implements the minimum OCI distribution-spec surface our
 // Client touches: /v2/ probe, manifest fetch by tag or digest, blob fetch
 // by digest. Backed by an in-memory blob+manifest store.
 type fakeRegistry struct {
 	t          *testing.T
 	srv        *httptest.Server
 	blobs      map[digest.Digest][]byte // keyed by digest
 	manifests  map[string][]byte        // keyed by digest string (raw form)
 	tags       map[string]digest.Digest // tag -> manifest digest
 	mediaTypes map[digest.Digest]string // descriptor.MediaType per stored object
 }
 func newFakeRegistry(t *testing.T) *fakeRegistry {
 	t.Helper()
 	r := &fakeRegistry{
 		t:          t,
 		blobs:      map[digest.Digest][]byte{},
 		manifests:  map[string][]byte{},
 		tags:       map[string]digest.Digest{},
 		mediaTypes: map[digest.Digest]string{},
 	}
 	r.srv = httptest.NewServer(http.HandlerFunc(r.handle))
 	t.Cleanup(r.srv.Close)
 	return r
 }
 func (r *fakeRegistry) putBlob(media string, data []byte) digest.Digest {
 	h := sha256.Sum256(data)
 	d := digest.NewDigestFromBytes(digest.SHA256, h[:])
 	r.blobs[d] = data
 	r.mediaTypes[d] = media
 	return d
 }
 // putManifest stores a manifest/index document under both its digest and the
 // given tag, returning the digest the caller can embed in indexes.
 func (r *fakeRegistry) putManifest(tag string, media string, doc []byte) digest.Digest {
 	h := sha256.Sum256(doc)
 	d := digest.NewDigestFromBytes(digest.SHA256, h[:])
 	r.manifests[d.String()] = doc
 	r.mediaTypes[d] = media
 	if tag != "" {
 		r.tags[tag] = d
 	}
 	return d
 }
 // repoRef returns the "host:port/repo" string for use with NewClient.
 func (r *fakeRegistry) repoRef() string {
 	u, _ := url.Parse(r.srv.URL)
 	return u.Host + "/test/kubesolo-os"
 }
 func (r *fakeRegistry) handle(w http.ResponseWriter, req *http.Request) {
 	// Routes we implement:
 	//   GET /v2/                          -> 200 "{}"
 	//   GET /v2/test/kubesolo-os/manifests/<tag-or-digest> -> manifest
 	//   HEAD same                         -> same headers, no body
 	//   GET /v2/test/kubesolo-os/blobs/<digest> -> blob
 	path := req.URL.Path
 	if path == "/v2/" || path == "/v2" {
 		w.Header().Set("Docker-Distribution-API-Version", "registry/2.0")
 		w.WriteHeader(http.StatusOK)
 		_, _ = io.WriteString(w, "{}")
 		return
 	}
 	const prefix = "/v2/test/kubesolo-os/"
 	if !strings.HasPrefix(path, prefix) {
 		http.NotFound(w, req)
 		return
 	}
 	rest := strings.TrimPrefix(path, prefix)
 	switch {
 	case strings.HasPrefix(rest, "manifests/"):
 		ref := strings.TrimPrefix(rest, "manifests/")
 		var d digest.Digest
 		var data []byte
 		if td, ok := r.tags[ref]; ok {
 			d = td
 			data = r.manifests[d.String()]
 		} else if md, ok := r.manifests[ref]; ok {
 			d = digest.Digest(ref)
 			data = md
 		} else {
 			http.NotFound(w, req)
 			return
 		}
 		media := r.mediaTypes[d]
 		w.Header().Set("Content-Type", media)
 		w.Header().Set("Docker-Content-Digest", d.String())
 		w.Header().Set("Content-Length", fmt.Sprintf("%d", len(data)))
 		if req.Method == http.MethodHead {
 			return
 		}
 		_, _ = w.Write(data)
 	case strings.HasPrefix(rest, "blobs/"):
 		ref := strings.TrimPrefix(rest, "blobs/")
 		d := digest.Digest(ref)
 		blob, ok := r.blobs[d]
 		if !ok {
 			http.NotFound(w, req)
 			return
 		}
 		media := r.mediaTypes[d]
 		if media == "" {
 			media = "application/octet-stream"
 		}
 		w.Header().Set("Content-Type", media)
 		w.Header().Set("Docker-Content-Digest", d.String())
 		w.Header().Set("Content-Length", fmt.Sprintf("%d", len(blob)))
 		if req.Method == http.MethodHead {
 			return
 		}
 		_, _ = w.Write(blob)
 	default:
 		http.NotFound(w, req)
 	}
 }
 // seedSingleArchManifest puts kernel+initramfs blobs and a manifest with the
 // given annotations into the registry, tagged as `tag`.
 func (r *fakeRegistry) seedSingleArchManifest(t *testing.T, tag string, annot map[string]string) (kernelData, initramfsData []byte) {
 	t.Helper()
 	kernelData = []byte("FAKE-KERNEL-" + tag)
 	initramfsData = []byte("FAKE-INITRAMFS-" + tag)
 	kd := r.putBlob(MediaKernel, kernelData)
 	id := r.putBlob(MediaInitramfs, initramfsData)
 	// An empty config blob with sha256 of "{}" (the canonical "empty" body
 	// per OCI). We don't actually fetch the config so any valid descriptor
 	// works for the tests, but the digest still has to be syntactically valid.
 	emptyConfigBody := []byte("{}")
 	emptyConfigDigest := r.putBlob("application/vnd.oci.empty.v1+json", emptyConfigBody)
 	manifest := ocispec.Manifest{
 		Versioned: specs.Versioned{SchemaVersion: 2},
 		MediaType: ocispec.MediaTypeImageManifest,
 		Config: ocispec.Descriptor{
 			MediaType: "application/vnd.oci.empty.v1+json",
 			Size:      int64(len(emptyConfigBody)),
 			Digest:    emptyConfigDigest,
 		},
 		Layers: []ocispec.Descriptor{
 			{MediaType: MediaKernel, Digest: kd, Size: int64(len(kernelData))},
 			{MediaType: MediaInitramfs, Digest: id, Size: int64(len(initramfsData))},
 		},
 		Annotations: annot,
 	}
 	manifestBytes, err := json.Marshal(manifest)
 	if err != nil {
 		t.Fatalf("marshal manifest: %v", err)
 	}
 	r.putManifest(tag, ocispec.MediaTypeImageManifest, manifestBytes)
 	return
 }
 // seedIndex creates a manifest index pointing at per-arch manifests created
 // via seedSingleArchManifest with arch-suffixed tags, then publishes the
 // index under `tag`.
 func (r *fakeRegistry) seedIndex(t *testing.T, tag string, perArchAnnots map[string]map[string]string) {
 	t.Helper()
 	var descriptors []ocispec.Descriptor
 	for arch, annot := range perArchAnnots {
 		// Reuse seedSingleArchManifest but under an internal arch-suffixed tag
 		archTag := tag + "-" + arch
 		r.seedSingleArchManifest(t, archTag, annot)
 		d := r.tags[archTag]
 		descriptors = append(descriptors, ocispec.Descriptor{
 			MediaType: ocispec.MediaTypeImageManifest,
 			Digest:    d,
 			Size:      int64(len(r.manifests[d.String()])),
 			Platform:  &ocispec.Platform{Architecture: arch, OS: "linux"},
 		})
 	}
 	index := ocispec.Index{
 		Versioned: specs.Versioned{SchemaVersion: 2},
 		MediaType: ocispec.MediaTypeImageIndex,
 		Manifests: descriptors,
 	}
 	indexBytes, _ := json.Marshal(index)
 	r.putManifest(tag, ocispec.MediaTypeImageIndex, indexBytes)
 }
 // ---------------------------------------------------------------------------
 func TestFetchMetadataSingleArchManifest(t *testing.T) {
 	reg := newFakeRegistry(t)
 	reg.seedSingleArchManifest(t, "v0.3.0", map[string]string{
 		AnnotVersion: "v0.3.0",
 		AnnotChannel: "stable",
 		AnnotArch:    "amd64",
 	})
 	c, err := NewClient(reg.repoRef())
 	if err != nil {
 		t.Fatalf("NewClient: %v", err)
 	}
 	c.WithPlainHTTP(true)
 	c.Arch = "amd64"
 	meta, err := c.FetchMetadata(context.Background(), "v0.3.0")
 	if err != nil {
 		t.Fatalf("FetchMetadata: %v", err)
 	}
 	if meta.Version != "v0.3.0" {
 		t.Errorf("version: got %q, want v0.3.0", meta.Version)
 	}
 	if meta.Channel != "stable" {
 		t.Errorf("channel: got %q", meta.Channel)
 	}
 }
 func TestFetchMetadataIndexSelectsArch(t *testing.T) {
 	reg := newFakeRegistry(t)
 	reg.seedIndex(t, "stable", map[string]map[string]string{
 		"amd64": {AnnotVersion: "v0.3.0", AnnotChannel: "stable", AnnotArch: "amd64"},
 		"arm64": {AnnotVersion: "v0.3.0", AnnotChannel: "stable", AnnotArch: "arm64"},
 	})
 	for _, arch := range []string{"amd64", "arm64"} {
 		t.Run(arch, func(t *testing.T) {
 			c, err := NewClient(reg.repoRef())
 			if err != nil {
 				t.Fatalf("NewClient: %v", err)
 			}
 			c.WithPlainHTTP(true)
 			c.Arch = arch
 			meta, err := c.FetchMetadata(context.Background(), "stable")
 			if err != nil {
 				t.Fatalf("FetchMetadata: %v", err)
 			}
 			if meta.Architecture != arch {
 				t.Errorf("arch annotation: got %q, want %q", meta.Architecture, arch)
 			}
 			if meta.Version != "v0.3.0" {
 				t.Errorf("version: got %q, want v0.3.0", meta.Version)
 			}
 		})
 	}
 }
 func TestFetchMetadataIndexMissingArchErrors(t *testing.T) {
 	reg := newFakeRegistry(t)
 	reg.seedIndex(t, "stable", map[string]map[string]string{
 		"amd64": {AnnotVersion: "v0.3.0", AnnotArch: "amd64"},
 	})
 	c, _ := NewClient(reg.repoRef())
 	c.WithPlainHTTP(true)
 	c.Arch = "arm64" // not in the index
 	_, err := c.FetchMetadata(context.Background(), "stable")
 	if err == nil {
 		t.Fatal("expected error for missing arch, got nil")
 	}
 	if !strings.Contains(err.Error(), "arm64") {
 		t.Errorf("expected error mentioning arm64, got: %v", err)
 	}
 }
 func TestFetchMetadataSingleArchManifestRejectsCrossArch(t *testing.T) {
 	// If the manifest declares an arch via annotation and it doesn't match
 	// our runtime, Pull should refuse — defense in depth on top of the
 	// channel/version gates in cmd/apply.go.
 	reg := newFakeRegistry(t)
 	reg.seedSingleArchManifest(t, "v0.3.0-arm64", map[string]string{
 		AnnotArch: "arm64",
 	})
 	c, _ := NewClient(reg.repoRef())
 	c.WithPlainHTTP(true)
 	c.Arch = "amd64"
 	_, err := c.FetchMetadata(context.Background(), "v0.3.0-arm64")
 	if err == nil {
 		t.Fatal("expected error pulling cross-arch single-arch manifest, got nil")
 	}
 }
 func TestPullDownloadsBlobsAndVerifiesDigest(t *testing.T) {
 	reg := newFakeRegistry(t)
 	kernelData, initramfsData := reg.seedSingleArchManifest(t, "v0.3.0",
 		map[string]string{AnnotVersion: "v0.3.0", AnnotArch: "amd64"})
 	c, _ := NewClient(reg.repoRef())
 	c.WithPlainHTTP(true)
 	c.Arch = "amd64"
 	stageDir := filepath.Join(t.TempDir(), "stage")
 	staged, meta, err := c.Pull(context.Background(), "v0.3.0", stageDir)
 	if err != nil {
 		t.Fatalf("Pull: %v", err)
 	}
 	if meta.Version != "v0.3.0" {
 		t.Errorf("meta version: got %q", meta.Version)
 	}
 	if staged.Version != "v0.3.0" {
 		t.Errorf("staged version: got %q", staged.Version)
 	}
 	gotKernel, err := os.ReadFile(staged.VmlinuzPath)
 	if err != nil {
 		t.Fatalf("read kernel: %v", err)
 	}
 	if string(gotKernel) != string(kernelData) {
 		t.Errorf("kernel mismatch:\n got %q\nwant %q", gotKernel, kernelData)
 	}
 	gotInit, err := os.ReadFile(staged.InitramfsPath)
 	if err != nil {
 		t.Fatalf("read initramfs: %v", err)
 	}
 	if string(gotInit) != string(initramfsData) {
 		t.Errorf("initramfs mismatch")
 	}
 }
 func TestPullRejectsTamperedBlob(t *testing.T) {
 	// Mutate the kernel blob after it's been digested into the manifest.
 	// Pull should refuse with a digest mismatch.
 	reg := newFakeRegistry(t)
 	_, _ = reg.seedSingleArchManifest(t, "v0.3.0",
 		map[string]string{AnnotVersion: "v0.3.0", AnnotArch: "amd64"})
 	// Corrupt every stored kernel blob in the registry by replacing its body.
 	for d, m := range reg.mediaTypes {
 		if m == MediaKernel {
 			reg.blobs[d] = []byte("TAMPERED-KERNEL-WRONG-LENGTH-AND-DIGEST")
 		}
 	}
 	c, _ := NewClient(reg.repoRef())
 	c.WithPlainHTTP(true)
 	c.Arch = "amd64"
 	_, _, err := c.Pull(context.Background(), "v0.3.0", filepath.Join(t.TempDir(), "stage"))
 	if err == nil {
 		t.Fatal("expected digest mismatch error on tampered blob, got nil")
 	}
 	if !strings.Contains(err.Error(), "mismatch") {
 		t.Errorf("expected mismatch in error, got: %v", err)
 	}
 }
 func TestNewClientRejectsGarbageReference(t *testing.T) {
 	_, err := NewClient("not a valid reference")
 	if err == nil {
 		t.Error("expected error on bad reference, got nil")
 	}
 }
--- a/update/pkg/partition/freespace.go
+++ b/update/pkg/partition/freespace.go
@@ -0,0 +1,34 @@
 package partition
 import (
 	"fmt"
 	"syscall"
 )
 // FreeBytes returns the number of free bytes available on the filesystem
 // containing `path`. Uses statfs(2); path must exist and be readable.
 func FreeBytes(path string) (uint64, error) {
 	var stat syscall.Statfs_t
 	if err := syscall.Statfs(path, &stat); err != nil {
 		return 0, fmt.Errorf("statfs %s: %w", path, err)
 	}
 	// Bavail is the count of free blocks available to non-root users —
 	// matches what `df` reports. Bsize is the block size in bytes.
 	//nolint:unconvert // Bavail is uint64 on most platforms but int64 on darwin/freebsd
 	return uint64(stat.Bavail) * uint64(stat.Bsize), nil
 }
 // HasFreeSpaceFor reports whether `path`'s filesystem has at least `wantBytes`
 // of free space, with `headroomPct` reserved (e.g. 10 = require 110% of want).
 // Returns the available bytes alongside, so callers can render a useful error.
 func HasFreeSpaceFor(path string, wantBytes int64, headroomPct int) (avail uint64, ok bool, err error) {
 	avail, err = FreeBytes(path)
 	if err != nil {
 		return 0, false, err
 	}
 	if wantBytes < 0 {
 		return avail, false, fmt.Errorf("invalid wantBytes %d", wantBytes)
 	}
 	required := uint64(wantBytes) * uint64(100+headroomPct) / 100
 	return avail, avail >= required, nil
 }
--- a/update/pkg/partition/freespace_test.go
+++ b/update/pkg/partition/freespace_test.go
@@ -0,0 +1,44 @@
 package partition
 import "testing"
 func TestFreeBytesReturnsNonZeroOnTempDir(t *testing.T) {
 	b, err := FreeBytes(t.TempDir())
 	if err != nil {
 		t.Fatalf("FreeBytes: %v", err)
 	}
 	// On any sane test runner the temp filesystem has more than 1 KiB free.
 	if b < 1024 {
 		t.Errorf("FreeBytes = %d, want > 1024 on /tmp", b)
 	}
 }
 func TestFreeBytesNonExistentPath(t *testing.T) {
 	_, err := FreeBytes("/this/path/does/not/exist/at/all")
 	if err == nil {
 		t.Error("expected error for missing path, got nil")
 	}
 }
 func TestHasFreeSpaceForRejectsHugeRequest(t *testing.T) {
 	// Request 1 PiB with 10% headroom on /tmp — no test runner has that
 	// much free, so this should consistently report not-enough.
 	avail, ok, err := HasFreeSpaceFor(t.TempDir(), 1<<50, 10)
 	if err != nil {
 		t.Fatalf("HasFreeSpaceFor: %v", err)
 	}
 	if ok {
 		t.Errorf("expected insufficient space for 1PiB, got avail=%d ok=true", avail)
 	}
 }
 func TestHasFreeSpaceForAcceptsSmallRequest(t *testing.T) {
 	// 1 KiB with 10% headroom = 1.1 KiB. Any temp dir has this.
 	_, ok, err := HasFreeSpaceFor(t.TempDir(), 1024, 10)
 	if err != nil {
 		t.Fatalf("HasFreeSpaceFor: %v", err)
 	}
 	if !ok {
 		t.Error("expected sufficient space for 1KiB on /tmp")
 	}
 }
--- a/update/pkg/state/state.go
+++ b/update/pkg/state/state.go
@@ -0,0 +1,206 @@
 // Package state tracks the lifecycle of an OS update on disk.
 //
 // The state file (default /var/lib/kubesolo/update/state.json) records which
 // phase the agent is in, what versions are involved, when the attempt started,
 // any error from the last operation, and how many attempts have been made.
 // Updates are atomic via tmp+rename, so a crash mid-write doesn't corrupt the
 // state.
 //
 // Consumers:
 //   - cmd/check, cmd/apply, cmd/activate, cmd/healthcheck, cmd/rollback —
 //     transition the phase as they enter / leave their operations.
 //   - cmd/status --json — emits the raw state for orchestration tooling.
 //   - pkg/metrics — reads the state at scrape time to expose phase and
 //     attempt-count gauges.
 package state
 import (
 	"encoding/json"
 	"fmt"
 	"os"
 	"path/filepath"
 	"time"
 )
 // DefaultPath is where state.json lives on a live system. The directory is on
 // the persistent data partition so the file survives A/B slot switches.
 const DefaultPath = "/var/lib/kubesolo/update/state.json"
 // Phase represents the current step in the update lifecycle.
 //
 // Terminal phases (Success, RolledBack, Failed) describe the outcome of the
 // most recent attempt; transient phases (Checking, Downloading, Staged,
 // Activated, Verifying) describe in-progress work. Idle means no update has
 // been attempted yet, or the previous attempt has been acknowledged.
 type Phase string
 const (
 	// PhaseIdle — no update in progress.
 	PhaseIdle Phase = "idle"
 	// PhaseChecking — querying the update server for new versions.
 	PhaseChecking Phase = "checking"
 	// PhaseDownloading — pulling artifacts from the server.
 	PhaseDownloading Phase = "downloading"
 	// PhaseStaged — artifacts written to the passive partition; not yet active.
 	PhaseStaged Phase = "staged"
 	// PhaseActivated — passive slot promoted; next boot will use the new version.
 	PhaseActivated Phase = "activated"
 	// PhaseVerifying — post-boot healthcheck in progress on the new version.
 	PhaseVerifying Phase = "verifying"
 	// PhaseSuccess — last attempt completed and verified.
 	PhaseSuccess Phase = "success"
 	// PhaseRolledBack — last attempt failed verification; reverted to prior slot.
 	PhaseRolledBack Phase = "rolled_back"
 	// PhaseFailed — last attempt failed before reaching activation (download,
 	// checksum, signature, etc.). System still on the original slot.
 	PhaseFailed Phase = "failed"
 )
 // validPhases lists every legal Phase value. Anything not in this set is
 // rejected by Save() to catch typos.
 var validPhases = map[Phase]struct{}{
 	PhaseIdle:        {},
 	PhaseChecking:    {},
 	PhaseDownloading: {},
 	PhaseStaged:      {},
 	PhaseActivated:   {},
 	PhaseVerifying:   {},
 	PhaseSuccess:     {},
 	PhaseRolledBack:  {},
 	PhaseFailed:      {},
 }
 // UpdateState is the on-disk representation. Fields use JSON tags so the
 // file format is forward-compatible (extra fields ignored, missing fields
 // default).
 type UpdateState struct {
 	// Phase is the current lifecycle position.
 	Phase Phase `json:"phase"`
 	// FromVersion is the version the system was running before the attempt.
 	// Empty when no attempt has run.
 	FromVersion string `json:"from_version,omitempty"`
 	// ToVersion is the version the attempt is targeting.
 	// Empty when no attempt has run.
 	ToVersion string `json:"to_version,omitempty"`
 	// StartedAt is when the current attempt entered a non-Idle phase.
 	StartedAt time.Time `json:"started_at,omitempty"`
 	// UpdatedAt is the last time the file was written. Always set on Save().
 	UpdatedAt time.Time `json:"updated_at"`
 	// LastError carries the most recent operation error, populated when
 	// transitioning to PhaseFailed or PhaseRolledBack. Cleared on Success/Idle.
 	LastError string `json:"last_error,omitempty"`
 	// AttemptCount counts attempts at the current ToVersion. Reset when
 	// ToVersion changes or on successful completion.
 	AttemptCount int `json:"attempt_count"`
 	// HealthCheckFailures counts consecutive post-Activated healthcheck
 	// failures. Reset to 0 on a successful healthcheck or after a rollback.
 	// Used by `kubesolo-update healthcheck --auto-rollback-after N` to
 	// trigger automatic recovery on a wedged new boot.
 	HealthCheckFailures int `json:"health_check_failures,omitempty"`
 }
 // New returns a fresh Idle state with UpdatedAt set to now.
 func New() *UpdateState {
 	return &UpdateState{
 		Phase:     PhaseIdle,
 		UpdatedAt: time.Now().UTC(),
 	}
 }
 // Load reads the state from disk. If the file does not exist, returns a fresh
 // Idle state — this is the normal first-run case, not an error.
 func Load(path string) (*UpdateState, error) {
 	data, err := os.ReadFile(path)
 	if err != nil {
 		if os.IsNotExist(err) {
 			return New(), nil
 		}
 		return nil, fmt.Errorf("read state %s: %w", path, err)
 	}
 	var s UpdateState
 	if err := json.Unmarshal(data, &s); err != nil {
 		return nil, fmt.Errorf("parse state %s: %w", path, err)
 	}
 	return &s, nil
 }
 // Save writes the state to disk atomically (tmp file + rename), so an
 // interrupted write never leaves a partial file at `path`.
 func (s *UpdateState) Save(path string) error {
 	if _, ok := validPhases[s.Phase]; !ok {
 		return fmt.Errorf("invalid phase %q", s.Phase)
 	}
 	s.UpdatedAt = time.Now().UTC()
 	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
 		return fmt.Errorf("creating state dir: %w", err)
 	}
 	data, err := json.MarshalIndent(s, "", "  ")
 	if err != nil {
 		return fmt.Errorf("marshal state: %w", err)
 	}
 	data = append(data, '\n')
 	tmp := path + ".tmp"
 	if err := os.WriteFile(tmp, data, 0o644); err != nil {
 		return fmt.Errorf("write tmp state: %w", err)
 	}
 	if err := os.Rename(tmp, path); err != nil {
 		_ = os.Remove(tmp)
 		return fmt.Errorf("rename state: %w", err)
 	}
 	return nil
 }
 // Transition moves the state to phase `next` and persists it. If `next`
 // targets a new ToVersion (different from the current one), AttemptCount is
 // reset to 1; otherwise it is left untouched. StartedAt is set when
 // transitioning out of Idle. LastError is cleared unless `next` is Failed or
 // RolledBack.
 func (s *UpdateState) Transition(path string, next Phase, toVersion, errMsg string) error {
 	now := time.Now().UTC()
 	// Reset attempt counter when targeting a new version.
 	if toVersion != "" && toVersion != s.ToVersion {
 		s.ToVersion = toVersion
 		s.AttemptCount = 0
 	}
 	// First non-Idle phase of an attempt: record start time and bump count.
 	if s.Phase == PhaseIdle && next != PhaseIdle {
 		s.StartedAt = now
 		s.AttemptCount++
 	}
 	s.Phase = next
 	switch next {
 	case PhaseFailed, PhaseRolledBack:
 		if errMsg != "" {
 			s.LastError = errMsg
 		}
 	case PhaseSuccess, PhaseIdle:
 		s.LastError = ""
 	}
 	return s.Save(path)
 }
 // RecordError marks the state as failed with the given error and saves.
 // Convenience wrapper around Transition for the most common failure path.
 func (s *UpdateState) RecordError(path string, err error) error {
 	msg := ""
 	if err != nil {
 		msg = err.Error()
 	}
 	return s.Transition(path, PhaseFailed, "", msg)
 }
 // SetFromVersion records the version the system was running when an attempt
 // started. Idempotent; only takes effect when From is empty.
 func (s *UpdateState) SetFromVersion(v string) {
 	if s.FromVersion == "" {
 		s.FromVersion = v
 	}
 }
--- a/update/pkg/state/state_test.go
+++ b/update/pkg/state/state_test.go
@@ -0,0 +1,197 @@
 package state
 import (
 	"errors"
 	"os"
 	"path/filepath"
 	"testing"
 )
 // statePath returns a per-test state file path inside t.TempDir().
 func statePath(t *testing.T) string {
 	t.Helper()
 	return filepath.Join(t.TempDir(), "state.json")
 }
 func TestLoadMissingReturnsIdle(t *testing.T) {
 	s, err := Load(filepath.Join(t.TempDir(), "does-not-exist.json"))
 	if err != nil {
 		t.Fatalf("unexpected error loading missing state: %v", err)
 	}
 	if s.Phase != PhaseIdle {
 		t.Errorf("missing file: phase=%q, want %q", s.Phase, PhaseIdle)
 	}
 }
 func TestSaveLoadRoundTrip(t *testing.T) {
 	path := statePath(t)
 	in := &UpdateState{
 		Phase:        PhaseStaged,
 		FromVersion:  "v0.2.0",
 		ToVersion:    "v0.3.0",
 		AttemptCount: 1,
 	}
 	if err := in.Save(path); err != nil {
 		t.Fatalf("save: %v", err)
 	}
 	out, err := Load(path)
 	if err != nil {
 		t.Fatalf("load: %v", err)
 	}
 	if out.Phase != in.Phase {
 		t.Errorf("phase: got %q, want %q", out.Phase, in.Phase)
 	}
 	if out.FromVersion != in.FromVersion {
 		t.Errorf("from_version: got %q, want %q", out.FromVersion, in.FromVersion)
 	}
 	if out.ToVersion != in.ToVersion {
 		t.Errorf("to_version: got %q, want %q", out.ToVersion, in.ToVersion)
 	}
 	if out.AttemptCount != in.AttemptCount {
 		t.Errorf("attempt_count: got %d, want %d", out.AttemptCount, in.AttemptCount)
 	}
 	if out.UpdatedAt.IsZero() {
 		t.Error("UpdatedAt should be set by Save")
 	}
 }
 func TestSaveRejectsInvalidPhase(t *testing.T) {
 	s := &UpdateState{Phase: Phase("bogus")}
 	err := s.Save(statePath(t))
 	if err == nil {
 		t.Fatal("expected error saving invalid phase, got nil")
 	}
 }
 func TestSaveIsAtomic(t *testing.T) {
 	// After Save, the .tmp file should NOT exist — confirming we renamed it.
 	path := statePath(t)
 	s := New()
 	if err := s.Save(path); err != nil {
 		t.Fatalf("save: %v", err)
 	}
 	if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
 		t.Errorf("tmp file still present after Save: %v", err)
 	}
 }
 func TestSaveCreatesDirectory(t *testing.T) {
 	// State directory may not exist yet (first-ever boot). Save() should mkdir.
 	dir := filepath.Join(t.TempDir(), "fresh", "subdir")
 	path := filepath.Join(dir, "state.json")
 	if err := New().Save(path); err != nil {
 		t.Fatalf("save into nonexistent dir: %v", err)
 	}
 	if _, err := os.Stat(path); err != nil {
 		t.Errorf("state file not present after Save: %v", err)
 	}
 }
 func TestTransitionIdleToChecking(t *testing.T) {
 	path := statePath(t)
 	s := New()
 	if err := s.Transition(path, PhaseChecking, "v0.3.0", ""); err != nil {
 		t.Fatalf("transition: %v", err)
 	}
 	if s.Phase != PhaseChecking {
 		t.Errorf("phase: got %q, want %q", s.Phase, PhaseChecking)
 	}
 	if s.ToVersion != "v0.3.0" {
 		t.Errorf("to_version: got %q, want v0.3.0", s.ToVersion)
 	}
 	if s.AttemptCount != 1 {
 		t.Errorf("attempt_count: got %d, want 1 (first attempt after Idle)", s.AttemptCount)
 	}
 	if s.StartedAt.IsZero() {
 		t.Error("StartedAt should be set when leaving Idle")
 	}
 }
 func TestTransitionRetainsAttemptCountWithinAttempt(t *testing.T) {
 	path := statePath(t)
 	s := New()
 	_ = s.Transition(path, PhaseChecking, "v0.3.0", "")
 	_ = s.Transition(path, PhaseDownloading, "v0.3.0", "")
 	_ = s.Transition(path, PhaseStaged, "v0.3.0", "")
 	if s.AttemptCount != 1 {
 		t.Errorf("attempt_count after staying on same version: got %d, want 1", s.AttemptCount)
 	}
 }
 func TestTransitionResetsAttemptCountOnNewVersion(t *testing.T) {
 	path := statePath(t)
 	s := New()
 	_ = s.Transition(path, PhaseChecking, "v0.3.0", "")
 	// Now an attempt at a NEW version starts. AttemptCount should reset.
 	_ = s.Transition(path, PhaseChecking, "v0.4.0", "")
 	if s.ToVersion != "v0.4.0" {
 		t.Errorf("to_version: got %q, want v0.4.0", s.ToVersion)
 	}
 	if s.AttemptCount != 0 {
 		t.Errorf("attempt_count after new ToVersion: got %d, want 0 (reset)", s.AttemptCount)
 	}
 }
 func TestTransitionFailedRecordsError(t *testing.T) {
 	path := statePath(t)
 	s := New()
 	_ = s.Transition(path, PhaseDownloading, "v0.3.0", "")
 	_ = s.Transition(path, PhaseFailed, "v0.3.0", "checksum mismatch")
 	if s.Phase != PhaseFailed {
 		t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed)
 	}
 	if s.LastError != "checksum mismatch" {
 		t.Errorf("last_error: got %q, want %q", s.LastError, "checksum mismatch")
 	}
 }
 func TestTransitionSuccessClearsError(t *testing.T) {
 	path := statePath(t)
 	s := New()
 	_ = s.Transition(path, PhaseFailed, "v0.3.0", "boom")
 	if s.LastError == "" {
 		t.Fatal("setup: LastError should be non-empty before success")
 	}
 	_ = s.Transition(path, PhaseSuccess, "v0.3.0", "")
 	if s.LastError != "" {
 		t.Errorf("last_error after success: got %q, want empty", s.LastError)
 	}
 }
 func TestRecordError(t *testing.T) {
 	path := statePath(t)
 	s := New()
 	if err := s.RecordError(path, errors.New("network down")); err != nil {
 		t.Fatalf("RecordError: %v", err)
 	}
 	if s.Phase != PhaseFailed {
 		t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed)
 	}
 	if s.LastError != "network down" {
 		t.Errorf("last_error: got %q, want %q", s.LastError, "network down")
 	}
 }
 func TestSetFromVersionIdempotent(t *testing.T) {
 	s := New()
 	s.SetFromVersion("v0.2.0")
 	if s.FromVersion != "v0.2.0" {
 		t.Errorf("from_version: got %q, want v0.2.0", s.FromVersion)
 	}
 	// Second call should not overwrite.
 	s.SetFromVersion("v0.1.0")
 	if s.FromVersion != "v0.2.0" {
 		t.Errorf("from_version after second SetFromVersion: got %q, want v0.2.0 (immutable)", s.FromVersion)
 	}
 }
 func TestLoadHandlesGarbageFile(t *testing.T) {
 	path := statePath(t)
 	if err := os.WriteFile(path, []byte("not json"), 0o644); err != nil {
 		t.Fatalf("seed: %v", err)
 	}
 	_, err := Load(path)
 	if err == nil {
 		t.Error("expected error loading garbage, got nil")
 	}
 }