ci: gate x86 build until amd64 runner exists; ARM64 release self-sufficient

v0.3.1's first release.yaml run exposed two issues: 1. The `ubuntu-latest` label resolved to the Odroid (only runner registered with that label), which is arm64. apt-get install grub-efi-amd64-bin then failed because ports.ubuntu.com only ships arm64 packages — the amd64 grub binaries don't exist in the arm64 repo. Building x86 ISOs on an arm64 host requires either a native amd64 runner or qemu-user-static emulation; neither is set up. 2. The `arm64-linux:host` runner runs jobs directly on the Odroid host (no Docker), and actions/checkout@v4 is a JS action needing Node 20+ in $PATH. The Odroid had no Node installed at all, so checkout failed. Fixes: - `build-iso-amd64` gated `if: false` and `runs-on: amd64-linux`. The job stays in the workflow as a placeholder for when an amd64 runner is eventually registered. Flip the `if: false` line at that time and it starts working. - `release` job no longer depends on build-iso-amd64, so the workflow completes with just ARM64 + Go binaries. `if: always() && needs.X == 'success'` for the jobs we actually require. - Release body no longer promises x86 artifacts that aren't there. Replaced with a clear note about how to build x86 from source at the release tag. Operator action required for the Odroid runner: curl -fsSL https://deb.nodesource.com/setup_20.x | sudo -E bash - sudo apt install -y nodejs Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
release: v0.3.1
2026-05-15 16:48:58 -06:00 · 2026-05-15 16:29:06 -06:00 · 2026-05-15 16:20:33 -06:00 · 2026-05-15 15:10:09 -06:00 · 2026-05-15 14:25:11 -06:00 · 2026-05-15 11:48:43 -06:00
70 changed files with 5591 additions and 524 deletions
--- a/.gitea/workflows/build-arm64.yaml
+++ b/.gitea/workflows/build-arm64.yaml
@@ -0,0 +1,73 @@
+name: ARM64 Build
+
+# Triggers on push to main and on tags. Skipped on PRs to keep PR feedback fast;
+# manual via Gitea UI ("Run workflow") if needed.
+on:
+  push:
+    branches: [main]
+    tags: ['v*']
+  workflow_dispatch:
+
+jobs:
+  build-arm64-generic:
+    name: Build generic ARM64 disk image
+    # Routes to the Odroid self-hosted runner via the arm64-linux label.
+    # See docs/ci-runners.md for runner setup.
+    runs-on: arm64-linux
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Show host info
+        run: |
+          uname -a
+          nproc
+          free -h
+          df -h /home /tmp || df -h /
+
+      - name: Verify build prerequisites
+        run: |
+          # The Odroid runner ships these via apt; this is a sanity check.
+          which gcc make bc bison flex cpio gzip xz wget curl mkfs.ext4 mkfs.vfat \
+                 sfdisk losetup kpartx grub-mkimage qemu-system-aarch64 git busybox
+          ls -la /bin/busybox
+          file /bin/busybox | grep -q 'statically linked' || {
+              echo "ERROR: /bin/busybox is not statically linked — install busybox-static"
+              exit 1
+          }
+
+      - name: Build mainline ARM64 kernel
+        # Cached in build/cache/kernel-arm64-generic between runs (persistent
+        # working dir on the host runner). First run takes 30-60 min; reruns
+        # exit immediately once the .config + Image match.
+        run: |
+          time make kernel-arm64
+
+      - name: Build cross-arch Go binaries
+        run: make build-cross
+
+      - name: Prepare generic ARM64 rootfs
+        run: sudo make rootfs-arm64
+
+      - name: Build ARM64 UEFI disk image
+        run: sudo make disk-image-arm64
+
+      - name: Show output artifact
+        run: |
+          ls -lh output/
+          file output/*.arm64.img
+
+      - name: Boot smoke test (best-effort)
+        # KubeSolo's image import deadline can fire under QEMU TCG on the
+        # Odroid; the boot itself succeeds through stage 90 every time, but
+        # the final "KubeSolo started" health check is timing-sensitive.
+        # We mark this continue-on-error until we have KVM or real hardware.
+        continue-on-error: true
+        run: sudo make test-boot-arm64-disk
+
+      - name: Upload disk image
+        if: startsWith(github.ref, 'refs/tags/v')
+        uses: actions/upload-artifact@v4
+        with:
+          name: kubesolo-os-arm64-${{ github.ref_name }}
+          path: output/kubesolo-os-*.arm64.img
+          retention-days: 90
--- a/.gitea/workflows/ci.yaml
+++ b/.gitea/workflows/ci.yaml
@@ -62,7 +62,8 @@ jobs:
        working-directory: update

      - name: Upload binaries
-        uses: actions/upload-artifact@v4
+        # @v4 not yet fully supported by Gitea Actions runner; @v3 works.
+        uses: actions/upload-artifact@v3
        with:
          name: binaries-${{ matrix.suffix }}
          path: |
@@ -78,14 +79,39 @@ jobs:
      - name: Install shellcheck
        run: sudo apt-get update && sudo apt-get install -y shellcheck

+      # --severity=error filters out style/info/warning findings. Several of
+      # those are unavoidable in init-style scripts that source other files
+      # dynamically (SC1090/SC1091/SC2034). Exclude them explicitly so they
+      # don't fire even at warning level if we lift severity later.
+      # Codes excluded:
+      #   SC1090 — non-constant source path (we source by stage name)
+      #   SC1091 — source target not specified as input (we reference relative paths)
+      #   SC2034 — var "unused" (false positive: used via sourced scripts)
+      #   SC2002 — useless cat (style only, very common pattern in our scripts)
+      #   SC2015 — A && B || C (deliberate idiom)
+      #   SC2012 — use find not ls (style only)
+      #   SC2013 — read words not lines (style only, applies to /proc parsing)
+
      - name: Lint init scripts (POSIX sh)
-        run: shellcheck -s sh init/init.sh init/lib/*.sh init/emergency-shell.sh
+        run: |
+          shellcheck -s sh --severity=error \
+            -e SC1090,SC1091,SC2034,SC2002,SC2015,SC2012,SC2013 \
+            init/init.sh init/lib/*.sh init/emergency-shell.sh

      - name: Lint build scripts (bash)
-        run: shellcheck -s bash build/scripts/*.sh build/config/kernel-audit.sh
+        run: |
+          shellcheck -s bash --severity=error \
+            -e SC1090,SC1091,SC2034,SC2002,SC2015,SC2012,SC2013 \
+            build/scripts/*.sh build/config/kernel-audit.sh

      - name: Lint test scripts (bash)
-        run: shellcheck -s bash test/qemu/*.sh test/integration/*.sh test/kernel/*.sh || true
+        run: |
+          shellcheck -s bash --severity=error \
+            -e SC1090,SC1091,SC2034,SC2002,SC2015,SC2012,SC2013 \
+            test/qemu/*.sh test/integration/*.sh test/kernel/*.sh

      - name: Lint hack scripts (bash)
-        run: shellcheck -s bash hack/*.sh || true
+        run: |
+          shellcheck -s bash --severity=error \
+            -e SC1090,SC1091,SC2034,SC2002,SC2015,SC2012,SC2013 \
+            hack/*.sh
--- a/.gitea/workflows/release.yaml
+++ b/.gitea/workflows/release.yaml
@@ -1,5 +1,19 @@
 name: Release

+# Triggered by `git push origin vX.Y.Z`. Builds Go binaries (amd64+arm64),
+# x86_64 ISO + disk image, ARM64 disk image, computes SHA256SUMS over all
+# artifacts, and posts a Gitea release with everything attached via the
+# Gitea API.
+#
+# Notes for future-you:
+#   - upload-artifact / download-artifact are pinned to @v3 because Gitea's
+#     act_runner v1.0.x doesn't fully implement v4 yet.
+#   - The release step uses curl against Gitea's own /api/v1/repos/.../releases
+#     instead of a third-party action (softprops/action-gh-release et al);
+#     act_runner doesn't reliably proxy GitHub.com-targeted actions.
+#   - The arm64 disk-image build runs on the Odroid self-hosted runner via
+#     the `arm64-linux` label. Docs in docs/ci-runners.md.
+
 on:
  push:
    tags:
@@ -11,19 +25,16 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
-
      - uses: actions/setup-go@v5
        with:
          go-version: '1.22'
-
      - name: Test cloud-init
        run: cd cloud-init && go test ./... -count=1
-
      - name: Test update agent
        run: cd update && go test ./... -count=1

  build-binaries:
-    name: Build Binaries
+    name: Build Binaries (${{ matrix.suffix }})
    runs-on: ubuntu-latest
    needs: test
    strategy:
@@ -37,129 +48,247 @@ jobs:
            suffix: linux-arm64
    steps:
      - uses: actions/checkout@v4
-
      - uses: actions/setup-go@v5
        with:
          go-version: '1.22'
-
      - name: Get version
        id: version
        run: echo "version=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT
-
      - name: Build cloud-init
        run: |
          CGO_ENABLED=0 GOOS=${{ matrix.goos }} GOARCH=${{ matrix.goarch }} \
            go build -ldflags="-s -w -X main.version=${{ steps.version.outputs.version }}" \
            -o kubesolo-cloudinit-${{ matrix.suffix }} ./cmd/
        working-directory: cloud-init
-
      - name: Build update agent
        run: |
          CGO_ENABLED=0 GOOS=${{ matrix.goos }} GOARCH=${{ matrix.goarch }} \
            go build -ldflags="-s -w -X main.version=${{ steps.version.outputs.version }}" \
            -o kubesolo-update-${{ matrix.suffix }} .
        working-directory: update
-
      - name: Upload binaries
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        with:
          name: binaries-${{ matrix.suffix }}
          path: |
            cloud-init/kubesolo-cloudinit-${{ matrix.suffix }}
            update/kubesolo-update-${{ matrix.suffix }}

-  build-iso:
-    name: Build ISO (amd64)
-    runs-on: ubuntu-latest
+  build-iso-amd64:
+    name: Build x86_64 ISO + disk image
+    # Routes to a runner with the `amd64-linux` label. As of v0.3.x no such
+    # runner exists in this Gitea instance — the only runner is the Odroid
+    # which is arm64 and would fail apt-installing grub-efi-amd64-bin /
+    # syslinux because those packages aren't in the arm64 ports repo. The
+    # job stays in the workflow (so it auto-runs once an amd64 runner is
+    # registered) but is gated and the release job continues without it.
+    if: false   # remove this line once an amd64-linux runner is registered
+    runs-on: amd64-linux
    needs: build-binaries
    steps:
      - uses: actions/checkout@v4
-
      - uses: actions/setup-go@v5
        with:
          go-version: '1.22'
-
      - name: Install build deps
        run: |
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends \
            cpio gzip genisoimage isolinux syslinux syslinux-common \
            syslinux-utils xorriso xz-utils wget squashfs-tools \
-            dosfstools e2fsprogs fdisk parted bsdtar
-
-      - name: Build ISO
-        run: make iso
-
-      - name: Build disk image
-        run: make disk-image
-
-      - name: Get version
-        id: version
-        run: echo "version=$(cat VERSION)" >> $GITHUB_OUTPUT
-
-      - name: Upload ISO
-        uses: actions/upload-artifact@v4
+            dosfstools e2fsprogs fdisk parted libarchive-tools \
+            grub-common grub-efi-amd64-bin grub-pc-bin kpartx \
+            busybox-static iptables nftables
+      - name: Build kernel + ISO + disk-image
+        run: |
+          make kernel
+          make build-cloudinit build-update-agent
+          make rootfs initramfs
+          make iso
+          make disk-image
+      - name: Compress disk image
+        # The raw .img is 4 GB sparse; xz takes it to ~50-300 MB depending
+        # on dictionary level. Use -6 (default) for memory safety on the
+        # GitHub-Actions-style runner.
+        run: |
+          xz -k -T0 --memlimit-compress=1500MiB -6 output/*.img
+          ls -lh output/
+      - name: Upload x86_64 artifacts
+        uses: actions/upload-artifact@v3
        with:
-          name: iso-amd64
-          path: output/*.iso
+          name: image-amd64
+          path: |
+            output/*.iso
+            output/*.img.xz

-      - name: Upload disk image
-        uses: actions/upload-artifact@v4
+  build-disk-arm64:
+    name: Build ARM64 disk image
+    runs-on: arm64-linux
+    needs: test
+    steps:
+      - uses: actions/checkout@v4
+      - name: Show host info
+        run: |
+          uname -a
+          nproc
+          free -h
+          df -h /
+      - name: Build kernel + rootfs + disk-image
+        # Runner runs as root via systemd; explicit sudo is harmless but
+        # documented as such in docs/ci-runners.md.
+        run: |
+          make kernel-arm64
+          make build-cross
+          make rootfs-arm64
+          make disk-image-arm64
+      - name: Compress disk image
+        run: |
+          xz -k -T0 --memlimit-compress=1500MiB -6 output/*.arm64.img
+          ls -lh output/
+      - name: Upload ARM64 artifacts
+        uses: actions/upload-artifact@v3
        with:
-          name: disk-image-amd64
-          path: output/*.img
+          name: image-arm64
+          path: output/*.arm64.img.xz

  release:
-    name: Create Release
+    name: Publish Gitea Release
    runs-on: ubuntu-latest
-    needs: [build-binaries, build-iso]
+    # build-iso-amd64 is gated `if: false` in v0.3.x (no amd64 runner yet);
+    # don't block the release on it. build-disk-arm64 is required — that's
+    # the headline artifact for v0.3.x. build-binaries is required since
+    # the Go binaries are core to every release.
+    needs: [build-binaries, build-disk-arm64]
+    # `if: always()` so the release publishes even if the gated x86 job
+    # somehow ran-and-failed instead of being skipped. The downstream
+    # `find` in the Flatten step ignores missing files gracefully.
+    if: always() && needs.build-binaries.result == 'success' && needs.build-disk-arm64.result == 'success'
    steps:
      - uses: actions/checkout@v4

      - name: Get version
        id: version
-        run: echo "version=$(cat VERSION)" >> $GITHUB_OUTPUT
+        # `cat VERSION` would be stale on tag pushes (VERSION already bumped
+        # for the tag, but using ref_name is unambiguous).
+        run: echo "version=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT

      - name: Download all artifacts
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v3
        with:
          path: artifacts

-      - name: Compute checksums
+      - name: Flatten artifacts + compute checksums
        run: |
-          cd artifacts
-          find . -type f \( -name "*.iso" -o -name "*.img" -o -name "kubesolo-*" \) \
-            -exec sha256sum {} \; | sort > ../SHA256SUMS
-          cd ..
+          mkdir -p release
+          # Each upload-artifact wrote into artifacts/<name>/...
+          find artifacts -type f \( \
+            -name "*.iso" -o \
+            -name "*.img.xz" -o \
+            -name "kubesolo-*" \
+          \) -exec cp {} release/ \;
+          (cd release && sha256sum * | sort > SHA256SUMS)
+          ls -lh release/
+          cat release/SHA256SUMS

-      - name: Create release
-        uses: softprops/action-gh-release@v2
-        with:
-          name: KubeSolo OS v${{ steps.version.outputs.version }}
-          body: |
-            ## KubeSolo OS v${{ steps.version.outputs.version }}
+      - name: Install release tooling
+        run: sudo apt-get update && sudo apt-get install -y jq curl

-            ### Downloads
-            - **ISO** — Boot from CD/USB, ideal for testing
-            - **Disk Image** — Raw disk with A/B partitions + GRUB
-            - **Binaries** — Standalone cloud-init and update agent
+      - name: Render release body
+        id: body
+        run: |
+          VERSION="${{ steps.version.outputs.version }}"
+          # Strip the leading 'v' for cosmetic display in the body.
+          DISPLAY="${VERSION#v}"
+          cat > release-body.md <<EOF
+          See [docs/release-notes-${DISPLAY}.md](./docs/release-notes-${DISPLAY}.md)
+          and [CHANGELOG.md](./CHANGELOG.md) for the full release notes.

-            ### Verify
-            ```
-            sha256sum -c SHA256SUMS
-            ```
+          ### Downloads

-            ### Quick Start
-            ```bash
-            # Boot in QEMU
-            qemu-system-x86_64 -m 1024 -smp 2 -enable-kvm \
-              -cdrom kubesolo-os-${{ steps.version.outputs.version }}.iso \
-              -nographic
-            ```
-          files: |
-            artifacts/**/*.iso
-            artifacts/**/*.img
-            artifacts/**/kubesolo-*
-            SHA256SUMS
-          draft: false
-          prerelease: false
+          - \`kubesolo-os-${DISPLAY}.arm64.img.xz\` — ARM64 raw disk image (A/B GPT, UEFI)
+          - \`kubesolo-cloudinit-linux-{amd64,arm64}\` — standalone cloud-init parser
+          - \`kubesolo-update-linux-{amd64,arm64}\` — standalone update agent
+          - \`SHA256SUMS\` — checksums for every artifact above
+
+          > **x86_64 ISO + disk image**: not built automatically yet. The
+          > release workflow's amd64 build job needs an amd64-linux runner,
+          > which this Gitea instance doesn't have yet. To produce them
+          > yourself, clone the repo at this tag and run \`make iso disk-image\`
+          > on any Linux amd64 host.
+
+          ### Verify
+
+          \`\`\`
+          sha256sum -c SHA256SUMS
+          \`\`\`
+
+          ### Quick start (ARM64)
+
+          \`\`\`
+          # On Graviton/Ampere/any UEFI ARM64 host:
+          xz -d kubesolo-os-${DISPLAY}.arm64.img.xz
+          sudo dd if=kubesolo-os-${DISPLAY}.arm64.img of=/dev/sdX bs=4M status=progress
+
+          # Under qemu-system-aarch64 (Apple Silicon w/ HVF):
+          UEFI_FW=\$(brew --prefix qemu)/share/qemu/edk2-aarch64-code.fd
+          qemu-system-aarch64 -M virt -accel hvf -cpu host -m 2048 -smp 2 \\
+            -nographic -bios "\$UEFI_FW" \\
+            -drive file=kubesolo-os-${DISPLAY}.arm64.img,format=raw,if=virtio,media=disk \\
+            -device virtio-rng-pci \\
+            -net nic,model=virtio \\
+            -net user,hostfwd=tcp::6443-:6443,hostfwd=tcp::8080-:8080
+          \`\`\`
+
+          Then from the host: \`curl http://localhost:8080 > ~/.kube/kubesolo-config\`
+          and \`kubectl --kubeconfig ~/.kube/kubesolo-config get nodes\`.
+          EOF
+          cat release-body.md
+
+      - name: Create release via Gitea API
+        env:
+          # Gitea's act_runner auto-populates this with repo-write scope.
+          # If not, set a personal access token as a secret named GITEA_TOKEN
+          # on the org and swap the var name below.
+          TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set -euo pipefail
+          TAG="${{ steps.version.outputs.version }}"
+          REPO_API="${{ github.api_url }}/repos/${{ github.repository }}"
+
+          # 1. Create the release. The API is GitHub-compatible at the
+          # request shape; the response includes the numeric release id we
+          # need for asset uploads.
+          PAYLOAD=$(jq -n \
+            --arg tag "$TAG" \
+            --arg name "KubeSolo OS $TAG" \
+            --rawfile body release-body.md \
+            '{tag_name: $tag, name: $name, body: $body, draft: false, prerelease: false}')
+
+          echo "==> Creating release for $TAG against $REPO_API"
+          CREATE_RESP=$(curl -fsSL -X POST \
+            -H "Authorization: token $TOKEN" \
+            -H "Content-Type: application/json" \
+            -d "$PAYLOAD" \
+            "$REPO_API/releases")
+
+          RELEASE_ID=$(echo "$CREATE_RESP" | jq -r '.id')
+          if [ -z "$RELEASE_ID" ] || [ "$RELEASE_ID" = "null" ]; then
+            echo "ERROR: Could not extract release id from response:"
+            echo "$CREATE_RESP" | jq . || echo "$CREATE_RESP"
+            exit 1
+          fi
+          echo "==> Release id: $RELEASE_ID"
+
+          # 2. Upload each asset. asset?name= names the attachment; we use
+          # the basename so users see the same filename the build produced.
+          for f in release/*; do
+            [ -f "$f" ] || continue
+            name=$(basename "$f")
+            echo "==> Uploading $name ($(du -h "$f" | cut -f1))"
+            curl -fsSL -X POST \
+              -H "Authorization: token $TOKEN" \
+              -F "attachment=@$f" \
+              "$REPO_API/releases/$RELEASE_ID/assets?name=$name" >/dev/null
+          done
+
+          echo "==> Release published: $REPO_API/../releases/tag/$TAG"
--- a/.gitignore
+++ b/.gitignore
@@ -18,8 +18,19 @@ build/rootfs-work/

 # OS
 .DS_Store
+._*
 Thumbs.db

+# Photos / screenshots — keep documentation images under docs/ instead
+*.PNG
+*.png
+*.JPG
+*.jpg
+*.JPEG
+*.jpeg
+*.HEIC
+*.heic
+
 # Go
 update/update-agent
 cloud-init/cloud-init-parser
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,250 @@ All notable changes to KubeSolo OS are documented in this file.
 Format based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 versioning follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.3.1] - 2026-05-15
+
+First fully-functional generic ARM64 release. v0.3.0 shipped the build
+scaffold; v0.3.1 makes it actually boot a Kubernetes cluster end-to-end
+on QEMU virt under HVF acceleration. Validated by deploying CoreDNS,
+local-path-provisioner, and an `nginx:alpine` workload — all reach
+Running, `kubectl get nodes` reports `Ready`.
+
+### Fixed
+
+- **Dual-glibc loading on ARM64** — piCore64's `/lib/libc.so.6` and the
+  build host's `/lib/$LIB_ARCH/libc.so.6` could both be resolved into the
+  same process by the dynamic linker, triggering
+  `*** stack smashing detected ***` aborts when stack frames crossed
+  between functions linked against different libcs. Fix: bundle the full
+  glibc family (libc + libpthread + libdl + libm + libresolv + librt +
+  libanl + libgcc_s + ld.so), delete piCore's duplicates in `/lib/`,
+  and write `/etc/ld.so.conf` + `ldconfig -r` so the runtime linker has
+  a deterministic search order. (`76ed2ff`)
+- **`nft` binary not bundled** — KubeSolo v1.1.4+ runs `nft add table ip
+  kubesolo-masq` for pod-masquerade setup, but `inject-kubesolo.sh` only
+  bundled `xtables-nft-multi`. Without standalone `nft` in `$PATH`,
+  KubeSolo FATAL'd at startup. Fix: copy `/usr/sbin/nft` + its
+  non-shared libs (libnftables, libedit, libjansson, libgmp, libtinfo,
+  libbsd, libmd) into the rootfs. (`51c1f78`)
+- **nftables address-family handlers** — `nf_tables` core was loaded but
+  no address families were registered, so `nft add table ip ...`
+  returned `EOPNOTSUPP`. The bool Kconfigs `CONFIG_NF_TABLES_IPV4`,
+  `CONFIG_NF_TABLES_IPV6`, `CONFIG_NF_TABLES_INET`,
+  `CONFIG_NF_TABLES_NETDEV` are required and weren't in the
+  fragment. Fix: add to `kernel-container.fragment` as `=y`. (`7e46f8f`)
+- **kube-proxy nftables-backend expression modules** — Kubernetes 1.34's
+  kube-proxy nft backend uses `numgen`, `hash`, `limit`, `log`
+  expressions. The corresponding kernel modules (`CONFIG_NFT_NUMGEN`,
+  etc.) were missing from the fragment AND the runtime module list, so
+  even after a kernel rebuild stage 30 didn't load them and stage 85's
+  `kernel.modules_disabled=1` lockdown prevented on-demand loads. Fix:
+  add to both `kernel-container.fragment` (as `=m`) and
+  `modules.list` / `modules-arm64.list`. (`31eee77`, `3bcf2e1`)
+- **`modules.list` inline-comment parser bug** — the inject script's
+  comment-strip only matched lines starting with `#`, not lines with
+  inline `# comment` tails. So `nft_numgen     # foo` was passed
+  verbatim to modprobe, resolved to nothing, and the .ko never made it
+  into the initramfs. Fix: parse with `mod="${mod%%#*}"` to strip
+  inline tails. (`bc3300e`)
+- **Banner only printed on kubeconfig success** —
+  `90-kubesolo.sh` gated the host-access banner behind `if [ -f
+  $KUBECONFIG_PATH ]`. When KubeSolo crashed early (bug #2 above) or
+  the wait loop timed out, the user never saw the connection
+  instructions. Fix: write the banner to `/etc/motd` AND print it
+  unconditionally after the wait loop. (`51c1f78`)
+- **`dev-vm-arm64.sh` missing port-8080 hostfwd** — the in-VM HTTP
+  server that serves the kubeconfig listens on port 8080, but the
+  QEMU `-net user` line only forwarded 6443 and 2222, so
+  `curl http://localhost:8080` from the host machine connected to
+  nothing. Fix: add the third hostfwd. (`fbe2d0b`)
+
+### Fixed (CI)
+
+- **`release.yaml` workflow** rewritten so v0.3.1+ tag pushes
+  auto-publish a complete release page on Gitea: `actions/upload-artifact`
+  pinned to `@v3` for act_runner compatibility, the
+  `softprops/action-gh-release@v2` step replaced with a direct `curl`
+  against `/api/v1/repos/.../releases` (`softprops` hard-codes
+  `api.github.com` so it silently no-ops on Gitea), added a
+  `build-disk-arm64` job that builds on the `arm64-linux` runner.
+  v0.3.0's manual-upload-only release was the canary that exposed all
+  three bugs. (`f8c308d`)
+
+### Known issues carried forward to v0.3.2
+
+These don't block normal operation but are tracked:
+
+- `xt_comment` userspace extension load fails on the iptables-nft path,
+  causing kubelet's KUBE-FIREWALL rule install to skip. Reported as
+  `Couldn't load match 'comment'` in the boot log. kubelet continues
+  without the localhost-drop rule.
+- `containerd-shim-runc-v2 -info` probe reports `runc: executable file
+  not found in $PATH`. Cosmetic — containerd uses the absolute path
+  from its config when actually launching containers.
+- `kube-proxy conntrack cleanup` logs `Failed to list conntrack entries:
+  invalid argument` every cleanup cycle. Probably needs
+  `CONFIG_NF_CONNTRACK_PROCFS` or netlink-glue tweaks.
+- Several pods restart 1–2 times on first boot due to a PLEG /
+  runtime-probe race in the kubelet startup path. Pods stabilise.
+
+## [0.3.0] - 2026-05-14
+
+The main themes: generic ARM64 (not just Raspberry Pi), an honest update
+lifecycle with state file + metrics, OCI multi-arch distribution via ghcr.io,
+and policy gates (channels, maintenance windows, version stepping-stones,
+pre-flight checks, auto-rollback).
+
+### Added
+
+- Generic ARM64 build track distinct from Raspberry Pi:
+  - `make kernel-arm64` builds a mainline kernel.org LTS kernel (6.12.10 by
+    default) from `arm64 defconfig` + shared `kernel-container.fragment` +
+    arm64 virt-host enables (VIRTIO_*, EFI_STUB, NVMe).
+  - `make disk-image-arm64` produces a UEFI-bootable raw GPT image with A/B
+    system partitions and GRUB-EFI ARM64. Targets QEMU virt, Graviton, Ampere,
+    or any UEFI ARM64 host.
+  - `hack/dev-vm-arm64.sh --disk` boots the built image through QEMU UEFI for
+    end-to-end testing.
+  - `test/qemu/test-boot-arm64-disk.sh` automated boot smoke test.
+- Bumped KubeSolo to v1.1.5 (was v1.1.0). New cloud-init flags surfaced:
+  - `kubesolo.full` (v1.1.4+) — disable edge-optimised overrides
+  - `kubesolo.disable-ipv6` (v1.1.5+)
+  - `kubesolo.db-wal-repair` (v1.1.5+) — recover from unclean shutdowns
+- Per-arch supply-chain verification: `KUBESOLO_SHA256_AMD64` and
+  `KUBESOLO_SHA256_ARM64` in `versions.env`, applied to the tarball before
+  extract.
+- `docs/arm64-architecture.md` — defines the generic-vs-RPi two-track layout.
+- `docs/arm64-status.md` — Phase 3 status snapshot, known limitations, what's
+  needed to ship.
+- `docs/ci-runners.md` — Gitea Actions runner setup (Odroid arm64-linux).
+- Update agent state machine and observability (`update/pkg/state`):
+  - Persistent on-disk `state.json` at `/var/lib/kubesolo/update/state.json`
+    (atomic write via tmp + rename). Records Phase (Idle / Checking /
+    Downloading / Staged / Activated / Verifying / Success / RolledBack /
+    Failed), FromVersion, ToVersion, StartedAt, UpdatedAt, LastError,
+    AttemptCount, HealthCheckFailures.
+  - `apply`, `activate`, `healthcheck`, `rollback` all transition state
+    explicitly on entry / exit / failure. Errors land in LastError so
+    `status` can show why.
+  - `kubesolo-update status --json` emits the full state for
+    orchestration tooling. Human-readable mode adds an "Update Lifecycle"
+    section when not idle.
+  - New Prometheus metrics: `kubesolo_update_phase{phase="..."}` (all 9
+    phase labels always emitted), `kubesolo_update_attempts_total`,
+    `kubesolo_update_last_attempt_timestamp_seconds`.
+- Channels, maintenance windows, version policy (`update/pkg/config`):
+  - `/etc/kubesolo/update.conf` (key=value, comments, missing-OK) configures
+    server, channel, maintenance_window, pubkey, healthcheck_url,
+    auto_rollback_after.
+  - `cloud-init` top-level `updates:` block writes `update.conf` on first
+    boot. Empty block leaves any existing file alone.
+  - `apply` enforces four gates before download: maintenance window,
+    channel match, runtime architecture match, min_compatible_version
+    stepping-stone. All gate failures land in the state machine as Failed
+    with a clear LastError. `--force` bypasses window + node-block-label.
+  - `UpdateMetadata` JSON gains `channel`, `min_compatible_version`,
+    `architecture` (all optional, omitempty).
+- OCI registry distribution (`update/pkg/oci`, ~280 LOC, 9 tests):
+  - `kubesolo-update apply --registry ghcr.io/<org>/kubesolo-os --tag stable`
+    pulls update artifacts from any OCI-compliant registry. Multi-arch
+    indexes resolve to the runtime.GOARCH-matching manifest automatically.
+  - Custom media types: `application/vnd.kubesolo.os.kernel.v1+octet-stream`
+    and `application/vnd.kubesolo.os.initramfs.v1+gzip`. Annotations:
+    `io.kubesolo.os.{version,channel,architecture,min_compatible_version,
+    release_notes,release_date}`.
+  - End-to-end digest verification from manifest to blobs via oras-go/v2.
+  - `build/scripts/push-oci-artifact.sh` publishes per-arch artifacts via
+    `oras`. Multi-arch index composition documented inline.
+  - Dependencies added (update module only): oras.land/oras-go/v2 and
+    transitive opencontainers/{go-digest,image-spec} + golang.org/x/sync.
+- Pre-flight gates and deeper healthcheck (`update/pkg/health` extended,
+  `update/pkg/partition` extended):
+  - Free-space pre-flight on the passive partition (image + 10% headroom)
+    via `partition.FreeBytes` / `HasFreeSpaceFor`.
+  - Node-block-label pre-flight: refuses if the local K8s node carries
+    `updates.kubesolo.io/block=true`. Silently allowed when no kubeconfig
+    (air-gap). Skipped by `--force`.
+  - `CheckKubeSystemReady` waits until every kube-system pod has held
+    Running for ≥ N seconds (configurable via
+    `--kube-system-settle`).
+  - `CheckProbeURL` GETs an operator-supplied URL; 200 = pass. Configurable
+    via `--healthcheck-url` or `healthcheck_url=` in update.conf.
+  - `CheckDiskWritable` writes / fsyncs / reads / deletes a probe file
+    under `/var/lib/kubesolo` to catch a wedged data partition.
+  - `--auto-rollback-after N` (also `auto_rollback_after=` in update.conf):
+    after N consecutive post-activation healthcheck failures, the agent
+    calls `ForceRollback()` and the operator/init reboots. Reset to 0 on
+    a clean pass.
+- `.gitea/workflows/build-arm64.yaml` — full ARM64 build on the Odroid
+  self-hosted runner. Triggers on push to main, tags, and workflow_dispatch.
+  Boot smoke test marked continue-on-error pending KVM or real-hardware
+  validation.
+
+### Changed
+
+- `build/scripts/build-kernel-arm64.sh` is now the **generic ARM64** kernel
+  build (mainline kernel.org LTS, generic UEFI/virtio).
+- Renamed `build/scripts/build-kernel-rpi.sh` (was `build-kernel-arm64.sh`).
+  RPi kernel build (raspberrypi/linux fork, bcm2711_defconfig) lives here now.
+- Renamed `build/config/kernel-container.fragment` (was
+  `rpi-kernel-config.fragment`). Misnomer: contents are arch-agnostic and now
+  shared across x86, ARM64-generic, and RPi kernels.
+- `build/scripts/build-kernel.sh` (x86) refactored to consume the shared
+  fragment via a generic `apply_fragment` function. ~50 lines of duplication
+  killed.
+- `KUBESOLO_VERSION` moved out of `fetch-components.sh` defaults into
+  `versions.env`. Bumping is now a one-line PR.
+
+### Fixed
+
+- Native ARM64 build hosts (e.g. an Odroid runner) no longer require the x86
+  cross-compiler. Both `build-kernel-arm64.sh` and `build-kernel-rpi.sh` detect
+  `uname -m` and use the host's gcc directly when arch matches.
+- ARM64 grub.cfg console ordering: `ttyAMA0` is now the primary console
+  (`console=ttyS0,... console=ttyAMA0,...`). Init output is now visible on
+  QEMU virt and most ARM64 SBCs without further configuration.
+- ARM64 boot: replaced piCore64's `/init` with our staged init at `/init` and
+  `/sbin/init`. Previously the kernel ran piCore's TCE handler which
+  segfaulted in our environment.
+- ARM64 boot: replaced piCore64's broken dynamic BusyBox with the build
+  host's `busybox-static`. piCore's binary triggered EL0 instruction-abort
+  panics on QEMU virt under both `-cpu cortex-a72` and `-cpu max`.
+- POSIX-character-class portability: `tr -d '[:space:]'` in
+  `30-kernel-modules.sh` and `40-sysctl.sh` replaced with explicit
+  `' \t\r\n'`. Ubuntu's busybox-static 1.30.1 doesn't parse `[:space:]` and
+  instead deletes the literal characters `[ : s p a c e ]`, which truncated
+  module names (`virtio_net` → `virtio_nt`, etc.) and sysctl keys.
+- `inject-kubesolo.sh` no longer copies `init/lib/functions.sh` into
+  `init.d/`. Previously the main init loop tried to run it as a stage after
+  stage 90 and panicked with "Init completed without exec'ing KubeSolo".
+- ARM64 disk image: `TARGET_ARCH=arm64 create-disk-image.sh` produces
+  `BOOTAA64.EFI` via `grub-mkimage -O arm64-efi` (not `bootx64.efi`). Skips
+  the BIOS-only `grub-install --target=i386-pc` step.
+- `build/Dockerfile.builder`: added `grub-efi-amd64-bin`, `grub-efi-arm64-bin`,
+  `grub-pc-bin`, `grub-common`, `grub2-common`, and `busybox-static` so the
+  Docker-based build flow can produce ARM64 disk images and gets the same
+  BusyBox swap behaviour as native builds.
+
+### Known limitations (deferred to follow-up)
+
+- **ARM64 LABEL= resolution** doesn't work yet — piCore's `blkid`/`findfs`
+  crash in QEMU and our static busybox lacks the applets. Hardcoded
+  `/dev/vda4` as a workaround in `build/grub/grub-arm64.cfg`. Production
+  fix: ship static `blkid`/`findfs` or replace LABEL resolution with a
+  sysfs walk.
+- **AppArmor profile load fails on ARM64** (apparmor_parser ABI mismatch).
+  Init reports it; boot continues without enforcement.
+- **OCI signature verification** is deferred. The HTTP transport still
+  honours `--pubkey` for `.sig` files; the OCI transport is digest-verified
+  end-to-end via oras-go but does not yet consume cosign-style referrer
+  attestations. Targeted for v0.3.1.
+- **Real-hardware validation** of the generic ARM64 image is still
+  pending. Builds and boots end-to-end under QEMU virt; production
+  certification waits on a Graviton / Ampere run.
+- **QEMU TCG performance** can trigger KubeSolo's first-boot image-import
+  deadline. Not a defect in the OS itself; real hardware and KVM-accelerated
+  QEMU complete the import in seconds.
+
 ## [0.2.0] - 2026-02-12

 ### Added
--- a/60
+++ b/60
@@ -1,8 +1,8 @@
 .PHONY: all fetch kernel build-cloudinit build-update-agent build-cross rootfs initramfs \
-       iso disk-image oci-image rpi-image \
-       kernel-arm64 rootfs-arm64 \
+       iso disk-image disk-image-arm64 oci-image rpi-image \
+       kernel-arm64 kernel-rpi rootfs-arm64 rootfs-arm64-rpi \
       test-boot test-k8s test-persistence test-deploy test-storage test-security test-all \
-       test-boot-arm64 test-cloudinit test-update-agent \
+       test-boot-arm64 test-boot-arm64-disk test-cloudinit test-update-agent \
       bench-boot bench-resources \
       dev-vm dev-vm-shell dev-vm-arm64 quick docker-build shellcheck \
       kernel-audit clean distclean help
@@ -73,21 +73,43 @@ build-cross:
 	$(BUILD_DIR)/scripts/build-cross.sh

 # =============================================================================
-# ARM64 Raspberry Pi targets
+# ARM64 generic targets (mainline kernel, UEFI, virtio — for cloud / SBCs)
 # =============================================================================
 kernel-arm64:
-	@echo "==> Building ARM64 kernel for Raspberry Pi..."
+	@echo "==> Building generic ARM64 kernel (mainline LTS)..."
 	$(BUILD_DIR)/scripts/build-kernel-arm64.sh

+# Generic ARM64 rootfs consumes the mainline kernel modules.
 rootfs-arm64: build-cross
-	@echo "==> Preparing ARM64 rootfs..."
+	@echo "==> Preparing generic ARM64 rootfs..."
 	TARGET_ARCH=arm64 $(BUILD_DIR)/scripts/fetch-components.sh
 	TARGET_ARCH=arm64 $(BUILD_DIR)/scripts/extract-core.sh
-	TARGET_ARCH=arm64 $(BUILD_DIR)/scripts/inject-kubesolo.sh
-	@echo "==> Packing ARM64 initramfs..."
+	TARGET_ARCH=arm64 TARGET_VARIANT=generic $(BUILD_DIR)/scripts/inject-kubesolo.sh
+	@echo "==> Packing generic ARM64 initramfs..."
 	$(BUILD_DIR)/scripts/pack-initramfs.sh

-rpi-image: rootfs-arm64 kernel-arm64
+disk-image-arm64: rootfs-arm64 kernel-arm64
+	@echo "==> Creating generic ARM64 disk image (UEFI + GRUB A/B)..."
+	TARGET_ARCH=arm64 $(BUILD_DIR)/scripts/create-disk-image.sh
+	@echo "==> Built: $(OUTPUT_DIR)/$(OS_NAME)-$(VERSION).arm64.img"
+
+# =============================================================================
+# ARM64 Raspberry Pi targets (RPi-patched kernel, firmware blobs, SD card)
+# =============================================================================
+kernel-rpi:
+	@echo "==> Building RPi kernel (raspberrypi/linux)..."
+	$(BUILD_DIR)/scripts/build-kernel-rpi.sh
+
+# RPi-flavoured rootfs consumes the RPi kernel modules.
+rootfs-arm64-rpi: build-cross
+	@echo "==> Preparing RPi ARM64 rootfs..."
+	TARGET_ARCH=arm64 $(BUILD_DIR)/scripts/fetch-components.sh
+	TARGET_ARCH=arm64 $(BUILD_DIR)/scripts/extract-core.sh
+	TARGET_ARCH=arm64 TARGET_VARIANT=rpi $(BUILD_DIR)/scripts/inject-kubesolo.sh
+	@echo "==> Packing RPi ARM64 initramfs..."
+	$(BUILD_DIR)/scripts/pack-initramfs.sh
+
+rpi-image: rootfs-arm64-rpi kernel-rpi
 	@echo "==> Creating Raspberry Pi SD card image..."
 	$(BUILD_DIR)/scripts/create-rpi-image.sh
 	@echo "==> Built: $(OUTPUT_DIR)/$(OS_NAME)-$(VERSION).rpi.img"
@@ -127,9 +149,13 @@ test-security: iso
 	test/integration/test-security-hardening.sh $(OUTPUT_DIR)/$(OS_NAME)-$(VERSION).iso

 test-boot-arm64:
-	@echo "==> Testing ARM64 boot in QEMU..."
+	@echo "==> Testing ARM64 boot in QEMU (direct kernel)..."
 	test/qemu/test-boot-arm64.sh

+test-boot-arm64-disk: disk-image-arm64
+	@echo "==> Testing ARM64 UEFI disk boot in QEMU..."
+	test/qemu/test-boot-arm64-disk.sh $(OUTPUT_DIR)/$(OS_NAME)-$(VERSION).arm64.img
+
 test-all: test-boot test-k8s test-persistence

 # Cloud-init Go tests
@@ -246,10 +272,15 @@ help:
 	@echo "  make quick              Fast rebuild (re-inject + repack + ISO only)"
 	@echo "  make docker-build       Reproducible build inside Docker"
 	@echo ""
+	@echo "Build targets (ARM64 generic — UEFI / cloud / SBCs):"
+	@echo "  make kernel-arm64       Build mainline ARM64 kernel from kernel.org LTS"
+	@echo "  make rootfs-arm64       Prepare generic ARM64 rootfs (mainline kernel modules)"
+	@echo "  make disk-image-arm64   Create UEFI-bootable A/B GPT disk image (.arm64.img)"
+	@echo ""
 	@echo "Build targets (ARM64 Raspberry Pi):"
-	@echo "  make kernel-arm64       Build ARM64 kernel from raspberrypi/linux"
-	@echo "  make rootfs-arm64       Extract + prepare ARM64 rootfs from piCore64"
-	@echo "  make rpi-image          Create Raspberry Pi SD card image with A/B partitions"
+	@echo "  make kernel-rpi         Build RPi kernel from raspberrypi/linux"
+	@echo "  make rootfs-arm64-rpi   Prepare RPi-flavoured rootfs (RPi kernel modules)"
+	@echo "  make rpi-image          Create Raspberry Pi SD card image with A/B autoboot"
 	@echo ""
 	@echo "Test targets:"
 	@echo "  make test-boot          Boot ISO in QEMU, verify boot success"
@@ -262,7 +293,8 @@ help:
 	@echo "  make test-update-agent  Run update agent Go unit tests"
 	@echo "  make test-update        A/B update cycle integration test"
 	@echo "  make test-rollback      Forced rollback integration test"
-	@echo "  make test-boot-arm64    ARM64 boot test in QEMU aarch64"
+	@echo "  make test-boot-arm64    ARM64 boot test (direct kernel, fast)"
+	@echo "  make test-boot-arm64-disk  ARM64 full UEFI disk-boot test"
 	@echo "  make test-all           Run core tests (boot + k8s + persistence)"
 	@echo "  make test-integ         Run full integration suite"
 	@echo "  make bench-boot         Benchmark boot performance (3 runs)"
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@

 An immutable, bootable Linux distribution purpose-built for [KubeSolo](https://github.com/portainer/kubesolo) — Portainer's ultra-lightweight single-node Kubernetes.

-> **Status:** All 6 phases complete. Boots and runs K8s workloads. Portainer Edge Agent tested and connected.
+> **Status (v0.3.0):** x86_64 and generic ARM64 (UEFI / virtio / mainline kernel) both build and boot end-to-end. Update agent has an explicit state machine, OCI registry distribution alongside HTTP, channel + maintenance-window + version-stepping-stone gates, and auto-rollback. ARM64 Raspberry Pi support remains paused pending physical hardware. See [docs/release-notes-0.3.0.md](docs/release-notes-0.3.0.md) for the full v0.3.0 changelog.

 ## What is this?

@@ -24,23 +24,34 @@ KubeSolo OS combines **Tiny Core Linux** (~11 MB) with **KubeSolo** (single-bina

 ## Quick Start

+### x86_64 ISO
+
 ```bash
-# Fetch Tiny Core ISO + KubeSolo binary
-make fetch
-
-# Build custom kernel (first time only, ~25 min, cached)
-make kernel
-
-# Build Go binaries
+make fetch          # Tiny Core ISO + KubeSolo binary
+make kernel         # Custom kernel (first time only, ~25 min, cached)
 make build-cloudinit build-update-agent
-
-# Build bootable ISO
 make rootfs initramfs iso
-
-# Test in QEMU
 make dev-vm
 ```

+### Generic ARM64 disk image (v0.3.0+)
+
+For Graviton / Ampere / generic UEFI ARM64 hosts:
+
+```bash
+make kernel-arm64       # Mainline 6.12 LTS kernel (first time only, ~30-60 min)
+make rootfs-arm64       # Mainline kernel modules + KubeSolo arm64
+make disk-image-arm64   # UEFI-bootable A/B GPT image
+make test-boot-arm64-disk  # boot smoke test under qemu-system-aarch64
+```
+
+### Raspberry Pi (work in progress)
+
+Build path lives at `make kernel-rpi` / `make rpi-image`; needs physical
+hardware to validate the firmware + autoboot.txt path. See
+[docs/arm64-architecture.md](docs/arm64-architecture.md) for the two-track
+build layout.
+
 Or build everything at once inside Docker:

 ```bash
@@ -227,13 +238,19 @@ Metrics include: `kubesolo_os_info`, `boot_success`, `boot_counter`, `uptime_sec

 | Phase | Scope | Status |
 |-------|-------|--------|
-| 1 | PoC: boot Tiny Core + KubeSolo, verify K8s | Complete |
+| 1 | PoC: boot Tiny Core + KubeSolo, verify K8s | Complete (x86_64) |
 | 2 | Cloud-init Go parser, network, hostname | Complete |
-| 3 | A/B atomic updates, GRUB, rollback agent | Complete |
+| 3 | A/B atomic updates, GRUB, rollback agent | Complete (x86_64) |
 | 4 | Ed25519 signing, Portainer Edge, SSH extension | Complete |
-| 5 | CI/CD, OCI distribution, Prometheus metrics, ARM64 | Complete |
-| 6 | Security hardening, AppArmor, ARM64 RPi support | Complete |
-| - | Custom kernel build for container runtime fixes | Complete |
+| 5 | CI/CD, OCI distribution, Prometheus metrics, ARM64 cross-compile | Complete |
+| 6 | Security hardening, AppArmor | Complete |
+| - | Custom kernel build for container runtime fixes | Complete (x86_64) |
+| 7 | ARM64 generic (mainline kernel, UEFI, virtio) | Complete (v0.3.0, QEMU validated) |
+| 8 | Update engine v2 (state machine, channels, OCI, pre-flight gates) | Complete (v0.3.0) |
+| - | ARM64 Raspberry Pi (custom kernel, firmware, SD card image) | Paused — needs hardware |
+| - | OCI cosign signature verification | Planned for v0.3.1 |
+| - | LABEL=KSOLODATA on ARM64 (replace blkid/findfs path) | Planned for v0.3.1 |
+| - | Real-hardware ARM64 validation (Graviton / Ampere) | Planned for v0.3.1 |

 ## License

--- a/2
+++ b/2
@@ -1 +1 @@
-0.2.0
+0.3.1
--- a/build/Dockerfile.builder
+++ b/build/Dockerfile.builder
@@ -18,6 +18,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    file \
    flex \
    genisoimage \
+    grub-common \
+    grub-efi-amd64-bin \
+    grub-efi-arm64-bin \
+    grub-pc-bin \
+    grub2-common \
    gzip \
    isolinux \
    iptables \
@@ -25,6 +30,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    libarchive-tools \
    libelf-dev \
    libssl-dev \
+    nftables \
    make \
    parted \
    squashfs-tools \
@@ -35,6 +41,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    apparmor-utils \
    gcc-aarch64-linux-gnu \
    binutils-aarch64-linux-gnu \
+    busybox-static \
    git \
    kpartx \
    unzip \
@@ -49,6 +56,13 @@ RUN curl -fsSL "https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz" \
    | tar -C /usr/local -xzf -
 ENV PATH="/usr/local/go/bin:${PATH}"

+# Install oras (OCI artifact CLI) for push-oci-artifact.sh.
+# Bump ORAS_VERSION when pushing breaks or when oras gains useful flags.
+ARG ORAS_VERSION=1.2.3
+RUN curl -fsSL "https://github.com/oras-project/oras/releases/download/v${ORAS_VERSION}/oras_${ORAS_VERSION}_linux_amd64.tar.gz" \
+    | tar -C /usr/local/bin -xzf - oras \
+    && chmod +x /usr/local/bin/oras
+
 WORKDIR /build
 COPY . /build

--- a/build/config/kernel-container.fragment
+++ b/build/config/kernel-container.fragment
@@ -0,0 +1,130 @@
+# KubeSolo OS — Shared kernel config fragment for container workloads
+#
+# Applied on top of:
+#   - Tiny Core stock config (x86_64)        via build-kernel.sh
+#   - mainline kernel.org arm64 defconfig    via build-kernel-arm64.sh
+#   - bcm2711_defconfig / bcm2712_defconfig  via build-kernel-rpi.sh
+#
+# All entries here are architecture-agnostic.
+# Apply this fragment twice with `make olddefconfig` between passes — TC's stock
+# config has CONFIG_SECURITY disabled, which causes a single-pass olddefconfig
+# to strip the security subtree before its dependencies (SYSFS, MULTIUSER) are
+# resolved.
+
+# cgroup v2 (mandatory for containerd/runc)
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_DEVICE=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_CGROUP_PIDS=y
+CONFIG_MEMCG=y
+CONFIG_CGROUP_BPF=y
+CONFIG_CFS_BANDWIDTH=y
+
+# BPF (required for cgroup v2 device control)
+CONFIG_BPF=y
+CONFIG_BPF_SYSCALL=y
+
+# Namespaces (mandatory for containers)
+CONFIG_NAMESPACES=y
+CONFIG_NET_NS=y
+CONFIG_PID_NS=y
+CONFIG_USER_NS=y
+CONFIG_UTS_NS=y
+CONFIG_IPC_NS=y
+
+# Device management
+CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_MOUNT=y
+
+# Filesystem
+CONFIG_OVERLAY_FS=y
+CONFIG_SQUASHFS=y
+CONFIG_EXT4_FS=y
+CONFIG_VFAT_FS=y
+
+# Networking
+CONFIG_BRIDGE=m
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=m
+CONFIG_NF_NAT=m
+CONFIG_NF_TABLES=m
+CONFIG_VETH=m
+CONFIG_VXLAN=m
+
+# nftables address-family handlers. These are BOOL Kconfigs (not tristate)
+# so they have to be built into the kernel — there's no module to modprobe
+# at runtime. Without them, `nft add table ip ...` returns EOPNOTSUPP and
+# KubeSolo v1.1.4+'s pod-masquerade setup fails at boot.
+CONFIG_NF_TABLES_IPV4=y
+CONFIG_NF_TABLES_IPV6=y
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
+
+# nftables expression modules used by KubeSolo's masquerade ruleset, the
+# kube-proxy nft backend (Kubernetes 1.34+), and the xtables compat path.
+# Listed in modules.list / modules-arm64.list so init loads them at boot.
+CONFIG_NFT_NAT=m
+CONFIG_NFT_MASQ=m
+CONFIG_NFT_CT=m
+CONFIG_NFT_REDIR=m
+CONFIG_NFT_REJECT=m
+CONFIG_NFT_REJECT_INET=m
+CONFIG_NFT_COMPAT=m
+CONFIG_NFT_FIB=m
+CONFIG_NFT_FIB_IPV4=m
+CONFIG_NFT_FIB_IPV6=m
+# numgen drives kube-proxy's random / round-robin endpoint LB:
+#   `numgen random mod N vmap { ... }` in service rules.
+# Without it kube-proxy's nft sync fails with ENOENT on every service.
+CONFIG_NFT_NUMGEN=m
+# hash drives consistent-hash LB (sessionAffinity=ClientIP, etc.).
+CONFIG_NFT_HASH=m
+# objref / limit / log are used by various policy expressions kube-proxy and
+# CNI plugins emit. Including them pre-empts a future "could not process
+# rule" debug loop.
+CONFIG_NFT_OBJREF=m
+CONFIG_NFT_LIMIT=m
+CONFIG_NFT_LOG=m
+
+# IPv4 NAT bits NFT_MASQ depends on. Auto-selected on most kernels but we
+# pin them explicitly so olddefconfig doesn't strip them when the fragment
+# is applied on top of a minimal defconfig.
+CONFIG_NF_NAT_MASQUERADE=y
+
+# Security: AppArmor + Audit
+CONFIG_AUDIT=y
+CONFIG_AUDITSYSCALL=y
+CONFIG_SECURITY=y
+CONFIG_SECURITYFS=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_APPARMOR=y
+CONFIG_DEFAULT_SECURITY_APPARMOR=y
+CONFIG_LSM=lockdown,yama,apparmor
+
+# Security: seccomp
+CONFIG_SECCOMP=y
+CONFIG_SECCOMP_FILTER=y
+
+# Crypto (image verification)
+CONFIG_CRYPTO_SHA256=y
+
+# Disable unnecessary subsystems for headless edge appliance
+# CONFIG_SOUND is not set
+# CONFIG_DRM is not set
+# CONFIG_KVM is not set
+# CONFIG_MEDIA_SUPPORT is not set
+# CONFIG_WIRELESS is not set
+# CONFIG_WLAN is not set
+# CONFIG_CFG80211 is not set
+# CONFIG_BT is not set
+# CONFIG_NFC is not set
+# CONFIG_INFINIBAND is not set
+# CONFIG_PCMCIA is not set
+# CONFIG_HAMRADIO is not set
+# CONFIG_ISDN is not set
+# CONFIG_ATM is not set
+# CONFIG_INPUT_JOYSTICK is not set
+# CONFIG_INPUT_TABLET is not set
+# CONFIG_FPGA is not set
--- a/build/config/modules-arm64.list
+++ b/build/config/modules-arm64.list
@@ -56,6 +56,17 @@ nft_fib
 nft_fib_ipv4
 nft_fib_ipv6

+# nft expressions used by the Kubernetes 1.34+ nftables kube-proxy backend.
+# Loading these at boot (stage 30) is mandatory because stage 85 sets
+# kernel.modules_disabled=1, which would otherwise block kube-proxy from
+# auto-loading them on first rule install.
+# (Note: list parser only honours full-line "#"-prefixed comments, NOT
+# inline "module # comment". Keep module names on their own line.)
+nft_numgen
+nft_hash
+nft_limit
+nft_log
+
 # Reject targets (used by kube-proxy iptables-restore rules)
 nf_reject_ipv4
 nf_reject_ipv6
--- a/build/config/modules.list
+++ b/build/config/modules.list
@@ -54,6 +54,14 @@ nft_fib
 nft_fib_ipv4
 nft_fib_ipv6

+# nft expressions used by the Kubernetes 1.34+ nftables kube-proxy backend.
+# Must be loaded at stage 30 because stage 85 sets modules_disabled=1.
+# (Parser ignores full-line "#" comments only — keep module names alone.)
+nft_numgen
+nft_hash
+nft_limit
+nft_log
+
 # Reject targets (used by kube-proxy iptables-restore rules)
 nf_reject_ipv4
 nf_reject_ipv6
--- a/build/config/rpi-kernel-config.fragment
+++ b/build/config/rpi-kernel-config.fragment
@@ -1,69 +0,0 @@
-# KubeSolo OS — Raspberry Pi kernel config overrides
-# Applied on top of bcm2711_defconfig (Pi 4) or bcm2712_defconfig (Pi 5)
-# These ensure container runtime support is enabled.
-
-# cgroup v2 (mandatory for containerd/runc)
-CONFIG_CGROUPS=y
-CONFIG_CGROUP_CPUACCT=y
-CONFIG_CGROUP_DEVICE=y
-CONFIG_CGROUP_FREEZER=y
-CONFIG_CGROUP_SCHED=y
-CONFIG_CGROUP_PIDS=y
-CONFIG_MEMCG=y
-CONFIG_CGROUP_BPF=y
-CONFIG_CFS_BANDWIDTH=y
-
-# BPF (required for cgroup v2 device control)
-CONFIG_BPF=y
-CONFIG_BPF_SYSCALL=y
-
-# Namespaces (mandatory for containers)
-CONFIG_NAMESPACES=y
-CONFIG_NET_NS=y
-CONFIG_PID_NS=y
-CONFIG_USER_NS=y
-CONFIG_UTS_NS=y
-CONFIG_IPC_NS=y
-
-# Device management
-CONFIG_DEVTMPFS=y
-CONFIG_DEVTMPFS_MOUNT=y
-
-# Filesystem
-CONFIG_OVERLAY_FS=y
-CONFIG_SQUASHFS=y
-CONFIG_EXT4_FS=y
-CONFIG_VFAT_FS=y
-
-# Networking
-CONFIG_BRIDGE=m
-CONFIG_NETFILTER=y
-CONFIG_NF_CONNTRACK=m
-CONFIG_NF_NAT=m
-CONFIG_NF_TABLES=m
-CONFIG_VETH=m
-CONFIG_VXLAN=m
-
-# Security: AppArmor + Audit
-CONFIG_AUDIT=y
-CONFIG_AUDITSYSCALL=y
-CONFIG_SECURITY=y
-CONFIG_SECURITYFS=y
-CONFIG_SECURITY_NETWORK=y
-CONFIG_SECURITY_APPARMOR=y
-CONFIG_DEFAULT_SECURITY_APPARMOR=y
-
-# Security: seccomp
-CONFIG_SECCOMP=y
-CONFIG_SECCOMP_FILTER=y
-
-# Crypto (image verification)
-CONFIG_CRYPTO_SHA256=y
-
-# Disable unnecessary subsystems for edge appliance
-# CONFIG_SOUND is not set
-# CONFIG_DRM is not set
-# CONFIG_MEDIA_SUPPORT is not set
-# CONFIG_WIRELESS is not set
-# CONFIG_BT is not set
-# CONFIG_NFC is not set
--- a/build/config/versions.env
+++ b/build/config/versions.env
@@ -9,7 +9,13 @@ TINYCORE_ISO=CorePure64-${TINYCORE_VERSION}.iso
 TINYCORE_ISO_URL=${TINYCORE_MIRROR}/${TINYCORE_VERSION%%.*}.x/${TINYCORE_ARCH}/release/${TINYCORE_ISO}

 # KubeSolo
+# Pinned release tag from https://github.com/portainer/kubesolo/releases.
+# Bump here and re-run `make fetch` to pull a new version.
+KUBESOLO_VERSION=v1.1.5
 KUBESOLO_INSTALL_URL=https://get.kubesolo.io
+# Per-arch SHA256 of the musl tarball (verified at fetch time when non-empty).
+KUBESOLO_SHA256_AMD64=565bd5fd98fc8ce09160e646b55de3493c782d74c0e0c46ccf130ff4bcabab81
+KUBESOLO_SHA256_ARM64=db865a5e9b2617d595f9c2b7d011272edc94587621a9690e2de0f47cc94f0748

 # Build tools (used inside builder container)
 GRUB_VERSION=2.12
@@ -19,7 +25,6 @@ SYSLINUX_VERSION=6.03
 # Populate by running: sha256sum build/cache/<file>
 # Leave empty to skip verification (useful for first fetch)
 TINYCORE_ISO_SHA256=""
-KUBESOLO_SHA256=""
 NETFILTER_TCZ_SHA256=""
 NET_BRIDGING_TCZ_SHA256=""
 IPTABLES_TCZ_SHA256=""
@@ -38,5 +43,13 @@ RPI_FIRMWARE_URL=https://github.com/raspberrypi/firmware/archive/refs/tags/${RPI
 RPI_KERNEL_BRANCH=rpi-6.6.y
 RPI_KERNEL_REPO=https://github.com/raspberrypi/linux

+# Mainline Linux kernel (for generic ARM64 — kernel.org LTS)
+# Bump within the 6.12 LTS series as patch levels release.
+# 6.12 LTS is supported until Dec 2029.
+MAINLINE_KERNEL_VERSION=6.12.10
+MAINLINE_KERNEL_MAJOR=v6.x
+MAINLINE_KERNEL_URL=https://cdn.kernel.org/pub/linux/kernel/${MAINLINE_KERNEL_MAJOR}/linux-${MAINLINE_KERNEL_VERSION}.tar.xz
+MAINLINE_KERNEL_SHA256=""
+
 # Output naming
 OS_NAME=kubesolo-os
--- a/build/grub/grub-arm64.cfg
+++ b/build/grub/grub-arm64.cfg
@@ -0,0 +1,93 @@
+# KubeSolo OS — GRUB Configuration (ARM64)
+# A/B partition boot with automatic rollback.
+#
+# Same A/B logic as build/grub/grub.cfg; only the console parameters differ
+# (ARM64 PL011 / 16550-compat UART rather than x86 ttyS0).
+#
+# Partition layout:
+#   (hd0,gpt1) — EFI/Boot  (256 MB, FAT32) — contains GRUB + grubenv
+#   (hd0,gpt2) — System A  (512 MB, ext4)  — vmlinuz + kubesolo-os.gz
+#   (hd0,gpt3) — System B  (512 MB, ext4)  — vmlinuz + kubesolo-os.gz
+#   (hd0,gpt4) — Data      (remaining, ext4) — persistent K8s state
+
+set default=0
+set timeout=3
+
+load_env
+
+# --- A/B Rollback Logic (identical to amd64 grub.cfg) ---
+
+if [ "${boot_success}" != "1" ]; then
+    if [ "${boot_counter}" = "0" ]; then
+        if [ "${active_slot}" = "A" ]; then
+            set active_slot=B
+        else
+            set active_slot=A
+        fi
+        save_env active_slot
+        set boot_counter=3
+        save_env boot_counter
+    else
+        if [ "${boot_counter}" = "3" ]; then
+            set boot_counter=2
+        elif [ "${boot_counter}" = "2" ]; then
+            set boot_counter=1
+        elif [ "${boot_counter}" = "1" ]; then
+            set boot_counter=0
+        fi
+        save_env boot_counter
+    fi
+fi
+
+set boot_success=0
+save_env boot_success
+
+if [ "${active_slot}" = "A" ]; then
+    set root='(hd0,gpt2)'
+    set slot_label="System A"
+else
+    set root='(hd0,gpt3)'
+    set slot_label="System B"
+fi
+
+# --- ARM64 console string ---
+# Order matters: the LAST `console=` is the primary system console (where /dev/console
+# points and where init's stdout/stderr land). Earlier `console=` entries get mirrored
+# kernel output but don't carry process I/O.
+#
+# Covers Graviton/16550 (ttyS0) as secondary and QEMU virt / PL011 / Ampere (ttyAMA0)
+# as primary. ttyAMA0 must be last for `-nographic` QEMU + most ARM64 SBCs.
+#
+# `quiet` is intentionally omitted from the default entry while we stabilise the
+# generic ARM64 boot path. Add back once boots are reliable.
+
+menuentry "KubeSolo OS (${slot_label})" {
+    echo "Booting KubeSolo OS from ${slot_label}..."
+    echo "Boot counter: ${boot_counter}, Boot success: ${boot_success}"
+    linux /vmlinuz init=/sbin/init kubesolo.data=/dev/vda4 console=ttyS0,115200 console=ttyAMA0,115200
+    initrd /kubesolo-os.gz
+}
+
+menuentry "KubeSolo OS (${slot_label}) — Debug Mode" {
+    echo "Booting KubeSolo OS (debug) from ${slot_label}..."
+    linux /vmlinuz kubesolo.data=/dev/vda4 kubesolo.debug console=ttyS0,115200 console=ttyAMA0,115200
+    initrd /kubesolo-os.gz
+}
+
+menuentry "KubeSolo OS — Emergency Shell" {
+    echo "Booting to emergency shell..."
+    linux /vmlinuz init=/sbin/init kubesolo.shell console=ttyS0,115200 console=ttyAMA0,115200
+    initrd /kubesolo-os.gz
+}
+
+menuentry "KubeSolo OS — Boot Other Slot" {
+    if [ "${active_slot}" = "A" ]; then
+        set root='(hd0,gpt3)'
+        echo "Booting from System B (passive)..."
+    else
+        set root='(hd0,gpt2)'
+        echo "Booting from System A (passive)..."
+    fi
+    linux /vmlinuz kubesolo.data=/dev/vda4 kubesolo.debug console=ttyS0,115200 console=ttyAMA0,115200
+    initrd /kubesolo-os.gz
+}
--- a/build/scripts/build-kernel-arm64.sh
+++ b/build/scripts/build-kernel-arm64.sh
@@ -1,14 +1,20 @@
 #!/bin/bash
-# build-kernel-arm64.sh — Build ARM64 kernel for Raspberry Pi 4/5
+# build-kernel-arm64.sh — Build generic ARM64 kernel (mainline LTS)
 #
-# Uses the official raspberrypi/linux kernel fork with bcm2711_defconfig
-# as the base, overlaid with container-critical config options.
+# Builds a Linux kernel from kernel.org mainline LTS source, suitable for:
+#   - qemu-system-aarch64 -machine virt
+#   - UEFI ARM64 hosts (Ampere, Graviton, generic ARM64 servers)
+#   - Future ARM64 SBCs with UEFI/u-boot generic-distro support
 #
-# Output is cached in $CACHE_DIR/custom-kernel-arm64/ and reused across builds.
+# This is the GENERIC ARM64 build track. For Raspberry Pi specifically
+# (raspberrypi/linux fork, RPi firmware boot path, custom DTBs), see
+# build/scripts/build-kernel-rpi.sh.
+#
+# Output is cached in $CACHE_DIR/kernel-arm64-generic/ and reused across builds.
 #
 # Requirements:
 #   - gcc-aarch64-linux-gnu (cross-compiler)
-#   - Standard kernel build deps (bc, bison, flex, etc.)
+#   - Standard kernel build deps (bc, bison, flex, libelf-dev, libssl-dev)
 set -euo pipefail

 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -18,94 +24,165 @@ CACHE_DIR="${CACHE_DIR:-$PROJECT_ROOT/build/cache}"
 # shellcheck source=../config/versions.env
 . "$SCRIPT_DIR/../config/versions.env"

-CUSTOM_KERNEL_DIR="$CACHE_DIR/custom-kernel-arm64"
+KVER="$MAINLINE_KERNEL_VERSION"
+CUSTOM_KERNEL_DIR="$CACHE_DIR/kernel-arm64-generic"
 CUSTOM_IMAGE="$CUSTOM_KERNEL_DIR/Image"
 CUSTOM_MODULES="$CUSTOM_KERNEL_DIR/modules"
-CUSTOM_DTBS="$CUSTOM_KERNEL_DIR/dtbs"

 mkdir -p "$CACHE_DIR" "$CUSTOM_KERNEL_DIR"

 # --- Skip if already built ---
-if [ -f "$CUSTOM_IMAGE" ] && [ -d "$CUSTOM_MODULES" ]; then
-    echo "==> ARM64 kernel already built (cached)"
-    echo "    Image: $CUSTOM_IMAGE ($(du -h "$CUSTOM_IMAGE" | cut -f1))"
+if [ -f "$CUSTOM_IMAGE" ] && [ -d "$CUSTOM_MODULES/lib/modules/$KVER" ]; then
+    echo "==> Generic ARM64 kernel already built (cached)"
+    echo "    Image:   $CUSTOM_IMAGE ($(du -h "$CUSTOM_IMAGE" | cut -f1))"
+    echo "    Kernel:  $KVER"
    exit 0
 fi

-# --- Verify cross-compiler ---
-if ! command -v aarch64-linux-gnu-gcc >/dev/null 2>&1; then
-    echo "ERROR: aarch64-linux-gnu-gcc not found"
-    echo "Install: apt-get install gcc-aarch64-linux-gnu"
+# --- Toolchain selection: native on arm64 hosts, cross-compile elsewhere ---
+HOST_ARCH="$(uname -m)"
+if [ "$HOST_ARCH" = "aarch64" ] || [ "$HOST_ARCH" = "arm64" ]; then
+    # Native build — use the host's gcc
+    if ! command -v gcc >/dev/null 2>&1; then
+        echo "ERROR: gcc not found"
+        echo "Install: apt-get install build-essential"
+        exit 1
+    fi
+    CROSS_COMPILE=""
+    echo "==> Native ARM64 build (host arch: $HOST_ARCH)"
+else
+    # Cross-build from x86 — use aarch64 cross-compiler
+    if ! command -v aarch64-linux-gnu-gcc >/dev/null 2>&1; then
+        echo "ERROR: aarch64-linux-gnu-gcc not found"
+        echo "Install: apt-get install gcc-aarch64-linux-gnu"
+        exit 1
+    fi
+    CROSS_COMPILE="aarch64-linux-gnu-"
+    echo "==> Cross-building ARM64 kernel from $HOST_ARCH"
+fi
+
+echo "==> Building generic ARM64 kernel (mainline $KVER)..."
+echo "    Source: $MAINLINE_KERNEL_URL"
+
+# --- Download mainline kernel source ---
+KERNEL_SRC_ARCHIVE="$CACHE_DIR/linux-${KVER}.tar.xz"
+if [ ! -f "$KERNEL_SRC_ARCHIVE" ]; then
+    echo "==> Downloading mainline kernel source (~140 MB)..."
+    wget -q --show-progress -O "$KERNEL_SRC_ARCHIVE" "$MAINLINE_KERNEL_URL" 2>/dev/null || \
+        curl -fSL "$MAINLINE_KERNEL_URL" -o "$KERNEL_SRC_ARCHIVE"
+    echo "    Downloaded: $(du -h "$KERNEL_SRC_ARCHIVE" | cut -f1)"
+else
+    echo "==> Kernel source already cached: $(du -h "$KERNEL_SRC_ARCHIVE" | cut -f1)"
+fi
+
+# --- Verify checksum if pinned ---
+if [ -n "${MAINLINE_KERNEL_SHA256:-}" ]; then
+    actual=$(sha256sum "$KERNEL_SRC_ARCHIVE" | awk '{print $1}')
+    if [ "$actual" != "$MAINLINE_KERNEL_SHA256" ]; then
+        echo "ERROR: Kernel source checksum mismatch"
+        echo "  Expected: $MAINLINE_KERNEL_SHA256"
+        echo "  Got:      $actual"
+        exit 1
+    fi
+    echo "    Checksum OK"
+fi
+
+# --- Extract to case-sensitive fs ---
+# The kernel source has files differing only by case (xt_mark.h vs xt_MARK.h).
+# Build in /tmp (ext4 on Linux runners, case-sensitive).
+KERNEL_BUILD_DIR="/tmp/kernel-build-arm64-generic"
+rm -rf "$KERNEL_BUILD_DIR"
+mkdir -p "$KERNEL_BUILD_DIR"
+
+echo "==> Extracting kernel source..."
+tar -xf "$KERNEL_SRC_ARCHIVE" -C "$KERNEL_BUILD_DIR"
+KERNEL_SRC_DIR=$(find "$KERNEL_BUILD_DIR" -maxdepth 1 -type d -name 'linux-*' | head -1)
+if [ -z "$KERNEL_SRC_DIR" ]; then
+    echo "ERROR: Could not find extracted source directory"
+    ls -la "$KERNEL_BUILD_DIR"/
    exit 1
 fi

-echo "==> Building ARM64 kernel for Raspberry Pi..."
-echo "    Branch: $RPI_KERNEL_BRANCH"
-echo "    Repo:   $RPI_KERNEL_REPO"
+cd "$KERNEL_SRC_DIR"

-# --- Download kernel source ---
-KERNEL_SRC_DIR="$CACHE_DIR/rpi-linux-${RPI_KERNEL_BRANCH}"
-if [ ! -d "$KERNEL_SRC_DIR" ]; then
-    echo "==> Downloading RPi kernel source (shallow clone)..."
-    git clone --depth 1 --branch "$RPI_KERNEL_BRANCH" \
-        "$RPI_KERNEL_REPO" "$KERNEL_SRC_DIR"
-else
-    echo "==> Kernel source already cached"
+# --- Base config: arm64 defconfig (generic ARMv8) ---
+echo "==> Applying arm64 defconfig..."
+make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" defconfig
+
+# --- Apply shared container fragment ---
+CONFIG_FRAGMENT="$PROJECT_ROOT/build/config/kernel-container.fragment"
+if [ ! -f "$CONFIG_FRAGMENT" ]; then
+    echo "ERROR: Config fragment not found: $CONFIG_FRAGMENT"
+    exit 1
 fi

-# --- Build in /tmp for case-sensitivity ---
-KERNEL_BUILD_DIR="/tmp/kernel-build-arm64"
-rm -rf "$KERNEL_BUILD_DIR"
-cp -a "$KERNEL_SRC_DIR" "$KERNEL_BUILD_DIR"
-
-cd "$KERNEL_BUILD_DIR"
-
-# --- Apply base config (Pi 4 = bcm2711) ---
-echo "==> Applying bcm2711_defconfig..."
-make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- bcm2711_defconfig
-
-# --- Apply container config overrides ---
-CONFIG_FRAGMENT="$PROJECT_ROOT/build/config/rpi-kernel-config.fragment"
-if [ -f "$CONFIG_FRAGMENT" ]; then
-    echo "==> Applying KubeSolo config overrides..."
-    while IFS= read -r line; do
-        # Skip comments and empty lines
-        case "$line" in \#*|"") continue ;; esac
-        key="${line%%=*}"
-        value="${line#*=}"
-        case "$value" in
-            y)   ./scripts/config --enable "$key" ;;
-            m)   ./scripts/config --module "$key" ;;
-            n)   ./scripts/config --disable "${key#CONFIG_}" ;;
-            *)   ./scripts/config --set-str "$key" "$value" ;;
-        esac
-    done < "$CONFIG_FRAGMENT"
-fi
-
-# Handle "is not set" comments as disables
-if [ -f "$CONFIG_FRAGMENT" ]; then
+apply_fragment() {
+    local fragment="$1"
    while IFS= read -r line; do
        case "$line" in
            "# CONFIG_"*" is not set")
-                key=$(echo "$line" | sed -n 's/^# \(CONFIG_[A-Z_]*\) is not set$/\1/p')
+                key=$(echo "$line" | sed -n 's/^# \(CONFIG_[A-Z0-9_]*\) is not set$/\1/p')
                [ -n "$key" ] && ./scripts/config --disable "${key#CONFIG_}"
+                continue
                ;;
+            \#*|"") continue ;;
        esac
-    done < "$CONFIG_FRAGMENT"
-fi
+        key="${line%%=*}"
+        value="${line#*=}"
+        case "$value" in
+            y) ./scripts/config --enable "$key" ;;
+            m) ./scripts/config --module "$key" ;;
+            n) ./scripts/config --disable "${key#CONFIG_}" ;;
+            *) ./scripts/config --set-str "$key" "$value" ;;
+        esac
+    done < "$fragment"
+}

-# Resolve dependencies
-make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- olddefconfig
+echo "==> Applying kernel-container.fragment (pass 1)..."
+apply_fragment "$CONFIG_FRAGMENT"
+make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" olddefconfig

-# --- Build kernel + modules + DTBs ---
+echo "==> Applying kernel-container.fragment (pass 2)..."
+apply_fragment "$CONFIG_FRAGMENT"
+make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" olddefconfig
+
+# --- ARM64 virt-host specific enables ---
+# These are needed for the generic UEFI/virtio boot path but are arch-specific
+# so they live in this script rather than the shared fragment.
+echo "==> Enabling ARM64 virt-host configs..."
+./scripts/config --enable CONFIG_EFI
+./scripts/config --enable CONFIG_EFI_STUB
+./scripts/config --enable CONFIG_VIRTIO
+./scripts/config --enable CONFIG_VIRTIO_PCI
+./scripts/config --enable CONFIG_VIRTIO_BLK
+./scripts/config --enable CONFIG_VIRTIO_NET
+./scripts/config --enable CONFIG_VIRTIO_CONSOLE
+./scripts/config --enable CONFIG_VIRTIO_MMIO
+./scripts/config --enable CONFIG_HW_RANDOM_VIRTIO
+# NVMe for cloud / bare-metal ARM64 hosts that don't use virtio
+./scripts/config --enable CONFIG_BLK_DEV_NVME
+make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" olddefconfig
+
+# --- Verify critical configs ---
+echo "==> Verifying critical configs..."
+for cfg in CGROUP_BPF SECURITY_APPARMOR AUDIT VIRTIO_BLK EFI_STUB; do
+    if ! grep -q "CONFIG_${cfg}=y" .config; then
+        echo "ERROR: CONFIG_${cfg} not set after olddefconfig"
+        grep "CONFIG_${cfg}" .config || echo "    (not found)"
+        exit 1
+    fi
+    echo "    CONFIG_${cfg}=y confirmed"
+done
+
+# --- Build kernel + modules (no DTBs — UEFI hosts use ACPI/virtio) ---
 NPROC=$(nproc 2>/dev/null || echo 4)
 echo ""
 echo "==> Building ARM64 kernel (${NPROC} parallel jobs)..."
-echo "    This may take 20-30 minutes..."
+echo "    This may take 20-40 minutes on a 6-core Odroid..."

-make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -j"$NPROC" Image modules dtbs 2>&1
+make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" -j"$NPROC" Image modules 2>&1

-echo "==> ARM64 kernel build complete"
+echo "==> Kernel build complete"

 # --- Install to staging ---
 echo "==> Installing Image..."
@@ -114,31 +191,16 @@ cp arch/arm64/boot/Image "$CUSTOM_IMAGE"
 echo "==> Installing modules (stripped)..."
 rm -rf "$CUSTOM_MODULES"
 mkdir -p "$CUSTOM_MODULES"
-make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- \
+make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" \
    INSTALL_MOD_STRIP=1 modules_install INSTALL_MOD_PATH="$CUSTOM_MODULES"

-# Remove build/source symlinks
-KVER=$(ls "$CUSTOM_MODULES/lib/modules/" | head -1)
-rm -f "$CUSTOM_MODULES/lib/modules/$KVER/build"
-rm -f "$CUSTOM_MODULES/lib/modules/$KVER/source"
+# Pick up actual kernel version (e.g. 6.12.10 if KVER differs from package suffix)
+ACTUAL_KVER=$(ls "$CUSTOM_MODULES/lib/modules/" | head -1)
+rm -f "$CUSTOM_MODULES/lib/modules/$ACTUAL_KVER/build"
+rm -f "$CUSTOM_MODULES/lib/modules/$ACTUAL_KVER/source"

-# Run depmod
-depmod -a -b "$CUSTOM_MODULES" "$KVER" 2>/dev/null || true
+depmod -a -b "$CUSTOM_MODULES" "$ACTUAL_KVER" 2>/dev/null || true

-echo "==> Installing Device Tree Blobs..."
-rm -rf "$CUSTOM_DTBS"
-mkdir -p "$CUSTOM_DTBS/overlays"
-# Pi 4 DTBs
-cp arch/arm64/boot/dts/broadcom/bcm2711*.dtb "$CUSTOM_DTBS/" 2>/dev/null || true
-# Pi 5 DTBs
-cp arch/arm64/boot/dts/broadcom/bcm2712*.dtb "$CUSTOM_DTBS/" 2>/dev/null || true
-# Overlays we need
-for overlay in disable-wifi disable-bt; do
-    [ -f "arch/arm64/boot/dts/overlays/${overlay}.dtbo" ] && \
-        cp "arch/arm64/boot/dts/overlays/${overlay}.dtbo" "$CUSTOM_DTBS/overlays/"
-done
-
-# Save config for reference
 cp .config "$CUSTOM_KERNEL_DIR/.config"

 # --- Clean up ---
@@ -148,11 +210,10 @@ rm -rf "$KERNEL_BUILD_DIR"

 # --- Summary ---
 echo ""
-echo "==> ARM64 kernel build complete:"
+echo "==> Generic ARM64 kernel build complete:"
 echo "    Image:        $CUSTOM_IMAGE ($(du -h "$CUSTOM_IMAGE" | cut -f1))"
-echo "    Kernel ver:   $KVER"
-MOD_COUNT=$(find "$CUSTOM_MODULES/lib/modules/$KVER" -name '*.ko*' 2>/dev/null | wc -l)
+echo "    Kernel ver:   $ACTUAL_KVER"
+MOD_COUNT=$(find "$CUSTOM_MODULES/lib/modules/$ACTUAL_KVER" -name '*.ko*' 2>/dev/null | wc -l)
 echo "    Modules:      $MOD_COUNT"
-echo "    Modules size: $(du -sh "$CUSTOM_MODULES/lib/modules/$KVER" 2>/dev/null | cut -f1)"
-echo "    DTBs:         $(ls "$CUSTOM_DTBS"/*.dtb 2>/dev/null | wc -l)"
+echo "    Modules size: $(du -sh "$CUSTOM_MODULES/lib/modules/$ACTUAL_KVER" 2>/dev/null | cut -f1)"
 echo ""
--- a/build/scripts/build-kernel-rpi.sh
+++ b/build/scripts/build-kernel-rpi.sh
@@ -0,0 +1,174 @@
+#!/bin/bash
+# build-kernel-rpi.sh — Build kernel for Raspberry Pi 4/5 (ARM64)
+#
+# Uses the official raspberrypi/linux kernel fork with bcm2711_defconfig as the
+# base, overlaid with the shared container-config fragment.
+#
+# This is the RPi-specific build track. For generic ARM64 (UEFI / virtio /
+# kernel.org mainline) see build/scripts/build-kernel-arm64.sh.
+#
+# Output is cached in $CACHE_DIR/custom-kernel-rpi/ and reused across builds.
+#
+# Requirements:
+#   - gcc-aarch64-linux-gnu (cross-compiler)
+#   - Standard kernel build deps (bc, bison, flex, etc.)
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+CACHE_DIR="${CACHE_DIR:-$PROJECT_ROOT/build/cache}"
+
+# shellcheck source=../config/versions.env
+. "$SCRIPT_DIR/../config/versions.env"
+
+CUSTOM_KERNEL_DIR="$CACHE_DIR/custom-kernel-rpi"
+CUSTOM_IMAGE="$CUSTOM_KERNEL_DIR/Image"
+CUSTOM_MODULES="$CUSTOM_KERNEL_DIR/modules"
+CUSTOM_DTBS="$CUSTOM_KERNEL_DIR/dtbs"
+
+mkdir -p "$CACHE_DIR" "$CUSTOM_KERNEL_DIR"
+
+# --- Skip if already built ---
+if [ -f "$CUSTOM_IMAGE" ] && [ -d "$CUSTOM_MODULES" ]; then
+    echo "==> RPi kernel already built (cached)"
+    echo "    Image: $CUSTOM_IMAGE ($(du -h "$CUSTOM_IMAGE" | cut -f1))"
+    exit 0
+fi
+
+# --- Toolchain selection: native on arm64 hosts, cross-compile elsewhere ---
+HOST_ARCH="$(uname -m)"
+if [ "$HOST_ARCH" = "aarch64" ] || [ "$HOST_ARCH" = "arm64" ]; then
+    if ! command -v gcc >/dev/null 2>&1; then
+        echo "ERROR: gcc not found"
+        echo "Install: apt-get install build-essential"
+        exit 1
+    fi
+    CROSS_COMPILE=""
+    echo "==> Native ARM64 build (host arch: $HOST_ARCH)"
+else
+    if ! command -v aarch64-linux-gnu-gcc >/dev/null 2>&1; then
+        echo "ERROR: aarch64-linux-gnu-gcc not found"
+        echo "Install: apt-get install gcc-aarch64-linux-gnu"
+        exit 1
+    fi
+    CROSS_COMPILE="aarch64-linux-gnu-"
+    echo "==> Cross-building RPi kernel from $HOST_ARCH"
+fi
+
+echo "==> Building RPi kernel (raspberrypi/linux)..."
+echo "    Branch: $RPI_KERNEL_BRANCH"
+echo "    Repo:   $RPI_KERNEL_REPO"
+
+# --- Download kernel source ---
+KERNEL_SRC_DIR="$CACHE_DIR/rpi-linux-${RPI_KERNEL_BRANCH}"
+if [ ! -d "$KERNEL_SRC_DIR" ]; then
+    echo "==> Downloading RPi kernel source (shallow clone)..."
+    git clone --depth 1 --branch "$RPI_KERNEL_BRANCH" \
+        "$RPI_KERNEL_REPO" "$KERNEL_SRC_DIR"
+else
+    echo "==> Kernel source already cached"
+fi
+
+# --- Build in /tmp for case-sensitivity ---
+KERNEL_BUILD_DIR="/tmp/kernel-build-arm64"
+rm -rf "$KERNEL_BUILD_DIR"
+cp -a "$KERNEL_SRC_DIR" "$KERNEL_BUILD_DIR"
+
+cd "$KERNEL_BUILD_DIR"
+
+# --- Apply base config (Pi 4 = bcm2711) ---
+echo "==> Applying bcm2711_defconfig..."
+make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" bcm2711_defconfig
+
+# --- Apply container config overrides ---
+CONFIG_FRAGMENT="$PROJECT_ROOT/build/config/kernel-container.fragment"
+if [ -f "$CONFIG_FRAGMENT" ]; then
+    echo "==> Applying KubeSolo config overrides..."
+    while IFS= read -r line; do
+        # Skip comments and empty lines
+        case "$line" in \#*|"") continue ;; esac
+        key="${line%%=*}"
+        value="${line#*=}"
+        case "$value" in
+            y)   ./scripts/config --enable "$key" ;;
+            m)   ./scripts/config --module "$key" ;;
+            n)   ./scripts/config --disable "${key#CONFIG_}" ;;
+            *)   ./scripts/config --set-str "$key" "$value" ;;
+        esac
+    done < "$CONFIG_FRAGMENT"
+fi
+
+# Handle "is not set" comments as disables
+if [ -f "$CONFIG_FRAGMENT" ]; then
+    while IFS= read -r line; do
+        case "$line" in
+            "# CONFIG_"*" is not set")
+                key=$(echo "$line" | sed -n 's/^# \(CONFIG_[A-Z_]*\) is not set$/\1/p')
+                [ -n "$key" ] && ./scripts/config --disable "${key#CONFIG_}"
+                ;;
+        esac
+    done < "$CONFIG_FRAGMENT"
+fi
+
+# Resolve dependencies
+make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" olddefconfig
+
+# --- Build kernel + modules + DTBs ---
+NPROC=$(nproc 2>/dev/null || echo 4)
+echo ""
+echo "==> Building RPi kernel (${NPROC} parallel jobs)..."
+echo "    This may take 20-30 minutes..."
+
+make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" -j"$NPROC" Image modules dtbs 2>&1
+
+echo "==> RPi kernel build complete"
+
+# --- Install to staging ---
+echo "==> Installing Image..."
+cp arch/arm64/boot/Image "$CUSTOM_IMAGE"
+
+echo "==> Installing modules (stripped)..."
+rm -rf "$CUSTOM_MODULES"
+mkdir -p "$CUSTOM_MODULES"
+make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" \
+    INSTALL_MOD_STRIP=1 modules_install INSTALL_MOD_PATH="$CUSTOM_MODULES"
+
+# Remove build/source symlinks
+KVER=$(ls "$CUSTOM_MODULES/lib/modules/" | head -1)
+rm -f "$CUSTOM_MODULES/lib/modules/$KVER/build"
+rm -f "$CUSTOM_MODULES/lib/modules/$KVER/source"
+
+# Run depmod
+depmod -a -b "$CUSTOM_MODULES" "$KVER" 2>/dev/null || true
+
+echo "==> Installing Device Tree Blobs..."
+rm -rf "$CUSTOM_DTBS"
+mkdir -p "$CUSTOM_DTBS/overlays"
+# Pi 4 DTBs
+cp arch/arm64/boot/dts/broadcom/bcm2711*.dtb "$CUSTOM_DTBS/" 2>/dev/null || true
+# Pi 5 DTBs
+cp arch/arm64/boot/dts/broadcom/bcm2712*.dtb "$CUSTOM_DTBS/" 2>/dev/null || true
+# Overlays we need
+for overlay in disable-wifi disable-bt; do
+    [ -f "arch/arm64/boot/dts/overlays/${overlay}.dtbo" ] && \
+        cp "arch/arm64/boot/dts/overlays/${overlay}.dtbo" "$CUSTOM_DTBS/overlays/"
+done
+
+# Save config for reference
+cp .config "$CUSTOM_KERNEL_DIR/.config"
+
+# --- Clean up ---
+echo "==> Cleaning kernel build directory..."
+cd /
+rm -rf "$KERNEL_BUILD_DIR"
+
+# --- Summary ---
+echo ""
+echo "==> RPi kernel build complete:"
+echo "    Image:        $CUSTOM_IMAGE ($(du -h "$CUSTOM_IMAGE" | cut -f1))"
+echo "    Kernel ver:   $KVER"
+MOD_COUNT=$(find "$CUSTOM_MODULES/lib/modules/$KVER" -name '*.ko*' 2>/dev/null | wc -l)
+echo "    Modules:      $MOD_COUNT"
+echo "    Modules size: $(du -sh "$CUSTOM_MODULES/lib/modules/$KVER" 2>/dev/null | cut -f1)"
+echo "    DTBs:         $(ls "$CUSTOM_DTBS"/*.dtb 2>/dev/null | wc -l)"
+echo ""
--- a/build/scripts/build-kernel.sh
+++ b/build/scripts/build-kernel.sh
@@ -85,85 +85,49 @@ echo "    Source dir: $(basename "$KERNEL_SRC_DIR")"

 cd "$KERNEL_SRC_DIR"

-# --- Apply stock config + enable CONFIG_CGROUP_BPF ---
+# --- Apply stock config + shared container-config fragment ---
 echo "==> Applying stock Tiny Core config..."
 cp "$KERNEL_CFG" .config

-echo "==> Enabling required kernel configs..."
-./scripts/config --enable CONFIG_CGROUP_BPF
-./scripts/config --enable CONFIG_DEVTMPFS
-./scripts/config --enable CONFIG_DEVTMPFS_MOUNT
-./scripts/config --enable CONFIG_MEMCG
-./scripts/config --enable CONFIG_CFS_BANDWIDTH
+CONFIG_FRAGMENT="$PROJECT_ROOT/build/config/kernel-container.fragment"
+if [ ! -f "$CONFIG_FRAGMENT" ]; then
+    echo "ERROR: Config fragment not found: $CONFIG_FRAGMENT"
+    exit 1
+fi

-# --- Strip unnecessary subsystems for smallest footprint ---
-# This is a headless K8s edge appliance — no sound, GPU, wireless, etc.
-echo "==> Disabling unnecessary subsystems for minimal footprint..."
+# Apply the fragment: each "CONFIG_X=v" line becomes the right scripts/config
+# invocation; "# CONFIG_X is not set" comments become --disable.
+apply_fragment() {
+    local fragment="$1"
+    while IFS= read -r line; do
+        case "$line" in
+            "# CONFIG_"*" is not set")
+                key=$(echo "$line" | sed -n 's/^# \(CONFIG_[A-Z0-9_]*\) is not set$/\1/p')
+                [ -n "$key" ] && ./scripts/config --disable "${key#CONFIG_}"
+                continue
+                ;;
+            \#*|"") continue ;;
+        esac
+        key="${line%%=*}"
+        value="${line#*=}"
+        case "$value" in
+            y) ./scripts/config --enable "$key" ;;
+            m) ./scripts/config --module "$key" ;;
+            n) ./scripts/config --disable "${key#CONFIG_}" ;;
+            *) ./scripts/config --set-str "$key" "$value" ;;
+        esac
+    done < "$fragment"
+}

-# Sound subsystem (not needed on headless appliance)
-./scripts/config --disable SOUND
-
-# GPU/DRM (serial console only, no display)
-./scripts/config --disable DRM
-
-# KVM hypervisor (this IS the guest/bare metal, not a hypervisor)
-./scripts/config --disable KVM
-
-# Media/camera/TV/radio (not needed)
-./scripts/config --disable MEDIA_SUPPORT
-
-# Wireless networking (wired edge device)
-./scripts/config --disable WIRELESS
-./scripts/config --disable WLAN
-./scripts/config --disable CFG80211
-
-# Bluetooth (not needed)
-./scripts/config --disable BT
-
-# NFC (not needed)
-./scripts/config --disable NFC
-
-# Infiniband (not needed on edge)
-./scripts/config --disable INFINIBAND
-
-# PCMCIA (legacy, not needed)
-./scripts/config --disable PCMCIA
-
-# Amateur radio (not needed)
-./scripts/config --disable HAMRADIO
-
-# ISDN (not needed)
-./scripts/config --disable ISDN
-
-# ATM networking (not needed)
-./scripts/config --disable ATM
-
-# Joystick/gamepad (not needed)
-./scripts/config --disable INPUT_JOYSTICK
-./scripts/config --disable INPUT_TABLET
-
-# FPGA (not needed)
-./scripts/config --disable FPGA
-
-# First pass: resolve base dependencies before adding security configs.
-# The stock TC config has "# CONFIG_SECURITY is not set" which causes
-# olddefconfig to strip security-related options if applied in a single pass.
+# Two-pass apply: TC's stock config has CONFIG_SECURITY disabled, so olddefconfig
+# strips the security subtree before its dependencies resolve. Re-applying the
+# fragment after the first olddefconfig restores those entries.
+echo "==> Applying kernel-container.fragment (pass 1)..."
+apply_fragment "$CONFIG_FRAGMENT"
 make olddefconfig

-# Security: AppArmor LSM + Audit subsystem
-# Applied AFTER first olddefconfig to ensure CONFIG_SECURITY dependencies
-# (SYSFS, MULTIUSER) are resolved before enabling the security subtree.
-echo "==> Enabling AppArmor + Audit kernel configs..."
-./scripts/config --enable CONFIG_AUDIT
-./scripts/config --enable CONFIG_AUDITSYSCALL
-./scripts/config --enable CONFIG_SECURITY
-./scripts/config --enable CONFIG_SECURITYFS
-./scripts/config --enable CONFIG_SECURITY_NETWORK
-./scripts/config --enable CONFIG_SECURITY_APPARMOR
-./scripts/config --set-str CONFIG_LSM "lockdown,yama,apparmor"
-./scripts/config --set-str CONFIG_DEFAULT_SECURITY "apparmor"
-
-# Second pass: resolve security config dependencies
+echo "==> Applying kernel-container.fragment (pass 2)..."
+apply_fragment "$CONFIG_FRAGMENT"
 make olddefconfig

 # Verify critical configs are set
--- a/build/scripts/create-disk-image.sh
+++ b/build/scripts/create-disk-image.sh
@@ -6,28 +6,61 @@
 #   Part 2: System A    (512 MB, ext4)   — vmlinuz + kubesolo-os.gz (active)
 #   Part 3: System B    (512 MB, ext4)   — vmlinuz + kubesolo-os.gz (passive)
 #   Part 4: Data        (remaining, ext4) — persistent K8s state
+#
+# Supports both x86_64 (default) and ARM64 generic UEFI targets. ARM64 RPi
+# uses a different image format — see build/scripts/create-rpi-image.sh.
+#
+# Environment:
+#   TARGET_ARCH  amd64 (default) or arm64
+#   IMG_SIZE_MB  Image size in MB (default 4096)
+#   CACHE_DIR    Build cache (default <project>/build/cache)
+#   ROOTFS_DIR   Rootfs work dir (default <project>/build/rootfs-work)
+#   OUTPUT_DIR   Output dir (default <project>/output)
 set -euo pipefail

 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 ROOTFS_DIR="${ROOTFS_DIR:-$PROJECT_ROOT/build/rootfs-work}"
+CACHE_DIR="${CACHE_DIR:-$PROJECT_ROOT/build/cache}"
 OUTPUT_DIR="${OUTPUT_DIR:-$PROJECT_ROOT/output}"
 VERSION="$(cat "$PROJECT_ROOT/VERSION")"
 OS_NAME="kubesolo-os"
+TARGET_ARCH="${TARGET_ARCH:-amd64}"

-IMG_OUTPUT="$OUTPUT_DIR/${OS_NAME}-${VERSION}.img"
 IMG_SIZE_MB="${IMG_SIZE_MB:-4096}"  # 4 GB default (larger for A/B)

-VMLINUZ="$ROOTFS_DIR/vmlinuz"
+# --- Arch-specific paths ---
+case "$TARGET_ARCH" in
+    amd64)
+        IMG_OUTPUT="$OUTPUT_DIR/${OS_NAME}-${VERSION}.img"
+        VMLINUZ="$ROOTFS_DIR/vmlinuz"
+        GRUB_CFG="$PROJECT_ROOT/build/grub/grub.cfg"
+        GRUB_TARGET="x86_64-efi"
+        GRUB_EFI_BIN="bootx64.efi"
+        GRUB_INSTALL_BIOS=true
+        ;;
+    arm64)
+        IMG_OUTPUT="$OUTPUT_DIR/${OS_NAME}-${VERSION}.arm64.img"
+        VMLINUZ="$CACHE_DIR/kernel-arm64-generic/Image"
+        GRUB_CFG="$PROJECT_ROOT/build/grub/grub-arm64.cfg"
+        GRUB_TARGET="arm64-efi"
+        GRUB_EFI_BIN="BOOTAA64.EFI"
+        GRUB_INSTALL_BIOS=false
+        ;;
+    *)
+        echo "ERROR: TARGET_ARCH must be 'amd64' or 'arm64' (got: $TARGET_ARCH)"
+        exit 1
+        ;;
+esac
+
 INITRAMFS="$ROOTFS_DIR/kubesolo-os.gz"
-GRUB_CFG="$PROJECT_ROOT/build/grub/grub.cfg"
 GRUB_ENV_DEFAULTS="$PROJECT_ROOT/build/grub/grub-env-defaults"

 for f in "$VMLINUZ" "$INITRAMFS" "$GRUB_CFG" "$GRUB_ENV_DEFAULTS"; do
    [ -f "$f" ] || { echo "ERROR: Missing $f"; exit 1; }
 done

-echo "==> Creating ${IMG_SIZE_MB}MB disk image with A/B partitions..."
+echo "==> Creating ${IMG_SIZE_MB}MB ${TARGET_ARCH} disk image with A/B partitions..."
 mkdir -p "$OUTPUT_DIR"

 # Create sparse image
@@ -161,35 +194,44 @@ else
    mv "$GRUBENV_FILE.tmp" "$GRUBENV_FILE"
 fi

-# Install GRUB EFI binary if available
-if command -v grub-mkimage >/dev/null 2>&1; then
-    grub-mkimage -O x86_64-efi -o "$MNT_EFI/EFI/BOOT/bootx64.efi" \
-        -p /boot/grub \
-        part_gpt ext2 fat normal linux echo all_video test search \
-        search_fs_uuid search_label configfile loadenv \
-        2>/dev/null || echo "    WARN: grub-mkimage failed — use QEMU -bios flag"
-elif command -v grub2-mkimage >/dev/null 2>&1; then
-    grub2-mkimage -O x86_64-efi -o "$MNT_EFI/EFI/BOOT/bootx64.efi" \
-        -p /boot/grub \
-        part_gpt ext2 fat normal linux echo all_video test search \
-        search_fs_uuid search_label configfile loadenv \
-        2>/dev/null || echo "    WARN: grub2-mkimage failed — use QEMU -bios flag"
+# Install GRUB EFI binary
+# Modules required: part_gpt + fat (boot partition), ext2 (system A/B),
+# normal + linux + echo + configfile + loadenv (boot menu + grubenv),
+# search_* (locate partitions by label).
+# all_video + test are x86-specific (DRM init); leave them out on arm64.
+if [ "$TARGET_ARCH" = "arm64" ]; then
+    GRUB_MODULES="part_gpt ext2 fat normal linux echo test search search_fs_uuid search_label configfile loadenv"
 else
-    echo "    WARN: grub-mkimage not found — EFI boot image not created"
-    echo "          Install grub2-tools or use QEMU -kernel/-initrd flags"
+    GRUB_MODULES="part_gpt ext2 fat normal linux echo all_video test search search_fs_uuid search_label configfile loadenv"
 fi

-# For BIOS boot: install GRUB i386-pc modules if available
-if command -v grub-install >/dev/null 2>&1; then
-    grub-install --target=i386-pc --boot-directory="$MNT_EFI/boot" \
-        --no-floppy "$LOOP" 2>/dev/null || {
-        echo "    WARN: BIOS GRUB install failed — EFI-only or use QEMU -kernel"
-    }
-elif command -v grub2-install >/dev/null 2>&1; then
-    grub2-install --target=i386-pc --boot-directory="$MNT_EFI/boot" \
-        --no-floppy "$LOOP" 2>/dev/null || {
-        echo "    WARN: BIOS GRUB install failed — EFI-only or use QEMU -kernel"
-    }
+# shellcheck disable=SC2086  # GRUB_MODULES is intentionally word-split
+if command -v grub-mkimage >/dev/null 2>&1; then
+    grub-mkimage -O "$GRUB_TARGET" -o "$MNT_EFI/EFI/BOOT/$GRUB_EFI_BIN" \
+        -p /boot/grub $GRUB_MODULES \
+        || echo "    WARN: grub-mkimage failed — use QEMU -bios flag"
+elif command -v grub2-mkimage >/dev/null 2>&1; then
+    grub2-mkimage -O "$GRUB_TARGET" -o "$MNT_EFI/EFI/BOOT/$GRUB_EFI_BIN" \
+        -p /boot/grub $GRUB_MODULES \
+        || echo "    WARN: grub2-mkimage failed — use QEMU -bios flag"
+else
+    echo "    WARN: grub-mkimage not found — EFI boot image not created"
+    echo "          Install grub-efi-${TARGET_ARCH}-bin or use QEMU -kernel/-initrd flags"
+fi
+
+# For BIOS boot: install GRUB i386-pc modules (x86 only — ARM64 is UEFI-only).
+if [ "$GRUB_INSTALL_BIOS" = "true" ]; then
+    if command -v grub-install >/dev/null 2>&1; then
+        grub-install --target=i386-pc --boot-directory="$MNT_EFI/boot" \
+            --no-floppy "$LOOP" 2>/dev/null || {
+            echo "    WARN: BIOS GRUB install failed — EFI-only or use QEMU -kernel"
+        }
+    elif command -v grub2-install >/dev/null 2>&1; then
+        grub2-install --target=i386-pc --boot-directory="$MNT_EFI/boot" \
+            --no-floppy "$LOOP" 2>/dev/null || {
+            echo "    WARN: BIOS GRUB install failed — EFI-only or use QEMU -kernel"
+        }
+    fi
 fi

 # --- System A Partition (active) ---
@@ -213,9 +255,9 @@ done
 sync

 echo ""
-echo "==> Disk image created: $IMG_OUTPUT"
+echo "==> ${TARGET_ARCH} disk image created: $IMG_OUTPUT"
 echo "    Size: $(du -h "$IMG_OUTPUT" | cut -f1)"
-echo "    Part 1 (KSOLOEFI):  GRUB + A/B boot config"
+echo "    Part 1 (KSOLOEFI):  GRUB ($GRUB_TARGET) + A/B boot config"
 echo "    Part 2 (KSOLOA):    System A — kernel + initramfs (active)"
 echo "    Part 3 (KSOLOB):    System B — kernel + initramfs (passive)"
 echo "    Part 4 (KSOLODATA): Persistent K8s state"
--- a/build/scripts/create-rpi-image.sh
+++ b/build/scripts/create-rpi-image.sh
@@ -31,12 +31,12 @@ IMG_OUTPUT="$OUTPUT_DIR/${OS_NAME}-${VERSION}.rpi.img"
 IMG_SIZE_MB="${IMG_SIZE_MB:-2048}"  # 2 GB default

 # ARM64 kernel (Image format, not bzImage)
-KERNEL="${CACHE_DIR}/custom-kernel-arm64/Image"
+KERNEL="${CACHE_DIR}/custom-kernel-rpi/Image"
 INITRAMFS="${ROOTFS_DIR}/kubesolo-os.gz"
 RPI_FIRMWARE_DIR="${CACHE_DIR}/rpi-firmware"
 # DTBs MUST come from the kernel build (not firmware repo) to match the kernel.
 # A DTB mismatch causes sdhci-iproc to silently fail — zero block devices.
-KERNEL_DTBS_DIR="${CACHE_DIR}/custom-kernel-arm64/dtbs"
+KERNEL_DTBS_DIR="${CACHE_DIR}/custom-kernel-rpi/dtbs"

 echo "==> Creating ${IMG_SIZE_MB}MB Raspberry Pi disk image..."

@@ -173,7 +173,7 @@ CFGTXT

    # cmdline.txt — kernel command line
    # Note: must be a single line
-    echo "console=serial0,115200 console=tty1 kubesolo.data=LABEL=KSOLODATA quiet" > "$MNT/cmdline.txt"
+    echo "console=serial0,115200 console=tty1 kubesolo.data=LABEL=KSOLODATA initcall_debug loglevel=7" > "$MNT/cmdline.txt"

    # Copy kernel as kernel8.img (RPi 3/4/5 ARM64 convention)
    cp "$KERNEL" "$MNT/kernel8.img"
--- a/build/scripts/fetch-components.sh
+++ b/build/scripts/fetch-components.sh
@@ -51,8 +51,7 @@ if [ "$FETCH_ARCH" = "arm64" ]; then
    echo "==> Fetching RPi firmware..."
    "$SCRIPT_DIR/fetch-rpi-firmware.sh"

-    # Download ARM64 KubeSolo binary
-    KUBESOLO_VERSION="${KUBESOLO_VERSION:-v1.1.0}"
+    # Download ARM64 KubeSolo binary (KUBESOLO_VERSION set from versions.env)
    KUBESOLO_BIN_ARM64="$CACHE_DIR/kubesolo-arm64"
    if [ -f "$KUBESOLO_BIN_ARM64" ]; then
        echo "==> KubeSolo ARM64 binary already cached: $KUBESOLO_BIN_ARM64"
@@ -61,17 +60,19 @@ if [ "$FETCH_ARCH" = "arm64" ]; then
        BIN_URL="https://github.com/portainer/kubesolo/releases/download/${KUBESOLO_VERSION}/kubesolo-${KUBESOLO_VERSION}-linux-arm64-musl.tar.gz"
        BIN_URL_FALLBACK="https://github.com/portainer/kubesolo/releases/download/${KUBESOLO_VERSION}/kubesolo-${KUBESOLO_VERSION}-linux-arm64.tar.gz"
        TEMP_DIR=$(mktemp -d)
+        TARBALL="$TEMP_DIR/kubesolo.tar.gz"
        echo "    URL: $BIN_URL"
-        if curl -fSL "$BIN_URL" -o "$TEMP_DIR/kubesolo.tar.gz" 2>/dev/null; then
+        if curl -fSL "$BIN_URL" -o "$TARBALL" 2>/dev/null; then
            echo "    Downloaded musl variant (arm64)"
-        elif curl -fSL "$BIN_URL_FALLBACK" -o "$TEMP_DIR/kubesolo.tar.gz" 2>/dev/null; then
+        elif curl -fSL "$BIN_URL_FALLBACK" -o "$TARBALL" 2>/dev/null; then
            echo "    Downloaded glibc variant (arm64 fallback)"
        else
            echo "ERROR: Failed to download KubeSolo ARM64 from GitHub."
            rm -rf "$TEMP_DIR"
            exit 1
        fi
-        tar -xzf "$TEMP_DIR/kubesolo.tar.gz" -C "$TEMP_DIR"
+        verify_checksum "$TARBALL" "${KUBESOLO_SHA256_ARM64:-}" "KubeSolo arm64 tarball"
+        tar -xzf "$TARBALL" -C "$TEMP_DIR"
        FOUND_BIN=$(find "$TEMP_DIR" -name "kubesolo" -type f ! -name "*.tar.gz" | head -1)
        if [ -z "$FOUND_BIN" ]; then
            echo "ERROR: Could not find kubesolo binary in extracted archive"
@@ -112,7 +113,7 @@ else
 fi

 # --- KubeSolo ---
-KUBESOLO_VERSION="${KUBESOLO_VERSION:-v1.1.0}"
+# KUBESOLO_VERSION sourced from versions.env
 KUBESOLO_BIN="$CACHE_DIR/kubesolo"

 if [ -f "$KUBESOLO_BIN" ]; then
@@ -132,11 +133,12 @@ else

    TEMP_DIR=$(mktemp -d)
    trap 'rm -rf "$TEMP_DIR"' EXIT
+    TARBALL="$TEMP_DIR/kubesolo.tar.gz"

    echo "    URL: $BIN_URL"
-    if curl -fSL "$BIN_URL" -o "$TEMP_DIR/kubesolo.tar.gz" 2>/dev/null; then
+    if curl -fSL "$BIN_URL" -o "$TARBALL" 2>/dev/null; then
        echo "    Downloaded musl variant"
-    elif curl -fSL "$BIN_URL_FALLBACK" -o "$TEMP_DIR/kubesolo.tar.gz" 2>/dev/null; then
+    elif curl -fSL "$BIN_URL_FALLBACK" -o "$TARBALL" 2>/dev/null; then
        echo "    Downloaded glibc variant (fallback)"
    else
        echo "ERROR: Failed to download KubeSolo from GitHub."
@@ -149,9 +151,10 @@ else
        echo "    3. Re-run: make rootfs"
        exit 1
    fi
+    verify_checksum "$TARBALL" "${KUBESOLO_SHA256_AMD64:-}" "KubeSolo amd64 tarball"

    # Extract binary from tarball
-    tar -xzf "$TEMP_DIR/kubesolo.tar.gz" -C "$TEMP_DIR"
+    tar -xzf "$TARBALL" -C "$TEMP_DIR"

    # Find the kubesolo binary in extracted contents
    FOUND_BIN=$(find "$TEMP_DIR" -name "kubesolo" -type f ! -name "*.tar.gz" | head -1)
@@ -169,7 +172,6 @@ else
    rm -rf "$TEMP_DIR"

    echo "==> KubeSolo binary: $KUBESOLO_BIN ($(du -h "$KUBESOLO_BIN" | cut -f1))"
-    verify_checksum "$KUBESOLO_BIN" "$KUBESOLO_SHA256" "KubeSolo binary"
 fi

 # --- Tiny Core kernel module extensions (netfilter, iptables) ---
--- a/build/scripts/inject-kubesolo.sh
+++ b/build/scripts/inject-kubesolo.sh
@@ -55,10 +55,44 @@ rm -f "$ROOTFS/sbin/init"
 cp "$PROJECT_ROOT/init/init.sh" "$ROOTFS/sbin/init"
 chmod +x "$ROOTFS/sbin/init"

-# Init stages
+# Replace the upstream /init at the rootfs root with our staged init.
+# The kernel ALWAYS runs /init when booting from an initramfs (legacy root-mount
+# fallback otherwise). piCore/TC ship their own /init; ours has to take its
+# place so the kernel runs our staged boot, not piCore's TCE handler.
+rm -f "$ROOTFS/init"
+cp "$PROJECT_ROOT/init/init.sh" "$ROOTFS/init"
+chmod +x "$ROOTFS/init"
+echo "    Installed staged init at /init and /sbin/init"
+
+# --- 2b. BusyBox override for ARM64 ---
+# piCore64 v15's BusyBox is dynamically linked and uses ARM instructions that
+# QEMU virt cannot emulate even with -cpu max, causing applets (mkdir, uname,
+# etc.) to SIGILL. Replace with the host's statically-linked busybox-static
+# package, which is built for generic ARMv8-A and runs anywhere.
+#
+# On x86 builds this isn't an issue (TC's BusyBox works fine on QEMU x86).
+if [ "$INJECT_ARCH" = "arm64" ] && [ -x /bin/busybox ]; then
+    if file /bin/busybox 2>/dev/null | grep -q 'statically linked'; then
+        cp /bin/busybox "$ROOTFS/bin/busybox"
+        # busybox.suid is used by mount/su/etc. Same binary; suid bit applied
+        # separately. We don't need suid for our use (init runs as PID 1 / uid 0).
+        cp /bin/busybox "$ROOTFS/bin/busybox.suid"
+        chmod +x "$ROOTFS/bin/busybox" "$ROOTFS/bin/busybox.suid"
+        echo "    Replaced piCore BusyBox with host's static busybox ($(du -h /bin/busybox | cut -f1))"
+    else
+        echo "    WARN: /bin/busybox on host is not static; piCore BusyBox kept (may crash in QEMU virt)"
+    fi
+fi
+
+# Init stages — copy NN-name.sh files only. functions.sh is a shared library
+# (sourced by init.sh proper), not a numbered stage; if it ends up in init.d
+# the main loop will try to run it as a stage and fail.
 mkdir -p "$ROOTFS/usr/lib/kubesolo-os/init.d"
 for stage in "$PROJECT_ROOT"/init/lib/*.sh; do
    [ -f "$stage" ] || continue
+    case "$(basename "$stage")" in
+        functions.sh) continue ;;
+    esac
    cp "$stage" "$ROOTFS/usr/lib/kubesolo-os/init.d/"
    chmod +x "$ROOTFS/usr/lib/kubesolo-os/init.d/$(basename "$stage")"
 done
@@ -109,7 +143,19 @@ fi
 # If a custom kernel was built (with CONFIG_CGROUP_BPF=y), use it.
 # Otherwise fall back to TCZ-extracted modules with manual modules.dep.
 if [ "$INJECT_ARCH" = "arm64" ]; then
-    CUSTOM_KERNEL_DIR="$CACHE_DIR/custom-kernel-arm64"
+    # TARGET_VARIANT selects which ARM64 kernel to consume:
+    #   rpi      -> $CACHE_DIR/custom-kernel-rpi/      (raspberrypi/linux fork)
+    #   generic  -> $CACHE_DIR/kernel-arm64-generic/   (mainline kernel.org LTS)
+    # Default is rpi for backwards compatibility with existing rpi-image target.
+    TARGET_VARIANT="${TARGET_VARIANT:-rpi}"
+    case "$TARGET_VARIANT" in
+        generic) CUSTOM_KERNEL_DIR="$CACHE_DIR/kernel-arm64-generic" ;;
+        rpi)     CUSTOM_KERNEL_DIR="$CACHE_DIR/custom-kernel-rpi" ;;
+        *)
+            echo "ERROR: TARGET_VARIANT must be 'rpi' or 'generic' (got: $TARGET_VARIANT)"
+            exit 1
+            ;;
+    esac
    CUSTOM_VMLINUZ="$CUSTOM_KERNEL_DIR/Image"
 else
    CUSTOM_KERNEL_DIR="$CACHE_DIR/custom-kernel"
@@ -178,9 +224,14 @@ if [ -f "$CUSTOM_VMLINUZ" ] && [ -d "$CUSTOM_MODULES/lib/modules/$KVER" ]; then
    fi

    while IFS= read -r mod; do
-        # Skip comments and blank lines
-        case "$mod" in \#*|"") continue ;; esac
-        mod=$(echo "$mod" | xargs)  # trim whitespace
+        # Strip any inline "# comment" tail before further processing —
+        # several entries in the upstream lists started carrying inline
+        # docs and silently broke module loading because modprobe got
+        # passed "name # comment" as the module name.
+        mod="${mod%%#*}"
+        # Skip blank-or-comment-only lines
+        case "$mod" in "") continue ;; esac
+        mod=$(echo "$mod" | xargs)  # trim whitespace + collapse internal
        [ -z "$mod" ] && continue

        if [ "$MODPROBE_WORKS" = true ]; then
@@ -351,7 +402,13 @@ if [ -f /usr/sbin/xtables-nft-multi ]; then
        ln -sf xtables-nft-multi "$ROOTFS/usr/sbin/$cmd"
    done

-    # Copy required shared libraries (architecture-aware paths)
+    # Copy required shared libraries (architecture-aware paths).
+    # We deliberately bundle the *full* glibc family from the build host —
+    # not just libc.so.6 — so dynamically-linked binaries we ship (nft,
+    # xtables-nft-multi, etc.) load a consistent set of libraries. Mixing
+    # glibc components across versions causes __stack_chk_guard mismatches
+    # ("stack smashing detected" aborts) when stack frames cross between
+    # functions linked against different libcs.
    mkdir -p "$ROOTFS/usr/lib/$LIB_ARCH" "$ROOTFS/lib/$LIB_ARCH"
    [ "$INJECT_ARCH" != "arm64" ] && mkdir -p "$ROOTFS/lib64"
    for lib in \
@@ -359,6 +416,13 @@ if [ -f /usr/sbin/xtables-nft-multi ]; then
        "/lib/$LIB_ARCH/libmnl.so.0"* \
        "/lib/$LIB_ARCH/libnftnl.so.11"* \
        "/lib/$LIB_ARCH/libc.so.6" \
+        "/lib/$LIB_ARCH/libpthread.so.0" \
+        "/lib/$LIB_ARCH/libdl.so.2" \
+        "/lib/$LIB_ARCH/libm.so.6" \
+        "/lib/$LIB_ARCH/libresolv.so.2" \
+        "/lib/$LIB_ARCH/librt.so.1" \
+        "/lib/$LIB_ARCH/libanl.so.1" \
+        "/lib/$LIB_ARCH/libgcc_s.so.1" \
        "$LD_SO"; do
        [ -e "$lib" ] && cp -aL "$lib" "$ROOTFS${lib}" 2>/dev/null || true
    done
@@ -374,6 +438,30 @@ else
    echo "    WARN: xtables-nft-multi not found in builder (install iptables package)"
 fi

+# Install nft (nftables CLI). KubeSolo v1.1.4+ uses `nft add table ip
+# kubesolo-masq` to own pod masquerade rules directly instead of going
+# through kube-proxy/CNI. Without nft in PATH, KubeSolo FATALs at startup
+# with: nft: executable file not found in $PATH.
+echo "    Installing nft (nftables CLI) from builder..."
+if [ -f /usr/sbin/nft ]; then
+    cp /usr/sbin/nft "$ROOTFS/usr/sbin/"
+    # nft pulls in libnftables + a few extras beyond what iptables-nft needed.
+    # libmnl, libnftnl, libxtables already copied by the iptables-nft block.
+    for lib in \
+        "/lib/$LIB_ARCH/libnftables.so.1"* \
+        "/lib/$LIB_ARCH/libedit.so.2"* \
+        "/lib/$LIB_ARCH/libjansson.so.4"* \
+        "/lib/$LIB_ARCH/libgmp.so.10"* \
+        "/lib/$LIB_ARCH/libtinfo.so.6"* \
+        "/lib/$LIB_ARCH/libbsd.so.0"* \
+        "/lib/$LIB_ARCH/libmd.so.0"*; do
+        [ -e "$lib" ] && cp -aL "$lib" "$ROOTFS${lib}" 2>/dev/null || true
+    done
+    echo "    Installed nft + shared libs"
+else
+    echo "    WARN: nft not found in builder (install nftables package) — KubeSolo v1.1.4+ pod masquerade will fail"
+fi
+
 # Kernel modules list (for init to load at boot)
 if [ "$INJECT_ARCH" = "arm64" ]; then
    cp "$PROJECT_ROOT/build/config/modules-arm64.list" "$ROOTFS/usr/lib/kubesolo-os/modules.list"
@@ -471,6 +559,54 @@ nameserver 1.1.1.1
 EOF
 fi

+# --- Resolve dual-glibc ambiguity (ARM64) ---
+# piCore64's rootfs ships glibc at /lib/libc.so.6, and we've copied the
+# build host's glibc to /lib/$LIB_ARCH/libc.so.6. Two libc.so.6 in the
+# dynamic linker's search path can lead to a process loading both — one
+# directly, one transitively — and "stack smashing detected" aborts when
+# stack frames cross between them (each libc has its own
+# __stack_chk_guard). Remove piCore's copies so resolution is unambiguous
+# and write a proper /etc/ld.so.conf + cache pointing at our copies.
+if [ "$INJECT_ARCH" = "arm64" ] && [ -d "$ROOTFS/lib/$LIB_ARCH" ]; then
+    echo "    Pruning duplicate glibc components in $ROOTFS/lib/..."
+    for lib in \
+        libc.so.6 \
+        libpthread.so.0 \
+        libdl.so.2 \
+        libm.so.6 \
+        libresolv.so.2 \
+        librt.so.1 \
+        libanl.so.1 \
+        libgcc_s.so.1; do
+        # Only delete piCore's copy when our version exists; otherwise
+        # we'd leave the binary unable to find any libc at all.
+        if [ -e "$ROOTFS/lib/$lib" ] && [ -e "$ROOTFS/lib/$LIB_ARCH/$lib" ]; then
+            rm -f "$ROOTFS/lib/$lib"
+        fi
+    done
+
+    # ld.so.conf gives our $LIB_ARCH paths precedence over piCore's /lib
+    # (defaults vary by glibc version; this makes the order explicit).
+    cat > "$ROOTFS/etc/ld.so.conf" <<EOF
+/lib/$LIB_ARCH
+/usr/lib/$LIB_ARCH
+/usr/local/lib
+/lib
+/usr/lib
+EOF
+
+    # Generate /etc/ld.so.cache. ldconfig -r treats $ROOTFS as the system
+    # root, so it reads ld.so.conf from there and writes the cache there.
+    # Works even cross-arch (it only parses ELF headers, doesn't execute).
+    if command -v ldconfig >/dev/null 2>&1; then
+        ldconfig -r "$ROOTFS" 2>/dev/null && \
+            echo "    Generated /etc/ld.so.cache via ldconfig" || \
+            echo "    WARN: ldconfig failed; falling back to default search order"
+    else
+        echo "    WARN: ldconfig not on builder; cache not generated"
+    fi
+fi
+
 # --- Summary ---
 echo ""
 echo "==> Injection complete. Rootfs contents:"
--- a/build/scripts/push-oci-artifact.sh
+++ b/build/scripts/push-oci-artifact.sh
@@ -0,0 +1,150 @@
+#!/bin/bash
+# push-oci-artifact.sh — Publish a KubeSolo OS update artifact to an OCI registry.
+#
+# Produces the artifact format consumed by `kubesolo-update --registry`:
+#
+#   <registry>/<repo>:<version>-<arch>     per-arch manifest, layers:
+#     * vmlinuz (Image on arm64)  → application/vnd.kubesolo.os.kernel.v1+octet-stream
+#     * kubesolo-os.gz            → application/vnd.kubesolo.os.initramfs.v1+gzip
+#     annotations:
+#       io.kubesolo.os.version
+#       io.kubesolo.os.channel
+#       io.kubesolo.os.architecture
+#       io.kubesolo.os.min_compatible_version (optional)
+#
+# After running this for each architecture, combine the per-arch tags into a
+# multi-arch index with `oras manifest index create` (see end of script).
+#
+# Requires: oras (>= 1.2), curl, jq.
+#
+# Usage:
+#   build/scripts/push-oci-artifact.sh \
+#       --registry ghcr.io/portainer/kubesolo-os \
+#       --arch amd64 \
+#       --channel stable \
+#       [--min-compatible-version v0.2.0]
+#
+# Authentication: oras reads ~/.docker/config.json. In CI, run
+#   `oras login ghcr.io -u USER -p TOKEN` before invoking this script
+# (or set DOCKER_CONFIG to a directory with config.json).
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+VERSION="$(cat "$PROJECT_ROOT/VERSION")"
+OUTPUT_DIR="$PROJECT_ROOT/output"
+CACHE_DIR="$PROJECT_ROOT/build/cache"
+
+REGISTRY=""
+ARCH=""
+CHANNEL="stable"
+MIN_COMPATIBLE_VERSION=""
+RELEASE_NOTES=""
+
+while [ $# -gt 0 ]; do
+    case "$1" in
+        --registry) REGISTRY="$2"; shift 2 ;;
+        --arch) ARCH="$2"; shift 2 ;;
+        --channel) CHANNEL="$2"; shift 2 ;;
+        --min-compatible-version) MIN_COMPATIBLE_VERSION="$2"; shift 2 ;;
+        --release-notes) RELEASE_NOTES="$2"; shift 2 ;;
+        *) echo "Unknown option: $1" >&2; exit 1 ;;
+    esac
+done
+
+if [ -z "$REGISTRY" ] || [ -z "$ARCH" ]; then
+    echo "Usage: $0 --registry REGISTRY/REPO --arch (amd64|arm64) [--channel stable] [--min-compatible-version vX.Y.Z]" >&2
+    exit 1
+fi
+
+if ! command -v oras >/dev/null 2>&1; then
+    echo "ERROR: oras CLI not found. Install from https://oras.land/docs/installation/" >&2
+    echo "  or apt-get install oras (Ubuntu 24.04+)" >&2
+    exit 1
+fi
+
+# Locate the artifacts. For arm64 the kernel is "Image"; everywhere else it's
+# "vmlinuz". Initramfs is always kubesolo-os.gz.
+case "$ARCH" in
+    amd64)
+        KERNEL="$CACHE_DIR/custom-kernel/vmlinuz"
+        [ -f "$KERNEL" ] || KERNEL="$OUTPUT_DIR/vmlinuz"
+        KERNEL_BASENAME="vmlinuz"
+        ;;
+    arm64)
+        KERNEL="$CACHE_DIR/kernel-arm64-generic/Image"
+        KERNEL_BASENAME="vmlinuz"  # we publish under the vmlinuz name regardless;
+                                   # the consumer looks up by media type, not filename.
+        ;;
+    *)
+        echo "ERROR: unsupported --arch $ARCH (use amd64 or arm64)" >&2
+        exit 1
+        ;;
+esac
+
+INITRAMFS="$PROJECT_ROOT/build/rootfs-work/kubesolo-os.gz"
+
+if [ ! -f "$KERNEL" ]; then
+    echo "ERROR: kernel not found at $KERNEL" >&2
+    echo "  Run 'make kernel' (amd64) or 'make kernel-arm64' (arm64) first." >&2
+    exit 1
+fi
+if [ ! -f "$INITRAMFS" ]; then
+    echo "ERROR: initramfs not found at $INITRAMFS" >&2
+    echo "  Run 'make initramfs' or 'make rootfs-arm64' first." >&2
+    exit 1
+fi
+
+# Stage files in a temp dir so the basenames in the manifest are clean.
+STAGE="$(mktemp -d)"
+trap 'rm -rf "$STAGE"' EXIT
+cp "$KERNEL" "$STAGE/$KERNEL_BASENAME"
+cp "$INITRAMFS" "$STAGE/kubesolo-os.gz"
+
+KERNEL_MEDIA="application/vnd.kubesolo.os.kernel.v1+octet-stream"
+INITRD_MEDIA="application/vnd.kubesolo.os.initramfs.v1+gzip"
+
+REF="${REGISTRY}:${VERSION}-${ARCH}"
+CHANNEL_REF="${REGISTRY}:${CHANNEL}-${ARCH}"
+
+echo "==> Pushing ${REF}"
+echo "    kernel:     $KERNEL ($(du -h "$KERNEL" | cut -f1))"
+echo "    initramfs:  $INITRAMFS ($(du -h "$INITRAMFS" | cut -f1))"
+
+ORAS_ANNOTATIONS=(
+    --annotation "io.kubesolo.os.version=${VERSION}"
+    --annotation "io.kubesolo.os.channel=${CHANNEL}"
+    --annotation "io.kubesolo.os.architecture=${ARCH}"
+)
+if [ -n "$MIN_COMPATIBLE_VERSION" ]; then
+    ORAS_ANNOTATIONS+=(--annotation "io.kubesolo.os.min_compatible_version=${MIN_COMPATIBLE_VERSION}")
+fi
+if [ -n "$RELEASE_NOTES" ]; then
+    ORAS_ANNOTATIONS+=(--annotation "io.kubesolo.os.release_notes=${RELEASE_NOTES}")
+fi
+ORAS_ANNOTATIONS+=(--annotation "io.kubesolo.os.release_date=$(date -u +%Y-%m-%dT%H:%M:%SZ)")
+
+# oras push: --artifact-type sets the manifest artifactType field;
+# file:type syntax sets per-layer media types.
+(cd "$STAGE" && oras push "$REF" \
+    --artifact-type "application/vnd.kubesolo.os.update.v1+json" \
+    "${ORAS_ANNOTATIONS[@]}" \
+    "${KERNEL_BASENAME}:${KERNEL_MEDIA}" \
+    "kubesolo-os.gz:${INITRD_MEDIA}")
+
+# Also tag as <channel>-<arch> so the manifest-index step can reference it
+# stably across patch releases.
+echo "==> Tagging ${CHANNEL_REF}"
+oras tag "$REF" "${CHANNEL}-${ARCH}"
+
+echo ""
+echo "==> Published:"
+echo "    ${REF}"
+echo "    ${CHANNEL_REF}"
+echo ""
+echo "To combine multi-arch into the channel index, run after both arches are pushed:"
+echo ""
+echo "  oras manifest index create ${REGISTRY}:${CHANNEL} \\"
+echo "      ${REGISTRY}:${CHANNEL}-amd64,platform=linux/amd64 \\"
+echo "      ${REGISTRY}:${CHANNEL}-arm64,platform=linux/arm64"
+echo ""
--- a/cloud-init/cmd/main.go
+++ b/cloud-init/cmd/main.go
@@ -97,6 +97,11 @@ func cmdApply(configPath string) error {
 		return fmt.Errorf("portainer edge agent: %w", err)
 	}

+	// 5. Write /etc/kubesolo/update.conf from updates: block (if any).
+	if err := cloudinit.ApplyUpdates(cfg, ""); err != nil {
+		return fmt.Errorf("updates: %w", err)
+	}
+
 	// 5. Save persistent configs for next boot
 	if err := cloudinit.SaveHostname(cfg, persistDataDir+"/etc-kubesolo"); err != nil {
 		slog.Warn("failed to save hostname", "error", err)
--- a/cloud-init/config.go
+++ b/cloud-init/config.go
@@ -12,12 +12,30 @@ package cloudinit

 // Config is the top-level cloud-init configuration.
 type Config struct {
-	Hostname string          `yaml:"hostname"`
-	Network  NetworkConfig   `yaml:"network"`
-	KubeSolo KubeSoloConfig  `yaml:"kubesolo"`
-	NTP      NTPConfig       `yaml:"ntp"`
-	Airgap   AirgapConfig    `yaml:"airgap"`
+	Hostname  string          `yaml:"hostname"`
+	Network   NetworkConfig   `yaml:"network"`
+	KubeSolo  KubeSoloConfig  `yaml:"kubesolo"`
+	NTP       NTPConfig       `yaml:"ntp"`
+	Airgap    AirgapConfig    `yaml:"airgap"`
 	Portainer PortainerConfig `yaml:"portainer"`
+	Updates   UpdatesConfig   `yaml:"updates"`
+}
+
+// UpdatesConfig configures the kubesolo-update agent. Written to
+// /etc/kubesolo/update.conf on first boot. See update/pkg/config.
+type UpdatesConfig struct {
+	// Server is the update server URL (HTTP or OCI registry).
+	Server string `yaml:"server"`
+	// Channel selects which channel to track ("stable", "beta", "edge").
+	// Empty = "stable".
+	Channel string `yaml:"channel"`
+	// MaintenanceWindow restricts apply to the given local time range,
+	// e.g. "03:00-05:00". Wrapping windows like "23:00-01:00" supported.
+	// Empty = no restriction.
+	MaintenanceWindow string `yaml:"maintenance_window"`
+	// PubKey is the path to the Ed25519 public key file used to verify
+	// signed update artifacts. Empty = signature verification disabled.
+	PubKey string `yaml:"pubkey"`
 }

 // NetworkConfig defines network settings.
@@ -40,6 +58,14 @@ type KubeSoloConfig struct {
 	PortainerEdgeID        string   `yaml:"portainer-edge-id"`
 	PortainerEdgeKey       string   `yaml:"portainer-edge-key"`
 	PortainerEdgeAsync     bool     `yaml:"portainer-edge-async"`
+	// v1.1.4+: skip edge-optimised overrides, use upstream k8s defaults
+	// (useful for CI and powerful machines, disabled by default).
+	Full bool `yaml:"full"`
+	// v1.1.5+: disable IPv6 in the cluster.
+	DisableIPv6 bool `yaml:"disable-ipv6"`
+	// v1.1.5+: detect SQLite WAL corruption on startup and recover from
+	// unclean shutdowns (e.g. power loss). Recommended ON for edge devices.
+	DBWALRepair bool `yaml:"db-wal-repair"`
 }

 // NTPConfig defines NTP settings.
--- a/cloud-init/examples/full-config.yaml
+++ b/cloud-init/examples/full-config.yaml
@@ -36,5 +36,50 @@ kubesolo:
  portainer-edge-key: "your-edge-key"
  portainer-edge-async: true

+  # KubeSolo v1.1.4+: skip the edge-optimised overrides and use upstream
+  # Kubernetes defaults. Useful for CI and high-spec machines. Default off.
+  full: false
+
+  # KubeSolo v1.1.5+: disable IPv6 throughout the cluster. Default off.
+  disable-ipv6: false
+
+  # KubeSolo v1.1.5+: detect SQLite WAL corruption at startup and recover
+  # from unclean shutdowns (e.g. power loss). Recommended ON for edge
+  # appliances that may lose power.
+  db-wal-repair: true
+
  # Arbitrary extra flags passed directly to the KubeSolo binary
  # extra-flags: "--disable traefik --disable servicelb"
+
+# Update agent settings (written to /etc/kubesolo/update.conf on first boot).
+# Omit any subfield to leave the corresponding default in place.
+updates:
+  # Update server URL — HTTPS for the JSON+blob protocol, or an OCI registry
+  # reference (e.g. ghcr.io/portainer/kubesolo-os) when OCI distribution
+  # lands in v0.3.
+  server: "https://updates.kubesolo.example.com"
+
+  # Channel to track. "stable" is the default; "beta"/"edge" expose
+  # pre-release artifacts. The agent refuses to apply metadata whose
+  # channel doesn't match.
+  channel: "stable"
+
+  # Maintenance window (local time, HH:MM-HH:MM, wrapping midnight OK).
+  # `apply` refuses to run outside this window unless --force is passed.
+  # Leave empty (or omit) to allow updates at any time.
+  maintenance_window: "03:00-05:00"
+
+  # Path to Ed25519 public key for signature verification. Omit to disable
+  # signature verification (NOT recommended for production fleets).
+  # pubkey: "/etc/kubesolo/update-pubkey.hex"
+
+  # Optional post-boot healthcheck probe URL. If set, healthcheck GETs it
+  # and treats anything other than HTTP 200 as a failure. Useful when your
+  # workload exposes its own readiness on a known endpoint.
+  # healthcheck_url: "http://localhost:8000/ready"
+
+  # Auto-rollback threshold: after N consecutive post-activation healthcheck
+  # failures, the agent triggers a rollback on its own. 0 disables the
+  # feature (the bootloader still does GRUB-counter-based rollback after
+  # 3 failed boots). Recommended: 3 for production fleets.
+  # auto_rollback_after: 3
--- a/cloud-init/kubesolo.go
+++ b/cloud-init/kubesolo.go
@@ -70,6 +70,18 @@ func buildExtraFlags(cfg *Config) string {
 		parts = append(parts, "--portainer-edge-async")
 	}

+	if cfg.KubeSolo.Full {
+		parts = append(parts, "--full")
+	}
+
+	if cfg.KubeSolo.DisableIPv6 {
+		parts = append(parts, "--disable-ipv6")
+	}
+
+	if cfg.KubeSolo.DBWALRepair {
+		parts = append(parts, "--db-wal-repair")
+	}
+
 	return strings.Join(parts, " ")
 }

--- a/cloud-init/updates.go
+++ b/cloud-init/updates.go
@@ -0,0 +1,57 @@
+package cloudinit
+
+import (
+	"fmt"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"strings"
+)
+
+// DefaultUpdateConfPath is where the update agent expects to find its config.
+// Kept in sync with update/pkg/config.DefaultPath.
+const DefaultUpdateConfPath = "/etc/kubesolo/update.conf"
+
+// ApplyUpdates writes /etc/kubesolo/update.conf from the cloud-init
+// updates: block. Called once per boot; idempotent (overwrites any existing
+// file with the cloud-init values).
+//
+// If the updates: block is empty (all fields blank), the file is not
+// written — preserves any hand-edited update.conf on systems that aren't
+// managed via cloud-init.
+func ApplyUpdates(cfg *Config, confPath string) error {
+	if confPath == "" {
+		confPath = DefaultUpdateConfPath
+	}
+	u := cfg.Updates
+	if u.Server == "" && u.Channel == "" && u.MaintenanceWindow == "" && u.PubKey == "" {
+		// Nothing to write — leave any existing file alone.
+		return nil
+	}
+
+	if err := os.MkdirAll(filepath.Dir(confPath), 0o755); err != nil {
+		return fmt.Errorf("creating dir for %s: %w", confPath, err)
+	}
+
+	var sb strings.Builder
+	sb.WriteString("# Generated by KubeSolo OS cloud-init — edit this file or the\n")
+	sb.WriteString("# cloud-init source YAML; subsequent first-boots will regenerate it.\n")
+	if u.Server != "" {
+		fmt.Fprintf(&sb, "server = %s\n", u.Server)
+	}
+	if u.Channel != "" {
+		fmt.Fprintf(&sb, "channel = %s\n", u.Channel)
+	}
+	if u.MaintenanceWindow != "" {
+		fmt.Fprintf(&sb, "maintenance_window = %s\n", u.MaintenanceWindow)
+	}
+	if u.PubKey != "" {
+		fmt.Fprintf(&sb, "pubkey = %s\n", u.PubKey)
+	}
+
+	if err := os.WriteFile(confPath, []byte(sb.String()), 0o644); err != nil {
+		return fmt.Errorf("writing %s: %w", confPath, err)
+	}
+	slog.Info("wrote update.conf", "path", confPath)
+	return nil
+}
--- a/cloud-init/updates_test.go
+++ b/cloud-init/updates_test.go
@@ -0,0 +1,81 @@
+package cloudinit
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+func TestApplyUpdatesEmptyConfigSkipsWrite(t *testing.T) {
+	confPath := filepath.Join(t.TempDir(), "update.conf")
+	cfg := &Config{} // Updates block default-zero
+	if err := ApplyUpdates(cfg, confPath); err != nil {
+		t.Fatalf("apply: %v", err)
+	}
+	if _, err := os.Stat(confPath); !os.IsNotExist(err) {
+		t.Errorf("expected no file when cloud-init Updates is empty, got %v", err)
+	}
+}
+
+func TestApplyUpdatesAllFields(t *testing.T) {
+	confPath := filepath.Join(t.TempDir(), "update.conf")
+	cfg := &Config{Updates: UpdatesConfig{
+		Server:            "https://updates.example.com",
+		Channel:           "stable",
+		MaintenanceWindow: "03:00-05:00",
+		PubKey:            "/etc/kubesolo/pub.hex",
+	}}
+	if err := ApplyUpdates(cfg, confPath); err != nil {
+		t.Fatalf("apply: %v", err)
+	}
+	data, err := os.ReadFile(confPath)
+	if err != nil {
+		t.Fatalf("read: %v", err)
+	}
+	out := string(data)
+
+	wants := []string{
+		"server = https://updates.example.com",
+		"channel = stable",
+		"maintenance_window = 03:00-05:00",
+		"pubkey = /etc/kubesolo/pub.hex",
+	}
+	for _, w := range wants {
+		if !strings.Contains(out, w) {
+			t.Errorf("update.conf missing %q in output:\n%s", w, out)
+		}
+	}
+}
+
+func TestApplyUpdatesPartialFields(t *testing.T) {
+	// Only server set — others should be omitted from the file, not written
+	// as blank values.
+	confPath := filepath.Join(t.TempDir(), "update.conf")
+	cfg := &Config{Updates: UpdatesConfig{Server: "https://x.example.com"}}
+	if err := ApplyUpdates(cfg, confPath); err != nil {
+		t.Fatalf("apply: %v", err)
+	}
+	data, _ := os.ReadFile(confPath)
+	out := string(data)
+	if !strings.Contains(out, "server = https://x.example.com") {
+		t.Errorf("missing server line:\n%s", out)
+	}
+	for _, unwanted := range []string{"channel = ", "maintenance_window = ", "pubkey = "} {
+		if strings.Contains(out, unwanted) {
+			t.Errorf("unexpected empty line %q present in:\n%s", unwanted, out)
+		}
+	}
+}
+
+func TestApplyUpdatesCreatesParentDir(t *testing.T) {
+	// /etc/kubesolo may not exist on first boot before cloud-init runs.
+	confPath := filepath.Join(t.TempDir(), "nested", "kubesolo", "update.conf")
+	cfg := &Config{Updates: UpdatesConfig{Server: "https://x"}}
+	if err := ApplyUpdates(cfg, confPath); err != nil {
+		t.Fatalf("apply: %v", err)
+	}
+	if _, err := os.Stat(confPath); err != nil {
+		t.Errorf("file not created: %v", err)
+	}
+}
--- a/docs/arm64-architecture.md
+++ b/docs/arm64-architecture.md
@@ -0,0 +1,124 @@
+# ARM64 Build Architecture
+
+KubeSolo OS supports ARM64 via two distinct build tracks. This document defines the
+split, lists which files belong to each track, and identifies the shared substrate.
+
+## The two tracks
+
+### Generic ARM64 (UEFI / virtio / GRUB)
+
+**Target:** Any UEFI-compliant ARM64 host — Ampere/Graviton VMs, generic ARM64
+servers, `qemu-system-aarch64 -machine virt`, future SBCs that boot via UEFI.
+
+**Boot path:** UEFI firmware → GRUB-EFI → kernel + initramfs → KubeSolo init.
+
+**Kernel:** Mainline Linux (kernel.org LTS), built from `defconfig` + shared
+container-config fragment.
+
+**Storage:** virtio-blk / NVMe / SATA — detected and probed by mainline drivers.
+
+**Disk image format:** GPT, identical 4-partition layout to x86_64 (EFI + System A
+ System B + Data).
+
+### Raspberry Pi ARM64
+
+**Target:** Raspberry Pi 4 and 5 specifically.
+
+**Boot path:** RPi EEPROM → VideoCore firmware (`start4.elf`) → `config.txt` →
+kernel + DTB + initramfs → KubeSolo init. (No UEFI, no GRUB — `autoboot.txt`
+provides the A/B selection.)
+
+**Kernel:** Built from `raspberrypi/linux` fork with `bcm2711_defconfig`
+(Pi 4) or `bcm2712_defconfig` (Pi 5). RPi-patched, includes BCM-specific drivers
+(sdhci-iproc, bcm2835-mmc, GPIO, mailbox).
+
+**Storage:** SD card via `sdhci-iproc` driver — requires kernel-built DTBs to match
+the kernel binary.
+
+**Disk image format:** MBR with `autoboot.txt` A/B redirect:
+- Part 1: Boot/Control (FAT32, firmware + fallback kernel)
+- Part 2: Boot A (FAT32, kernel + DTBs + initramfs)
+- Part 3: Boot B (FAT32, same as A initially)
+- Part 4: Data (ext4)
+
+## File-by-file ownership
+
+### Shared substrate (used by both tracks)
+
+| Path | Why shared |
+|------|------------|
+| `init/` (all of it) | Boot is identical post-kernel — same staged init, same persistent mount, same KubeSolo launch |
+| `cloud-init/` | Arch-agnostic Go binary |
+| `update/` | Arch-agnostic Go binary; bootenv abstraction handles GRUB vs RPi-autoboot variants |
+| `build/scripts/inject-kubesolo.sh` | Single script; switches `LIB_ARCH` / `LD_SO` based on `TARGET_ARCH` |
+| `build/scripts/extract-core.sh` | Single script; arm64 branch uses piCore64 userland (arch-agnostic BusyBox) |
+| `build/config/modules-arm64.list` | Already generic — no BCM-specific modules; works in QEMU virt, AWS Graviton, and RPi |
+| `build/config/rpi-kernel-config.fragment` | **Misnamed.** Contents (cgroup, namespaces, netfilter, AppArmor) are arch-agnostic. Will be renamed `kernel-container.fragment` in Phase 2 and applied to x86, generic-ARM64, and RPi kernels alike. |
+| `hack/dev-vm-arm64.sh` | Uses `-machine virt` + virtio — generic, not RPi-specific |
+| `test/qemu/test-boot-arm64.sh` | Same as above |
+
+### Generic ARM64 only (to be created in Phases 2–3)
+
+| Path | Purpose |
+|------|---------|
+| `build/scripts/build-kernel-arm64.sh` *(rewritten in Phase 2)* | Build mainline kernel.org LTS from `defconfig` + shared fragment + arm64-virt enables (`VIRTIO_BLK`, `EFI_STUB`). Replaces the existing RPi-flavoured script of the same name. |
+| `build/scripts/create-disk-image-arm64.sh` *(new in Phase 3)* | Build UEFI-bootable raw disk image (GPT + System A/B + Data) using `grub-efi-arm64`. Or fold into existing `create-disk-image.sh` with an arch parameter. |
+| `build/cache/kernel-arm64-generic/` | Build output for mainline ARM64 kernel — keep separate from RPi-kernel cache. |
+
+### Raspberry Pi only (to be renamed/reorganised in Phase 2)
+
+| Path | Purpose |
+|------|---------|
+| `build/scripts/build-kernel-rpi.sh` *(renamed from `build-kernel-arm64.sh`)* | Build kernel from `raspberrypi/linux` with `bcm2711_defconfig` + shared fragment + RPi-specific overrides. |
+| `build/scripts/create-rpi-image.sh` | Build SD card image (MBR + autoboot.txt + firmware blobs + DTBs). Already correctly scoped. |
+| `build/scripts/fetch-rpi-firmware.sh` | Download VideoCore firmware blobs from `raspberrypi/firmware`. Already correctly scoped. |
+| `build/config/rpi-kernel-overrides.fragment` *(new, Phase 2)* | Pi-specific kernel config knobs (DMA, audio off, etc.) layered on top of the shared container fragment. |
+| `build/cache/custom-kernel-rpi/` *(renamed from `custom-kernel-arm64/`)* | Build output for RPi kernel — DTBs, modules, Image. |
+| `versions.env` keys: `RPI_KERNEL_BRANCH`, `RPI_KERNEL_REPO`, `RPI_FIRMWARE_TAG`, `RPI_FIRMWARE_URL`, `PICORE_*` | Already correctly named. |
+
+## Make targets
+
+| Target | Track |
+|--------|-------|
+| `make iso` | x86_64 |
+| `make disk-image` | x86_64 |
+| `make kernel` | x86_64 |
+| `make kernel-arm64` *(Phase 2: now builds mainline)* | Generic ARM64 |
+| `make rootfs-arm64` | Generic ARM64 (and reusable for RPi rootfs) |
+| `make disk-image-arm64` *(Phase 3: new)* | Generic ARM64 |
+| `make kernel-rpi` *(Phase 2: renamed from former kernel-arm64)* | RPi |
+| `make rpi-image` | RPi |
+
+## Why two tracks, not one
+
+The RPi boot path is fundamentally different from generic ARM64:
+
+- **No UEFI.** RPi boots through a multi-stage firmware chain that ends with
+  `config.txt` parsing and direct kernel load. UEFI/GRUB is not an option without
+  third-party firmware (which has its own bugs).
+- **DTB required.** RPi kernel needs a device tree blob matching the kernel binary;
+  generic ARM64 under UEFI uses ACPI or self-describing virtio.
+- **Custom drivers.** SD card (sdhci-iproc), GPIO, mailbox interfaces require
+  RPi-patched kernel sources. Mainline support exists but lags behind the
+  raspberrypi/linux fork for new boards.
+- **A/B selection mechanism.** RPi uses `autoboot.txt` + EEPROM cooperation; generic
+  ARM64 uses GRUB's `boot_default`/`boot_counter` envvars (same as x86_64).
+
+Trying to unify into a single track would force compromises in both. Two tracks
+sharing the post-kernel substrate (init, cloud-init, update agent) gives us the best
+of both: code reuse where it makes sense, divergence only where the hardware demands
+it.
+
+## Migration plan
+
+This document is descriptive of the **target** v0.3.0 layout. The current code
+(as of v0.2.0) has:
+
+- `build/scripts/build-kernel-arm64.sh` building the RPi kernel (will be renamed in
+  Phase 2).
+- `build/config/rpi-kernel-config.fragment` containing generic configs (will be
+  renamed in Phase 2).
+- No generic ARM64 kernel script (will be created in Phase 2).
+- No generic ARM64 disk image script (will be created in Phase 3).
+
+Phases 2 and 3 of the v0.3.0 plan execute the migration.
--- a/docs/arm64-status.md
+++ b/docs/arm64-status.md
@@ -0,0 +1,125 @@
+# ARM64 Generic Status (v0.3 in-progress)
+
+End-of-Phase-3 snapshot of the generic ARM64 build track.
+
+## What works
+
+End-to-end boot through QEMU on an Odroid (aarch64 Ubuntu 22.04 build host):
+
+1. `make kernel-arm64` produces a mainline 6.12.10 LTS kernel (44 MB Image, 868
+   modules)
+2. `make rootfs-arm64` extracts piCore64 userland, replaces BusyBox with
+   Ubuntu's static busybox-static, injects KubeSolo + Go agents + init scripts
+3. `make disk-image-arm64` produces a UEFI-bootable 4 GB GPT image with GRUB
+   A/B slots
+4. `hack/dev-vm-arm64.sh --disk` boots the image:
+   - UEFI firmware loads GRUB
+   - GRUB loads kernel + initramfs
+   - Custom init runs all 14 stages (early-mount, parse-cmdline, persistent-mount,
+     kernel-modules, apparmor, sysctl, cloud-init, network, hostname, clock,
+     containerd, security-lockdown, kubesolo)
+   - Data partition mounts (ext4 on vda4)
+   - Network configured (DHCP on virtio eth0)
+   - KubeSolo starts; containerd boots successfully; CoreDNS + pause images
+     register
+
+## Known limitations of the current dev setup
+
+These are debugging-environment issues, not production blockers:
+
+### 1. QEMU TCG performance hits KubeSolo's image-import deadline
+
+KubeSolo bundles its essential container images and imports them into
+containerd on first boot. Under QEMU TCG (software emulation on the Odroid's
+1.8 GB / 6-core ARM64), the import takes longer than KubeSolo's internal
+deadline, so we see:
+
+```
+failed to import images: ... context deadline exceeded
+shutdown requested before containerd was ready
+```
+
+On real ARM64 hardware (Graviton, Ampere, RPi 5, etc.) this import completes
+in seconds. KVM acceleration on the Odroid would also fix it, but the
+Odroid's vendor kernel (4.9.337-38) doesn't ship the KVM module — fixing that
+requires a host-kernel upgrade outside this project's scope.
+
+### 2. Hardcoded `/dev/vda4` data partition path
+
+Stage 20 currently expects `kubesolo.data=/dev/vda4` rather than
+`LABEL=KSOLODATA`. The LABEL= path is preferred (works regardless of disk
+naming on different hosts), but resolution depends on `blkid` and `findfs`,
+which:
+
+- piCore64 ships as dynamic util-linux binaries that crash in QEMU virt
+- Ubuntu's `busybox-static` 1.30.1 doesn't include the applets
+
+Production fix options (deferred to next phase):
+
+- Build a more comprehensive static BusyBox (Alpine's, or upstream + custom config)
+- Ship statically-linked `blkid` and `findfs` from util-linux
+- Replace LABEL resolution with a sysfs walk that reads `/sys/class/block/*/holders`
+  and `/dev/<n>` device numbers
+
+### 3. AppArmor profiles fail to load
+
+`apparmor_parser` errors on the containerd and kubelet profiles, probably
+because the parser binary or libraries copied from the build host don't
+match the rootfs's libc layout. Boot proceeds without AppArmor enforcement.
+Same fix path as #2 (better static binaries).
+
+### 4. piCore64 BusyBox swap is a build-host dependency
+
+`inject-kubesolo.sh` replaces piCore's `/bin/busybox` with the build host's
+`/bin/busybox` (Ubuntu's busybox-static package). That binary must exist on
+the build host or in the builder Docker image. Documented; works in CI
+because the Dockerfile installs busybox-static.
+
+A more reproducible approach (future work): ship a known-good ARM64 BusyBox
+binary as a tracked artifact rather than depending on the host package.
+
+### 5. busybox-static 1.30.1 has its own bugs
+
+Even after the swap, some applets misbehave inside QEMU:
+
+- `modprobe` triggers "stack smashing detected" abort (kernel modules still
+  load via direct write to /sys/... in stage 30, so this isn't fatal)
+- `tr` doesn't parse POSIX character classes like `[:space:]` — already
+  worked around by using explicit `' \t\r\n'` in our scripts
+- Missing applets: `blkid`, `findfs`, `--version`, etc.
+
+These won't necessarily manifest on real hardware (different CPU, different
+glibc interaction) but they confirm that 1.30.1 isn't the right long-term
+BusyBox.
+
+## What's needed to ship v0.3 ARM64 as production-ready
+
+In order of priority:
+
+1. **Validate on real ARM64 hardware** — boot the image on a Graviton EC2
+   instance, Ampere VPS, RPi 5 (when hardware available), or any UEFI-capable
+   ARM64 board. Confirm full KubeSolo bring-up: node Ready, pods schedule.
+2. **Fix LABEL=KSOLODATA resolution** — see option list in #2 above.
+3. **Replace busybox-static with a curated build** — see #4.
+4. **Add a Gitea workflow** that runs `make kernel-arm64 + disk-image-arm64`
+   on the Odroid runner and the QEMU boot-test as a smoke test (with the
+   expectation that KubeSolo doesn't finish first-boot under TCG).
+
+## Files exercised by the Phase 3 work
+
+| Path | Status |
+|------|--------|
+| `build/scripts/build-kernel-arm64.sh` | New — mainline 6.12.10 kernel build, native or cross |
+| `build/scripts/build-kernel-rpi.sh` | Renamed from old `build-kernel-arm64.sh` — RPi path |
+| `build/config/kernel-container.fragment` | Renamed from `rpi-kernel-config.fragment` |
+| `build/scripts/create-disk-image.sh` | Refactored — accepts `TARGET_ARCH=arm64` |
+| `build/grub/grub-arm64.cfg` | New — ARM64 console + `init=/sbin/init` |
+| `build/scripts/inject-kubesolo.sh` | Updated — BusyBox swap, `/init` install, variant routing |
+| `init/init.sh` | Updated — output to `/dev/console` for early-boot visibility |
+| `init/lib/30-kernel-modules.sh` | Fixed — `tr -d ' \t\r\n'` instead of `[:space:]` |
+| `init/lib/40-sysctl.sh` | Same fix |
+| `hack/dev-vm-arm64.sh` | Updated — `-cpu max`, UEFI `--disk` mode |
+| `test/qemu/test-boot-arm64-disk.sh` | New — CI test for UEFI boot |
+| `Makefile` | New targets: `kernel-arm64`, `kernel-rpi`, `disk-image-arm64`, `test-boot-arm64-disk`, `rootfs-arm64-rpi` |
+| `build/config/versions.env` | Pinned `MAINLINE_KERNEL_VERSION=6.12.10`, `KUBESOLO_VERSION=v1.1.0` |
+| `build/Dockerfile.builder` | Added `grub-efi-amd64-bin`, `grub-efi-arm64-bin`, `busybox-static` |
--- a/docs/ci-runners.md
+++ b/docs/ci-runners.md
@@ -0,0 +1,222 @@
+# CI Runners
+
+KubeSolo OS is built and tested on Gitea Actions runners. This document records the
+runners currently in service and how to register a new one if a host is wiped.
+
+## Active runners
+
+| Name | Host | Arch | OS | Labels | Notes |
+|------|------|------|-----|--------|-------|
+| `odroid-arm64` | `odroid.local` | aarch64 | Ubuntu 22.04 LTS | `arm64-linux`, `ubuntu-latest`, `ubuntu-24.04`, `ubuntu-22.04` | Native ARM64 builder; 6 cores, 1.8 GB RAM + 4 GB swap; runs as systemd service `act_runner` |
+
+## Workflow targeting
+
+ARM64-specific jobs target the Odroid via the `arm64-linux` label:
+
+```yaml
+jobs:
+  build-arm64:
+    runs-on: arm64-linux
+    steps:
+      - uses: actions/checkout@v4
+      - run: make rootfs-arm64
+```
+
+Generic ubuntu jobs that don't care about arch fall through to whichever runner picks
+them up first; on the Odroid they run in Docker via the `ubuntu-latest` /
+`ubuntu-22.04` / `ubuntu-24.04` labels.
+
+## Workflows in this repo
+
+| Workflow file | Trigger | Where it runs | What it produces |
+|---|---|---|---|
+| `.gitea/workflows/ci.yaml` | push / PR to main | ubuntu-latest | Go tests, cross-arch binary build, shellcheck |
+| `.gitea/workflows/build-arm64.yaml` | push to main, tags `v*`, manual | `arm64-linux` (Odroid) | ARM64 kernel + rootfs + disk image; uploads as workflow artifact only |
+| `.gitea/workflows/release.yaml` | tags `v*` | mix: ubuntu-latest + `arm64-linux` | Full release: x86 ISO + disk, ARM64 disk, Go binaries, SHA256SUMS — posted to Gitea Releases via API |
+
+### Release workflow specifics
+
+`release.yaml` is what fires when you `git push origin vX.Y.Z`. The pipeline:
+
+1. **test** — `go test` cloud-init + update modules (ubuntu-latest).
+2. **build-binaries** — cross-compiles `kubesolo-cloudinit` and
+   `kubesolo-update` for linux-amd64 + linux-arm64 with the version baked
+   in via `-X main.version=…`.
+3. **build-iso-amd64** — runs `make iso disk-image` on ubuntu-latest;
+   produces the x86_64 ISO and a `.img.xz` compressed disk image.
+4. **build-disk-arm64** — runs the same flow on the Odroid (`arm64-linux`
+   label); produces `.arm64.img.xz`.
+5. **release** — downloads everything, computes `SHA256SUMS`, calls
+   Gitea's `POST /api/v1/repos/<owner>/<repo>/releases` to create the
+   release, then `POST .../releases/<id>/assets?name=…` once per asset.
+
+Authentication uses Gitea's built-in `${{ secrets.GITHUB_TOKEN }}` — the
+runner auto-populates that secret with repo-write scope. If your runner
+is configured without that automatic token (e.g. an older `act_runner`),
+generate a personal access token with `repo:write` scope, add it as an
+org secret named `GITEA_TOKEN`, and swap the `TOKEN: ${{ secrets.GITHUB_TOKEN }}`
+line in `release.yaml` for `TOKEN: ${{ secrets.GITEA_TOKEN }}`.
+
+### Why not the GitHub Marketplace release actions?
+
+`release.yaml` used to call `softprops/action-gh-release@v2`. That action
+hard-codes calls to `api.github.com` instead of using `${{ github.api_url }}`
+(which Gitea sets to its own API). On Gitea's act_runner the action fails
+silently — the job reports green but no release is created. We replaced
+it with a direct `curl` so the behaviour is explicit and debuggable.
+
+Similarly, `actions/upload-artifact@v4` and `@download-artifact@v4` are not
+fully implemented by act_runner v1.0.x. Pin to `@v3` until upstream
+support catches up.
+
+### Manually re-running a release
+
+Releases are immutable once published, but you can:
+
+- **Delete and recreate the release** through the Gitea UI on the
+  `releases/tag/vX.Y.Z` page, then push the tag again (Gitea reuses the
+  existing tag), and re-trigger the workflow via the Actions UI.
+- **Trigger the build-arm64 workflow manually** for a one-off arm64
+  artifact: Gitea UI → Actions → ARM64 Build → Run workflow.
+
+Don't force-update a published tag — anyone who already fetched it (or
+downloaded an asset) sees a checksum mismatch. Prefer cutting a new patch
+release (vX.Y.Z+1) over rewriting a published one.
+
+## Registering a new runner
+
+### Prerequisites
+
+- Linux host (Ubuntu / Debian preferred; the install instructions below use Ubuntu
+  22.04+ paths).
+- Outbound HTTPS to the Gitea instance.
+- Root access on the runner host (the runner needs to create loop devices and run
+  `mkfs.ext4` for disk-image builds).
+- A Gitea Actions runner registration token. Get it from:
+  - **Repo-scoped:** `<repo>/settings/actions/runners` → "Create new Runner"
+  - **Org-scoped (preferred for this project):** `<org>/-/settings/actions/runners` →
+    "Create new Runner"
+  - **Site-scoped:** `/-/admin/actions/runners` → "Create new Runner"
+
+### Step 1 — Add swap if the host has <4 GB RAM
+
+Kernel builds in later phases need ~2 GB resident; tight hosts will OOM-kill `cc1`
+without swap.
+
+```bash
+sudo fallocate -l 4G /swapfile
+sudo chmod 600 /swapfile
+sudo mkswap /swapfile
+sudo swapon /swapfile
+echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab
+```
+
+### Step 2 — Install the gitea-runner binary
+
+Pinned to a known-good version. Check
+<https://gitea.com/gitea/runner/releases> for the current stable tag before
+bumping.
+
+```bash
+sudo -i
+mkdir -p /opt/act_runner && cd /opt/act_runner
+
+# Bump VERSION to the current stable release as needed
+VERSION=1.0.3
+ARCH=$(uname -m | sed 's/aarch64/arm64/; s/x86_64/amd64/')
+
+curl -fL "https://gitea.com/gitea/runner/releases/download/v${VERSION}/gitea-runner-${VERSION}-linux-${ARCH}" \
+  -o act_runner
+chmod +x act_runner
+./act_runner --version
+```
+
+> The upstream project was renamed `act_runner` → `gitea-runner` at the v1.0.0
+> release. The release asset filenames use `gitea-runner-*` even though we keep the
+> local binary named `act_runner` to match this systemd unit. The CLI surface
+> (`register`, `daemon`, `generate-config`) is unchanged.
+
+### Step 3 — Register against Gitea
+
+```bash
+./act_runner register --no-interactive \
+  --instance https://git.oe74.net \
+  --token PASTE_TOKEN_HERE \
+  --name <hostname> \
+  --labels arm64-linux        # adjust label for amd64 hosts
+```
+
+This creates a `.runner` file with the registration credentials.
+
+### Step 4 — Generate and tune config
+
+```bash
+./act_runner generate-config > config.yaml
+```
+
+In `config.yaml`, confirm the `runner.labels:` block includes the labels you want.
+The `:host` suffix routes jobs directly to the host (no Docker wrapper) — required
+for disk-image builds that need loop devices and `mkfs`.
+
+Example labels for an arm64 host:
+
+```yaml
+runner:
+  labels:
+    - "arm64-linux:host"
+    - "ubuntu-latest:docker://docker.gitea.com/runner-images:ubuntu-latest"
+    - "ubuntu-24.04:docker://docker.gitea.com/runner-images:ubuntu-24.04"
+    - "ubuntu-22.04:docker://docker.gitea.com/runner-images:ubuntu-22.04"
+```
+
+### Step 5 — Install as a systemd service
+
+```bash
+cat > /etc/systemd/system/act_runner.service << 'EOF'
+[Unit]
+Description=Gitea Actions runner
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+ExecStart=/opt/act_runner/act_runner daemon --config /opt/act_runner/config.yaml
+WorkingDirectory=/opt/act_runner
+User=root
+Restart=always
+RestartSec=5
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+systemctl daemon-reload
+systemctl enable --now act_runner
+systemctl status act_runner --no-pager
+```
+
+### Step 6 — Verify in Gitea UI
+
+Visit the runners page at the scope you registered against. The runner should appear
+as `Idle` with the labels you configured.
+
+## Removing a runner
+
+On the host:
+
+```bash
+systemctl disable --now act_runner
+rm -rf /opt/act_runner /etc/systemd/system/act_runner.service
+systemctl daemon-reload
+```
+
+Then delete the runner entry from the Gitea Actions UI so Gitea stops trying to
+schedule against it.
+
+## Operational notes
+
+- The runner stores in-progress job working directories under `/tmp/act_runner` by
+  default. Large disk-image builds may need that path moved to a larger volume —
+  edit `host.workdir_parent:` in `config.yaml`.
+- Logs are visible via `journalctl -u act_runner -f`.
+- If a job is interrupted (e.g. host reboot mid-build), the Gitea UI will mark it as
+  failed/cancelled. Re-run from the Actions UI.
--- a/docs/release-notes-0.3.0.md
+++ b/docs/release-notes-0.3.0.md
@@ -0,0 +1,181 @@
+# KubeSolo OS v0.3.0 — Release Notes
+
+**Released:** 2026-05-14
+
+v0.3.0 is the second feature release after v0.2.0 and the first release that
+ships a generic ARM64 build alongside x86_64. The update agent grew up: it
+now has an explicit on-disk lifecycle, OCI registry distribution, and a
+fleet-friendly set of policy gates (channels, maintenance windows,
+version-stepping-stones, pre-flight checks, auto-rollback).
+
+This document is the operator-facing summary. The full per-phase changelog
+lives in [CHANGELOG.md](../CHANGELOG.md).
+
+## What's new
+
+### Generic ARM64 build
+
+The image you build with `make disk-image-arm64` now targets any UEFI-capable
+ARM64 host: AWS Graviton, Oracle Ampere, generic ARM64 servers, future SBCs
+with UEFI-compatible firmware. The kernel comes from kernel.org mainline LTS
+(6.12.10 by default, configurable via `MAINLINE_KERNEL_VERSION` in
+`build/config/versions.env`).
+
+This is **distinct** from the Raspberry Pi build path. RPi keeps its
+specialised kernel from `raspberrypi/linux` with bcm-defconfig + custom DTBs;
+the generic ARM64 path uses mainline + arm64-defconfig + UEFI/virtio. See
+[docs/arm64-architecture.md](arm64-architecture.md) for the file-by-file
+split.
+
+KubeSolo bumped to **v1.1.5** (was v1.1.0). New flags surfaced via cloud-init:
+- `kubesolo.full` — disable edge-optimised k8s overrides
+- `kubesolo.disable-ipv6` — disable IPv6 cluster-wide
+- `kubesolo.db-wal-repair` — recover from unclean shutdowns
+
+### Update lifecycle is now observable
+
+The update agent writes a `state.json` at `/var/lib/kubesolo/update/state.json`
+recording where the current attempt is in the lifecycle:
+
+```
+idle → checking → downloading → staged → activated → verifying → success
+                                                              ↘ rolled_back
+                                                              ↘ failed
+```
+
+`kubesolo-update status --json` emits the full state for orchestration tooling.
+The Prometheus metrics endpoint gains three new series:
+
+- `kubesolo_update_phase{phase="..."}` — 1 for current phase, 0 for others (all 9 always emitted)
+- `kubesolo_update_attempts_total`
+- `kubesolo_update_last_attempt_timestamp_seconds`
+
+### OCI registry distribution
+
+Update artifacts can now be pulled from any OCI-compliant registry alongside
+the existing HTTP `latest.json` protocol:
+
+```bash
+# HTTP, unchanged from v0.2:
+kubesolo-update apply --server https://updates.example.com
+
+# New: OCI from ghcr.io (or quay.io, harbor, zot, ...)
+kubesolo-update apply --registry ghcr.io/yourorg/kubesolo-os --tag stable
+```
+
+Multi-arch is handled transparently — the same `stable` tag points at a
+manifest index, the agent picks the manifest matching its `runtime.GOARCH`.
+
+Publish your own artifacts with `build/scripts/push-oci-artifact.sh`. See
+the script's header comment for the full publishing flow.
+
+### Policy gates
+
+`apply` now enforces five gates before destroying the passive slot:
+
+1. **Maintenance window** (configurable, e.g. `03:00-05:00`; wrapping
+   midnight supported)
+2. **Node-block-label** — refuses if the K8s node carries
+   `updates.kubesolo.io/block=true` (workload-author kill switch)
+3. **Channel** — `stable` / `beta` / `edge` must match between the artifact
+   metadata and the local channel
+4. **Architecture** — refuses cross-arch artifacts via `runtime.GOARCH` check
+5. **Min compatible version** — stepping-stone enforcement; refuses an
+   upgrade that bypasses a required intermediate version
+
+`--force` bypasses the maintenance window and node-block label (channel /
+arch / min-version are non-negotiable). Failures are recorded in `state.json`
+with a clear `LastError` field.
+
+### Healthcheck deepening + auto-rollback
+
+`kubesolo-update healthcheck` grew three optional probes:
+
+- **Kube-system pods** must hold Running for ≥ N seconds before passing
+- **Operator probe URL** — GET an operator-supplied endpoint; 200 = pass
+- **Disk smoke test** — write/fsync/read/delete a probe file under
+  `/var/lib/kubesolo` to catch a wedged data partition
+
+Plus auto-rollback: with `--auto-rollback-after N` (or `auto_rollback_after=`
+in `update.conf`), after N consecutive post-activation failures, the agent
+calls `ForceRollback()` and the operator/init is expected to reboot. The
+counter resets on a clean pass.
+
+### Persistent configuration via `/etc/kubesolo/update.conf`
+
+Cloud-init writes this file on first boot from a new `updates:` block; you
+can also hand-edit it. Recognised keys:
+
+```
+server = https://updates.example.com         # or omit if using registry
+registry =                                   # OCI registry ref (alt to server)
+channel = stable
+maintenance_window = 03:00-05:00
+pubkey = /etc/kubesolo/update-pubkey.hex
+healthcheck_url = http://localhost:8000/ready
+auto_rollback_after = 3
+```
+
+Cloud-init full reference at
+[cloud-init/examples/full-config.yaml](../cloud-init/examples/full-config.yaml).
+
+## Migration from v0.2.x
+
+This is a non-breaking release for live systems. v0.2.x → v0.3.0 changes:
+
+- **`state.json` will appear** at `/var/lib/kubesolo/update/state.json` the
+  first time a v0.3 agent runs `apply`. Pre-existing v0.2 deployments without
+  this file are fine — the agent treats a missing file as fresh Idle state.
+- **`update.conf` is optional**. v0.2 deployments that pass everything via
+  CLI flags keep working unchanged.
+- **HTTP `latest.json` protocol unchanged**. Existing update servers don't
+  need a rebuild.
+- **GRUB env (boot counter, active slot)** unchanged. The bootloader's
+  rollback behaviour is the same.
+- **No new mandatory kernel command-line parameters**.
+
+To opt into the new lifecycle, transports, and gates, drop in an
+`update.conf` (or update cloud-init) and switch to `--registry` if you want
+OCI distribution.
+
+## Known limitations
+
+These shipped intentionally with v0.3.0 and are explicitly tracked for
+v0.3.1+:
+
+- **OCI signature verification** — the OCI transport is digest-verified
+  end-to-end via oras-go, but does not yet consume cosign-style referrer
+  attestations. The HTTP transport still honours `--pubkey` for `.sig`
+  files.
+- **ARM64 LABEL=KSOLODATA** resolution doesn't work yet — piCore's
+  `blkid`/`findfs` crash on QEMU virt under our mainline kernel; the
+  static `busybox-static` we ship doesn't include those applets.
+  `build/grub/grub-arm64.cfg` hardcodes `kubesolo.data=/dev/vda4` as a
+  workaround. On real ARM64 hardware the device path may differ.
+- **Real-hardware ARM64 validation** is pending. The image builds and
+  boots end-to-end under QEMU virt; production certification waits on a
+  Graviton / Ampere run.
+- **AppArmor profile load fails on ARM64** (`apparmor_parser` ABI mismatch).
+  Init reports the failure; boot continues without AppArmor enforcement.
+- **QEMU TCG performance** can trigger KubeSolo's first-boot image-import
+  deadline. Not an OS defect; real hardware and KVM-accelerated QEMU
+  complete the import in seconds.
+
+## How to upgrade your build host
+
+```bash
+git pull
+make distclean   # optional — drops the build cache; full rebuild takes ~30 min
+make iso         # or disk-image, or disk-image-arm64
+```
+
+The Docker-based builder (`make docker-build`) regenerates its own image
+from `build/Dockerfile.builder` on next invocation; oras 1.2.3 and
+busybox-static are now included.
+
+## Acknowledgements
+
+v0.3.0 work was driven by a single multi-week pair-programming session
+working through Phases 0–9 of the v0.3 roadmap. The Odroid self-hosted
+Gitea Actions runner (`odroid.local`, arm64-linux) carried every ARM64
+build during development.
--- a/hack/dev-vm-arm64.sh
+++ b/hack/dev-vm-arm64.sh
@@ -1,64 +1,167 @@
 #!/bin/bash
 # dev-vm-arm64.sh — Launch ARM64 QEMU VM for development
 #
-# Uses qemu-system-aarch64 with -machine virt to emulate an ARM64 system.
-# This is useful for testing ARM64/RPi builds on x86_64 hosts.
+# Two modes:
+#
+#   Default (direct kernel boot — fast iteration):
+#     qemu loads the kernel Image + initramfs directly via -kernel/-initrd.
+#     Skips bootloader, UEFI firmware, and disk image entirely.
+#     Use this for kernel and init-script changes.
+#
+#   --disk (full UEFI boot — integration testing):
+#     qemu boots the .arm64.img disk image via UEFI firmware -> GRUB -> kernel.
+#     Exercises the full boot chain. Use this when changing the disk image
+#     layout, GRUB config, or anything that touches the EFI partition.
 #
 # Usage:
-#   ./hack/dev-vm-arm64.sh                           # Use default kernel + initramfs
-#   ./hack/dev-vm-arm64.sh <kernel> <initramfs>      # Specify custom paths
-#   ./hack/dev-vm-arm64.sh --debug                   # Enable debug logging
-#   ./hack/dev-vm-arm64.sh --shell                   # Drop to emergency shell
+#   ./hack/dev-vm-arm64.sh                       # direct kernel boot (default)
+#   ./hack/dev-vm-arm64.sh --disk                # full UEFI boot from built image
+#   ./hack/dev-vm-arm64.sh --debug               # enable kubesolo.debug
+#   ./hack/dev-vm-arm64.sh --shell               # drop to emergency shell
+#   ./hack/dev-vm-arm64.sh --disk /path/to.img   # boot a specific disk image
+#   ./hack/dev-vm-arm64.sh <kernel> <initramfs>  # direct boot with custom files
 set -euo pipefail

 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+VERSION="$(cat "$PROJECT_ROOT/VERSION")"

+MODE="kernel"           # kernel | disk
 VMLINUZ=""
 INITRD=""
+DISK_IMAGE=""
 EXTRA_APPEND=""

-# Parse arguments
-for arg in "$@"; do
-    case "$arg" in
-        --shell)   EXTRA_APPEND="$EXTRA_APPEND kubesolo.shell" ;;
-        --debug)   EXTRA_APPEND="$EXTRA_APPEND kubesolo.debug" ;;
-        *)
-            if [ -z "$VMLINUZ" ]; then
-                VMLINUZ="$arg"
-            elif [ -z "$INITRD" ]; then
-                INITRD="$arg"
+while [ $# -gt 0 ]; do
+    case "$1" in
+        --shell)   EXTRA_APPEND="$EXTRA_APPEND kubesolo.shell"; shift ;;
+        --debug)   EXTRA_APPEND="$EXTRA_APPEND kubesolo.debug"; shift ;;
+        --disk)
+            MODE="disk"
+            shift
+            # Optional next-arg as disk image path
+            if [ $# -gt 0 ] && [ -f "$1" ]; then
+                DISK_IMAGE="$1"
+                shift
            fi
            ;;
+        *)
+            if [ "$MODE" = "kernel" ] && [ -z "$VMLINUZ" ]; then
+                VMLINUZ="$1"
+            elif [ "$MODE" = "kernel" ] && [ -z "$INITRD" ]; then
+                INITRD="$1"
+            fi
+            shift
+            ;;
    esac
 done

-# Defaults
-VMLINUZ="${VMLINUZ:-$PROJECT_ROOT/build/cache/custom-kernel-arm64/Image}"
+# ---------------------------------------------------------------------------
+# UEFI firmware probe (used for --disk mode)
+# ---------------------------------------------------------------------------
+find_uefi_firmware() {
+    local candidates=(
+        /usr/share/qemu-efi-aarch64/QEMU_EFI.fd
+        /usr/share/AAVMF/AAVMF_CODE.fd
+        /usr/share/edk2/aarch64/QEMU_EFI.fd
+        /usr/share/qemu/edk2-aarch64-code.fd
+        /opt/homebrew/share/qemu/edk2-aarch64-code.fd
+        /usr/local/share/qemu/edk2-aarch64-code.fd
+    )
+    for f in "${candidates[@]}"; do
+        [ -f "$f" ] && echo "$f" && return 0
+    done
+    return 1
+}
+
+# ---------------------------------------------------------------------------
+# mkfs.ext4 probe (kernel mode creates a scratch data disk)
+# ---------------------------------------------------------------------------
+find_mkfs_ext4() {
+    if command -v mkfs.ext4 >/dev/null 2>&1; then
+        echo "mkfs.ext4"
+    elif [ -x "/opt/homebrew/opt/e2fsprogs/sbin/mkfs.ext4" ]; then
+        echo "/opt/homebrew/opt/e2fsprogs/sbin/mkfs.ext4"
+    elif [ -x "/usr/local/opt/e2fsprogs/sbin/mkfs.ext4" ]; then
+        echo "/usr/local/opt/e2fsprogs/sbin/mkfs.ext4"
+    fi
+}
+
+# ===========================================================================
+# Disk mode: boot the built .arm64.img through UEFI firmware + GRUB
+# ===========================================================================
+if [ "$MODE" = "disk" ]; then
+    DISK_IMAGE="${DISK_IMAGE:-$PROJECT_ROOT/output/kubesolo-os-${VERSION}.arm64.img}"
+
+    if [ ! -f "$DISK_IMAGE" ]; then
+        echo "ERROR: Disk image not found: $DISK_IMAGE"
+        echo "  Run 'make disk-image-arm64' to build it."
+        exit 1
+    fi
+
+    UEFI_FW="$(find_uefi_firmware || true)"
+    if [ -z "$UEFI_FW" ]; then
+        echo "ERROR: No ARM64 UEFI firmware found."
+        echo "  Install one of:"
+        echo "    apt install qemu-efi-aarch64        # Debian/Ubuntu"
+        echo "    dnf install edk2-aarch64            # Fedora/RHEL"
+        echo "    brew install qemu                   # macOS (bundled)"
+        exit 1
+    fi
+
+    # Pad UEFI firmware variable store to 64 MiB if QEMU expects pflash sizing.
+    # Most ARM64 EFI .fd files are 64 MB; if yours is smaller, QEMU may refuse.
+    echo "==> Launching ARM64 QEMU (UEFI disk boot)..."
+    echo "    Firmware: $UEFI_FW"
+    echo "    Disk:     $DISK_IMAGE"
+    echo ""
+    echo "    K8s API: localhost:6443"
+    echo "    SSH:     localhost:2222"
+    echo "    Press Ctrl+A X to exit QEMU"
+    echo ""
+
+    # -cpu max enables all emulated ARMv8 features (atomics, crypto, fp16).
+    # piCore64's BusyBox is built with -march=armv8-a+crypto+lse and segfaults
+    # under -cpu cortex-a72 because some required extensions aren't on by
+    # default in that model.
+    qemu-system-aarch64 \
+        -machine virt \
+        -cpu max \
+        -m 2048 \
+        -smp 2 \
+        -nographic \
+        -bios "$UEFI_FW" \
+        -drive "file=$DISK_IMAGE,format=raw,if=virtio,media=disk" \
+        -net "nic,model=virtio" \
+        -net "user,hostfwd=tcp::6443-:6443,hostfwd=tcp::2222-:22,hostfwd=tcp::8080-:8080"
+    exit 0
+fi
+
+# ===========================================================================
+# Kernel mode (default): direct -kernel / -initrd, fast iteration
+# ===========================================================================
+VMLINUZ="${VMLINUZ:-$PROJECT_ROOT/build/cache/kernel-arm64-generic/Image}"
 INITRD="${INITRD:-$PROJECT_ROOT/build/rootfs-work/kubesolo-os.gz}"

-# Verify files exist
+# Fallback: previous-generation RPi kernel cache, in case someone hasn't yet
+# rebuilt under v0.3 paths.
+if [ ! -f "$VMLINUZ" ] && [ -f "$PROJECT_ROOT/build/cache/custom-kernel-rpi/Image" ]; then
+    VMLINUZ="$PROJECT_ROOT/build/cache/custom-kernel-rpi/Image"
+    echo "==> Note: falling back to RPi kernel ($VMLINUZ)"
+fi
+
 if [ ! -f "$VMLINUZ" ]; then
    echo "ERROR: Kernel not found: $VMLINUZ"
-    echo "  Run 'make kernel-arm64' to build the ARM64 kernel."
+    echo "  Run 'make kernel-arm64' (generic) or 'make kernel-rpi' to build a kernel."
    exit 1
 fi
 if [ ! -f "$INITRD" ]; then
    echo "ERROR: Initrd not found: $INITRD"
-    echo "  Run 'make initramfs' to build the initramfs."
+    echo "  Run 'make rootfs-arm64' to build the initramfs."
    exit 1
 fi

-# Find mkfs.ext4
-MKFS_EXT4=""
-if command -v mkfs.ext4 >/dev/null 2>&1; then
-    MKFS_EXT4="mkfs.ext4"
-elif [ -x "/opt/homebrew/opt/e2fsprogs/sbin/mkfs.ext4" ]; then
-    MKFS_EXT4="/opt/homebrew/opt/e2fsprogs/sbin/mkfs.ext4"
-elif [ -x "/usr/local/opt/e2fsprogs/sbin/mkfs.ext4" ]; then
-    MKFS_EXT4="/usr/local/opt/e2fsprogs/sbin/mkfs.ext4"
-fi
-
+MKFS_EXT4="$(find_mkfs_ext4)"
 if [ -z "$MKFS_EXT4" ]; then
    echo "ERROR: mkfs.ext4 not found. Install e2fsprogs:"
    if [ "$(uname)" = "Darwin" ]; then
@@ -70,13 +173,12 @@ if [ -z "$MKFS_EXT4" ]; then
    exit 1
 fi

-# Create data disk
 DATA_DISK="$(mktemp /tmp/kubesolo-arm64-data-XXXXXX).img"
 dd if=/dev/zero of="$DATA_DISK" bs=1M count=1024 2>/dev/null
 "$MKFS_EXT4" -q -L KSOLODATA "$DATA_DISK" 2>/dev/null
 trap 'rm -f "$DATA_DISK"' EXIT

-echo "==> Launching ARM64 QEMU VM..."
+echo "==> Launching ARM64 QEMU (direct kernel boot)..."
 echo "    Kernel: $VMLINUZ"
 echo "    Initrd: $INITRD"
 echo "    Data:   $DATA_DISK"
@@ -88,7 +190,7 @@ echo ""

 qemu-system-aarch64 \
    -machine virt \
-    -cpu cortex-a72 \
+    -cpu max \
    -m 2048 \
    -smp 2 \
    -nographic \
@@ -97,4 +199,4 @@ qemu-system-aarch64 \
    -append "console=ttyAMA0 kubesolo.data=/dev/vda kubesolo.debug $EXTRA_APPEND" \
    -drive "file=$DATA_DISK,format=raw,if=virtio" \
    -net "nic,model=virtio" \
-    -net "user,hostfwd=tcp::6443-:6443,hostfwd=tcp::2222-:22"
+    -net "user,hostfwd=tcp::6443-:6443,hostfwd=tcp::2222-:22,hostfwd=tcp::8080-:8080"
--- a/init/init.sh
+++ b/init/init.sh
@@ -14,6 +14,11 @@
 #   kubesolo.cloudinit=<path> Path to cloud-init config
 #   kubesolo.flags=<flags>    Extra flags for KubeSolo binary

+# Route early boot output to /dev/console — before switch_root the kernel may
+# not have a controlling tty, and some stages echo to stderr expecting it to
+# reach the serial console. This is a no-op once the staged init proper starts.
+exec >/dev/console 2>&1
+
 set -e

 # --- Switch root: escape initramfs so runc pivot_root works ---
--- a/init/lib/20-persistent-mount.sh
+++ b/init/lib/20-persistent-mount.sh
@@ -58,12 +58,46 @@ esac

 if [ ! -b "$KUBESOLO_DATA_DEV" ]; then
    log_err "Data device $KUBESOLO_DATA_DEV not found after ${WAIT_SECS}s"
-    # Show available block devices for debugging
-    log_err "Available block devices:"
-    ls -la /dev/mmc* /dev/sd* /dev/vd* 2>/dev/null | while read -r line; do
+    # Comprehensive diagnostics for block device failure
+    log_err "=== Block device diagnostics ==="
+    log_err "--- /dev block devices ---"
+    ls -la /dev/mmc* /dev/sd* /dev/vd* /dev/nvme* 2>/dev/null | while read -r line; do
        log_err "  $line"
    done
-    return 1
+    log_err "--- /sys/class/block (kernel registered) ---"
+    ls /sys/class/block/ 2>/dev/null | while read -r line; do
+        log_err "  $line"
+    done
+    log_err "--- dmesg: MMC/SDHCI/emmc ---"
+    dmesg 2>/dev/null | grep -i -e mmc -e sdhci -e emmc | while read -r line; do
+        log_err "  $line"
+    done
+    log_err "--- dmesg: regulator ---"
+    dmesg 2>/dev/null | grep -i regulator | while read -r line; do
+        log_err "  $line"
+    done
+    log_err "--- dmesg: firmware/mailbox ---"
+    dmesg 2>/dev/null | grep -i -e 'raspberrypi' -e 'mailbox' -e 'firmware' | while read -r line; do
+        log_err "  $line"
+    done
+    log_err "--- dmesg: errors ---"
+    dmesg 2>/dev/null | grep -i -e 'error' -e 'fail' -e 'unable' | while read -r line; do
+        log_err "  $line"
+    done
+    log_err "--- Full dmesg (last 60 lines) ---"
+    dmesg 2>/dev/null | tail -60 | while read -r line; do
+        log_err "  $line"
+    done
+    log_err "=== End diagnostics ==="
+    log_err ""
+    log_err "Dropping to debug shell in 10 seconds..."
+    log_err "Run 'dmesg' to see full kernel log."
+    log_err "Run 'ls /sys/class/block/' to check block devices."
+    log_err ""
+    sleep 10
+    # Drop to interactive shell instead of returning failure
+    # (returning 1 with set -e causes kernel panic before emergency_shell)
+    exec /bin/sh </dev/console >/dev/console 2>&1
 fi

 # Mount data partition (format on first boot if unformatted)
--- a/init/lib/30-kernel-modules.sh
+++ b/init/lib/30-kernel-modules.sh
@@ -16,7 +16,11 @@ while IFS= read -r mod; do
    case "$mod" in
        '#'*|'') continue ;;
    esac
-    mod="$(echo "$mod" | tr -d '[:space:]')"
+    # NOTE: do NOT use tr -d '[:space:]' — Ubuntu's busybox-static 1.30.1 (used
+    # in the ARM64 rootfs override) doesn't parse POSIX char classes and treats
+    # them as a literal set, deleting [, :, s, p, a, c, e, ]. Use explicit
+    # whitespace chars instead so the same script works under any tr.
+    mod="$(printf '%s' "$mod" | tr -d ' \t\r\n')"
    if modprobe "$mod" 2>/dev/null; then
        LOADED=$((LOADED + 1))
    else
--- a/init/lib/40-sysctl.sh
+++ b/init/lib/40-sysctl.sh
@@ -8,8 +8,11 @@ for conf in /etc/sysctl.d/*.conf; do
        case "$key" in
            '#'*|'') continue ;;
        esac
-        key="$(echo "$key" | tr -d '[:space:]')"
-        value="$(echo "$value" | tr -d '[:space:]')"
+        # NOTE: do NOT use tr -d '[:space:]' — see 30-kernel-modules.sh for the
+        # rationale. Use explicit whitespace chars so this works under
+        # Ubuntu's busybox-static tr too.
+        key="$(printf '%s' "$key" | tr -d ' \t\r\n')"
+        value="$(printf '%s' "$value" | tr -d ' \t\r\n')"
        if [ -n "$key" ] && [ -n "$value" ]; then
            sysctl -w "${key}=${value}" >/dev/null 2>&1 || \
                log_warn "Failed to set sysctl: ${key}=${value}"
--- a/init/lib/90-kubesolo.sh
+++ b/init/lib/90-kubesolo.sh
@@ -76,6 +76,29 @@ while [ ! -f "$KUBECONFIG_PATH" ] && [ $WAIT -lt 120 ]; do
    fi
 done

+# Render the access banner. Written to /etc/motd so it's visible to anyone
+# who later shells in (SSH extension, emergency shell, console login), and
+# printed unconditionally to console below so the user sees it even when
+# KubeSolo hasn't yet finished generating the kubeconfig.
+ACCESS_BANNER="$(cat <<'BANNER'
+============================================================
+  KubeSolo OS — host access
+
+  From your host machine, run:
+
+    curl -s http://localhost:8080 > ~/.kube/kubesolo-config
+    kubectl --kubeconfig ~/.kube/kubesolo-config get nodes
+
+  Notes:
+    - port 8080 serves the kubeconfig (admin) over HTTP
+    - port 6443 serves the Kubernetes API (HTTPS)
+    - Both ports are forwarded under QEMU's `-net user,hostfwd=…` config
+
+============================================================
+BANNER
+)"
+printf '%s\n' "$ACCESS_BANNER" > /etc/motd 2>/dev/null || true
+
 if [ -f "$KUBECONFIG_PATH" ]; then
    log_ok "KubeSolo is running (PID $KUBESOLO_PID)"

@@ -95,18 +118,17 @@ if [ -f "$KUBECONFIG_PATH" ]; then
    done) &

    log_ok "Kubeconfig available via HTTP on port 8080"
-    echo ""
-    echo "============================================================"
-    echo "  From your host machine, run:"
-    echo ""
-    echo "  curl -s http://localhost:8080 > ~/.kube/kubesolo-config"
-    echo "  kubectl --kubeconfig ~/.kube/kubesolo-config get nodes"
-    echo "============================================================"
-    echo ""
 else
    log_warn "Kubeconfig not found after ${WAIT}s — KubeSolo may still be starting"
    log_warn "Check manually: cat $KUBECONFIG_PATH"
 fi

+# Show the banner regardless of kubeconfig state: the HTTP server above only
+# starts on success, but printing the instructions during the long first-boot
+# wait is useful and harmless (user retries the curl until it 200s).
+echo ""
+printf '%s\n' "$ACCESS_BANNER"
+echo ""
+
 # Keep init alive — wait on KubeSolo process
 wait $KUBESOLO_PID
--- a/test/qemu/test-boot-arm64-disk.sh
+++ b/test/qemu/test-boot-arm64-disk.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+# test-boot-arm64-disk.sh — Boot the ARM64 .arm64.img via UEFI + GRUB and
+# verify the init system reaches stage 90.
+#
+# This is the full-stack integration test: UEFI firmware -> GRUB -> kernel ->
+# initramfs -> staged init. Contrast with test-boot-arm64.sh which skips the
+# bootloader and loads kernel/initramfs directly.
+#
+# Exit 0 = PASS, Exit 1 = FAIL.
+#
+# Usage: ./test/qemu/test-boot-arm64-disk.sh [disk.img]
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+VERSION="$(cat "$PROJECT_ROOT/VERSION")"
+
+DISK_IMAGE="${1:-$PROJECT_ROOT/output/kubesolo-os-${VERSION}.arm64.img}"
+TIMEOUT=180
+
+echo "==> ARM64 UEFI Disk Boot Test"
+echo "    Disk image: $DISK_IMAGE"
+echo "    Timeout:    ${TIMEOUT}s"
+
+if [ ! -f "$DISK_IMAGE" ]; then
+    echo "ERROR: Disk image not found: $DISK_IMAGE"
+    echo "  Run 'make disk-image-arm64' to build it."
+    exit 1
+fi
+
+if ! command -v qemu-system-aarch64 >/dev/null 2>&1; then
+    echo "ERROR: qemu-system-aarch64 not found."
+    echo "  apt install qemu-system-arm   # Debian/Ubuntu"
+    echo "  dnf install qemu-system-aarch64  # Fedora/RHEL"
+    exit 1
+fi
+
+# --- Locate UEFI firmware ---
+UEFI_FW=""
+for candidate in \
+    /usr/share/qemu-efi-aarch64/QEMU_EFI.fd \
+    /usr/share/AAVMF/AAVMF_CODE.fd \
+    /usr/share/edk2/aarch64/QEMU_EFI.fd \
+    /usr/share/qemu/edk2-aarch64-code.fd \
+    /opt/homebrew/share/qemu/edk2-aarch64-code.fd \
+    /usr/local/share/qemu/edk2-aarch64-code.fd
+do
+    if [ -f "$candidate" ]; then
+        UEFI_FW="$candidate"
+        break
+    fi
+done
+
+if [ -z "$UEFI_FW" ]; then
+    echo "ERROR: No ARM64 UEFI firmware found."
+    echo "  apt install qemu-efi-aarch64"
+    exit 1
+fi
+
+echo "    UEFI fw:    $UEFI_FW"
+
+# Copy disk image to a scratch file so the test doesn't mutate the source.
+# UEFI will write to grubenv on the EFI partition; we don't want to bake those
+# changes into the canonical build artifact.
+SCRATCH_DISK=$(mktemp /tmp/kubesolo-arm64-disk-test-XXXXXX.img)
+SERIAL_LOG=$(mktemp /tmp/kubesolo-arm64-disk-serial-XXXXXX.log)
+QEMU_PID=""
+
+cleanup() {
+    [ -n "$QEMU_PID" ] && kill "$QEMU_PID" 2>/dev/null || true
+    rm -f "$SCRATCH_DISK" "$SERIAL_LOG"
+}
+trap cleanup EXIT
+
+cp --reflink=auto "$DISK_IMAGE" "$SCRATCH_DISK" 2>/dev/null || cp "$DISK_IMAGE" "$SCRATCH_DISK"
+
+# --- Launch QEMU ---
+qemu-system-aarch64 \
+    -machine virt \
+    -cpu cortex-a72 \
+    -m 2048 \
+    -smp 2 \
+    -nographic \
+    -bios "$UEFI_FW" \
+    -drive "file=$SCRATCH_DISK,format=raw,if=virtio,media=disk" \
+    -net nic,model=virtio \
+    -net user \
+    -serial "file:$SERIAL_LOG" &
+QEMU_PID=$!
+
+echo "    Waiting for boot (PID $QEMU_PID)..."
+ELAPSED=0
+SUCCESS=0
+while [ "$ELAPSED" -lt "$TIMEOUT" ]; do
+    if grep -q "\[kubesolo-init\] \[OK\] Stage 90-kubesolo.sh complete" "$SERIAL_LOG" 2>/dev/null; then
+        SUCCESS=1
+        break
+    fi
+    if grep -q "KubeSolo is running" "$SERIAL_LOG" 2>/dev/null; then
+        SUCCESS=1
+        break
+    fi
+    if ! kill -0 "$QEMU_PID" 2>/dev/null; then
+        echo ""
+        echo "==> FAIL: QEMU exited prematurely"
+        echo "    Last 30 lines of serial output:"
+        tail -30 "$SERIAL_LOG" 2>/dev/null || echo "    (no output)"
+        exit 1
+    fi
+    sleep 2
+    ELAPSED=$((ELAPSED + 2))
+    printf "\r    Elapsed: %ds / %ds" "$ELAPSED" "$TIMEOUT"
+done
+echo ""
+
+kill "$QEMU_PID" 2>/dev/null || true
+wait "$QEMU_PID" 2>/dev/null || true
+QEMU_PID=""
+
+if [ "$SUCCESS" = "1" ]; then
+    echo "==> ARM64 UEFI Disk Boot Test PASSED (${ELAPSED}s)"
+    exit 0
+fi
+
+echo "==> ARM64 UEFI Disk Boot Test FAILED (timeout ${TIMEOUT}s)"
+echo ""
+echo "==> Last 50 lines of serial output:"
+tail -50 "$SERIAL_LOG" 2>/dev/null || echo "    (no output)"
+exit 1
--- a/update/cmd/activate.go
+++ b/update/cmd/activate.go
@@ -3,23 +3,35 @@ package cmd
 import (
 	"fmt"
 	"log/slog"
+
+	"github.com/portainer/kubesolo-os/update/pkg/state"
 )

 // Activate switches the boot target to the passive partition.
 // After activation, the next reboot will boot from the new partition
 // with boot_counter=3. If health checks fail 3 times, GRUB auto-rolls back.
+//
+// State transition: Staged → Activated. On failure → Failed.
 func Activate(args []string) error {
 	opts := parseOpts(args)
 	env := opts.NewBootEnv()

+	st, err := state.Load(opts.StatePath)
+	if err != nil {
+		slog.Warn("state file unreadable, starting fresh", "error", err)
+		st = state.New()
+	}
+
 	// Get passive slot (the one we want to boot into)
 	passiveSlot, err := env.PassiveSlot()
 	if err != nil {
+		_ = st.RecordError(opts.StatePath, fmt.Errorf("reading passive slot: %w", err))
 		return fmt.Errorf("reading passive slot: %w", err)
 	}

 	activeSlot, err := env.ActiveSlot()
 	if err != nil {
+		_ = st.RecordError(opts.StatePath, fmt.Errorf("reading active slot: %w", err))
 		return fmt.Errorf("reading active slot: %w", err)
 	}

@@ -27,9 +39,14 @@ func Activate(args []string) error {

 	// Set the passive slot as active with fresh boot counter
 	if err := env.ActivateSlot(passiveSlot); err != nil {
+		_ = st.RecordError(opts.StatePath, fmt.Errorf("activating slot %s: %w", passiveSlot, err))
 		return fmt.Errorf("activating slot %s: %w", passiveSlot, err)
 	}

+	if err := st.Transition(opts.StatePath, state.PhaseActivated, "", ""); err != nil {
+		slog.Warn("state transition failed", "phase", state.PhaseActivated, "error", err)
+	}
+
 	fmt.Printf("Slot %s activated (was %s)\n", passiveSlot, activeSlot)
 	fmt.Println("Boot counter set to 3. Reboot to start the new version.")
 	fmt.Println("The system will automatically roll back if health checks fail 3 times.")
--- a/update/cmd/apply.go
+++ b/update/cmd/apply.go
@@ -1,73 +1,240 @@
 package cmd

 import (
+	"context"
 	"fmt"
 	"log/slog"
+	"os"
+	"runtime"
+	"time"

+	"github.com/portainer/kubesolo-os/update/pkg/config"
+	"github.com/portainer/kubesolo-os/update/pkg/health"
 	"github.com/portainer/kubesolo-os/update/pkg/image"
+	"github.com/portainer/kubesolo-os/update/pkg/oci"
 	"github.com/portainer/kubesolo-os/update/pkg/partition"
+	"github.com/portainer/kubesolo-os/update/pkg/state"
 )

+// applyMetadataGates enforces channel / architecture / min-version policy on
+// resolved update metadata, regardless of transport (HTTP or OCI). Records
+// any failure to the state file before returning.
+func applyMetadataGates(opts opts, st *state.UpdateState, meta *image.UpdateMetadata) error {
+	if meta.Channel != "" && meta.Channel != opts.Channel {
+		err := fmt.Errorf("metadata channel %q does not match local channel %q",
+			meta.Channel, opts.Channel)
+		_ = st.RecordError(opts.StatePath, err)
+		return err
+	}
+	if meta.Architecture != "" && meta.Architecture != runtime.GOARCH {
+		err := fmt.Errorf("metadata architecture %q does not match runtime %q",
+			meta.Architecture, runtime.GOARCH)
+		_ = st.RecordError(opts.StatePath, err)
+		return err
+	}
+	if meta.MinCompatibleVersion != "" && st.FromVersion != "" {
+		cmp, cerr := config.CompareVersions(st.FromVersion, meta.MinCompatibleVersion)
+		if cerr != nil {
+			slog.Warn("min-version comparison failed", "error", cerr,
+				"from", st.FromVersion, "min", meta.MinCompatibleVersion)
+		} else if cmp < 0 {
+			err := fmt.Errorf("current version %s is below min_compatible_version %s; install %s first",
+				st.FromVersion, meta.MinCompatibleVersion, meta.MinCompatibleVersion)
+			_ = st.RecordError(opts.StatePath, err)
+			return err
+		}
+	}
+	return nil
+}
+
 // Apply downloads a new OS image and writes it to the passive partition.
 // It does NOT activate the new partition — use 'activate' for that.
+//
+// State transitions: Idle/Success/Failed → Checking → Downloading → Staged.
+// On any error the state moves to Failed with LastError set.
 func Apply(args []string) error {
 	opts := parseOpts(args)

-	if opts.ServerURL == "" {
-		return fmt.Errorf("--server is required")
+	if opts.ServerURL == "" && opts.Registry == "" {
+		return fmt.Errorf("--server or --registry is required (or set in /etc/kubesolo/update.conf)")
+	}
+	if opts.ServerURL != "" && opts.Registry != "" {
+		return fmt.Errorf("--server and --registry are mutually exclusive")
+	}
+
+	// Maintenance window gate — earliest cheap check, before any HTTP work.
+	// Skipped with --force.
+	window, werr := config.ParseWindow(opts.MaintenanceWindow)
+	if werr != nil {
+		return fmt.Errorf("parse maintenance_window: %w", werr)
+	}
+	if !opts.Force && !window.Contains(time.Now()) {
+		return fmt.Errorf("outside maintenance window (%s); pass --force to override",
+			window.String())
+	}
+
+	// Node-block-label gate — workload authors can defer an update by
+	// labeling the node updates.kubesolo.io/block=true. Skipped with --force
+	// and silently bypassed when the K8s API isn't reachable (air-gap).
+	if !opts.Force {
+		blocked, berr := health.CheckNodeBlocked("")
+		if berr != nil {
+			slog.Warn("node-block check failed, allowing update", "error", berr)
+		} else if blocked {
+			return fmt.Errorf("node carries label %s=true; refusing update (pass --force to override)",
+				health.NodeBlockLabel)
+		}
+	}
+
+	st, err := state.Load(opts.StatePath)
+	if err != nil {
+		// Don't block the operation on a corrupt state file. Log + recover.
+		slog.Warn("state file unreadable, starting fresh", "error", err)
+		st = state.New()
 	}

 	env := opts.NewBootEnv()

+	// Record the current running version as the "from" reference. The active
+	// slot's version file is the most reliable source.
+	activeSlot, slotErr := env.ActiveSlot()
+	if slotErr == nil {
+		if partInfo, perr := partition.GetSlotPartition(activeSlot); perr == nil {
+			mp := "/tmp/kubesolo-active-" + activeSlot
+			if merr := partition.MountReadOnly(partInfo.Device, mp); merr == nil {
+				if v, rerr := partition.ReadVersion(mp); rerr == nil {
+					st.SetFromVersion(v)
+				}
+				partition.Unmount(mp)
+			}
+		}
+	}
+
 	// Determine passive slot
 	passiveSlot, err := env.PassiveSlot()
 	if err != nil {
+		_ = st.RecordError(opts.StatePath, fmt.Errorf("reading passive slot: %w", err))
 		return fmt.Errorf("reading passive slot: %w", err)
 	}

 	slog.Info("applying update", "target_slot", passiveSlot)
-
-	// Check for update
 	stageDir := "/tmp/kubesolo-update-stage"
-	client := image.NewClient(opts.ServerURL, stageDir)
-	defer client.Cleanup()

-	// Enable signature verification if public key is configured
-	if opts.PubKeyPath != "" {
-		client.SetPublicKeyPath(opts.PubKeyPath)
-		slog.Info("signature verification enabled", "pubkey", opts.PubKeyPath)
+	if err := st.Transition(opts.StatePath, state.PhaseChecking, "", ""); err != nil {
+		slog.Warn("state transition failed", "phase", state.PhaseChecking, "error", err)
 	}

-	meta, err := client.CheckForUpdate()
-	if err != nil {
-		return fmt.Errorf("checking for update: %w", err)
+	// Resolve metadata via the configured transport. OCI registry mode pulls
+	// the manifest only; HTTP mode hits latest.json.
+	var (
+		meta   *image.UpdateMetadata
+		staged *image.StagedImage
+	)
+	if opts.Registry != "" {
+		ociClient, err := oci.NewClient(opts.Registry)
+		if err != nil {
+			_ = st.RecordError(opts.StatePath, fmt.Errorf("oci client: %w", err))
+			return fmt.Errorf("oci client: %w", err)
+		}
+		tag := opts.Tag
+		if tag == "" {
+			tag = opts.Channel
+		}
+		if tag == "" {
+			tag = "stable"
+		}
+		meta, err = ociClient.FetchMetadata(context.Background(), tag)
+		if err != nil {
+			_ = st.RecordError(opts.StatePath, fmt.Errorf("oci fetch metadata: %w", err))
+			return fmt.Errorf("oci fetch metadata: %w", err)
+		}
+		if err := applyMetadataGates(opts, st, meta); err != nil {
+			return err
+		}
+		if err := st.Transition(opts.StatePath, state.PhaseDownloading, meta.Version, ""); err != nil {
+			slog.Warn("state transition failed", "phase", state.PhaseDownloading, "error", err)
+		}
+		staged, _, err = ociClient.Pull(context.Background(), tag, stageDir)
+		if err != nil {
+			_ = st.RecordError(opts.StatePath, fmt.Errorf("oci pull: %w", err))
+			return fmt.Errorf("oci pull: %w", err)
+		}
+	} else {
+		client := image.NewClient(opts.ServerURL, stageDir)
+		defer client.Cleanup()
+		if opts.PubKeyPath != "" {
+			client.SetPublicKeyPath(opts.PubKeyPath)
+			slog.Info("signature verification enabled", "pubkey", opts.PubKeyPath)
+		}
+		var err error
+		meta, err = client.CheckForUpdate()
+		if err != nil {
+			_ = st.RecordError(opts.StatePath, fmt.Errorf("checking for update: %w", err))
+			return fmt.Errorf("checking for update: %w", err)
+		}
+		if err := applyMetadataGates(opts, st, meta); err != nil {
+			return err
+		}
+		if err := st.Transition(opts.StatePath, state.PhaseDownloading, meta.Version, ""); err != nil {
+			slog.Warn("state transition failed", "phase", state.PhaseDownloading, "error", err)
+		}
+		staged, err = client.Download(meta)
+		if err != nil {
+			_ = st.RecordError(opts.StatePath, fmt.Errorf("downloading update: %w", err))
+			return fmt.Errorf("downloading update: %w", err)
+		}
 	}

-	slog.Info("update available", "version", meta.Version)
-
-	// Download and verify
-	staged, err := client.Download(meta)
-	if err != nil {
-		return fmt.Errorf("downloading update: %w", err)
-	}
+	slog.Info("update available", "version", meta.Version, "channel", meta.Channel, "arch", meta.Architecture)

 	// Mount passive partition
 	partInfo, err := partition.GetSlotPartition(passiveSlot)
 	if err != nil {
+		_ = st.RecordError(opts.StatePath, fmt.Errorf("finding passive partition: %w", err))
 		return fmt.Errorf("finding passive partition: %w", err)
 	}

 	mountPoint := "/tmp/kubesolo-passive-" + passiveSlot
 	if err := partition.MountReadWrite(partInfo.Device, mountPoint); err != nil {
+		_ = st.RecordError(opts.StatePath, fmt.Errorf("mounting passive partition: %w", err))
 		return fmt.Errorf("mounting passive partition: %w", err)
 	}
 	defer partition.Unmount(mountPoint)

+	// Free-space pre-write check: the passive partition must have at least
+	// (kernel + initramfs) + 10% headroom. Catches corrupted-FS reports and
+	// shrunk/wrong-size partitions before we destroy the existing slot data.
+	var imgSize int64
+	for _, p := range []string{staged.VmlinuzPath, staged.InitramfsPath} {
+		fi, ferr := os.Stat(p)
+		if ferr != nil {
+			_ = st.RecordError(opts.StatePath, fmt.Errorf("stat staged file %s: %w", p, ferr))
+			return fmt.Errorf("stat staged file %s: %w", p, ferr)
+		}
+		imgSize += fi.Size()
+	}
+	avail, ok, ferr := partition.HasFreeSpaceFor(mountPoint, imgSize, 10)
+	if ferr != nil {
+		_ = st.RecordError(opts.StatePath, fmt.Errorf("free-space check: %w", ferr))
+		return fmt.Errorf("free-space check: %w", ferr)
+	}
+	if !ok {
+		err := fmt.Errorf("insufficient space on %s: have %.1f MiB, need %.1f MiB (image + 10%% headroom)",
+			passiveSlot, float64(avail)/(1<<20), float64(imgSize)*1.1/(1<<20))
+		_ = st.RecordError(opts.StatePath, err)
+		return err
+	}
+
 	// Write image to passive partition
 	if err := partition.WriteSystemImage(mountPoint, staged.VmlinuzPath, staged.InitramfsPath, staged.Version); err != nil {
+		_ = st.RecordError(opts.StatePath, fmt.Errorf("writing system image: %w", err))
 		return fmt.Errorf("writing system image: %w", err)
 	}

+	if err := st.Transition(opts.StatePath, state.PhaseStaged, staged.Version, ""); err != nil {
+		slog.Warn("state transition failed", "phase", state.PhaseStaged, "error", err)
+	}
+
 	fmt.Printf("Update v%s written to slot %s (%s)\n", staged.Version, passiveSlot, partInfo.Device)
 	fmt.Println("Run 'kubesolo-update activate' to boot into the new version")

--- a/update/cmd/healthcheck.go
+++ b/update/cmd/healthcheck.go
@@ -6,16 +6,32 @@ import (
 	"time"

 	"github.com/portainer/kubesolo-os/update/pkg/health"
+	"github.com/portainer/kubesolo-os/update/pkg/state"
 )

 // Healthcheck performs post-boot health verification.
 // If all checks pass, it marks the boot as successful in GRUB.
 // This should be run after every boot (typically via a systemd unit or
 // init script) to confirm the system is healthy.
+//
+// State transition: Activated → Verifying → Success on pass, → Failed on fail.
+// If state isn't in Activated (e.g. manual run on a long-stable system), the
+// state file is left alone — healthcheck still does its job.
+//
+// When --auto-rollback-after N is set, consecutive post-Activated failures
+// are counted in state.HealthCheckFailures. On the Nth failure, the agent
+// calls Rollback() and the operator is expected to reboot (this command
+// does not reboot the host — that's policy left to systemd/init).
 func Healthcheck(args []string) error {
 	opts := parseOpts(args)
 	env := opts.NewBootEnv()

+	st, err := state.Load(opts.StatePath)
+	if err != nil {
+		slog.Warn("state file unreadable, starting fresh", "error", err)
+		st = state.New()
+	}
+
 	// Check if already marked successful
 	success, err := env.BootSuccess()
 	if err != nil {
@@ -26,30 +42,94 @@ func Healthcheck(args []string) error {
 		return nil
 	}

+	// Only transition state if we're post-activation. Manual healthcheck on a
+	// long-stable system shouldn't reset Idle → Verifying.
+	postActivation := st.Phase == state.PhaseActivated
+	if postActivation {
+		if err := st.Transition(opts.StatePath, state.PhaseVerifying, "", ""); err != nil {
+			slog.Warn("state transition failed", "phase", state.PhaseVerifying, "error", err)
+		}
+	}
+
 	timeout := time.Duration(opts.TimeoutSecs) * time.Second
 	checker := health.NewChecker("", "", timeout)
+	checker.ProbeURL = opts.HealthcheckURL
+	if opts.KubeSystemSettle > 0 {
+		checker.KubeSystemSettle = time.Duration(opts.KubeSystemSettle) * time.Second
+	}
+	// Probe the data partition every healthcheck so a wedged disk fails fast.
+	checker.DataDir = "/var/lib/kubesolo"

-	slog.Info("running post-boot health checks", "timeout", timeout)
+	slog.Info("running post-boot health checks",
+		"timeout", timeout,
+		"probe_url", checker.ProbeURL,
+		"kube_system_settle", checker.KubeSystemSettle)

 	status, err := checker.WaitForHealthy()
 	if err != nil {
 		fmt.Printf("Health check FAILED: %s\n", status.Message)
-		fmt.Printf("  containerd: %v\n", status.Containerd)
-		fmt.Printf("  apiserver:  %v\n", status.APIServer)
-		fmt.Printf("  node_ready: %v\n", status.NodeReady)
+		printStatusBreakdown(status)
 		fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot")
+
+		if postActivation {
+			st.HealthCheckFailures++
+			_ = st.RecordError(opts.StatePath, fmt.Errorf("post-boot health check failed: %s", status.Message))
+
+			// Auto-rollback escalation. Only trigger when post-Activated;
+			// don't second-guess a healthy long-running system.
+			if opts.AutoRollbackAfter > 0 && st.HealthCheckFailures >= opts.AutoRollbackAfter {
+				slog.Warn("auto-rollback threshold reached",
+					"failures", st.HealthCheckFailures,
+					"threshold", opts.AutoRollbackAfter)
+				if rerr := env.ForceRollback(); rerr != nil {
+					slog.Error("auto-rollback failed", "error", rerr)
+					return err // return the original healthcheck error
+				}
+				if terr := st.Transition(opts.StatePath, state.PhaseRolledBack, "",
+					fmt.Sprintf("auto-rollback after %d healthcheck failures", st.HealthCheckFailures)); terr != nil {
+					slog.Warn("state transition failed", "phase", state.PhaseRolledBack, "error", terr)
+				}
+				fmt.Println("\nAuto-rollback triggered. Reboot to complete the rollback.")
+			}
+		}
 		return err
 	}

 	// Mark boot as successful
 	if err := env.MarkBootSuccess(); err != nil {
+		if postActivation {
+			_ = st.RecordError(opts.StatePath, fmt.Errorf("marking boot success: %w", err))
+		}
 		return fmt.Errorf("marking boot success: %w", err)
 	}

+	if postActivation {
+		// Reset failure counter on a clean pass.
+		st.HealthCheckFailures = 0
+		if err := st.Transition(opts.StatePath, state.PhaseSuccess, "", ""); err != nil {
+			slog.Warn("state transition failed", "phase", state.PhaseSuccess, "error", err)
+		}
+	}
+
 	fmt.Println("Health check PASSED — boot marked successful")
-	fmt.Printf("  containerd: %v\n", status.Containerd)
-	fmt.Printf("  apiserver:  %v\n", status.APIServer)
-	fmt.Printf("  node_ready: %v\n", status.NodeReady)
+	printStatusBreakdown(status)

 	return nil
 }
+
+// printStatusBreakdown emits a human-readable per-check summary. Only emits
+// optional check lines when they actually ran.
+func printStatusBreakdown(s *health.Status) {
+	fmt.Printf("  containerd:        %v\n", s.Containerd)
+	fmt.Printf("  apiserver:         %v\n", s.APIServer)
+	fmt.Printf("  node_ready:        %v\n", s.NodeReady)
+	if !s.KubeSystemReady {
+		fmt.Printf("  kube-system pods:  %v\n", s.KubeSystemReady)
+	}
+	if !s.ProbeURL {
+		fmt.Printf("  probe URL:         %v\n", s.ProbeURL)
+	}
+	if !s.DiskWritable {
+		fmt.Printf("  disk writable:     %v\n", s.DiskWritable)
+	}
+}
--- a/update/cmd/metrics.go
+++ b/update/cmd/metrics.go
@@ -5,6 +5,7 @@ import (
 	"fmt"

 	"github.com/portainer/kubesolo-os/update/pkg/metrics"
+	"github.com/portainer/kubesolo-os/update/pkg/state"
 )

 // Metrics starts the Prometheus-compatible metrics HTTP server.
@@ -12,10 +13,12 @@ func Metrics(args []string) error {
 	fs := flag.NewFlagSet("metrics", flag.ExitOnError)
 	listenAddr := fs.String("listen", ":9100", "Metrics HTTP listen address")
 	grubenvPath := fs.String("grubenv", "/boot/grub/grubenv", "Path to grubenv file")
+	statePath := fs.String("state", state.DefaultPath, "Path to update state.json")
 	if err := fs.Parse(args); err != nil {
 		return fmt.Errorf("parse flags: %w", err)
 	}

 	srv := metrics.NewServer(*listenAddr, *grubenvPath)
+	srv.SetStatePath(*statePath)
 	return srv.ListenAndServe()
 }
--- a/update/cmd/opts.go
+++ b/update/cmd/opts.go
@@ -1,17 +1,32 @@
 package cmd

 import (
+	"log/slog"
+
 	"github.com/portainer/kubesolo-os/update/pkg/bootenv"
+	"github.com/portainer/kubesolo-os/update/pkg/config"
+	"github.com/portainer/kubesolo-os/update/pkg/state"
 )

 // opts holds shared command-line options for all subcommands.
 type opts struct {
-	ServerURL   string
-	GrubenvPath string
-	TimeoutSecs int
-	PubKeyPath  string
-	BootEnvType string // "grub" or "rpi"
-	BootEnvPath string // path for RPi boot control dir
+	ServerURL         string
+	Registry          string // OCI registry ref (e.g. ghcr.io/foo/kubesolo-os). Mutually exclusive with ServerURL.
+	Tag               string // OCI tag to pull (default: equal to Channel, falling back to "stable")
+	GrubenvPath       string
+	TimeoutSecs       int
+	PubKeyPath        string
+	BootEnvType       string // "grub" or "rpi"
+	BootEnvPath       string // path for RPi boot control dir
+	StatePath          string // location of state.json (default: state.DefaultPath)
+	ConfPath           string // location of update.conf (default: config.DefaultPath)
+	Channel            string // update channel ("stable" by default)
+	MaintenanceWindow  string // "HH:MM-HH:MM" or empty for always-allow
+	HealthcheckURL     string // optional GET probe for healthcheck
+	AutoRollbackAfter  int    // healthcheck: rollback after N consecutive failures (0=off)
+	KubeSystemSettle   int    // healthcheck: kube-system pods must be Running for N seconds (0=disabled)
+	Force              bool   // bypass maintenance window
+	JSON               bool   // status: emit JSON instead of human-readable
 }

 // NewBootEnv creates a BootEnv from the parsed options.
@@ -25,21 +40,129 @@ func (o opts) NewBootEnv() bootenv.BootEnv {
 }

 // parseOpts extracts command-line flags from args.
-// Simple parser — no external dependencies.
+//
+// Precedence: explicit CLI flags > /etc/kubesolo/update.conf > package
+// defaults. The config file is loaded first so any CLI flag overrides it.
+//
+// Unknown flags are ignored (forward-compat).
 func parseOpts(args []string) opts {
 	o := opts{
 		GrubenvPath: "/boot/grub/grubenv",
 		TimeoutSecs: 120,
 		BootEnvType: "grub",
+		StatePath:   state.DefaultPath,
+		ConfPath:    config.DefaultPath,
+		Channel:     "stable",
 	}

+	// First pass: pick up --conf so it can point at a different file before
+	// we load. (Tests pass --conf <tempdir>/update.conf.)
+	for i := 0; i < len(args); i++ {
+		if args[i] == "--conf" && i+1 < len(args) {
+			o.ConfPath = args[i+1]
+		}
+	}
+
+	// Load config file. Missing file is fine (fresh system, no cloud-init yet).
+	if cfg, err := config.Load(o.ConfPath); err == nil && cfg != nil {
+		if cfg.Server != "" {
+			o.ServerURL = cfg.Server
+		}
+		if cfg.Channel != "" {
+			o.Channel = cfg.Channel
+		}
+		if cfg.MaintenanceWindow != "" {
+			o.MaintenanceWindow = cfg.MaintenanceWindow
+		}
+		if cfg.PubKey != "" {
+			o.PubKeyPath = cfg.PubKey
+		}
+		if cfg.HealthcheckURL != "" {
+			o.HealthcheckURL = cfg.HealthcheckURL
+		}
+		if cfg.AutoRollbackAfter > 0 {
+			o.AutoRollbackAfter = cfg.AutoRollbackAfter
+		}
+	} else if err != nil {
+		slog.Warn("could not load update.conf", "path", o.ConfPath, "error", err)
+	}
+
+	// Second pass: CLI overrides config file values.
 	for i := 0; i < len(args); i++ {
 		switch args[i] {
+		case "--conf":
+			i++ // already handled above
+		case "--state":
+			if i+1 < len(args) {
+				o.StatePath = args[i+1]
+				i++
+			}
+		case "--channel":
+			if i+1 < len(args) {
+				o.Channel = args[i+1]
+				i++
+			}
+		case "--maintenance-window":
+			if i+1 < len(args) {
+				o.MaintenanceWindow = args[i+1]
+				i++
+			}
+		case "--force":
+			o.Force = true
+		case "--healthcheck-url":
+			if i+1 < len(args) {
+				o.HealthcheckURL = args[i+1]
+				i++
+			}
+		case "--auto-rollback-after":
+			if i+1 < len(args) {
+				n := 0
+				for _, ch := range args[i+1] {
+					if ch >= '0' && ch <= '9' {
+						n = n*10 + int(ch-'0')
+					} else {
+						n = 0
+						break
+					}
+				}
+				if n > 0 {
+					o.AutoRollbackAfter = n
+				}
+				i++
+			}
+		case "--kube-system-settle":
+			if i+1 < len(args) {
+				n := 0
+				for _, ch := range args[i+1] {
+					if ch >= '0' && ch <= '9' {
+						n = n*10 + int(ch-'0')
+					} else {
+						n = 0
+						break
+					}
+				}
+				if n > 0 {
+					o.KubeSystemSettle = n
+				}
+				i++
+			}
+		case "--json":
+			o.JSON = true
 		case "--server":
 			if i+1 < len(args) {
 				o.ServerURL = args[i+1]
 				i++
 			}
+		case "--registry":
+			if i+1 < len(args) {
+				o.Registry = args[i+1]
+				i++
+			}
+		case "--tag":
+			if i+1 < len(args) {
+				o.Tag = args[i+1]
+				i++
+			}
 		case "--grubenv":
 			if i+1 < len(args) {
 				o.GrubenvPath = args[i+1]
--- a/update/cmd/rollback.go
+++ b/update/cmd/rollback.go
@@ -3,14 +3,24 @@ package cmd
 import (
 	"fmt"
 	"log/slog"
+
+	"github.com/portainer/kubesolo-os/update/pkg/state"
 )

 // Rollback forces an immediate switch to the other partition.
 // Use this to manually revert to the previous version.
+//
+// State transition: any → RolledBack with LastError="manual rollback".
 func Rollback(args []string) error {
 	opts := parseOpts(args)
 	env := opts.NewBootEnv()

+	st, err := state.Load(opts.StatePath)
+	if err != nil {
+		slog.Warn("state file unreadable, starting fresh", "error", err)
+		st = state.New()
+	}
+
 	activeSlot, err := env.ActiveSlot()
 	if err != nil {
 		return fmt.Errorf("reading active slot: %w", err)
@@ -24,9 +34,14 @@ func Rollback(args []string) error {
 	slog.Info("forcing rollback", "from", activeSlot, "to", passiveSlot)

 	if err := env.ForceRollback(); err != nil {
+		_ = st.RecordError(opts.StatePath, fmt.Errorf("rollback failed: %w", err))
 		return fmt.Errorf("rollback failed: %w", err)
 	}

+	if err := st.Transition(opts.StatePath, state.PhaseRolledBack, "", "manual rollback"); err != nil {
+		slog.Warn("state transition failed", "phase", state.PhaseRolledBack, "error", err)
+	}
+
 	fmt.Printf("Rolled back: %s → %s\n", activeSlot, passiveSlot)
 	fmt.Println("Reboot to complete rollback.")

--- a/update/cmd/status.go
+++ b/update/cmd/status.go
@@ -1,10 +1,26 @@
 package cmd

 import (
+	"encoding/json"
 	"fmt"
+	"os"
+
+	"github.com/portainer/kubesolo-os/update/pkg/state"
 )

+// statusReport is the JSON-emitted shape of `kubesolo-update status --json`.
+// Combines the bootloader-level A/B view with the update-agent state machine.
+type statusReport struct {
+	ActiveSlot  string             `json:"active_slot"`
+	PassiveSlot string             `json:"passive_slot"`
+	BootCounter int                `json:"boot_counter"`
+	BootSuccess bool               `json:"boot_success"`
+	State       *state.UpdateState `json:"state"`
+}
+
 // Status displays the current A/B slot configuration and boot state.
+// With --json, emits the full state report to stdout for orchestration
+// tooling.
 func Status(args []string) error {
 	opts := parseOpts(args)
 	env := opts.NewBootEnv()
@@ -29,6 +45,23 @@ func Status(args []string) error {
 		return fmt.Errorf("reading boot success: %w", err)
 	}

+	// State file is non-fatal: present means we have an update lifecycle
+	// recorded; absent means no update has run yet.
+	st, _ := state.Load(opts.StatePath)
+
+	if opts.JSON {
+		report := statusReport{
+			ActiveSlot:  activeSlot,
+			PassiveSlot: passiveSlot,
+			BootCounter: bootCounter,
+			BootSuccess: bootSuccess,
+			State:       st,
+		}
+		enc := json.NewEncoder(os.Stdout)
+		enc.SetIndent("", "  ")
+		return enc.Encode(report)
+	}
+
 	fmt.Println("KubeSolo OS — A/B Partition Status")
 	fmt.Println("───────────────────────────────────")
 	fmt.Printf("  Active slot:   %s\n", activeSlot)
@@ -48,5 +81,25 @@ func Status(args []string) error {
 		fmt.Printf("\n  ⚠ Boot pending verification (%d attempts remaining)\n", bootCounter)
 	}

+	if st != nil && st.Phase != state.PhaseIdle {
+		fmt.Println("\nUpdate Lifecycle")
+		fmt.Println("───────────────────────────────────")
+		fmt.Printf("  Phase:         %s\n", st.Phase)
+		if st.FromVersion != "" {
+			fmt.Printf("  From version:  %s\n", st.FromVersion)
+		}
+		if st.ToVersion != "" {
+			fmt.Printf("  To version:    %s\n", st.ToVersion)
+		}
+		if !st.StartedAt.IsZero() {
+			fmt.Printf("  Started:       %s\n", st.StartedAt.Format("2006-01-02 15:04:05 MST"))
+		}
+		fmt.Printf("  Updated:       %s\n", st.UpdatedAt.Format("2006-01-02 15:04:05 MST"))
+		fmt.Printf("  Attempts:      %d\n", st.AttemptCount)
+		if st.LastError != "" {
+			fmt.Printf("  Last error:    %s\n", st.LastError)
+		}
+	}
+
 	return nil
 }
--- a/update/go.mod
+++ b/update/go.mod
@@ -1,3 +1,10 @@
 module github.com/portainer/kubesolo-os/update

 go 1.25.5
+
+require (
+	github.com/opencontainers/go-digest v1.0.0 // indirect
+	github.com/opencontainers/image-spec v1.1.1 // indirect
+	golang.org/x/sync v0.14.0 // indirect
+	oras.land/oras-go/v2 v2.6.0 // indirect
+)
--- a/update/go.sum
+++ b/update/go.sum
@@ -0,0 +1,8 @@
+github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
+github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
+github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040=
+github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M=
+golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ=
+golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
+oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc=
+oras.land/oras-go/v2 v2.6.0/go.mod h1:magiQDfG6H1O9APp+rOsvCPcW1GD2MM7vgnKY0Y+u1o=
--- a/update/main.go
+++ b/update/main.go
@@ -78,15 +78,28 @@ Commands:
  metrics      Start Prometheus-compatible metrics HTTP server

 Options:
-  --server URL     Update server URL (default: from /etc/kubesolo/update.conf)
-  --grubenv PATH   Path to grubenv file (default: /boot/grub/grubenv)
-  --timeout SECS   Health check timeout in seconds (default: 120)
-  --pubkey PATH    Ed25519 public key for signature verification (optional)
+  --server URL          HTTP update server (mutually exclusive with --registry)
+  --registry REPO       OCI registry repository, e.g. ghcr.io/portainer/kubesolo-os
+                        (mutually exclusive with --server)
+  --tag TAG             OCI tag to pull (default: channel name, then "stable")
+  --conf PATH           update.conf path (default: /etc/kubesolo/update.conf)
+  --state PATH          Update state file (default: /var/lib/kubesolo/update/state.json)
+  --channel NAME        Update channel (default: "stable", or value from update.conf)
+  --maintenance-window  HH:MM-HH:MM local time window; apply refuses outside it
+  --force               Bypass maintenance-window check
+  --grubenv PATH        Path to grubenv file (default: /boot/grub/grubenv)
+  --timeout SECS        Health check timeout in seconds (default: 120)
+  --pubkey PATH         Ed25519 public key for signature verification (optional)
+  --healthcheck-url URL Optional GET probe in healthcheck; 200 = pass
+  --auto-rollback-after N  healthcheck: rollback after N consecutive failures
+  --kube-system-settle N healthcheck: require kube-system pods Running ≥ N seconds
+  --json                For 'status': emit JSON instead of human-readable output

 Examples:
-  kubesolo-update check --server https://updates.example.com
-  kubesolo-update apply --server https://updates.example.com --pubkey /etc/kubesolo/update-pubkey.hex
+  kubesolo-update apply --server https://updates.example.com
+  kubesolo-update apply --registry ghcr.io/portainer/kubesolo-os --tag stable
+  kubesolo-update apply --force                      # uses /etc/kubesolo/update.conf
  kubesolo-update healthcheck
-  kubesolo-update status
+  kubesolo-update status --json
 `)
 }
--- a/update/pkg/config/config.go
+++ b/update/pkg/config/config.go
@@ -0,0 +1,105 @@
+// Package config parses /etc/kubesolo/update.conf — the persistent
+// configuration for the update agent. Each line is "key = value"; blank
+// lines and "#"-prefixed comments are ignored. Unknown keys are tolerated
+// (forward compatibility).
+//
+// Example:
+//
+//	# Where to look for updates
+//	server = https://updates.kubesolo.example.com
+//	channel = stable
+//
+//	# Only apply between 03:00 and 05:00 local time
+//	maintenance_window = 03:00-05:00
+//
+//	pubkey = /etc/kubesolo/update-pubkey.hex
+//
+// The file is populated on first boot by cloud-init (see the cloud-init
+// updates: block) and can be hand-edited afterwards.
+package config
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"strings"
+)
+
+// DefaultPath is where update.conf lives on a live system.
+const DefaultPath = "/etc/kubesolo/update.conf"
+
+// Config holds the parsed update.conf values. Empty fields mean "not set" —
+// the caller's defaults apply.
+type Config struct {
+	Server            string
+	Channel           string
+	MaintenanceWindow string
+	PubKey            string
+	// HealthcheckURL is an optional URL the healthcheck command will GET;
+	// 200 = pass, anything else = fail.
+	HealthcheckURL string
+	// AutoRollbackAfter is the number of consecutive post-boot healthcheck
+	// failures after which the agent will call Rollback automatically.
+	// 0 = disabled (default).
+	AutoRollbackAfter int
+}
+
+// Load reads and parses update.conf. A missing file returns an empty Config
+// (not an error) — fresh systems before cloud-init has run.
+func Load(path string) (*Config, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return &Config{}, nil
+		}
+		return nil, fmt.Errorf("open %s: %w", path, err)
+	}
+	defer f.Close()
+
+	c := &Config{}
+	scanner := bufio.NewScanner(f)
+	lineNo := 0
+	for scanner.Scan() {
+		lineNo++
+		line := strings.TrimSpace(scanner.Text())
+		if line == "" || strings.HasPrefix(line, "#") {
+			continue
+		}
+		eq := strings.IndexByte(line, '=')
+		if eq < 0 {
+			return nil, fmt.Errorf("%s:%d: missing '=' in line: %q", path, lineNo, line)
+		}
+		key := strings.TrimSpace(line[:eq])
+		value := strings.TrimSpace(line[eq+1:])
+		switch key {
+		case "server":
+			c.Server = value
+		case "channel":
+			c.Channel = value
+		case "maintenance_window":
+			c.MaintenanceWindow = value
+		case "pubkey":
+			c.PubKey = value
+		case "healthcheck_url":
+			c.HealthcheckURL = value
+		case "auto_rollback_after":
+			// Parse a small integer. Non-numeric values are silently
+			// ignored (forward compat); zero disables the feature.
+			n := 0
+			for _, ch := range value {
+				if ch >= '0' && ch <= '9' {
+					n = n*10 + int(ch-'0')
+				} else {
+					n = 0
+					break
+				}
+			}
+			c.AutoRollbackAfter = n
+		}
+		// Unknown keys are silently ignored for forward compatibility.
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, fmt.Errorf("read %s: %w", path, err)
+	}
+	return c, nil
+}
--- a/update/pkg/config/config_test.go
+++ b/update/pkg/config/config_test.go
@@ -0,0 +1,117 @@
+package config
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+func writeConf(t *testing.T, content string) string {
+	t.Helper()
+	path := filepath.Join(t.TempDir(), "update.conf")
+	if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+	return path
+}
+
+func TestLoadMissingReturnsEmptyConfig(t *testing.T) {
+	c, err := Load(filepath.Join(t.TempDir(), "does-not-exist.conf"))
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if c == nil {
+		t.Fatal("Load returned nil config")
+	}
+	if c.Server != "" || c.Channel != "" || c.MaintenanceWindow != "" || c.PubKey != "" {
+		t.Errorf("expected empty config, got %+v", c)
+	}
+}
+
+func TestLoadAllFields(t *testing.T) {
+	path := writeConf(t, `# comment line
+server = https://updates.example.com
+channel = stable
+maintenance_window = 03:00-05:00
+pubkey = /etc/kubesolo/pub.hex
+`)
+	c, err := Load(path)
+	if err != nil {
+		t.Fatalf("load: %v", err)
+	}
+	if c.Server != "https://updates.example.com" {
+		t.Errorf("server: got %q", c.Server)
+	}
+	if c.Channel != "stable" {
+		t.Errorf("channel: got %q", c.Channel)
+	}
+	if c.MaintenanceWindow != "03:00-05:00" {
+		t.Errorf("maintenance_window: got %q", c.MaintenanceWindow)
+	}
+	if c.PubKey != "/etc/kubesolo/pub.hex" {
+		t.Errorf("pubkey: got %q", c.PubKey)
+	}
+}
+
+func TestLoadIgnoresUnknownKeys(t *testing.T) {
+	// Unknown keys must not be an error — supports forward-compat config
+	// fields added by newer agent versions.
+	path := writeConf(t, `server = https://x
+future_field = whatever
+channel = beta
+`)
+	c, err := Load(path)
+	if err != nil {
+		t.Fatalf("load: %v", err)
+	}
+	if c.Server != "https://x" {
+		t.Errorf("server: got %q", c.Server)
+	}
+	if c.Channel != "beta" {
+		t.Errorf("channel: got %q", c.Channel)
+	}
+}
+
+func TestLoadStripsWhitespace(t *testing.T) {
+	path := writeConf(t, "   server   =   https://example   \n  channel=stable\n")
+	c, err := Load(path)
+	if err != nil {
+		t.Fatalf("load: %v", err)
+	}
+	if c.Server != "https://example" {
+		t.Errorf("server: got %q (whitespace not stripped?)", c.Server)
+	}
+	if c.Channel != "stable" {
+		t.Errorf("channel: got %q", c.Channel)
+	}
+}
+
+func TestLoadIgnoresBlankAndCommentLines(t *testing.T) {
+	path := writeConf(t, `
+# this is a comment
+
+server = https://example
+   # indented comment
+channel = stable
+
+`)
+	c, err := Load(path)
+	if err != nil {
+		t.Fatalf("load: %v", err)
+	}
+	if c.Server != "https://example" {
+		t.Errorf("server: got %q", c.Server)
+	}
+}
+
+func TestLoadRejectsMissingEquals(t *testing.T) {
+	// "noEqualsHere" with no '=' is a syntax error worth surfacing — likely
+	// indicates a corrupted config file.
+	path := writeConf(t, `server = https://example
+noEqualsHere
+`)
+	_, err := Load(path)
+	if err == nil {
+		t.Error("expected error on malformed line, got nil")
+	}
+}
--- a/update/pkg/config/version.go
+++ b/update/pkg/config/version.go
@@ -0,0 +1,60 @@
+package config
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+)
+
+// CompareVersions compares two semver-ish version strings.
+//
+// Accepts "v1.2.3", "1.2.3", "v1.2.3-rc1" (suffix ignored), with missing
+// components defaulting to 0 ("v1" == "1.0.0"). Returns -1 if a < b, 0 if
+// equal, +1 if a > b. Returns an error if either argument can't be parsed
+// at all.
+//
+// Used by apply.go to enforce MinCompatibleVersion. Pre-release suffix
+// handling is deliberately simple — we ignore it, treating "v1.2.3-rc1"
+// equal to "v1.2.3". Edge case: production releases should never carry
+// a pre-release suffix, and dev releases are the consumer's responsibility.
+func CompareVersions(a, b string) (int, error) {
+	pa, err := parseVersion(a)
+	if err != nil {
+		return 0, fmt.Errorf("parse %q: %w", a, err)
+	}
+	pb, err := parseVersion(b)
+	if err != nil {
+		return 0, fmt.Errorf("parse %q: %w", b, err)
+	}
+	for i := 0; i < 3; i++ {
+		if pa[i] < pb[i] {
+			return -1, nil
+		}
+		if pa[i] > pb[i] {
+			return 1, nil
+		}
+	}
+	return 0, nil
+}
+
+func parseVersion(s string) ([3]int, error) {
+	var out [3]int
+	s = strings.TrimSpace(s)
+	s = strings.TrimPrefix(s, "v")
+	// Drop pre-release suffix: "1.2.3-rc1" -> "1.2.3"
+	if i := strings.IndexAny(s, "-+"); i >= 0 {
+		s = s[:i]
+	}
+	parts := strings.SplitN(s, ".", 3)
+	for i, p := range parts {
+		n, err := strconv.Atoi(p)
+		if err != nil {
+			return out, fmt.Errorf("component %q not numeric", p)
+		}
+		if n < 0 {
+			return out, fmt.Errorf("component %d negative", n)
+		}
+		out[i] = n
+	}
+	return out, nil
+}
--- a/update/pkg/config/version_test.go
+++ b/update/pkg/config/version_test.go
@@ -0,0 +1,46 @@
+package config
+
+import "testing"
+
+func TestCompareVersions(t *testing.T) {
+	tests := []struct {
+		a, b string
+		want int
+	}{
+		{"v1.0.0", "v1.0.0", 0},
+		{"1.0.0", "v1.0.0", 0}, // 'v' prefix optional
+		{"v1.0.0", "v1.0.1", -1},
+		{"v1.0.1", "v1.0.0", 1},
+		{"v1.1.0", "v1.0.99", 1},
+		{"v2.0.0", "v1.99.99", 1},
+		{"v0.3.0-dev", "v0.3.0", 0},   // pre-release suffix ignored
+		{"v0.2.5", "v0.3.0", -1},
+		{"v0.3.0", "v0.2.999", 1},
+		{"v1.2", "v1.2.0", 0}, // missing component defaults to 0
+		{"v1", "v1.0.0", 0},
+	}
+	for _, tt := range tests {
+		got, err := CompareVersions(tt.a, tt.b)
+		if err != nil {
+			t.Errorf("CompareVersions(%q, %q): %v", tt.a, tt.b, err)
+			continue
+		}
+		if got != tt.want {
+			t.Errorf("CompareVersions(%q, %q) = %d, want %d", tt.a, tt.b, got, tt.want)
+		}
+	}
+}
+
+func TestCompareVersionsRejectsGarbage(t *testing.T) {
+	bad := []string{
+		"not-a-version",
+		"v.1.2",
+		"vabc",
+		"",
+	}
+	for _, s := range bad {
+		if _, err := CompareVersions(s, "v1.0.0"); err == nil {
+			t.Errorf("CompareVersions(%q, ...) accepted, want error", s)
+		}
+	}
+}
--- a/update/pkg/config/window.go
+++ b/update/pkg/config/window.go
@@ -0,0 +1,95 @@
+package config
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+	"time"
+)
+
+// Window is a parsed maintenance-window expression. Times are minutes since
+// midnight in the local timezone. When End < Start, the window wraps
+// midnight (e.g. 23:00-01:00 means 23:00 today through 01:00 tomorrow).
+//
+// The zero value (Start == End == 0) means "always allowed" — used for
+// the empty-string-meaning-no-window case.
+type Window struct {
+	Start int // minutes since midnight, [0, 1440)
+	End   int // minutes since midnight, [0, 1440)
+
+	// alwaysOpen distinguishes "no constraint" from "midnight to midnight"
+	// (the literal 00:00-00:00 window, which is a degenerate same-instant
+	// window). Set when ParseWindow is called with an empty string.
+	alwaysOpen bool
+}
+
+// AlwaysOpen returns true if this window imposes no constraint (the empty
+// string was parsed).
+func (w Window) AlwaysOpen() bool { return w.alwaysOpen }
+
+// ParseWindow parses "HH:MM-HH:MM" into a Window. Empty input returns an
+// AlwaysOpen window (no constraint). Whitespace around the input is tolerated.
+func ParseWindow(s string) (Window, error) {
+	s = strings.TrimSpace(s)
+	if s == "" {
+		return Window{alwaysOpen: true}, nil
+	}
+	parts := strings.SplitN(s, "-", 2)
+	if len(parts) != 2 {
+		return Window{}, fmt.Errorf("maintenance window %q: expected HH:MM-HH:MM", s)
+	}
+	start, err := parseHHMM(strings.TrimSpace(parts[0]))
+	if err != nil {
+		return Window{}, fmt.Errorf("maintenance window %q: start: %w", s, err)
+	}
+	end, err := parseHHMM(strings.TrimSpace(parts[1]))
+	if err != nil {
+		return Window{}, fmt.Errorf("maintenance window %q: end: %w", s, err)
+	}
+	return Window{Start: start, End: end}, nil
+}
+
+func parseHHMM(s string) (int, error) {
+	parts := strings.SplitN(s, ":", 2)
+	if len(parts) != 2 {
+		return 0, fmt.Errorf("%q: expected HH:MM", s)
+	}
+	h, err := strconv.Atoi(parts[0])
+	if err != nil || h < 0 || h > 23 {
+		return 0, fmt.Errorf("%q: invalid hour", s)
+	}
+	m, err := strconv.Atoi(parts[1])
+	if err != nil || m < 0 || m > 59 {
+		return 0, fmt.Errorf("%q: invalid minute", s)
+	}
+	return h*60 + m, nil
+}
+
+// Contains reports whether the given local time falls inside this window.
+// AlwaysOpen windows return true for any time.
+func (w Window) Contains(t time.Time) bool {
+	if w.alwaysOpen {
+		return true
+	}
+	now := t.Hour()*60 + t.Minute()
+	if w.Start == w.End {
+		// Degenerate: zero-length window. Never matches.
+		return false
+	}
+	if w.Start < w.End {
+		// Same-day window: [Start, End)
+		return now >= w.Start && now < w.End
+	}
+	// Wrapping window: [Start, 1440) ∪ [0, End)
+	return now >= w.Start || now < w.End
+}
+
+// String renders the window in HH:MM-HH:MM form for display. AlwaysOpen
+// renders as "always".
+func (w Window) String() string {
+	if w.alwaysOpen {
+		return "always"
+	}
+	return fmt.Sprintf("%02d:%02d-%02d:%02d",
+		w.Start/60, w.Start%60, w.End/60, w.End%60)
+}
--- a/update/pkg/config/window_test.go
+++ b/update/pkg/config/window_test.go
@@ -0,0 +1,120 @@
+package config
+
+import (
+	"testing"
+	"time"
+)
+
+func at(hour, min int) time.Time {
+	return time.Date(2026, 1, 1, hour, min, 0, 0, time.UTC)
+}
+
+func TestParseWindowEmpty(t *testing.T) {
+	w, err := ParseWindow("")
+	if err != nil {
+		t.Fatalf("empty window: %v", err)
+	}
+	if !w.AlwaysOpen() {
+		t.Error("empty input should produce AlwaysOpen window")
+	}
+	if !w.Contains(at(3, 0)) {
+		t.Error("AlwaysOpen window should contain any time")
+	}
+	if !w.Contains(at(23, 59)) {
+		t.Error("AlwaysOpen window should contain end-of-day")
+	}
+}
+
+func TestParseWindowSameDay(t *testing.T) {
+	w, err := ParseWindow("03:00-05:00")
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	tests := []struct {
+		hour, min int
+		want      bool
+	}{
+		{2, 59, false}, // just before
+		{3, 0, true},   // start (inclusive)
+		{4, 30, true},  // middle
+		{4, 59, true},  // just before end
+		{5, 0, false},  // end (exclusive)
+		{15, 0, false}, // far outside
+	}
+	for _, tt := range tests {
+		got := w.Contains(at(tt.hour, tt.min))
+		if got != tt.want {
+			t.Errorf("Contains(%02d:%02d) = %v, want %v", tt.hour, tt.min, got, tt.want)
+		}
+	}
+}
+
+func TestParseWindowWrappingMidnight(t *testing.T) {
+	w, err := ParseWindow("23:00-01:00")
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	tests := []struct {
+		hour, min int
+		want      bool
+	}{
+		{22, 59, false}, // just before
+		{23, 0, true},   // start (inclusive)
+		{23, 30, true},  // night-before
+		{0, 0, true},    // midnight
+		{0, 30, true},   // early morning
+		{0, 59, true},   // just before end
+		{1, 0, false},   // end (exclusive)
+		{12, 0, false},  // far outside (noon)
+	}
+	for _, tt := range tests {
+		got := w.Contains(at(tt.hour, tt.min))
+		if got != tt.want {
+			t.Errorf("Contains(%02d:%02d) wrapping = %v, want %v", tt.hour, tt.min, got, tt.want)
+		}
+	}
+}
+
+func TestParseWindowDegenerateZeroLength(t *testing.T) {
+	// 05:00-05:00 is a zero-length window — should never match. Different
+	// from "always" (empty string).
+	w, err := ParseWindow("05:00-05:00")
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if w.AlwaysOpen() {
+		t.Error("05:00-05:00 must not be AlwaysOpen")
+	}
+	if w.Contains(at(5, 0)) {
+		t.Error("zero-length window must not contain its own boundary")
+	}
+}
+
+func TestParseWindowRejectsBadInput(t *testing.T) {
+	bad := []string{
+		"notatime",
+		"03:00",        // no end
+		"03:00-",       // empty end
+		"03:00-05",     // missing minutes
+		"24:00-05:00",  // hour out of range
+		"03:60-05:00",  // minute out of range
+		"abc:00-05:00", // non-numeric
+	}
+	for _, s := range bad {
+		_, err := ParseWindow(s)
+		if err == nil {
+			t.Errorf("ParseWindow(%q) accepted, want error", s)
+		}
+	}
+}
+
+func TestWindowString(t *testing.T) {
+	w, _ := ParseWindow("03:05-05:45")
+	if w.String() != "03:05-05:45" {
+		t.Errorf("String = %q, want 03:05-05:45", w.String())
+	}
+	always, _ := ParseWindow("")
+	if always.String() != "always" {
+		t.Errorf("AlwaysOpen.String = %q, want 'always'", always.String())
+	}
+}
--- a/update/pkg/health/extended.go
+++ b/update/pkg/health/extended.go
@@ -0,0 +1,125 @@
+package health
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+// kubeSystemSettleSeconds is how long all kube-system pods must hold a
+// Running phase before we consider the cluster genuinely up. Catches the
+// "pod just started, will crash-loop in 5s" case.
+const kubeSystemSettleSeconds = 30
+
+// CheckKubeSystemReady verifies that every pod in the kube-system namespace
+// is in Running phase and has been Running for at least settle. Returns
+// (ready, error). settle defaults to 30s when zero.
+func (c *Checker) CheckKubeSystemReady(settle time.Duration) bool {
+	if settle == 0 {
+		settle = kubeSystemSettleSeconds * time.Second
+	}
+	if _, err := os.Stat(c.kubeconfigPath); err != nil {
+		return false
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	// jsonpath emits one line per pod: <phase>|<startTime>
+	cmd := exec.CommandContext(ctx, "kubectl",
+		"--kubeconfig", c.kubeconfigPath,
+		"get", "pods", "-n", "kube-system",
+		"-o", `jsonpath={range .items[*]}{.status.phase}|{.status.startTime}{"\n"}{end}`,
+	)
+	out, err := cmd.Output()
+	if err != nil {
+		return false
+	}
+	lines := strings.Split(strings.TrimSpace(string(out)), "\n")
+	if len(lines) == 0 || lines[0] == "" {
+		// No pods reported. Conservatively treat as not-ready: kube-system
+		// is expected to host at least CoreDNS + pause.
+		return false
+	}
+	now := time.Now()
+	for _, line := range lines {
+		parts := strings.SplitN(line, "|", 2)
+		phase := strings.TrimSpace(parts[0])
+		if phase != "Running" {
+			return false
+		}
+		if len(parts) < 2 {
+			return false
+		}
+		start, perr := time.Parse(time.RFC3339, strings.TrimSpace(parts[1]))
+		if perr != nil {
+			return false
+		}
+		if now.Sub(start) < settle {
+			return false
+		}
+	}
+	return true
+}
+
+// CheckProbeURL fetches the given URL and reports whether it returned 200.
+// Empty url returns (true, nil) — the check is opt-in.
+func CheckProbeURL(url string) (bool, error) {
+	if url == "" {
+		return true, nil
+	}
+	client := &http.Client{Timeout: 5 * time.Second}
+	resp, err := client.Get(url)
+	if err != nil {
+		return false, fmt.Errorf("probe URL %s: %w", url, err)
+	}
+	defer resp.Body.Close()
+	return resp.StatusCode == http.StatusOK, nil
+}
+
+// CheckDiskWritable writes a small file under dataDir, fsyncs, reads it back,
+// and removes it. Confirms the data partition is mounted read-write and the
+// underlying disk is responsive. Empty dataDir defaults to /var/lib/kubesolo.
+func CheckDiskWritable(dataDir string) (bool, error) {
+	if dataDir == "" {
+		dataDir = "/var/lib/kubesolo"
+	}
+	if _, err := os.Stat(dataDir); err != nil {
+		// Data partition not mounted? That's catastrophic but we shouldn't
+		// claim the disk is fine.
+		return false, fmt.Errorf("dataDir %s: %w", dataDir, err)
+	}
+	probe := filepath.Join(dataDir, ".update-probe")
+	want := []byte("kubesolo-os healthcheck probe")
+
+	f, err := os.Create(probe)
+	if err != nil {
+		return false, fmt.Errorf("create probe: %w", err)
+	}
+	defer os.Remove(probe)
+
+	if _, err := f.Write(want); err != nil {
+		f.Close()
+		return false, fmt.Errorf("write probe: %w", err)
+	}
+	if err := f.Sync(); err != nil {
+		f.Close()
+		return false, fmt.Errorf("fsync probe: %w", err)
+	}
+	if err := f.Close(); err != nil {
+		return false, fmt.Errorf("close probe: %w", err)
+	}
+
+	got, err := os.ReadFile(probe)
+	if err != nil {
+		return false, fmt.Errorf("read probe: %w", err)
+	}
+	if string(got) != string(want) {
+		return false, fmt.Errorf("probe content mismatch: got %q", got)
+	}
+	return true, nil
+}
--- a/update/pkg/health/extended_test.go
+++ b/update/pkg/health/extended_test.go
@@ -0,0 +1,77 @@
+package health
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+func TestCheckProbeURLEmptyAlwaysPasses(t *testing.T) {
+	ok, err := CheckProbeURL("")
+	if err != nil {
+		t.Fatalf("CheckProbeURL(\"\"): %v", err)
+	}
+	if !ok {
+		t.Error("empty probe URL should return ok=true (check disabled)")
+	}
+}
+
+func TestCheckProbeURL200(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer srv.Close()
+	ok, err := CheckProbeURL(srv.URL)
+	if err != nil {
+		t.Fatalf("CheckProbeURL: %v", err)
+	}
+	if !ok {
+		t.Error("expected ok=true on 200")
+	}
+}
+
+func TestCheckProbeURLNon200(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusServiceUnavailable)
+	}))
+	defer srv.Close()
+	ok, err := CheckProbeURL(srv.URL)
+	if err != nil {
+		t.Fatalf("CheckProbeURL: %v", err)
+	}
+	if ok {
+		t.Error("expected ok=false on 503")
+	}
+}
+
+func TestCheckProbeURLNetworkError(t *testing.T) {
+	// Port 1 is reserved (tcpmux) and never bound by Linux defaults.
+	_, err := CheckProbeURL("http://127.0.0.1:1")
+	if err == nil {
+		t.Error("expected error for unreachable URL, got nil")
+	}
+}
+
+func TestCheckDiskWritableHappyPath(t *testing.T) {
+	dir := t.TempDir()
+	ok, err := CheckDiskWritable(dir)
+	if err != nil {
+		t.Fatalf("CheckDiskWritable: %v", err)
+	}
+	if !ok {
+		t.Error("expected ok=true on writable temp dir")
+	}
+	// Probe file should have been cleaned up.
+	if _, err := os.Stat(filepath.Join(dir, ".update-probe")); !os.IsNotExist(err) {
+		t.Errorf("probe file not cleaned up: stat err=%v", err)
+	}
+}
+
+func TestCheckDiskWritableMissingDir(t *testing.T) {
+	_, err := CheckDiskWritable("/this/path/does/not/exist")
+	if err == nil {
+		t.Error("expected error for missing dataDir, got nil")
+	}
+}
--- a/update/pkg/health/health.go
+++ b/update/pkg/health/health.go
@@ -24,15 +24,20 @@ import (

 // Status represents the result of a health check.
 type Status struct {
-	Containerd bool
-	APIServer  bool
-	NodeReady  bool
-	Message    string
+	Containerd       bool
+	APIServer        bool
+	NodeReady        bool
+	KubeSystemReady  bool // optional — true unless KubeSystemSettle is non-zero
+	ProbeURL         bool // optional — true unless ProbeURL is set
+	DiskWritable     bool // optional — true unless DataDir is set
+	Message          string
 }

-// IsHealthy returns true if all checks passed.
+// IsHealthy returns true if all required checks passed. Optional checks
+// default to true when not configured, so they don't block the result.
 func (s *Status) IsHealthy() bool {
-	return s.Containerd && s.APIServer && s.NodeReady
+	return s.Containerd && s.APIServer && s.NodeReady &&
+		s.KubeSystemReady && s.ProbeURL && s.DiskWritable
 }

 // Checker performs health checks against the local KubeSolo instance.
@@ -40,6 +45,11 @@ type Checker struct {
 	kubeconfigPath string
 	apiServerAddr  string
 	timeout        time.Duration
+
+	// Optional gates. Zero values disable the check (it reports true).
+	KubeSystemSettle time.Duration
+	ProbeURL         string
+	DataDir          string
 }

 // NewChecker creates a health checker.
@@ -149,12 +159,37 @@ func (c *Checker) CheckNodeReady() bool {
 }

 // RunAll performs all health checks and returns the combined status.
+//
+// Optional checks (kube-system settle, user probe URL, disk writability) are
+// only run if the corresponding Checker fields are set; otherwise they
+// report true so as not to block the result.
 func (c *Checker) RunAll() *Status {
-	return &Status{
-		Containerd: c.CheckContainerd(),
-		APIServer:  c.CheckAPIServer(),
-		NodeReady:  c.CheckNodeReady(),
+	s := &Status{
+		Containerd:      c.CheckContainerd(),
+		APIServer:       c.CheckAPIServer(),
+		NodeReady:       c.CheckNodeReady(),
+		KubeSystemReady: true,
+		ProbeURL:        true,
+		DiskWritable:    true,
 	}
+	if c.KubeSystemSettle > 0 {
+		s.KubeSystemReady = c.CheckKubeSystemReady(c.KubeSystemSettle)
+	}
+	if c.ProbeURL != "" {
+		ok, err := CheckProbeURL(c.ProbeURL)
+		if err != nil {
+			slog.Warn("probe URL check failed", "url", c.ProbeURL, "error", err)
+		}
+		s.ProbeURL = ok
+	}
+	if c.DataDir != "" {
+		ok, err := CheckDiskWritable(c.DataDir)
+		if err != nil {
+			slog.Warn("disk writability check failed", "dir", c.DataDir, "error", err)
+		}
+		s.DiskWritable = ok
+	}
+	return s
 }

 // WaitForHealthy polls health checks until all pass or timeout expires.
--- a/update/pkg/health/health_test.go
+++ b/update/pkg/health/health_test.go
@@ -6,36 +6,42 @@ import (
 )

 func TestStatusIsHealthy(t *testing.T) {
+	// Helper for the new 6-field Status: all-true except the named one.
+	allBut := func(field string) Status {
+		s := Status{
+			Containerd: true, APIServer: true, NodeReady: true,
+			KubeSystemReady: true, ProbeURL: true, DiskWritable: true,
+		}
+		switch field {
+		case "Containerd":
+			s.Containerd = false
+		case "APIServer":
+			s.APIServer = false
+		case "NodeReady":
+			s.NodeReady = false
+		case "KubeSystemReady":
+			s.KubeSystemReady = false
+		case "ProbeURL":
+			s.ProbeURL = false
+		case "DiskWritable":
+			s.DiskWritable = false
+		}
+		return s
+	}
+
 	tests := []struct {
 		name       string
 		status     Status
 		wantHealth bool
 	}{
-		{
-			name:       "all healthy",
-			status:     Status{Containerd: true, APIServer: true, NodeReady: true},
-			wantHealth: true,
-		},
-		{
-			name:       "containerd down",
-			status:     Status{Containerd: false, APIServer: true, NodeReady: true},
-			wantHealth: false,
-		},
-		{
-			name:       "apiserver down",
-			status:     Status{Containerd: true, APIServer: false, NodeReady: true},
-			wantHealth: false,
-		},
-		{
-			name:       "node not ready",
-			status:     Status{Containerd: true, APIServer: true, NodeReady: false},
-			wantHealth: false,
-		},
-		{
-			name:       "all down",
-			status:     Status{Containerd: false, APIServer: false, NodeReady: false},
-			wantHealth: false,
-		},
+		{"all healthy", allBut(""), true},
+		{"containerd down", allBut("Containerd"), false},
+		{"apiserver down", allBut("APIServer"), false},
+		{"node not ready", allBut("NodeReady"), false},
+		{"kube-system not ready", allBut("KubeSystemReady"), false},
+		{"probe URL failed", allBut("ProbeURL"), false},
+		{"disk not writable", allBut("DiskWritable"), false},
+		{"all down", Status{}, false},
 	}

 	for _, tt := range tests {
--- a/update/pkg/health/preflight.go
+++ b/update/pkg/health/preflight.go
@@ -0,0 +1,51 @@
+package health
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+	"strings"
+	"time"
+)
+
+// NodeBlockLabel is the well-known label that workload authors set on the
+// local node to defer an OS update. When present and "true", apply refuses.
+const NodeBlockLabel = "updates.kubesolo.io/block"
+
+// CheckNodeBlocked returns (blocked, error). blocked==true means the local
+// node carries the updates.kubesolo.io/block=true label and the caller should
+// refuse the update.
+//
+// If the kubeconfig is not available (offline / pre-boot / air-gap), this
+// returns (false, nil) — silently allowing the update. That's the safe
+// behaviour for the air-gap case where the node may not be reachable from
+// the agent's perspective.
+func CheckNodeBlocked(kubeconfigPath string) (bool, error) {
+	if kubeconfigPath == "" {
+		kubeconfigPath = "/var/lib/kubesolo/pki/admin/admin.kubeconfig"
+	}
+	if _, err := os.Stat(kubeconfigPath); err != nil {
+		// No kubeconfig — assume air-gap / pre-K8s. Don't block updates.
+		return false, nil
+	}
+
+	// Query the node label via kubectl. We don't know the node name a
+	// priori, so we use --kubeconfig on the local admin config and ask for
+	// "the only node" (KubeSolo is single-node by design).
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	cmd := exec.CommandContext(ctx, "kubectl",
+		"--kubeconfig", kubeconfigPath,
+		"get", "node",
+		"-o", `jsonpath={.items[0].metadata.labels.updates\.kubesolo\.io/block}`)
+	out, err := cmd.Output()
+	if err != nil {
+		// API unreachable or no nodes — treat as not blocked (analogous to
+		// the kubeconfig-missing case). We still surface the error so the
+		// caller can decide to log it.
+		return false, fmt.Errorf("query node label: %w", err)
+	}
+	return strings.TrimSpace(string(out)) == "true", nil
+}
--- a/update/pkg/image/image.go
+++ b/update/pkg/image/image.go
@@ -35,6 +35,24 @@ type UpdateMetadata struct {
 	MetadataSigURL  string `json:"metadata_sig_url,omitempty"`
 	ReleaseNotes    string `json:"release_notes,omitempty"`
 	ReleaseDate     string `json:"release_date,omitempty"`
+
+	// Channel labels this artifact ("stable", "beta", "edge", ...). The agent
+	// refuses metadata whose channel doesn't match the locally-configured
+	// one. Empty in metadata means "no channel constraint, accept anything".
+	Channel string `json:"channel,omitempty"`
+
+	// MinCompatibleVersion is the lowest version that can upgrade to this
+	// one. The agent refuses to apply if the currently-running version is
+	// below this. Used for stepping-stone migrations (e.g. 0.2.x -> 0.3.x
+	// requires 0.2.5+ to land the state-file format first). Empty means
+	// "any source version OK".
+	MinCompatibleVersion string `json:"min_compatible_version,omitempty"`
+
+	// Architecture restricts this artifact to a specific GOARCH ("amd64",
+	// "arm64"). Empty means the artifact is arch-agnostic — which is rare
+	// since the kernel + initramfs are arch-specific; this should normally
+	// be populated by the build pipeline.
+	Architecture string `json:"architecture,omitempty"`
 }

 // StagedImage represents downloaded and verified update files.
--- a/update/pkg/metrics/metrics.go
+++ b/update/pkg/metrics/metrics.go
@@ -11,6 +11,9 @@
 //	kubesolo_os_update_last_check_timestamp_seconds  unix timestamp (gauge)
 //	kubesolo_os_memory_total_bytes                   total RAM (gauge)
 //	kubesolo_os_memory_available_bytes               available RAM (gauge)
+//	kubesolo_update_phase{phase}                     1 for current phase, 0 for others
+//	kubesolo_update_attempts_total                   counter — attempts at current ToVersion
+//	kubesolo_update_last_attempt_timestamp_seconds   unix timestamp of last state update
 //
 // This is a zero-dependency implementation — no Prometheus client library needed.
 // It serves metrics in the Prometheus text exposition format.
@@ -25,11 +28,14 @@ import (
 	"strings"
 	"sync"
 	"time"
+
+	"github.com/portainer/kubesolo-os/update/pkg/state"
 )

 // Server is a lightweight Prometheus metrics HTTP server.
 type Server struct {
 	grubenvPath string
+	statePath   string
 	listenAddr  string
 	startTime   time.Time

@@ -47,6 +53,27 @@ func NewServer(listenAddr, grubenvPath string) *Server {
 	}
 }

+// SetStatePath sets the location of the update state.json file. If empty or
+// unset, state-derived metrics are emitted with the Idle defaults.
+func (s *Server) SetStatePath(p string) {
+	s.statePath = p
+}
+
+// allPhases lists every Phase value we emit as a kubesolo_update_phase
+// time-series, so consumers see all label values (with value 0 for non-current
+// phases). Mirror of validPhases in pkg/state.
+var allPhases = []state.Phase{
+	state.PhaseIdle,
+	state.PhaseChecking,
+	state.PhaseDownloading,
+	state.PhaseStaged,
+	state.PhaseActivated,
+	state.PhaseVerifying,
+	state.PhaseSuccess,
+	state.PhaseRolledBack,
+	state.PhaseFailed,
+}
+
 // SetUpdateAvailable records whether an update is available.
 func (s *Server) SetUpdateAvailable(available bool) {
 	s.mu.Lock()
@@ -125,9 +152,49 @@ func (s *Server) handleMetrics(w http.ResponseWriter, r *http.Request) {
 	sb.WriteString("# TYPE kubesolo_os_memory_available_bytes gauge\n")
 	sb.WriteString(fmt.Sprintf("kubesolo_os_memory_available_bytes %d\n", memAvail))

+	// Update lifecycle (from state.json)
+	s.writeUpdateStateMetrics(&sb)
+
 	fmt.Fprint(w, sb.String())
 }

+// writeUpdateStateMetrics appends update-lifecycle metrics derived from the
+// state.json file. If the file is missing or unreadable, emits the Idle
+// defaults so the metric series exists at all times.
+func (s *Server) writeUpdateStateMetrics(sb *strings.Builder) {
+	current := state.PhaseIdle
+	var attempts int
+	var lastTS float64
+
+	if s.statePath != "" {
+		if st, err := state.Load(s.statePath); err == nil && st != nil {
+			current = st.Phase
+			attempts = st.AttemptCount
+			if !st.UpdatedAt.IsZero() {
+				lastTS = float64(st.UpdatedAt.Unix())
+			}
+		}
+	}
+
+	sb.WriteString("# HELP kubesolo_update_phase Current update lifecycle phase (1 for active, 0 otherwise).\n")
+	sb.WriteString("# TYPE kubesolo_update_phase gauge\n")
+	for _, p := range allPhases {
+		v := 0
+		if p == current {
+			v = 1
+		}
+		sb.WriteString(fmt.Sprintf("kubesolo_update_phase{phase=%q} %d\n", string(p), v))
+	}
+
+	sb.WriteString("# HELP kubesolo_update_attempts_total Number of update attempts at the current target version.\n")
+	sb.WriteString("# TYPE kubesolo_update_attempts_total counter\n")
+	sb.WriteString(fmt.Sprintf("kubesolo_update_attempts_total %d\n", attempts))
+
+	sb.WriteString("# HELP kubesolo_update_last_attempt_timestamp_seconds Unix timestamp of the last state transition.\n")
+	sb.WriteString("# TYPE kubesolo_update_last_attempt_timestamp_seconds gauge\n")
+	sb.WriteString(fmt.Sprintf("kubesolo_update_last_attempt_timestamp_seconds %.0f\n", lastTS))
+}
+
 // readGrubenvVar reads a single variable from grubenv using simple file parse.
 func (s *Server) readGrubenvVar(key string) string {
 	data, err := os.ReadFile(s.grubenvPath)
--- a/update/pkg/metrics/metrics_test.go
+++ b/update/pkg/metrics/metrics_test.go
@@ -8,6 +8,8 @@ import (
 	"path/filepath"
 	"strings"
 	"testing"
+
+	"github.com/portainer/kubesolo-os/update/pkg/state"
 )

 func TestNewServer(t *testing.T) {
@@ -247,6 +249,86 @@ func TestSafeInt(t *testing.T) {
 	}
 }

+func TestUpdateStateMetricsAbsentStateFile(t *testing.T) {
+	// No state path set — should emit Idle defaults so the metric series
+	// exists from first boot.
+	s := NewServer(":9100", "/tmp/nonexistent")
+
+	req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
+	w := httptest.NewRecorder()
+	s.handleMetrics(w, req)
+
+	body, _ := io.ReadAll(w.Result().Body)
+	output := string(body)
+
+	if !strings.Contains(output, `kubesolo_update_phase{phase="idle"} 1`) {
+		t.Errorf("expected idle=1 with no state file, got:\n%s", output)
+	}
+	if !strings.Contains(output, `kubesolo_update_phase{phase="checking"} 0`) {
+		t.Errorf("expected checking=0 with no state file, got:\n%s", output)
+	}
+	if !strings.Contains(output, "kubesolo_update_attempts_total 0") {
+		t.Errorf("expected attempts=0 with no state file, got:\n%s", output)
+	}
+}
+
+func TestUpdateStateMetricsActivePhase(t *testing.T) {
+	dir := t.TempDir()
+	statePath := filepath.Join(dir, "state.json")
+
+	st := state.New()
+	if err := st.Transition(statePath, state.PhaseDownloading, "v0.3.0", ""); err != nil {
+		t.Fatalf("seed state: %v", err)
+	}
+
+	s := NewServer(":9100", "/tmp/nonexistent")
+	s.SetStatePath(statePath)
+
+	req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
+	w := httptest.NewRecorder()
+	s.handleMetrics(w, req)
+
+	body, _ := io.ReadAll(w.Result().Body)
+	output := string(body)
+
+	if !strings.Contains(output, `kubesolo_update_phase{phase="downloading"} 1`) {
+		t.Errorf("expected downloading=1, got:\n%s", output)
+	}
+	if !strings.Contains(output, `kubesolo_update_phase{phase="idle"} 0`) {
+		t.Errorf("expected idle=0 when downloading is active, got:\n%s", output)
+	}
+	if !strings.Contains(output, "kubesolo_update_attempts_total 1") {
+		t.Errorf("expected attempts=1 after first Transition, got:\n%s", output)
+	}
+	if strings.Contains(output, "kubesolo_update_last_attempt_timestamp_seconds 0\n") {
+		t.Errorf("expected non-zero timestamp after state write, got:\n%s", output)
+	}
+}
+
+func TestUpdateStateMetricsAllPhasesEmitted(t *testing.T) {
+	// Every phase value should appear in the output, so dashboards can graph
+	// the series cleanly.
+	s := NewServer(":9100", "/tmp/nonexistent")
+
+	req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
+	w := httptest.NewRecorder()
+	s.handleMetrics(w, req)
+
+	body, _ := io.ReadAll(w.Result().Body)
+	output := string(body)
+
+	for _, p := range []state.Phase{
+		state.PhaseIdle, state.PhaseChecking, state.PhaseDownloading,
+		state.PhaseStaged, state.PhaseActivated, state.PhaseVerifying,
+		state.PhaseSuccess, state.PhaseRolledBack, state.PhaseFailed,
+	} {
+		needle := `kubesolo_update_phase{phase="` + string(p) + `"}`
+		if !strings.Contains(output, needle) {
+			t.Errorf("phase %q not present in metrics output", p)
+		}
+	}
+}
+
 func TestReadFileString(t *testing.T) {
 	dir := t.TempDir()

--- a/update/pkg/oci/oci.go
+++ b/update/pkg/oci/oci.go
@@ -0,0 +1,281 @@
+// Package oci pulls KubeSolo OS update artifacts from an OCI-compliant
+// container registry (e.g. ghcr.io). It is the registry-native alternative
+// to the legacy HTTP `latest.json` protocol implemented in pkg/image.
+//
+// # Artifact layout
+//
+// An update is published as a single OCI artifact under a tag like
+// `stable` or `v0.3.0`. The tag may point at either:
+//
+//   - A manifest index (preferred) containing per-architecture manifests.
+//     The agent picks the one matching runtime.GOARCH.
+//   - A single manifest (used for arch-specific tags such as
+//     `v0.3.0-amd64`). The agent verifies architecture against the
+//     manifest's platform annotation before trusting it.
+//
+// Each per-architecture manifest carries two layers:
+//
+//	application/vnd.kubesolo.os.kernel.v1+octet-stream     // vmlinuz / Image
+//	application/vnd.kubesolo.os.initramfs.v1+gzip          // kubesolo-os.gz
+//
+// And these annotations (read into image.UpdateMetadata):
+//
+//	io.kubesolo.os.version                "v0.3.0"
+//	io.kubesolo.os.channel                "stable"
+//	io.kubesolo.os.min_compatible_version "v0.2.0"
+//	io.kubesolo.os.architecture           "amd64"
+//	io.kubesolo.os.release_notes          (optional, short)
+//	io.kubesolo.os.release_date           (optional, RFC3339)
+//
+// The agent ignores any additional layers, so the same image can also be
+// shaped as a "scratch" container if the build pipeline finds that convenient
+// for ecosystem tooling.
+package oci
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"runtime"
+
+	"github.com/opencontainers/go-digest"
+	ocispec "github.com/opencontainers/image-spec/specs-go/v1"
+	"oras.land/oras-go/v2/content"
+	"oras.land/oras-go/v2/registry/remote"
+
+	"github.com/portainer/kubesolo-os/update/pkg/image"
+)
+
+// Media types used on KubeSolo OS update artifacts. Kept here (not in
+// pkg/image) so the OCI protocol surface is fully self-contained.
+const (
+	MediaKernel    = "application/vnd.kubesolo.os.kernel.v1+octet-stream"
+	MediaInitramfs = "application/vnd.kubesolo.os.initramfs.v1+gzip"
+
+	AnnotVersion     = "io.kubesolo.os.version"
+	AnnotChannel     = "io.kubesolo.os.channel"
+	AnnotMinVersion  = "io.kubesolo.os.min_compatible_version"
+	AnnotArch        = "io.kubesolo.os.architecture"
+	AnnotReleaseNote = "io.kubesolo.os.release_notes"
+	AnnotReleaseDate = "io.kubesolo.os.release_date"
+)
+
+// Client pulls artifacts from a single OCI repository (e.g.
+// `ghcr.io/portainer/kubesolo-os`).
+//
+// Anonymous (public-pull) access is supported out of the box. For private
+// repositories, configure auth via the underlying remote.Repository.Client
+// before passing it to Resolve/Pull — that hook isn't surfaced here yet
+// (deferred until we actually need it for a private fleet).
+type Client struct {
+	repo *remote.Repository
+	// Arch is the architecture string we match against manifest indexes.
+	// Defaults to runtime.GOARCH; overridable for testing.
+	Arch string
+}
+
+// NewClient parses a repository reference of the form `host/path` (no tag)
+// and returns a ready-to-use Client.
+func NewClient(repoRef string) (*Client, error) {
+	repo, err := remote.NewRepository(repoRef)
+	if err != nil {
+		return nil, fmt.Errorf("invalid OCI reference %q: %w", repoRef, err)
+	}
+	// remote.NewRepository defaults to HTTPS. PlainHTTP is set per-test
+	// via the WithPlainHTTP option when we hit a httptest.Server.
+	return &Client{repo: repo, Arch: runtime.GOARCH}, nil
+}
+
+// WithPlainHTTP toggles the underlying registry transport to HTTP. Useful for
+// httptest-driven unit tests; do not use against production registries.
+func (c *Client) WithPlainHTTP(plain bool) *Client {
+	c.repo.PlainHTTP = plain
+	return c
+}
+
+// FetchMetadata resolves the tag, walks index → manifest if needed, and
+// returns an image.UpdateMetadata populated from the manifest's annotations.
+// No blobs are downloaded — this is the cheap "what's available" probe.
+func (c *Client) FetchMetadata(ctx context.Context, tag string) (*image.UpdateMetadata, error) {
+	manifest, _, err := c.resolveArchManifest(ctx, tag)
+	if err != nil {
+		return nil, err
+	}
+	return metadataFromAnnotations(manifest.Annotations), nil
+}
+
+// Pull resolves the tag, picks the matching-architecture manifest, downloads
+// the kernel + initramfs layers to `stageDir`, verifies their digests, and
+// returns a StagedImage compatible with the existing pkg/image consumer.
+func (c *Client) Pull(ctx context.Context, tag, stageDir string) (*image.StagedImage, *image.UpdateMetadata, error) {
+	manifest, _, err := c.resolveArchManifest(ctx, tag)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	if err := os.MkdirAll(stageDir, 0o755); err != nil {
+		return nil, nil, fmt.Errorf("create stage dir: %w", err)
+	}
+
+	var kernelPath, initramfsPath string
+	for _, layer := range manifest.Layers {
+		switch layer.MediaType {
+		case MediaKernel:
+			kernelPath = filepath.Join(stageDir, "vmlinuz")
+			if err := c.fetchBlobTo(ctx, layer, kernelPath); err != nil {
+				return nil, nil, fmt.Errorf("download kernel: %w", err)
+			}
+		case MediaInitramfs:
+			initramfsPath = filepath.Join(stageDir, "kubesolo-os.gz")
+			if err := c.fetchBlobTo(ctx, layer, initramfsPath); err != nil {
+				return nil, nil, fmt.Errorf("download initramfs: %w", err)
+			}
+		default:
+			slog.Debug("oci: skipping unknown layer", "media", layer.MediaType)
+		}
+	}
+
+	if kernelPath == "" {
+		return nil, nil, fmt.Errorf("manifest has no %s layer", MediaKernel)
+	}
+	if initramfsPath == "" {
+		return nil, nil, fmt.Errorf("manifest has no %s layer", MediaInitramfs)
+	}
+
+	meta := metadataFromAnnotations(manifest.Annotations)
+	staged := &image.StagedImage{
+		VmlinuzPath:   kernelPath,
+		InitramfsPath: initramfsPath,
+		Version:       meta.Version,
+	}
+	return staged, meta, nil
+}
+
+// resolveArchManifest fetches the descriptor at `tag`, walks an index if
+// present, and returns the platform-specific manifest matching c.Arch.
+func (c *Client) resolveArchManifest(ctx context.Context, tag string) (*ocispec.Manifest, *ocispec.Descriptor, error) {
+	desc, err := c.repo.Resolve(ctx, tag)
+	if err != nil {
+		return nil, nil, fmt.Errorf("resolve tag %q: %w", tag, err)
+	}
+
+	switch desc.MediaType {
+	case ocispec.MediaTypeImageIndex, "application/vnd.docker.distribution.manifest.list.v2+json":
+		index, err := fetchJSON[ocispec.Index](ctx, c.repo, desc)
+		if err != nil {
+			return nil, nil, fmt.Errorf("fetch index: %w", err)
+		}
+		var matched *ocispec.Descriptor
+		for i := range index.Manifests {
+			m := &index.Manifests[i]
+			if m.Platform != nil && m.Platform.Architecture == c.Arch {
+				matched = m
+				break
+			}
+		}
+		if matched == nil {
+			return nil, nil, fmt.Errorf("no manifest in index for architecture %q", c.Arch)
+		}
+		manifest, err := fetchJSON[ocispec.Manifest](ctx, c.repo, *matched)
+		if err != nil {
+			return nil, nil, fmt.Errorf("fetch manifest: %w", err)
+		}
+		return manifest, matched, nil
+
+	case ocispec.MediaTypeImageManifest, "application/vnd.docker.distribution.manifest.v2+json":
+		manifest, err := fetchJSON[ocispec.Manifest](ctx, c.repo, desc)
+		if err != nil {
+			return nil, nil, fmt.Errorf("fetch manifest: %w", err)
+		}
+		// Single-arch tag: if it declares an arch, enforce match.
+		if archAnnot := manifest.Annotations[AnnotArch]; archAnnot != "" && archAnnot != c.Arch {
+			return nil, nil, fmt.Errorf("single-arch manifest is %q, want %q", archAnnot, c.Arch)
+		}
+		return manifest, &desc, nil
+
+	default:
+		return nil, nil, fmt.Errorf("unsupported media type %q at tag %q", desc.MediaType, tag)
+	}
+}
+
+// fetchJSON pulls a small JSON document (manifest or index) and decodes it.
+func fetchJSON[T any](ctx context.Context, store content.Fetcher, desc ocispec.Descriptor) (*T, error) {
+	rc, err := store.Fetch(ctx, desc)
+	if err != nil {
+		return nil, err
+	}
+	defer rc.Close()
+	data, err := content.ReadAll(rc, desc)
+	if err != nil {
+		return nil, err
+	}
+	var out T
+	if err := json.Unmarshal(data, &out); err != nil {
+		return nil, fmt.Errorf("decode: %w", err)
+	}
+	return &out, nil
+}
+
+// fetchBlobTo streams a blob to disk and verifies its digest matches.
+// Cleans up the destination file on any error so we never leave a partial.
+func (c *Client) fetchBlobTo(ctx context.Context, desc ocispec.Descriptor, dest string) (retErr error) {
+	rc, err := c.repo.Fetch(ctx, desc)
+	if err != nil {
+		return fmt.Errorf("fetch blob: %w", err)
+	}
+	defer rc.Close()
+
+	f, err := os.Create(dest)
+	if err != nil {
+		return fmt.Errorf("create %s: %w", dest, err)
+	}
+	defer func() {
+		if cerr := f.Close(); retErr == nil && cerr != nil {
+			retErr = cerr
+		}
+		if retErr != nil {
+			_ = os.Remove(dest)
+		}
+	}()
+
+	verifier := desc.Digest.Algorithm().Hash()
+	mw := io.MultiWriter(f, verifier)
+	n, err := io.Copy(mw, rc)
+	if err != nil {
+		return fmt.Errorf("stream blob: %w", err)
+	}
+	if desc.Size > 0 && n != desc.Size {
+		return fmt.Errorf("blob size mismatch: got %d, want %d", n, desc.Size)
+	}
+	got := digest.NewDigest(desc.Digest.Algorithm(), verifier)
+	if got != desc.Digest {
+		return fmt.Errorf("blob digest mismatch: got %s, want %s", got, desc.Digest)
+	}
+	return nil
+}
+
+// metadataFromAnnotations builds an UpdateMetadata from manifest annotations.
+// Always returns a non-nil value (missing fields stay empty).
+func metadataFromAnnotations(a map[string]string) *image.UpdateMetadata {
+	if a == nil {
+		a = map[string]string{}
+	}
+	return &image.UpdateMetadata{
+		Version:              a[AnnotVersion],
+		Channel:              a[AnnotChannel],
+		MinCompatibleVersion: a[AnnotMinVersion],
+		Architecture:         a[AnnotArch],
+		ReleaseNotes:         a[AnnotReleaseNote],
+		ReleaseDate:          a[AnnotReleaseDate],
+	}
+}
+
+// ErrNoManifestForArch is returned from FetchMetadata/Pull when an index has
+// no entry matching the running architecture. Exposed so callers can
+// distinguish "registry unreachable" from "this build doesn't ship for us".
+var ErrNoManifestForArch = errors.New("no manifest in index for runtime architecture")
--- a/update/pkg/oci/oci_test.go
+++ b/update/pkg/oci/oci_test.go
@@ -0,0 +1,377 @@
+package oci
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"net/url"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/opencontainers/go-digest"
+	specs "github.com/opencontainers/image-spec/specs-go"
+	ocispec "github.com/opencontainers/image-spec/specs-go/v1"
+)
+
+// fakeRegistry implements the minimum OCI distribution-spec surface our
+// Client touches: /v2/ probe, manifest fetch by tag or digest, blob fetch
+// by digest. Backed by an in-memory blob+manifest store.
+type fakeRegistry struct {
+	t          *testing.T
+	srv        *httptest.Server
+	blobs      map[digest.Digest][]byte // keyed by digest
+	manifests  map[string][]byte        // keyed by digest string (raw form)
+	tags       map[string]digest.Digest // tag -> manifest digest
+	mediaTypes map[digest.Digest]string // descriptor.MediaType per stored object
+}
+
+func newFakeRegistry(t *testing.T) *fakeRegistry {
+	t.Helper()
+	r := &fakeRegistry{
+		t:          t,
+		blobs:      map[digest.Digest][]byte{},
+		manifests:  map[string][]byte{},
+		tags:       map[string]digest.Digest{},
+		mediaTypes: map[digest.Digest]string{},
+	}
+	r.srv = httptest.NewServer(http.HandlerFunc(r.handle))
+	t.Cleanup(r.srv.Close)
+	return r
+}
+
+func (r *fakeRegistry) putBlob(media string, data []byte) digest.Digest {
+	h := sha256.Sum256(data)
+	d := digest.NewDigestFromBytes(digest.SHA256, h[:])
+	r.blobs[d] = data
+	r.mediaTypes[d] = media
+	return d
+}
+
+// putManifest stores a manifest/index document under both its digest and the
+// given tag, returning the digest the caller can embed in indexes.
+func (r *fakeRegistry) putManifest(tag string, media string, doc []byte) digest.Digest {
+	h := sha256.Sum256(doc)
+	d := digest.NewDigestFromBytes(digest.SHA256, h[:])
+	r.manifests[d.String()] = doc
+	r.mediaTypes[d] = media
+	if tag != "" {
+		r.tags[tag] = d
+	}
+	return d
+}
+
+// repoRef returns the "host:port/repo" string for use with NewClient.
+func (r *fakeRegistry) repoRef() string {
+	u, _ := url.Parse(r.srv.URL)
+	return u.Host + "/test/kubesolo-os"
+}
+
+func (r *fakeRegistry) handle(w http.ResponseWriter, req *http.Request) {
+	// Routes we implement:
+	//   GET /v2/                          -> 200 "{}"
+	//   GET /v2/test/kubesolo-os/manifests/<tag-or-digest> -> manifest
+	//   HEAD same                         -> same headers, no body
+	//   GET /v2/test/kubesolo-os/blobs/<digest> -> blob
+
+	path := req.URL.Path
+	if path == "/v2/" || path == "/v2" {
+		w.Header().Set("Docker-Distribution-API-Version", "registry/2.0")
+		w.WriteHeader(http.StatusOK)
+		_, _ = io.WriteString(w, "{}")
+		return
+	}
+
+	const prefix = "/v2/test/kubesolo-os/"
+	if !strings.HasPrefix(path, prefix) {
+		http.NotFound(w, req)
+		return
+	}
+	rest := strings.TrimPrefix(path, prefix)
+
+	switch {
+	case strings.HasPrefix(rest, "manifests/"):
+		ref := strings.TrimPrefix(rest, "manifests/")
+		var d digest.Digest
+		var data []byte
+		if td, ok := r.tags[ref]; ok {
+			d = td
+			data = r.manifests[d.String()]
+		} else if md, ok := r.manifests[ref]; ok {
+			d = digest.Digest(ref)
+			data = md
+		} else {
+			http.NotFound(w, req)
+			return
+		}
+		media := r.mediaTypes[d]
+		w.Header().Set("Content-Type", media)
+		w.Header().Set("Docker-Content-Digest", d.String())
+		w.Header().Set("Content-Length", fmt.Sprintf("%d", len(data)))
+		if req.Method == http.MethodHead {
+			return
+		}
+		_, _ = w.Write(data)
+
+	case strings.HasPrefix(rest, "blobs/"):
+		ref := strings.TrimPrefix(rest, "blobs/")
+		d := digest.Digest(ref)
+		blob, ok := r.blobs[d]
+		if !ok {
+			http.NotFound(w, req)
+			return
+		}
+		media := r.mediaTypes[d]
+		if media == "" {
+			media = "application/octet-stream"
+		}
+		w.Header().Set("Content-Type", media)
+		w.Header().Set("Docker-Content-Digest", d.String())
+		w.Header().Set("Content-Length", fmt.Sprintf("%d", len(blob)))
+		if req.Method == http.MethodHead {
+			return
+		}
+		_, _ = w.Write(blob)
+
+	default:
+		http.NotFound(w, req)
+	}
+}
+
+// seedSingleArchManifest puts kernel+initramfs blobs and a manifest with the
+// given annotations into the registry, tagged as `tag`.
+func (r *fakeRegistry) seedSingleArchManifest(t *testing.T, tag string, annot map[string]string) (kernelData, initramfsData []byte) {
+	t.Helper()
+	kernelData = []byte("FAKE-KERNEL-" + tag)
+	initramfsData = []byte("FAKE-INITRAMFS-" + tag)
+
+	kd := r.putBlob(MediaKernel, kernelData)
+	id := r.putBlob(MediaInitramfs, initramfsData)
+
+	// An empty config blob with sha256 of "{}" (the canonical "empty" body
+	// per OCI). We don't actually fetch the config so any valid descriptor
+	// works for the tests, but the digest still has to be syntactically valid.
+	emptyConfigBody := []byte("{}")
+	emptyConfigDigest := r.putBlob("application/vnd.oci.empty.v1+json", emptyConfigBody)
+
+	manifest := ocispec.Manifest{
+		Versioned: specs.Versioned{SchemaVersion: 2},
+		MediaType: ocispec.MediaTypeImageManifest,
+		Config: ocispec.Descriptor{
+			MediaType: "application/vnd.oci.empty.v1+json",
+			Size:      int64(len(emptyConfigBody)),
+			Digest:    emptyConfigDigest,
+		},
+		Layers: []ocispec.Descriptor{
+			{MediaType: MediaKernel, Digest: kd, Size: int64(len(kernelData))},
+			{MediaType: MediaInitramfs, Digest: id, Size: int64(len(initramfsData))},
+		},
+		Annotations: annot,
+	}
+	manifestBytes, err := json.Marshal(manifest)
+	if err != nil {
+		t.Fatalf("marshal manifest: %v", err)
+	}
+	r.putManifest(tag, ocispec.MediaTypeImageManifest, manifestBytes)
+	return
+}
+
+// seedIndex creates a manifest index pointing at per-arch manifests created
+// via seedSingleArchManifest with arch-suffixed tags, then publishes the
+// index under `tag`.
+func (r *fakeRegistry) seedIndex(t *testing.T, tag string, perArchAnnots map[string]map[string]string) {
+	t.Helper()
+	var descriptors []ocispec.Descriptor
+	for arch, annot := range perArchAnnots {
+		// Reuse seedSingleArchManifest but under an internal arch-suffixed tag
+		archTag := tag + "-" + arch
+		r.seedSingleArchManifest(t, archTag, annot)
+		d := r.tags[archTag]
+		descriptors = append(descriptors, ocispec.Descriptor{
+			MediaType: ocispec.MediaTypeImageManifest,
+			Digest:    d,
+			Size:      int64(len(r.manifests[d.String()])),
+			Platform:  &ocispec.Platform{Architecture: arch, OS: "linux"},
+		})
+	}
+	index := ocispec.Index{
+		Versioned: specs.Versioned{SchemaVersion: 2},
+		MediaType: ocispec.MediaTypeImageIndex,
+		Manifests: descriptors,
+	}
+	indexBytes, _ := json.Marshal(index)
+	r.putManifest(tag, ocispec.MediaTypeImageIndex, indexBytes)
+}
+
+// ---------------------------------------------------------------------------
+
+func TestFetchMetadataSingleArchManifest(t *testing.T) {
+	reg := newFakeRegistry(t)
+	reg.seedSingleArchManifest(t, "v0.3.0", map[string]string{
+		AnnotVersion: "v0.3.0",
+		AnnotChannel: "stable",
+		AnnotArch:    "amd64",
+	})
+
+	c, err := NewClient(reg.repoRef())
+	if err != nil {
+		t.Fatalf("NewClient: %v", err)
+	}
+	c.WithPlainHTTP(true)
+	c.Arch = "amd64"
+
+	meta, err := c.FetchMetadata(context.Background(), "v0.3.0")
+	if err != nil {
+		t.Fatalf("FetchMetadata: %v", err)
+	}
+	if meta.Version != "v0.3.0" {
+		t.Errorf("version: got %q, want v0.3.0", meta.Version)
+	}
+	if meta.Channel != "stable" {
+		t.Errorf("channel: got %q", meta.Channel)
+	}
+}
+
+func TestFetchMetadataIndexSelectsArch(t *testing.T) {
+	reg := newFakeRegistry(t)
+	reg.seedIndex(t, "stable", map[string]map[string]string{
+		"amd64": {AnnotVersion: "v0.3.0", AnnotChannel: "stable", AnnotArch: "amd64"},
+		"arm64": {AnnotVersion: "v0.3.0", AnnotChannel: "stable", AnnotArch: "arm64"},
+	})
+
+	for _, arch := range []string{"amd64", "arm64"} {
+		t.Run(arch, func(t *testing.T) {
+			c, err := NewClient(reg.repoRef())
+			if err != nil {
+				t.Fatalf("NewClient: %v", err)
+			}
+			c.WithPlainHTTP(true)
+			c.Arch = arch
+
+			meta, err := c.FetchMetadata(context.Background(), "stable")
+			if err != nil {
+				t.Fatalf("FetchMetadata: %v", err)
+			}
+			if meta.Architecture != arch {
+				t.Errorf("arch annotation: got %q, want %q", meta.Architecture, arch)
+			}
+			if meta.Version != "v0.3.0" {
+				t.Errorf("version: got %q, want v0.3.0", meta.Version)
+			}
+		})
+	}
+}
+
+func TestFetchMetadataIndexMissingArchErrors(t *testing.T) {
+	reg := newFakeRegistry(t)
+	reg.seedIndex(t, "stable", map[string]map[string]string{
+		"amd64": {AnnotVersion: "v0.3.0", AnnotArch: "amd64"},
+	})
+
+	c, _ := NewClient(reg.repoRef())
+	c.WithPlainHTTP(true)
+	c.Arch = "arm64" // not in the index
+
+	_, err := c.FetchMetadata(context.Background(), "stable")
+	if err == nil {
+		t.Fatal("expected error for missing arch, got nil")
+	}
+	if !strings.Contains(err.Error(), "arm64") {
+		t.Errorf("expected error mentioning arm64, got: %v", err)
+	}
+}
+
+func TestFetchMetadataSingleArchManifestRejectsCrossArch(t *testing.T) {
+	// If the manifest declares an arch via annotation and it doesn't match
+	// our runtime, Pull should refuse — defense in depth on top of the
+	// channel/version gates in cmd/apply.go.
+	reg := newFakeRegistry(t)
+	reg.seedSingleArchManifest(t, "v0.3.0-arm64", map[string]string{
+		AnnotArch: "arm64",
+	})
+
+	c, _ := NewClient(reg.repoRef())
+	c.WithPlainHTTP(true)
+	c.Arch = "amd64"
+
+	_, err := c.FetchMetadata(context.Background(), "v0.3.0-arm64")
+	if err == nil {
+		t.Fatal("expected error pulling cross-arch single-arch manifest, got nil")
+	}
+}
+
+func TestPullDownloadsBlobsAndVerifiesDigest(t *testing.T) {
+	reg := newFakeRegistry(t)
+	kernelData, initramfsData := reg.seedSingleArchManifest(t, "v0.3.0",
+		map[string]string{AnnotVersion: "v0.3.0", AnnotArch: "amd64"})
+
+	c, _ := NewClient(reg.repoRef())
+	c.WithPlainHTTP(true)
+	c.Arch = "amd64"
+
+	stageDir := filepath.Join(t.TempDir(), "stage")
+	staged, meta, err := c.Pull(context.Background(), "v0.3.0", stageDir)
+	if err != nil {
+		t.Fatalf("Pull: %v", err)
+	}
+	if meta.Version != "v0.3.0" {
+		t.Errorf("meta version: got %q", meta.Version)
+	}
+	if staged.Version != "v0.3.0" {
+		t.Errorf("staged version: got %q", staged.Version)
+	}
+
+	gotKernel, err := os.ReadFile(staged.VmlinuzPath)
+	if err != nil {
+		t.Fatalf("read kernel: %v", err)
+	}
+	if string(gotKernel) != string(kernelData) {
+		t.Errorf("kernel mismatch:\n got %q\nwant %q", gotKernel, kernelData)
+	}
+	gotInit, err := os.ReadFile(staged.InitramfsPath)
+	if err != nil {
+		t.Fatalf("read initramfs: %v", err)
+	}
+	if string(gotInit) != string(initramfsData) {
+		t.Errorf("initramfs mismatch")
+	}
+}
+
+func TestPullRejectsTamperedBlob(t *testing.T) {
+	// Mutate the kernel blob after it's been digested into the manifest.
+	// Pull should refuse with a digest mismatch.
+	reg := newFakeRegistry(t)
+	_, _ = reg.seedSingleArchManifest(t, "v0.3.0",
+		map[string]string{AnnotVersion: "v0.3.0", AnnotArch: "amd64"})
+
+	// Corrupt every stored kernel blob in the registry by replacing its body.
+	for d, m := range reg.mediaTypes {
+		if m == MediaKernel {
+			reg.blobs[d] = []byte("TAMPERED-KERNEL-WRONG-LENGTH-AND-DIGEST")
+		}
+	}
+
+	c, _ := NewClient(reg.repoRef())
+	c.WithPlainHTTP(true)
+	c.Arch = "amd64"
+
+	_, _, err := c.Pull(context.Background(), "v0.3.0", filepath.Join(t.TempDir(), "stage"))
+	if err == nil {
+		t.Fatal("expected digest mismatch error on tampered blob, got nil")
+	}
+	if !strings.Contains(err.Error(), "mismatch") {
+		t.Errorf("expected mismatch in error, got: %v", err)
+	}
+}
+
+func TestNewClientRejectsGarbageReference(t *testing.T) {
+	_, err := NewClient("not a valid reference")
+	if err == nil {
+		t.Error("expected error on bad reference, got nil")
+	}
+}
--- a/update/pkg/partition/freespace.go
+++ b/update/pkg/partition/freespace.go
@@ -0,0 +1,34 @@
+package partition
+
+import (
+	"fmt"
+	"syscall"
+)
+
+// FreeBytes returns the number of free bytes available on the filesystem
+// containing `path`. Uses statfs(2); path must exist and be readable.
+func FreeBytes(path string) (uint64, error) {
+	var stat syscall.Statfs_t
+	if err := syscall.Statfs(path, &stat); err != nil {
+		return 0, fmt.Errorf("statfs %s: %w", path, err)
+	}
+	// Bavail is the count of free blocks available to non-root users —
+	// matches what `df` reports. Bsize is the block size in bytes.
+	//nolint:unconvert // Bavail is uint64 on most platforms but int64 on darwin/freebsd
+	return uint64(stat.Bavail) * uint64(stat.Bsize), nil
+}
+
+// HasFreeSpaceFor reports whether `path`'s filesystem has at least `wantBytes`
+// of free space, with `headroomPct` reserved (e.g. 10 = require 110% of want).
+// Returns the available bytes alongside, so callers can render a useful error.
+func HasFreeSpaceFor(path string, wantBytes int64, headroomPct int) (avail uint64, ok bool, err error) {
+	avail, err = FreeBytes(path)
+	if err != nil {
+		return 0, false, err
+	}
+	if wantBytes < 0 {
+		return avail, false, fmt.Errorf("invalid wantBytes %d", wantBytes)
+	}
+	required := uint64(wantBytes) * uint64(100+headroomPct) / 100
+	return avail, avail >= required, nil
+}
--- a/update/pkg/partition/freespace_test.go
+++ b/update/pkg/partition/freespace_test.go
@@ -0,0 +1,44 @@
+package partition
+
+import "testing"
+
+func TestFreeBytesReturnsNonZeroOnTempDir(t *testing.T) {
+	b, err := FreeBytes(t.TempDir())
+	if err != nil {
+		t.Fatalf("FreeBytes: %v", err)
+	}
+	// On any sane test runner the temp filesystem has more than 1 KiB free.
+	if b < 1024 {
+		t.Errorf("FreeBytes = %d, want > 1024 on /tmp", b)
+	}
+}
+
+func TestFreeBytesNonExistentPath(t *testing.T) {
+	_, err := FreeBytes("/this/path/does/not/exist/at/all")
+	if err == nil {
+		t.Error("expected error for missing path, got nil")
+	}
+}
+
+func TestHasFreeSpaceForRejectsHugeRequest(t *testing.T) {
+	// Request 1 PiB with 10% headroom on /tmp — no test runner has that
+	// much free, so this should consistently report not-enough.
+	avail, ok, err := HasFreeSpaceFor(t.TempDir(), 1<<50, 10)
+	if err != nil {
+		t.Fatalf("HasFreeSpaceFor: %v", err)
+	}
+	if ok {
+		t.Errorf("expected insufficient space for 1PiB, got avail=%d ok=true", avail)
+	}
+}
+
+func TestHasFreeSpaceForAcceptsSmallRequest(t *testing.T) {
+	// 1 KiB with 10% headroom = 1.1 KiB. Any temp dir has this.
+	_, ok, err := HasFreeSpaceFor(t.TempDir(), 1024, 10)
+	if err != nil {
+		t.Fatalf("HasFreeSpaceFor: %v", err)
+	}
+	if !ok {
+		t.Error("expected sufficient space for 1KiB on /tmp")
+	}
+}
--- a/update/pkg/state/state.go
+++ b/update/pkg/state/state.go
@@ -0,0 +1,206 @@
+// Package state tracks the lifecycle of an OS update on disk.
+//
+// The state file (default /var/lib/kubesolo/update/state.json) records which
+// phase the agent is in, what versions are involved, when the attempt started,
+// any error from the last operation, and how many attempts have been made.
+// Updates are atomic via tmp+rename, so a crash mid-write doesn't corrupt the
+// state.
+//
+// Consumers:
+//   - cmd/check, cmd/apply, cmd/activate, cmd/healthcheck, cmd/rollback —
+//     transition the phase as they enter / leave their operations.
+//   - cmd/status --json — emits the raw state for orchestration tooling.
+//   - pkg/metrics — reads the state at scrape time to expose phase and
+//     attempt-count gauges.
+package state
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"time"
+)
+
+// DefaultPath is where state.json lives on a live system. The directory is on
+// the persistent data partition so the file survives A/B slot switches.
+const DefaultPath = "/var/lib/kubesolo/update/state.json"
+
+// Phase represents the current step in the update lifecycle.
+//
+// Terminal phases (Success, RolledBack, Failed) describe the outcome of the
+// most recent attempt; transient phases (Checking, Downloading, Staged,
+// Activated, Verifying) describe in-progress work. Idle means no update has
+// been attempted yet, or the previous attempt has been acknowledged.
+type Phase string
+
+const (
+	// PhaseIdle — no update in progress.
+	PhaseIdle Phase = "idle"
+	// PhaseChecking — querying the update server for new versions.
+	PhaseChecking Phase = "checking"
+	// PhaseDownloading — pulling artifacts from the server.
+	PhaseDownloading Phase = "downloading"
+	// PhaseStaged — artifacts written to the passive partition; not yet active.
+	PhaseStaged Phase = "staged"
+	// PhaseActivated — passive slot promoted; next boot will use the new version.
+	PhaseActivated Phase = "activated"
+	// PhaseVerifying — post-boot healthcheck in progress on the new version.
+	PhaseVerifying Phase = "verifying"
+	// PhaseSuccess — last attempt completed and verified.
+	PhaseSuccess Phase = "success"
+	// PhaseRolledBack — last attempt failed verification; reverted to prior slot.
+	PhaseRolledBack Phase = "rolled_back"
+	// PhaseFailed — last attempt failed before reaching activation (download,
+	// checksum, signature, etc.). System still on the original slot.
+	PhaseFailed Phase = "failed"
+)
+
+// validPhases lists every legal Phase value. Anything not in this set is
+// rejected by Save() to catch typos.
+var validPhases = map[Phase]struct{}{
+	PhaseIdle:        {},
+	PhaseChecking:    {},
+	PhaseDownloading: {},
+	PhaseStaged:      {},
+	PhaseActivated:   {},
+	PhaseVerifying:   {},
+	PhaseSuccess:     {},
+	PhaseRolledBack:  {},
+	PhaseFailed:      {},
+}
+
+// UpdateState is the on-disk representation. Fields use JSON tags so the
+// file format is forward-compatible (extra fields ignored, missing fields
+// default).
+type UpdateState struct {
+	// Phase is the current lifecycle position.
+	Phase Phase `json:"phase"`
+	// FromVersion is the version the system was running before the attempt.
+	// Empty when no attempt has run.
+	FromVersion string `json:"from_version,omitempty"`
+	// ToVersion is the version the attempt is targeting.
+	// Empty when no attempt has run.
+	ToVersion string `json:"to_version,omitempty"`
+	// StartedAt is when the current attempt entered a non-Idle phase.
+	StartedAt time.Time `json:"started_at,omitempty"`
+	// UpdatedAt is the last time the file was written. Always set on Save().
+	UpdatedAt time.Time `json:"updated_at"`
+	// LastError carries the most recent operation error, populated when
+	// transitioning to PhaseFailed or PhaseRolledBack. Cleared on Success/Idle.
+	LastError string `json:"last_error,omitempty"`
+	// AttemptCount counts attempts at the current ToVersion. Reset when
+	// ToVersion changes or on successful completion.
+	AttemptCount int `json:"attempt_count"`
+
+	// HealthCheckFailures counts consecutive post-Activated healthcheck
+	// failures. Reset to 0 on a successful healthcheck or after a rollback.
+	// Used by `kubesolo-update healthcheck --auto-rollback-after N` to
+	// trigger automatic recovery on a wedged new boot.
+	HealthCheckFailures int `json:"health_check_failures,omitempty"`
+}
+
+// New returns a fresh Idle state with UpdatedAt set to now.
+func New() *UpdateState {
+	return &UpdateState{
+		Phase:     PhaseIdle,
+		UpdatedAt: time.Now().UTC(),
+	}
+}
+
+// Load reads the state from disk. If the file does not exist, returns a fresh
+// Idle state — this is the normal first-run case, not an error.
+func Load(path string) (*UpdateState, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return New(), nil
+		}
+		return nil, fmt.Errorf("read state %s: %w", path, err)
+	}
+	var s UpdateState
+	if err := json.Unmarshal(data, &s); err != nil {
+		return nil, fmt.Errorf("parse state %s: %w", path, err)
+	}
+	return &s, nil
+}
+
+// Save writes the state to disk atomically (tmp file + rename), so an
+// interrupted write never leaves a partial file at `path`.
+func (s *UpdateState) Save(path string) error {
+	if _, ok := validPhases[s.Phase]; !ok {
+		return fmt.Errorf("invalid phase %q", s.Phase)
+	}
+	s.UpdatedAt = time.Now().UTC()
+
+	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
+		return fmt.Errorf("creating state dir: %w", err)
+	}
+
+	data, err := json.MarshalIndent(s, "", "  ")
+	if err != nil {
+		return fmt.Errorf("marshal state: %w", err)
+	}
+	data = append(data, '\n')
+
+	tmp := path + ".tmp"
+	if err := os.WriteFile(tmp, data, 0o644); err != nil {
+		return fmt.Errorf("write tmp state: %w", err)
+	}
+	if err := os.Rename(tmp, path); err != nil {
+		_ = os.Remove(tmp)
+		return fmt.Errorf("rename state: %w", err)
+	}
+	return nil
+}
+
+// Transition moves the state to phase `next` and persists it. If `next`
+// targets a new ToVersion (different from the current one), AttemptCount is
+// reset to 1; otherwise it is left untouched. StartedAt is set when
+// transitioning out of Idle. LastError is cleared unless `next` is Failed or
+// RolledBack.
+func (s *UpdateState) Transition(path string, next Phase, toVersion, errMsg string) error {
+	now := time.Now().UTC()
+
+	// Reset attempt counter when targeting a new version.
+	if toVersion != "" && toVersion != s.ToVersion {
+		s.ToVersion = toVersion
+		s.AttemptCount = 0
+	}
+
+	// First non-Idle phase of an attempt: record start time and bump count.
+	if s.Phase == PhaseIdle && next != PhaseIdle {
+		s.StartedAt = now
+		s.AttemptCount++
+	}
+
+	s.Phase = next
+	switch next {
+	case PhaseFailed, PhaseRolledBack:
+		if errMsg != "" {
+			s.LastError = errMsg
+		}
+	case PhaseSuccess, PhaseIdle:
+		s.LastError = ""
+	}
+
+	return s.Save(path)
+}
+
+// RecordError marks the state as failed with the given error and saves.
+// Convenience wrapper around Transition for the most common failure path.
+func (s *UpdateState) RecordError(path string, err error) error {
+	msg := ""
+	if err != nil {
+		msg = err.Error()
+	}
+	return s.Transition(path, PhaseFailed, "", msg)
+}
+
+// SetFromVersion records the version the system was running when an attempt
+// started. Idempotent; only takes effect when From is empty.
+func (s *UpdateState) SetFromVersion(v string) {
+	if s.FromVersion == "" {
+		s.FromVersion = v
+	}
+}
--- a/update/pkg/state/state_test.go
+++ b/update/pkg/state/state_test.go
@@ -0,0 +1,197 @@
+package state
+
+import (
+	"errors"
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// statePath returns a per-test state file path inside t.TempDir().
+func statePath(t *testing.T) string {
+	t.Helper()
+	return filepath.Join(t.TempDir(), "state.json")
+}
+
+func TestLoadMissingReturnsIdle(t *testing.T) {
+	s, err := Load(filepath.Join(t.TempDir(), "does-not-exist.json"))
+	if err != nil {
+		t.Fatalf("unexpected error loading missing state: %v", err)
+	}
+	if s.Phase != PhaseIdle {
+		t.Errorf("missing file: phase=%q, want %q", s.Phase, PhaseIdle)
+	}
+}
+
+func TestSaveLoadRoundTrip(t *testing.T) {
+	path := statePath(t)
+	in := &UpdateState{
+		Phase:        PhaseStaged,
+		FromVersion:  "v0.2.0",
+		ToVersion:    "v0.3.0",
+		AttemptCount: 1,
+	}
+	if err := in.Save(path); err != nil {
+		t.Fatalf("save: %v", err)
+	}
+	out, err := Load(path)
+	if err != nil {
+		t.Fatalf("load: %v", err)
+	}
+	if out.Phase != in.Phase {
+		t.Errorf("phase: got %q, want %q", out.Phase, in.Phase)
+	}
+	if out.FromVersion != in.FromVersion {
+		t.Errorf("from_version: got %q, want %q", out.FromVersion, in.FromVersion)
+	}
+	if out.ToVersion != in.ToVersion {
+		t.Errorf("to_version: got %q, want %q", out.ToVersion, in.ToVersion)
+	}
+	if out.AttemptCount != in.AttemptCount {
+		t.Errorf("attempt_count: got %d, want %d", out.AttemptCount, in.AttemptCount)
+	}
+	if out.UpdatedAt.IsZero() {
+		t.Error("UpdatedAt should be set by Save")
+	}
+}
+
+func TestSaveRejectsInvalidPhase(t *testing.T) {
+	s := &UpdateState{Phase: Phase("bogus")}
+	err := s.Save(statePath(t))
+	if err == nil {
+		t.Fatal("expected error saving invalid phase, got nil")
+	}
+}
+
+func TestSaveIsAtomic(t *testing.T) {
+	// After Save, the .tmp file should NOT exist — confirming we renamed it.
+	path := statePath(t)
+	s := New()
+	if err := s.Save(path); err != nil {
+		t.Fatalf("save: %v", err)
+	}
+	if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
+		t.Errorf("tmp file still present after Save: %v", err)
+	}
+}
+
+func TestSaveCreatesDirectory(t *testing.T) {
+	// State directory may not exist yet (first-ever boot). Save() should mkdir.
+	dir := filepath.Join(t.TempDir(), "fresh", "subdir")
+	path := filepath.Join(dir, "state.json")
+	if err := New().Save(path); err != nil {
+		t.Fatalf("save into nonexistent dir: %v", err)
+	}
+	if _, err := os.Stat(path); err != nil {
+		t.Errorf("state file not present after Save: %v", err)
+	}
+}
+
+func TestTransitionIdleToChecking(t *testing.T) {
+	path := statePath(t)
+	s := New()
+	if err := s.Transition(path, PhaseChecking, "v0.3.0", ""); err != nil {
+		t.Fatalf("transition: %v", err)
+	}
+	if s.Phase != PhaseChecking {
+		t.Errorf("phase: got %q, want %q", s.Phase, PhaseChecking)
+	}
+	if s.ToVersion != "v0.3.0" {
+		t.Errorf("to_version: got %q, want v0.3.0", s.ToVersion)
+	}
+	if s.AttemptCount != 1 {
+		t.Errorf("attempt_count: got %d, want 1 (first attempt after Idle)", s.AttemptCount)
+	}
+	if s.StartedAt.IsZero() {
+		t.Error("StartedAt should be set when leaving Idle")
+	}
+}
+
+func TestTransitionRetainsAttemptCountWithinAttempt(t *testing.T) {
+	path := statePath(t)
+	s := New()
+	_ = s.Transition(path, PhaseChecking, "v0.3.0", "")
+	_ = s.Transition(path, PhaseDownloading, "v0.3.0", "")
+	_ = s.Transition(path, PhaseStaged, "v0.3.0", "")
+	if s.AttemptCount != 1 {
+		t.Errorf("attempt_count after staying on same version: got %d, want 1", s.AttemptCount)
+	}
+}
+
+func TestTransitionResetsAttemptCountOnNewVersion(t *testing.T) {
+	path := statePath(t)
+	s := New()
+	_ = s.Transition(path, PhaseChecking, "v0.3.0", "")
+	// Now an attempt at a NEW version starts. AttemptCount should reset.
+	_ = s.Transition(path, PhaseChecking, "v0.4.0", "")
+	if s.ToVersion != "v0.4.0" {
+		t.Errorf("to_version: got %q, want v0.4.0", s.ToVersion)
+	}
+	if s.AttemptCount != 0 {
+		t.Errorf("attempt_count after new ToVersion: got %d, want 0 (reset)", s.AttemptCount)
+	}
+}
+
+func TestTransitionFailedRecordsError(t *testing.T) {
+	path := statePath(t)
+	s := New()
+	_ = s.Transition(path, PhaseDownloading, "v0.3.0", "")
+	_ = s.Transition(path, PhaseFailed, "v0.3.0", "checksum mismatch")
+	if s.Phase != PhaseFailed {
+		t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed)
+	}
+	if s.LastError != "checksum mismatch" {
+		t.Errorf("last_error: got %q, want %q", s.LastError, "checksum mismatch")
+	}
+}
+
+func TestTransitionSuccessClearsError(t *testing.T) {
+	path := statePath(t)
+	s := New()
+	_ = s.Transition(path, PhaseFailed, "v0.3.0", "boom")
+	if s.LastError == "" {
+		t.Fatal("setup: LastError should be non-empty before success")
+	}
+	_ = s.Transition(path, PhaseSuccess, "v0.3.0", "")
+	if s.LastError != "" {
+		t.Errorf("last_error after success: got %q, want empty", s.LastError)
+	}
+}
+
+func TestRecordError(t *testing.T) {
+	path := statePath(t)
+	s := New()
+	if err := s.RecordError(path, errors.New("network down")); err != nil {
+		t.Fatalf("RecordError: %v", err)
+	}
+	if s.Phase != PhaseFailed {
+		t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed)
+	}
+	if s.LastError != "network down" {
+		t.Errorf("last_error: got %q, want %q", s.LastError, "network down")
+	}
+}
+
+func TestSetFromVersionIdempotent(t *testing.T) {
+	s := New()
+	s.SetFromVersion("v0.2.0")
+	if s.FromVersion != "v0.2.0" {
+		t.Errorf("from_version: got %q, want v0.2.0", s.FromVersion)
+	}
+	// Second call should not overwrite.
+	s.SetFromVersion("v0.1.0")
+	if s.FromVersion != "v0.2.0" {
+		t.Errorf("from_version after second SetFromVersion: got %q, want v0.2.0 (immutable)", s.FromVersion)
+	}
+}
+
+func TestLoadHandlesGarbageFile(t *testing.T) {
+	path := statePath(t)
+	if err := os.WriteFile(path, []byte("not json"), 0o644); err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+	_, err := Load(path)
+	if err == nil {
+		t.Error("expected error loading garbage, got nil")
+	}
+}
@@ -1 +1 @@
 .2.0
 .3.1