Compare commits
32 Commits
80aca5e372
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 53268a1564 | |||
| e1b8a69294 | |||
| fb24e641ce | |||
| 48267e1cbc | |||
| 04a5cd2cd3 | |||
| eb39787cf3 | |||
| 81b29fd237 | |||
| fbe2d0bfdb | |||
| bc3300e7e7 | |||
| 3bcf2e115f | |||
| 31eee77397 | |||
| 7e46f8fdc2 | |||
| 76ed2ffc14 | |||
| 51c1f78aea | |||
| f8c308d9b7 | |||
| 3b47e7af68 | |||
| 9fb894c5af | |||
| 28de656b97 | |||
| dfed6ddba8 | |||
| bce565e2f7 | |||
| 0c6e200585 | |||
| 1b44c9d621 | |||
| de10de0ef3 | |||
| 1de36289a5 | |||
| 31aac701db | |||
| 06e12a79bd | |||
| dc48caa959 | |||
| 65938d6d04 | |||
| 5cf81049f6 | |||
| 863f498cc2 | |||
| 05ab108de1 | |||
| c20f5a2e8c |
88
.gitea/workflows/build-arm64.yaml
Normal file
88
.gitea/workflows/build-arm64.yaml
Normal file
@@ -0,0 +1,88 @@
|
||||
name: ARM64 Build
|
||||
|
||||
# Smoke-test workflow for main-branch ARM64 builds. Triggers on push to main
|
||||
# (so we catch breakages early) and on manual dispatch.
|
||||
#
|
||||
# Tag pushes are intentionally NOT a trigger — release.yaml handles tags and
|
||||
# also produces the disk image. Triggering both on the same tag wastes an
|
||||
# hour of Odroid time on a duplicate kernel build.
|
||||
#
|
||||
# `paths-ignore` keeps workflow-file and docs-only commits from kicking off
|
||||
# a 60-minute Odroid rebuild. If you change a kernel fragment, init script,
|
||||
# or build script, this WILL fire — that's by design.
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths-ignore:
|
||||
- '.gitea/workflows/**'
|
||||
- '.github/workflows/**'
|
||||
- 'docs/**'
|
||||
- '*.md'
|
||||
- 'CHANGELOG.md'
|
||||
- 'README.md'
|
||||
- '.gitignore'
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
build-arm64-generic:
|
||||
name: Build generic ARM64 disk image
|
||||
# Routes to the Odroid self-hosted runner via the arm64-linux label.
|
||||
# See docs/ci-runners.md for runner setup.
|
||||
runs-on: arm64-linux
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Show host info
|
||||
run: |
|
||||
uname -a
|
||||
nproc
|
||||
free -h
|
||||
df -h /home /tmp || df -h /
|
||||
|
||||
- name: Verify build prerequisites
|
||||
run: |
|
||||
# The Odroid runner ships these via apt; this is a sanity check.
|
||||
which gcc make bc bison flex cpio gzip xz wget curl mkfs.ext4 mkfs.vfat \
|
||||
sfdisk losetup kpartx grub-mkimage qemu-system-aarch64 git busybox
|
||||
ls -la /bin/busybox
|
||||
file /bin/busybox | grep -q 'statically linked' || {
|
||||
echo "ERROR: /bin/busybox is not statically linked — install busybox-static"
|
||||
exit 1
|
||||
}
|
||||
|
||||
- name: Build mainline ARM64 kernel
|
||||
# Cached in build/cache/kernel-arm64-generic between runs (persistent
|
||||
# working dir on the host runner). First run takes 30-60 min; reruns
|
||||
# exit immediately once the .config + Image match.
|
||||
run: |
|
||||
time make kernel-arm64
|
||||
|
||||
- name: Build cross-arch Go binaries
|
||||
run: make build-cross
|
||||
|
||||
- name: Prepare generic ARM64 rootfs
|
||||
run: sudo make rootfs-arm64
|
||||
|
||||
- name: Build ARM64 UEFI disk image
|
||||
run: sudo make disk-image-arm64
|
||||
|
||||
- name: Show output artifact
|
||||
run: |
|
||||
ls -lh output/
|
||||
file output/*.arm64.img
|
||||
|
||||
- name: Boot smoke test (best-effort)
|
||||
# KubeSolo's image import deadline can fire under QEMU TCG on the
|
||||
# Odroid; the boot itself succeeds through stage 90 every time, but
|
||||
# the final "KubeSolo started" health check is timing-sensitive.
|
||||
# We mark this continue-on-error until we have KVM or real hardware.
|
||||
continue-on-error: true
|
||||
run: sudo make test-boot-arm64-disk
|
||||
|
||||
- name: Upload disk image
|
||||
if: startsWith(github.ref, 'refs/tags/v')
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: kubesolo-os-arm64-${{ github.ref_name }}
|
||||
path: output/kubesolo-os-*.arm64.img
|
||||
retention-days: 90
|
||||
@@ -62,7 +62,8 @@ jobs:
|
||||
working-directory: update
|
||||
|
||||
- name: Upload binaries
|
||||
uses: actions/upload-artifact@v4
|
||||
# @v4 not yet fully supported by Gitea Actions runner; @v3 works.
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: binaries-${{ matrix.suffix }}
|
||||
path: |
|
||||
@@ -78,14 +79,39 @@ jobs:
|
||||
- name: Install shellcheck
|
||||
run: sudo apt-get update && sudo apt-get install -y shellcheck
|
||||
|
||||
# --severity=error filters out style/info/warning findings. Several of
|
||||
# those are unavoidable in init-style scripts that source other files
|
||||
# dynamically (SC1090/SC1091/SC2034). Exclude them explicitly so they
|
||||
# don't fire even at warning level if we lift severity later.
|
||||
# Codes excluded:
|
||||
# SC1090 — non-constant source path (we source by stage name)
|
||||
# SC1091 — source target not specified as input (we reference relative paths)
|
||||
# SC2034 — var "unused" (false positive: used via sourced scripts)
|
||||
# SC2002 — useless cat (style only, very common pattern in our scripts)
|
||||
# SC2015 — A && B || C (deliberate idiom)
|
||||
# SC2012 — use find not ls (style only)
|
||||
# SC2013 — read words not lines (style only, applies to /proc parsing)
|
||||
|
||||
- name: Lint init scripts (POSIX sh)
|
||||
run: shellcheck -s sh init/init.sh init/lib/*.sh init/emergency-shell.sh
|
||||
run: |
|
||||
shellcheck -s sh --severity=error \
|
||||
-e SC1090,SC1091,SC2034,SC2002,SC2015,SC2012,SC2013 \
|
||||
init/init.sh init/lib/*.sh init/emergency-shell.sh
|
||||
|
||||
- name: Lint build scripts (bash)
|
||||
run: shellcheck -s bash build/scripts/*.sh build/config/kernel-audit.sh
|
||||
run: |
|
||||
shellcheck -s bash --severity=error \
|
||||
-e SC1090,SC1091,SC2034,SC2002,SC2015,SC2012,SC2013 \
|
||||
build/scripts/*.sh build/config/kernel-audit.sh
|
||||
|
||||
- name: Lint test scripts (bash)
|
||||
run: shellcheck -s bash test/qemu/*.sh test/integration/*.sh test/kernel/*.sh || true
|
||||
run: |
|
||||
shellcheck -s bash --severity=error \
|
||||
-e SC1090,SC1091,SC2034,SC2002,SC2015,SC2012,SC2013 \
|
||||
test/qemu/*.sh test/integration/*.sh test/kernel/*.sh
|
||||
|
||||
- name: Lint hack scripts (bash)
|
||||
run: shellcheck -s bash hack/*.sh || true
|
||||
run: |
|
||||
shellcheck -s bash --severity=error \
|
||||
-e SC1090,SC1091,SC2034,SC2002,SC2015,SC2012,SC2013 \
|
||||
hack/*.sh
|
||||
|
||||
@@ -1,5 +1,19 @@
|
||||
name: Release
|
||||
|
||||
# Triggered by `git push origin vX.Y.Z`. Builds Go binaries (amd64+arm64),
|
||||
# x86_64 ISO + disk image, ARM64 disk image, computes SHA256SUMS over all
|
||||
# artifacts, and posts a Gitea release with everything attached via the
|
||||
# Gitea API.
|
||||
#
|
||||
# Notes for future-you:
|
||||
# - upload-artifact / download-artifact are pinned to @v3 because Gitea's
|
||||
# act_runner v1.0.x doesn't fully implement v4 yet.
|
||||
# - The release step uses curl against Gitea's own /api/v1/repos/.../releases
|
||||
# instead of a third-party action (softprops/action-gh-release et al);
|
||||
# act_runner doesn't reliably proxy GitHub.com-targeted actions.
|
||||
# - The arm64 disk-image build runs on the Odroid self-hosted runner via
|
||||
# the `arm64-linux` label. Docs in docs/ci-runners.md.
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
@@ -11,19 +25,16 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.22'
|
||||
|
||||
- name: Test cloud-init
|
||||
run: cd cloud-init && go test ./... -count=1
|
||||
|
||||
- name: Test update agent
|
||||
run: cd update && go test ./... -count=1
|
||||
|
||||
build-binaries:
|
||||
name: Build Binaries
|
||||
name: Build Binaries (${{ matrix.suffix }})
|
||||
runs-on: ubuntu-latest
|
||||
needs: test
|
||||
strategy:
|
||||
@@ -37,129 +48,248 @@ jobs:
|
||||
suffix: linux-arm64
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.22'
|
||||
|
||||
- name: Get version
|
||||
id: version
|
||||
run: echo "version=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Build cloud-init
|
||||
run: |
|
||||
CGO_ENABLED=0 GOOS=${{ matrix.goos }} GOARCH=${{ matrix.goarch }} \
|
||||
go build -ldflags="-s -w -X main.version=${{ steps.version.outputs.version }}" \
|
||||
-o kubesolo-cloudinit-${{ matrix.suffix }} ./cmd/
|
||||
working-directory: cloud-init
|
||||
|
||||
- name: Build update agent
|
||||
run: |
|
||||
CGO_ENABLED=0 GOOS=${{ matrix.goos }} GOARCH=${{ matrix.goarch }} \
|
||||
go build -ldflags="-s -w -X main.version=${{ steps.version.outputs.version }}" \
|
||||
-o kubesolo-update-${{ matrix.suffix }} .
|
||||
working-directory: update
|
||||
|
||||
- name: Upload binaries
|
||||
uses: actions/upload-artifact@v4
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: binaries-${{ matrix.suffix }}
|
||||
path: |
|
||||
cloud-init/kubesolo-cloudinit-${{ matrix.suffix }}
|
||||
update/kubesolo-update-${{ matrix.suffix }}
|
||||
|
||||
build-iso:
|
||||
name: Build ISO (amd64)
|
||||
build-iso-amd64:
|
||||
name: Build x86_64 ISO + disk image
|
||||
# Gated until an amd64-linux runner is registered. We use `runs-on:
|
||||
# ubuntu-latest` (which the Odroid claims) so SOME runner picks the job
|
||||
# up and evaluates `if: false`, marking it `skipped` instead of leaving
|
||||
# it `queued` forever — the latter holds the overall run in `queued`
|
||||
# state even when every load-bearing job is complete. When we get an
|
||||
# amd64 runner, change `if: false` to `if: true` (and flip the
|
||||
# `runs-on:` back to `amd64-linux`).
|
||||
if: false
|
||||
runs-on: ubuntu-latest
|
||||
needs: build-binaries
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.22'
|
||||
|
||||
- name: Install build deps
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
cpio gzip genisoimage isolinux syslinux syslinux-common \
|
||||
syslinux-utils xorriso xz-utils wget squashfs-tools \
|
||||
dosfstools e2fsprogs fdisk parted bsdtar
|
||||
|
||||
- name: Build ISO
|
||||
run: make iso
|
||||
|
||||
- name: Build disk image
|
||||
run: make disk-image
|
||||
|
||||
- name: Get version
|
||||
id: version
|
||||
run: echo "version=$(cat VERSION)" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Upload ISO
|
||||
uses: actions/upload-artifact@v4
|
||||
dosfstools e2fsprogs fdisk parted libarchive-tools \
|
||||
grub-common grub-efi-amd64-bin grub-pc-bin kpartx \
|
||||
busybox-static iptables nftables
|
||||
- name: Build kernel + ISO + disk-image
|
||||
run: |
|
||||
make kernel
|
||||
make build-cloudinit build-update-agent
|
||||
make rootfs initramfs
|
||||
make iso
|
||||
make disk-image
|
||||
- name: Compress disk image
|
||||
# The raw .img is 4 GB sparse; xz takes it to ~50-300 MB depending
|
||||
# on dictionary level. Use -6 (default) for memory safety on the
|
||||
# GitHub-Actions-style runner.
|
||||
run: |
|
||||
xz -k -T0 --memlimit-compress=1500MiB -6 output/*.img
|
||||
ls -lh output/
|
||||
- name: Upload x86_64 artifacts
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: iso-amd64
|
||||
path: output/*.iso
|
||||
name: image-amd64
|
||||
path: |
|
||||
output/*.iso
|
||||
output/*.img.xz
|
||||
|
||||
- name: Upload disk image
|
||||
uses: actions/upload-artifact@v4
|
||||
build-disk-arm64:
|
||||
name: Build ARM64 disk image
|
||||
runs-on: arm64-linux
|
||||
needs: test
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Show host info
|
||||
run: |
|
||||
uname -a
|
||||
nproc
|
||||
free -h
|
||||
df -h /
|
||||
- name: Build kernel + rootfs + disk-image
|
||||
# Runner runs as root via systemd; explicit sudo is harmless but
|
||||
# documented as such in docs/ci-runners.md.
|
||||
run: |
|
||||
make kernel-arm64
|
||||
make build-cross
|
||||
make rootfs-arm64
|
||||
make disk-image-arm64
|
||||
- name: Compress disk image
|
||||
run: |
|
||||
xz -k -T0 --memlimit-compress=1500MiB -6 output/*.arm64.img
|
||||
ls -lh output/
|
||||
- name: Upload ARM64 artifacts
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: disk-image-amd64
|
||||
path: output/*.img
|
||||
name: image-arm64
|
||||
path: output/*.arm64.img.xz
|
||||
|
||||
release:
|
||||
name: Create Release
|
||||
name: Publish Gitea Release
|
||||
runs-on: ubuntu-latest
|
||||
needs: [build-binaries, build-iso]
|
||||
# build-iso-amd64 is gated `if: false` in v0.3.x (no amd64 runner yet);
|
||||
# don't block the release on it. build-disk-arm64 is required — that's
|
||||
# the headline artifact for v0.3.x. build-binaries is required since
|
||||
# the Go binaries are core to every release.
|
||||
needs: [build-binaries, build-disk-arm64]
|
||||
# `if: always()` so the release publishes even if the gated x86 job
|
||||
# somehow ran-and-failed instead of being skipped. The downstream
|
||||
# `find` in the Flatten step ignores missing files gracefully.
|
||||
if: always() && needs.build-binaries.result == 'success' && needs.build-disk-arm64.result == 'success'
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Get version
|
||||
id: version
|
||||
run: echo "version=$(cat VERSION)" >> $GITHUB_OUTPUT
|
||||
# `cat VERSION` can be stale on tag pushes (the tag may point at a
|
||||
# commit where VERSION wasn't bumped yet); the tag ref name is unambiguous.
|
||||
run: echo "version=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Download all artifacts
|
||||
uses: actions/download-artifact@v4
|
||||
uses: actions/download-artifact@v3
|
||||
with:
|
||||
path: artifacts
|
||||
|
||||
- name: Compute checksums
|
||||
- name: Flatten artifacts + compute checksums
|
||||
run: |
|
||||
cd artifacts
|
||||
find . -type f \( -name "*.iso" -o -name "*.img" -o -name "kubesolo-*" \) \
|
||||
-exec sha256sum {} \; | sort > ../SHA256SUMS
|
||||
cd ..
|
||||
mkdir -p release
|
||||
# Each upload-artifact wrote into artifacts/<name>/...
|
||||
find artifacts -type f \( \
|
||||
-name "*.iso" -o \
|
||||
-name "*.img.xz" -o \
|
||||
-name "kubesolo-*" \
|
||||
\) -exec cp {} release/ \;
|
||||
(cd release && sha256sum * | sort > SHA256SUMS)
|
||||
ls -lh release/
|
||||
cat release/SHA256SUMS
|
||||
|
||||
- name: Create release
|
||||
uses: softprops/action-gh-release@v2
|
||||
with:
|
||||
name: KubeSolo OS v${{ steps.version.outputs.version }}
|
||||
body: |
|
||||
## KubeSolo OS v${{ steps.version.outputs.version }}
|
||||
- name: Install release tooling
|
||||
run: sudo apt-get update && sudo apt-get install -y jq curl
|
||||
|
||||
- name: Render release body
|
||||
id: body
|
||||
run: |
|
||||
VERSION="${{ steps.version.outputs.version }}"
|
||||
# Strip the leading 'v' for cosmetic display in the body.
|
||||
DISPLAY="${VERSION#v}"
|
||||
cat > release-body.md <<EOF
|
||||
See [docs/release-notes-${DISPLAY}.md](./docs/release-notes-${DISPLAY}.md)
|
||||
and [CHANGELOG.md](./CHANGELOG.md) for the full release notes.
|
||||
|
||||
### Downloads
|
||||
- **ISO** — Boot from CD/USB, ideal for testing
|
||||
- **Disk Image** — Raw disk with A/B partitions + GRUB
|
||||
- **Binaries** — Standalone cloud-init and update agent
|
||||
|
||||
- \`kubesolo-os-${DISPLAY}.arm64.img.xz\` — ARM64 raw disk image (A/B GPT, UEFI)
|
||||
- \`kubesolo-cloudinit-linux-{amd64,arm64}\` — standalone cloud-init parser
|
||||
- \`kubesolo-update-linux-{amd64,arm64}\` — standalone update agent
|
||||
- \`SHA256SUMS\` — checksums for every artifact above
|
||||
|
||||
> **x86_64 ISO + disk image**: not built automatically yet. The
|
||||
> release workflow's amd64 build job needs an amd64-linux runner,
|
||||
> which this Gitea instance doesn't have yet. To produce them
|
||||
> yourself, clone the repo at this tag and run \`make iso disk-image\`
|
||||
> on any Linux amd64 host.
|
||||
|
||||
### Verify
|
||||
```
|
||||
sha256sum -c SHA256SUMS
|
||||
```
|
||||
|
||||
### Quick Start
|
||||
```bash
|
||||
# Boot in QEMU
|
||||
qemu-system-x86_64 -m 1024 -smp 2 -enable-kvm \
|
||||
-cdrom kubesolo-os-${{ steps.version.outputs.version }}.iso \
|
||||
-nographic
|
||||
```
|
||||
files: |
|
||||
artifacts/**/*.iso
|
||||
artifacts/**/*.img
|
||||
artifacts/**/kubesolo-*
|
||||
SHA256SUMS
|
||||
draft: false
|
||||
prerelease: false
|
||||
\`\`\`
|
||||
sha256sum -c SHA256SUMS
|
||||
\`\`\`
|
||||
|
||||
### Quick start (ARM64)
|
||||
|
||||
\`\`\`
|
||||
# On Graviton/Ampere/any UEFI ARM64 host:
|
||||
xz -d kubesolo-os-${DISPLAY}.arm64.img.xz
|
||||
sudo dd if=kubesolo-os-${DISPLAY}.arm64.img of=/dev/sdX bs=4M status=progress
|
||||
|
||||
# Under qemu-system-aarch64 (Apple Silicon w/ HVF):
|
||||
UEFI_FW=\$(brew --prefix qemu)/share/qemu/edk2-aarch64-code.fd
|
||||
qemu-system-aarch64 -M virt -accel hvf -cpu host -m 2048 -smp 2 \\
|
||||
-nographic -bios "\$UEFI_FW" \\
|
||||
-drive file=kubesolo-os-${DISPLAY}.arm64.img,format=raw,if=virtio,media=disk \\
|
||||
-device virtio-rng-pci \\
|
||||
-net nic,model=virtio \\
|
||||
-net user,hostfwd=tcp::6443-:6443,hostfwd=tcp::8080-:8080
|
||||
\`\`\`
|
||||
|
||||
Then from the host: \`curl http://localhost:8080 > ~/.kube/kubesolo-config\`
|
||||
and \`kubectl --kubeconfig ~/.kube/kubesolo-config get nodes\`.
|
||||
EOF
|
||||
cat release-body.md
|
||||
|
||||
- name: Create release via Gitea API
|
||||
env:
|
||||
# Gitea's act_runner auto-populates this with repo-write scope.
|
||||
# If not, set a personal access token as a secret named GITEA_TOKEN
|
||||
# on the org and swap the var name below.
|
||||
TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
TAG="${{ steps.version.outputs.version }}"
|
||||
REPO_API="${{ github.api_url }}/repos/${{ github.repository }}"
|
||||
|
||||
# 1. Create the release. The API is GitHub-compatible at the
|
||||
# request shape; the response includes the numeric release id we
|
||||
# need for asset uploads.
|
||||
PAYLOAD=$(jq -n \
|
||||
--arg tag "$TAG" \
|
||||
--arg name "KubeSolo OS $TAG" \
|
||||
--rawfile body release-body.md \
|
||||
'{tag_name: $tag, name: $name, body: $body, draft: false, prerelease: false}')
|
||||
|
||||
echo "==> Creating release for $TAG against $REPO_API"
|
||||
CREATE_RESP=$(curl -fsSL -X POST \
|
||||
-H "Authorization: token $TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "$PAYLOAD" \
|
||||
"$REPO_API/releases")
|
||||
|
||||
RELEASE_ID=$(echo "$CREATE_RESP" | jq -r '.id')
|
||||
if [ -z "$RELEASE_ID" ] || [ "$RELEASE_ID" = "null" ]; then
|
||||
echo "ERROR: Could not extract release id from response:"
|
||||
echo "$CREATE_RESP" | jq . || echo "$CREATE_RESP"
|
||||
exit 1
|
||||
fi
|
||||
echo "==> Release id: $RELEASE_ID"
|
||||
|
||||
# 2. Upload each asset. asset?name= names the attachment; we use
|
||||
# the basename so users see the same filename the build produced.
|
||||
for f in release/*; do
|
||||
[ -f "$f" ] || continue
|
||||
name=$(basename "$f")
|
||||
echo "==> Uploading $name ($(du -h "$f" | cut -f1))"
|
||||
curl -fsSL -X POST \
|
||||
-H "Authorization: token $TOKEN" \
|
||||
-F "attachment=@$f" \
|
||||
"$REPO_API/releases/$RELEASE_ID/assets?name=$name" >/dev/null
|
||||
done
|
||||
|
||||
echo "==> Release published: $REPO_API/../releases/tag/$TAG"
|
||||
|
||||
6
.gitignore
vendored
6
.gitignore
vendored
@@ -16,6 +16,12 @@ build/rootfs-work/
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# Secrets — never commit
|
||||
.env
|
||||
.env.*
|
||||
*.token
|
||||
*.pat
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
._*
|
||||
|
||||
277
CHANGELOG.md
277
CHANGELOG.md
@@ -5,6 +5,283 @@ All notable changes to KubeSolo OS are documented in this file.
|
||||
Format based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
||||
versioning follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
Pure CI / repository housekeeping; no runtime changes since v0.3.1. All
|
||||
items below shake out workflow-loop bugs exposed by the v0.3.1 release
|
||||
flow on Gitea Actions.
|
||||
|
||||
### Fixed (CI)
|
||||
|
||||
- `build-arm64.yaml` no longer triggers on tag pushes. `release.yaml`
|
||||
already produces the ARM64 disk image as part of the release flow, so
|
||||
triggering both on the same tag wasted an hour of Odroid runner time
|
||||
on a duplicate kernel build. (`04a5cd2`)
|
||||
- The gated `build-iso-amd64` job in `release.yaml` (`if: false` until an
|
||||
amd64-linux runner exists) used to advertise `runs-on: amd64-linux`.
|
||||
With no matching runner, Gitea left the job queued forever and the
|
||||
overall workflow run never transitioned to `success` — even though
|
||||
every load-bearing job had finished and the release was already
|
||||
published. Now uses `runs-on: ubuntu-latest` so any runner picks the
|
||||
job up just long enough to evaluate `if: false` and mark it `skipped`.
|
||||
(`fb24e64`)
|
||||
- `build-arm64.yaml` now ignores workflow-file, docs, and `*.md` changes
|
||||
via `paths-ignore` (`.gitea/workflows/**`, `.github/workflows/**`,
|
||||
`docs/**`, top-level `*.md`, `.gitignore`). Workflow- / docs-only
|
||||
commits no longer kick off a 60-minute kernel rebuild on the Odroid.
|
||||
Any change to a kernel fragment, init script, or build script still
|
||||
triggers the full build, as intended. (`e1b8a69`)
|
||||
|
||||
### Changed
|
||||
|
||||
- `.gitignore` now excludes `.env`, `.env.*`, `*.token`, `*.pat` to keep
|
||||
Gitea PATs and other credentials used during release ops from being
|
||||
accidentally committed. (`48267e1`)
|
||||
|
||||
## [0.3.1] - 2026-05-15
|
||||
|
||||
First fully-functional generic ARM64 release. v0.3.0 shipped the build
|
||||
scaffold; v0.3.1 makes it actually boot a Kubernetes cluster end-to-end
|
||||
on QEMU virt under HVF acceleration. Validated by deploying CoreDNS,
|
||||
local-path-provisioner, and an `nginx:alpine` workload — all reach
|
||||
Running, `kubectl get nodes` reports `Ready`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- **Dual-glibc loading on ARM64** — piCore64's `/lib/libc.so.6` and the
|
||||
build host's `/lib/$LIB_ARCH/libc.so.6` could both be resolved into the
|
||||
same process by the dynamic linker, triggering
|
||||
`*** stack smashing detected ***` aborts when stack frames crossed
|
||||
between functions linked against different libcs. Fix: bundle the full
|
||||
glibc family (libc + libpthread + libdl + libm + libresolv + librt +
|
||||
libanl + libgcc_s + ld.so), delete piCore's duplicates in `/lib/`,
|
||||
and write `/etc/ld.so.conf` + `ldconfig -r` so the runtime linker has
|
||||
a deterministic search order. (`76ed2ff`)
|
||||
- **`nft` binary not bundled** — KubeSolo v1.1.4+ runs `nft add table ip
|
||||
kubesolo-masq` for pod-masquerade setup, but `inject-kubesolo.sh` only
|
||||
bundled `xtables-nft-multi`. Without standalone `nft` in `$PATH`,
|
||||
KubeSolo FATAL'd at startup. Fix: copy `/usr/sbin/nft` + its
|
||||
non-shared libs (libnftables, libedit, libjansson, libgmp, libtinfo,
|
||||
libbsd, libmd) into the rootfs. (`51c1f78`)
|
||||
- **nftables address-family handlers** — `nf_tables` core was loaded but
|
||||
no address families were registered, so `nft add table ip ...`
|
||||
returned `EOPNOTSUPP`. The bool Kconfigs `CONFIG_NF_TABLES_IPV4`,
|
||||
`CONFIG_NF_TABLES_IPV6`, `CONFIG_NF_TABLES_INET`,
|
||||
`CONFIG_NF_TABLES_NETDEV` are required and weren't in the
|
||||
fragment. Fix: add to `kernel-container.fragment` as `=y`. (`7e46f8f`)
|
||||
- **kube-proxy nftables-backend expression modules** — Kubernetes 1.34's
|
||||
kube-proxy nft backend uses `numgen`, `hash`, `limit`, `log`
|
||||
expressions. The corresponding kernel modules (`CONFIG_NFT_NUMGEN`,
|
||||
etc.) were missing from the fragment AND the runtime module list, so
|
||||
even after a kernel rebuild stage 30 didn't load them and stage 85's
|
||||
`kernel.modules_disabled=1` lockdown prevented on-demand loads. Fix:
|
||||
add to both `kernel-container.fragment` (as `=m`) and
|
||||
`modules.list` / `modules-arm64.list`. (`31eee77`, `3bcf2e1`)
|
||||
- **`modules.list` inline-comment parser bug** — the inject script's
|
||||
comment-strip only matched lines starting with `#`, not lines with
|
||||
inline `# comment` tails. So `nft_numgen # foo` was passed
|
||||
verbatim to modprobe, resolved to nothing, and the .ko never made it
|
||||
into the initramfs. Fix: parse with `mod="${mod%%#*}"` to strip
|
||||
inline tails. (`bc3300e`)
|
||||
- **Banner only printed on kubeconfig success** —
|
||||
`90-kubesolo.sh` gated the host-access banner behind `if [ -f
|
||||
$KUBECONFIG_PATH ]`. When KubeSolo crashed early (bug #2 above) or
|
||||
the wait loop timed out, the user never saw the connection
|
||||
instructions. Fix: write the banner to `/etc/motd` AND print it
|
||||
unconditionally after the wait loop. (`51c1f78`)
|
||||
- **`dev-vm-arm64.sh` missing port-8080 hostfwd** — the in-VM HTTP
|
||||
server that serves the kubeconfig listens on port 8080, but the
|
||||
QEMU `-net user` line only forwarded 6443 and 2222, so
|
||||
`curl http://localhost:8080` from the host machine connected to
|
||||
nothing. Fix: add the third hostfwd. (`fbe2d0b`)
|
||||
|
||||
### Fixed (CI)
|
||||
|
||||
- **`release.yaml` workflow** rewritten so v0.3.1+ tag pushes
|
||||
auto-publish a complete release page on Gitea: `actions/upload-artifact`
|
||||
pinned to `@v3` for act_runner compatibility, the
|
||||
`softprops/action-gh-release@v2` step replaced with a direct `curl`
|
||||
against `/api/v1/repos/.../releases` (`softprops` hard-codes
|
||||
`api.github.com` so it silently no-ops on Gitea), added a
|
||||
`build-disk-arm64` job that builds on the `arm64-linux` runner.
|
||||
v0.3.0's manual-upload-only release was the canary that exposed all
|
||||
three bugs. (`f8c308d`)
|
||||
|
||||
### Known issues carried forward to v0.3.2
|
||||
|
||||
These don't block normal operation but are tracked:
|
||||
|
||||
- `xt_comment` userspace extension load fails on the iptables-nft path,
|
||||
causing kubelet's KUBE-FIREWALL rule install to skip. Reported as
|
||||
`Couldn't load match 'comment'` in the boot log. kubelet continues
|
||||
without the localhost-drop rule.
|
||||
- `containerd-shim-runc-v2 -info` probe reports `runc: executable file
|
||||
not found in $PATH`. Cosmetic — containerd uses the absolute path
|
||||
from its config when actually launching containers.
|
||||
- `kube-proxy conntrack cleanup` logs `Failed to list conntrack entries:
|
||||
invalid argument` every cleanup cycle. Probably needs
|
||||
`CONFIG_NF_CONNTRACK_PROCFS` or netlink-glue tweaks.
|
||||
- Several pods restart 1–2 times on first boot due to a PLEG /
|
||||
runtime-probe race in the kubelet startup path. Pods stabilise.
|
||||
|
||||
## [0.3.0] - 2026-05-14
|
||||
|
||||
The main themes: generic ARM64 (not just Raspberry Pi), an honest update
|
||||
lifecycle with state file + metrics, OCI multi-arch distribution via ghcr.io,
|
||||
and policy gates (channels, maintenance windows, version stepping-stones,
|
||||
pre-flight checks, auto-rollback).
|
||||
|
||||
### Added
|
||||
|
||||
- Generic ARM64 build track distinct from Raspberry Pi:
|
||||
- `make kernel-arm64` builds a mainline kernel.org LTS kernel (6.12.10 by
|
||||
default) from `arm64 defconfig` + shared `kernel-container.fragment` +
|
||||
arm64 virt-host enables (VIRTIO_*, EFI_STUB, NVMe).
|
||||
- `make disk-image-arm64` produces a UEFI-bootable raw GPT image with A/B
|
||||
system partitions and GRUB-EFI ARM64. Targets QEMU virt, Graviton, Ampere,
|
||||
or any UEFI ARM64 host.
|
||||
- `hack/dev-vm-arm64.sh --disk` boots the built image through QEMU UEFI for
|
||||
end-to-end testing.
|
||||
- `test/qemu/test-boot-arm64-disk.sh` automated boot smoke test.
|
||||
- Bumped KubeSolo to v1.1.5 (was v1.1.0). New cloud-init flags surfaced:
|
||||
- `kubesolo.full` (v1.1.4+) — disable edge-optimised overrides
|
||||
- `kubesolo.disable-ipv6` (v1.1.5+)
|
||||
- `kubesolo.db-wal-repair` (v1.1.5+) — recover from unclean shutdowns
|
||||
- Per-arch supply-chain verification: `KUBESOLO_SHA256_AMD64` and
|
||||
`KUBESOLO_SHA256_ARM64` in `versions.env`, applied to the tarball before
|
||||
extract.
|
||||
- `docs/arm64-architecture.md` — defines the generic-vs-RPi two-track layout.
|
||||
- `docs/arm64-status.md` — Phase 3 status snapshot, known limitations, what's
|
||||
needed to ship.
|
||||
- `docs/ci-runners.md` — Gitea Actions runner setup (Odroid arm64-linux).
|
||||
- Update agent state machine and observability (`update/pkg/state`):
|
||||
- Persistent on-disk `state.json` at `/var/lib/kubesolo/update/state.json`
|
||||
(atomic write via tmp + rename). Records Phase (Idle / Checking /
|
||||
Downloading / Staged / Activated / Verifying / Success / RolledBack /
|
||||
Failed), FromVersion, ToVersion, StartedAt, UpdatedAt, LastError,
|
||||
AttemptCount, HealthCheckFailures.
|
||||
- `apply`, `activate`, `healthcheck`, `rollback` all transition state
|
||||
explicitly on entry / exit / failure. Errors land in LastError so
|
||||
`status` can show why.
|
||||
- `kubesolo-update status --json` emits the full state for
|
||||
orchestration tooling. Human-readable mode adds an "Update Lifecycle"
|
||||
section when not idle.
|
||||
- New Prometheus metrics: `kubesolo_update_phase{phase="..."}` (all 9
|
||||
phase labels always emitted), `kubesolo_update_attempts_total`,
|
||||
`kubesolo_update_last_attempt_timestamp_seconds`.
|
||||
- Channels, maintenance windows, version policy (`update/pkg/config`):
|
||||
- `/etc/kubesolo/update.conf` (key=value, comments, missing-OK) configures
|
||||
server, channel, maintenance_window, pubkey, healthcheck_url,
|
||||
auto_rollback_after.
|
||||
- `cloud-init` top-level `updates:` block writes `update.conf` on first
|
||||
boot. Empty block leaves any existing file alone.
|
||||
- `apply` enforces four gates before download: maintenance window,
|
||||
channel match, runtime architecture match, min_compatible_version
|
||||
stepping-stone. All gate failures land in the state machine as Failed
|
||||
with a clear LastError. `--force` bypasses window + node-block-label.
|
||||
- `UpdateMetadata` JSON gains `channel`, `min_compatible_version`,
|
||||
`architecture` (all optional, omitempty).
|
||||
- OCI registry distribution (`update/pkg/oci`, ~280 LOC, 9 tests):
|
||||
- `kubesolo-update apply --registry ghcr.io/<org>/kubesolo-os --tag stable`
|
||||
pulls update artifacts from any OCI-compliant registry. Multi-arch
|
||||
indexes resolve to the runtime.GOARCH-matching manifest automatically.
|
||||
- Custom media types: `application/vnd.kubesolo.os.kernel.v1+octet-stream`
|
||||
and `application/vnd.kubesolo.os.initramfs.v1+gzip`. Annotations:
|
||||
`io.kubesolo.os.{version,channel,architecture,min_compatible_version,
|
||||
release_notes,release_date}`.
|
||||
- End-to-end digest verification from manifest to blobs via oras-go/v2.
|
||||
- `build/scripts/push-oci-artifact.sh` publishes per-arch artifacts via
|
||||
`oras`. Multi-arch index composition documented inline.
|
||||
- Dependencies added (update module only): oras.land/oras-go/v2 and
|
||||
transitive opencontainers/{go-digest,image-spec} + golang.org/x/sync.
|
||||
- Pre-flight gates and deeper healthcheck (`update/pkg/health` extended,
|
||||
`update/pkg/partition` extended):
|
||||
- Free-space pre-flight on the passive partition (image + 10% headroom)
|
||||
via `partition.FreeBytes` / `HasFreeSpaceFor`.
|
||||
- Node-block-label pre-flight: refuses if the local K8s node carries
|
||||
`updates.kubesolo.io/block=true`. Silently allowed when no kubeconfig
|
||||
(air-gap). Skipped by `--force`.
|
||||
- `CheckKubeSystemReady` waits until every kube-system pod has held
|
||||
Running for ≥ N seconds (configurable via
|
||||
`--kube-system-settle`).
|
||||
- `CheckProbeURL` GETs an operator-supplied URL; 200 = pass. Configurable
|
||||
via `--healthcheck-url` or `healthcheck_url=` in update.conf.
|
||||
- `CheckDiskWritable` writes / fsyncs / reads / deletes a probe file
|
||||
under `/var/lib/kubesolo` to catch a wedged data partition.
|
||||
- `--auto-rollback-after N` (also `auto_rollback_after=` in update.conf):
|
||||
after N consecutive post-activation healthcheck failures, the agent
|
||||
calls `ForceRollback()` and the operator/init reboots. Reset to 0 on
|
||||
a clean pass.
|
||||
- `.gitea/workflows/build-arm64.yaml` — full ARM64 build on the Odroid
|
||||
self-hosted runner. Triggers on push to main, tags, and workflow_dispatch.
|
||||
Boot smoke test marked continue-on-error pending KVM or real-hardware
|
||||
validation.
|
||||
|
||||
### Changed
|
||||
|
||||
- `build/scripts/build-kernel-arm64.sh` is now the **generic ARM64** kernel
|
||||
build (mainline kernel.org LTS, generic UEFI/virtio).
|
||||
- Renamed `build/scripts/build-kernel-rpi.sh` (was `build-kernel-arm64.sh`).
|
||||
RPi kernel build (raspberrypi/linux fork, bcm2711_defconfig) lives here now.
|
||||
- Renamed `build/config/kernel-container.fragment` (was
|
||||
`rpi-kernel-config.fragment`). Misnomer: contents are arch-agnostic and now
|
||||
shared across x86, ARM64-generic, and RPi kernels.
|
||||
- `build/scripts/build-kernel.sh` (x86) refactored to consume the shared
|
||||
fragment via a generic `apply_fragment` function. ~50 lines of duplication
|
||||
killed.
|
||||
- `KUBESOLO_VERSION` moved out of `fetch-components.sh` defaults into
|
||||
`versions.env`. Bumping is now a one-line PR.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Native ARM64 build hosts (e.g. an Odroid runner) no longer require the x86
|
||||
cross-compiler. Both `build-kernel-arm64.sh` and `build-kernel-rpi.sh` detect
|
||||
`uname -m` and use the host's gcc directly when arch matches.
|
||||
- ARM64 grub.cfg console ordering: `ttyAMA0` is now the primary console
|
||||
(`console=ttyS0,... console=ttyAMA0,...`). Init output is now visible on
|
||||
QEMU virt and most ARM64 SBCs without further configuration.
|
||||
- ARM64 boot: replaced piCore64's `/init` with our staged init at `/init` and
|
||||
`/sbin/init`. Previously the kernel ran piCore's TCE handler which
|
||||
segfaulted in our environment.
|
||||
- ARM64 boot: replaced piCore64's broken dynamic BusyBox with the build
|
||||
host's `busybox-static`. piCore's binary triggered EL0 instruction-abort
|
||||
panics on QEMU virt under both `-cpu cortex-a72` and `-cpu max`.
|
||||
- POSIX-character-class portability: `tr -d '[:space:]'` in
|
||||
`30-kernel-modules.sh` and `40-sysctl.sh` replaced with explicit
|
||||
`' \t\r\n'`. Ubuntu's busybox-static 1.30.1 doesn't parse `[:space:]` and
|
||||
instead deletes the literal characters `[ : s p a c e ]`, which truncated
|
||||
module names (`virtio_net` → `virtio_nt`, etc.) and sysctl keys.
|
||||
- `inject-kubesolo.sh` no longer copies `init/lib/functions.sh` into
|
||||
`init.d/`. Previously the main init loop tried to run it as a stage after
|
||||
stage 90 and panicked with "Init completed without exec'ing KubeSolo".
|
||||
- ARM64 disk image: `TARGET_ARCH=arm64 create-disk-image.sh` produces
|
||||
`BOOTAA64.EFI` via `grub-mkimage -O arm64-efi` (not `bootx64.efi`). Skips
|
||||
the BIOS-only `grub-install --target=i386-pc` step.
|
||||
- `build/Dockerfile.builder`: added `grub-efi-amd64-bin`, `grub-efi-arm64-bin`,
|
||||
`grub-pc-bin`, `grub-common`, `grub2-common`, and `busybox-static` so the
|
||||
Docker-based build flow can produce ARM64 disk images and gets the same
|
||||
BusyBox swap behaviour as native builds.
|
||||
|
||||
### Known limitations (deferred to follow-up)
|
||||
|
||||
- **ARM64 LABEL= resolution** doesn't work yet — piCore's `blkid`/`findfs`
|
||||
crash in QEMU and our static busybox lacks the applets. We hardcoded
|
||||
`/dev/vda4` as a workaround in `build/grub/grub-arm64.cfg`. Production
|
||||
fix: ship static `blkid`/`findfs` or replace LABEL resolution with a
|
||||
sysfs walk.
|
||||
- **AppArmor profile load fails on ARM64** (apparmor_parser ABI mismatch).
|
||||
Init reports it; boot continues without enforcement.
|
||||
- **OCI signature verification** is deferred. The HTTP transport still
|
||||
honours `--pubkey` for `.sig` files; the OCI transport is digest-verified
|
||||
end-to-end via oras-go but does not yet consume cosign-style referrer
|
||||
attestations. Targeted for v0.3.1.
|
||||
- **Real-hardware validation** of the generic ARM64 image is still
|
||||
pending. Builds and boots end-to-end under QEMU virt; production
|
||||
certification waits on a Graviton / Ampere run.
|
||||
- **QEMU TCG performance** can trigger KubeSolo's first-boot image-import
|
||||
deadline. Not a defect in the OS itself; real hardware and KVM-accelerated
|
||||
QEMU complete the import in seconds.
|
||||
|
||||
## [0.2.0] - 2026-02-12
|
||||
|
||||
### Added
|
||||
|
||||
42
README.md
42
README.md
@@ -2,7 +2,7 @@
|
||||
|
||||
An immutable, bootable Linux distribution purpose-built for [KubeSolo](https://github.com/portainer/kubesolo) — Portainer's ultra-lightweight single-node Kubernetes.
|
||||
|
||||
> **Status:** x86_64 is stable — boots and runs K8s workloads, Portainer Edge Agent tested and connected. ARM64 generic UEFI is the active focus for v0.3.0; ARM64 Raspberry Pi support is paused pending physical hardware testing.
|
||||
> **Status (v0.3.1):** First fully-validated generic ARM64 release. x86_64 and ARM64 (UEFI / virtio / mainline kernel) both build and boot end-to-end; v0.3.1 closes the dual-glibc, nftables address-family, and kube-proxy expression-module gaps that kept v0.3.0 from reaching a Ready node on ARM64. Validated end-to-end under QEMU virt + HVF on Apple Silicon: `kubectl get nodes` reports `Ready`, CoreDNS, local-path-provisioner, and an nginx test workload all `Running`. The update agent has an explicit state machine, OCI registry distribution alongside HTTP, channel + maintenance-window + version-stepping-stone gates, and auto-rollback. ARM64 Raspberry Pi support remains paused pending physical hardware. See [CHANGELOG.md](CHANGELOG.md) for the full v0.3.1 changelog and [docs/release-notes-0.3.0.md](docs/release-notes-0.3.0.md) for the v0.3.0 milestone summary.
|
||||
|
||||
## What is this?
|
||||
|
||||
@@ -24,23 +24,34 @@ KubeSolo OS combines **Tiny Core Linux** (~11 MB) with **KubeSolo** (single-bina
|
||||
|
||||
## Quick Start
|
||||
|
||||
### x86_64 ISO
|
||||
|
||||
```bash
|
||||
# Fetch Tiny Core ISO + KubeSolo binary
|
||||
make fetch
|
||||
|
||||
# Build custom kernel (first time only, ~25 min, cached)
|
||||
make kernel
|
||||
|
||||
# Build Go binaries
|
||||
make fetch # Tiny Core ISO + KubeSolo binary
|
||||
make kernel # Custom kernel (first time only, ~25 min, cached)
|
||||
make build-cloudinit build-update-agent
|
||||
|
||||
# Build bootable ISO
|
||||
make rootfs initramfs iso
|
||||
|
||||
# Test in QEMU
|
||||
make dev-vm
|
||||
```
|
||||
|
||||
### Generic ARM64 disk image (v0.3.0+)
|
||||
|
||||
For Graviton / Ampere / generic UEFI ARM64 hosts:
|
||||
|
||||
```bash
|
||||
make kernel-arm64 # Mainline 6.12 LTS kernel (first time only, ~30-60 min)
|
||||
make rootfs-arm64 # Mainline kernel modules + KubeSolo arm64
|
||||
make disk-image-arm64 # UEFI-bootable A/B GPT image
|
||||
make test-boot-arm64-disk # boot smoke test under qemu-system-aarch64
|
||||
```
|
||||
|
||||
### Raspberry Pi (work in progress)
|
||||
|
||||
Build path lives at `make kernel-rpi` / `make rpi-image`; needs physical
|
||||
hardware to validate the firmware + autoboot.txt path. See
|
||||
[docs/arm64-architecture.md](docs/arm64-architecture.md) for the two-track
|
||||
build layout.
|
||||
|
||||
Or build everything at once inside Docker:
|
||||
|
||||
```bash
|
||||
@@ -234,9 +245,12 @@ Metrics include: `kubesolo_os_info`, `boot_success`, `boot_counter`, `uptime_sec
|
||||
| 5 | CI/CD, OCI distribution, Prometheus metrics, ARM64 cross-compile | Complete |
|
||||
| 6 | Security hardening, AppArmor | Complete |
|
||||
| - | Custom kernel build for container runtime fixes | Complete (x86_64) |
|
||||
| 7 | ARM64 generic (mainline kernel, UEFI, virtio) | In progress (v0.3.0) |
|
||||
| 8 | Update engine v2 (state machine, OCI distribution, channels) | In progress (v0.3.0) |
|
||||
| 7 | ARM64 generic (mainline kernel, UEFI, virtio) | Complete (v0.3.1, K8s Ready under QEMU virt+HVF) |
|
||||
| 8 | Update engine v2 (state machine, channels, OCI, pre-flight gates) | Complete (v0.3.0) |
|
||||
| - | ARM64 Raspberry Pi (custom kernel, firmware, SD card image) | Paused — needs hardware |
|
||||
| - | OCI cosign signature verification | Planned for v0.3.2 |
|
||||
| - | LABEL=KSOLODATA on ARM64 (replace blkid/findfs path) | Planned for v0.3.2 |
|
||||
| - | Real-hardware ARM64 validation (Graviton / Ampere) | Planned for v0.3.2 |
|
||||
|
||||
## License
|
||||
|
||||
|
||||
@@ -30,6 +30,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
libarchive-tools \
|
||||
libelf-dev \
|
||||
libssl-dev \
|
||||
nftables \
|
||||
make \
|
||||
parted \
|
||||
squashfs-tools \
|
||||
@@ -40,6 +41,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
apparmor-utils \
|
||||
gcc-aarch64-linux-gnu \
|
||||
binutils-aarch64-linux-gnu \
|
||||
busybox-static \
|
||||
git \
|
||||
kpartx \
|
||||
unzip \
|
||||
@@ -54,6 +56,13 @@ RUN curl -fsSL "https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz" \
|
||||
| tar -C /usr/local -xzf -
|
||||
ENV PATH="/usr/local/go/bin:${PATH}"
|
||||
|
||||
# Install oras (OCI artifact CLI) for push-oci-artifact.sh.
|
||||
# Bump ORAS_VERSION when pushing breaks or when oras gains useful flags.
|
||||
ARG ORAS_VERSION=1.2.3
|
||||
RUN curl -fsSL "https://github.com/oras-project/oras/releases/download/v${ORAS_VERSION}/oras_${ORAS_VERSION}_linux_amd64.tar.gz" \
|
||||
| tar -C /usr/local/bin -xzf - oras \
|
||||
&& chmod +x /usr/local/bin/oras
|
||||
|
||||
WORKDIR /build
|
||||
COPY . /build
|
||||
|
||||
|
||||
@@ -53,6 +53,46 @@ CONFIG_NF_TABLES=m
|
||||
CONFIG_VETH=m
|
||||
CONFIG_VXLAN=m
|
||||
|
||||
# nftables address-family handlers. These are BOOL Kconfigs (not tristate)
|
||||
# so they have to be built into the kernel — there's no module to modprobe
|
||||
# at runtime. Without them, `nft add table ip ...` returns EOPNOTSUPP and
|
||||
# KubeSolo v1.1.4+'s pod-masquerade setup fails at boot.
|
||||
CONFIG_NF_TABLES_IPV4=y
|
||||
CONFIG_NF_TABLES_IPV6=y
|
||||
CONFIG_NF_TABLES_INET=y
|
||||
CONFIG_NF_TABLES_NETDEV=y
|
||||
|
||||
# nftables expression modules used by KubeSolo's masquerade ruleset, the
|
||||
# kube-proxy nft backend (Kubernetes 1.34+), and the xtables compat path.
|
||||
# Listed in modules.list / modules-arm64.list so init loads them at boot.
|
||||
CONFIG_NFT_NAT=m
|
||||
CONFIG_NFT_MASQ=m
|
||||
CONFIG_NFT_CT=m
|
||||
CONFIG_NFT_REDIR=m
|
||||
CONFIG_NFT_REJECT=m
|
||||
CONFIG_NFT_REJECT_INET=m
|
||||
CONFIG_NFT_COMPAT=m
|
||||
CONFIG_NFT_FIB=m
|
||||
CONFIG_NFT_FIB_IPV4=m
|
||||
CONFIG_NFT_FIB_IPV6=m
|
||||
# numgen drives kube-proxy's random / round-robin endpoint LB:
|
||||
# `numgen random mod N vmap { ... }` in service rules.
|
||||
# Without it kube-proxy's nft sync fails with ENOENT on every service.
|
||||
CONFIG_NFT_NUMGEN=m
|
||||
# hash drives consistent-hash LB (sessionAffinity=ClientIP, etc.).
|
||||
CONFIG_NFT_HASH=m
|
||||
# objref / limit / log are used by various policy expressions kube-proxy and
|
||||
# CNI plugins emit. Including them pre-empts a future "could not process
|
||||
# rule" debug loop.
|
||||
CONFIG_NFT_OBJREF=m
|
||||
CONFIG_NFT_LIMIT=m
|
||||
CONFIG_NFT_LOG=m
|
||||
|
||||
# IPv4 NAT bits NFT_MASQ depends on. Auto-selected on most kernels but we
|
||||
# pin them explicitly so olddefconfig doesn't strip them when the fragment
|
||||
# is applied on top of a minimal defconfig.
|
||||
CONFIG_NF_NAT_MASQUERADE=y
|
||||
|
||||
# Security: AppArmor + Audit
|
||||
CONFIG_AUDIT=y
|
||||
CONFIG_AUDITSYSCALL=y
|
||||
|
||||
@@ -56,6 +56,17 @@ nft_fib
|
||||
nft_fib_ipv4
|
||||
nft_fib_ipv6
|
||||
|
||||
# nft expressions used by the Kubernetes 1.34+ nftables kube-proxy backend.
|
||||
# Loading these at boot (stage 30) is mandatory because stage 85 sets
|
||||
# kernel.modules_disabled=1, which would otherwise block kube-proxy from
|
||||
# auto-loading them on first rule install.
|
||||
# (Note: list parser only honours full-line "#"-prefixed comments, NOT
|
||||
# inline "module # comment". Keep module names on their own line.)
|
||||
nft_numgen
|
||||
nft_hash
|
||||
nft_limit
|
||||
nft_log
|
||||
|
||||
# Reject targets (used by kube-proxy iptables-restore rules)
|
||||
nf_reject_ipv4
|
||||
nf_reject_ipv6
|
||||
|
||||
@@ -54,6 +54,14 @@ nft_fib
|
||||
nft_fib_ipv4
|
||||
nft_fib_ipv6
|
||||
|
||||
# nft expressions used by the Kubernetes 1.34+ nftables kube-proxy backend.
|
||||
# Must be loaded at stage 30 because stage 85 sets modules_disabled=1.
|
||||
# (Parser honours only full-line "#"-prefixed comments — keep each module name on its own line.)
|
||||
nft_numgen
|
||||
nft_hash
|
||||
nft_limit
|
||||
nft_log
|
||||
|
||||
# Reject targets (used by kube-proxy iptables-restore rules)
|
||||
nf_reject_ipv4
|
||||
nf_reject_ipv6
|
||||
|
||||
@@ -11,8 +11,11 @@ TINYCORE_ISO_URL=${TINYCORE_MIRROR}/${TINYCORE_VERSION%%.*}.x/${TINYCORE_ARCH}/r
|
||||
# KubeSolo
|
||||
# Pinned release tag from https://github.com/portainer/kubesolo/releases.
|
||||
# Bump here and re-run `make fetch` to pull a new version.
|
||||
KUBESOLO_VERSION=v1.1.0
|
||||
KUBESOLO_VERSION=v1.1.5
|
||||
KUBESOLO_INSTALL_URL=https://get.kubesolo.io
|
||||
# Per-arch SHA256 of the musl tarball (verified at fetch time when non-empty).
|
||||
KUBESOLO_SHA256_AMD64=565bd5fd98fc8ce09160e646b55de3493c782d74c0e0c46ccf130ff4bcabab81
|
||||
KUBESOLO_SHA256_ARM64=db865a5e9b2617d595f9c2b7d011272edc94587621a9690e2de0f47cc94f0748
|
||||
|
||||
# Build tools (used inside builder container)
|
||||
GRUB_VERSION=2.12
|
||||
@@ -22,7 +25,6 @@ SYSLINUX_VERSION=6.03
|
||||
# Populate by running: sha256sum build/cache/<file>
|
||||
# Leave empty to skip verification (useful for first fetch)
|
||||
TINYCORE_ISO_SHA256=""
|
||||
KUBESOLO_SHA256=""
|
||||
NETFILTER_TCZ_SHA256=""
|
||||
NET_BRIDGING_TCZ_SHA256=""
|
||||
IPTABLES_TCZ_SHA256=""
|
||||
|
||||
@@ -51,25 +51,32 @@ else
|
||||
fi
|
||||
|
||||
# --- ARM64 console string ---
|
||||
# Covers QEMU virt (ttyAMA0), Ampere/RPi-equivalent PL011 (ttyAMA0), and
|
||||
# Graviton/16550-compat (ttyS0). Last `console=` becomes the system console.
|
||||
# Order matters: the LAST `console=` is the primary system console (where /dev/console
|
||||
# points and where init's stdout/stderr land). Earlier `console=` entries get mirrored
|
||||
# kernel output but don't carry process I/O.
|
||||
#
|
||||
# Covers Graviton/16550 (ttyS0) as secondary and QEMU virt / PL011 / Ampere (ttyAMA0)
|
||||
# as primary. ttyAMA0 must be last for `-nographic` QEMU + most ARM64 SBCs.
|
||||
#
|
||||
# `quiet` is intentionally omitted from the default entry while we stabilise the
|
||||
# generic ARM64 boot path. Add it back once boots are reliable.
|
||||
|
||||
menuentry "KubeSolo OS (${slot_label})" {
|
||||
echo "Booting KubeSolo OS from ${slot_label}..."
|
||||
echo "Boot counter: ${boot_counter}, Boot success: ${boot_success}"
|
||||
linux /vmlinuz kubesolo.data=LABEL=KSOLODATA console=ttyAMA0,115200 console=ttyS0,115200 quiet
|
||||
linux /vmlinuz init=/sbin/init kubesolo.data=/dev/vda4 console=ttyS0,115200 console=ttyAMA0,115200
|
||||
initrd /kubesolo-os.gz
|
||||
}
|
||||
|
||||
menuentry "KubeSolo OS (${slot_label}) — Debug Mode" {
|
||||
echo "Booting KubeSolo OS (debug) from ${slot_label}..."
|
||||
linux /vmlinuz kubesolo.data=LABEL=KSOLODATA kubesolo.debug console=ttyAMA0,115200 console=ttyS0,115200
|
||||
linux /vmlinuz kubesolo.data=/dev/vda4 kubesolo.debug console=ttyS0,115200 console=ttyAMA0,115200
|
||||
initrd /kubesolo-os.gz
|
||||
}
|
||||
|
||||
menuentry "KubeSolo OS — Emergency Shell" {
|
||||
echo "Booting to emergency shell..."
|
||||
linux /vmlinuz kubesolo.shell console=ttyAMA0,115200 console=ttyS0,115200
|
||||
linux /vmlinuz init=/sbin/init kubesolo.shell console=ttyS0,115200 console=ttyAMA0,115200
|
||||
initrd /kubesolo-os.gz
|
||||
}
|
||||
|
||||
@@ -81,6 +88,6 @@ menuentry "KubeSolo OS — Boot Other Slot" {
|
||||
set root='(hd0,gpt2)'
|
||||
echo "Booting from System A (passive)..."
|
||||
fi
|
||||
linux /vmlinuz kubesolo.data=LABEL=KSOLODATA kubesolo.debug console=ttyAMA0,115200 console=ttyS0,115200
|
||||
linux /vmlinuz kubesolo.data=/dev/vda4 kubesolo.debug console=ttyS0,115200 console=ttyAMA0,115200
|
||||
initrd /kubesolo-os.gz
|
||||
}
|
||||
|
||||
@@ -39,11 +39,26 @@ if [ -f "$CUSTOM_IMAGE" ] && [ -d "$CUSTOM_MODULES/lib/modules/$KVER" ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# --- Verify cross-compiler ---
|
||||
if ! command -v aarch64-linux-gnu-gcc >/dev/null 2>&1; then
|
||||
# --- Toolchain selection: native on arm64 hosts, cross-compile elsewhere ---
|
||||
HOST_ARCH="$(uname -m)"
|
||||
if [ "$HOST_ARCH" = "aarch64" ] || [ "$HOST_ARCH" = "arm64" ]; then
|
||||
# Native build — use the host's gcc
|
||||
if ! command -v gcc >/dev/null 2>&1; then
|
||||
echo "ERROR: gcc not found"
|
||||
echo "Install: apt-get install build-essential"
|
||||
exit 1
|
||||
fi
|
||||
CROSS_COMPILE=""
|
||||
echo "==> Native ARM64 build (host arch: $HOST_ARCH)"
|
||||
else
|
||||
# Cross-build from x86 — use aarch64 cross-compiler
|
||||
if ! command -v aarch64-linux-gnu-gcc >/dev/null 2>&1; then
|
||||
echo "ERROR: aarch64-linux-gnu-gcc not found"
|
||||
echo "Install: apt-get install gcc-aarch64-linux-gnu"
|
||||
exit 1
|
||||
fi
|
||||
CROSS_COMPILE="aarch64-linux-gnu-"
|
||||
echo "==> Cross-building ARM64 kernel from $HOST_ARCH"
|
||||
fi
|
||||
|
||||
echo "==> Building generic ARM64 kernel (mainline $KVER)..."
|
||||
@@ -92,7 +107,7 @@ cd "$KERNEL_SRC_DIR"
|
||||
|
||||
# --- Base config: arm64 defconfig (generic ARMv8) ---
|
||||
echo "==> Applying arm64 defconfig..."
|
||||
make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- defconfig
|
||||
make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" defconfig
|
||||
|
||||
# --- Apply shared container fragment ---
|
||||
CONFIG_FRAGMENT="$PROJECT_ROOT/build/config/kernel-container.fragment"
|
||||
@@ -125,11 +140,11 @@ apply_fragment() {
|
||||
|
||||
echo "==> Applying kernel-container.fragment (pass 1)..."
|
||||
apply_fragment "$CONFIG_FRAGMENT"
|
||||
make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- olddefconfig
|
||||
make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" olddefconfig
|
||||
|
||||
echo "==> Applying kernel-container.fragment (pass 2)..."
|
||||
apply_fragment "$CONFIG_FRAGMENT"
|
||||
make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- olddefconfig
|
||||
make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" olddefconfig
|
||||
|
||||
# --- ARM64 virt-host specific enables ---
|
||||
# These are needed for the generic UEFI/virtio boot path but are arch-specific
|
||||
@@ -146,7 +161,7 @@ echo "==> Enabling ARM64 virt-host configs..."
|
||||
./scripts/config --enable CONFIG_HW_RANDOM_VIRTIO
|
||||
# NVMe for cloud / bare-metal ARM64 hosts that don't use virtio
|
||||
./scripts/config --enable CONFIG_BLK_DEV_NVME
|
||||
make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- olddefconfig
|
||||
make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" olddefconfig
|
||||
|
||||
# --- Verify critical configs ---
|
||||
echo "==> Verifying critical configs..."
|
||||
@@ -165,7 +180,7 @@ echo ""
|
||||
echo "==> Building ARM64 kernel (${NPROC} parallel jobs)..."
|
||||
echo " This may take 20-40 minutes on a 6-core Odroid..."
|
||||
|
||||
make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -j"$NPROC" Image modules 2>&1
|
||||
make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" -j"$NPROC" Image modules 2>&1
|
||||
|
||||
echo "==> Kernel build complete"
|
||||
|
||||
@@ -176,7 +191,7 @@ cp arch/arm64/boot/Image "$CUSTOM_IMAGE"
|
||||
echo "==> Installing modules (stripped)..."
|
||||
rm -rf "$CUSTOM_MODULES"
|
||||
mkdir -p "$CUSTOM_MODULES"
|
||||
make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- \
|
||||
make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" \
|
||||
INSTALL_MOD_STRIP=1 modules_install INSTALL_MOD_PATH="$CUSTOM_MODULES"
|
||||
|
||||
# Pick up actual kernel version (e.g. 6.12.10 if KVER differs from package suffix)
|
||||
|
||||
@@ -35,11 +35,24 @@ if [ -f "$CUSTOM_IMAGE" ] && [ -d "$CUSTOM_MODULES" ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# --- Verify cross-compiler ---
|
||||
if ! command -v aarch64-linux-gnu-gcc >/dev/null 2>&1; then
|
||||
# --- Toolchain selection: native on arm64 hosts, cross-compile elsewhere ---
|
||||
HOST_ARCH="$(uname -m)"
|
||||
if [ "$HOST_ARCH" = "aarch64" ] || [ "$HOST_ARCH" = "arm64" ]; then
|
||||
if ! command -v gcc >/dev/null 2>&1; then
|
||||
echo "ERROR: gcc not found"
|
||||
echo "Install: apt-get install build-essential"
|
||||
exit 1
|
||||
fi
|
||||
CROSS_COMPILE=""
|
||||
echo "==> Native ARM64 build (host arch: $HOST_ARCH)"
|
||||
else
|
||||
if ! command -v aarch64-linux-gnu-gcc >/dev/null 2>&1; then
|
||||
echo "ERROR: aarch64-linux-gnu-gcc not found"
|
||||
echo "Install: apt-get install gcc-aarch64-linux-gnu"
|
||||
exit 1
|
||||
fi
|
||||
CROSS_COMPILE="aarch64-linux-gnu-"
|
||||
echo "==> Cross-building RPi kernel from $HOST_ARCH"
|
||||
fi
|
||||
|
||||
echo "==> Building RPi kernel (raspberrypi/linux)..."
|
||||
@@ -65,7 +78,7 @@ cd "$KERNEL_BUILD_DIR"
|
||||
|
||||
# --- Apply base config (Pi 4 = bcm2711) ---
|
||||
echo "==> Applying bcm2711_defconfig..."
|
||||
make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- bcm2711_defconfig
|
||||
make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" bcm2711_defconfig
|
||||
|
||||
# --- Apply container config overrides ---
|
||||
CONFIG_FRAGMENT="$PROJECT_ROOT/build/config/kernel-container.fragment"
|
||||
@@ -98,7 +111,7 @@ if [ -f "$CONFIG_FRAGMENT" ]; then
|
||||
fi
|
||||
|
||||
# Resolve dependencies
|
||||
make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- olddefconfig
|
||||
make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" olddefconfig
|
||||
|
||||
# --- Build kernel + modules + DTBs ---
|
||||
NPROC=$(nproc 2>/dev/null || echo 4)
|
||||
@@ -106,7 +119,7 @@ echo ""
|
||||
echo "==> Building RPi kernel (${NPROC} parallel jobs)..."
|
||||
echo " This may take 20-30 minutes..."
|
||||
|
||||
make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -j"$NPROC" Image modules dtbs 2>&1
|
||||
make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" -j"$NPROC" Image modules dtbs 2>&1
|
||||
|
||||
echo "==> RPi kernel build complete"
|
||||
|
||||
@@ -117,7 +130,7 @@ cp arch/arm64/boot/Image "$CUSTOM_IMAGE"
|
||||
echo "==> Installing modules (stripped)..."
|
||||
rm -rf "$CUSTOM_MODULES"
|
||||
mkdir -p "$CUSTOM_MODULES"
|
||||
make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- \
|
||||
make ARCH=arm64 CROSS_COMPILE="$CROSS_COMPILE" \
|
||||
INSTALL_MOD_STRIP=1 modules_install INSTALL_MOD_PATH="$CUSTOM_MODULES"
|
||||
|
||||
# Remove build/source symlinks
|
||||
|
||||
@@ -60,17 +60,19 @@ if [ "$FETCH_ARCH" = "arm64" ]; then
|
||||
BIN_URL="https://github.com/portainer/kubesolo/releases/download/${KUBESOLO_VERSION}/kubesolo-${KUBESOLO_VERSION}-linux-arm64-musl.tar.gz"
|
||||
BIN_URL_FALLBACK="https://github.com/portainer/kubesolo/releases/download/${KUBESOLO_VERSION}/kubesolo-${KUBESOLO_VERSION}-linux-arm64.tar.gz"
|
||||
TEMP_DIR=$(mktemp -d)
|
||||
TARBALL="$TEMP_DIR/kubesolo.tar.gz"
|
||||
echo " URL: $BIN_URL"
|
||||
if curl -fSL "$BIN_URL" -o "$TEMP_DIR/kubesolo.tar.gz" 2>/dev/null; then
|
||||
if curl -fSL "$BIN_URL" -o "$TARBALL" 2>/dev/null; then
|
||||
echo " Downloaded musl variant (arm64)"
|
||||
elif curl -fSL "$BIN_URL_FALLBACK" -o "$TEMP_DIR/kubesolo.tar.gz" 2>/dev/null; then
|
||||
elif curl -fSL "$BIN_URL_FALLBACK" -o "$TARBALL" 2>/dev/null; then
|
||||
echo " Downloaded glibc variant (arm64 fallback)"
|
||||
else
|
||||
echo "ERROR: Failed to download KubeSolo ARM64 from GitHub."
|
||||
rm -rf "$TEMP_DIR"
|
||||
exit 1
|
||||
fi
|
||||
tar -xzf "$TEMP_DIR/kubesolo.tar.gz" -C "$TEMP_DIR"
|
||||
verify_checksum "$TARBALL" "${KUBESOLO_SHA256_ARM64:-}" "KubeSolo arm64 tarball"
|
||||
tar -xzf "$TARBALL" -C "$TEMP_DIR"
|
||||
FOUND_BIN=$(find "$TEMP_DIR" -name "kubesolo" -type f ! -name "*.tar.gz" | head -1)
|
||||
if [ -z "$FOUND_BIN" ]; then
|
||||
echo "ERROR: Could not find kubesolo binary in extracted archive"
|
||||
@@ -131,11 +133,12 @@ else
|
||||
|
||||
TEMP_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "$TEMP_DIR"' EXIT
|
||||
TARBALL="$TEMP_DIR/kubesolo.tar.gz"
|
||||
|
||||
echo " URL: $BIN_URL"
|
||||
if curl -fSL "$BIN_URL" -o "$TEMP_DIR/kubesolo.tar.gz" 2>/dev/null; then
|
||||
if curl -fSL "$BIN_URL" -o "$TARBALL" 2>/dev/null; then
|
||||
echo " Downloaded musl variant"
|
||||
elif curl -fSL "$BIN_URL_FALLBACK" -o "$TEMP_DIR/kubesolo.tar.gz" 2>/dev/null; then
|
||||
elif curl -fSL "$BIN_URL_FALLBACK" -o "$TARBALL" 2>/dev/null; then
|
||||
echo " Downloaded glibc variant (fallback)"
|
||||
else
|
||||
echo "ERROR: Failed to download KubeSolo from GitHub."
|
||||
@@ -148,9 +151,10 @@ else
|
||||
echo " 3. Re-run: make rootfs"
|
||||
exit 1
|
||||
fi
|
||||
verify_checksum "$TARBALL" "${KUBESOLO_SHA256_AMD64:-}" "KubeSolo amd64 tarball"
|
||||
|
||||
# Extract binary from tarball
|
||||
tar -xzf "$TEMP_DIR/kubesolo.tar.gz" -C "$TEMP_DIR"
|
||||
tar -xzf "$TARBALL" -C "$TEMP_DIR"
|
||||
|
||||
# Find the kubesolo binary in extracted contents
|
||||
FOUND_BIN=$(find "$TEMP_DIR" -name "kubesolo" -type f ! -name "*.tar.gz" | head -1)
|
||||
@@ -168,7 +172,6 @@ else
|
||||
rm -rf "$TEMP_DIR"
|
||||
|
||||
echo "==> KubeSolo binary: $KUBESOLO_BIN ($(du -h "$KUBESOLO_BIN" | cut -f1))"
|
||||
verify_checksum "$KUBESOLO_BIN" "$KUBESOLO_SHA256" "KubeSolo binary"
|
||||
fi
|
||||
|
||||
# --- Tiny Core kernel module extensions (netfilter, iptables) ---
|
||||
|
||||
@@ -55,10 +55,44 @@ rm -f "$ROOTFS/sbin/init"
|
||||
cp "$PROJECT_ROOT/init/init.sh" "$ROOTFS/sbin/init"
|
||||
chmod +x "$ROOTFS/sbin/init"
|
||||
|
||||
# Init stages
|
||||
# Replace the upstream /init at the rootfs root with our staged init.
|
||||
# The kernel ALWAYS runs /init when booting from an initramfs (legacy root-mount
|
||||
# fallback otherwise). piCore/TC ship their own /init; ours has to take its
|
||||
# place so the kernel runs our staged boot, not piCore's TCE handler.
|
||||
rm -f "$ROOTFS/init"
|
||||
cp "$PROJECT_ROOT/init/init.sh" "$ROOTFS/init"
|
||||
chmod +x "$ROOTFS/init"
|
||||
echo " Installed staged init at /init and /sbin/init"
|
||||
|
||||
# --- 2b. BusyBox override for ARM64 ---
|
||||
# piCore64 v15's BusyBox is dynamically linked and uses ARM instructions that
|
||||
# QEMU virt cannot emulate even with -cpu max, causing applets (mkdir, uname,
|
||||
# etc.) to SIGILL. Replace with the host's statically-linked busybox-static
|
||||
# package, which is built for generic ARMv8-A and runs anywhere.
|
||||
#
|
||||
# On x86 builds this isn't an issue (TC's BusyBox works fine on QEMU x86).
|
||||
if [ "$INJECT_ARCH" = "arm64" ] && [ -x /bin/busybox ]; then
|
||||
if file /bin/busybox 2>/dev/null | grep -q 'statically linked'; then
|
||||
cp /bin/busybox "$ROOTFS/bin/busybox"
|
||||
# busybox.suid is used by mount/su/etc. Same binary; suid bit applied
|
||||
# separately. We don't need suid for our use (init runs as PID 1 / uid 0).
|
||||
cp /bin/busybox "$ROOTFS/bin/busybox.suid"
|
||||
chmod +x "$ROOTFS/bin/busybox" "$ROOTFS/bin/busybox.suid"
|
||||
echo " Replaced piCore BusyBox with host's static busybox ($(du -h /bin/busybox | cut -f1))"
|
||||
else
|
||||
echo " WARN: /bin/busybox on host is not static; piCore BusyBox kept (may crash in QEMU virt)"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Init stages — copy NN-name.sh files only. functions.sh is a shared library
|
||||
# (sourced by init.sh proper), not a numbered stage; if it ends up in init.d
|
||||
# the main loop will try to run it as a stage and fail.
|
||||
mkdir -p "$ROOTFS/usr/lib/kubesolo-os/init.d"
|
||||
for stage in "$PROJECT_ROOT"/init/lib/*.sh; do
|
||||
[ -f "$stage" ] || continue
|
||||
case "$(basename "$stage")" in
|
||||
functions.sh) continue ;;
|
||||
esac
|
||||
cp "$stage" "$ROOTFS/usr/lib/kubesolo-os/init.d/"
|
||||
chmod +x "$ROOTFS/usr/lib/kubesolo-os/init.d/$(basename "$stage")"
|
||||
done
|
||||
@@ -190,9 +224,14 @@ if [ -f "$CUSTOM_VMLINUZ" ] && [ -d "$CUSTOM_MODULES/lib/modules/$KVER" ]; then
|
||||
fi
|
||||
|
||||
while IFS= read -r mod; do
|
||||
# Skip comments and blank lines
|
||||
case "$mod" in \#*|"") continue ;; esac
|
||||
mod=$(echo "$mod" | xargs) # trim whitespace
|
||||
# Strip any inline "# comment" tail before further processing —
|
||||
# several entries in the upstream lists started carrying inline
|
||||
# docs and silently broke module loading because modprobe got
|
||||
# passed "name # comment" as the module name.
|
||||
mod="${mod%%#*}"
|
||||
# Skip blank-or-comment-only lines
|
||||
case "$mod" in "") continue ;; esac
|
||||
mod=$(echo "$mod" | xargs) # trim whitespace + collapse internal
|
||||
[ -z "$mod" ] && continue
|
||||
|
||||
if [ "$MODPROBE_WORKS" = true ]; then
|
||||
@@ -363,7 +402,13 @@ if [ -f /usr/sbin/xtables-nft-multi ]; then
|
||||
ln -sf xtables-nft-multi "$ROOTFS/usr/sbin/$cmd"
|
||||
done
|
||||
|
||||
# Copy required shared libraries (architecture-aware paths)
|
||||
# Copy required shared libraries (architecture-aware paths).
|
||||
# We deliberately bundle the *full* glibc family from the build host —
|
||||
# not just libc.so.6 — so dynamically-linked binaries we ship (nft,
|
||||
# xtables-nft-multi, etc.) load a consistent set of libraries. Mixing
|
||||
# glibc components across versions causes __stack_chk_guard mismatches
|
||||
# ("stack smashing detected" aborts) when stack frames cross between
|
||||
# functions linked against different libcs.
|
||||
mkdir -p "$ROOTFS/usr/lib/$LIB_ARCH" "$ROOTFS/lib/$LIB_ARCH"
|
||||
[ "$INJECT_ARCH" != "arm64" ] && mkdir -p "$ROOTFS/lib64"
|
||||
for lib in \
|
||||
@@ -371,6 +416,13 @@ if [ -f /usr/sbin/xtables-nft-multi ]; then
|
||||
"/lib/$LIB_ARCH/libmnl.so.0"* \
|
||||
"/lib/$LIB_ARCH/libnftnl.so.11"* \
|
||||
"/lib/$LIB_ARCH/libc.so.6" \
|
||||
"/lib/$LIB_ARCH/libpthread.so.0" \
|
||||
"/lib/$LIB_ARCH/libdl.so.2" \
|
||||
"/lib/$LIB_ARCH/libm.so.6" \
|
||||
"/lib/$LIB_ARCH/libresolv.so.2" \
|
||||
"/lib/$LIB_ARCH/librt.so.1" \
|
||||
"/lib/$LIB_ARCH/libanl.so.1" \
|
||||
"/lib/$LIB_ARCH/libgcc_s.so.1" \
|
||||
"$LD_SO"; do
|
||||
[ -e "$lib" ] && cp -aL "$lib" "$ROOTFS${lib}" 2>/dev/null || true
|
||||
done
|
||||
@@ -386,6 +438,30 @@ else
|
||||
echo " WARN: xtables-nft-multi not found in builder (install iptables package)"
|
||||
fi
|
||||
|
||||
# Install nft (nftables CLI). KubeSolo v1.1.4+ uses `nft add table ip
|
||||
# kubesolo-masq` to own pod masquerade rules directly instead of going
|
||||
# through kube-proxy/CNI. Without nft in PATH, KubeSolo FATALs at startup
|
||||
# with: nft: executable file not found in $PATH.
|
||||
echo " Installing nft (nftables CLI) from builder..."
|
||||
if [ -f /usr/sbin/nft ]; then
|
||||
cp /usr/sbin/nft "$ROOTFS/usr/sbin/"
|
||||
# nft pulls in libnftables + a few extras beyond what iptables-nft needed.
|
||||
# libmnl, libnftnl, libxtables already copied by the iptables-nft block.
|
||||
for lib in \
|
||||
"/lib/$LIB_ARCH/libnftables.so.1"* \
|
||||
"/lib/$LIB_ARCH/libedit.so.2"* \
|
||||
"/lib/$LIB_ARCH/libjansson.so.4"* \
|
||||
"/lib/$LIB_ARCH/libgmp.so.10"* \
|
||||
"/lib/$LIB_ARCH/libtinfo.so.6"* \
|
||||
"/lib/$LIB_ARCH/libbsd.so.0"* \
|
||||
"/lib/$LIB_ARCH/libmd.so.0"*; do
|
||||
[ -e "$lib" ] && cp -aL "$lib" "$ROOTFS${lib}" 2>/dev/null || true
|
||||
done
|
||||
echo " Installed nft + shared libs"
|
||||
else
|
||||
echo " WARN: nft not found in builder (install nftables package) — KubeSolo v1.1.4+ pod masquerade will fail"
|
||||
fi
|
||||
|
||||
# Kernel modules list (for init to load at boot)
|
||||
if [ "$INJECT_ARCH" = "arm64" ]; then
|
||||
cp "$PROJECT_ROOT/build/config/modules-arm64.list" "$ROOTFS/usr/lib/kubesolo-os/modules.list"
|
||||
@@ -483,6 +559,54 @@ nameserver 1.1.1.1
|
||||
EOF
|
||||
fi
|
||||
|
||||
# --- Resolve dual-glibc ambiguity (ARM64) ---
|
||||
# piCore64's rootfs ships glibc at /lib/libc.so.6, and we've copied the
|
||||
# build host's glibc to /lib/$LIB_ARCH/libc.so.6. Two libc.so.6 in the
|
||||
# dynamic linker's search path can lead to a process loading both — one
|
||||
# directly, one transitively — and "stack smashing detected" aborts when
|
||||
# stack frames cross between them (each libc has its own
|
||||
# __stack_chk_guard). Remove piCore's copies so resolution is unambiguous
|
||||
# and write a proper /etc/ld.so.conf + cache pointing at our copies.
|
||||
if [ "$INJECT_ARCH" = "arm64" ] && [ -d "$ROOTFS/lib/$LIB_ARCH" ]; then
|
||||
echo " Pruning duplicate glibc components in $ROOTFS/lib/..."
|
||||
for lib in \
|
||||
libc.so.6 \
|
||||
libpthread.so.0 \
|
||||
libdl.so.2 \
|
||||
libm.so.6 \
|
||||
libresolv.so.2 \
|
||||
librt.so.1 \
|
||||
libanl.so.1 \
|
||||
libgcc_s.so.1; do
|
||||
# Only delete piCore's copy when our version exists; otherwise
|
||||
# we'd leave the binary unable to find any libc at all.
|
||||
if [ -e "$ROOTFS/lib/$lib" ] && [ -e "$ROOTFS/lib/$LIB_ARCH/$lib" ]; then
|
||||
rm -f "$ROOTFS/lib/$lib"
|
||||
fi
|
||||
done
|
||||
|
||||
# ld.so.conf gives our $LIB_ARCH paths precedence over piCore's /lib
|
||||
# (defaults vary by glibc version; this makes the order explicit).
|
||||
cat > "$ROOTFS/etc/ld.so.conf" <<EOF
|
||||
/lib/$LIB_ARCH
|
||||
/usr/lib/$LIB_ARCH
|
||||
/usr/local/lib
|
||||
/lib
|
||||
/usr/lib
|
||||
EOF
|
||||
|
||||
# Generate /etc/ld.so.cache. ldconfig -r treats $ROOTFS as the system
|
||||
# root, so it reads ld.so.conf from there and writes the cache there.
|
||||
# Works even cross-arch (it only parses ELF headers, doesn't execute).
|
||||
if command -v ldconfig >/dev/null 2>&1; then
|
||||
ldconfig -r "$ROOTFS" 2>/dev/null && \
|
||||
echo " Generated /etc/ld.so.cache via ldconfig" || \
|
||||
echo " WARN: ldconfig failed; falling back to default search order"
|
||||
else
|
||||
echo " WARN: ldconfig not on builder; cache not generated"
|
||||
fi
|
||||
fi
|
||||
|
||||
# --- Summary ---
|
||||
echo ""
|
||||
echo "==> Injection complete. Rootfs contents:"
|
||||
|
||||
150
build/scripts/push-oci-artifact.sh
Executable file
150
build/scripts/push-oci-artifact.sh
Executable file
@@ -0,0 +1,150 @@
|
||||
#!/bin/bash
# push-oci-artifact.sh — Publish a KubeSolo OS update artifact to an OCI registry.
#
# Produces the artifact format consumed by `kubesolo-update --registry`:
#
#   <registry>/<repo>:<version>-<arch>   per-arch manifest, layers:
#     * vmlinuz (Image on arm64) → application/vnd.kubesolo.os.kernel.v1+octet-stream
#     * kubesolo-os.gz           → application/vnd.kubesolo.os.initramfs.v1+gzip
#   annotations:
#     io.kubesolo.os.version
#     io.kubesolo.os.channel
#     io.kubesolo.os.architecture
#     io.kubesolo.os.min_compatible_version (optional)
#
# After running this for each architecture, combine the per-arch tags into a
# multi-arch index with `oras manifest index create` (see end of script).
#
# Requires: oras (>= 1.2).  (Earlier revisions also listed curl and jq, but
# nothing in this script invokes them.)
#
# Usage:
#   build/scripts/push-oci-artifact.sh \
#     --registry ghcr.io/portainer/kubesolo-os \
#     --arch amd64 \
#     --channel stable \
#     [--min-compatible-version v0.2.0] \
#     [--release-notes "text"]
#
# Authentication: oras reads ~/.docker/config.json. In CI, run
# `oras login ghcr.io -u USER -p TOKEN` before invoking this script
# (or set DOCKER_CONFIG to a directory with config.json).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
VERSION="$(cat "$PROJECT_ROOT/VERSION")"
OUTPUT_DIR="$PROJECT_ROOT/output"
CACHE_DIR="$PROJECT_ROOT/build/cache"

REGISTRY=""
ARCH=""
CHANNEL="stable"
MIN_COMPATIBLE_VERSION=""
RELEASE_NOTES=""

while [ $# -gt 0 ]; do
  case "$1" in
    --registry) REGISTRY="$2"; shift 2 ;;
    --arch) ARCH="$2"; shift 2 ;;
    --channel) CHANNEL="$2"; shift 2 ;;
    --min-compatible-version) MIN_COMPATIBLE_VERSION="$2"; shift 2 ;;
    --release-notes) RELEASE_NOTES="$2"; shift 2 ;;
    *) echo "Unknown option: $1" >&2; exit 1 ;;
  esac
done

if [ -z "$REGISTRY" ] || [ -z "$ARCH" ]; then
  # BUGFIX: the usage line previously omitted --release-notes even though the
  # option is parsed above.
  echo "Usage: $0 --registry REGISTRY/REPO --arch (amd64|arm64) [--channel stable] [--min-compatible-version vX.Y.Z] [--release-notes TEXT]" >&2
  exit 1
fi

if ! command -v oras >/dev/null 2>&1; then
  echo "ERROR: oras CLI not found. Install from https://oras.land/docs/installation/" >&2
  echo "       or apt-get install oras (Ubuntu 24.04+)" >&2
  exit 1
fi

# Locate the artifacts. For arm64 the kernel is "Image"; everywhere else it's
# "vmlinuz". Initramfs is always kubesolo-os.gz.
case "$ARCH" in
  amd64)
    KERNEL="$CACHE_DIR/custom-kernel/vmlinuz"
    [ -f "$KERNEL" ] || KERNEL="$OUTPUT_DIR/vmlinuz"
    KERNEL_BASENAME="vmlinuz"
    ;;
  arm64)
    KERNEL="$CACHE_DIR/kernel-arm64-generic/Image"
    KERNEL_BASENAME="vmlinuz" # we publish under the vmlinuz name regardless;
                              # the consumer looks up by media type, not filename.
    ;;
  *)
    echo "ERROR: unsupported --arch $ARCH (use amd64 or arm64)" >&2
    exit 1
    ;;
esac

INITRAMFS="$PROJECT_ROOT/build/rootfs-work/kubesolo-os.gz"

if [ ! -f "$KERNEL" ]; then
  echo "ERROR: kernel not found at $KERNEL" >&2
  echo "       Run 'make kernel' (amd64) or 'make kernel-arm64' (arm64) first." >&2
  exit 1
fi
if [ ! -f "$INITRAMFS" ]; then
  echo "ERROR: initramfs not found at $INITRAMFS" >&2
  echo "       Run 'make initramfs' or 'make rootfs-arm64' first." >&2
  exit 1
fi

# Stage files in a temp dir so the basenames in the manifest are clean.
STAGE="$(mktemp -d)"
trap 'rm -rf "$STAGE"' EXIT
cp "$KERNEL" "$STAGE/$KERNEL_BASENAME"
cp "$INITRAMFS" "$STAGE/kubesolo-os.gz"

KERNEL_MEDIA="application/vnd.kubesolo.os.kernel.v1+octet-stream"
INITRD_MEDIA="application/vnd.kubesolo.os.initramfs.v1+gzip"

REF="${REGISTRY}:${VERSION}-${ARCH}"
CHANNEL_REF="${REGISTRY}:${CHANNEL}-${ARCH}"

echo "==> Pushing ${REF}"
echo "    kernel:    $KERNEL ($(du -h "$KERNEL" | cut -f1))"
echo "    initramfs: $INITRAMFS ($(du -h "$INITRAMFS" | cut -f1))"

ORAS_ANNOTATIONS=(
  --annotation "io.kubesolo.os.version=${VERSION}"
  --annotation "io.kubesolo.os.channel=${CHANNEL}"
  --annotation "io.kubesolo.os.architecture=${ARCH}"
)
if [ -n "$MIN_COMPATIBLE_VERSION" ]; then
  ORAS_ANNOTATIONS+=(--annotation "io.kubesolo.os.min_compatible_version=${MIN_COMPATIBLE_VERSION}")
fi
if [ -n "$RELEASE_NOTES" ]; then
  ORAS_ANNOTATIONS+=(--annotation "io.kubesolo.os.release_notes=${RELEASE_NOTES}")
fi
ORAS_ANNOTATIONS+=(--annotation "io.kubesolo.os.release_date=$(date -u +%Y-%m-%dT%H:%M:%SZ)")

# oras push: --artifact-type sets the manifest artifactType field;
# file:type syntax sets per-layer media types.
(cd "$STAGE" && oras push "$REF" \
  --artifact-type "application/vnd.kubesolo.os.update.v1+json" \
  "${ORAS_ANNOTATIONS[@]}" \
  "${KERNEL_BASENAME}:${KERNEL_MEDIA}" \
  "kubesolo-os.gz:${INITRD_MEDIA}")

# Also tag as <channel>-<arch> so the manifest-index step can reference it
# stably across patch releases.
echo "==> Tagging ${CHANNEL_REF}"
oras tag "$REF" "${CHANNEL}-${ARCH}"

echo ""
echo "==> Published:"
echo "    ${REF}"
echo "    ${CHANNEL_REF}"
echo ""
echo "To combine multi-arch into the channel index, run after both arches are pushed:"
echo ""
echo "  oras manifest index create ${REGISTRY}:${CHANNEL} \\"
echo "    ${REGISTRY}:${CHANNEL}-amd64,platform=linux/amd64 \\"
echo "    ${REGISTRY}:${CHANNEL}-arm64,platform=linux/arm64"
echo ""
|
||||
@@ -97,6 +97,11 @@ func cmdApply(configPath string) error {
|
||||
return fmt.Errorf("portainer edge agent: %w", err)
|
||||
}
|
||||
|
||||
// 5. Write /etc/kubesolo/update.conf from updates: block (if any).
|
||||
if err := cloudinit.ApplyUpdates(cfg, ""); err != nil {
|
||||
return fmt.Errorf("updates: %w", err)
|
||||
}
|
||||
|
||||
// 5. Save persistent configs for next boot
|
||||
if err := cloudinit.SaveHostname(cfg, persistDataDir+"/etc-kubesolo"); err != nil {
|
||||
slog.Warn("failed to save hostname", "error", err)
|
||||
|
||||
@@ -18,6 +18,24 @@ type Config struct {
|
||||
NTP NTPConfig `yaml:"ntp"`
|
||||
Airgap AirgapConfig `yaml:"airgap"`
|
||||
Portainer PortainerConfig `yaml:"portainer"`
|
||||
Updates UpdatesConfig `yaml:"updates"`
|
||||
}
|
||||
|
||||
// UpdatesConfig holds the settings for the kubesolo-update agent that
// cloud-init persists to /etc/kubesolo/update.conf on first boot.
// Mirrors update/pkg/config.
type UpdatesConfig struct {
	// Server is the update server URL (HTTP or OCI registry).
	Server string `yaml:"server"`
	// Channel selects which channel to track ("stable", "beta", "edge").
	// An empty value means "stable".
	Channel string `yaml:"channel"`
	// MaintenanceWindow confines apply to a local time range such as
	// "03:00-05:00"; windows that wrap midnight ("23:00-01:00") are
	// supported too. An empty value means no restriction.
	MaintenanceWindow string `yaml:"maintenance_window"`
	// PubKey is the path to the Ed25519 public key file used to verify
	// signed update artifacts; leave empty to disable verification.
	PubKey string `yaml:"pubkey"`
}
|
||||
|
||||
// NetworkConfig defines network settings.
|
||||
@@ -40,6 +58,14 @@ type KubeSoloConfig struct {
|
||||
PortainerEdgeID string `yaml:"portainer-edge-id"`
|
||||
PortainerEdgeKey string `yaml:"portainer-edge-key"`
|
||||
PortainerEdgeAsync bool `yaml:"portainer-edge-async"`
|
||||
// v1.1.4+: skip edge-optimised overrides, use upstream k8s defaults
|
||||
// (useful for CI and powerful machines, disabled by default).
|
||||
Full bool `yaml:"full"`
|
||||
// v1.1.5+: disable IPv6 in the cluster.
|
||||
DisableIPv6 bool `yaml:"disable-ipv6"`
|
||||
// v1.1.5+: detect SQLite WAL corruption on startup and recover from
|
||||
// unclean shutdowns (e.g. power loss). Recommended ON for edge devices.
|
||||
DBWALRepair bool `yaml:"db-wal-repair"`
|
||||
}
|
||||
|
||||
// NTPConfig defines NTP settings.
|
||||
|
||||
@@ -36,5 +36,50 @@ kubesolo:
|
||||
portainer-edge-key: "your-edge-key"
|
||||
portainer-edge-async: true
|
||||
|
||||
# KubeSolo v1.1.4+: skip the edge-optimised overrides and use upstream
|
||||
# Kubernetes defaults. Useful for CI and high-spec machines. Default off.
|
||||
full: false
|
||||
|
||||
# KubeSolo v1.1.5+: disable IPv6 throughout the cluster. Default off.
|
||||
disable-ipv6: false
|
||||
|
||||
# KubeSolo v1.1.5+: detect SQLite WAL corruption at startup and recover
|
||||
# from unclean shutdowns (e.g. power loss). Recommended ON for edge
|
||||
# appliances that may lose power.
|
||||
db-wal-repair: true
|
||||
|
||||
# Arbitrary extra flags passed directly to the KubeSolo binary
|
||||
# extra-flags: "--disable traefik --disable servicelb"
|
||||
|
||||
# Update agent settings (written to /etc/kubesolo/update.conf on first boot).
|
||||
# Omit any subfield to leave the corresponding default in place.
|
||||
updates:
|
||||
# Update server URL — HTTPS for the JSON+blob protocol, or an OCI registry
|
||||
# reference (e.g. ghcr.io/portainer/kubesolo-os) when OCI distribution
|
||||
# lands in v0.3.
|
||||
server: "https://updates.kubesolo.example.com"
|
||||
|
||||
# Channel to track. "stable" is the default; "beta"/"edge" expose
|
||||
# pre-release artifacts. The agent refuses to apply metadata whose
|
||||
# channel doesn't match.
|
||||
channel: "stable"
|
||||
|
||||
# Maintenance window (local time, HH:MM-HH:MM, wrapping midnight OK).
|
||||
# `apply` refuses to run outside this window unless --force is passed.
|
||||
# Leave empty (or omit) to allow updates at any time.
|
||||
maintenance_window: "03:00-05:00"
|
||||
|
||||
# Path to Ed25519 public key for signature verification. Omit to disable
|
||||
# signature verification (NOT recommended for production fleets).
|
||||
# pubkey: "/etc/kubesolo/update-pubkey.hex"
|
||||
|
||||
# Optional post-boot healthcheck probe URL. If set, healthcheck GETs it
|
||||
# and treats anything other than HTTP 200 as a failure. Useful when your
|
||||
# workload exposes its own readiness on a known endpoint.
|
||||
# healthcheck_url: "http://localhost:8000/ready"
|
||||
|
||||
# Auto-rollback threshold: after N consecutive post-activation healthcheck
|
||||
# failures, the agent triggers a rollback on its own. 0 disables the
|
||||
# feature (the bootloader still does GRUB-counter-based rollback after
|
||||
# 3 failed boots). Recommended: 3 for production fleets.
|
||||
# auto_rollback_after: 3
|
||||
|
||||
@@ -70,6 +70,18 @@ func buildExtraFlags(cfg *Config) string {
|
||||
parts = append(parts, "--portainer-edge-async")
|
||||
}
|
||||
|
||||
if cfg.KubeSolo.Full {
|
||||
parts = append(parts, "--full")
|
||||
}
|
||||
|
||||
if cfg.KubeSolo.DisableIPv6 {
|
||||
parts = append(parts, "--disable-ipv6")
|
||||
}
|
||||
|
||||
if cfg.KubeSolo.DBWALRepair {
|
||||
parts = append(parts, "--db-wal-repair")
|
||||
}
|
||||
|
||||
return strings.Join(parts, " ")
|
||||
}
|
||||
|
||||
|
||||
57
cloud-init/updates.go
Normal file
57
cloud-init/updates.go
Normal file
@@ -0,0 +1,57 @@
|
||||
package cloudinit
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// DefaultUpdateConfPath is the location the update agent reads its
// configuration from when no explicit path is given. Must stay in sync
// with update/pkg/config.DefaultPath.
const DefaultUpdateConfPath = "/etc/kubesolo/update.conf"
|
||||
|
||||
// ApplyUpdates writes /etc/kubesolo/update.conf from the cloud-init
|
||||
// updates: block. Called once per boot; idempotent (overwrites any existing
|
||||
// file with the cloud-init values).
|
||||
//
|
||||
// If the updates: block is empty (all fields blank), the file is not
|
||||
// written — preserves any hand-edited update.conf on systems that aren't
|
||||
// managed via cloud-init.
|
||||
func ApplyUpdates(cfg *Config, confPath string) error {
|
||||
if confPath == "" {
|
||||
confPath = DefaultUpdateConfPath
|
||||
}
|
||||
u := cfg.Updates
|
||||
if u.Server == "" && u.Channel == "" && u.MaintenanceWindow == "" && u.PubKey == "" {
|
||||
// Nothing to write — leave any existing file alone.
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(filepath.Dir(confPath), 0o755); err != nil {
|
||||
return fmt.Errorf("creating dir for %s: %w", confPath, err)
|
||||
}
|
||||
|
||||
var sb strings.Builder
|
||||
sb.WriteString("# Generated by KubeSolo OS cloud-init — edit this file or the\n")
|
||||
sb.WriteString("# cloud-init source YAML; subsequent first-boots will regenerate it.\n")
|
||||
if u.Server != "" {
|
||||
fmt.Fprintf(&sb, "server = %s\n", u.Server)
|
||||
}
|
||||
if u.Channel != "" {
|
||||
fmt.Fprintf(&sb, "channel = %s\n", u.Channel)
|
||||
}
|
||||
if u.MaintenanceWindow != "" {
|
||||
fmt.Fprintf(&sb, "maintenance_window = %s\n", u.MaintenanceWindow)
|
||||
}
|
||||
if u.PubKey != "" {
|
||||
fmt.Fprintf(&sb, "pubkey = %s\n", u.PubKey)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(confPath, []byte(sb.String()), 0o644); err != nil {
|
||||
return fmt.Errorf("writing %s: %w", confPath, err)
|
||||
}
|
||||
slog.Info("wrote update.conf", "path", confPath)
|
||||
return nil
|
||||
}
|
||||
81
cloud-init/updates_test.go
Normal file
81
cloud-init/updates_test.go
Normal file
@@ -0,0 +1,81 @@
|
||||
package cloudinit
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestApplyUpdatesEmptyConfigSkipsWrite(t *testing.T) {
|
||||
confPath := filepath.Join(t.TempDir(), "update.conf")
|
||||
cfg := &Config{} // Updates block default-zero
|
||||
if err := ApplyUpdates(cfg, confPath); err != nil {
|
||||
t.Fatalf("apply: %v", err)
|
||||
}
|
||||
if _, err := os.Stat(confPath); !os.IsNotExist(err) {
|
||||
t.Errorf("expected no file when cloud-init Updates is empty, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyUpdatesAllFields(t *testing.T) {
|
||||
confPath := filepath.Join(t.TempDir(), "update.conf")
|
||||
cfg := &Config{Updates: UpdatesConfig{
|
||||
Server: "https://updates.example.com",
|
||||
Channel: "stable",
|
||||
MaintenanceWindow: "03:00-05:00",
|
||||
PubKey: "/etc/kubesolo/pub.hex",
|
||||
}}
|
||||
if err := ApplyUpdates(cfg, confPath); err != nil {
|
||||
t.Fatalf("apply: %v", err)
|
||||
}
|
||||
data, err := os.ReadFile(confPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read: %v", err)
|
||||
}
|
||||
out := string(data)
|
||||
|
||||
wants := []string{
|
||||
"server = https://updates.example.com",
|
||||
"channel = stable",
|
||||
"maintenance_window = 03:00-05:00",
|
||||
"pubkey = /etc/kubesolo/pub.hex",
|
||||
}
|
||||
for _, w := range wants {
|
||||
if !strings.Contains(out, w) {
|
||||
t.Errorf("update.conf missing %q in output:\n%s", w, out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyUpdatesPartialFields(t *testing.T) {
|
||||
// Only server set — others should be omitted from the file, not written
|
||||
// as blank values.
|
||||
confPath := filepath.Join(t.TempDir(), "update.conf")
|
||||
cfg := &Config{Updates: UpdatesConfig{Server: "https://x.example.com"}}
|
||||
if err := ApplyUpdates(cfg, confPath); err != nil {
|
||||
t.Fatalf("apply: %v", err)
|
||||
}
|
||||
data, _ := os.ReadFile(confPath)
|
||||
out := string(data)
|
||||
if !strings.Contains(out, "server = https://x.example.com") {
|
||||
t.Errorf("missing server line:\n%s", out)
|
||||
}
|
||||
for _, unwanted := range []string{"channel = ", "maintenance_window = ", "pubkey = "} {
|
||||
if strings.Contains(out, unwanted) {
|
||||
t.Errorf("unexpected empty line %q present in:\n%s", unwanted, out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyUpdatesCreatesParentDir(t *testing.T) {
|
||||
// /etc/kubesolo may not exist on first boot before cloud-init runs.
|
||||
confPath := filepath.Join(t.TempDir(), "nested", "kubesolo", "update.conf")
|
||||
cfg := &Config{Updates: UpdatesConfig{Server: "https://x"}}
|
||||
if err := ApplyUpdates(cfg, confPath); err != nil {
|
||||
t.Fatalf("apply: %v", err)
|
||||
}
|
||||
if _, err := os.Stat(confPath); err != nil {
|
||||
t.Errorf("file not created: %v", err)
|
||||
}
|
||||
}
|
||||
125
docs/arm64-status.md
Normal file
125
docs/arm64-status.md
Normal file
@@ -0,0 +1,125 @@
|
||||
# ARM64 Generic Status (v0.3 in-progress)
|
||||
|
||||
End-of-Phase-3 snapshot of the generic ARM64 build track.
|
||||
|
||||
## What works
|
||||
|
||||
End-to-end boot through QEMU on an Odroid (aarch64 Ubuntu 22.04 build host):
|
||||
|
||||
1. `make kernel-arm64` produces a mainline 6.12.10 LTS kernel (44 MB Image, 868
|
||||
modules)
|
||||
2. `make rootfs-arm64` extracts piCore64 userland, replaces BusyBox with
|
||||
Ubuntu's static busybox-static, injects KubeSolo + Go agents + init scripts
|
||||
3. `make disk-image-arm64` produces a UEFI-bootable 4 GB GPT image with GRUB
|
||||
A/B slots
|
||||
4. `hack/dev-vm-arm64.sh --disk` boots the image:
|
||||
- UEFI firmware loads GRUB
|
||||
- GRUB loads kernel + initramfs
|
||||
- Custom init runs all 14 stages (early-mount, parse-cmdline, persistent-mount,
|
||||
kernel-modules, apparmor, sysctl, cloud-init, network, hostname, clock,
|
||||
containerd, security-lockdown, kubesolo)
|
||||
- Data partition mounts (ext4 on vda4)
|
||||
- Network configured (DHCP on virtio eth0)
|
||||
- KubeSolo starts; containerd boots successfully; CoreDNS + pause images
|
||||
register
|
||||
|
||||
## Known limitations of the current dev setup
|
||||
|
||||
These are debugging-environment issues, not production blockers:
|
||||
|
||||
### 1. QEMU TCG performance hits KubeSolo's image-import deadline
|
||||
|
||||
KubeSolo bundles its essential container images and imports them into
|
||||
containerd on first boot. Under QEMU TCG (software emulation on the Odroid's
|
||||
1.8 GB / 6-core ARM64), the import takes longer than KubeSolo's internal
|
||||
deadline, so we see:
|
||||
|
||||
```
|
||||
failed to import images: ... context deadline exceeded
|
||||
shutdown requested before containerd was ready
|
||||
```
|
||||
|
||||
On real ARM64 hardware (Graviton, Ampere, RPi 5, etc.) this import completes
|
||||
in seconds. KVM acceleration on the Odroid would also fix it, but the
|
||||
Odroid's vendor kernel (4.9.337-38) doesn't ship the KVM module — fixing that
|
||||
requires a host-kernel upgrade outside this project's scope.
|
||||
|
||||
### 2. Hardcoded `/dev/vda4` data partition path
|
||||
|
||||
Stage 20 currently expects `kubesolo.data=/dev/vda4` rather than
|
||||
`LABEL=KSOLODATA`. The LABEL= path is preferred (works regardless of disk
|
||||
naming on different hosts), but resolution depends on `blkid` and `findfs`,
|
||||
which:
|
||||
|
||||
- piCore64 ships as dynamic util-linux binaries that crash in QEMU virt
|
||||
- Ubuntu's `busybox-static` 1.30.1 doesn't include the applets
|
||||
|
||||
Production fix options (deferred to next phase):
|
||||
|
||||
- Build a more comprehensive static BusyBox (Alpine's, or upstream + custom config)
|
||||
- Ship statically-linked `blkid` and `findfs` from util-linux
|
||||
- Replace LABEL resolution with a sysfs walk that reads `/sys/class/block/*/holders`
|
||||
and `/dev/<n>` device numbers
|
||||
|
||||
### 3. AppArmor profiles fail to load
|
||||
|
||||
`apparmor_parser` errors on the containerd and kubelet profiles, probably
|
||||
because the parser binary or libraries copied from the build host don't
|
||||
match the rootfs's libc layout. Boot proceeds without AppArmor enforcement.
|
||||
Same fix path as #2 (better static binaries).
|
||||
|
||||
### 4. piCore64 BusyBox swap is a build-host dependency
|
||||
|
||||
`inject-kubesolo.sh` replaces piCore's `/bin/busybox` with the build host's
|
||||
`/bin/busybox` (Ubuntu's busybox-static package). That binary must exist on
|
||||
the build host or in the builder Docker image. Documented; works in CI
|
||||
because the Dockerfile installs busybox-static.
|
||||
|
||||
A more reproducible approach (future work): ship a known-good ARM64 BusyBox
|
||||
binary as a tracked artifact rather than depending on the host package.
|
||||
|
||||
### 5. busybox-static 1.30.1 has its own bugs
|
||||
|
||||
Even after the swap, some applets misbehave inside QEMU:
|
||||
|
||||
- `modprobe` triggers "stack smashing detected" abort (kernel modules still
|
||||
load via direct write to /sys/... in stage 30, so this isn't fatal)
|
||||
- `tr` doesn't parse POSIX character classes like `[:space:]` — already
|
||||
worked around by using explicit `' \t\r\n'` in our scripts
|
||||
- Missing applets: `blkid`, `findfs`, `--version`, etc.
|
||||
|
||||
These won't necessarily manifest on real hardware (different CPU, different
|
||||
glibc interaction) but they confirm that 1.30.1 isn't the right long-term
|
||||
BusyBox.
|
||||
|
||||
## What's needed to ship v0.3 ARM64 as production-ready
|
||||
|
||||
In order of priority:
|
||||
|
||||
1. **Validate on real ARM64 hardware** — boot the image on a Graviton EC2
|
||||
instance, Ampere VPS, RPi 5 (when hardware available), or any UEFI-capable
|
||||
ARM64 board. Confirm full KubeSolo bring-up: node Ready, pods schedule.
|
||||
2. **Fix LABEL=KSOLODATA resolution** — see option list in #2 above.
|
||||
3. **Replace busybox-static with a curated build** — see #4.
|
||||
4. **Add a Gitea workflow** that runs `make kernel-arm64 + disk-image-arm64`
|
||||
on the Odroid runner and the QEMU boot-test as a smoke test (with the
|
||||
expectation that KubeSolo doesn't finish first-boot under TCG).
|
||||
|
||||
## Files exercised by the Phase 3 work
|
||||
|
||||
| Path | Status |
|
||||
|------|--------|
|
||||
| `build/scripts/build-kernel-arm64.sh` | New — mainline 6.12.10 kernel build, native or cross |
|
||||
| `build/scripts/build-kernel-rpi.sh` | Renamed from old `build-kernel-arm64.sh` — RPi path |
|
||||
| `build/config/kernel-container.fragment` | Renamed from `rpi-kernel-config.fragment` |
|
||||
| `build/scripts/create-disk-image.sh` | Refactored — accepts `TARGET_ARCH=arm64` |
|
||||
| `build/grub/grub-arm64.cfg` | New — ARM64 console + `init=/sbin/init` |
|
||||
| `build/scripts/inject-kubesolo.sh` | Updated — BusyBox swap, `/init` install, variant routing |
|
||||
| `init/init.sh` | Updated — output to `/dev/console` for early-boot visibility |
|
||||
| `init/lib/30-kernel-modules.sh` | Fixed — `tr -d ' \t\r\n'` instead of `[:space:]` |
|
||||
| `init/lib/40-sysctl.sh` | Same fix |
|
||||
| `hack/dev-vm-arm64.sh` | Updated — `-cpu max`, UEFI `--disk` mode |
|
||||
| `test/qemu/test-boot-arm64-disk.sh` | New — CI test for UEFI boot |
|
||||
| `Makefile` | New targets: `kernel-arm64`, `kernel-rpi`, `disk-image-arm64`, `test-boot-arm64-disk`, `rootfs-arm64-rpi` |
|
||||
| `build/config/versions.env` | Pinned `MAINLINE_KERNEL_VERSION=6.12.10`, `KUBESOLO_VERSION=v1.1.0` |
|
||||
| `build/Dockerfile.builder` | Added `grub-efi-amd64-bin`, `grub-efi-arm64-bin`, `busybox-static` |
|
||||
@@ -26,6 +26,63 @@ Generic ubuntu jobs that don't care about arch fall through to whichever runner
|
||||
them up first; on the Odroid they run in Docker via the `ubuntu-latest` /
|
||||
`ubuntu-22.04` / `ubuntu-24.04` labels.
|
||||
|
||||
## Workflows in this repo
|
||||
|
||||
| Workflow file | Trigger | Where it runs | What it produces |
|
||||
|---|---|---|---|
|
||||
| `.gitea/workflows/ci.yaml` | push / PR to main | ubuntu-latest | Go tests, cross-arch binary build, shellcheck |
|
||||
| `.gitea/workflows/build-arm64.yaml` | push to main, manual | `arm64-linux` (Odroid) | ARM64 kernel + rootfs + disk image; uploads as workflow artifact only |
|
||||
| `.gitea/workflows/release.yaml` | tags `v*` | mix: ubuntu-latest + `arm64-linux` | Full release: x86 ISO + disk, ARM64 disk, Go binaries, SHA256SUMS — posted to Gitea Releases via API |
|
||||
|
||||
### Release workflow specifics
|
||||
|
||||
`release.yaml` is what fires when you `git push origin vX.Y.Z`. The pipeline:
|
||||
|
||||
1. **test** — `go test` cloud-init + update modules (ubuntu-latest).
|
||||
2. **build-binaries** — cross-compiles `kubesolo-cloudinit` and
|
||||
`kubesolo-update` for linux-amd64 + linux-arm64 with the version baked
|
||||
in via `-X main.version=…`.
|
||||
3. **build-iso-amd64** — runs `make iso disk-image` on ubuntu-latest;
|
||||
produces the x86_64 ISO and a `.img.xz` compressed disk image.
|
||||
4. **build-disk-arm64** — runs the same flow on the Odroid (`arm64-linux`
|
||||
label); produces `.arm64.img.xz`.
|
||||
5. **release** — downloads everything, computes `SHA256SUMS`, calls
|
||||
Gitea's `POST /api/v1/repos/<owner>/<repo>/releases` to create the
|
||||
release, then `POST .../releases/<id>/assets?name=…` once per asset.
|
||||
|
||||
Authentication uses Gitea's built-in `${{ secrets.GITHUB_TOKEN }}` — the
|
||||
runner auto-populates that secret with repo-write scope. If your runner
|
||||
is configured without that automatic token (e.g. an older `act_runner`),
|
||||
generate a personal access token with `repo:write` scope, add it as an
|
||||
org secret named `GITEA_TOKEN`, and swap the `TOKEN: ${{ secrets.GITHUB_TOKEN }}`
|
||||
line in `release.yaml` for `TOKEN: ${{ secrets.GITEA_TOKEN }}`.
|
||||
|
||||
### Why not the GitHub Marketplace release actions?
|
||||
|
||||
`release.yaml` used to call `softprops/action-gh-release@v2`. That action
|
||||
hard-codes calls to `api.github.com` instead of using `${{ github.api_url }}`
|
||||
(which Gitea sets to its own API). On Gitea's act_runner the action fails
|
||||
silently — the job reports green but no release is created. We replaced
|
||||
it with a direct `curl` so the behaviour is explicit and debuggable.
|
||||
|
||||
Similarly, `actions/upload-artifact@v4` and `actions/download-artifact@v4` are not
|
||||
fully implemented by act_runner v1.0.x. Pin to `@v3` until upstream
|
||||
support catches up.
|
||||
|
||||
### Manually re-running a release
|
||||
|
||||
Releases are immutable once published, but you can:
|
||||
|
||||
- **Delete and recreate the release** through the Gitea UI on the
|
||||
`releases/tag/vX.Y.Z` page, then push the tag again (Gitea reuses the
|
||||
existing tag), and re-trigger the workflow via the Actions UI.
|
||||
- **Trigger the build-arm64 workflow manually** for a one-off arm64
|
||||
artifact: Gitea UI → Actions → ARM64 Build → Run workflow.
|
||||
|
||||
Don't force-update a published tag — anyone who already fetched it (or
|
||||
downloaded an asset) sees a checksum mismatch. Prefer cutting a new patch
|
||||
release (vX.Y.Z+1) over rewriting a published one.
|
||||
|
||||
## Registering a new runner
|
||||
|
||||
### Prerequisites
|
||||
|
||||
181
docs/release-notes-0.3.0.md
Normal file
181
docs/release-notes-0.3.0.md
Normal file
@@ -0,0 +1,181 @@
|
||||
# KubeSolo OS v0.3.0 — Release Notes
|
||||
|
||||
**Released:** 2026-05-14
|
||||
|
||||
v0.3.0 is the second feature release after v0.2.0 and the first release that
|
||||
ships a generic ARM64 build alongside x86_64. The update agent grew up: it
|
||||
now has an explicit on-disk lifecycle, OCI registry distribution, and a
|
||||
fleet-friendly set of policy gates (channels, maintenance windows,
|
||||
version-stepping-stones, pre-flight checks, auto-rollback).
|
||||
|
||||
This document is the operator-facing summary. The full per-phase changelog
|
||||
lives in [CHANGELOG.md](../CHANGELOG.md).
|
||||
|
||||
## What's new
|
||||
|
||||
### Generic ARM64 build
|
||||
|
||||
The image you build with `make disk-image-arm64` now targets any UEFI-capable
|
||||
ARM64 host: AWS Graviton, Oracle Ampere, generic ARM64 servers, future SBCs
|
||||
with UEFI-compatible firmware. The kernel comes from kernel.org mainline LTS
|
||||
(6.12.10 by default, configurable via `MAINLINE_KERNEL_VERSION` in
|
||||
`build/config/versions.env`).
|
||||
|
||||
This is **distinct** from the Raspberry Pi build path. RPi keeps its
|
||||
specialised kernel from `raspberrypi/linux` with bcm-defconfig + custom DTBs;
|
||||
the generic ARM64 path uses mainline + arm64-defconfig + UEFI/virtio. See
|
||||
[docs/arm64-architecture.md](arm64-architecture.md) for the file-by-file
|
||||
split.
|
||||
|
||||
KubeSolo bumped to **v1.1.5** (was v1.1.0). New flags surfaced via cloud-init:
|
||||
- `kubesolo.full` — disable edge-optimised k8s overrides
|
||||
- `kubesolo.disable-ipv6` — disable IPv6 cluster-wide
|
||||
- `kubesolo.db-wal-repair` — recover from unclean shutdowns
|
||||
|
||||
### Update lifecycle is now observable
|
||||
|
||||
The update agent writes a `state.json` at `/var/lib/kubesolo/update/state.json`
|
||||
recording where the current attempt is in the lifecycle:
|
||||
|
||||
```
|
||||
idle → checking → downloading → staged → activated → verifying → success
|
||||
↘ rolled_back
|
||||
↘ failed
|
||||
```
|
||||
|
||||
`kubesolo-update status --json` emits the full state for orchestration tooling.
|
||||
The Prometheus metrics endpoint gains three new series:
|
||||
|
||||
- `kubesolo_update_phase{phase="..."}` — 1 for current phase, 0 for others (all 9 always emitted)
|
||||
- `kubesolo_update_attempts_total`
|
||||
- `kubesolo_update_last_attempt_timestamp_seconds`
|
||||
|
||||
### OCI registry distribution
|
||||
|
||||
Update artifacts can now be pulled from any OCI-compliant registry alongside
|
||||
the existing HTTP `latest.json` protocol:
|
||||
|
||||
```bash
|
||||
# HTTP, unchanged from v0.2:
|
||||
kubesolo-update apply --server https://updates.example.com
|
||||
|
||||
# New: OCI from ghcr.io (or quay.io, harbor, zot, ...)
|
||||
kubesolo-update apply --registry ghcr.io/yourorg/kubesolo-os --tag stable
|
||||
```
|
||||
|
||||
Multi-arch is handled transparently — the same `stable` tag points at a
|
||||
manifest index, the agent picks the manifest matching its `runtime.GOARCH`.
|
||||
|
||||
Publish your own artifacts with `build/scripts/push-oci-artifact.sh`. See
|
||||
the script's header comment for the full publishing flow.
|
||||
|
||||
### Policy gates
|
||||
|
||||
`apply` now enforces five gates before destroying the passive slot:
|
||||
|
||||
1. **Maintenance window** (configurable, e.g. `03:00-05:00`; wrapping
|
||||
midnight supported)
|
||||
2. **Node-block-label** — refuses if the K8s node carries
|
||||
`updates.kubesolo.io/block=true` (workload-author kill switch)
|
||||
3. **Channel** — `stable` / `beta` / `edge` must match between the artifact
|
||||
metadata and the local channel
|
||||
4. **Architecture** — refuses cross-arch artifacts via `runtime.GOARCH` check
|
||||
5. **Min compatible version** — stepping-stone enforcement; refuses an
|
||||
upgrade that bypasses a required intermediate version
|
||||
|
||||
`--force` bypasses the maintenance window and node-block label (channel /
|
||||
arch / min-version are non-negotiable). Failures are recorded in `state.json`
|
||||
with a clear `LastError` field.
|
||||
|
||||
### Healthcheck deepening + auto-rollback
|
||||
|
||||
`kubesolo-update healthcheck` grew three optional probes:
|
||||
|
||||
- **Kube-system pods** must hold Running for ≥ N seconds before passing
|
||||
- **Operator probe URL** — GET an operator-supplied endpoint; 200 = pass
|
||||
- **Disk smoke test** — write/fsync/read/delete a probe file under
|
||||
`/var/lib/kubesolo` to catch a wedged data partition
|
||||
|
||||
Plus auto-rollback: with `--auto-rollback-after N` (or `auto_rollback_after=`
|
||||
in `update.conf`), after N consecutive post-activation failures, the agent
|
||||
calls `ForceRollback()` and the operator/init is expected to reboot. The
|
||||
counter resets on a clean pass.
|
||||
|
||||
### Persistent configuration via `/etc/kubesolo/update.conf`
|
||||
|
||||
Cloud-init writes this file on first boot from a new `updates:` block; you
|
||||
can also hand-edit it. Recognised keys:
|
||||
|
||||
```
|
||||
server = https://updates.example.com # or omit if using registry
|
||||
registry = # OCI registry ref (alt to server)
|
||||
channel = stable
|
||||
maintenance_window = 03:00-05:00
|
||||
pubkey = /etc/kubesolo/update-pubkey.hex
|
||||
healthcheck_url = http://localhost:8000/ready
|
||||
auto_rollback_after = 3
|
||||
```
|
||||
|
||||
Cloud-init full reference at
|
||||
[cloud-init/examples/full-config.yaml](../cloud-init/examples/full-config.yaml).
|
||||
|
||||
## Migration from v0.2.x
|
||||
|
||||
This is a non-breaking release for live systems. v0.2.x → v0.3.0 changes:
|
||||
|
||||
- **`state.json` will appear** at `/var/lib/kubesolo/update/state.json` the
|
||||
first time a v0.3 agent runs `apply`. Pre-existing v0.2 deployments without
|
||||
this file are fine — the agent treats a missing file as fresh Idle state.
|
||||
- **`update.conf` is optional**. v0.2 deployments that pass everything via
|
||||
CLI flags keep working unchanged.
|
||||
- **HTTP `latest.json` protocol unchanged**. Existing update servers don't
|
||||
need a rebuild.
|
||||
- **GRUB env (boot counter, active slot)** unchanged. The bootloader's
|
||||
rollback behaviour is the same.
|
||||
- **No new mandatory kernel command-line parameters**.
|
||||
|
||||
To opt into the new lifecycle, transports, and gates, drop in an
|
||||
`update.conf` (or update cloud-init) and switch to `--registry` if you want
|
||||
OCI distribution.
|
||||
|
||||
## Known limitations
|
||||
|
||||
These shipped intentionally with v0.3.0 and are explicitly tracked for
|
||||
v0.3.1+:
|
||||
|
||||
- **OCI signature verification** — the OCI transport is digest-verified
|
||||
end-to-end via oras-go, but does not yet consume cosign-style referrer
|
||||
attestations. The HTTP transport still honours `--pubkey` for `.sig`
|
||||
files.
|
||||
- **ARM64 LABEL=KSOLODATA** resolution doesn't work yet — piCore's
|
||||
`blkid`/`findfs` crash on QEMU virt under our mainline kernel; the
|
||||
static `busybox-static` we ship doesn't include those applets.
|
||||
`build/grub/grub-arm64.cfg` hardcodes `kubesolo.data=/dev/vda4` as a
|
||||
workaround. On real ARM64 hardware the device path may differ.
|
||||
- **Real-hardware ARM64 validation** is pending. The image builds and
|
||||
boots end-to-end under QEMU virt; production certification waits on a
|
||||
Graviton / Ampere run.
|
||||
- **AppArmor profile load fails on ARM64** (`apparmor_parser` ABI mismatch).
|
||||
Init reports the failure; boot continues without AppArmor enforcement.
|
||||
- **QEMU TCG performance** can trigger KubeSolo's first-boot image-import
|
||||
deadline. Not an OS defect; real hardware and KVM-accelerated QEMU
|
||||
complete the import in seconds.
|
||||
|
||||
## How to upgrade your build host
|
||||
|
||||
```bash
|
||||
git pull
|
||||
make distclean # optional — drops the build cache; full rebuild takes ~30 min
|
||||
make iso # or disk-image, or disk-image-arm64
|
||||
```
|
||||
|
||||
The Docker-based builder (`make docker-build`) regenerates its own image
|
||||
from `build/Dockerfile.builder` on next invocation; oras 1.2.3 and
|
||||
busybox-static are now included.
|
||||
|
||||
## Acknowledgements
|
||||
|
||||
v0.3.0 work was driven by a single multi-week pair-programming session
|
||||
working through Phases 0–9 of the v0.3 roadmap. The Odroid self-hosted
|
||||
Gitea Actions runner (`odroid.local`, arm64-linux) carried every ARM64
|
||||
build during development.
|
||||
@@ -120,16 +120,20 @@ if [ "$MODE" = "disk" ]; then
|
||||
echo " Press Ctrl+A X to exit QEMU"
|
||||
echo ""
|
||||
|
||||
# -cpu max enables all emulated ARMv8 features (atomics, crypto, fp16).
|
||||
# piCore64's BusyBox is built with -march=armv8-a+crypto+lse and segfaults
|
||||
# under -cpu cortex-a72 because some required extensions aren't on by
|
||||
# default in that model.
|
||||
qemu-system-aarch64 \
|
||||
-machine virt \
|
||||
-cpu cortex-a72 \
|
||||
-cpu max \
|
||||
-m 2048 \
|
||||
-smp 2 \
|
||||
-nographic \
|
||||
-bios "$UEFI_FW" \
|
||||
-drive "file=$DISK_IMAGE,format=raw,if=virtio,media=disk" \
|
||||
-net "nic,model=virtio" \
|
||||
-net "user,hostfwd=tcp::6443-:6443,hostfwd=tcp::2222-:22"
|
||||
-net "user,hostfwd=tcp::6443-:6443,hostfwd=tcp::2222-:22,hostfwd=tcp::8080-:8080"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
@@ -186,7 +190,7 @@ echo ""
|
||||
|
||||
qemu-system-aarch64 \
|
||||
-machine virt \
|
||||
-cpu cortex-a72 \
|
||||
-cpu max \
|
||||
-m 2048 \
|
||||
-smp 2 \
|
||||
-nographic \
|
||||
@@ -195,4 +199,4 @@ qemu-system-aarch64 \
|
||||
-append "console=ttyAMA0 kubesolo.data=/dev/vda kubesolo.debug $EXTRA_APPEND" \
|
||||
-drive "file=$DATA_DISK,format=raw,if=virtio" \
|
||||
-net "nic,model=virtio" \
|
||||
-net "user,hostfwd=tcp::6443-:6443,hostfwd=tcp::2222-:22"
|
||||
-net "user,hostfwd=tcp::6443-:6443,hostfwd=tcp::2222-:22,hostfwd=tcp::8080-:8080"
|
||||
|
||||
@@ -14,6 +14,11 @@
|
||||
# kubesolo.cloudinit=<path> Path to cloud-init config
|
||||
# kubesolo.flags=<flags> Extra flags for KubeSolo binary
|
||||
|
||||
# Route early boot output to /dev/console — before switch_root the kernel may
|
||||
# not have a controlling tty, and some stages echo to stderr expecting it to
|
||||
# reach the serial console. This is a no-op once the staged init proper starts.
|
||||
exec >/dev/console 2>&1
|
||||
|
||||
set -e
|
||||
|
||||
# --- Switch root: escape initramfs so runc pivot_root works ---
|
||||
|
||||
@@ -16,7 +16,11 @@ while IFS= read -r mod; do
|
||||
case "$mod" in
|
||||
'#'*|'') continue ;;
|
||||
esac
|
||||
mod="$(echo "$mod" | tr -d '[:space:]')"
|
||||
# NOTE: do NOT use tr -d '[:space:]' — Ubuntu's busybox-static 1.30.1 (used
|
||||
# in the ARM64 rootfs override) doesn't parse POSIX char classes and treats
|
||||
# them as a literal set, deleting [, :, s, p, a, c, e, ]. Use explicit
|
||||
# whitespace chars instead so the same script works under any tr.
|
||||
mod="$(printf '%s' "$mod" | tr -d ' \t\r\n')"
|
||||
if modprobe "$mod" 2>/dev/null; then
|
||||
LOADED=$((LOADED + 1))
|
||||
else
|
||||
|
||||
@@ -8,8 +8,11 @@ for conf in /etc/sysctl.d/*.conf; do
|
||||
case "$key" in
|
||||
'#'*|'') continue ;;
|
||||
esac
|
||||
key="$(echo "$key" | tr -d '[:space:]')"
|
||||
value="$(echo "$value" | tr -d '[:space:]')"
|
||||
# NOTE: do NOT use tr -d '[:space:]' — see 30-kernel-modules.sh for the
|
||||
# rationale. Use explicit whitespace chars so this works under
|
||||
# Ubuntu's busybox-static tr too.
|
||||
key="$(printf '%s' "$key" | tr -d ' \t\r\n')"
|
||||
value="$(printf '%s' "$value" | tr -d ' \t\r\n')"
|
||||
if [ -n "$key" ] && [ -n "$value" ]; then
|
||||
sysctl -w "${key}=${value}" >/dev/null 2>&1 || \
|
||||
log_warn "Failed to set sysctl: ${key}=${value}"
|
||||
|
||||
@@ -76,6 +76,29 @@ while [ ! -f "$KUBECONFIG_PATH" ] && [ $WAIT -lt 120 ]; do
|
||||
fi
|
||||
done
|
||||
|
||||
# Render the access banner. Written to /etc/motd so it's visible to anyone
|
||||
# who later shells in (SSH extension, emergency shell, console login), and
|
||||
# printed unconditionally to console below so the user sees it even when
|
||||
# KubeSolo hasn't yet finished generating the kubeconfig.
|
||||
ACCESS_BANNER="$(cat <<'BANNER'
|
||||
============================================================
|
||||
KubeSolo OS — host access
|
||||
|
||||
From your host machine, run:
|
||||
|
||||
curl -s http://localhost:8080 > ~/.kube/kubesolo-config
|
||||
kubectl --kubeconfig ~/.kube/kubesolo-config get nodes
|
||||
|
||||
Notes:
|
||||
- port 8080 serves the kubeconfig (admin) over HTTP
|
||||
- port 6443 serves the Kubernetes API (HTTPS)
|
||||
- Both ports are forwarded under QEMU's `-net user,hostfwd=…` config
|
||||
|
||||
============================================================
|
||||
BANNER
|
||||
)"
|
||||
printf '%s\n' "$ACCESS_BANNER" > /etc/motd 2>/dev/null || true
|
||||
|
||||
if [ -f "$KUBECONFIG_PATH" ]; then
|
||||
log_ok "KubeSolo is running (PID $KUBESOLO_PID)"
|
||||
|
||||
@@ -95,18 +118,17 @@ if [ -f "$KUBECONFIG_PATH" ]; then
|
||||
done) &
|
||||
|
||||
log_ok "Kubeconfig available via HTTP on port 8080"
|
||||
echo ""
|
||||
echo "============================================================"
|
||||
echo " From your host machine, run:"
|
||||
echo ""
|
||||
echo " curl -s http://localhost:8080 > ~/.kube/kubesolo-config"
|
||||
echo " kubectl --kubeconfig ~/.kube/kubesolo-config get nodes"
|
||||
echo "============================================================"
|
||||
echo ""
|
||||
else
|
||||
log_warn "Kubeconfig not found after ${WAIT}s — KubeSolo may still be starting"
|
||||
log_warn "Check manually: cat $KUBECONFIG_PATH"
|
||||
fi
|
||||
|
||||
# Show the banner regardless of kubeconfig state: the HTTP server above only
|
||||
# starts on success, but printing the instructions during the long first-boot
|
||||
# wait is useful and harmless (user retries the curl until it 200s).
|
||||
echo ""
|
||||
printf '%s\n' "$ACCESS_BANNER"
|
||||
echo ""
|
||||
|
||||
# Keep init alive — wait on KubeSolo process
|
||||
wait $KUBESOLO_PID
|
||||
|
||||
@@ -3,23 +3,35 @@ package cmd
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
|
||||
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||
)
|
||||
|
||||
// Activate switches the boot target to the passive partition.
|
||||
// After activation, the next reboot will boot from the new partition
|
||||
// with boot_counter=3. If health checks fail 3 times, GRUB auto-rolls back.
|
||||
//
|
||||
// State transition: Staged → Activated. On failure → Failed.
|
||||
func Activate(args []string) error {
|
||||
opts := parseOpts(args)
|
||||
env := opts.NewBootEnv()
|
||||
|
||||
st, err := state.Load(opts.StatePath)
|
||||
if err != nil {
|
||||
slog.Warn("state file unreadable, starting fresh", "error", err)
|
||||
st = state.New()
|
||||
}
|
||||
|
||||
// Get passive slot (the one we want to boot into)
|
||||
passiveSlot, err := env.PassiveSlot()
|
||||
if err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("reading passive slot: %w", err))
|
||||
return fmt.Errorf("reading passive slot: %w", err)
|
||||
}
|
||||
|
||||
activeSlot, err := env.ActiveSlot()
|
||||
if err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("reading active slot: %w", err))
|
||||
return fmt.Errorf("reading active slot: %w", err)
|
||||
}
|
||||
|
||||
@@ -27,9 +39,14 @@ func Activate(args []string) error {
|
||||
|
||||
// Set the passive slot as active with fresh boot counter
|
||||
if err := env.ActivateSlot(passiveSlot); err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("activating slot %s: %w", passiveSlot, err))
|
||||
return fmt.Errorf("activating slot %s: %w", passiveSlot, err)
|
||||
}
|
||||
|
||||
if err := st.Transition(opts.StatePath, state.PhaseActivated, "", ""); err != nil {
|
||||
slog.Warn("state transition failed", "phase", state.PhaseActivated, "error", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Slot %s activated (was %s)\n", passiveSlot, activeSlot)
|
||||
fmt.Println("Boot counter set to 3. Reboot to start the new version.")
|
||||
fmt.Println("The system will automatically roll back if health checks fail 3 times.")
|
||||
|
||||
@@ -1,73 +1,240 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"runtime"
|
||||
"time"
|
||||
|
||||
"github.com/portainer/kubesolo-os/update/pkg/config"
|
||||
"github.com/portainer/kubesolo-os/update/pkg/health"
|
||||
"github.com/portainer/kubesolo-os/update/pkg/image"
|
||||
"github.com/portainer/kubesolo-os/update/pkg/oci"
|
||||
"github.com/portainer/kubesolo-os/update/pkg/partition"
|
||||
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||
)
|
||||
|
||||
// applyMetadataGates enforces channel / architecture / min-version policy on
|
||||
// resolved update metadata, regardless of transport (HTTP or OCI). Records
|
||||
// any failure to the state file before returning.
|
||||
func applyMetadataGates(opts opts, st *state.UpdateState, meta *image.UpdateMetadata) error {
|
||||
if meta.Channel != "" && meta.Channel != opts.Channel {
|
||||
err := fmt.Errorf("metadata channel %q does not match local channel %q",
|
||||
meta.Channel, opts.Channel)
|
||||
_ = st.RecordError(opts.StatePath, err)
|
||||
return err
|
||||
}
|
||||
if meta.Architecture != "" && meta.Architecture != runtime.GOARCH {
|
||||
err := fmt.Errorf("metadata architecture %q does not match runtime %q",
|
||||
meta.Architecture, runtime.GOARCH)
|
||||
_ = st.RecordError(opts.StatePath, err)
|
||||
return err
|
||||
}
|
||||
if meta.MinCompatibleVersion != "" && st.FromVersion != "" {
|
||||
cmp, cerr := config.CompareVersions(st.FromVersion, meta.MinCompatibleVersion)
|
||||
if cerr != nil {
|
||||
slog.Warn("min-version comparison failed", "error", cerr,
|
||||
"from", st.FromVersion, "min", meta.MinCompatibleVersion)
|
||||
} else if cmp < 0 {
|
||||
err := fmt.Errorf("current version %s is below min_compatible_version %s; install %s first",
|
||||
st.FromVersion, meta.MinCompatibleVersion, meta.MinCompatibleVersion)
|
||||
_ = st.RecordError(opts.StatePath, err)
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Apply downloads a new OS image and writes it to the passive partition.
|
||||
// It does NOT activate the new partition — use 'activate' for that.
|
||||
//
|
||||
// State transitions: Idle/Success/Failed → Checking → Downloading → Staged.
|
||||
// On any error the state moves to Failed with LastError set.
|
||||
func Apply(args []string) error {
|
||||
opts := parseOpts(args)
|
||||
|
||||
if opts.ServerURL == "" {
|
||||
return fmt.Errorf("--server is required")
|
||||
if opts.ServerURL == "" && opts.Registry == "" {
|
||||
return fmt.Errorf("--server or --registry is required (or set in /etc/kubesolo/update.conf)")
|
||||
}
|
||||
if opts.ServerURL != "" && opts.Registry != "" {
|
||||
return fmt.Errorf("--server and --registry are mutually exclusive")
|
||||
}
|
||||
|
||||
// Maintenance window gate — earliest cheap check, before any HTTP work.
|
||||
// Skipped with --force.
|
||||
window, werr := config.ParseWindow(opts.MaintenanceWindow)
|
||||
if werr != nil {
|
||||
return fmt.Errorf("parse maintenance_window: %w", werr)
|
||||
}
|
||||
if !opts.Force && !window.Contains(time.Now()) {
|
||||
return fmt.Errorf("outside maintenance window (%s); pass --force to override",
|
||||
window.String())
|
||||
}
|
||||
|
||||
// Node-block-label gate — workload authors can defer an update by
|
||||
// labeling the node updates.kubesolo.io/block=true. Skipped with --force
|
||||
// and silently bypassed when the K8s API isn't reachable (air-gap).
|
||||
if !opts.Force {
|
||||
blocked, berr := health.CheckNodeBlocked("")
|
||||
if berr != nil {
|
||||
slog.Warn("node-block check failed, allowing update", "error", berr)
|
||||
} else if blocked {
|
||||
return fmt.Errorf("node carries label %s=true; refusing update (pass --force to override)",
|
||||
health.NodeBlockLabel)
|
||||
}
|
||||
}
|
||||
|
||||
st, err := state.Load(opts.StatePath)
|
||||
if err != nil {
|
||||
// Don't block the operation on a corrupt state file. Log + recover.
|
||||
slog.Warn("state file unreadable, starting fresh", "error", err)
|
||||
st = state.New()
|
||||
}
|
||||
|
||||
env := opts.NewBootEnv()
|
||||
|
||||
// Record the current running version as the "from" reference. The active
|
||||
// slot's version file is the most reliable source.
|
||||
activeSlot, slotErr := env.ActiveSlot()
|
||||
if slotErr == nil {
|
||||
if partInfo, perr := partition.GetSlotPartition(activeSlot); perr == nil {
|
||||
mp := "/tmp/kubesolo-active-" + activeSlot
|
||||
if merr := partition.MountReadOnly(partInfo.Device, mp); merr == nil {
|
||||
if v, rerr := partition.ReadVersion(mp); rerr == nil {
|
||||
st.SetFromVersion(v)
|
||||
}
|
||||
partition.Unmount(mp)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Determine passive slot
|
||||
passiveSlot, err := env.PassiveSlot()
|
||||
if err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("reading passive slot: %w", err))
|
||||
return fmt.Errorf("reading passive slot: %w", err)
|
||||
}
|
||||
|
||||
slog.Info("applying update", "target_slot", passiveSlot)
|
||||
|
||||
// Check for update
|
||||
stageDir := "/tmp/kubesolo-update-stage"
|
||||
|
||||
if err := st.Transition(opts.StatePath, state.PhaseChecking, "", ""); err != nil {
|
||||
slog.Warn("state transition failed", "phase", state.PhaseChecking, "error", err)
|
||||
}
|
||||
|
||||
// Resolve metadata via the configured transport. OCI registry mode pulls
|
||||
// the manifest only; HTTP mode hits latest.json.
|
||||
var (
|
||||
meta *image.UpdateMetadata
|
||||
staged *image.StagedImage
|
||||
)
|
||||
if opts.Registry != "" {
|
||||
ociClient, err := oci.NewClient(opts.Registry)
|
||||
if err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("oci client: %w", err))
|
||||
return fmt.Errorf("oci client: %w", err)
|
||||
}
|
||||
tag := opts.Tag
|
||||
if tag == "" {
|
||||
tag = opts.Channel
|
||||
}
|
||||
if tag == "" {
|
||||
tag = "stable"
|
||||
}
|
||||
meta, err = ociClient.FetchMetadata(context.Background(), tag)
|
||||
if err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("oci fetch metadata: %w", err))
|
||||
return fmt.Errorf("oci fetch metadata: %w", err)
|
||||
}
|
||||
if err := applyMetadataGates(opts, st, meta); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := st.Transition(opts.StatePath, state.PhaseDownloading, meta.Version, ""); err != nil {
|
||||
slog.Warn("state transition failed", "phase", state.PhaseDownloading, "error", err)
|
||||
}
|
||||
staged, _, err = ociClient.Pull(context.Background(), tag, stageDir)
|
||||
if err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("oci pull: %w", err))
|
||||
return fmt.Errorf("oci pull: %w", err)
|
||||
}
|
||||
} else {
|
||||
client := image.NewClient(opts.ServerURL, stageDir)
|
||||
defer client.Cleanup()
|
||||
|
||||
// Enable signature verification if public key is configured
|
||||
if opts.PubKeyPath != "" {
|
||||
client.SetPublicKeyPath(opts.PubKeyPath)
|
||||
slog.Info("signature verification enabled", "pubkey", opts.PubKeyPath)
|
||||
}
|
||||
|
||||
meta, err := client.CheckForUpdate()
|
||||
var err error
|
||||
meta, err = client.CheckForUpdate()
|
||||
if err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("checking for update: %w", err))
|
||||
return fmt.Errorf("checking for update: %w", err)
|
||||
}
|
||||
|
||||
slog.Info("update available", "version", meta.Version)
|
||||
|
||||
// Download and verify
|
||||
staged, err := client.Download(meta)
|
||||
if err := applyMetadataGates(opts, st, meta); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := st.Transition(opts.StatePath, state.PhaseDownloading, meta.Version, ""); err != nil {
|
||||
slog.Warn("state transition failed", "phase", state.PhaseDownloading, "error", err)
|
||||
}
|
||||
staged, err = client.Download(meta)
|
||||
if err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("downloading update: %w", err))
|
||||
return fmt.Errorf("downloading update: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
slog.Info("update available", "version", meta.Version, "channel", meta.Channel, "arch", meta.Architecture)
|
||||
|
||||
// Mount passive partition
|
||||
partInfo, err := partition.GetSlotPartition(passiveSlot)
|
||||
if err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("finding passive partition: %w", err))
|
||||
return fmt.Errorf("finding passive partition: %w", err)
|
||||
}
|
||||
|
||||
mountPoint := "/tmp/kubesolo-passive-" + passiveSlot
|
||||
if err := partition.MountReadWrite(partInfo.Device, mountPoint); err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("mounting passive partition: %w", err))
|
||||
return fmt.Errorf("mounting passive partition: %w", err)
|
||||
}
|
||||
defer partition.Unmount(mountPoint)
|
||||
|
||||
// Free-space pre-write check: the passive partition must have at least
|
||||
// (kernel + initramfs) + 10% headroom. Catches corrupted-FS reports and
|
||||
// shrunk/wrong-size partitions before we destroy the existing slot data.
|
||||
var imgSize int64
|
||||
for _, p := range []string{staged.VmlinuzPath, staged.InitramfsPath} {
|
||||
fi, ferr := os.Stat(p)
|
||||
if ferr != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("stat staged file %s: %w", p, ferr))
|
||||
return fmt.Errorf("stat staged file %s: %w", p, ferr)
|
||||
}
|
||||
imgSize += fi.Size()
|
||||
}
|
||||
avail, ok, ferr := partition.HasFreeSpaceFor(mountPoint, imgSize, 10)
|
||||
if ferr != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("free-space check: %w", ferr))
|
||||
return fmt.Errorf("free-space check: %w", ferr)
|
||||
}
|
||||
if !ok {
|
||||
err := fmt.Errorf("insufficient space on %s: have %.1f MiB, need %.1f MiB (image + 10%% headroom)",
|
||||
passiveSlot, float64(avail)/(1<<20), float64(imgSize)*1.1/(1<<20))
|
||||
_ = st.RecordError(opts.StatePath, err)
|
||||
return err
|
||||
}
|
||||
|
||||
// Write image to passive partition
|
||||
if err := partition.WriteSystemImage(mountPoint, staged.VmlinuzPath, staged.InitramfsPath, staged.Version); err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("writing system image: %w", err))
|
||||
return fmt.Errorf("writing system image: %w", err)
|
||||
}
|
||||
|
||||
if err := st.Transition(opts.StatePath, state.PhaseStaged, staged.Version, ""); err != nil {
|
||||
slog.Warn("state transition failed", "phase", state.PhaseStaged, "error", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Update v%s written to slot %s (%s)\n", staged.Version, passiveSlot, partInfo.Device)
|
||||
fmt.Println("Run 'kubesolo-update activate' to boot into the new version")
|
||||
|
||||
|
||||
@@ -6,16 +6,32 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/portainer/kubesolo-os/update/pkg/health"
|
||||
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||
)
|
||||
|
||||
// Healthcheck performs post-boot health verification.
|
||||
// If all checks pass, it marks the boot as successful in GRUB.
|
||||
// This should be run after every boot (typically via a systemd unit or
|
||||
// init script) to confirm the system is healthy.
|
||||
//
|
||||
// State transition: Activated → Verifying → Success on pass, → Failed on fail.
|
||||
// If state isn't in Activated (e.g. manual run on a long-stable system), the
|
||||
// state file is left alone — healthcheck still does its job.
|
||||
//
|
||||
// When --auto-rollback-after N is set, consecutive post-Activated failures
|
||||
// are counted in state.HealthCheckFailures. On the Nth failure, the agent
|
||||
// calls Rollback() and the operator is expected to reboot (this command
|
||||
// does not reboot the host — that's policy left to systemd/init).
|
||||
func Healthcheck(args []string) error {
|
||||
opts := parseOpts(args)
|
||||
env := opts.NewBootEnv()
|
||||
|
||||
st, err := state.Load(opts.StatePath)
|
||||
if err != nil {
|
||||
slog.Warn("state file unreadable, starting fresh", "error", err)
|
||||
st = state.New()
|
||||
}
|
||||
|
||||
// Check if already marked successful
|
||||
success, err := env.BootSuccess()
|
||||
if err != nil {
|
||||
@@ -26,30 +42,94 @@ func Healthcheck(args []string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Only transition state if we're post-activation. Manual healthcheck on a
|
||||
// long-stable system shouldn't reset Idle → Verifying.
|
||||
postActivation := st.Phase == state.PhaseActivated
|
||||
if postActivation {
|
||||
if err := st.Transition(opts.StatePath, state.PhaseVerifying, "", ""); err != nil {
|
||||
slog.Warn("state transition failed", "phase", state.PhaseVerifying, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
timeout := time.Duration(opts.TimeoutSecs) * time.Second
|
||||
checker := health.NewChecker("", "", timeout)
|
||||
checker.ProbeURL = opts.HealthcheckURL
|
||||
if opts.KubeSystemSettle > 0 {
|
||||
checker.KubeSystemSettle = time.Duration(opts.KubeSystemSettle) * time.Second
|
||||
}
|
||||
// Probe the data partition every healthcheck so a wedged disk fails fast.
|
||||
checker.DataDir = "/var/lib/kubesolo"
|
||||
|
||||
slog.Info("running post-boot health checks", "timeout", timeout)
|
||||
slog.Info("running post-boot health checks",
|
||||
"timeout", timeout,
|
||||
"probe_url", checker.ProbeURL,
|
||||
"kube_system_settle", checker.KubeSystemSettle)
|
||||
|
||||
status, err := checker.WaitForHealthy()
|
||||
if err != nil {
|
||||
fmt.Printf("Health check FAILED: %s\n", status.Message)
|
||||
fmt.Printf(" containerd: %v\n", status.Containerd)
|
||||
fmt.Printf(" apiserver: %v\n", status.APIServer)
|
||||
fmt.Printf(" node_ready: %v\n", status.NodeReady)
|
||||
printStatusBreakdown(status)
|
||||
fmt.Println("\nBoot NOT marked successful — system may roll back on next reboot")
|
||||
|
||||
if postActivation {
|
||||
st.HealthCheckFailures++
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("post-boot health check failed: %s", status.Message))
|
||||
|
||||
// Auto-rollback escalation. Only trigger when post-Activated;
|
||||
// don't second-guess a healthy long-running system.
|
||||
if opts.AutoRollbackAfter > 0 && st.HealthCheckFailures >= opts.AutoRollbackAfter {
|
||||
slog.Warn("auto-rollback threshold reached",
|
||||
"failures", st.HealthCheckFailures,
|
||||
"threshold", opts.AutoRollbackAfter)
|
||||
if rerr := env.ForceRollback(); rerr != nil {
|
||||
slog.Error("auto-rollback failed", "error", rerr)
|
||||
return err // return the original healthcheck error
|
||||
}
|
||||
if terr := st.Transition(opts.StatePath, state.PhaseRolledBack, "",
|
||||
fmt.Sprintf("auto-rollback after %d healthcheck failures", st.HealthCheckFailures)); terr != nil {
|
||||
slog.Warn("state transition failed", "phase", state.PhaseRolledBack, "error", terr)
|
||||
}
|
||||
fmt.Println("\nAuto-rollback triggered. Reboot to complete the rollback.")
|
||||
}
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// Mark boot as successful
|
||||
if err := env.MarkBootSuccess(); err != nil {
|
||||
if postActivation {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("marking boot success: %w", err))
|
||||
}
|
||||
return fmt.Errorf("marking boot success: %w", err)
|
||||
}
|
||||
|
||||
if postActivation {
|
||||
// Reset failure counter on a clean pass.
|
||||
st.HealthCheckFailures = 0
|
||||
if err := st.Transition(opts.StatePath, state.PhaseSuccess, "", ""); err != nil {
|
||||
slog.Warn("state transition failed", "phase", state.PhaseSuccess, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Println("Health check PASSED — boot marked successful")
|
||||
fmt.Printf(" containerd: %v\n", status.Containerd)
|
||||
fmt.Printf(" apiserver: %v\n", status.APIServer)
|
||||
fmt.Printf(" node_ready: %v\n", status.NodeReady)
|
||||
printStatusBreakdown(status)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// printStatusBreakdown emits a human-readable per-check summary. Only emits
|
||||
// optional check lines when they actually ran.
|
||||
func printStatusBreakdown(s *health.Status) {
|
||||
fmt.Printf(" containerd: %v\n", s.Containerd)
|
||||
fmt.Printf(" apiserver: %v\n", s.APIServer)
|
||||
fmt.Printf(" node_ready: %v\n", s.NodeReady)
|
||||
if !s.KubeSystemReady {
|
||||
fmt.Printf(" kube-system pods: %v\n", s.KubeSystemReady)
|
||||
}
|
||||
if !s.ProbeURL {
|
||||
fmt.Printf(" probe URL: %v\n", s.ProbeURL)
|
||||
}
|
||||
if !s.DiskWritable {
|
||||
fmt.Printf(" disk writable: %v\n", s.DiskWritable)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"fmt"
|
||||
|
||||
"github.com/portainer/kubesolo-os/update/pkg/metrics"
|
||||
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||
)
|
||||
|
||||
// Metrics starts the Prometheus-compatible metrics HTTP server.
|
||||
@@ -12,10 +13,12 @@ func Metrics(args []string) error {
|
||||
fs := flag.NewFlagSet("metrics", flag.ExitOnError)
|
||||
listenAddr := fs.String("listen", ":9100", "Metrics HTTP listen address")
|
||||
grubenvPath := fs.String("grubenv", "/boot/grub/grubenv", "Path to grubenv file")
|
||||
statePath := fs.String("state", state.DefaultPath, "Path to update state.json")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return fmt.Errorf("parse flags: %w", err)
|
||||
}
|
||||
|
||||
srv := metrics.NewServer(*listenAddr, *grubenvPath)
|
||||
srv.SetStatePath(*statePath)
|
||||
return srv.ListenAndServe()
|
||||
}
|
||||
|
||||
@@ -1,17 +1,32 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
|
||||
"github.com/portainer/kubesolo-os/update/pkg/bootenv"
|
||||
"github.com/portainer/kubesolo-os/update/pkg/config"
|
||||
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||
)
|
||||
|
||||
// opts holds shared command-line options for all subcommands.
|
||||
type opts struct {
|
||||
ServerURL string
|
||||
Registry string // OCI registry ref (e.g. ghcr.io/foo/kubesolo-os). Mutually exclusive with ServerURL.
|
||||
Tag string // OCI tag to pull (default: equal to Channel, falling back to "stable")
|
||||
GrubenvPath string
|
||||
TimeoutSecs int
|
||||
PubKeyPath string
|
||||
BootEnvType string // "grub" or "rpi"
|
||||
BootEnvPath string // path for RPi boot control dir
|
||||
StatePath string // location of state.json (default: state.DefaultPath)
|
||||
ConfPath string // location of update.conf (default: config.DefaultPath)
|
||||
Channel string // update channel ("stable" by default)
|
||||
MaintenanceWindow string // "HH:MM-HH:MM" or empty for always-allow
|
||||
HealthcheckURL string // optional GET probe for healthcheck
|
||||
AutoRollbackAfter int // healthcheck: rollback after N consecutive failures (0=off)
|
||||
KubeSystemSettle int // healthcheck: kube-system pods must be Running for N seconds (0=disabled)
|
||||
Force bool // bypass maintenance window
|
||||
JSON bool // status: emit JSON instead of human-readable
|
||||
}
|
||||
|
||||
// NewBootEnv creates a BootEnv from the parsed options.
|
||||
@@ -25,21 +40,129 @@ func (o opts) NewBootEnv() bootenv.BootEnv {
|
||||
}
|
||||
|
||||
// parseOpts extracts command-line flags from args.
|
||||
// Simple parser — no external dependencies.
|
||||
//
|
||||
// Precedence: explicit CLI flags > /etc/kubesolo/update.conf > package
|
||||
// defaults. The config file is loaded first so any CLI flag overrides it.
|
||||
//
|
||||
// Unknown flags are ignored (forward-compat).
|
||||
func parseOpts(args []string) opts {
|
||||
o := opts{
|
||||
GrubenvPath: "/boot/grub/grubenv",
|
||||
TimeoutSecs: 120,
|
||||
BootEnvType: "grub",
|
||||
StatePath: state.DefaultPath,
|
||||
ConfPath: config.DefaultPath,
|
||||
Channel: "stable",
|
||||
}
|
||||
|
||||
// First pass: pick up --conf so it can point at a different file before
|
||||
// we load. (Tests pass --conf <tempdir>/update.conf.)
|
||||
for i := 0; i < len(args); i++ {
|
||||
if args[i] == "--conf" && i+1 < len(args) {
|
||||
o.ConfPath = args[i+1]
|
||||
}
|
||||
}
|
||||
|
||||
// Load config file. Missing file is fine (fresh system, no cloud-init yet).
|
||||
if cfg, err := config.Load(o.ConfPath); err == nil && cfg != nil {
|
||||
if cfg.Server != "" {
|
||||
o.ServerURL = cfg.Server
|
||||
}
|
||||
if cfg.Channel != "" {
|
||||
o.Channel = cfg.Channel
|
||||
}
|
||||
if cfg.MaintenanceWindow != "" {
|
||||
o.MaintenanceWindow = cfg.MaintenanceWindow
|
||||
}
|
||||
if cfg.PubKey != "" {
|
||||
o.PubKeyPath = cfg.PubKey
|
||||
}
|
||||
if cfg.HealthcheckURL != "" {
|
||||
o.HealthcheckURL = cfg.HealthcheckURL
|
||||
}
|
||||
if cfg.AutoRollbackAfter > 0 {
|
||||
o.AutoRollbackAfter = cfg.AutoRollbackAfter
|
||||
}
|
||||
} else if err != nil {
|
||||
slog.Warn("could not load update.conf", "path", o.ConfPath, "error", err)
|
||||
}
|
||||
|
||||
// Second pass: CLI overrides config file values.
|
||||
for i := 0; i < len(args); i++ {
|
||||
switch args[i] {
|
||||
case "--conf":
|
||||
i++ // already handled above
|
||||
case "--state":
|
||||
if i+1 < len(args) {
|
||||
o.StatePath = args[i+1]
|
||||
i++
|
||||
}
|
||||
case "--channel":
|
||||
if i+1 < len(args) {
|
||||
o.Channel = args[i+1]
|
||||
i++
|
||||
}
|
||||
case "--maintenance-window":
|
||||
if i+1 < len(args) {
|
||||
o.MaintenanceWindow = args[i+1]
|
||||
i++
|
||||
}
|
||||
case "--force":
|
||||
o.Force = true
|
||||
case "--healthcheck-url":
|
||||
if i+1 < len(args) {
|
||||
o.HealthcheckURL = args[i+1]
|
||||
i++
|
||||
}
|
||||
case "--auto-rollback-after":
|
||||
if i+1 < len(args) {
|
||||
n := 0
|
||||
for _, ch := range args[i+1] {
|
||||
if ch >= '0' && ch <= '9' {
|
||||
n = n*10 + int(ch-'0')
|
||||
} else {
|
||||
n = 0
|
||||
break
|
||||
}
|
||||
}
|
||||
if n > 0 {
|
||||
o.AutoRollbackAfter = n
|
||||
}
|
||||
i++
|
||||
}
|
||||
case "--kube-system-settle":
|
||||
if i+1 < len(args) {
|
||||
n := 0
|
||||
for _, ch := range args[i+1] {
|
||||
if ch >= '0' && ch <= '9' {
|
||||
n = n*10 + int(ch-'0')
|
||||
} else {
|
||||
n = 0
|
||||
break
|
||||
}
|
||||
}
|
||||
if n > 0 {
|
||||
o.KubeSystemSettle = n
|
||||
}
|
||||
i++
|
||||
}
|
||||
case "--json":
|
||||
o.JSON = true
|
||||
case "--server":
|
||||
if i+1 < len(args) {
|
||||
o.ServerURL = args[i+1]
|
||||
i++
|
||||
}
|
||||
case "--registry":
|
||||
if i+1 < len(args) {
|
||||
o.Registry = args[i+1]
|
||||
i++
|
||||
}
|
||||
case "--tag":
|
||||
if i+1 < len(args) {
|
||||
o.Tag = args[i+1]
|
||||
i++
|
||||
}
|
||||
case "--grubenv":
|
||||
if i+1 < len(args) {
|
||||
o.GrubenvPath = args[i+1]
|
||||
|
||||
@@ -3,14 +3,24 @@ package cmd
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
|
||||
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||
)
|
||||
|
||||
// Rollback forces an immediate switch to the other partition.
|
||||
// Use this to manually revert to the previous version.
|
||||
//
|
||||
// State transition: any → RolledBack with LastError="manual rollback".
|
||||
func Rollback(args []string) error {
|
||||
opts := parseOpts(args)
|
||||
env := opts.NewBootEnv()
|
||||
|
||||
st, err := state.Load(opts.StatePath)
|
||||
if err != nil {
|
||||
slog.Warn("state file unreadable, starting fresh", "error", err)
|
||||
st = state.New()
|
||||
}
|
||||
|
||||
activeSlot, err := env.ActiveSlot()
|
||||
if err != nil {
|
||||
return fmt.Errorf("reading active slot: %w", err)
|
||||
@@ -24,9 +34,14 @@ func Rollback(args []string) error {
|
||||
slog.Info("forcing rollback", "from", activeSlot, "to", passiveSlot)
|
||||
|
||||
if err := env.ForceRollback(); err != nil {
|
||||
_ = st.RecordError(opts.StatePath, fmt.Errorf("rollback failed: %w", err))
|
||||
return fmt.Errorf("rollback failed: %w", err)
|
||||
}
|
||||
|
||||
if err := st.Transition(opts.StatePath, state.PhaseRolledBack, "", "manual rollback"); err != nil {
|
||||
slog.Warn("state transition failed", "phase", state.PhaseRolledBack, "error", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Rolled back: %s → %s\n", activeSlot, passiveSlot)
|
||||
fmt.Println("Reboot to complete rollback.")
|
||||
|
||||
|
||||
@@ -1,10 +1,26 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||
)
|
||||
|
||||
// statusReport is the JSON-emitted shape of `kubesolo-update status --json`.
|
||||
// Combines the bootloader-level A/B view with the update-agent state machine.
|
||||
type statusReport struct {
|
||||
ActiveSlot string `json:"active_slot"`
|
||||
PassiveSlot string `json:"passive_slot"`
|
||||
BootCounter int `json:"boot_counter"`
|
||||
BootSuccess bool `json:"boot_success"`
|
||||
State *state.UpdateState `json:"state"`
|
||||
}
|
||||
|
||||
// Status displays the current A/B slot configuration and boot state.
|
||||
// With --json, emits the full state report to stdout for orchestration
|
||||
// tooling.
|
||||
func Status(args []string) error {
|
||||
opts := parseOpts(args)
|
||||
env := opts.NewBootEnv()
|
||||
@@ -29,6 +45,23 @@ func Status(args []string) error {
|
||||
return fmt.Errorf("reading boot success: %w", err)
|
||||
}
|
||||
|
||||
// State file is non-fatal: present means we have an update lifecycle
|
||||
// recorded; absent means no update has run yet.
|
||||
st, _ := state.Load(opts.StatePath)
|
||||
|
||||
if opts.JSON {
|
||||
report := statusReport{
|
||||
ActiveSlot: activeSlot,
|
||||
PassiveSlot: passiveSlot,
|
||||
BootCounter: bootCounter,
|
||||
BootSuccess: bootSuccess,
|
||||
State: st,
|
||||
}
|
||||
enc := json.NewEncoder(os.Stdout)
|
||||
enc.SetIndent("", " ")
|
||||
return enc.Encode(report)
|
||||
}
|
||||
|
||||
fmt.Println("KubeSolo OS — A/B Partition Status")
|
||||
fmt.Println("───────────────────────────────────")
|
||||
fmt.Printf(" Active slot: %s\n", activeSlot)
|
||||
@@ -48,5 +81,25 @@ func Status(args []string) error {
|
||||
fmt.Printf("\n ⚠ Boot pending verification (%d attempts remaining)\n", bootCounter)
|
||||
}
|
||||
|
||||
if st != nil && st.Phase != state.PhaseIdle {
|
||||
fmt.Println("\nUpdate Lifecycle")
|
||||
fmt.Println("───────────────────────────────────")
|
||||
fmt.Printf(" Phase: %s\n", st.Phase)
|
||||
if st.FromVersion != "" {
|
||||
fmt.Printf(" From version: %s\n", st.FromVersion)
|
||||
}
|
||||
if st.ToVersion != "" {
|
||||
fmt.Printf(" To version: %s\n", st.ToVersion)
|
||||
}
|
||||
if !st.StartedAt.IsZero() {
|
||||
fmt.Printf(" Started: %s\n", st.StartedAt.Format("2006-01-02 15:04:05 MST"))
|
||||
}
|
||||
fmt.Printf(" Updated: %s\n", st.UpdatedAt.Format("2006-01-02 15:04:05 MST"))
|
||||
fmt.Printf(" Attempts: %d\n", st.AttemptCount)
|
||||
if st.LastError != "" {
|
||||
fmt.Printf(" Last error: %s\n", st.LastError)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
module github.com/portainer/kubesolo-os/update
|
||||
|
||||
go 1.25.5
|
||||
|
||||
require (
|
||||
github.com/opencontainers/go-digest v1.0.0 // indirect
|
||||
github.com/opencontainers/image-spec v1.1.1 // indirect
|
||||
golang.org/x/sync v0.14.0 // indirect
|
||||
oras.land/oras-go/v2 v2.6.0 // indirect
|
||||
)
|
||||
|
||||
8
update/go.sum
Normal file
8
update/go.sum
Normal file
@@ -0,0 +1,8 @@
|
||||
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
|
||||
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
|
||||
github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040=
|
||||
github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M=
|
||||
golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ=
|
||||
golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
|
||||
oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc=
|
||||
oras.land/oras-go/v2 v2.6.0/go.mod h1:magiQDfG6H1O9APp+rOsvCPcW1GD2MM7vgnKY0Y+u1o=
|
||||
@@ -78,15 +78,28 @@ Commands:
|
||||
metrics Start Prometheus-compatible metrics HTTP server
|
||||
|
||||
Options:
|
||||
--server URL Update server URL (default: from /etc/kubesolo/update.conf)
|
||||
--server URL HTTP update server (mutually exclusive with --registry)
|
||||
--registry REPO OCI registry repository, e.g. ghcr.io/portainer/kubesolo-os
|
||||
(mutually exclusive with --server)
|
||||
--tag TAG OCI tag to pull (default: channel name, then "stable")
|
||||
--conf PATH update.conf path (default: /etc/kubesolo/update.conf)
|
||||
--state PATH Update state file (default: /var/lib/kubesolo/update/state.json)
|
||||
--channel NAME Update channel (default: "stable", or value from update.conf)
|
||||
--maintenance-window HH:MM-HH:MM local time window; apply refuses outside it
|
||||
--force Bypass maintenance-window check
|
||||
--grubenv PATH Path to grubenv file (default: /boot/grub/grubenv)
|
||||
--timeout SECS Health check timeout in seconds (default: 120)
|
||||
--pubkey PATH Ed25519 public key for signature verification (optional)
|
||||
--healthcheck-url URL Optional GET probe in healthcheck; 200 = pass
|
||||
--auto-rollback-after N healthcheck: rollback after N consecutive failures
|
||||
--kube-system-settle N healthcheck: require kube-system pods Running ≥ N seconds
|
||||
--json For 'status': emit JSON instead of human-readable output
|
||||
|
||||
Examples:
|
||||
kubesolo-update check --server https://updates.example.com
|
||||
kubesolo-update apply --server https://updates.example.com --pubkey /etc/kubesolo/update-pubkey.hex
|
||||
kubesolo-update apply --server https://updates.example.com
|
||||
kubesolo-update apply --registry ghcr.io/portainer/kubesolo-os --tag stable
|
||||
kubesolo-update apply --force # uses /etc/kubesolo/update.conf
|
||||
kubesolo-update healthcheck
|
||||
kubesolo-update status
|
||||
kubesolo-update status --json
|
||||
`)
|
||||
}
|
||||
|
||||
105
update/pkg/config/config.go
Normal file
105
update/pkg/config/config.go
Normal file
@@ -0,0 +1,105 @@
|
||||
// Package config parses /etc/kubesolo/update.conf — the persistent
|
||||
// configuration for the update agent. Each line is "key = value"; blank
|
||||
// lines and "#"-prefixed comments are ignored. Unknown keys are tolerated
|
||||
// (forward compatibility).
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// # Where to look for updates
|
||||
// server = https://updates.kubesolo.example.com
|
||||
// channel = stable
|
||||
//
|
||||
// # Only apply between 03:00 and 05:00 local time
|
||||
// maintenance_window = 03:00-05:00
|
||||
//
|
||||
// pubkey = /etc/kubesolo/update-pubkey.hex
|
||||
//
|
||||
// The file is populated on first boot by cloud-init (see the cloud-init
|
||||
// updates: block) and can be hand-edited afterwards.
|
||||
package config
|
||||
|
||||
import (
	"bufio"
	"fmt"
	"os"
	"strconv"
	"strings"
)
|
||||
|
||||
// DefaultPath is where update.conf lives on a live system.
const DefaultPath = "/etc/kubesolo/update.conf"

// Config holds the parsed update.conf values. Empty fields mean "not set" —
// the caller's defaults apply.
type Config struct {
	Server            string
	Channel           string
	MaintenanceWindow string
	PubKey            string
	// HealthcheckURL is an optional URL the healthcheck command will GET;
	// 200 = pass, anything else = fail.
	HealthcheckURL string
	// AutoRollbackAfter is the number of consecutive post-boot healthcheck
	// failures after which the agent will call Rollback automatically.
	// 0 = disabled (default).
	AutoRollbackAfter int
}

// Load reads and parses update.conf. A missing file returns an empty Config
// (not an error) — fresh systems before cloud-init has run.
//
// Each line is "key = value"; blank lines and "#"-prefixed comments are
// skipped. A line with no '=' is a hard error (likely file corruption).
// Unknown keys are silently tolerated for forward compatibility.
func Load(path string) (*Config, error) {
	f, err := os.Open(path)
	if err != nil {
		if os.IsNotExist(err) {
			return &Config{}, nil
		}
		return nil, fmt.Errorf("open %s: %w", path, err)
	}
	defer f.Close()

	c := &Config{}
	scanner := bufio.NewScanner(f)
	lineNo := 0
	for scanner.Scan() {
		lineNo++
		line := strings.TrimSpace(scanner.Text())
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		// Split at the first '=' only — values may themselves contain '='
		// (e.g. URLs with query strings).
		key, value, found := strings.Cut(line, "=")
		if !found {
			return nil, fmt.Errorf("%s:%d: missing '=' in line: %q", path, lineNo, line)
		}
		key = strings.TrimSpace(key)
		value = strings.TrimSpace(value)
		switch key {
		case "server":
			c.Server = value
		case "channel":
			c.Channel = value
		case "maintenance_window":
			c.MaintenanceWindow = value
		case "pubkey":
			c.PubKey = value
		case "healthcheck_url":
			c.HealthcheckURL = value
		case "auto_rollback_after":
			// Parse a small non-negative integer. Non-numeric, negative,
			// or overflowing values reset to 0 (feature disabled) for
			// forward compatibility. strconv.Atoi rejects overflow, which
			// the previous hand-rolled digit loop silently wrapped on.
			if n, aerr := strconv.Atoi(value); aerr == nil && n >= 0 {
				c.AutoRollbackAfter = n
			} else {
				c.AutoRollbackAfter = 0
			}
		}
		// Unknown keys are silently ignored for forward compatibility.
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("read %s: %w", path, err)
	}
	return c, nil
}
|
||||
117
update/pkg/config/config_test.go
Normal file
117
update/pkg/config/config_test.go
Normal file
@@ -0,0 +1,117 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func writeConf(t *testing.T, content string) string {
|
||||
t.Helper()
|
||||
path := filepath.Join(t.TempDir(), "update.conf")
|
||||
if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
|
||||
t.Fatalf("seed: %v", err)
|
||||
}
|
||||
return path
|
||||
}
|
||||
|
||||
func TestLoadMissingReturnsEmptyConfig(t *testing.T) {
|
||||
c, err := Load(filepath.Join(t.TempDir(), "does-not-exist.conf"))
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if c == nil {
|
||||
t.Fatal("Load returned nil config")
|
||||
}
|
||||
if c.Server != "" || c.Channel != "" || c.MaintenanceWindow != "" || c.PubKey != "" {
|
||||
t.Errorf("expected empty config, got %+v", c)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadAllFields(t *testing.T) {
|
||||
path := writeConf(t, `# comment line
|
||||
server = https://updates.example.com
|
||||
channel = stable
|
||||
maintenance_window = 03:00-05:00
|
||||
pubkey = /etc/kubesolo/pub.hex
|
||||
`)
|
||||
c, err := Load(path)
|
||||
if err != nil {
|
||||
t.Fatalf("load: %v", err)
|
||||
}
|
||||
if c.Server != "https://updates.example.com" {
|
||||
t.Errorf("server: got %q", c.Server)
|
||||
}
|
||||
if c.Channel != "stable" {
|
||||
t.Errorf("channel: got %q", c.Channel)
|
||||
}
|
||||
if c.MaintenanceWindow != "03:00-05:00" {
|
||||
t.Errorf("maintenance_window: got %q", c.MaintenanceWindow)
|
||||
}
|
||||
if c.PubKey != "/etc/kubesolo/pub.hex" {
|
||||
t.Errorf("pubkey: got %q", c.PubKey)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadIgnoresUnknownKeys(t *testing.T) {
|
||||
// Unknown keys must not be an error — supports forward-compat config
|
||||
// fields added by newer agent versions.
|
||||
path := writeConf(t, `server = https://x
|
||||
future_field = whatever
|
||||
channel = beta
|
||||
`)
|
||||
c, err := Load(path)
|
||||
if err != nil {
|
||||
t.Fatalf("load: %v", err)
|
||||
}
|
||||
if c.Server != "https://x" {
|
||||
t.Errorf("server: got %q", c.Server)
|
||||
}
|
||||
if c.Channel != "beta" {
|
||||
t.Errorf("channel: got %q", c.Channel)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadStripsWhitespace(t *testing.T) {
|
||||
path := writeConf(t, " server = https://example \n channel=stable\n")
|
||||
c, err := Load(path)
|
||||
if err != nil {
|
||||
t.Fatalf("load: %v", err)
|
||||
}
|
||||
if c.Server != "https://example" {
|
||||
t.Errorf("server: got %q (whitespace not stripped?)", c.Server)
|
||||
}
|
||||
if c.Channel != "stable" {
|
||||
t.Errorf("channel: got %q", c.Channel)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadIgnoresBlankAndCommentLines(t *testing.T) {
|
||||
path := writeConf(t, `
|
||||
# this is a comment
|
||||
|
||||
server = https://example
|
||||
# indented comment
|
||||
channel = stable
|
||||
|
||||
`)
|
||||
c, err := Load(path)
|
||||
if err != nil {
|
||||
t.Fatalf("load: %v", err)
|
||||
}
|
||||
if c.Server != "https://example" {
|
||||
t.Errorf("server: got %q", c.Server)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadRejectsMissingEquals(t *testing.T) {
|
||||
// "noEqualsHere" with no '=' is a syntax error worth surfacing — likely
|
||||
// indicates a corrupted config file.
|
||||
path := writeConf(t, `server = https://example
|
||||
noEqualsHere
|
||||
`)
|
||||
_, err := Load(path)
|
||||
if err == nil {
|
||||
t.Error("expected error on malformed line, got nil")
|
||||
}
|
||||
}
|
||||
60
update/pkg/config/version.go
Normal file
60
update/pkg/config/version.go
Normal file
@@ -0,0 +1,60 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// CompareVersions orders two semver-ish version strings.
//
// Accepts "v1.2.3", "1.2.3", "v1.2.3-rc1" (pre-release/build suffix is
// ignored), with missing components defaulting to 0 ("v1" == "1.0.0").
// Returns -1 if a < b, 0 if equal, +1 if a > b, or an error when either
// argument cannot be parsed at all.
//
// Used by apply.go to enforce MinCompatibleVersion. Suffix handling is
// deliberately simple — "v1.2.3-rc1" compares equal to "v1.2.3". Production
// releases are expected never to carry a pre-release suffix; dev releases
// are the consumer's responsibility.
func CompareVersions(a, b string) (int, error) {
	va, err := parseVersion(a)
	if err != nil {
		return 0, fmt.Errorf("parse %q: %w", a, err)
	}
	vb, err := parseVersion(b)
	if err != nil {
		return 0, fmt.Errorf("parse %q: %w", b, err)
	}
	// Lexicographic compare on (major, minor, patch).
	for i := range va {
		switch {
		case va[i] < vb[i]:
			return -1, nil
		case va[i] > vb[i]:
			return 1, nil
		}
	}
	return 0, nil
}

// parseVersion turns "v1.2" into [1 2 0]. Absent components are zero;
// non-numeric or negative components are errors.
func parseVersion(s string) ([3]int, error) {
	var v [3]int
	trimmed := strings.TrimPrefix(strings.TrimSpace(s), "v")
	// Drop pre-release/build suffix: "1.2.3-rc1" -> "1.2.3"
	if cut := strings.IndexAny(trimmed, "-+"); cut >= 0 {
		trimmed = trimmed[:cut]
	}
	for i, comp := range strings.SplitN(trimmed, ".", 3) {
		n, err := strconv.Atoi(comp)
		if err != nil {
			return v, fmt.Errorf("component %q not numeric", comp)
		}
		if n < 0 {
			return v, fmt.Errorf("component %d negative", n)
		}
		v[i] = n
	}
	return v, nil
}
|
||||
46
update/pkg/config/version_test.go
Normal file
46
update/pkg/config/version_test.go
Normal file
@@ -0,0 +1,46 @@
|
||||
package config
|
||||
|
||||
import "testing"
|
||||
|
||||
// TestCompareVersions drives CompareVersions through a table of ordered and
// equal version pairs, covering the documented leniencies: optional "v"
// prefix, ignored pre-release suffix, and missing components defaulting to 0.
func TestCompareVersions(t *testing.T) {
	tests := []struct {
		a, b string
		want int
	}{
		{"v1.0.0", "v1.0.0", 0},
		{"1.0.0", "v1.0.0", 0}, // 'v' prefix optional
		{"v1.0.0", "v1.0.1", -1},
		{"v1.0.1", "v1.0.0", 1},
		{"v1.1.0", "v1.0.99", 1},
		{"v2.0.0", "v1.99.99", 1},
		{"v0.3.0-dev", "v0.3.0", 0}, // pre-release suffix ignored
		{"v0.2.5", "v0.3.0", -1},
		{"v0.3.0", "v0.2.999", 1},
		{"v1.2", "v1.2.0", 0}, // missing component defaults to 0
		{"v1", "v1.0.0", 0},
	}
	for _, tt := range tests {
		got, err := CompareVersions(tt.a, tt.b)
		if err != nil {
			t.Errorf("CompareVersions(%q, %q): %v", tt.a, tt.b, err)
			continue
		}
		if got != tt.want {
			t.Errorf("CompareVersions(%q, %q) = %d, want %d", tt.a, tt.b, got, tt.want)
		}
	}
}

// TestCompareVersionsRejectsGarbage verifies that unparseable inputs surface
// an error instead of silently comparing as zero.
func TestCompareVersionsRejectsGarbage(t *testing.T) {
	bad := []string{
		"not-a-version",
		"v.1.2",
		"vabc",
		"",
	}
	for _, s := range bad {
		if _, err := CompareVersions(s, "v1.0.0"); err == nil {
			t.Errorf("CompareVersions(%q, ...) accepted, want error", s)
		}
	}
}
|
||||
95
update/pkg/config/window.go
Normal file
95
update/pkg/config/window.go
Normal file
@@ -0,0 +1,95 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Window is a parsed maintenance-window expression. Times are minutes since
// midnight in the local timezone. When End < Start, the window wraps
// midnight (e.g. 23:00-01:00 means 23:00 today through 01:00 tomorrow).
//
// The zero value (Start == End == 0) means "always allowed" — used for
// the empty-string-meaning-no-window case.
type Window struct {
	Start int // minutes since midnight, [0, 1440)
	End   int // minutes since midnight, [0, 1440)

	// alwaysOpen distinguishes "no constraint" from the literal 00:00-00:00
	// window (a degenerate same-instant window that never matches). It is
	// set only when ParseWindow receives an empty string.
	alwaysOpen bool
}

// AlwaysOpen reports whether this window imposes no constraint (it was
// parsed from the empty string).
func (w Window) AlwaysOpen() bool { return w.alwaysOpen }

// ParseWindow parses "HH:MM-HH:MM" into a Window. Empty input yields an
// AlwaysOpen window (no constraint). Surrounding whitespace is tolerated.
func ParseWindow(s string) (Window, error) {
	expr := strings.TrimSpace(s)
	if expr == "" {
		return Window{alwaysOpen: true}, nil
	}
	from, to, ok := strings.Cut(expr, "-")
	if !ok {
		return Window{}, fmt.Errorf("maintenance window %q: expected HH:MM-HH:MM", expr)
	}
	start, err := parseHHMM(strings.TrimSpace(from))
	if err != nil {
		return Window{}, fmt.Errorf("maintenance window %q: start: %w", expr, err)
	}
	end, err := parseHHMM(strings.TrimSpace(to))
	if err != nil {
		return Window{}, fmt.Errorf("maintenance window %q: end: %w", expr, err)
	}
	return Window{Start: start, End: end}, nil
}

// parseHHMM converts "HH:MM" into minutes since midnight, validating
// 0 <= HH <= 23 and 0 <= MM <= 59.
func parseHHMM(s string) (int, error) {
	hh, mm, ok := strings.Cut(s, ":")
	if !ok {
		return 0, fmt.Errorf("%q: expected HH:MM", s)
	}
	h, err := strconv.Atoi(hh)
	if err != nil || h < 0 || h > 23 {
		return 0, fmt.Errorf("%q: invalid hour", s)
	}
	m, err := strconv.Atoi(mm)
	if err != nil || m < 0 || m > 59 {
		return 0, fmt.Errorf("%q: invalid minute", s)
	}
	return h*60 + m, nil
}

// Contains reports whether the local time t falls inside this window.
// AlwaysOpen windows contain every instant. Start is inclusive, End
// exclusive.
func (w Window) Contains(t time.Time) bool {
	if w.alwaysOpen {
		return true
	}
	minute := t.Hour()*60 + t.Minute()
	switch {
	case w.Start == w.End:
		// Degenerate zero-length window: never matches.
		return false
	case w.Start < w.End:
		// Same-day window: [Start, End)
		return minute >= w.Start && minute < w.End
	default:
		// Wraps midnight: [Start, 1440) ∪ [0, End)
		return minute >= w.Start || minute < w.End
	}
}

// String renders the window in HH:MM-HH:MM form for display; an AlwaysOpen
// window renders as "always".
func (w Window) String() string {
	if w.alwaysOpen {
		return "always"
	}
	return fmt.Sprintf("%02d:%02d-%02d:%02d",
		w.Start/60, w.Start%60, w.End/60, w.End%60)
}
|
||||
120
update/pkg/config/window_test.go
Normal file
120
update/pkg/config/window_test.go
Normal file
@@ -0,0 +1,120 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// at builds a fixed-date local instant at the given wall-clock time. Only
// hour and minute matter to Window.Contains; the date is arbitrary.
func at(hour, min int) time.Time {
	return time.Date(2026, 1, 1, hour, min, 0, 0, time.UTC)
}

// TestParseWindowEmpty checks that the empty string parses to an AlwaysOpen
// window containing every instant.
func TestParseWindowEmpty(t *testing.T) {
	w, err := ParseWindow("")
	if err != nil {
		t.Fatalf("empty window: %v", err)
	}
	if !w.AlwaysOpen() {
		t.Error("empty input should produce AlwaysOpen window")
	}
	if !w.Contains(at(3, 0)) {
		t.Error("AlwaysOpen window should contain any time")
	}
	if !w.Contains(at(23, 59)) {
		t.Error("AlwaysOpen window should contain end-of-day")
	}
}

// TestParseWindowSameDay exercises a non-wrapping window's boundary
// semantics: start inclusive, end exclusive.
func TestParseWindowSameDay(t *testing.T) {
	w, err := ParseWindow("03:00-05:00")
	if err != nil {
		t.Fatalf("parse: %v", err)
	}
	tests := []struct {
		hour, min int
		want      bool
	}{
		{2, 59, false}, // just before
		{3, 0, true},   // start (inclusive)
		{4, 30, true},  // middle
		{4, 59, true},  // just before end
		{5, 0, false},  // end (exclusive)
		{15, 0, false}, // far outside
	}
	for _, tt := range tests {
		got := w.Contains(at(tt.hour, tt.min))
		if got != tt.want {
			t.Errorf("Contains(%02d:%02d) = %v, want %v", tt.hour, tt.min, got, tt.want)
		}
	}
}

// TestParseWindowWrappingMidnight exercises an End < Start window that spans
// midnight: both the late-night and early-morning halves must match.
func TestParseWindowWrappingMidnight(t *testing.T) {
	w, err := ParseWindow("23:00-01:00")
	if err != nil {
		t.Fatalf("parse: %v", err)
	}
	tests := []struct {
		hour, min int
		want      bool
	}{
		{22, 59, false}, // just before
		{23, 0, true},   // start (inclusive)
		{23, 30, true},  // night-before
		{0, 0, true},    // midnight
		{0, 30, true},   // early morning
		{0, 59, true},   // just before end
		{1, 0, false},   // end (exclusive)
		{12, 0, false},  // far outside (noon)
	}
	for _, tt := range tests {
		got := w.Contains(at(tt.hour, tt.min))
		if got != tt.want {
			t.Errorf("Contains(%02d:%02d) wrapping = %v, want %v", tt.hour, tt.min, got, tt.want)
		}
	}
}

func TestParseWindowDegenerateZeroLength(t *testing.T) {
	// 05:00-05:00 is a zero-length window — should never match. Different
	// from "always" (empty string).
	w, err := ParseWindow("05:00-05:00")
	if err != nil {
		t.Fatalf("parse: %v", err)
	}
	if w.AlwaysOpen() {
		t.Error("05:00-05:00 must not be AlwaysOpen")
	}
	if w.Contains(at(5, 0)) {
		t.Error("zero-length window must not contain its own boundary")
	}
}

// TestParseWindowRejectsBadInput confirms each malformed expression is
// rejected with an error rather than parsed leniently.
func TestParseWindowRejectsBadInput(t *testing.T) {
	bad := []string{
		"notatime",
		"03:00",         // no end
		"03:00-",        // empty end
		"03:00-05",      // missing minutes
		"24:00-05:00",   // hour out of range
		"03:60-05:00",   // minute out of range
		"abc:00-05:00",  // non-numeric
	}
	for _, s := range bad {
		_, err := ParseWindow(s)
		if err == nil {
			t.Errorf("ParseWindow(%q) accepted, want error", s)
		}
	}
}

// TestWindowString covers both String renderings: zero-padded HH:MM-HH:MM
// for a real window and the literal "always" for an AlwaysOpen one.
func TestWindowString(t *testing.T) {
	w, _ := ParseWindow("03:05-05:45")
	if w.String() != "03:05-05:45" {
		t.Errorf("String = %q, want 03:05-05:45", w.String())
	}
	always, _ := ParseWindow("")
	if always.String() != "always" {
		t.Errorf("AlwaysOpen.String = %q, want 'always'", always.String())
	}
}
|
||||
125
update/pkg/health/extended.go
Normal file
125
update/pkg/health/extended.go
Normal file
@@ -0,0 +1,125 @@
|
||||
package health
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// kubeSystemSettleSeconds is how long all kube-system pods must hold a
// Running phase before we consider the cluster genuinely up. Catches the
// "pod just started, will crash-loop in 5s" case.
const kubeSystemSettleSeconds = 30

// CheckKubeSystemReady verifies that every pod in the kube-system namespace
// is in Running phase and has been Running for at least settle. It returns
// a single bool: true only when every pod passes; any kubectl failure,
// missing kubeconfig, or unparseable output conservatively reports false.
// settle defaults to 30s when zero.
//
// NOTE(review): a pod in Succeeded phase (e.g. a completed one-shot job in
// kube-system) would also report false here — confirm that is intended.
func (c *Checker) CheckKubeSystemReady(settle time.Duration) bool {
	if settle == 0 {
		settle = kubeSystemSettleSeconds * time.Second
	}
	// No kubeconfig on disk means kubectl cannot possibly succeed.
	if _, err := os.Stat(c.kubeconfigPath); err != nil {
		return false
	}
	// Hard 10s ceiling on the kubectl invocation so a wedged API server
	// cannot stall the healthcheck.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// jsonpath emits one line per pod: <phase>|<startTime>
	cmd := exec.CommandContext(ctx, "kubectl",
		"--kubeconfig", c.kubeconfigPath,
		"get", "pods", "-n", "kube-system",
		"-o", `jsonpath={range .items[*]}{.status.phase}|{.status.startTime}{"\n"}{end}`,
	)
	out, err := cmd.Output()
	if err != nil {
		return false
	}
	lines := strings.Split(strings.TrimSpace(string(out)), "\n")
	if len(lines) == 0 || lines[0] == "" {
		// No pods reported. Conservatively treat as not-ready: kube-system
		// is expected to host at least CoreDNS + pause.
		return false
	}
	now := time.Now()
	for _, line := range lines {
		parts := strings.SplitN(line, "|", 2)
		phase := strings.TrimSpace(parts[0])
		if phase != "Running" {
			return false
		}
		// A line without the "|" separator means jsonpath output we do not
		// understand — fail closed rather than guess.
		if len(parts) < 2 {
			return false
		}
		// startTime is RFC3339 per the Kubernetes API; unparseable input
		// again fails closed.
		start, perr := time.Parse(time.RFC3339, strings.TrimSpace(parts[1]))
		if perr != nil {
			return false
		}
		// Pod must have been Running for the full settle duration.
		if now.Sub(start) < settle {
			return false
		}
	}
	return true
}
|
||||
|
||||
// CheckProbeURL fetches the given URL with a 5-second timeout and reports
// whether the response status was exactly 200. An empty url returns
// (true, nil) — the check is opt-in and disabled by default.
func CheckProbeURL(url string) (bool, error) {
	if url == "" {
		return true, nil
	}
	probe := http.Client{Timeout: 5 * time.Second}
	resp, err := probe.Get(url)
	if err != nil {
		return false, fmt.Errorf("probe URL %s: %w", url, err)
	}
	defer resp.Body.Close()
	return resp.StatusCode == http.StatusOK, nil
}
|
||||
|
||||
// CheckDiskWritable writes a small probe file under dataDir, fsyncs it,
// reads it back, and removes it — confirming the data partition is mounted
// read-write and the underlying disk is responsive. An empty dataDir
// defaults to /var/lib/kubesolo.
func CheckDiskWritable(dataDir string) (bool, error) {
	if dataDir == "" {
		dataDir = "/var/lib/kubesolo"
	}
	// A missing data partition is catastrophic; surface the error rather
	// than pretending the disk is fine.
	if _, err := os.Stat(dataDir); err != nil {
		return false, fmt.Errorf("dataDir %s: %w", dataDir, err)
	}
	probePath := filepath.Join(dataDir, ".update-probe")
	payload := []byte("kubesolo-os healthcheck probe")

	f, err := os.Create(probePath)
	if err != nil {
		return false, fmt.Errorf("create probe: %w", err)
	}
	// Always clean up the probe, whichever path we exit through.
	defer os.Remove(probePath)

	if _, werr := f.Write(payload); werr != nil {
		f.Close()
		return false, fmt.Errorf("write probe: %w", werr)
	}
	// Force the bytes through to the device — a lazily-buffered write would
	// not prove the disk is actually responsive.
	if serr := f.Sync(); serr != nil {
		f.Close()
		return false, fmt.Errorf("fsync probe: %w", serr)
	}
	if cerr := f.Close(); cerr != nil {
		return false, fmt.Errorf("close probe: %w", cerr)
	}

	readBack, err := os.ReadFile(probePath)
	if err != nil {
		return false, fmt.Errorf("read probe: %w", err)
	}
	if string(readBack) != string(payload) {
		return false, fmt.Errorf("probe content mismatch: got %q", readBack)
	}
	return true, nil
}
|
||||
77
update/pkg/health/extended_test.go
Normal file
77
update/pkg/health/extended_test.go
Normal file
@@ -0,0 +1,77 @@
|
||||
package health
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestCheckProbeURLEmptyAlwaysPasses: an unset probe URL means the check is
// disabled and must report healthy.
func TestCheckProbeURLEmptyAlwaysPasses(t *testing.T) {
	ok, err := CheckProbeURL("")
	if err != nil {
		t.Fatalf("CheckProbeURL(\"\"): %v", err)
	}
	if !ok {
		t.Error("empty probe URL should return ok=true (check disabled)")
	}
}

// TestCheckProbeURL200: a 200 response from a local test server must pass.
func TestCheckProbeURL200(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
	}))
	defer srv.Close()
	ok, err := CheckProbeURL(srv.URL)
	if err != nil {
		t.Fatalf("CheckProbeURL: %v", err)
	}
	if !ok {
		t.Error("expected ok=true on 200")
	}
}

// TestCheckProbeURLNon200: any status other than 200 is a failed probe but
// not an error (the server responded; it just isn't healthy).
func TestCheckProbeURLNon200(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusServiceUnavailable)
	}))
	defer srv.Close()
	ok, err := CheckProbeURL(srv.URL)
	if err != nil {
		t.Fatalf("CheckProbeURL: %v", err)
	}
	if ok {
		t.Error("expected ok=false on 503")
	}
}

// TestCheckProbeURLNetworkError: a connection failure must surface as an
// error, distinct from a non-200 response.
func TestCheckProbeURLNetworkError(t *testing.T) {
	// Port 1 is reserved (tcpmux) and never bound by Linux defaults.
	_, err := CheckProbeURL("http://127.0.0.1:1")
	if err == nil {
		t.Error("expected error for unreachable URL, got nil")
	}
}

// TestCheckDiskWritableHappyPath: a writable directory passes, and the probe
// file must not be left behind.
func TestCheckDiskWritableHappyPath(t *testing.T) {
	dir := t.TempDir()
	ok, err := CheckDiskWritable(dir)
	if err != nil {
		t.Fatalf("CheckDiskWritable: %v", err)
	}
	if !ok {
		t.Error("expected ok=true on writable temp dir")
	}
	// Probe file should have been cleaned up.
	if _, err := os.Stat(filepath.Join(dir, ".update-probe")); !os.IsNotExist(err) {
		t.Errorf("probe file not cleaned up: stat err=%v", err)
	}
}

// TestCheckDiskWritableMissingDir: a nonexistent data directory must be
// reported as an error, never as a healthy disk.
func TestCheckDiskWritableMissingDir(t *testing.T) {
	_, err := CheckDiskWritable("/this/path/does/not/exist")
	if err == nil {
		t.Error("expected error for missing dataDir, got nil")
	}
}
|
||||
@@ -27,12 +27,17 @@ type Status struct {
|
||||
Containerd bool
|
||||
APIServer bool
|
||||
NodeReady bool
|
||||
KubeSystemReady bool // optional — true unless KubeSystemSettle is non-zero
|
||||
ProbeURL bool // optional — true unless ProbeURL is set
|
||||
DiskWritable bool // optional — true unless DataDir is set
|
||||
Message string
|
||||
}
|
||||
|
||||
// IsHealthy returns true if all checks passed.
|
||||
// IsHealthy returns true if all required checks passed. Optional checks
|
||||
// default to true when not configured, so they don't block the result.
|
||||
func (s *Status) IsHealthy() bool {
|
||||
return s.Containerd && s.APIServer && s.NodeReady
|
||||
return s.Containerd && s.APIServer && s.NodeReady &&
|
||||
s.KubeSystemReady && s.ProbeURL && s.DiskWritable
|
||||
}
|
||||
|
||||
// Checker performs health checks against the local KubeSolo instance.
|
||||
@@ -40,6 +45,11 @@ type Checker struct {
|
||||
kubeconfigPath string
|
||||
apiServerAddr string
|
||||
timeout time.Duration
|
||||
|
||||
// Optional gates. Zero values disable the check (it reports true).
|
||||
KubeSystemSettle time.Duration
|
||||
ProbeURL string
|
||||
DataDir string
|
||||
}
|
||||
|
||||
// NewChecker creates a health checker.
|
||||
@@ -149,12 +159,37 @@ func (c *Checker) CheckNodeReady() bool {
|
||||
}
|
||||
|
||||
// RunAll performs all health checks and returns the combined status.
|
||||
//
|
||||
// Optional checks (kube-system settle, user probe URL, disk writability) are
|
||||
// only run if the corresponding Checker fields are set; otherwise they
|
||||
// report true so as not to block the result.
|
||||
func (c *Checker) RunAll() *Status {
|
||||
return &Status{
|
||||
s := &Status{
|
||||
Containerd: c.CheckContainerd(),
|
||||
APIServer: c.CheckAPIServer(),
|
||||
NodeReady: c.CheckNodeReady(),
|
||||
KubeSystemReady: true,
|
||||
ProbeURL: true,
|
||||
DiskWritable: true,
|
||||
}
|
||||
if c.KubeSystemSettle > 0 {
|
||||
s.KubeSystemReady = c.CheckKubeSystemReady(c.KubeSystemSettle)
|
||||
}
|
||||
if c.ProbeURL != "" {
|
||||
ok, err := CheckProbeURL(c.ProbeURL)
|
||||
if err != nil {
|
||||
slog.Warn("probe URL check failed", "url", c.ProbeURL, "error", err)
|
||||
}
|
||||
s.ProbeURL = ok
|
||||
}
|
||||
if c.DataDir != "" {
|
||||
ok, err := CheckDiskWritable(c.DataDir)
|
||||
if err != nil {
|
||||
slog.Warn("disk writability check failed", "dir", c.DataDir, "error", err)
|
||||
}
|
||||
s.DiskWritable = ok
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
// WaitForHealthy polls health checks until all pass or timeout expires.
|
||||
|
||||
@@ -6,36 +6,42 @@ import (
|
||||
)
|
||||
|
||||
func TestStatusIsHealthy(t *testing.T) {
|
||||
// Helper for the new 6-field Status: all-true except the named one.
|
||||
allBut := func(field string) Status {
|
||||
s := Status{
|
||||
Containerd: true, APIServer: true, NodeReady: true,
|
||||
KubeSystemReady: true, ProbeURL: true, DiskWritable: true,
|
||||
}
|
||||
switch field {
|
||||
case "Containerd":
|
||||
s.Containerd = false
|
||||
case "APIServer":
|
||||
s.APIServer = false
|
||||
case "NodeReady":
|
||||
s.NodeReady = false
|
||||
case "KubeSystemReady":
|
||||
s.KubeSystemReady = false
|
||||
case "ProbeURL":
|
||||
s.ProbeURL = false
|
||||
case "DiskWritable":
|
||||
s.DiskWritable = false
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
status Status
|
||||
wantHealth bool
|
||||
}{
|
||||
{
|
||||
name: "all healthy",
|
||||
status: Status{Containerd: true, APIServer: true, NodeReady: true},
|
||||
wantHealth: true,
|
||||
},
|
||||
{
|
||||
name: "containerd down",
|
||||
status: Status{Containerd: false, APIServer: true, NodeReady: true},
|
||||
wantHealth: false,
|
||||
},
|
||||
{
|
||||
name: "apiserver down",
|
||||
status: Status{Containerd: true, APIServer: false, NodeReady: true},
|
||||
wantHealth: false,
|
||||
},
|
||||
{
|
||||
name: "node not ready",
|
||||
status: Status{Containerd: true, APIServer: true, NodeReady: false},
|
||||
wantHealth: false,
|
||||
},
|
||||
{
|
||||
name: "all down",
|
||||
status: Status{Containerd: false, APIServer: false, NodeReady: false},
|
||||
wantHealth: false,
|
||||
},
|
||||
{"all healthy", allBut(""), true},
|
||||
{"containerd down", allBut("Containerd"), false},
|
||||
{"apiserver down", allBut("APIServer"), false},
|
||||
{"node not ready", allBut("NodeReady"), false},
|
||||
{"kube-system not ready", allBut("KubeSystemReady"), false},
|
||||
{"probe URL failed", allBut("ProbeURL"), false},
|
||||
{"disk not writable", allBut("DiskWritable"), false},
|
||||
{"all down", Status{}, false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
|
||||
51
update/pkg/health/preflight.go
Normal file
51
update/pkg/health/preflight.go
Normal file
@@ -0,0 +1,51 @@
|
||||
package health
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// NodeBlockLabel is the well-known label that workload authors set on the
// local node to defer an OS update. When present and "true", apply refuses.
const NodeBlockLabel = "updates.kubesolo.io/block"

// defaultKubeconfigPath is used when the caller passes an empty path.
const defaultKubeconfigPath = "/var/lib/kubesolo/pki/admin/admin.kubeconfig"

// CheckNodeBlocked returns (blocked, error). blocked==true means the local
// node carries the updates.kubesolo.io/block=true label and the caller should
// refuse the update.
//
// If the kubeconfig is not available (offline / pre-boot / air-gap), this
// returns (false, nil) — silently allowing the update. That's the safe
// behaviour for the air-gap case where the node may not be reachable from
// the agent's perspective.
func CheckNodeBlocked(kubeconfigPath string) (bool, error) {
	if kubeconfigPath == "" {
		kubeconfigPath = defaultKubeconfigPath
	}
	if _, err := os.Stat(kubeconfigPath); err != nil {
		// No kubeconfig — assume air-gap / pre-K8s. Don't block updates.
		return false, nil
	}

	// Query the node label via kubectl. We don't know the node name a
	// priori, so we use --kubeconfig on the local admin config and ask for
	// "the only node" (KubeSolo is single-node by design).
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Build the jsonpath from NodeBlockLabel so the constant and the query
	// can never drift apart. Dots inside a label key must be escaped in
	// jsonpath, otherwise they'd be parsed as field separators.
	escapedLabel := strings.ReplaceAll(NodeBlockLabel, ".", `\.`)
	cmd := exec.CommandContext(ctx, "kubectl",
		"--kubeconfig", kubeconfigPath,
		"get", "node",
		"-o", fmt.Sprintf("jsonpath={.items[0].metadata.labels.%s}", escapedLabel))
	out, err := cmd.Output()
	if err != nil {
		// API unreachable or no nodes — treat as not blocked (analogous to
		// the kubeconfig-missing case). We still surface the error so the
		// caller can decide to log it.
		return false, fmt.Errorf("query node label: %w", err)
	}
	return strings.TrimSpace(string(out)) == "true", nil
}
|
||||
@@ -35,6 +35,24 @@ type UpdateMetadata struct {
|
||||
MetadataSigURL string `json:"metadata_sig_url,omitempty"`
|
||||
ReleaseNotes string `json:"release_notes,omitempty"`
|
||||
ReleaseDate string `json:"release_date,omitempty"`
|
||||
|
||||
// Channel labels this artifact ("stable", "beta", "edge", ...). The agent
|
||||
// refuses metadata whose channel doesn't match the locally-configured
|
||||
// one. Empty in metadata means "no channel constraint, accept anything".
|
||||
Channel string `json:"channel,omitempty"`
|
||||
|
||||
// MinCompatibleVersion is the lowest version that can upgrade to this
|
||||
// one. The agent refuses to apply if the currently-running version is
|
||||
// below this. Used for stepping-stone migrations (e.g. 0.2.x -> 0.3.x
|
||||
// requires 0.2.5+ to land the state-file format first). Empty means
|
||||
// "any source version OK".
|
||||
MinCompatibleVersion string `json:"min_compatible_version,omitempty"`
|
||||
|
||||
// Architecture restricts this artifact to a specific GOARCH ("amd64",
|
||||
// "arm64"). Empty means the artifact is arch-agnostic — which is rare
|
||||
// since the kernel + initramfs are arch-specific; this should normally
|
||||
// be populated by the build pipeline.
|
||||
Architecture string `json:"architecture,omitempty"`
|
||||
}
|
||||
|
||||
// StagedImage represents downloaded and verified update files.
|
||||
|
||||
@@ -11,6 +11,9 @@
|
||||
// kubesolo_os_update_last_check_timestamp_seconds unix timestamp (gauge)
|
||||
// kubesolo_os_memory_total_bytes total RAM (gauge)
|
||||
// kubesolo_os_memory_available_bytes available RAM (gauge)
|
||||
// kubesolo_update_phase{phase} 1 for current phase, 0 for others
|
||||
// kubesolo_update_attempts_total counter — attempts at current ToVersion
|
||||
// kubesolo_update_last_attempt_timestamp_seconds unix timestamp of last state update
|
||||
//
|
||||
// This is a zero-dependency implementation — no Prometheus client library needed.
|
||||
// It serves metrics in the Prometheus text exposition format.
|
||||
@@ -25,11 +28,14 @@ import (
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||
)
|
||||
|
||||
// Server is a lightweight Prometheus metrics HTTP server.
|
||||
type Server struct {
|
||||
grubenvPath string
|
||||
statePath string
|
||||
listenAddr string
|
||||
startTime time.Time
|
||||
|
||||
@@ -47,6 +53,27 @@ func NewServer(listenAddr, grubenvPath string) *Server {
|
||||
}
|
||||
}
|
||||
|
||||
// SetStatePath sets the location of the update state.json file. If empty or
|
||||
// unset, state-derived metrics are emitted with the Idle defaults.
|
||||
func (s *Server) SetStatePath(p string) {
|
||||
s.statePath = p
|
||||
}
|
||||
|
||||
// allPhases lists every Phase value we emit as a kubesolo_update_phase
// time-series, so consumers see all label values (with value 0 for non-current
// phases). Mirror of validPhases in pkg/state — keep the two in sync when
// adding a phase.
//
// Slice order is the order the phase series appear in /metrics output
// (writeUpdateStateMetrics ranges over this slice).
var allPhases = []state.Phase{
	state.PhaseIdle,
	state.PhaseChecking,
	state.PhaseDownloading,
	state.PhaseStaged,
	state.PhaseActivated,
	state.PhaseVerifying,
	state.PhaseSuccess,
	state.PhaseRolledBack,
	state.PhaseFailed,
}
|
||||
|
||||
// SetUpdateAvailable records whether an update is available.
|
||||
func (s *Server) SetUpdateAvailable(available bool) {
|
||||
s.mu.Lock()
|
||||
@@ -125,9 +152,49 @@ func (s *Server) handleMetrics(w http.ResponseWriter, r *http.Request) {
|
||||
sb.WriteString("# TYPE kubesolo_os_memory_available_bytes gauge\n")
|
||||
sb.WriteString(fmt.Sprintf("kubesolo_os_memory_available_bytes %d\n", memAvail))
|
||||
|
||||
// Update lifecycle (from state.json)
|
||||
s.writeUpdateStateMetrics(&sb)
|
||||
|
||||
fmt.Fprint(w, sb.String())
|
||||
}
|
||||
|
||||
// writeUpdateStateMetrics appends update-lifecycle metrics derived from the
|
||||
// state.json file. If the file is missing or unreadable, emits the Idle
|
||||
// defaults so the metric series exists at all times.
|
||||
func (s *Server) writeUpdateStateMetrics(sb *strings.Builder) {
|
||||
current := state.PhaseIdle
|
||||
var attempts int
|
||||
var lastTS float64
|
||||
|
||||
if s.statePath != "" {
|
||||
if st, err := state.Load(s.statePath); err == nil && st != nil {
|
||||
current = st.Phase
|
||||
attempts = st.AttemptCount
|
||||
if !st.UpdatedAt.IsZero() {
|
||||
lastTS = float64(st.UpdatedAt.Unix())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sb.WriteString("# HELP kubesolo_update_phase Current update lifecycle phase (1 for active, 0 otherwise).\n")
|
||||
sb.WriteString("# TYPE kubesolo_update_phase gauge\n")
|
||||
for _, p := range allPhases {
|
||||
v := 0
|
||||
if p == current {
|
||||
v = 1
|
||||
}
|
||||
sb.WriteString(fmt.Sprintf("kubesolo_update_phase{phase=%q} %d\n", string(p), v))
|
||||
}
|
||||
|
||||
sb.WriteString("# HELP kubesolo_update_attempts_total Number of update attempts at the current target version.\n")
|
||||
sb.WriteString("# TYPE kubesolo_update_attempts_total counter\n")
|
||||
sb.WriteString(fmt.Sprintf("kubesolo_update_attempts_total %d\n", attempts))
|
||||
|
||||
sb.WriteString("# HELP kubesolo_update_last_attempt_timestamp_seconds Unix timestamp of the last state transition.\n")
|
||||
sb.WriteString("# TYPE kubesolo_update_last_attempt_timestamp_seconds gauge\n")
|
||||
sb.WriteString(fmt.Sprintf("kubesolo_update_last_attempt_timestamp_seconds %.0f\n", lastTS))
|
||||
}
|
||||
|
||||
// readGrubenvVar reads a single variable from grubenv using simple file parse.
|
||||
func (s *Server) readGrubenvVar(key string) string {
|
||||
data, err := os.ReadFile(s.grubenvPath)
|
||||
|
||||
@@ -8,6 +8,8 @@ import (
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/portainer/kubesolo-os/update/pkg/state"
|
||||
)
|
||||
|
||||
func TestNewServer(t *testing.T) {
|
||||
@@ -247,6 +249,86 @@ func TestSafeInt(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateStateMetricsAbsentStateFile(t *testing.T) {
|
||||
// No state path set — should emit Idle defaults so the metric series
|
||||
// exists from first boot.
|
||||
s := NewServer(":9100", "/tmp/nonexistent")
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
|
||||
w := httptest.NewRecorder()
|
||||
s.handleMetrics(w, req)
|
||||
|
||||
body, _ := io.ReadAll(w.Result().Body)
|
||||
output := string(body)
|
||||
|
||||
if !strings.Contains(output, `kubesolo_update_phase{phase="idle"} 1`) {
|
||||
t.Errorf("expected idle=1 with no state file, got:\n%s", output)
|
||||
}
|
||||
if !strings.Contains(output, `kubesolo_update_phase{phase="checking"} 0`) {
|
||||
t.Errorf("expected checking=0 with no state file, got:\n%s", output)
|
||||
}
|
||||
if !strings.Contains(output, "kubesolo_update_attempts_total 0") {
|
||||
t.Errorf("expected attempts=0 with no state file, got:\n%s", output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateStateMetricsActivePhase(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
statePath := filepath.Join(dir, "state.json")
|
||||
|
||||
st := state.New()
|
||||
if err := st.Transition(statePath, state.PhaseDownloading, "v0.3.0", ""); err != nil {
|
||||
t.Fatalf("seed state: %v", err)
|
||||
}
|
||||
|
||||
s := NewServer(":9100", "/tmp/nonexistent")
|
||||
s.SetStatePath(statePath)
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
|
||||
w := httptest.NewRecorder()
|
||||
s.handleMetrics(w, req)
|
||||
|
||||
body, _ := io.ReadAll(w.Result().Body)
|
||||
output := string(body)
|
||||
|
||||
if !strings.Contains(output, `kubesolo_update_phase{phase="downloading"} 1`) {
|
||||
t.Errorf("expected downloading=1, got:\n%s", output)
|
||||
}
|
||||
if !strings.Contains(output, `kubesolo_update_phase{phase="idle"} 0`) {
|
||||
t.Errorf("expected idle=0 when downloading is active, got:\n%s", output)
|
||||
}
|
||||
if !strings.Contains(output, "kubesolo_update_attempts_total 1") {
|
||||
t.Errorf("expected attempts=1 after first Transition, got:\n%s", output)
|
||||
}
|
||||
if strings.Contains(output, "kubesolo_update_last_attempt_timestamp_seconds 0\n") {
|
||||
t.Errorf("expected non-zero timestamp after state write, got:\n%s", output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateStateMetricsAllPhasesEmitted(t *testing.T) {
|
||||
// Every phase value should appear in the output, so dashboards can graph
|
||||
// the series cleanly.
|
||||
s := NewServer(":9100", "/tmp/nonexistent")
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
|
||||
w := httptest.NewRecorder()
|
||||
s.handleMetrics(w, req)
|
||||
|
||||
body, _ := io.ReadAll(w.Result().Body)
|
||||
output := string(body)
|
||||
|
||||
for _, p := range []state.Phase{
|
||||
state.PhaseIdle, state.PhaseChecking, state.PhaseDownloading,
|
||||
state.PhaseStaged, state.PhaseActivated, state.PhaseVerifying,
|
||||
state.PhaseSuccess, state.PhaseRolledBack, state.PhaseFailed,
|
||||
} {
|
||||
needle := `kubesolo_update_phase{phase="` + string(p) + `"}`
|
||||
if !strings.Contains(output, needle) {
|
||||
t.Errorf("phase %q not present in metrics output", p)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadFileString(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
|
||||
281
update/pkg/oci/oci.go
Normal file
281
update/pkg/oci/oci.go
Normal file
@@ -0,0 +1,281 @@
|
||||
// Package oci pulls KubeSolo OS update artifacts from an OCI-compliant
|
||||
// container registry (e.g. ghcr.io). It is the registry-native alternative
|
||||
// to the legacy HTTP `latest.json` protocol implemented in pkg/image.
|
||||
//
|
||||
// # Artifact layout
|
||||
//
|
||||
// An update is published as a single OCI artifact under a tag like
|
||||
// `stable` or `v0.3.0`. The tag may point at either:
|
||||
//
|
||||
// - A manifest index (preferred) containing per-architecture manifests.
|
||||
// The agent picks the one matching runtime.GOARCH.
|
||||
// - A single manifest (used for arch-specific tags such as
|
||||
// `v0.3.0-amd64`). The agent verifies architecture against the
|
||||
// manifest's platform annotation before trusting it.
|
||||
//
|
||||
// Each per-architecture manifest carries two layers:
|
||||
//
|
||||
// application/vnd.kubesolo.os.kernel.v1+octet-stream // vmlinuz / Image
|
||||
// application/vnd.kubesolo.os.initramfs.v1+gzip // kubesolo-os.gz
|
||||
//
|
||||
// And these annotations (read into image.UpdateMetadata):
|
||||
//
|
||||
// io.kubesolo.os.version "v0.3.0"
|
||||
// io.kubesolo.os.channel "stable"
|
||||
// io.kubesolo.os.min_compatible_version "v0.2.0"
|
||||
// io.kubesolo.os.architecture "amd64"
|
||||
// io.kubesolo.os.release_notes (optional, short)
|
||||
// io.kubesolo.os.release_date (optional, RFC3339)
|
||||
//
|
||||
// The agent ignores any additional layers, so the same image can also be
|
||||
// shaped as a "scratch" container if the build pipeline finds that convenient
|
||||
// for ecosystem tooling.
|
||||
package oci
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
|
||||
"github.com/opencontainers/go-digest"
|
||||
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
|
||||
"oras.land/oras-go/v2/content"
|
||||
"oras.land/oras-go/v2/registry/remote"
|
||||
|
||||
"github.com/portainer/kubesolo-os/update/pkg/image"
|
||||
)
|
||||
|
||||
// Media types used on KubeSolo OS update artifacts. Kept here (not in
// pkg/image) so the OCI protocol surface is fully self-contained.
//
// MediaKernel / MediaInitramfs identify the two payload layers Pull
// downloads; the Annot* keys are the manifest annotations that
// metadataFromAnnotations reads into image.UpdateMetadata.
const (
	MediaKernel    = "application/vnd.kubesolo.os.kernel.v1+octet-stream"
	MediaInitramfs = "application/vnd.kubesolo.os.initramfs.v1+gzip"

	AnnotVersion     = "io.kubesolo.os.version"
	AnnotChannel     = "io.kubesolo.os.channel"
	AnnotMinVersion  = "io.kubesolo.os.min_compatible_version"
	AnnotArch        = "io.kubesolo.os.architecture"
	AnnotReleaseNote = "io.kubesolo.os.release_notes"
	AnnotReleaseDate = "io.kubesolo.os.release_date"
)
|
||||
|
||||
// Client pulls artifacts from a single OCI repository (e.g.
// `ghcr.io/portainer/kubesolo-os`).
//
// Anonymous (public-pull) access is supported out of the box. For private
// repositories, configure auth via the underlying remote.Repository.Client
// before passing it to Resolve/Pull — that hook isn't surfaced here yet
// (deferred until we actually need it for a private fleet).
type Client struct {
	// repo is the oras-go remote handle; unexported so transport details
	// stay behind this package's API (see WithPlainHTTP for the one knob).
	repo *remote.Repository
	// Arch is the architecture string we match against manifest indexes.
	// Defaults to runtime.GOARCH; overridable for testing.
	Arch string
}
|
||||
|
||||
// NewClient parses a repository reference of the form `host/path` (no tag)
|
||||
// and returns a ready-to-use Client.
|
||||
func NewClient(repoRef string) (*Client, error) {
|
||||
repo, err := remote.NewRepository(repoRef)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid OCI reference %q: %w", repoRef, err)
|
||||
}
|
||||
// remote.NewRepository defaults to HTTPS. PlainHTTP is set per-test
|
||||
// via the WithPlainHTTP option when we hit a httptest.Server.
|
||||
return &Client{repo: repo, Arch: runtime.GOARCH}, nil
|
||||
}
|
||||
|
||||
// WithPlainHTTP toggles the underlying registry transport to HTTP. Useful for
// httptest-driven unit tests; do not use against production registries.
//
// Returns the receiver so the call can be chained after NewClient.
func (c *Client) WithPlainHTTP(plain bool) *Client {
	c.repo.PlainHTTP = plain
	return c
}
|
||||
|
||||
// FetchMetadata resolves the tag, walks index → manifest if needed, and
|
||||
// returns an image.UpdateMetadata populated from the manifest's annotations.
|
||||
// No blobs are downloaded — this is the cheap "what's available" probe.
|
||||
func (c *Client) FetchMetadata(ctx context.Context, tag string) (*image.UpdateMetadata, error) {
|
||||
manifest, _, err := c.resolveArchManifest(ctx, tag)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return metadataFromAnnotations(manifest.Annotations), nil
|
||||
}
|
||||
|
||||
// Pull resolves the tag, picks the matching-architecture manifest, downloads
|
||||
// the kernel + initramfs layers to `stageDir`, verifies their digests, and
|
||||
// returns a StagedImage compatible with the existing pkg/image consumer.
|
||||
func (c *Client) Pull(ctx context.Context, tag, stageDir string) (*image.StagedImage, *image.UpdateMetadata, error) {
|
||||
manifest, _, err := c.resolveArchManifest(ctx, tag)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(stageDir, 0o755); err != nil {
|
||||
return nil, nil, fmt.Errorf("create stage dir: %w", err)
|
||||
}
|
||||
|
||||
var kernelPath, initramfsPath string
|
||||
for _, layer := range manifest.Layers {
|
||||
switch layer.MediaType {
|
||||
case MediaKernel:
|
||||
kernelPath = filepath.Join(stageDir, "vmlinuz")
|
||||
if err := c.fetchBlobTo(ctx, layer, kernelPath); err != nil {
|
||||
return nil, nil, fmt.Errorf("download kernel: %w", err)
|
||||
}
|
||||
case MediaInitramfs:
|
||||
initramfsPath = filepath.Join(stageDir, "kubesolo-os.gz")
|
||||
if err := c.fetchBlobTo(ctx, layer, initramfsPath); err != nil {
|
||||
return nil, nil, fmt.Errorf("download initramfs: %w", err)
|
||||
}
|
||||
default:
|
||||
slog.Debug("oci: skipping unknown layer", "media", layer.MediaType)
|
||||
}
|
||||
}
|
||||
|
||||
if kernelPath == "" {
|
||||
return nil, nil, fmt.Errorf("manifest has no %s layer", MediaKernel)
|
||||
}
|
||||
if initramfsPath == "" {
|
||||
return nil, nil, fmt.Errorf("manifest has no %s layer", MediaInitramfs)
|
||||
}
|
||||
|
||||
meta := metadataFromAnnotations(manifest.Annotations)
|
||||
staged := &image.StagedImage{
|
||||
VmlinuzPath: kernelPath,
|
||||
InitramfsPath: initramfsPath,
|
||||
Version: meta.Version,
|
||||
}
|
||||
return staged, meta, nil
|
||||
}
|
||||
|
||||
// resolveArchManifest fetches the descriptor at `tag`, walks an index if
|
||||
// present, and returns the platform-specific manifest matching c.Arch.
|
||||
func (c *Client) resolveArchManifest(ctx context.Context, tag string) (*ocispec.Manifest, *ocispec.Descriptor, error) {
|
||||
desc, err := c.repo.Resolve(ctx, tag)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("resolve tag %q: %w", tag, err)
|
||||
}
|
||||
|
||||
switch desc.MediaType {
|
||||
case ocispec.MediaTypeImageIndex, "application/vnd.docker.distribution.manifest.list.v2+json":
|
||||
index, err := fetchJSON[ocispec.Index](ctx, c.repo, desc)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("fetch index: %w", err)
|
||||
}
|
||||
var matched *ocispec.Descriptor
|
||||
for i := range index.Manifests {
|
||||
m := &index.Manifests[i]
|
||||
if m.Platform != nil && m.Platform.Architecture == c.Arch {
|
||||
matched = m
|
||||
break
|
||||
}
|
||||
}
|
||||
if matched == nil {
|
||||
return nil, nil, fmt.Errorf("no manifest in index for architecture %q", c.Arch)
|
||||
}
|
||||
manifest, err := fetchJSON[ocispec.Manifest](ctx, c.repo, *matched)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("fetch manifest: %w", err)
|
||||
}
|
||||
return manifest, matched, nil
|
||||
|
||||
case ocispec.MediaTypeImageManifest, "application/vnd.docker.distribution.manifest.v2+json":
|
||||
manifest, err := fetchJSON[ocispec.Manifest](ctx, c.repo, desc)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("fetch manifest: %w", err)
|
||||
}
|
||||
// Single-arch tag: if it declares an arch, enforce match.
|
||||
if archAnnot := manifest.Annotations[AnnotArch]; archAnnot != "" && archAnnot != c.Arch {
|
||||
return nil, nil, fmt.Errorf("single-arch manifest is %q, want %q", archAnnot, c.Arch)
|
||||
}
|
||||
return manifest, &desc, nil
|
||||
|
||||
default:
|
||||
return nil, nil, fmt.Errorf("unsupported media type %q at tag %q", desc.MediaType, tag)
|
||||
}
|
||||
}
|
||||
|
||||
// fetchJSON pulls a small JSON document (manifest or index) and decodes it.
|
||||
func fetchJSON[T any](ctx context.Context, store content.Fetcher, desc ocispec.Descriptor) (*T, error) {
|
||||
rc, err := store.Fetch(ctx, desc)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rc.Close()
|
||||
data, err := content.ReadAll(rc, desc)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var out T
|
||||
if err := json.Unmarshal(data, &out); err != nil {
|
||||
return nil, fmt.Errorf("decode: %w", err)
|
||||
}
|
||||
return &out, nil
|
||||
}
|
||||
|
||||
// fetchBlobTo streams a blob to disk and verifies its digest matches.
// Cleans up the destination file on any error so we never leave a partial.
//
// The named return (retErr) is deliberate: the deferred close/cleanup below
// inspects and may overwrite it, so both a failed Close and any earlier
// failure trigger removal of dest.
func (c *Client) fetchBlobTo(ctx context.Context, desc ocispec.Descriptor, dest string) (retErr error) {
	rc, err := c.repo.Fetch(ctx, desc)
	if err != nil {
		return fmt.Errorf("fetch blob: %w", err)
	}
	defer rc.Close()

	f, err := os.Create(dest)
	if err != nil {
		return fmt.Errorf("create %s: %w", dest, err)
	}
	defer func() {
		// Surface a Close error unless an earlier error already won; then
		// remove the partial file whenever anything went wrong.
		if cerr := f.Close(); retErr == nil && cerr != nil {
			retErr = cerr
		}
		if retErr != nil {
			_ = os.Remove(dest)
		}
	}()

	// Hash while writing so verification needs no second read of the file.
	verifier := desc.Digest.Algorithm().Hash()
	mw := io.MultiWriter(f, verifier)
	n, err := io.Copy(mw, rc)
	if err != nil {
		return fmt.Errorf("stream blob: %w", err)
	}
	// Size is only enforced when the descriptor declares a positive value.
	if desc.Size > 0 && n != desc.Size {
		return fmt.Errorf("blob size mismatch: got %d, want %d", n, desc.Size)
	}
	got := digest.NewDigest(desc.Digest.Algorithm(), verifier)
	if got != desc.Digest {
		return fmt.Errorf("blob digest mismatch: got %s, want %s", got, desc.Digest)
	}
	return nil
}
|
||||
|
||||
// metadataFromAnnotations builds an UpdateMetadata from manifest annotations.
|
||||
// Always returns a non-nil value (missing fields stay empty).
|
||||
func metadataFromAnnotations(a map[string]string) *image.UpdateMetadata {
|
||||
if a == nil {
|
||||
a = map[string]string{}
|
||||
}
|
||||
return &image.UpdateMetadata{
|
||||
Version: a[AnnotVersion],
|
||||
Channel: a[AnnotChannel],
|
||||
MinCompatibleVersion: a[AnnotMinVersion],
|
||||
Architecture: a[AnnotArch],
|
||||
ReleaseNotes: a[AnnotReleaseNote],
|
||||
ReleaseDate: a[AnnotReleaseDate],
|
||||
}
|
||||
}
|
||||
|
||||
// ErrNoManifestForArch is returned from FetchMetadata/Pull when an index has
// no entry matching the running architecture. Exposed so callers can
// distinguish "registry unreachable" from "this build doesn't ship for us".
//
// NOTE(review): at the time of review, resolveArchManifest builds this case's
// error with a plain fmt.Errorf and does not wrap this sentinel, so
// errors.Is(err, ErrNoManifestForArch) will not match — confirm and wire the
// sentinel into resolveArchManifest.
var ErrNoManifestForArch = errors.New("no manifest in index for runtime architecture")
|
||||
377
update/pkg/oci/oci_test.go
Normal file
377
update/pkg/oci/oci_test.go
Normal file
@@ -0,0 +1,377 @@
|
||||
package oci
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/opencontainers/go-digest"
|
||||
specs "github.com/opencontainers/image-spec/specs-go"
|
||||
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
|
||||
)
|
||||
|
||||
// fakeRegistry implements the minimum OCI distribution-spec surface our
// Client touches: /v2/ probe, manifest fetch by tag or digest, blob fetch
// by digest. Backed by an in-memory blob+manifest store.
//
// NOTE(review): the maps are not mutex-guarded; seed all content before
// exercising the server — confirm no test mutates the store concurrently
// with requests.
type fakeRegistry struct {
	t          *testing.T
	srv        *httptest.Server
	blobs      map[digest.Digest][]byte // keyed by digest
	manifests  map[string][]byte        // keyed by digest string (raw form)
	tags       map[string]digest.Digest // tag -> manifest digest
	mediaTypes map[digest.Digest]string // descriptor.MediaType per stored object
}
|
||||
|
||||
func newFakeRegistry(t *testing.T) *fakeRegistry {
|
||||
t.Helper()
|
||||
r := &fakeRegistry{
|
||||
t: t,
|
||||
blobs: map[digest.Digest][]byte{},
|
||||
manifests: map[string][]byte{},
|
||||
tags: map[string]digest.Digest{},
|
||||
mediaTypes: map[digest.Digest]string{},
|
||||
}
|
||||
r.srv = httptest.NewServer(http.HandlerFunc(r.handle))
|
||||
t.Cleanup(r.srv.Close)
|
||||
return r
|
||||
}
|
||||
|
||||
func (r *fakeRegistry) putBlob(media string, data []byte) digest.Digest {
|
||||
h := sha256.Sum256(data)
|
||||
d := digest.NewDigestFromBytes(digest.SHA256, h[:])
|
||||
r.blobs[d] = data
|
||||
r.mediaTypes[d] = media
|
||||
return d
|
||||
}
|
||||
|
||||
// putManifest stores a manifest/index document under both its digest and the
|
||||
// given tag, returning the digest the caller can embed in indexes.
|
||||
func (r *fakeRegistry) putManifest(tag string, media string, doc []byte) digest.Digest {
|
||||
h := sha256.Sum256(doc)
|
||||
d := digest.NewDigestFromBytes(digest.SHA256, h[:])
|
||||
r.manifests[d.String()] = doc
|
||||
r.mediaTypes[d] = media
|
||||
if tag != "" {
|
||||
r.tags[tag] = d
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
// repoRef returns the "host:port/repo" string for use with NewClient.
// The httptest server URL always parses, so the url.Parse error is ignored.
func (r *fakeRegistry) repoRef() string {
	u, _ := url.Parse(r.srv.URL)
	return u.Host + "/test/kubesolo-os"
}
|
||||
|
||||
// handle routes fake-registry requests. It implements just enough of the
// OCI distribution spec for the Client under test.
func (r *fakeRegistry) handle(w http.ResponseWriter, req *http.Request) {
	// Routes we implement:
	//   GET  /v2/                                           -> 200 "{}"
	//   GET  /v2/test/kubesolo-os/manifests/<tag-or-digest>  -> manifest
	//   HEAD same                                            -> same headers, no body
	//   GET  /v2/test/kubesolo-os/blobs/<digest>             -> blob

	path := req.URL.Path
	if path == "/v2/" || path == "/v2" {
		// API version probe; clients use this to detect a v2 registry.
		w.Header().Set("Docker-Distribution-API-Version", "registry/2.0")
		w.WriteHeader(http.StatusOK)
		_, _ = io.WriteString(w, "{}")
		return
	}

	const prefix = "/v2/test/kubesolo-os/"
	if !strings.HasPrefix(path, prefix) {
		http.NotFound(w, req)
		return
	}
	rest := strings.TrimPrefix(path, prefix)

	switch {
	case strings.HasPrefix(rest, "manifests/"):
		// Manifest lookup: try the tag table first, then fall back to
		// treating the reference as a raw digest string.
		ref := strings.TrimPrefix(rest, "manifests/")
		var d digest.Digest
		var data []byte
		if td, ok := r.tags[ref]; ok {
			d = td
			data = r.manifests[d.String()]
		} else if md, ok := r.manifests[ref]; ok {
			d = digest.Digest(ref)
			data = md
		} else {
			http.NotFound(w, req)
			return
		}
		// Headers must be present for both GET and HEAD; HEAD skips the body.
		media := r.mediaTypes[d]
		w.Header().Set("Content-Type", media)
		w.Header().Set("Docker-Content-Digest", d.String())
		w.Header().Set("Content-Length", fmt.Sprintf("%d", len(data)))
		if req.Method == http.MethodHead {
			return
		}
		_, _ = w.Write(data)

	case strings.HasPrefix(rest, "blobs/"):
		// Blob lookup is by digest only (per the distribution spec).
		ref := strings.TrimPrefix(rest, "blobs/")
		d := digest.Digest(ref)
		blob, ok := r.blobs[d]
		if !ok {
			http.NotFound(w, req)
			return
		}
		media := r.mediaTypes[d]
		if media == "" {
			media = "application/octet-stream"
		}
		w.Header().Set("Content-Type", media)
		w.Header().Set("Docker-Content-Digest", d.String())
		w.Header().Set("Content-Length", fmt.Sprintf("%d", len(blob)))
		if req.Method == http.MethodHead {
			return
		}
		_, _ = w.Write(blob)

	default:
		http.NotFound(w, req)
	}
}
|
||||
|
||||
// seedSingleArchManifest puts kernel+initramfs blobs and a manifest with the
|
||||
// given annotations into the registry, tagged as `tag`.
|
||||
func (r *fakeRegistry) seedSingleArchManifest(t *testing.T, tag string, annot map[string]string) (kernelData, initramfsData []byte) {
|
||||
t.Helper()
|
||||
kernelData = []byte("FAKE-KERNEL-" + tag)
|
||||
initramfsData = []byte("FAKE-INITRAMFS-" + tag)
|
||||
|
||||
kd := r.putBlob(MediaKernel, kernelData)
|
||||
id := r.putBlob(MediaInitramfs, initramfsData)
|
||||
|
||||
// An empty config blob with sha256 of "{}" (the canonical "empty" body
|
||||
// per OCI). We don't actually fetch the config so any valid descriptor
|
||||
// works for the tests, but the digest still has to be syntactically valid.
|
||||
emptyConfigBody := []byte("{}")
|
||||
emptyConfigDigest := r.putBlob("application/vnd.oci.empty.v1+json", emptyConfigBody)
|
||||
|
||||
manifest := ocispec.Manifest{
|
||||
Versioned: specs.Versioned{SchemaVersion: 2},
|
||||
MediaType: ocispec.MediaTypeImageManifest,
|
||||
Config: ocispec.Descriptor{
|
||||
MediaType: "application/vnd.oci.empty.v1+json",
|
||||
Size: int64(len(emptyConfigBody)),
|
||||
Digest: emptyConfigDigest,
|
||||
},
|
||||
Layers: []ocispec.Descriptor{
|
||||
{MediaType: MediaKernel, Digest: kd, Size: int64(len(kernelData))},
|
||||
{MediaType: MediaInitramfs, Digest: id, Size: int64(len(initramfsData))},
|
||||
},
|
||||
Annotations: annot,
|
||||
}
|
||||
manifestBytes, err := json.Marshal(manifest)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal manifest: %v", err)
|
||||
}
|
||||
r.putManifest(tag, ocispec.MediaTypeImageManifest, manifestBytes)
|
||||
return
|
||||
}
|
||||
|
||||
// seedIndex creates a manifest index pointing at per-arch manifests created
|
||||
// via seedSingleArchManifest with arch-suffixed tags, then publishes the
|
||||
// index under `tag`.
|
||||
func (r *fakeRegistry) seedIndex(t *testing.T, tag string, perArchAnnots map[string]map[string]string) {
|
||||
t.Helper()
|
||||
var descriptors []ocispec.Descriptor
|
||||
for arch, annot := range perArchAnnots {
|
||||
// Reuse seedSingleArchManifest but under an internal arch-suffixed tag
|
||||
archTag := tag + "-" + arch
|
||||
r.seedSingleArchManifest(t, archTag, annot)
|
||||
d := r.tags[archTag]
|
||||
descriptors = append(descriptors, ocispec.Descriptor{
|
||||
MediaType: ocispec.MediaTypeImageManifest,
|
||||
Digest: d,
|
||||
Size: int64(len(r.manifests[d.String()])),
|
||||
Platform: &ocispec.Platform{Architecture: arch, OS: "linux"},
|
||||
})
|
||||
}
|
||||
index := ocispec.Index{
|
||||
Versioned: specs.Versioned{SchemaVersion: 2},
|
||||
MediaType: ocispec.MediaTypeImageIndex,
|
||||
Manifests: descriptors,
|
||||
}
|
||||
indexBytes, _ := json.Marshal(index)
|
||||
r.putManifest(tag, ocispec.MediaTypeImageIndex, indexBytes)
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestFetchMetadataSingleArchManifest(t *testing.T) {
|
||||
reg := newFakeRegistry(t)
|
||||
reg.seedSingleArchManifest(t, "v0.3.0", map[string]string{
|
||||
AnnotVersion: "v0.3.0",
|
||||
AnnotChannel: "stable",
|
||||
AnnotArch: "amd64",
|
||||
})
|
||||
|
||||
c, err := NewClient(reg.repoRef())
|
||||
if err != nil {
|
||||
t.Fatalf("NewClient: %v", err)
|
||||
}
|
||||
c.WithPlainHTTP(true)
|
||||
c.Arch = "amd64"
|
||||
|
||||
meta, err := c.FetchMetadata(context.Background(), "v0.3.0")
|
||||
if err != nil {
|
||||
t.Fatalf("FetchMetadata: %v", err)
|
||||
}
|
||||
if meta.Version != "v0.3.0" {
|
||||
t.Errorf("version: got %q, want v0.3.0", meta.Version)
|
||||
}
|
||||
if meta.Channel != "stable" {
|
||||
t.Errorf("channel: got %q", meta.Channel)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFetchMetadataIndexSelectsArch(t *testing.T) {
|
||||
reg := newFakeRegistry(t)
|
||||
reg.seedIndex(t, "stable", map[string]map[string]string{
|
||||
"amd64": {AnnotVersion: "v0.3.0", AnnotChannel: "stable", AnnotArch: "amd64"},
|
||||
"arm64": {AnnotVersion: "v0.3.0", AnnotChannel: "stable", AnnotArch: "arm64"},
|
||||
})
|
||||
|
||||
for _, arch := range []string{"amd64", "arm64"} {
|
||||
t.Run(arch, func(t *testing.T) {
|
||||
c, err := NewClient(reg.repoRef())
|
||||
if err != nil {
|
||||
t.Fatalf("NewClient: %v", err)
|
||||
}
|
||||
c.WithPlainHTTP(true)
|
||||
c.Arch = arch
|
||||
|
||||
meta, err := c.FetchMetadata(context.Background(), "stable")
|
||||
if err != nil {
|
||||
t.Fatalf("FetchMetadata: %v", err)
|
||||
}
|
||||
if meta.Architecture != arch {
|
||||
t.Errorf("arch annotation: got %q, want %q", meta.Architecture, arch)
|
||||
}
|
||||
if meta.Version != "v0.3.0" {
|
||||
t.Errorf("version: got %q, want v0.3.0", meta.Version)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFetchMetadataIndexMissingArchErrors(t *testing.T) {
|
||||
reg := newFakeRegistry(t)
|
||||
reg.seedIndex(t, "stable", map[string]map[string]string{
|
||||
"amd64": {AnnotVersion: "v0.3.0", AnnotArch: "amd64"},
|
||||
})
|
||||
|
||||
c, _ := NewClient(reg.repoRef())
|
||||
c.WithPlainHTTP(true)
|
||||
c.Arch = "arm64" // not in the index
|
||||
|
||||
_, err := c.FetchMetadata(context.Background(), "stable")
|
||||
if err == nil {
|
||||
t.Fatal("expected error for missing arch, got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "arm64") {
|
||||
t.Errorf("expected error mentioning arm64, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFetchMetadataSingleArchManifestRejectsCrossArch(t *testing.T) {
|
||||
// If the manifest declares an arch via annotation and it doesn't match
|
||||
// our runtime, Pull should refuse — defense in depth on top of the
|
||||
// channel/version gates in cmd/apply.go.
|
||||
reg := newFakeRegistry(t)
|
||||
reg.seedSingleArchManifest(t, "v0.3.0-arm64", map[string]string{
|
||||
AnnotArch: "arm64",
|
||||
})
|
||||
|
||||
c, _ := NewClient(reg.repoRef())
|
||||
c.WithPlainHTTP(true)
|
||||
c.Arch = "amd64"
|
||||
|
||||
_, err := c.FetchMetadata(context.Background(), "v0.3.0-arm64")
|
||||
if err == nil {
|
||||
t.Fatal("expected error pulling cross-arch single-arch manifest, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestPullDownloadsBlobsAndVerifiesDigest(t *testing.T) {
|
||||
reg := newFakeRegistry(t)
|
||||
kernelData, initramfsData := reg.seedSingleArchManifest(t, "v0.3.0",
|
||||
map[string]string{AnnotVersion: "v0.3.0", AnnotArch: "amd64"})
|
||||
|
||||
c, _ := NewClient(reg.repoRef())
|
||||
c.WithPlainHTTP(true)
|
||||
c.Arch = "amd64"
|
||||
|
||||
stageDir := filepath.Join(t.TempDir(), "stage")
|
||||
staged, meta, err := c.Pull(context.Background(), "v0.3.0", stageDir)
|
||||
if err != nil {
|
||||
t.Fatalf("Pull: %v", err)
|
||||
}
|
||||
if meta.Version != "v0.3.0" {
|
||||
t.Errorf("meta version: got %q", meta.Version)
|
||||
}
|
||||
if staged.Version != "v0.3.0" {
|
||||
t.Errorf("staged version: got %q", staged.Version)
|
||||
}
|
||||
|
||||
gotKernel, err := os.ReadFile(staged.VmlinuzPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read kernel: %v", err)
|
||||
}
|
||||
if string(gotKernel) != string(kernelData) {
|
||||
t.Errorf("kernel mismatch:\n got %q\nwant %q", gotKernel, kernelData)
|
||||
}
|
||||
gotInit, err := os.ReadFile(staged.InitramfsPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read initramfs: %v", err)
|
||||
}
|
||||
if string(gotInit) != string(initramfsData) {
|
||||
t.Errorf("initramfs mismatch")
|
||||
}
|
||||
}
|
||||
|
||||
func TestPullRejectsTamperedBlob(t *testing.T) {
|
||||
// Mutate the kernel blob after it's been digested into the manifest.
|
||||
// Pull should refuse with a digest mismatch.
|
||||
reg := newFakeRegistry(t)
|
||||
_, _ = reg.seedSingleArchManifest(t, "v0.3.0",
|
||||
map[string]string{AnnotVersion: "v0.3.0", AnnotArch: "amd64"})
|
||||
|
||||
// Corrupt every stored kernel blob in the registry by replacing its body.
|
||||
for d, m := range reg.mediaTypes {
|
||||
if m == MediaKernel {
|
||||
reg.blobs[d] = []byte("TAMPERED-KERNEL-WRONG-LENGTH-AND-DIGEST")
|
||||
}
|
||||
}
|
||||
|
||||
c, _ := NewClient(reg.repoRef())
|
||||
c.WithPlainHTTP(true)
|
||||
c.Arch = "amd64"
|
||||
|
||||
_, _, err := c.Pull(context.Background(), "v0.3.0", filepath.Join(t.TempDir(), "stage"))
|
||||
if err == nil {
|
||||
t.Fatal("expected digest mismatch error on tampered blob, got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "mismatch") {
|
||||
t.Errorf("expected mismatch in error, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewClientRejectsGarbageReference(t *testing.T) {
|
||||
_, err := NewClient("not a valid reference")
|
||||
if err == nil {
|
||||
t.Error("expected error on bad reference, got nil")
|
||||
}
|
||||
}
|
||||
34
update/pkg/partition/freespace.go
Normal file
34
update/pkg/partition/freespace.go
Normal file
@@ -0,0 +1,34 @@
|
||||
package partition
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// FreeBytes returns the number of free bytes available on the filesystem
// containing `path`. Uses statfs(2); path must exist and be readable.
func FreeBytes(path string) (uint64, error) {
	var fs syscall.Statfs_t
	err := syscall.Statfs(path, &fs)
	if err != nil {
		return 0, fmt.Errorf("statfs %s: %w", path, err)
	}
	// Bavail counts the blocks available to non-root users — the same
	// figure `df` reports — and Bsize is the block size in bytes.
	//nolint:unconvert // Bavail is uint64 on most platforms but int64 on darwin/freebsd
	free := uint64(fs.Bavail) * uint64(fs.Bsize)
	return free, nil
}
|
||||
|
||||
// HasFreeSpaceFor reports whether `path`'s filesystem has at least `wantBytes`
|
||||
// of free space, with `headroomPct` reserved (e.g. 10 = require 110% of want).
|
||||
// Returns the available bytes alongside, so callers can render a useful error.
|
||||
func HasFreeSpaceFor(path string, wantBytes int64, headroomPct int) (avail uint64, ok bool, err error) {
|
||||
avail, err = FreeBytes(path)
|
||||
if err != nil {
|
||||
return 0, false, err
|
||||
}
|
||||
if wantBytes < 0 {
|
||||
return avail, false, fmt.Errorf("invalid wantBytes %d", wantBytes)
|
||||
}
|
||||
required := uint64(wantBytes) * uint64(100+headroomPct) / 100
|
||||
return avail, avail >= required, nil
|
||||
}
|
||||
44
update/pkg/partition/freespace_test.go
Normal file
44
update/pkg/partition/freespace_test.go
Normal file
@@ -0,0 +1,44 @@
|
||||
package partition
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestFreeBytesReturnsNonZeroOnTempDir(t *testing.T) {
|
||||
b, err := FreeBytes(t.TempDir())
|
||||
if err != nil {
|
||||
t.Fatalf("FreeBytes: %v", err)
|
||||
}
|
||||
// On any sane test runner the temp filesystem has more than 1 KiB free.
|
||||
if b < 1024 {
|
||||
t.Errorf("FreeBytes = %d, want > 1024 on /tmp", b)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFreeBytesNonExistentPath(t *testing.T) {
|
||||
_, err := FreeBytes("/this/path/does/not/exist/at/all")
|
||||
if err == nil {
|
||||
t.Error("expected error for missing path, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestHasFreeSpaceForRejectsHugeRequest(t *testing.T) {
|
||||
// Request 1 PiB with 10% headroom on /tmp — no test runner has that
|
||||
// much free, so this should consistently report not-enough.
|
||||
avail, ok, err := HasFreeSpaceFor(t.TempDir(), 1<<50, 10)
|
||||
if err != nil {
|
||||
t.Fatalf("HasFreeSpaceFor: %v", err)
|
||||
}
|
||||
if ok {
|
||||
t.Errorf("expected insufficient space for 1PiB, got avail=%d ok=true", avail)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHasFreeSpaceForAcceptsSmallRequest(t *testing.T) {
|
||||
// 1 KiB with 10% headroom = 1.1 KiB. Any temp dir has this.
|
||||
_, ok, err := HasFreeSpaceFor(t.TempDir(), 1024, 10)
|
||||
if err != nil {
|
||||
t.Fatalf("HasFreeSpaceFor: %v", err)
|
||||
}
|
||||
if !ok {
|
||||
t.Error("expected sufficient space for 1KiB on /tmp")
|
||||
}
|
||||
}
|
||||
206
update/pkg/state/state.go
Normal file
206
update/pkg/state/state.go
Normal file
@@ -0,0 +1,206 @@
|
||||
// Package state tracks the lifecycle of an OS update on disk.
|
||||
//
|
||||
// The state file (default /var/lib/kubesolo/update/state.json) records which
|
||||
// phase the agent is in, what versions are involved, when the attempt started,
|
||||
// any error from the last operation, and how many attempts have been made.
|
||||
// Updates are atomic via tmp+rename, so a crash mid-write doesn't corrupt the
|
||||
// state.
|
||||
//
|
||||
// Consumers:
|
||||
// - cmd/check, cmd/apply, cmd/activate, cmd/healthcheck, cmd/rollback —
|
||||
// transition the phase as they enter / leave their operations.
|
||||
// - cmd/status --json — emits the raw state for orchestration tooling.
|
||||
// - pkg/metrics — reads the state at scrape time to expose phase and
|
||||
// attempt-count gauges.
|
||||
package state
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
)
|
||||
|
||||
// DefaultPath is where state.json lives on a live system. The directory is on
// the persistent data partition so the file survives A/B slot switches.
const DefaultPath = "/var/lib/kubesolo/update/state.json"

// Phase represents the current step in the update lifecycle.
//
// Terminal phases (Success, RolledBack, Failed) describe the outcome of the
// most recent attempt; transient phases (Checking, Downloading, Staged,
// Activated, Verifying) describe in-progress work. Idle means no update has
// been attempted yet, or the previous attempt has been acknowledged.
//
// Phase values are serialized as plain strings inside state.json (see
// UpdateState.Phase), so renaming a constant is a file-format change.
type Phase string

const (
	// PhaseIdle — no update in progress.
	PhaseIdle Phase = "idle"
	// PhaseChecking — querying the update server for new versions.
	PhaseChecking Phase = "checking"
	// PhaseDownloading — pulling artifacts from the server.
	PhaseDownloading Phase = "downloading"
	// PhaseStaged — artifacts written to the passive partition; not yet active.
	PhaseStaged Phase = "staged"
	// PhaseActivated — passive slot promoted; next boot will use the new version.
	PhaseActivated Phase = "activated"
	// PhaseVerifying — post-boot healthcheck in progress on the new version.
	PhaseVerifying Phase = "verifying"
	// PhaseSuccess — last attempt completed and verified.
	PhaseSuccess Phase = "success"
	// PhaseRolledBack — last attempt failed verification; reverted to prior slot.
	PhaseRolledBack Phase = "rolled_back"
	// PhaseFailed — last attempt failed before reaching activation (download,
	// checksum, signature, etc.). System still on the original slot.
	PhaseFailed Phase = "failed"
)

// validPhases lists every legal Phase value. Anything not in this set is
// rejected by Save() to catch typos. Keep this in sync with the const block
// above when adding a phase.
var validPhases = map[Phase]struct{}{
	PhaseIdle:        {},
	PhaseChecking:    {},
	PhaseDownloading: {},
	PhaseStaged:      {},
	PhaseActivated:   {},
	PhaseVerifying:   {},
	PhaseSuccess:     {},
	PhaseRolledBack:  {},
	PhaseFailed:      {},
}
|
||||
|
||||
// UpdateState is the on-disk representation. Fields use JSON tags so the
// file format is forward-compatible (extra fields ignored, missing fields
// default).
type UpdateState struct {
	// Phase is the current lifecycle position.
	Phase Phase `json:"phase"`
	// FromVersion is the version the system was running before the attempt.
	// Empty when no attempt has run.
	FromVersion string `json:"from_version,omitempty"`
	// ToVersion is the version the attempt is targeting.
	// Empty when no attempt has run.
	ToVersion string `json:"to_version,omitempty"`
	// StartedAt is when the current attempt entered a non-Idle phase.
	StartedAt time.Time `json:"started_at,omitempty"`
	// UpdatedAt is the last time the file was written. Always set on Save().
	UpdatedAt time.Time `json:"updated_at"`
	// LastError carries the most recent operation error, populated when
	// transitioning to PhaseFailed or PhaseRolledBack. Cleared on Success/Idle.
	LastError string `json:"last_error,omitempty"`
	// AttemptCount counts attempts at the current ToVersion. Reset when
	// ToVersion changes or on successful completion.
	AttemptCount int `json:"attempt_count"`

	// HealthCheckFailures counts consecutive post-Activated healthcheck
	// failures. Reset to 0 on a successful healthcheck or after a rollback.
	// Used by `kubesolo-update healthcheck --auto-rollback-after N` to
	// trigger automatic recovery on a wedged new boot.
	HealthCheckFailures int `json:"health_check_failures,omitempty"`
}
|
||||
|
||||
// New returns a fresh Idle state with UpdatedAt set to now.
|
||||
func New() *UpdateState {
|
||||
return &UpdateState{
|
||||
Phase: PhaseIdle,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
}
|
||||
}
|
||||
|
||||
// Load reads the state from disk. If the file does not exist, returns a fresh
|
||||
// Idle state — this is the normal first-run case, not an error.
|
||||
func Load(path string) (*UpdateState, error) {
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return New(), nil
|
||||
}
|
||||
return nil, fmt.Errorf("read state %s: %w", path, err)
|
||||
}
|
||||
var s UpdateState
|
||||
if err := json.Unmarshal(data, &s); err != nil {
|
||||
return nil, fmt.Errorf("parse state %s: %w", path, err)
|
||||
}
|
||||
return &s, nil
|
||||
}
|
||||
|
||||
// Save writes the state to disk atomically (tmp file + rename), so an
|
||||
// interrupted write never leaves a partial file at `path`.
|
||||
func (s *UpdateState) Save(path string) error {
|
||||
if _, ok := validPhases[s.Phase]; !ok {
|
||||
return fmt.Errorf("invalid phase %q", s.Phase)
|
||||
}
|
||||
s.UpdatedAt = time.Now().UTC()
|
||||
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
|
||||
return fmt.Errorf("creating state dir: %w", err)
|
||||
}
|
||||
|
||||
data, err := json.MarshalIndent(s, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal state: %w", err)
|
||||
}
|
||||
data = append(data, '\n')
|
||||
|
||||
tmp := path + ".tmp"
|
||||
if err := os.WriteFile(tmp, data, 0o644); err != nil {
|
||||
return fmt.Errorf("write tmp state: %w", err)
|
||||
}
|
||||
if err := os.Rename(tmp, path); err != nil {
|
||||
_ = os.Remove(tmp)
|
||||
return fmt.Errorf("rename state: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Transition moves the state to phase `next` and persists it. If `next`
|
||||
// targets a new ToVersion (different from the current one), AttemptCount is
|
||||
// reset to 1; otherwise it is left untouched. StartedAt is set when
|
||||
// transitioning out of Idle. LastError is cleared unless `next` is Failed or
|
||||
// RolledBack.
|
||||
func (s *UpdateState) Transition(path string, next Phase, toVersion, errMsg string) error {
|
||||
now := time.Now().UTC()
|
||||
|
||||
// Reset attempt counter when targeting a new version.
|
||||
if toVersion != "" && toVersion != s.ToVersion {
|
||||
s.ToVersion = toVersion
|
||||
s.AttemptCount = 0
|
||||
}
|
||||
|
||||
// First non-Idle phase of an attempt: record start time and bump count.
|
||||
if s.Phase == PhaseIdle && next != PhaseIdle {
|
||||
s.StartedAt = now
|
||||
s.AttemptCount++
|
||||
}
|
||||
|
||||
s.Phase = next
|
||||
switch next {
|
||||
case PhaseFailed, PhaseRolledBack:
|
||||
if errMsg != "" {
|
||||
s.LastError = errMsg
|
||||
}
|
||||
case PhaseSuccess, PhaseIdle:
|
||||
s.LastError = ""
|
||||
}
|
||||
|
||||
return s.Save(path)
|
||||
}
|
||||
|
||||
// RecordError marks the state as failed with the given error and saves.
|
||||
// Convenience wrapper around Transition for the most common failure path.
|
||||
func (s *UpdateState) RecordError(path string, err error) error {
|
||||
msg := ""
|
||||
if err != nil {
|
||||
msg = err.Error()
|
||||
}
|
||||
return s.Transition(path, PhaseFailed, "", msg)
|
||||
}
|
||||
|
||||
// SetFromVersion records the version the system was running when an attempt
|
||||
// started. Idempotent; only takes effect when From is empty.
|
||||
func (s *UpdateState) SetFromVersion(v string) {
|
||||
if s.FromVersion == "" {
|
||||
s.FromVersion = v
|
||||
}
|
||||
}
|
||||
197
update/pkg/state/state_test.go
Normal file
197
update/pkg/state/state_test.go
Normal file
@@ -0,0 +1,197 @@
|
||||
package state
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// statePath returns a per-test state file path inside t.TempDir().
func statePath(t *testing.T) string {
	t.Helper()
	dir := t.TempDir()
	return filepath.Join(dir, "state.json")
}
|
||||
|
||||
func TestLoadMissingReturnsIdle(t *testing.T) {
|
||||
s, err := Load(filepath.Join(t.TempDir(), "does-not-exist.json"))
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error loading missing state: %v", err)
|
||||
}
|
||||
if s.Phase != PhaseIdle {
|
||||
t.Errorf("missing file: phase=%q, want %q", s.Phase, PhaseIdle)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSaveLoadRoundTrip(t *testing.T) {
|
||||
path := statePath(t)
|
||||
in := &UpdateState{
|
||||
Phase: PhaseStaged,
|
||||
FromVersion: "v0.2.0",
|
||||
ToVersion: "v0.3.0",
|
||||
AttemptCount: 1,
|
||||
}
|
||||
if err := in.Save(path); err != nil {
|
||||
t.Fatalf("save: %v", err)
|
||||
}
|
||||
out, err := Load(path)
|
||||
if err != nil {
|
||||
t.Fatalf("load: %v", err)
|
||||
}
|
||||
if out.Phase != in.Phase {
|
||||
t.Errorf("phase: got %q, want %q", out.Phase, in.Phase)
|
||||
}
|
||||
if out.FromVersion != in.FromVersion {
|
||||
t.Errorf("from_version: got %q, want %q", out.FromVersion, in.FromVersion)
|
||||
}
|
||||
if out.ToVersion != in.ToVersion {
|
||||
t.Errorf("to_version: got %q, want %q", out.ToVersion, in.ToVersion)
|
||||
}
|
||||
if out.AttemptCount != in.AttemptCount {
|
||||
t.Errorf("attempt_count: got %d, want %d", out.AttemptCount, in.AttemptCount)
|
||||
}
|
||||
if out.UpdatedAt.IsZero() {
|
||||
t.Error("UpdatedAt should be set by Save")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSaveRejectsInvalidPhase(t *testing.T) {
|
||||
s := &UpdateState{Phase: Phase("bogus")}
|
||||
err := s.Save(statePath(t))
|
||||
if err == nil {
|
||||
t.Fatal("expected error saving invalid phase, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSaveIsAtomic(t *testing.T) {
|
||||
// After Save, the .tmp file should NOT exist — confirming we renamed it.
|
||||
path := statePath(t)
|
||||
s := New()
|
||||
if err := s.Save(path); err != nil {
|
||||
t.Fatalf("save: %v", err)
|
||||
}
|
||||
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||
t.Errorf("tmp file still present after Save: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSaveCreatesDirectory(t *testing.T) {
|
||||
// State directory may not exist yet (first-ever boot). Save() should mkdir.
|
||||
dir := filepath.Join(t.TempDir(), "fresh", "subdir")
|
||||
path := filepath.Join(dir, "state.json")
|
||||
if err := New().Save(path); err != nil {
|
||||
t.Fatalf("save into nonexistent dir: %v", err)
|
||||
}
|
||||
if _, err := os.Stat(path); err != nil {
|
||||
t.Errorf("state file not present after Save: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTransitionIdleToChecking(t *testing.T) {
|
||||
path := statePath(t)
|
||||
s := New()
|
||||
if err := s.Transition(path, PhaseChecking, "v0.3.0", ""); err != nil {
|
||||
t.Fatalf("transition: %v", err)
|
||||
}
|
||||
if s.Phase != PhaseChecking {
|
||||
t.Errorf("phase: got %q, want %q", s.Phase, PhaseChecking)
|
||||
}
|
||||
if s.ToVersion != "v0.3.0" {
|
||||
t.Errorf("to_version: got %q, want v0.3.0", s.ToVersion)
|
||||
}
|
||||
if s.AttemptCount != 1 {
|
||||
t.Errorf("attempt_count: got %d, want 1 (first attempt after Idle)", s.AttemptCount)
|
||||
}
|
||||
if s.StartedAt.IsZero() {
|
||||
t.Error("StartedAt should be set when leaving Idle")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTransitionRetainsAttemptCountWithinAttempt(t *testing.T) {
|
||||
path := statePath(t)
|
||||
s := New()
|
||||
_ = s.Transition(path, PhaseChecking, "v0.3.0", "")
|
||||
_ = s.Transition(path, PhaseDownloading, "v0.3.0", "")
|
||||
_ = s.Transition(path, PhaseStaged, "v0.3.0", "")
|
||||
if s.AttemptCount != 1 {
|
||||
t.Errorf("attempt_count after staying on same version: got %d, want 1", s.AttemptCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTransitionResetsAttemptCountOnNewVersion(t *testing.T) {
|
||||
path := statePath(t)
|
||||
s := New()
|
||||
_ = s.Transition(path, PhaseChecking, "v0.3.0", "")
|
||||
// Now an attempt at a NEW version starts. AttemptCount should reset.
|
||||
_ = s.Transition(path, PhaseChecking, "v0.4.0", "")
|
||||
if s.ToVersion != "v0.4.0" {
|
||||
t.Errorf("to_version: got %q, want v0.4.0", s.ToVersion)
|
||||
}
|
||||
if s.AttemptCount != 0 {
|
||||
t.Errorf("attempt_count after new ToVersion: got %d, want 0 (reset)", s.AttemptCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTransitionFailedRecordsError(t *testing.T) {
|
||||
path := statePath(t)
|
||||
s := New()
|
||||
_ = s.Transition(path, PhaseDownloading, "v0.3.0", "")
|
||||
_ = s.Transition(path, PhaseFailed, "v0.3.0", "checksum mismatch")
|
||||
if s.Phase != PhaseFailed {
|
||||
t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed)
|
||||
}
|
||||
if s.LastError != "checksum mismatch" {
|
||||
t.Errorf("last_error: got %q, want %q", s.LastError, "checksum mismatch")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTransitionSuccessClearsError(t *testing.T) {
|
||||
path := statePath(t)
|
||||
s := New()
|
||||
_ = s.Transition(path, PhaseFailed, "v0.3.0", "boom")
|
||||
if s.LastError == "" {
|
||||
t.Fatal("setup: LastError should be non-empty before success")
|
||||
}
|
||||
_ = s.Transition(path, PhaseSuccess, "v0.3.0", "")
|
||||
if s.LastError != "" {
|
||||
t.Errorf("last_error after success: got %q, want empty", s.LastError)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecordError(t *testing.T) {
|
||||
path := statePath(t)
|
||||
s := New()
|
||||
if err := s.RecordError(path, errors.New("network down")); err != nil {
|
||||
t.Fatalf("RecordError: %v", err)
|
||||
}
|
||||
if s.Phase != PhaseFailed {
|
||||
t.Errorf("phase: got %q, want %q", s.Phase, PhaseFailed)
|
||||
}
|
||||
if s.LastError != "network down" {
|
||||
t.Errorf("last_error: got %q, want %q", s.LastError, "network down")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSetFromVersionIdempotent(t *testing.T) {
|
||||
s := New()
|
||||
s.SetFromVersion("v0.2.0")
|
||||
if s.FromVersion != "v0.2.0" {
|
||||
t.Errorf("from_version: got %q, want v0.2.0", s.FromVersion)
|
||||
}
|
||||
// Second call should not overwrite.
|
||||
s.SetFromVersion("v0.1.0")
|
||||
if s.FromVersion != "v0.2.0" {
|
||||
t.Errorf("from_version after second SetFromVersion: got %q, want v0.2.0 (immutable)", s.FromVersion)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadHandlesGarbageFile(t *testing.T) {
|
||||
path := statePath(t)
|
||||
if err := os.WriteFile(path, []byte("not json"), 0o644); err != nil {
|
||||
t.Fatalf("seed: %v", err)
|
||||
}
|
||||
_, err := Load(path)
|
||||
if err == nil {
|
||||
t.Error("expected error loading garbage, got nil")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user