Implement atomic OS updates via A/B partition scheme with automatic rollback.

GRUB bootloader manages slot selection with a 3-attempt boot counter that
auto-rolls back on repeated health check failures.

GRUB boot config:
- A/B slot selection with boot_counter/boot_success env vars
- Automatic rollback when counter reaches 0 (3 failed boots)
- Debug, emergency shell, and manual slot-switch menu entries

Disk image (refactored):
- 4-partition GPT layout: EFI + System A + System B + Data
- GRUB EFI/BIOS installation with graceful fallbacks
- Both system partitions populated during image creation

Update agent (Go, zero external deps):
- pkg/grubenv: read/write GRUB env vars (grub-editenv + manual fallback)
- pkg/partition: find/mount/write system partitions by label
- pkg/image: HTTP download with SHA256 verification
- pkg/health: post-boot checks (containerd, API server, node Ready)
- 6 CLI commands: check, apply, activate, rollback, healthcheck, status
- 37 unit tests across all 4 packages

Deployment:
- K8s CronJob for automatic update checks (every 6 hours)
- ConfigMap for update server URL
- Health check Job for post-boot verification

Build pipeline:
- build-update-agent.sh compiles static Linux binary (~5.9 MB)
- inject-kubesolo.sh includes update agent in initramfs
- Makefile: build-update-agent, test-update-agent, test-update targets

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
151 lines
4.6 KiB
YAML
151 lines
4.6 KiB
YAML
---
# KubeSolo OS — Automatic Update CronJob
#
# This CronJob checks for OS updates every 6 hours, downloads them, and
# writes them to the passive partition. It does NOT reboot — the
# administrator must trigger a reboot to apply the update.
#
# The update agent runs as a privileged container with host access because
# it needs to:
#   1. Read/write the GRUB environment (on the boot partition)
#   2. Mount and write to the system partitions
#   3. Access block devices via blkid
#
# The agent binary is not part of the container image: it is executed
# directly from the host filesystem, which is mounted at /host.
#
# Deploy:
#   kubectl apply -f update-cronjob.yaml
# Manual trigger:
#   kubectl create job --from=cronjob/kubesolo-update kubesolo-update-manual
#
apiVersion: batch/v1
kind: CronJob
metadata:
  name: kubesolo-update
  namespace: kube-system
  labels:
    app.kubernetes.io/name: kubesolo-update
    app.kubernetes.io/component: update-agent
    app.kubernetes.io/part-of: kubesolo-os
spec:
  schedule: "0 */6 * * *"  # Every 6 hours
  # Never start a new run while a previous one is still active.
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 5
  jobTemplate:
    spec:
      backoffLimit: 1
      activeDeadlineSeconds: 600  # 10 min max
      template:
        metadata:
          labels:
            app.kubernetes.io/name: kubesolo-update
        spec:
          restartPolicy: Never
          hostPID: false
          hostNetwork: false
          containers:
            - name: update
              # The image only supplies a minimal root filesystem; the
              # command below is the host-mounted agent binary. The tag is
              # pinned because ":latest" is mutable and makes the pull
              # policy default to Always, which would require registry
              # access on every run.
              image: busybox:1.36.1
              imagePullPolicy: IfNotPresent
              command:
                - /host/usr/lib/kubesolo-os/kubesolo-update
              args:
                - apply
                - --server
                # Expanded by the kubelet from the env entry below.
                - "$(UPDATE_SERVER_URL)"
              env:
                - name: UPDATE_SERVER_URL
                  valueFrom:
                    configMapKeyRef:
                      name: kubesolo-update-config
                      key: server-url
                      # Required: if the variable were left undefined,
                      # Kubernetes would pass the literal text
                      # "$(UPDATE_SERVER_URL)" as the server argument.
                      optional: false
              securityContext:
                privileged: true  # Required for mount/blkid access
              volumeMounts:
                - name: host-root
                  mountPath: /host
                  readOnly: false
                - name: dev
                  mountPath: /dev
                - name: boot
                  mountPath: /boot
          volumes:
            - name: host-root
              hostPath:
                path: /
                type: Directory
            - name: dev
              hostPath:
                path: /dev
                type: Directory
            - name: boot
              hostPath:
                path: /boot
                type: Directory
          tolerations:
            - operator: Exists  # Run on any node (there's only one)
|
|
---
|
|
# ConfigMap for the update server URL.
#
# The kubesolo-update CronJob reads the `server-url` key from this ConfigMap
# into its UPDATE_SERVER_URL environment variable.
#
# Create it to point to your update server:
#   kubectl -n kube-system create configmap kubesolo-update-config \
#     --from-literal=server-url=https://updates.example.com
#
# If it already exists (e.g. after applying this manifest), update it with:
#   kubectl -n kube-system patch configmap kubesolo-update-config \
#     --type merge -p '{"data":{"server-url":"https://updates.example.com"}}'
apiVersion: v1
kind: ConfigMap
metadata:
  name: kubesolo-update-config
  namespace: kube-system
  labels:
    app.kubernetes.io/name: kubesolo-update
    app.kubernetes.io/component: update-agent
    # Same part-of label as the CronJob and health-check Job, so label
    # selectors on part-of match every resource in this manifest.
    app.kubernetes.io/part-of: kubesolo-os
data:
  server-url: ""  # Set to your update server URL
|
|
---
|
|
# Post-boot health check — runs once as a Job.
#
# On KubeSolo OS, this is triggered by the init system (init stage or
# systemd-equivalent), but it can also be deployed as a K8s Job for
# environments where the init system doesn't run the health check.
#
# The Job runs the host-mounted agent binary with
# `healthcheck --timeout 120`.
apiVersion: batch/v1
kind: Job
metadata:
  name: kubesolo-healthcheck
  namespace: kube-system
  labels:
    app.kubernetes.io/name: kubesolo-healthcheck
    app.kubernetes.io/component: health-check
    app.kubernetes.io/part-of: kubesolo-os
spec:
  backoffLimit: 3
  activeDeadlineSeconds: 300  # 5 min max
  template:
    metadata:
      labels:
        app.kubernetes.io/name: kubesolo-healthcheck
    spec:
      restartPolicy: Never
      hostPID: false
      hostNetwork: true  # Needed to reach API server at 127.0.0.1:6443
      containers:
        - name: healthcheck
          # The image only supplies a minimal root filesystem; the command
          # below is the host-mounted agent binary. The tag is pinned
          # because ":latest" makes the pull policy default to Always, so
          # the health check could not start without registry access.
          image: busybox:1.36.1
          imagePullPolicy: IfNotPresent
          command:
            - /host/usr/lib/kubesolo-os/kubesolo-update
          args:
            - healthcheck
            - --timeout
            - "120"
          securityContext:
            privileged: true  # Required for grubenv write
          volumeMounts:
            - name: host-root
              mountPath: /host
              readOnly: false
            - name: boot
              mountPath: /boot
      volumes:
        - name: host-root
          hostPath:
            path: /
            type: Directory
        - name: boot
          hostPath:
            path: /boot
            type: Directory
      tolerations:
        - operator: Exists
|