feat: add A/B partition updates with GRUB and Go update agent (Phase 3)
Implement atomic OS updates via A/B partition scheme with automatic rollback.
GRUB bootloader manages slot selection with a 3-attempt boot counter that
auto-rolls back on repeated health check failures.

GRUB boot config:
- A/B slot selection with boot_counter/boot_success env vars
- Automatic rollback when counter reaches 0 (3 failed boots)
- Debug, emergency shell, and manual slot-switch menu entries

Disk image (refactored):
- 4-partition GPT layout: EFI + System A + System B + Data
- GRUB EFI/BIOS installation with graceful fallbacks
- Both system partitions populated during image creation

Update agent (Go, zero external deps):
- pkg/grubenv: read/write GRUB env vars (grub-editenv + manual fallback)
- pkg/partition: find/mount/write system partitions by label
- pkg/image: HTTP download with SHA256 verification
- pkg/health: post-boot checks (containerd, API server, node Ready)
- 6 CLI commands: check, apply, activate, rollback, healthcheck, status
- 37 unit tests across all 4 packages

Deployment:
- K8s CronJob for automatic update checks (every 6 hours)
- ConfigMap for update server URL
- Health check Job for post-boot verification

Build pipeline:
- build-update-agent.sh compiles static Linux binary (~5.9 MB)
- inject-kubesolo.sh includes update agent in initramfs
- Makefile: build-update-agent, test-update-agent, test-update targets

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
150
update/deploy/update-cronjob.yaml
Normal file
150
update/deploy/update-cronjob.yaml
Normal file
@@ -0,0 +1,150 @@
|
||||
# KubeSolo OS — Automatic Update CronJob
#
# This CronJob checks for OS updates every 6 hours, downloads them,
# and writes them to the passive partition. It does NOT reboot —
# the administrator must trigger a reboot to apply the update.
#
# The update agent runs as a privileged container with host access
# because it needs to:
#   1. Read/write GRUB environment (on boot partition)
#   2. Mount and write to system partitions
#   3. Access block devices via blkid
#
# Deploy: kubectl apply -f update-cronjob.yaml
# Manual trigger: kubectl create job --from=cronjob/kubesolo-update kubesolo-update-manual
#
apiVersion: batch/v1
kind: CronJob
metadata:
  name: kubesolo-update
  namespace: kube-system
  labels:
    app.kubernetes.io/name: kubesolo-update
    app.kubernetes.io/component: update-agent
    app.kubernetes.io/part-of: kubesolo-os
spec:
  schedule: "0 */6 * * *"  # Every 6 hours
  # Never start a new run while a previous one is still active.
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 5
  jobTemplate:
    spec:
      backoffLimit: 1
      activeDeadlineSeconds: 600  # 10 min max
      template:
        metadata:
          labels:
            app.kubernetes.io/name: kubesolo-update
        spec:
          restartPolicy: Never
          hostPID: false
          hostNetwork: false
          containers:
            - name: update
              # busybox only supplies the container filesystem; the command
              # below execs the update agent directly from the host root
              # mounted at /host (no shell is involved).
              image: busybox:latest
              # A ":latest" tag defaults to imagePullPolicy "Always", which
              # would require registry access on every scheduled run. Reuse
              # the locally cached image when one is present.
              imagePullPolicy: IfNotPresent
              command:
                - /host/usr/lib/kubesolo-os/kubesolo-update
              args:
                - apply
                - --server
                # $(VAR) is expanded by Kubernetes from the env entry below.
                - "$(UPDATE_SERVER_URL)"
              env:
                - name: UPDATE_SERVER_URL
                  valueFrom:
                    configMapKeyRef:
                      name: kubesolo-update-config
                      key: server-url
                      # The pod still starts if the ConfigMap or key is absent.
                      optional: true
              securityContext:
                privileged: true  # Required for mount/blkid access
              volumeMounts:
                - name: host-root
                  mountPath: /host
                  readOnly: false
                - name: dev
                  mountPath: /dev
                - name: boot
                  mountPath: /boot
          volumes:
            - name: host-root
              hostPath:
                path: /
                type: Directory
            - name: dev
              hostPath:
                path: /dev
                type: Directory
            - name: boot
              hostPath:
                path: /boot
                type: Directory
          tolerations:
            - operator: Exists  # Run on any node (there's only one)
|
||||
---
# Update server settings consumed by the kubesolo-update CronJob
# (key "server-url" is read into its UPDATE_SERVER_URL variable).
#
# Point it at your update server by creating/updating the ConfigMap:
#   kubectl -n kube-system create configmap kubesolo-update-config \
#     --from-literal=server-url=https://updates.example.com
apiVersion: v1
kind: ConfigMap
metadata:
  name: kubesolo-update-config
  namespace: kube-system
  labels:
    app.kubernetes.io/name: kubesolo-update
    app.kubernetes.io/component: update-agent
data:
  # Empty by default; set to your update server URL.
  server-url: ""
|
||||
---
# Post-boot health check — runs once at boot as a Job.
# On KubeSolo OS, this is triggered by the init system (init stage or
# systemd-equivalent), but it can also be deployed as a K8s Job for
# environments where the init system doesn't run the health check.
#
# NOTE(review): a Job object runs once; re-applying an already-completed
# Job does not run it again. Confirm how this Job is deleted/re-created
# on subsequent boots when it is used instead of the init-system hook.
apiVersion: batch/v1
kind: Job
metadata:
  name: kubesolo-healthcheck
  namespace: kube-system
  labels:
    app.kubernetes.io/name: kubesolo-healthcheck
    app.kubernetes.io/component: health-check
    app.kubernetes.io/part-of: kubesolo-os
spec:
  backoffLimit: 3
  activeDeadlineSeconds: 300  # 5 min max
  template:
    metadata:
      labels:
        app.kubernetes.io/name: kubesolo-healthcheck
    spec:
      restartPolicy: Never
      hostPID: false
      hostNetwork: true  # Needed to reach API server at 127.0.0.1:6443
      containers:
        - name: healthcheck
          # busybox only supplies the container filesystem; the agent binary
          # is exec'd directly from the host root mounted at /host.
          image: busybox:latest
          # A ":latest" tag defaults to imagePullPolicy "Always"; without
          # this, the post-boot check could not start whenever the registry
          # is unreachable. Reuse the locally cached image when present.
          imagePullPolicy: IfNotPresent
          command:
            - /host/usr/lib/kubesolo-os/kubesolo-update
          args:
            - healthcheck
            - --timeout
            - "120"
          securityContext:
            privileged: true  # Required for grubenv write
          volumeMounts:
            - name: host-root
              mountPath: /host
              readOnly: false
            - name: boot
              mountPath: /boot
      volumes:
        - name: host-root
          hostPath:
            path: /
            type: Directory
        - name: boot
          hostPath:
            path: /boot
            type: Directory
      tolerations:
        - operator: Exists
|
||||
Reference in New Issue
Block a user