diff --git a/.github/workflows/test-gpu.yml b/.github/workflows/test-gpu.yml index df953b5d3..6dd98b1cd 100644 --- a/.github/workflows/test-gpu.yml +++ b/.github/workflows/test-gpu.yml @@ -22,7 +22,7 @@ jobs: - id: get_pr_info if: github.event_name == 'push' continue-on-error: true - uses: nv-gha-runners/get-pr-info@main + uses: nv-gha-runners/get-pr-info@090577647b8ddc4e06e809e264f7881650ecdccf - id: gate shell: bash diff --git a/Cargo.lock b/Cargo.lock index e4057f75c..d347ff86c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3028,6 +3028,7 @@ dependencies = [ "openshell-prover", "openshell-providers", "openshell-tui", + "openshell-vm", "owo-colors", "prost-types", "rcgen", @@ -3288,6 +3289,7 @@ dependencies = [ "serde", "serde_json", "tar", + "tempfile", "thiserror 2.0.18", "tokio", "tokio-rustls", diff --git a/architecture/README.md b/architecture/README.md index 570fce660..45457d37c 100644 --- a/architecture/README.md +++ b/architecture/README.md @@ -301,4 +301,6 @@ This opens an interactive SSH session into the sandbox, with all provider creden | [Inference Routing](inference-routing.md) | Transparent interception and sandbox-local routing of AI inference API calls to configured backends. | | [System Architecture](system-architecture.md) | Top-level system architecture diagram with all deployable components and communication flows. | | [Gateway Settings Channel](gateway-settings.md) | Runtime settings channel: two-tier key-value configuration, global policy override, settings registry, CLI/TUI commands. | +| [Custom VM Runtime](custom-vm-runtime.md) | Dual-backend VM runtime (libkrun / cloud-hypervisor), kernel configuration, and build pipeline. | +| [VM GPU Passthrough](vm-gpu-passthrough.md) | VFIO GPU passthrough for VMs: host preparation, safety checks, nvidia driver hardening, and troubleshooting. | | [TUI](tui.md) | Terminal user interface for sandbox interaction. 
| diff --git a/architecture/custom-vm-runtime.md b/architecture/custom-vm-runtime.md index ce4d0bf39..6dac41064 100644 --- a/architecture/custom-vm-runtime.md +++ b/architecture/custom-vm-runtime.md @@ -1,18 +1,31 @@ -# Custom libkrunfw VM Runtime +# Custom VM Runtime > Status: Experimental and work in progress (WIP). VM support is under active development and may change. ## Overview -The OpenShell gateway VM uses [libkrun](https://github.com/containers/libkrun) to boot a -lightweight microVM with Apple Hypervisor.framework (macOS) or KVM (Linux). The kernel -is embedded inside `libkrunfw`, a companion library that packages a pre-built Linux kernel. +The OpenShell gateway VM supports two hypervisor backends: -The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, netfilter, or -conntrack support. This is insufficient for Kubernetes pod networking. +- **libkrun** (default) — lightweight VMM using Apple Hypervisor.framework (macOS) or KVM + (Linux). The kernel is embedded inside `libkrunfw`. Uses virtio-MMIO device transport and + gvproxy for user-space networking. +- **cloud-hypervisor** — Linux-only KVM-based VMM used for GPU passthrough (VFIO). Uses + virtio-PCI device transport, TAP networking, and requires a separate `vmlinux` kernel and + `virtiofsd` for rootfs access. + +Backend selection is automatic: `--gpu` selects cloud-hypervisor, otherwise libkrun is used. +The `--backend` flag provides explicit control (`auto`, `libkrun`, `cloud-hypervisor`). + +When `--gpu` is passed, `openshell-vm` automatically binds an eligible GPU to `vfio-pci` +and restores it to the original driver on shutdown. See +[vm-gpu-passthrough.md](vm-gpu-passthrough.md) for the full lifecycle description. + +Both backends share the same guest kernel (built from a single `openshell.kconfig` fragment) +and rootfs. -The custom libkrunfw runtime adds bridge CNI, iptables/nftables, and conntrack support to -the VM kernel, enabling standard Kubernetes networking. 
+The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, netfilter, or +conntrack support. This is insufficient for Kubernetes pod networking. The custom kconfig +adds bridge CNI, iptables/nftables, conntrack, and cloud-hypervisor compatibility. ## Architecture @@ -20,10 +33,11 @@ the VM kernel, enabling standard Kubernetes networking. graph TD subgraph Host["Host (macOS / Linux)"] BIN[openshell-vm binary] - EMB["Embedded runtime (zstd-compressed)\nlibkrun · libkrunfw · gvproxy"] + EMB["Embedded runtime (zstd-compressed)\nlibkrun · libkrunfw · gvproxy · rootfs"] CACHE["~/.local/share/openshell/vm-runtime/{version}/"] PROV[Runtime provenance logging] GVP[gvproxy networking proxy] + CHV_BIN["cloud-hypervisor · virtiofsd · vmlinux\n(GPU runtime bundle)"] BIN --> EMB BIN -->|extracts to| CACHE @@ -44,8 +58,9 @@ graph TD INIT --> VAL --> CNI --> EXECA --> PKI --> K3S end - BIN -- "fork + krun_start_enter" --> INIT - GVP -- "virtio-net" --> Guest + BIN -- "libkrun: fork + krun_start_enter" --> INIT + BIN -- "CHV: cloud-hypervisor API + virtiofsd" --> INIT + GVP -- "virtio-net (libkrun only)" --> Guest ``` ## Embedded Runtime @@ -67,9 +82,23 @@ these to XDG cache directories with progress bars: └── ... ``` -This eliminates the need for separate bundles or downloads - a single ~120MB binary -provides everything needed to run the VM. Old cache versions are automatically -cleaned up when a new version is extracted. +When using cloud-hypervisor, an additional runtime bundle is required alongside the +binary: + +``` +target/debug/openshell-vm.runtime/ (or alongside the installed binary) +├── cloud-hypervisor # CHV binary +├── virtiofsd # virtio-fs daemon +└── vmlinux # extracted guest kernel +``` + +This bundle is built with `mise run vm:bundle-runtime` and is separate from the +embedded runtime because CHV and virtiofsd are Linux-only and not embedded in the +self-extracting binary. 
+ +This eliminates the need for separate bundles or downloads for the default (libkrun) +path — a single ~120MB binary provides everything needed. Old cache versions are +automatically cleaned up when a new version is extracted. ### Hybrid Approach @@ -86,6 +115,31 @@ mise run vm:rootfs # Full rootfs (~2GB, includes images) mise run vm:build # Rebuild binary with full rootfs ``` +## Backend Comparison + +| | libkrun (default) | cloud-hypervisor | +|---|---|---| +| Platforms | macOS (Hypervisor.framework), Linux (KVM) | Linux (KVM) only | +| Device transport | virtio-MMIO | virtio-PCI | +| Networking | gvproxy (user-space, no root needed) | TAP (requires root/CAP_NET_ADMIN) | +| Rootfs delivery | In-process (krun API) | virtiofsd (virtio-fs daemon) | +| Kernel delivery | Embedded in libkrunfw | Separate `vmlinux` file | +| Console | virtio-console (`hvc0`) | 8250 UART (`ttyS0`) | +| Shutdown | Automatic on PID 1 exit | ACPI poweroff (`poweroff -f`) | +| GPU passthrough | Not supported | VFIO PCI passthrough | +| `--exec` mode | Direct init replacement | Wrapper script with ACPI shutdown | +| CLI flag | `--backend libkrun` | `--backend cloud-hypervisor` or `--gpu` | + +### Exec mode differences + +With libkrun, when `--exec <command>` is used, the command replaces the init process and +the VM exits when PID 1 exits. + +With cloud-hypervisor, the VM does not automatically exit when PID 1 terminates. A +wrapper init script is dynamically written to the guest rootfs that mounts necessary +filesystems, executes the user command, captures the exit code, and calls +`poweroff -f` to trigger an ACPI shutdown that cloud-hypervisor detects. + ## Network Profile The VM uses the bridge CNI profile, which requires a custom libkrunfw with bridge and @@ -100,6 +154,26 @@ fast with an actionable error if they are missing. 
- Service VIPs: functional (ClusterIP, NodePort) - hostNetwork workarounds: not required +### Networking by backend + +- **libkrun**: Uses gvproxy for user-space virtio-net networking. No root privileges + needed. Port forwarding is handled via gvproxy configuration. +- **cloud-hypervisor**: Uses TAP networking (requires root or CAP_NET_ADMIN). When + `--net none` is passed, networking is disabled entirely (useful for `--exec` mode + tests). gvproxy is not used with cloud-hypervisor. + +## Guest Init Script + +The init script (`openshell-vm-init.sh`) runs as PID 1 in the guest. After mounting essential filesystems, it performs: + +1. **Kernel cmdline parsing** — exports environment variables passed via the kernel command line (`GPU_ENABLED`, `OPENSHELL_VM_STATE_DISK_DEVICE`, `VM_NET_IP`, `VM_NET_GW`, `VM_NET_DNS`). This runs after `/proc` is mounted so `/proc/cmdline` is available. + +2. **Cgroup v2 controller enablement** — enables `cpu`, `cpuset`, `memory`, `pids`, and `io` controllers in the root cgroup hierarchy (`cgroup.subtree_control`). k3s/kubelet requires these controllers; the `cpu` controller depends on `CONFIG_CGROUP_SCHED` in the kernel. + +3. **Networking** — detects `eth0` and attempts DHCP (via `udhcpc`). On failure, falls back to static IP configuration using `VM_NET_IP` and `VM_NET_GW` from the kernel cmdline (set by the CHV backend for TAP networking). DNS is configured from `VM_NET_DNS` if set, overriding any stale `/etc/resolv.conf` entries. + +4. **Capability validation** — verifies required kernel features (bridge networking, netfilter, cgroups) and fails fast with actionable errors if missing. 
+ ## Runtime Provenance At boot, the openshell-vm binary logs provenance metadata about the loaded runtime bundle: @@ -128,21 +202,35 @@ graph LR BUILD_M["Build libkrunfw.dylib + libkrun.dylib"] end + subgraph CHV["Linux CI (build-cloud-hypervisor.sh)"] + BUILD_CHV["Build cloud-hypervisor + virtiofsd"] + end + subgraph Output["target/libkrun-build/"] LIB_SO["libkrunfw.so + libkrun.so\n(Linux)"] LIB_DY["libkrunfw.dylib + libkrun.dylib\n(macOS)"] + CHV_OUT["cloud-hypervisor + virtiofsd\n(Linux)"] + VMLINUX["vmlinux\n(extracted from libkrunfw)"] end KCONF --> BUILD_L BUILD_L --> LIB_SO + BUILD_L --> VMLINUX KCONF --> BUILD_M BUILD_M --> LIB_DY + BUILD_CHV --> CHV_OUT ``` +The `vmlinux` kernel is extracted from the libkrunfw build and reused by cloud-hypervisor. +Both backends boot the same kernel — the kconfig fragment includes drivers for both +virtio-MMIO (libkrun) and virtio-PCI (CHV) transports. + ## Kernel Config Fragment The `openshell.kconfig` fragment enables these kernel features on top of the stock -libkrunfw kernel: +libkrunfw kernel. A single kernel binary is shared by both libkrun and cloud-hypervisor — +backend-specific drivers coexist safely (the kernel probes whichever transport the +hypervisor provides). 
| Feature | Key Configs | Purpose | |---------|-------------|---------| @@ -158,11 +246,18 @@ libkrunfw kernel: | IP forwarding | `CONFIG_IP_ADVANCED_ROUTER`, `CONFIG_IP_MULTIPLE_TABLES` | Pod-to-pod routing | | IPVS | `CONFIG_IP_VS`, `CONFIG_IP_VS_RR`, `CONFIG_IP_VS_NFCT` | kube-proxy IPVS mode (optional) | | Traffic control | `CONFIG_NET_SCH_HTB`, `CONFIG_NET_CLS_CGROUP` | Kubernetes QoS | -| Cgroups | `CONFIG_CGROUPS`, `CONFIG_CGROUP_DEVICE`, `CONFIG_MEMCG`, `CONFIG_CGROUP_PIDS` | Container resource limits | +| Cgroups | `CONFIG_CGROUPS`, `CONFIG_CGROUP_DEVICE`, `CONFIG_CGROUP_CPUACCT`, `CONFIG_MEMCG`, `CONFIG_CGROUP_PIDS`, `CONFIG_CGROUP_FREEZER` | Container resource limits | +| Cgroup CPU | `CONFIG_CGROUP_SCHED`, `CONFIG_FAIR_GROUP_SCHED`, `CONFIG_CFS_BANDWIDTH` | cgroup v2 `cpu` controller for k3s/kubelet | | TUN/TAP | `CONFIG_TUN` | CNI plugin support | | Dummy interface | `CONFIG_DUMMY` | Fallback networking | | Landlock | `CONFIG_SECURITY_LANDLOCK` | Filesystem sandboxing support | | Seccomp filter | `CONFIG_SECCOMP_FILTER` | Syscall filtering support | +| PCI / GPU | `CONFIG_PCI`, `CONFIG_PCI_MSI`, `CONFIG_DRM` | GPU passthrough via VFIO | +| Kernel modules | `CONFIG_MODULES`, `CONFIG_MODULE_UNLOAD` | Loading NVIDIA drivers in guest | +| virtio-PCI transport | `CONFIG_VIRTIO_PCI` | cloud-hypervisor device bus (libkrun uses MMIO) | +| Serial console | `CONFIG_SERIAL_8250`, `CONFIG_SERIAL_8250_CONSOLE` | cloud-hypervisor console (`ttyS0`) | +| ACPI | `CONFIG_ACPI` | cloud-hypervisor power management / clean shutdown | +| x2APIC | `CONFIG_X86_X2APIC` | Multi-vCPU support (CHV uses x2APIC MADT entries) | See `crates/openshell-vm/runtime/kernel/openshell.kconfig` for the full fragment with inline comments explaining why each option is needed. 
@@ -189,13 +284,21 @@ The standalone `openshell-vm` binary supports `openshell-vm exec -- `openshell-vm exec` also injects `KUBECONFIG=/etc/rancher/k3s/k3s.yaml` by default so kubectl-style commands work the same way they would inside the VM shell. +### Vsock by backend + +- **libkrun**: Uses libkrun's built-in vsock port mapping, which transparently + bridges the guest vsock port to a host Unix socket. +- **cloud-hypervisor**: Uses a vsock exec bridge — a host-side process that + connects an AF_VSOCK socket to a Unix domain socket, providing the same + interface to the exec agent. + ## Build Commands ```bash # One-time setup: download pre-built runtime (~30s) mise run vm:setup -# Build and run +# Build and run (libkrun, default) mise run vm # Build embedded binary with base rootfs (~120MB, recommended) @@ -210,6 +313,13 @@ mise run vm:build # Rebuild binary FROM_SOURCE=1 mise run vm:setup # Build runtime from source mise run vm:build # Then build embedded binary +# Build cloud-hypervisor runtime bundle (Linux only) +mise run vm:bundle-runtime # Builds CHV + virtiofsd + extracts vmlinux + +# Run with cloud-hypervisor backend +openshell-vm --backend cloud-hypervisor # Requires runtime bundle +openshell-vm --gpu # Auto-selects CHV with GPU passthrough + # Wipe everything and start over mise run vm:clean ``` @@ -221,20 +331,23 @@ rolling `vm-dev` GitHub Release: ### Kernel Runtime (`release-vm-kernel.yml`) -Builds the custom libkrunfw (kernel firmware), libkrun (VMM), and gvproxy for all -supported platforms. Runs on-demand or when the kernel config / pinned versions change. +Builds the custom libkrunfw (kernel firmware), libkrun (VMM), gvproxy, cloud-hypervisor, +and virtiofsd for all supported platforms. Runs on-demand or when the kernel config / +pinned versions change. 
| Platform | Runner | Build Method | |----------|--------|-------------| -| Linux ARM64 | `build-arm64` (self-hosted) | Native `build-libkrun.sh` | -| Linux x86_64 | `build-amd64` (self-hosted) | Native `build-libkrun.sh` | -| macOS ARM64 | `macos-latest-xlarge` (GitHub-hosted) | `build-libkrun-macos.sh` | +| Linux ARM64 | `build-arm64` (self-hosted) | `build-libkrun.sh` + `build-cloud-hypervisor.sh` | +| Linux x86_64 | `build-amd64` (self-hosted) | `build-libkrun.sh` + `build-cloud-hypervisor.sh` | +| macOS ARM64 | `macos-latest-xlarge` (GitHub-hosted) | `build-libkrun-macos.sh` (no CHV) | -Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw, gvproxy, and -provenance metadata. +Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw, gvproxy, +and provenance metadata. Linux artifacts additionally include cloud-hypervisor, +virtiofsd, and the extracted `vmlinux` kernel. Each platform builds its own libkrunfw and libkrun natively. The kernel inside -libkrunfw is always Linux regardless of host platform. +libkrunfw is always Linux regardless of host platform. cloud-hypervisor and virtiofsd +are Linux-only (macOS does not support VFIO/KVM passthrough). ### VM Binary (`release-vm-dev.yml`) diff --git a/architecture/vm-gpu-passthrough.md b/architecture/vm-gpu-passthrough.md new file mode 100644 index 000000000..c15fd668b --- /dev/null +++ b/architecture/vm-gpu-passthrough.md @@ -0,0 +1,413 @@ +# VM GPU Passthrough + +> Status: Experimental and work in progress (WIP). GPU passthrough for the VM backend is under active development. + +## Overview + +OpenShell's VM backend can pass a physical NVIDIA GPU into a microVM using VFIO (Virtual Function I/O). This gives the guest direct access to GPU hardware, enabling CUDA workloads and `nvidia-smi` inside sandboxes without virtualization overhead. + +GPU passthrough uses cloud-hypervisor (instead of the default libkrun backend) to attach a VFIO device to the VM. 
The guest sees a real PCI GPU device and loads standard NVIDIA drivers. + +## Architecture + +``` +Host │ Guest (microVM) +──────────────────────────────│─────────────────────────── + NVIDIA GPU (PCI BDF addr) │ nvidia driver + CUDA + ↕ bound to vfio-pci │ ↕ + /dev/vfio/ │ /dev/nvidia* + ↕ │ ↕ + cloud-hypervisor (VFIO) ────│→ PCI device visible + ↕ │ ↕ + TAP networking │ k3s + device plugin + virtiofsd (rootfs) │ ↕ + │ sandbox pods (nvidia.com/gpu) +``` + +### Backend selection + +| Flag | Backend | GPU attached? | +|------|---------|---------------| +| (none) | libkrun | No | +| `--gpu` | cloud-hypervisor | Yes (auto-detect and bind) | +| `--gpu 0000:41:00.0` | cloud-hypervisor | Yes (specific PCI device) | +| `--backend cloud-hypervisor` | cloud-hypervisor | No (force CHV without GPU) | + +Auto mode (`--backend auto`, the default) selects cloud-hypervisor when `--gpu` is used or a VFIO PCI address is configured. Otherwise libkrun is used. + +### Automatic GPU binding + +When `--gpu` is passed (with or without a specific PCI address), the launcher automatically prepares the GPU for VFIO passthrough: + +1. **Probe** — scans `/sys/bus/pci/devices` for NVIDIA devices (vendor `0x10de`). +2. **Safety checks** — for each candidate GPU, verifies it is safe to claim (see below). If any check fails, the launcher refuses to proceed and exits with an actionable error. +3. **Bind** — unbinds the selected GPU from the `nvidia` driver and binds it to `vfio-pci`. Also binds any IOMMU group peers to `vfio-pci` for group cleanliness. +4. **Launch** — starts cloud-hypervisor with the VFIO device attached and sets `GPU_ENABLED=true` in the guest kernel cmdline. +5. **Rebind on shutdown** — when the VM exits (clean shutdown, Ctrl+C, or crash), the launcher rebinds the GPU back to the `nvidia` driver and clears `driver_override`, restoring host GPU access. Cleanup is guaranteed by a `GpuBindGuard` RAII guard that calls restore on drop, covering normal exit, early return, and panic. 
Only `SIGKILL` (kill -9) bypasses the guard — see Troubleshooting below for manual recovery. + +When a specific PCI address is given (`--gpu 0000:41:00.0`), the launcher targets that exact device. When `--gpu` is used without an address (`auto` mode), the launcher selects the best available GPU using the multi-GPU selection strategy. + +### Safety checks + +All safety checks are hard failures — if any check fails, the launcher prints an error and exits without binding. There is no `--force` override. + +| Check | What it detects | Failure behavior | +|-------|----------------|------------------| +| **Display attached** | GPU drives an active DRM framebuffer or is the primary rendering device | Error: "GPU 0000:xx:xx.x has active display outputs — cannot passthrough without losing host display" | +| **Active processes** | Processes holding `/dev/nvidia*` file descriptors (CUDA jobs, monitoring) | Error: "GPU 0000:xx:xx.x is in use by PID(s) — stop these processes first" | +| **IOMMU enabled** | `/sys/kernel/iommu_groups/` exists and the GPU has a group assignment | Error: "IOMMU is not enabled — add intel_iommu=on or amd_iommu=on to kernel cmdline" | +| **VFIO modules loaded** | `vfio-pci` and `vfio_iommu_type1` kernel modules are loaded | Error: "vfio-pci kernel module not loaded — run: sudo modprobe vfio-pci" | +| **Permissions** | Write access to sysfs bind/unbind and `/dev/vfio/` | Error: "insufficient permissions — run as root or with CAP_NET_ADMIN" | + +### Multi-GPU selection (`--gpu` auto mode) + +On hosts with multiple NVIDIA GPUs, the launcher selects a GPU using this priority: + +1. **Already on vfio-pci** with a clean IOMMU group — use immediately (no rebind needed). +2. **Idle (no processes, no display)** — preferred for binding. +3. **Skip** GPUs with active displays or running processes. + +If no GPU passes all safety checks, the launcher fails with per-device status listing what blocked each GPU. 
+ +## Host preparation + +The launcher handles GPU driver binding automatically. The host only needs IOMMU and VFIO kernel modules configured. + +### 1. Enable IOMMU + +IOMMU must be enabled in both BIOS/UEFI and the Linux kernel. + +**Intel systems:** + +```shell +# Add to kernel command line (e.g. /etc/default/grub GRUB_CMDLINE_LINUX) +intel_iommu=on iommu=pt +``` + +**AMD systems:** + +```shell +# AMD IOMMU is usually enabled by default; verify or add: +amd_iommu=on iommu=pt +``` + +After editing, run `update-grub` (or equivalent) and reboot. Verify IOMMU is active: + +```shell +dmesg | grep -i iommu +# Should show: "DMAR: IOMMU enabled" or "AMD-Vi: AMD IOMMUv2" +``` + +### 2. Load VFIO kernel modules + +```shell +sudo modprobe vfio-pci +sudo modprobe vfio_iommu_type1 + +# Persist across reboots +echo "vfio-pci" | sudo tee /etc/modules-load.d/vfio-pci.conf +echo "vfio_iommu_type1" | sudo tee /etc/modules-load.d/vfio_iommu_type1.conf +``` + +### 3. Device permissions + +The launcher needs root (or `CAP_NET_ADMIN`) to bind/unbind GPU drivers and configure TAP networking: + +```shell +# Option A: run as root (simplest) +sudo openshell-vm --gpu + +# Option B: set udev rules for /dev/vfio/ access (still needs sysfs write via root) +echo 'SUBSYSTEM=="vfio", OWNER="root", GROUP="kvm", MODE="0660"' | \ + sudo tee /etc/udev/rules.d/99-vfio.rules +sudo udevadm control --reload-rules +sudo usermod -aG kvm $USER +``` + +### What the launcher does automatically + +When `--gpu` is passed, the launcher performs the following steps that previously required manual intervention: + +1. **Identifies NVIDIA GPUs** via sysfs (`/sys/bus/pci/devices/*/vendor`) +2. **Runs safety checks** — display, active processes, IOMMU, VFIO modules (see Safety checks above) +3. **Unbinds from nvidia** — writes to `/sys/bus/pci/devices//driver/unbind` +4. **Sets driver override** — writes `vfio-pci` to `/sys/bus/pci/devices//driver_override` +5. 
**Binds to vfio-pci** — writes to `/sys/bus/pci/drivers/vfio-pci/bind` +6. **Handles IOMMU group peers** — binds other devices in the same IOMMU group to `vfio-pci` +7. **On shutdown** — reverses all bindings, clears `driver_override`, rebinds to `nvidia` + +## Single-GPU caveats + +When the host has only one NVIDIA GPU: + +- **Display-attached GPUs are blocked.** The safety checks detect if the GPU drives an active display (DRM framebuffer). If so, the launcher refuses to bind it — this prevents accidentally killing the host desktop. On headless data center servers (the typical deployment), this check passes and the GPU is bound automatically. +- **Recovery is automatic.** When the VM exits (clean shutdown, Ctrl+C, or process crash), the launcher rebinds the GPU to the `nvidia` driver and clears `driver_override`. No manual intervention is needed. +- **Process check.** If CUDA processes are using the GPU (visible via `/dev/nvidia*` file descriptors), the launcher refuses to unbind. Stop those processes first. + +## Supported GPUs + +GPU passthrough is validated with NVIDIA data center GPUs. Consumer GPUs may work but are not officially supported (NVIDIA restricts GeForce passthrough in some driver versions). + +| GPU | Architecture | Compute Capability | Status | +|-----|-------------|-------------------|--------| +| A100 | Ampere | 8.0 | Supported | +| A30 | Ampere | 8.0 | Supported | +| H100 | Hopper | 9.0 | Supported | +| H200 | Hopper | 9.0 | Supported | +| L40 | Ada Lovelace | 8.9 | Supported | +| L40S | Ada Lovelace | 8.9 | Supported | +| L4 | Ada Lovelace | 8.9 | Supported | + +## CLI usage + +### Auto-select GPU + +```shell +# openshell-vm binary (VM backend directly) +sudo openshell-vm --gpu + +# openshell CLI (gateway deployment — requires VM backend) +OPENSHELL_GATEWAY_BACKEND=vm sudo openshell gateway start --gpu +``` + +> **Note:** The default gateway backend is Docker (containers). GPU passthrough +> requires the VM backend. 
Set `OPENSHELL_GATEWAY_BACKEND=vm` (or `microvm`) +> to use the VM path with `openshell gateway start`. + +### Specific PCI address (multi-GPU hosts) + +```shell +sudo openshell-vm --gpu 0000:41:00.0 +``` + +### Backend selection + +The `--backend` flag controls hypervisor selection independently of `--gpu`: + +```shell +sudo openshell-vm --gpu # auto: selects cloud-hypervisor +sudo openshell-vm --backend cloud-hypervisor # explicit CHV, no GPU +sudo openshell-vm --backend libkrun # explicit libkrun (no GPU support) +``` + +The `chv` alias is accepted as shorthand for `cloud-hypervisor`. + +### Diagnostics + +When `--gpu` is passed, the launcher runs safety checks before unbinding. If +checks fail, it exits with an actionable error: + +```text +$ sudo openshell-vm --gpu +GPU passthrough blocked by safety checks. + + Detected devices: + 0000:41:00.0: has active display outputs + 0000:42:00.0: in use by PIDs: 12345 (python3), 12400 (nvidia-smi) + + No GPU is available for passthrough. +``` + +On a headless server with an idle GPU, the pre-unbind preparation runs first: + +```text +$ sudo openshell-vm --gpu +GPU 0000:41:00.0: disabled nvidia persistence mode +GPU 0000:41:00.0: unloaded nvidia_uvm +GPU 0000:41:00.0: unloaded nvidia_drm +GPU 0000:41:00.0: unloaded nvidia_modeset +GPU 0000:41:00.0: device already unbound after nvidia module cleanup +GPU: binding 0000:41:00.0 for VFIO passthrough +``` + +On shutdown (Ctrl+C or VM exit), the original driver is restored: + +```text +^C +GPU: restoring 0000:41:00.0 (cleanup) +GPU: rebinding 0000:41:00.0 to nvidia +``` + +## VM Networking (Cloud Hypervisor) + +Cloud Hypervisor uses TAP-based networking instead of the gvproxy user-mode networking used by the libkrun backend. This has several implications for connectivity and port forwarding. 
+ +### Network topology + +``` +Host Guest (microVM) +───────────────────────────────────── ────────────────────────── + eth0 (or primary NIC) eth0 (virtio-net) + ↕ ↕ + iptables MASQUERADE ←── NAT ──→ 192.168.249.2/24 + ↕ ↕ default gw 192.168.249.1 + vmtap0 (TAP device) ↕ + 192.168.249.1/24 ←─── L2 bridge ──→ (kernel routes) + ↕ + 127.0.0.1:{port} ←── TCP proxy ──→ {port} (k3s NodePort) +``` + +### How it works + +The CHV backend configures networking in three layers: + +**1. TAP device and guest IP assignment** + +Cloud Hypervisor creates a TAP device on the host side with IP `192.168.249.1/24`. The guest is assigned `192.168.249.2/24` via kernel command line parameters (`VM_NET_IP`, `VM_NET_GW`, `VM_NET_DNS`). The init script reads these from `/proc/cmdline` and uses them as the static fallback when DHCP is unavailable (CHV does not run a DHCP server). + +**2. Host-side NAT and IP forwarding** + +After booting the VM, the launcher: +- Enables IP forwarding (`/proc/sys/net/ipv4/ip_forward`) +- Adds iptables MASQUERADE rules for the `192.168.249.0/24` subnet +- Adds FORWARD rules to allow traffic to/from the VM + +This gives the guest internet access through the host. Rules are cleaned up on VM shutdown. + +**3. TCP port forwarding** + +Unlike gvproxy (which provides built-in port forwarding), CHV TAP networking requires explicit port forwarding. The launcher starts a userspace TCP proxy for each port mapping (e.g., `30051:30051`). The proxy binds to `127.0.0.1:{host_port}` and forwards connections to `192.168.249.2:{guest_port}`. + +### DNS resolution + +The launcher detects the host's upstream DNS server using a two-step lookup: + +1. Reads `/etc/resolv.conf` and picks the first nameserver that does not start with `127.` (skipping systemd-resolved's `127.0.0.53` stub and other loopback addresses). +2. If all nameservers in `/etc/resolv.conf` are loopback, falls back to `/run/systemd/resolve/resolv.conf` (the upstream resolv.conf maintained by systemd-resolved). 
+3. If no non-loopback nameserver is found in either file, falls back to `8.8.8.8`. + +The resolved DNS server is passed to the guest via `VM_NET_DNS=<ip>` on the kernel command line. The init script writes it to `/etc/resolv.conf` inside the guest, unconditionally overriding any stale entries from previous boot cycles. + +### Key constants + +| Constant | Value | Purpose | |----------|-------|---------| | `CHV_TAP_HOST_IP` | `192.168.249.1` | Host side of the TAP device | | `CHV_TAP_GUEST_IP` | `192.168.249.2` | Guest static IP | | `CHV_TAP_SUBNET` | `192.168.249.0/24` | Subnet for iptables rules | | `CHV_TAP_NETMASK` | `255.255.255.0` | Subnet mask in VM payload | + +### Differences from libkrun/gvproxy networking + +| Feature | libkrun + gvproxy | CHV + TAP | |---------|------------------|-----------| | Network mode | User-mode (SLIRP-like) | Kernel TAP device | | DHCP | Built-in (gvproxy) | None (static IP via cmdline) | | Guest IP | `192.168.127.2/24` | `192.168.249.2/24` | | Port forwarding | Built-in (gvproxy `-forward`) | Userspace TCP proxy | | Privileges | Unprivileged | Root or `CAP_NET_ADMIN` | | NAT | Handled by gvproxy | iptables MASQUERADE | | DNS | gvproxy provides | Host resolver passed via cmdline | + +### Troubleshooting networking + +**"lookup registry-1.docker.io: Try again" (DNS failure)** + +The VM cannot resolve DNS. 
Check: + +```shell +# Verify the host DNS is non-loopback +grep nameserver /etc/resolv.conf +# If only 127.0.0.53 (systemd-resolved), find the upstream: +resolvectl status | grep 'DNS Servers' + +# Verify iptables rules are in place +sudo iptables -t nat -L POSTROUTING -n -v | grep 192.168.249 +sudo iptables -L FORWARD -n -v | grep 192.168.249 + +# Verify IP forwarding is enabled +cat /proc/sys/net/ipv4/ip_forward +``` + +**Gateway health check fails (port 30051 unreachable)** + +The TCP port forwarder may not have started, or the guest service is not yet listening: + +```shell +# Check if the port forwarder is bound on the host +ss -tlnp | grep 30051 + +# Check if the guest is reachable +ping -c1 192.168.249.2 +``` + +### Host mTLS cache and state disk + +The launcher caches mTLS certificates on the host after the first successful boot (warm boot path). If the state disk is deleted or `--reset` is used, the VM generates new PKI that won't match the cached certs. The launcher detects this — when the state disk is freshly created or reset, it clears the stale host mTLS cache and runs the cold-boot PKI fetch path. This prevents `transport error` failures on the gateway health check after a state disk reset. + +## Troubleshooting + +### "no NVIDIA PCI device found" + +The host has no NVIDIA GPU installed, or the PCI device is not visible: + +```shell +lspci -nn | grep -i nvidia +# If empty, the GPU is not detected at the PCI level +``` + +### "has active display outputs" + +The GPU drives a DRM framebuffer or is the boot VGA device. This is a hard safety check — the launcher will not unbind a display GPU. Options: + +- Use a different GPU for the monitor (iGPU, secondary card) +- Stop the display manager first: `sudo systemctl stop gdm` +- On headless servers, this should not occur — verify with `ls /sys/class/drm/card*/device` + +### "in use by PIDs: ..." + +Active processes hold `/dev/nvidia*` file descriptors. 
The check is host-wide +(across all NVIDIA GPUs, not per-device). The launcher lists the PIDs and +process names. Stop those processes before retrying. + +### "IOMMU not enabled or device has no IOMMU group" + +IOMMU must be enabled in both BIOS/UEFI and kernel cmdline. See Host Preparation above. + +### "VFIO kernel modules not loaded" + +```shell +sudo modprobe vfio-pci +sudo modprobe vfio_iommu_type1 +``` + +### "insufficient sysfs permissions — run as root" + +The launcher needs root to write to sysfs bind/unbind paths. Run with `sudo`. + +### GPU not rebound after crash + +If the launcher process is killed with `SIGKILL` (kill -9), the cleanup handler cannot run and the GPU remains on `vfio-pci`. Manually rebind: + +```shell +PCI_ADDR="0000:41:00.0" +echo "$PCI_ADDR" | sudo tee /sys/bus/pci/devices/$PCI_ADDR/driver/unbind +echo "" | sudo tee /sys/bus/pci/devices/$PCI_ADDR/driver_override +echo "$PCI_ADDR" | sudo tee /sys/bus/pci/drivers/nvidia/bind +``` + +### nvidia driver unbind deadlock (kernel bug) + +Some nvidia driver versions deadlock in their sysfs `unbind` handler — the `write()` syscall to `/sys/bus/pci/drivers/nvidia/unbind` never returns. When this happens, the subprocess enters uninterruptible sleep (D state) and becomes unkillable even by `SIGKILL`. The GPU's PCI subsystem state is corrupted and all subsequent PCI operations on the device hang. Only a host reboot clears this state. + +This is a kernel/nvidia driver bug, not an openshell-vm issue. Three mitigation layers are in place: + +1. **Pre-unbind preparation**: Before the raw sysfs unbind, the launcher disables nvidia persistence mode (`nvidia-smi -pm 0`) and unloads nvidia submodules (`nvidia_uvm`, `nvidia_drm`, `nvidia_modeset`) via `modprobe -r`. This often cascade-removes the base nvidia module entirely, unbinding the device automatically without ever touching the dangerous sysfs path. + +2. 
**Subprocess isolation with timeout**: All sysfs writes (and the nvidia prep commands) run in a subprocess with a timeout (10s for sysfs, 15s for prep). On timeout, the subprocess is killed and dropped without calling `wait()` — preventing the parent process from being dragged into D-state. + +3. **Post-timeout verification**: If the unbind subprocess times out but the device is actually unbound at the hardware level (which the nvidia bug can cause — the operation completes but the syscall never returns), the launcher detects this and continues with the VFIO bind. + +If you hit this issue repeatedly, check for nvidia driver updates or file a bug with NVIDIA. + +### VM boots but `nvidia-smi` fails inside guest + +- Verify the GPU rootfs includes NVIDIA drivers: `chroot /path/to/rootfs which nvidia-smi` +- Check that NVIDIA kernel modules load: `openshell-vm exec -- lsmod | grep nvidia` +- Inspect dmesg for NVIDIA driver errors: `openshell-vm exec -- dmesg | grep -i nvidia` + +## Related + +- [Custom VM Runtime](custom-vm-runtime.md) — building and customizing the libkrun VM runtime +- [System Architecture](system-architecture.md) — overall OpenShell architecture +- Implementation: [`crates/openshell-vm/src/gpu_passthrough.rs`](../crates/openshell-vm/src/gpu_passthrough.rs) diff --git a/crates/openshell-cli/Cargo.toml b/crates/openshell-cli/Cargo.toml index b3a006fdd..dd8f83bb8 100644 --- a/crates/openshell-cli/Cargo.toml +++ b/crates/openshell-cli/Cargo.toml @@ -21,6 +21,7 @@ openshell-policy = { path = "../openshell-policy" } openshell-providers = { path = "../openshell-providers" } openshell-prover = { path = "../openshell-prover" } openshell-tui = { path = "../openshell-tui" } +openshell-vm = { path = "../openshell-vm" } serde = { workspace = true } serde_json = { workspace = true } prost-types = { workspace = true } diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 292922411..05d1fb7c1 100644 --- 
a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -807,18 +807,21 @@ enum GatewayCommands { #[arg(long, env = "OPENSHELL_REGISTRY_TOKEN")] registry_token: Option, - /// Enable NVIDIA GPU passthrough. + /// Enable NVIDIA GPU support for the gateway cluster. /// - /// Passes all host GPUs into the cluster container and deploys the - /// NVIDIA k8s-device-plugin so Kubernetes workloads can request - /// `nvidia.com/gpu` resources. Requires NVIDIA drivers and the - /// NVIDIA Container Toolkit on the host. + /// **Docker path (default):** passes GPUs into the gateway container via + /// the NVIDIA Container Toolkit — CDI when the daemon supports it, else + /// Docker's `--gpus all` — and deploys the NVIDIA device plugin. Use + /// `--gpu` or `--gpu auto` only; PCI addresses are not valid CDI device + /// names on this path. /// - /// When enabled, OpenShell auto-selects CDI when the Docker daemon has - /// CDI enabled and falls back to Docker's NVIDIA GPU request path - /// (`--gpus all`) otherwise. - #[arg(long)] - gpu: bool, + /// **MicroVM path:** set `OPENSHELL_GATEWAY_BACKEND=vm` for deployments + /// that use the VM gateway. Then you may pass `--gpu` / `--gpu auto` for + /// VFIO auto-select, or `--gpu 0000:41:00.0` (PCI BDF) for a specific GPU. + /// Requires IOMMU and the GPU bound to `vfio-pci`. See + /// `architecture/vm-gpu-passthrough.md`. + #[arg(long, num_args = 0..=1, default_missing_value = "auto")] + gpu: Option, }, /// Stop the gateway (preserves state). @@ -1129,10 +1132,9 @@ enum SandboxCommands { /// Request GPU resources for the sandbox. /// /// When no gateway is running, auto-bootstrap starts a GPU-enabled - /// gateway using the same automatic injection selection as - /// `openshell gateway start --gpu`. GPU intent is also inferred - /// automatically for known GPU-designated image names such as - /// `nvidia-gpu`. 
+ /// gateway using the Docker NVIDIA path (`--gpu auto`), same as + /// `openshell gateway start --gpu` without the microVM backend. GPU + /// intent is also inferred for known GPU image names (e.g. `nvidia-gpu`). #[arg(long)] gpu: bool, @@ -1655,12 +1657,11 @@ async fn main() -> Result<()> { registry_token, gpu, } => { - let gpu = if gpu { - vec!["auto".to_string()] - } else { - vec![] + let gpu = match gpu { + Some(val) => vec![val], + None => vec![], }; - run::gateway_admin_deploy( + let _gpu_guard = run::gateway_admin_deploy( &name, remote.as_deref(), ssh_key.as_deref(), diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index c41b53518..247f41d11 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1434,7 +1434,9 @@ pub async fn gateway_admin_deploy( registry_username: Option<&str>, registry_token: Option<&str>, gpu: Vec, -) -> Result<()> { +) -> Result> { + let (gpu, gpu_guard) = prepare_gateway_deploy_gpu(gpu, remote.as_deref())?; + let location = if remote.is_some() { "remote" } else { "local" }; // Build remote options once so we can reuse them for the existence check @@ -1457,7 +1459,7 @@ pub async fn gateway_admin_deploy( "{} Gateway '{name}' is already running.", "✓".green().bold() ); - return Ok(()); + return Ok(gpu_guard); } } } @@ -1518,7 +1520,7 @@ pub async fn gateway_admin_deploy( save_active_gateway(name)?; eprintln!("{} Active gateway set to '{name}'", "✓".green().bold()); - Ok(()) + Ok(gpu_guard) } /// Resolve the remote SSH destination for a gateway. @@ -5193,6 +5195,126 @@ fn format_timestamp_ms(ms: i64) -> String { } } +/// Environment variable selecting the gateway deployment backend for GPU checks. +/// +/// VFIO sysfs probes apply only to the microVM (`openshell-vm`) deploy path. +/// The default `openshell gateway start` flow uses Docker with the NVIDIA +/// Container Toolkit; leave this unset for that path. 
+const OPENSHELL_GATEWAY_BACKEND_ENV: &str = "OPENSHELL_GATEWAY_BACKEND"; + +fn gateway_deploy_uses_vm_backend() -> bool { + std::env::var(OPENSHELL_GATEWAY_BACKEND_ENV) + .ok() + .map(|v| { + matches!( + v.trim().to_ascii_lowercase().as_str(), + "vm" | "microvm" | "openshell-vm" + ) + }) + .unwrap_or(false) +} + +/// Heuristic: value looks like a PCI domain:bus:dev.fn address (Linux sysfs BDF). +fn looks_like_pci_bdf(s: &str) -> bool { + let s = s.trim(); + let rest = if let Some((prefix, after_colon)) = s.split_once(':') { + if prefix.len() == 4 && prefix.chars().all(|c| c.is_ascii_hexdigit()) { + after_colon + } else { + s + } + } else { + return false; + }; + + let Some((bus, dev_fn)) = rest.split_once(':') else { + return false; + }; + if bus.len() != 2 || !bus.chars().all(|c| c.is_ascii_hexdigit()) { + return false; + } + let Some((dev, func)) = dev_fn.split_once('.') else { + return false; + }; + if dev.len() != 2 || !dev.chars().all(|c| c.is_ascii_hexdigit()) { + return false; + } + if func.len() != 1 || !func.chars().all(|c| ('0'..='7').contains(&c)) { + return false; + } + true +} + +/// Validate `--gpu` for `gateway start`, run VFIO checks only for the VM deploy path, +/// and normalize Docker-path requests to CDI-compatible `auto`. 
+fn prepare_gateway_deploy_gpu( + gpu: Vec, + remote: Option<&str>, +) -> Result<( + Vec, + Option, +)> { + if gpu.is_empty() { + return Ok((gpu, None)); + } + + if gateway_deploy_uses_vm_backend() { + if remote.is_none() { + let guard = check_gpu_readiness(&gpu)?; + let selected_bdf = guard.pci_addr().unwrap_or("auto").to_string(); + let updated_gpu = vec![selected_bdf]; + return Ok((updated_gpu, Some(guard))); + } else { + eprintln!( + "{} Local VFIO GPU probe skipped (--remote): GPU readiness is checked on the remote host during deployment.", + "ℹ".cyan().bold() + ); + } + return Ok((gpu, None)); + } + + let Some(first) = gpu.first() else { + return Ok((gpu, None)); + }; + if first.as_str() != "auto" { + if looks_like_pci_bdf(first) { + return Err(miette!( + "PCI address GPU selection ({first}) is only supported for the microVM gateway backend.\n\n\ + `openshell gateway start` uses Docker by default (NVIDIA Container Toolkit / CDI, or Docker `--gpus all`). \ + Use `--gpu` or `--gpu auto` for that path.\n\n\ + For VFIO passthrough, set {}=vm and follow architecture/vm-gpu-passthrough.md.", + OPENSHELL_GATEWAY_BACKEND_ENV, + )); + } + return Err(miette!( + "Unrecognized --gpu value `{first}` for Docker gateway deploy. Use `--gpu` or `--gpu auto`.", + )); + } + + Ok((vec!["auto".to_string()], None)) +} + +/// Bind a GPU for VFIO passthrough and return an RAII guard that restores it on drop. 
+fn check_gpu_readiness(gpu: &[String]) -> Result { + use openshell_vm::gpu_passthrough::{GpuBindGuard, prepare_gpu_for_passthrough}; + + let requested_addr = gpu + .first() + .filter(|v| v.as_str() != "auto") + .map(|v| v.as_str()); + + let bind_state = prepare_gpu_for_passthrough(requested_addr).map_err(|e| miette!("{e}"))?; + + eprintln!( + "{} GPU {} bound to vfio-pci (was: {})", + "✓".green().bold(), + bind_state.pci_addr, + bind_state.original_driver, + ); + + Ok(GpuBindGuard::new(bind_state)) +} + #[cfg(test)] mod tests { use super::{ @@ -5416,6 +5538,16 @@ mod tests { assert!(sandbox_should_persist(false, Some(&spec))); } + #[test] + fn looks_like_pci_bdf_recognizes_sysfs_addresses() { + assert!(super::looks_like_pci_bdf("0000:41:00.0")); + assert!(super::looks_like_pci_bdf("41:00.0")); + assert!(super::looks_like_pci_bdf(" 0a:1f.7 ")); + assert!(!super::looks_like_pci_bdf("auto")); + assert!(!super::looks_like_pci_bdf("nvidia.com/gpu=all")); + assert!(!super::looks_like_pci_bdf("00:00.8")); // invalid function + } + #[test] fn image_requests_gpu_matches_known_gpu_image_names() { for image in [ diff --git a/crates/openshell-vm/Cargo.toml b/crates/openshell-vm/Cargo.toml index 7d74b3139..388e42351 100644 --- a/crates/openshell-vm/Cargo.toml +++ b/crates/openshell-vm/Cargo.toml @@ -46,5 +46,8 @@ tokio-rustls = { workspace = true } [build-dependencies] zstd = "0.13" +[dev-dependencies] +tempfile = "3" + [lints] workspace = true diff --git a/crates/openshell-vm/build.rs b/crates/openshell-vm/build.rs index 33fab9a78..f448ed0bc 100644 --- a/crates/openshell-vm/build.rs +++ b/crates/openshell-vm/build.rs @@ -12,7 +12,7 @@ //! Environment: //! `OPENSHELL_VM_RUNTIME_COMPRESSED_DIR` - Path to compressed artifacts -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::{env, fs}; fn main() { @@ -116,7 +116,7 @@ fn main() { /// Generate stub (empty) resource files so the build can complete. 
/// The embedded module will fail at runtime if these stubs are used. -fn generate_stub_resources(out_dir: &PathBuf) { +fn generate_stub_resources(out_dir: &Path) { let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); let (libkrun_name, libkrunfw_name) = match target_os.as_str() { diff --git a/crates/openshell-vm/pins.env b/crates/openshell-vm/pins.env index b3d802292..d44f044c8 100644 --- a/crates/openshell-vm/pins.env +++ b/crates/openshell-vm/pins.env @@ -42,3 +42,33 @@ GVPROXY_VERSION="${GVPROXY_VERSION:-v0.8.8}" # Repo: https://github.com/containers/libkrunfw # Pinned: 2026-03-27 (main branch HEAD at time of pinning) LIBKRUNFW_REF="${LIBKRUNFW_REF:-463f717bbdd916e1352a025b6fb2456e882b0b39}" + +# ── cloud-hypervisor (GPU passthrough VMM) ────────────────────────────── +# Repo: https://github.com/cloud-hypervisor/cloud-hypervisor +CLOUD_HYPERVISOR_VERSION="${CLOUD_HYPERVISOR_VERSION:-v42.0}" + +# ── virtiofsd (virtio-fs daemon for cloud-hypervisor rootfs) ──────────── +# Repo: https://gitlab.com/virtio-fs/virtiofsd +VIRTIOFSD_VERSION="${VIRTIOFSD_VERSION:-v1.13.0}" + +# ── NVIDIA GPU support (GPU rootfs variant) ──────────────────────────── +# Driver branch: 570.x (open kernel modules, data-center/workstation) +# +# Compatibility matrix: +# Minimum driver version: 570 (NVIDIA 570.x open kernel modules) +# Minimum compute capability: sm_75 (Turing and newer; open kernel modules) +# Supported architectures: Turing (T4, RTX 20xx), +# Ampere (A100, A10, RTX 30xx), +# Hopper (H100, H200), Ada Lovelace (L40S), +# Blackwell (B100, B200) +# Guest architecture: x86_64 only (NVIDIA does not publish +# aarch64 data-center drivers in APT form) +# Host requirements: IOMMU enabled, GPU bound to vfio-pci driver, +# host driver version >= guest driver version +# +# The 570.x branch uses the open kernel module flavour +# (nvidia-headless-570-open), required for data-center GPUs (Turing+). 
+# Consumer GPUs (GeForce) may work but are not officially supported +# for VFIO passthrough. +NVIDIA_DRIVER_VERSION="${NVIDIA_DRIVER_VERSION:-570}" +NVIDIA_CONTAINER_TOOLKIT_VERSION="${NVIDIA_CONTAINER_TOOLKIT_VERSION:-1.17.5}" diff --git a/crates/openshell-vm/runtime/kernel/openshell.kconfig b/crates/openshell-vm/runtime/kernel/openshell.kconfig index b5f0330af..5ce14a683 100644 --- a/crates/openshell-vm/runtime/kernel/openshell.kconfig +++ b/crates/openshell-vm/runtime/kernel/openshell.kconfig @@ -115,6 +115,10 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PIDS=y CONFIG_MEMCG=y +CONFIG_CGROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_CGROUP_FREEZER=y # ── Disable kernel headers archive (avoids cpio issues in CI) ────────── # CONFIG_IKHEADERS is not set @@ -126,3 +130,29 @@ CONFIG_POSIX_MQUEUE_SYSCTL=y # ── Security features required by the sandbox runtime ─────────────────── CONFIG_SECURITY_LANDLOCK=y CONFIG_SECCOMP_FILTER=y + +# ── PCI / GPU passthrough (harmless for non-GPU boots) ────────────────── +CONFIG_PCI=y +CONFIG_PCI_MSI=y +CONFIG_DRM=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y + +# ── cloud-hypervisor support ──────────────────────────────────────────── +# CHV uses virtio-PCI transport (libkrun uses virtio-MMIO). Both drivers +# coexist safely — the kernel probes whichever transport the hypervisor +# provides. +CONFIG_VIRTIO_PCI=y + +# Serial console for cloud-hypervisor (8250/16550 UART). libkrun uses +# virtio-console which is already enabled in the base config. +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y + +# ACPI support for cloud-hypervisor power management. Required for +# `poweroff -f` to trigger a clean ACPI shutdown that CHV detects. +CONFIG_ACPI=y + +# x2APIC support — Cloud Hypervisor uses x2APIC MADT entries for +# multi-vCPU VMs. Without this, only the bootstrap CPU is activated. 
+CONFIG_X86_X2APIC=y diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh index d43046d4f..99a301f85 100755 --- a/crates/openshell-vm/scripts/build-rootfs.sh +++ b/crates/openshell-vm/scripts/build-rootfs.sh @@ -18,11 +18,16 @@ # - NO pre-initialized k3s state (cold start on first boot) # First boot will be slower (~30-60s) as k3s initializes and pulls images. # +# With --gpu, installs NVIDIA driver packages and the nvidia-container-toolkit +# into the rootfs, producing a GPU-capable variant. The launcher selects this +# rootfs when `--gpu` is passed. Only supported on x86_64 (NVIDIA does not +# publish aarch64 data-center drivers for Ubuntu in this packaging form). +# # Supports aarch64 and x86_64 guest architectures. The target architecture # is auto-detected from the host but can be overridden with --arch. # # Usage: -# ./build-rootfs.sh [--base] [--arch aarch64|x86_64] [output_dir] +# ./build-rootfs.sh [--base] [--gpu] [--arch aarch64|x86_64] [output_dir] # # If output_dir is omitted, the rootfs is built under target/rootfs-build. # @@ -43,12 +48,15 @@ fi # ── Argument parsing ─────────────────────────────────────────────────── BASE_ONLY=false +GPU_BUILD=false GUEST_ARCH="" POSITIONAL_ARGS=() while [[ $# -gt 0 ]]; do case "$1" in --base) BASE_ONLY=true; shift ;; + --gpu) + GPU_BUILD=true; shift ;; --arch) GUEST_ARCH="$2"; shift 2 ;; *) @@ -90,6 +98,14 @@ case "$GUEST_ARCH" in ;; esac +# GPU builds are only supported on x86_64 — NVIDIA does not publish +# aarch64 data-center driver packages in the same APT repository. +if [ "$GPU_BUILD" = true ] && [ "$GUEST_ARCH" != "x86_64" ]; then + echo "ERROR: --gpu is only supported for x86_64 guest architecture." >&2 + echo " Current arch: ${GUEST_ARCH}" >&2 + exit 1 +fi + # Project root (two levels up from crates/openshell-vm/scripts/) PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." 
&& pwd)" DEFAULT_ROOTFS="${PROJECT_ROOT}/target/rootfs-build" @@ -125,6 +141,9 @@ if [ "$BASE_ONLY" = true ]; then echo " k3s version: ${K3S_VERSION}" echo " Output: ${ROOTFS_DIR}" echo " Mode: base (no pre-loaded images, cold start)" + if [ "$GPU_BUILD" = true ]; then + echo " GPU: yes (NVIDIA driver ${NVIDIA_DRIVER_VERSION}, toolkit ${NVIDIA_CONTAINER_TOOLKIT_VERSION})" + fi else echo "==> Building openshell-vm rootfs" echo " Guest arch: ${GUEST_ARCH}" @@ -132,6 +151,9 @@ else echo " Images: ${SERVER_IMAGE}, ${COMMUNITY_SANDBOX_IMAGE}" echo " Output: ${ROOTFS_DIR}" echo " Mode: full (pre-loaded images, pre-initialized)" + if [ "$GPU_BUILD" = true ]; then + echo " GPU: yes (NVIDIA driver ${NVIDIA_DRIVER_VERSION}, toolkit ${NVIDIA_CONTAINER_TOOLKIT_VERSION})" + fi fi echo "" @@ -222,8 +244,55 @@ fi docker rm -f "${CONTAINER_NAME}" 2>/dev/null || true echo "==> Building base image..." -docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ - --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" -f - . <<'DOCKERFILE' +if [ "$GPU_BUILD" = true ]; then + docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ + --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" \ + --build-arg "NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}" \ + --build-arg "NVIDIA_CONTAINER_TOOLKIT_VERSION=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \ + -f - . <<'DOCKERFILE' +ARG BASE_IMAGE +FROM ${BASE_IMAGE} +ARG NVIDIA_DRIVER_VERSION +ARG NVIDIA_CONTAINER_TOOLKIT_VERSION +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + e2fsprogs \ + iptables \ + iproute2 \ + python3 \ + busybox-static \ + sqlite3 \ + util-linux \ + zstd \ + gnupg \ + curl \ + && rm -rf /var/lib/apt/lists/* +# busybox-static provides udhcpc for DHCP inside the VM. 
+RUN mkdir -p /usr/share/udhcpc && \ + ln -sf /bin/busybox /sbin/udhcpc +RUN mkdir -p /var/lib/rancher/k3s /etc/rancher/k3s +# ── NVIDIA driver and container toolkit ────────────────────────────── +# Add the NVIDIA package repository and install the open kernel module +# flavour of the driver plus nvidia-container-toolkit. The open modules +# are required for data-center GPUs (Turing+ / compute capability >= 7.0). +RUN curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + > /etc/apt/sources.list.d/nvidia-container-toolkit.list +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + nvidia-headless-${NVIDIA_DRIVER_VERSION}-open \ + nvidia-utils-${NVIDIA_DRIVER_VERSION} \ + nvidia-container-toolkit=${NVIDIA_CONTAINER_TOOLKIT_VERSION}-1 \ + && rm -rf /var/lib/apt/lists/* +# Configure the NVIDIA container runtime as the default for containerd. +RUN nvidia-ctk runtime configure --runtime=containerd --set-as-default +DOCKERFILE +else + docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ + --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" -f - . <<'DOCKERFILE' ARG BASE_IMAGE FROM ${BASE_IMAGE} RUN apt-get update && \ @@ -243,6 +312,7 @@ RUN mkdir -p /usr/share/udhcpc && \ ln -sf /bin/busybox /sbin/udhcpc RUN mkdir -p /var/lib/rancher/k3s /etc/rancher/k3s DOCKERFILE +fi # Create a container and export the filesystem echo "==> Creating container..." @@ -363,6 +433,28 @@ for manifest in openshell-helmchart.yaml agent-sandbox.yaml; do fi done +# ── Inject GPU manifests (when building GPU rootfs) ─────────────────── +# These are deployed by openshell-vm-init.sh when GPU_ENABLED=true. 
+GPU_MANIFEST_SRC="${SCRIPT_DIR}/gpu-manifests" +GPU_MANIFEST_DEST="${ROOTFS_DIR}/opt/openshell/gpu-manifests" +if [ "$GPU_BUILD" = true ] && [ -d "${GPU_MANIFEST_SRC}" ]; then + echo "==> Injecting GPU manifests..." + mkdir -p "${GPU_MANIFEST_DEST}" + GPU_MANIFEST_COPIED=0 + for manifest in "${GPU_MANIFEST_SRC}"/*.yaml; do + [ -f "$manifest" ] || continue + cp "$manifest" "${GPU_MANIFEST_DEST}/" + echo " $(basename "$manifest")" + GPU_MANIFEST_COPIED=$((GPU_MANIFEST_COPIED + 1)) + done + # Sentinel only when at least one manifest was staged (empty glob must not create it). + if [ "$GPU_MANIFEST_COPIED" -gt 0 ]; then + echo "gpu" > "${ROOTFS_DIR}/opt/openshell/.rootfs-gpu" + else + echo "WARNING: No GPU manifests (*.yaml) found in ${GPU_MANIFEST_SRC}; not writing .rootfs-gpu sentinel." >&2 + fi +fi + # ── Base mode: mark rootfs type and skip pre-loading ─────────────────── if [ "$BASE_ONLY" = true ]; then @@ -384,10 +476,33 @@ if [ "$BASE_ONLY" = true ]; then exit 1 fi + if [ "$GPU_BUILD" = true ]; then + echo "==> Verifying GPU components in rootfs..." + if [ ! -f "${ROOTFS_DIR}/usr/bin/nvidia-smi" ]; then + echo "ERROR: nvidia-smi not found in rootfs." + exit 1 + fi + if [ ! -f "${ROOTFS_DIR}/opt/openshell/.rootfs-gpu" ]; then + echo "ERROR: GPU sentinel file not found in rootfs." + exit 1 + fi + echo " nvidia-smi: found" + # nvidia-container-runtime is installed via nvidia-container-toolkit. + if ls "${ROOTFS_DIR}"/usr/bin/nvidia-container-runtime* >/dev/null 2>&1; then + echo " nvidia-container-runtime: found" + else + echo "WARNING: nvidia-container-runtime not found — GPU pods may not work." 
+ fi + fi + echo "" echo "==> Base rootfs ready at: ${ROOTFS_DIR}" echo " Size: $(du -sh "${ROOTFS_DIR}" | cut -f1)" - echo " Type: base (cold start, images pulled on demand)" + if [ "$GPU_BUILD" = true ]; then + echo " Type: base + GPU (cold start, NVIDIA driver ${NVIDIA_DRIVER_VERSION})" + else + echo " Type: base (cold start, images pulled on demand)" + fi echo "" echo "Note: First boot will take ~30-60s as k3s initializes." echo " Container images will be pulled from registries on first use." @@ -475,6 +590,15 @@ for manifest in "${MANIFEST_DEST}"/*.yaml; do cp "$manifest" "${INIT_MANIFESTS}/" done +# GPU manifests: same pre-init path as other auto-deploy manifests so k3s +# sees them during cluster bake (not only under /opt/openshell/gpu-manifests). +if [ "$GPU_BUILD" = true ] && [ -d "${GPU_MANIFEST_DEST}" ]; then + for manifest in "${GPU_MANIFEST_DEST}"/*.yaml; do + [ -f "$manifest" ] || continue + cp "$manifest" "${INIT_MANIFESTS}/" + done +fi + # Patch HelmChart for local images and VM settings. HELMCHART="${INIT_MANIFESTS}/openshell-helmchart.yaml" if [ -f "$HELMCHART" ]; then @@ -741,10 +865,28 @@ if [ ! -x "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" ]; then exit 1 fi +# ── GPU verification (full mode) ────────────────────────────────────── +if [ "$GPU_BUILD" = true ]; then + echo "==> Verifying GPU components in rootfs..." + if [ ! -f "${ROOTFS_DIR}/usr/bin/nvidia-smi" ]; then + echo "ERROR: nvidia-smi not found in rootfs." + exit 1 + fi + echo " nvidia-smi: found" + if ls "${ROOTFS_DIR}"/usr/bin/nvidia-container-runtime* >/dev/null 2>&1; then + echo " nvidia-container-runtime: found" + else + echo "WARNING: nvidia-container-runtime not found — GPU pods may not work." 
+ fi +fi + echo "" echo "==> Rootfs ready at: ${ROOTFS_DIR}" echo " Size: $(du -sh "${ROOTFS_DIR}" | cut -f1)" echo " Pre-initialized: $(cat "${ROOTFS_DIR}/opt/openshell/.initialized" 2>/dev/null || echo 'no')" +if [ "$GPU_BUILD" = true ]; then + echo " GPU: NVIDIA driver ${NVIDIA_DRIVER_VERSION}, toolkit ${NVIDIA_CONTAINER_TOOLKIT_VERSION}" +fi # Show k3s data size K3S_DATA="${ROOTFS_DIR}/var/lib/rancher/k3s" diff --git a/crates/openshell-vm/scripts/gpu-manifests/README.md b/crates/openshell-vm/scripts/gpu-manifests/README.md new file mode 100644 index 000000000..c72deb1aa --- /dev/null +++ b/crates/openshell-vm/scripts/gpu-manifests/README.md @@ -0,0 +1,41 @@ +# GPU Rootfs Manifests + +These Kubernetes manifests are injected into the VM rootfs when +`build-rootfs.sh --gpu` is used. During a **full** rootfs build they are +also copied into the k3s auto-deploy manifest directory so they are +applied at pre-init time. + +**Phase 2:** deployment from `openshell-vm-init.sh` when +`GPU_ENABLED=true` is not implemented yet; that path will copy or +reconcile these manifests at VM boot. + +## NVIDIA Driver Compatibility + +| Property | Value | +|---|---| +| Driver branch | 570.x (open kernel modules) | +| Minimum compute capability | sm_75 (Turing and newer) | +| Container toolkit | nvidia-container-toolkit 1.17.x | +| Device plugin Helm chart | 0.18.2 | + +### Why open kernel modules? + +The 570.x open kernel modules are required for data-center GPUs +(Turing, Ampere, Hopper, Blackwell). They are the +NVIDIA-recommended driver for passthrough and container workloads. +GPUs prior to Turing (sm_75, e.g. Volta) are **not supported** +with open modules — use the proprietary driver branch if needed. 
+ +### Host requirements + +- IOMMU enabled in BIOS and kernel (`intel_iommu=on` or `amd_iommu=on`) +- GPU bound to `vfio-pci` driver on the host +- `/dev/vfio/vfio` and `/dev/vfio/<iommu-group>` accessible +- Host NVIDIA driver version >= 570 (must match or exceed guest driver) + +### Files + +- `nvidia-device-plugin.yaml` — HelmChart CR that deploys the NVIDIA + k8s-device-plugin via the k3s Helm controller. +- `nvidia-runtime-class.yaml` — RuntimeClass object so pods can use + `runtimeClassName: nvidia`. diff --git a/crates/openshell-vm/scripts/gpu-manifests/nvidia-device-plugin.yaml b/crates/openshell-vm/scripts/gpu-manifests/nvidia-device-plugin.yaml new file mode 100644 index 000000000..c1cbeaa8a --- /dev/null +++ b/crates/openshell-vm/scripts/gpu-manifests/nvidia-device-plugin.yaml @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# HelmChart CR for auto-deploying the NVIDIA k8s-device-plugin via k3s Helm controller. +# +# This manifest is copied into /var/lib/rancher/k3s/server/manifests/ by the +# VM init script when GPU_ENABLED=true. It is the VM-specific equivalent of +# deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml used by the +# Docker-based gateway. +# +# The chart installs: +# - NVIDIA device plugin DaemonSet (advertises nvidia.com/gpu resources) +# +# NFD and GFD are disabled; the device plugin's default nodeAffinity +# (which requires nvidia.com/gpu.present=true) is overridden to empty +# so it schedules on any node without requiring NFD/GFD labels. +# +# CDI injection mode: the device plugin uses deviceListStrategy=cdi-cri so that +# devices are injected via CDI hooks before container start. Sandbox pods only +# need the nvidia.com/gpu resource request — no runtimeClassName is required. 
+# +# k3s auto-detects nvidia-container-runtime on PATH and registers the "nvidia" +# RuntimeClass automatically; nvidia-runtime-class.yaml is still shipped as a fallback. + +apiVersion: helm.cattle.io/v1 +kind: HelmChart +metadata: + name: nvidia-device-plugin + namespace: kube-system +spec: + repo: https://nvidia.github.io/k8s-device-plugin + chart: nvidia-device-plugin + version: "0.18.2" + targetNamespace: nvidia-device-plugin + createNamespace: true + valuesContent: |- + runtimeClassName: nvidia + deviceListStrategy: cdi-cri + deviceIDStrategy: index + cdi: + nvidiaHookPath: /usr/bin/nvidia-cdi-hook + nvidiaDriverRoot: "/" + gfd: + enabled: false + nfd: + enabled: false + affinity: null diff --git a/crates/openshell-vm/scripts/gpu-manifests/nvidia-runtime-class.yaml b/crates/openshell-vm/scripts/gpu-manifests/nvidia-runtime-class.yaml new file mode 100644 index 000000000..fe2ccbd6e --- /dev/null +++ b/crates/openshell-vm/scripts/gpu-manifests/nvidia-runtime-class.yaml @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# RuntimeClass for NVIDIA GPU workloads. +# Deployed alongside the device plugin when GPU_ENABLED=true. +# Pods requesting nvidia.com/gpu resources should set +# runtimeClassName: nvidia to use the NVIDIA container runtime. +--- +apiVersion: node.k8s.io/v1 +kind: RuntimeClass +metadata: + name: nvidia +handler: nvidia diff --git a/crates/openshell-vm/scripts/openshell-vm-init.sh b/crates/openshell-vm/scripts/openshell-vm-init.sh index 1cb686a31..222bcc641 100755 --- a/crates/openshell-vm/scripts/openshell-vm-init.sh +++ b/crates/openshell-vm/scripts/openshell-vm-init.sh @@ -46,6 +46,31 @@ mkdir -p /sys/fs/cgroup mount -t cgroup2 cgroup2 /sys/fs/cgroup 2>/dev/null & wait + +# ── Parse kernel cmdline for env vars (cloud-hypervisor path) ──────── +# cloud-hypervisor passes environment variables via kernel cmdline +# (KEY=VALUE tokens). 
These are not automatically exported to init. +# Must run after /proc is mounted. +if [ -f /proc/cmdline ]; then + for token in $(cat /proc/cmdline); do + case "$token" in + GPU_ENABLED=*|OPENSHELL_VM_STATE_DISK_DEVICE=*|VM_NET_IP=*|VM_NET_GW=*|VM_NET_DNS=*) + export "$token" + ;; + esac + done +fi + +# Enable cgroup v2 controllers in the root cgroup hierarchy. +# k3s/kubelet requires cpu, cpuset, memory, and pids controllers. +# The kernel must have CONFIG_CGROUP_SCHED=y for the cpu controller. +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + for ctrl in cpu cpuset memory pids io; do + if grep -qw "$ctrl" /sys/fs/cgroup/cgroup.controllers; then + echo "+$ctrl" > /sys/fs/cgroup/cgroup.subtree_control 2>/dev/null || true + fi + done +fi + ts "filesystems mounted" # ── Networking ────────────────────────────────────────────────────────── @@ -97,20 +122,26 @@ DHCP_SCRIPT # -n: exit if no lease, -T 1: 1s between retries, -t 3: 3 retries # -A 1: wait 1s before first retry (aggressive for local gvproxy) if ! udhcpc -i eth0 -f -q -n -T 1 -t 3 -A 1 -s "$UDHCPC_SCRIPT" 2>&1; then - ts "WARNING: DHCP failed, falling back to static config" - ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true - ip route add default via 192.168.127.1 2>/dev/null || true + STATIC_IP="${VM_NET_IP:-192.168.127.2}" + STATIC_GW="${VM_NET_GW:-192.168.127.1}" + ts "WARNING: DHCP failed, falling back to static config ($STATIC_IP gw $STATIC_GW)" + ip addr add "${STATIC_IP}/24" dev eth0 2>/dev/null || true + ip route add default via "$STATIC_GW" 2>/dev/null || true fi else - # Fallback to static config if no DHCP client available. 
- ts "no DHCP client, using static config" - ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true - ip route add default via 192.168.127.1 2>/dev/null || true + STATIC_IP="${VM_NET_IP:-192.168.127.2}" + STATIC_GW="${VM_NET_GW:-192.168.127.1}" + ts "no DHCP client, using static config ($STATIC_IP gw $STATIC_GW)" + ip addr add "${STATIC_IP}/24" dev eth0 2>/dev/null || true + ip route add default via "$STATIC_GW" 2>/dev/null || true fi - # Ensure DNS is configured. DHCP should have set /etc/resolv.conf, - # but if it didn't (or static fallback was used), provide a default. - if [ ! -s /etc/resolv.conf ]; then + # Ensure DNS is configured. When VM_NET_DNS is set (TAP networking), + # always use it — the rootfs may have a stale resolv.conf from a + # previous gvproxy run that points to an unreachable gateway. + if [ -n "${VM_NET_DNS:-}" ]; then + echo "nameserver $VM_NET_DNS" > /etc/resolv.conf + elif [ ! -s /etc/resolv.conf ]; then echo "nameserver 8.8.8.8" > /etc/resolv.conf echo "nameserver 8.8.4.4" >> /etc/resolv.conf fi @@ -366,6 +397,35 @@ if [ "$_caps_ok" = false ]; then exit 1 fi +# ── GPU: NVIDIA driver and device plugin ───────────────────────────── +# When the VM is launched with --gpu, the Rust launcher passes +# GPU_ENABLED=true. Load the NVIDIA kernel modules, verify the device +# is visible via nvidia-smi, and confirm that the container runtime is +# available before k3s starts. + +if [ "${GPU_ENABLED:-false}" = "true" ]; then + ts "GPU mode enabled — loading NVIDIA drivers" + + modprobe nvidia || { echo "FATAL: failed to load nvidia kernel module" >&2; exit 1; } + modprobe nvidia_uvm || { echo "FATAL: failed to load nvidia_uvm kernel module" >&2; exit 1; } + modprobe nvidia_modeset || { echo "FATAL: failed to load nvidia_modeset kernel module" >&2; exit 1; } + ts "NVIDIA kernel modules loaded" + + if ! 
nvidia-smi > /dev/null 2>&1; then + echo "FATAL: GPU_ENABLED=true but nvidia-smi failed — GPU not visible to guest" >&2 + echo "Check: VFIO passthrough, IOMMU groups, guest kernel modules" >&2 + exit 1 + fi + ts "nvidia-smi: $(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)" + + if command -v nvidia-container-runtime >/dev/null 2>&1; then + ts "nvidia-container-runtime: $(command -v nvidia-container-runtime)" + else + echo "FATAL: nvidia-container-runtime not found — GPU pods will fail" >&2 + exit 1 + fi +fi + # ── Deploy bundled manifests (cold boot only) ─────────────────────────── # On pre-initialized rootfs, manifests are already in place from the # build-time k3s boot. Skip this entirely for fast startup. @@ -411,6 +471,29 @@ else ts "skipping manifest deploy (pre-initialized)" fi +# ── GPU manifests (device plugin, runtime class) ───────────────────── +# Deployed on every boot (not just cold boot) so the device plugin is +# always present when GPU_ENABLED=true. Mirrors cluster-entrypoint.sh. +if [ "${GPU_ENABLED:-false}" = "true" ]; then + GPU_MANIFESTS="/opt/openshell/gpu-manifests" + if [ ! -d "$GPU_MANIFESTS" ]; then + echo "FATAL: GPU_ENABLED=true but GPU manifests directory missing: $GPU_MANIFESTS" >&2 + exit 1 + fi + mkdir -p "$K3S_MANIFESTS" + _gpu_manifest_deployed=false + for manifest in "$GPU_MANIFESTS"/*.yaml; do + [ -f "$manifest" ] || continue + _gpu_manifest_deployed=true + cp "$manifest" "$K3S_MANIFESTS/" + ts "deployed GPU manifest: $(basename "$manifest")" + done + if [ "$_gpu_manifest_deployed" = false ]; then + echo "FATAL: GPU_ENABLED=true but no YAML manifests found in $GPU_MANIFESTS" >&2 + exit 1 + fi +fi + # Patch manifests for VM deployment constraints. 
HELMCHART="$K3S_MANIFESTS/openshell-helmchart.yaml" if [ -f "$HELMCHART" ]; then @@ -737,7 +820,7 @@ K3S_ARGS=( --node-ip="$NODE_IP" --kube-apiserver-arg=bind-address=0.0.0.0 --resolv-conf=/etc/resolv.conf - --tls-san=localhost,127.0.0.1,10.0.2.15,192.168.127.2 + --tls-san="localhost,127.0.0.1,10.0.2.15,192.168.127.2,$NODE_IP" --flannel-backend=none --snapshotter=overlayfs --kube-proxy-arg=proxy-mode=nftables diff --git a/crates/openshell-vm/src/backend/cloud_hypervisor.rs b/crates/openshell-vm/src/backend/cloud_hypervisor.rs new file mode 100644 index 000000000..869b1747d --- /dev/null +++ b/crates/openshell-vm/src/backend/cloud_hypervisor.rs @@ -0,0 +1,1476 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! cloud-hypervisor backend for GPU passthrough VMs. +//! +//! Uses the cloud-hypervisor REST API over a Unix socket to manage VMs +//! with VFIO device passthrough. This backend is Linux-only and requires +//! a separate kernel image (`vmlinux`) and `virtiofsd` for the root +//! filesystem. + +use std::io::{Read, Write}; +use std::os::unix::net::UnixStream; +use std::path::{Path, PathBuf}; +use std::time::{Duration, Instant}; + +use super::VmBackend; +use crate::exec::{ + VM_EXEC_VSOCK_PORT, clear_vm_runtime_state, vm_exec_socket_path, write_vm_runtime_state, +}; +use crate::{NetBackend, VmConfig, VmError, vm_rootfs_key}; + +/// cloud-hypervisor hypervisor backend for GPU passthrough. +pub struct CloudHypervisorBackend { + /// Path to the cloud-hypervisor binary. + chv_binary: PathBuf, + /// Path to the vmlinux kernel image. + vmlinux: PathBuf, + /// Path to the virtiofsd binary. + virtiofsd: PathBuf, +} + +impl CloudHypervisorBackend { + /// Create a new cloud-hypervisor backend, validating required binaries. 
+    pub fn new() -> Result<Self, VmError> {
+        let runtime_dir = crate::configured_runtime_dir()?;
+
+        let chv_binary = runtime_dir.join("cloud-hypervisor");
+        if !chv_binary.is_file() {
+            return Err(VmError::BinaryNotFound {
+                path: chv_binary.display().to_string(),
+                hint: "GPU passthrough requires cloud-hypervisor. Run the GPU build pipeline or set OPENSHELL_VM_RUNTIME_DIR".to_string(),
+            });
+        }
+
+        let vmlinux = runtime_dir.join("vmlinux");
+        if !vmlinux.is_file() {
+            return Err(VmError::BinaryNotFound {
+                path: vmlinux.display().to_string(),
+                hint: "GPU passthrough requires a vmlinux kernel. Run the GPU build pipeline"
+                    .to_string(),
+            });
+        }
+
+        let virtiofsd = runtime_dir.join("virtiofsd");
+        if !virtiofsd.is_file() {
+            return Err(VmError::BinaryNotFound {
+                path: virtiofsd.display().to_string(),
+                hint: "GPU passthrough requires virtiofsd. Run the GPU build pipeline".to_string(),
+            });
+        }
+
+        Ok(Self {
+            chv_binary,
+            vmlinux,
+            virtiofsd,
+        })
+    }
+}
+
+impl VmBackend for CloudHypervisorBackend {
+    fn launch(&self, config: &VmConfig) -> Result<i32, VmError> {
+        launch_cloud_hypervisor(self, config)
+    }
+}
+
+// ── REST API client ─────────────────────────────────────────────────────
+
+/// Send a raw HTTP/1.1 request over a Unix socket and return the response body.
+///
+/// Parses the response headers to determine Content-Length so we read exactly
+/// the right number of bytes without relying on EOF or Connection: close.
+fn http_request_unix(
+    socket_path: &Path,
+    method: &str,
+    path: &str,
+    body: Option<&str>,
+) -> Result<(u16, String), String> {
+    use std::io::BufRead;
+
+    let stream = UnixStream::connect(socket_path)
+        .map_err(|e| format!("connect to cloud-hypervisor API: {e}"))?;
+
+    stream
+        .set_read_timeout(Some(Duration::from_secs(30)))
+        .map_err(|e| format!("set read timeout: {e}"))?;
+
+    let request = if let Some(body) = body {
+        format!(
+            "{method} {path} HTTP/1.1\r\n\
+             Host: localhost\r\n\
+             Content-Type: application/json\r\n\
+             Content-Length: {}\r\n\
+             \r\n\
+             {body}",
+            body.len(),
+        )
+    } else {
+        format!(
+            "{method} {path} HTTP/1.1\r\n\
+             Host: localhost\r\n\
+             \r\n"
+        )
+    };
+
+    {
+        let mut writer = &stream;
+        writer
+            .write_all(request.as_bytes())
+            .map_err(|e| format!("write to cloud-hypervisor API: {e}"))?;
+    }
+
+    let mut reader = std::io::BufReader::new(&stream);
+
+    // Read status line
+    let mut status_line = String::new();
+    reader
+        .read_line(&mut status_line)
+        .map_err(|e| format!("read status line: {e}"))?;
+
+    let status_code = status_line
+        .split_whitespace()
+        .nth(1)
+        .and_then(|code| code.parse::<u16>().ok())
+        .unwrap_or(0);
+
+    // Read headers to find Content-Length
+    let mut content_length: usize = 0;
+    loop {
+        let mut header_line = String::new();
+        reader
+            .read_line(&mut header_line)
+            .map_err(|e| format!("read header: {e}"))?;
+        if header_line.trim().is_empty() {
+            break;
+        }
+        if let Some(val) = header_line
+            .strip_prefix("Content-Length:")
+            .or_else(|| header_line.strip_prefix("content-length:"))
+        {
+            if let Ok(len) = val.trim().parse::<usize>() {
+                content_length = len;
+            }
+        }
+    }
+
+    // Read body based on Content-Length
+    let mut body_bytes = vec![0u8; content_length];
+    if content_length > 0 {
+        reader
+            .read_exact(&mut body_bytes)
+            .map_err(|e| format!("read body ({content_length} bytes): {e}"))?;
+    }
+
+    let body_str = String::from_utf8_lossy(&body_bytes).to_string();
+    Ok((status_code, body_str))
+}
+
+/// Wait for a Unix 
socket to appear on the filesystem. +fn wait_for_socket(socket_path: &Path, label: &str, timeout: Duration) -> Result<(), VmError> { + let deadline = Instant::now() + timeout; + let mut interval = Duration::from_millis(10); + + while !socket_path.exists() { + if Instant::now() >= deadline { + return Err(VmError::HostSetup(format!( + "{label} socket did not appear within {}s: {}", + timeout.as_secs(), + socket_path.display(), + ))); + } + std::thread::sleep(interval); + interval = (interval * 2).min(Duration::from_millis(200)); + } + + Ok(()) +} + +/// Create the VM via the cloud-hypervisor REST API. +fn api_vm_create(socket_path: &Path, payload: &str) -> Result<(), VmError> { + let (status, body) = http_request_unix(socket_path, "PUT", "/api/v1/vm.create", Some(payload)) + .map_err(|e| VmError::HostSetup(format!("vm.create: {e}")))?; + + if status >= 200 && status < 300 { + Ok(()) + } else { + Err(VmError::HostSetup(format!( + "vm.create returned HTTP {status}: {body}" + ))) + } +} + +/// Boot the VM. +fn api_vm_boot(socket_path: &Path) -> Result<(), VmError> { + let (status, body) = http_request_unix(socket_path, "PUT", "/api/v1/vm.boot", None) + .map_err(|e| VmError::HostSetup(format!("vm.boot: {e}")))?; + + if status >= 200 && status < 300 { + Ok(()) + } else { + Err(VmError::HostSetup(format!( + "vm.boot returned HTTP {status}: {body}" + ))) + } +} + +/// Request a graceful shutdown. +fn api_vm_shutdown(socket_path: &Path) -> Result<(), VmError> { + let (status, body) = http_request_unix(socket_path, "PUT", "/api/v1/vm.shutdown", None) + .map_err(|e| VmError::HostSetup(format!("vm.shutdown: {e}")))?; + + if status >= 200 && status < 300 { + Ok(()) + } else { + Err(VmError::HostSetup(format!( + "vm.shutdown returned HTTP {status}: {body}" + ))) + } +} + +/// Query VM info/status. 
+#[allow(dead_code)]
+fn api_vm_info(socket_path: &Path) -> Result<String, VmError> {
+    let (status, body) = http_request_unix(socket_path, "GET", "/api/v1/vm.info", None)
+        .map_err(|e| VmError::HostSetup(format!("vm.info: {e}")))?;
+
+    if status >= 200 && status < 300 {
+        Ok(body)
+    } else {
+        Err(VmError::HostSetup(format!(
+            "vm.info returned HTTP {status}: {body}"
+        )))
+    }
+}
+
+/// Delete the VM.
+#[allow(dead_code)]
+fn api_vm_delete(socket_path: &Path) -> Result<(), VmError> {
+    let (status, body) = http_request_unix(socket_path, "PUT", "/api/v1/vm.delete", None)
+        .map_err(|e| VmError::HostSetup(format!("vm.delete: {e}")))?;
+
+    if status >= 200 && status < 300 {
+        Ok(())
+    } else {
+        Err(VmError::HostSetup(format!(
+            "vm.delete returned HTTP {status}: {body}"
+        )))
+    }
+}
+
+// ── Build the VM create payload ─────────────────────────────────────────
+
+fn build_vm_create_payload(
+    backend: &CloudHypervisorBackend,
+    config: &VmConfig,
+    effective_exec_path: &str,
+    vfio_device: Option<&str>,
+    virtiofsd_sock: &Path,
+    state_disk_path: Option<&Path>,
+    use_tap_net: bool,
+    vsock_sock: &Path,
+    console_log: &Path,
+) -> Result<String, VmError> {
+    let mem_bytes = u64::from(config.mem_mib) * 1024 * 1024;
+
+    let mut cmdline_parts = vec![
+        "console=ttyS0".to_string(),
+        "root=rootfs".to_string(),
+        "rootfstype=virtiofs".to_string(),
+        "rw".to_string(),
+        "panic=-1".to_string(),
+        format!("init={effective_exec_path}"),
+    ];
+
+    // Pass environment variables via kernel cmdline. Unrecognised kernel
+    // parameters are forwarded to init as env vars. Only simple KEY=VALUE
+    // pairs without spaces are safe (cmdline is space-delimited, ~4096 B).
+ if config.gpu_enabled && config.vfio_device.is_some() { + cmdline_parts.push("GPU_ENABLED=true".to_string()); + } + if let Some(state_disk) = &config.state_disk { + cmdline_parts.push(format!( + "OPENSHELL_VM_STATE_DISK_DEVICE={}", + state_disk.guest_device + )); + } + for var in &config.env { + if var.contains('=') && !var.contains(' ') && !var.contains('"') { + cmdline_parts.push(var.clone()); + } + } + + if use_tap_net { + cmdline_parts.push(format!("VM_NET_IP={CHV_TAP_GUEST_IP}")); + cmdline_parts.push(format!("VM_NET_GW={CHV_TAP_HOST_IP}")); + cmdline_parts.push(format!("VM_NET_DNS={}", host_dns_server())); + } + + let cmdline = cmdline_parts.join(" "); + + let mut payload = serde_json::json!({ + "cpus": { + "boot_vcpus": config.vcpus, + "max_vcpus": config.vcpus, + }, + "memory": { + "size": mem_bytes, + "shared": true, + }, + "payload": { + "kernel": backend.vmlinux.display().to_string(), + "cmdline": cmdline, + }, + "fs": [{ + "tag": "rootfs", + "socket": virtiofsd_sock.display().to_string(), + "num_queues": 1, + "queue_size": 1024, + }], + "vsock": { + "cid": VSOCK_GUEST_CID, + "socket": vsock_sock.display().to_string(), + }, + "serial": { + "mode": "File", + "file": console_log.display().to_string(), + }, + "console": { + "mode": "Off", + }, + }); + + if let Some(disk_path) = state_disk_path { + payload["disks"] = serde_json::json!([{ + "path": disk_path.display().to_string(), + "readonly": false, + }]); + } + + // Cloud-hypervisor uses TAP devices for networking (requires root or + // CAP_NET_ADMIN). The gvproxy QEMU-style socket protocol is not + // compatible with CHV's NetConfig. GPU passthrough already requires + // elevated privileges, so TAP access is expected. 
+ if use_tap_net { + payload["net"] = serde_json::json!([{ + "mac": "5a:94:ef:e4:0c:ee", + "ip": CHV_TAP_HOST_IP, + "mask": CHV_TAP_NETMASK, + }]); + } + + if let Some(vfio_path) = vfio_device { + payload["devices"] = serde_json::json!([{ + "path": format!("/sys/bus/pci/devices/{vfio_path}/"), + }]); + } + + serde_json::to_string(&payload) + .map_err(|e| VmError::HostSetup(format!("serialize vm.create payload: {e}"))) +} + +// ── Launch ────────────────────────────────────────────────────────────── + +#[allow(clippy::similar_names)] +fn launch_cloud_hypervisor( + backend: &CloudHypervisorBackend, + config: &VmConfig, +) -> Result { + let launch_start = Instant::now(); + + let run_dir = config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .to_path_buf(); + let rootfs_key = vm_rootfs_key(&config.rootfs); + + // Unix domain sockets are limited to 108 characters (SUN_LEN). + // Instance rootfs paths can be deeply nested, so place sockets + // under /tmp to stay within the limit. + let sock_dir = PathBuf::from(format!("/tmp/ovm-chv-{}", std::process::id())); + std::fs::create_dir_all(&sock_dir).map_err(|e| { + VmError::HostSetup(format!("create socket dir {}: {e}", sock_dir.display())) + })?; + + let api_sock_path = sock_dir.join("api.sock"); + let vsock_sock_path = sock_dir.join("vsock.sock"); + let virtiofsd_sock_path = sock_dir.join("virtiofsd.sock"); + let console_log = config + .console_output + .clone() + .unwrap_or_else(|| run_dir.join(format!("{rootfs_key}-console.log"))); + + // Clean stale sockets + let _ = std::fs::remove_file(&api_sock_path); + let _ = std::fs::remove_file(&vsock_sock_path); + let _ = std::fs::remove_file(&virtiofsd_sock_path); + + // Start virtiofsd for the rootfs + eprintln!("Starting virtiofsd: {}", backend.virtiofsd.display()); + let virtiofsd_log = run_dir.join(format!("{rootfs_key}-virtiofsd.log")); + let virtiofsd_log_file = std::fs::File::create(&virtiofsd_log) + .map_err(|e| VmError::Fork(format!("create virtiofsd log: 
{e}")))?; + + let mut virtiofsd_child = std::process::Command::new(&backend.virtiofsd) + .arg(format!("--socket-path={}", virtiofsd_sock_path.display())) + .arg(format!("--shared-dir={}", config.rootfs.display())) + .arg("--cache=always") + .stdout(std::process::Stdio::null()) + .stderr(virtiofsd_log_file) + .spawn() + .map_err(|e| VmError::Fork(format!("start virtiofsd: {e}")))?; + + eprintln!( + "virtiofsd started (pid {}) [{:.1}s]", + virtiofsd_child.id(), + launch_start.elapsed().as_secs_f64() + ); + + // Wait for virtiofsd socket + wait_for_socket(&virtiofsd_sock_path, "virtiofsd", Duration::from_secs(5))?; + + // CHV uses TAP networking (requires root/CAP_NET_ADMIN). The gvproxy + // QEMU-style socket protocol is not compatible with cloud-hypervisor's + // NetConfig. GPU passthrough already requires elevated privileges. + let use_tap_net = !matches!(config.net, NetBackend::None); + + // For --exec mode: wrap the command so the VM powers off after it exits. + // Unlike libkrun (which exits when init terminates), cloud-hypervisor + // keeps running after PID 1 exits (kernel panics). A wrapper init script + // runs the command then calls `poweroff -f` for a clean ACPI shutdown. 
+ let is_exec_mode = config.exec_path != "/srv/openshell-vm-init.sh"; + let wrapper_path = config.rootfs.join("tmp/chv-exec-wrapper.sh"); + let effective_exec_path; + if is_exec_mode { + let args_str = config + .args + .iter() + .map(|a| shell_escape(a)) + .collect::>() + .join(" "); + + let env_str = config + .env + .iter() + .map(|v| format!("export {}", shell_escape(v))) + .collect::>() + .join("\n"); + + let wrapper = format!( + "#!/bin/sh\n\ + mount -t proc proc /proc 2>/dev/null\n\ + mount -t sysfs sysfs /sys 2>/dev/null\n\ + mount -t devtmpfs devtmpfs /dev 2>/dev/null\n\ + {env_str}\n\ + cd {workdir}\n\ + {exec} {args}\n\ + RC=$?\n\ + # Trigger ACPI power-off so cloud-hypervisor exits cleanly.\n\ + # The rootfs may not have a `poweroff` binary, so try multiple methods.\n\ + if command -v poweroff >/dev/null 2>&1; then\n\ + poweroff -f\n\ + elif [ -x /usr/bin/busybox ]; then\n\ + /usr/bin/busybox poweroff -f\n\ + else\n\ + echo o > /proc/sysrq-trigger\n\ + fi\n\ + exit $RC\n", + env_str = env_str, + workdir = shell_escape(&config.workdir), + exec = shell_escape(&config.exec_path), + args = args_str, + ); + + if let Some(parent) = wrapper_path.parent() { + std::fs::create_dir_all(parent) + .map_err(|e| VmError::HostSetup(format!("create wrapper dir: {e}")))?; + } + std::fs::write(&wrapper_path, &wrapper) + .map_err(|e| VmError::HostSetup(format!("write exec wrapper: {e}")))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let _ = std::fs::set_permissions(&wrapper_path, std::fs::Permissions::from_mode(0o755)); + } + effective_exec_path = "/tmp/chv-exec-wrapper.sh".to_string(); + } else { + effective_exec_path = config.exec_path.clone(); + } + + // Start cloud-hypervisor process + eprintln!( + "Starting cloud-hypervisor: {}", + backend.chv_binary.display() + ); + + let chv_log = run_dir.join(format!("{rootfs_key}-cloud-hypervisor.log")); + let chv_log_file = std::fs::File::create(&chv_log) + .map_err(|e| VmError::Fork(format!("create 
cloud-hypervisor log: {e}")))?; + + let mut chv_child = std::process::Command::new(&backend.chv_binary) + .arg("--api-socket") + .arg(&api_sock_path) + .stdout(std::process::Stdio::null()) + .stderr(chv_log_file) + .spawn() + .map_err(|e| VmError::Fork(format!("start cloud-hypervisor: {e}")))?; + + let chv_pid = chv_child.id() as i32; + eprintln!( + "cloud-hypervisor started (pid {chv_pid}) [{:.1}s]", + launch_start.elapsed().as_secs_f64() + ); + + // Wait for API socket + wait_for_socket(&api_sock_path, "cloud-hypervisor", Duration::from_secs(10))?; + + // Build and send VM create payload + let state_disk_path = config.state_disk.as_ref().map(|sd| sd.path.as_path()); + let payload = build_vm_create_payload( + backend, + config, + &effective_exec_path, + config.vfio_device.as_deref(), + &virtiofsd_sock_path, + state_disk_path, + use_tap_net, + &vsock_sock_path, + &console_log, + )?; + + api_vm_create(&api_sock_path, &payload)?; + eprintln!("VM created [{:.1}s]", launch_start.elapsed().as_secs_f64()); + + api_vm_boot(&api_sock_path)?; + let boot_start = Instant::now(); + eprintln!("VM booting [{:.1}s]", launch_start.elapsed().as_secs_f64()); + + // Set up host-side networking for TAP (NAT, IP forwarding, masquerade) + // so the guest can reach the internet through the host. 
+    let mut original_ip_forward: Option<String> = None;
+    if use_tap_net {
+        match setup_chv_host_networking() {
+            Ok(orig) => original_ip_forward = Some(orig),
+            Err(e) => {
+                eprintln!("WARNING: host networking setup failed: {e}");
+                eprintln!(" The VM may not have internet access.");
+            }
+        }
+    }
+
+    // Write runtime state (vsock_bridge: true — CHV uses AF_VSOCK bridging)
+    if config.exec_path == "/srv/openshell-vm-init.sh" {
+        if let Err(err) = write_vm_runtime_state(&config.rootfs, chv_pid, &console_log, None, true)
+        {
+            let _ = api_vm_shutdown(&api_sock_path);
+            let _ = chv_child.kill();
+            let _ = chv_child.wait();
+            let _ = virtiofsd_child.kill();
+            let _ = virtiofsd_child.wait();
+            if let Some(ref orig) = original_ip_forward {
+                teardown_chv_host_networking(orig);
+            }
+            clear_vm_runtime_state(&config.rootfs);
+            return Err(err);
+        }
+    }
+
+    // CHV TAP networking doesn't provide built-in port forwarding like
+    // gvproxy. Start a TCP proxy for each port mapping so the host can
+    // reach guest services (e.g., the gateway health check on :30051).
+    if use_tap_net {
+        for pm in &config.port_map {
+            let parts: Vec<&str> = pm.split(':').collect();
+            if parts.len() == 2 {
+                if let (Ok(hp), Ok(gp)) = (parts[0].parse::<u16>(), parts[1].parse::<u16>()) {
+                    start_tcp_port_forwarder(hp, CHV_TAP_GUEST_IP, gp)?;
+                }
+            }
+        }
+    }
+
+    for pm in &config.port_map {
+        let host_port = pm.split(':').next().unwrap_or(pm);
+        eprintln!(" port {pm} -> http://localhost:{host_port}");
+    }
+    eprintln!("Console output: {}", console_log.display());
+
+    // Start vsock exec bridge (exec Unix socket → CHV vsock Unix socket).
+    // The bridge allows `openshell-vm exec` and bootstrap to communicate
+    // with the guest exec agent over the standard exec socket path.
+    let exec_socket = vm_exec_socket_path(&config.rootfs);
+    start_vsock_exec_bridge(&exec_socket, &vsock_sock_path, VM_EXEC_VSOCK_PORT)?;
+
+    // Gateway bootstrap and health check (mirrors libkrun backend).
+ if config.exec_path == "/srv/openshell-vm-init.sh" && !config.port_map.is_empty() { + let gateway_port = crate::gateway_host_port(config); + crate::bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port)?; + crate::health::wait_for_gateway_ready(gateway_port, &config.gateway_name)?; + } + + eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); + eprintln!("Press Ctrl+C to stop."); + + // Signal forwarding: SIGINT/SIGTERM -> graceful shutdown + unsafe { + libc::signal( + libc::SIGINT, + crate::forward_signal as *const () as libc::sighandler_t, + ); + libc::signal( + libc::SIGTERM, + crate::forward_signal as *const () as libc::sighandler_t, + ); + crate::CHILD_PID.store(chv_pid, std::sync::atomic::Ordering::Relaxed); + } + + // Wait for cloud-hypervisor to exit + let status = chv_child + .wait() + .map_err(|e| VmError::HostSetup(format!("wait for cloud-hypervisor: {e}")))?; + + // Clean up host networking rules + if let Some(ref orig) = original_ip_forward { + teardown_chv_host_networking(orig); + } + + // Cleanup + if config.exec_path == "/srv/openshell-vm-init.sh" { + clear_vm_runtime_state(&config.rootfs); + } + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + eprintln!("virtiofsd stopped"); + + // Clean up sockets and wrapper + let _ = std::fs::remove_dir_all(&sock_dir); + let _ = std::fs::remove_file(&exec_socket); + if is_exec_mode { + let _ = std::fs::remove_file(&wrapper_path); + } + + let code = status.code().unwrap_or(1); + eprintln!("VM exited with code {code}"); + Ok(code) +} + +/// Escape a string for use in a shell script. Wraps in single quotes. 
+fn shell_escape(s: &str) -> String {
+    if s.is_empty() {
+        return "''".to_string();
+    }
+    if !s.contains('\'') && !s.contains(' ') && !s.contains('"') && !s.contains('\\') {
+        return s.to_string();
+    }
+    format!("'{}'", s.replace('\'', "'\\''"))
+}
+
+// ── Vsock exec bridge ───────────────────────────────────────────────────
+
+/// Guest CID assigned in the cloud-hypervisor vsock config.
+const VSOCK_GUEST_CID: u32 = 3;
+
+// ── CHV TAP networking constants ────────────────────────────────────────
+// cloud-hypervisor defaults to 192.168.249.1/24 on the host side of the
+// TAP device. The guest uses .2 with the host as its gateway.
+
+const CHV_TAP_HOST_IP: &str = "192.168.249.1";
+const CHV_TAP_GUEST_IP: &str = "192.168.249.2";
+const CHV_TAP_SUBNET: &str = "192.168.249.0/24";
+const CHV_TAP_NETMASK: &str = "255.255.255.0";
+
+/// Start a background bridge: exec Unix socket → CHV vsock Unix socket.
+///
+/// cloud-hypervisor exposes guest vsock via a host-side Unix socket with a
+/// text protocol: connect to the socket, send `CONNECT <port>\n`, read
+/// back `OK <port>\n`, then the stream is a raw bidirectional channel to
+/// the guest vsock port. This is different from kernel `AF_VSOCK` (which
+/// `vhost-vsock` uses) — CHV manages its own transport.
+///
+/// This bridge creates a Unix socket at `exec_socket` and, for each
+/// incoming connection, opens a connection to the CHV vsock socket,
+/// performs the CONNECT handshake, and forwards data bidirectionally.
+fn start_vsock_exec_bridge( + exec_socket: &Path, + chv_vsock_socket: &Path, + guest_port: u32, +) -> Result<(), VmError> { + use std::os::unix::net::UnixListener; + + if let Some(parent) = exec_socket.parent() { + std::fs::create_dir_all(parent).map_err(|e| { + VmError::HostSetup(format!("create exec bridge dir {}: {e}", parent.display())) + })?; + } + let _ = std::fs::remove_file(exec_socket); + + let listener = UnixListener::bind(exec_socket).map_err(|e| { + VmError::HostSetup(format!( + "bind vsock exec bridge {}: {e}", + exec_socket.display() + )) + })?; + + let chv_vsock = chv_vsock_socket.to_path_buf(); + eprintln!( + "vsock exec bridge: {} → {} port {}", + exec_socket.display(), + chv_vsock.display(), + guest_port, + ); + + std::thread::spawn(move || { + vsock_bridge_accept_loop(listener, &chv_vsock, guest_port); + }); + + Ok(()) +} + +/// Accept loop for the vsock bridge background thread. +/// +/// "CONNECT rejected" (empty response) is normal during boot — the guest +/// exec agent isn't listening yet. We keep retrying those indefinitely +/// since the bootstrap caller has its own 120s timeout. Only fatal errors +/// (socket gone = VM died) cause the bridge to give up. 
+fn vsock_bridge_accept_loop( + listener: std::os::unix::net::UnixListener, + chv_vsock_socket: &Path, + port: u32, +) { + let mut fatal_failures: u32 = 0; + let mut logged_transient = false; + + for stream in listener.incoming() { + let client = match stream { + Ok(s) => s, + Err(e) => { + eprintln!("vsock bridge: accept: {e}"); + continue; + } + }; + + match chv_vsock_connect(chv_vsock_socket, port) { + Ok(guest) => { + fatal_failures = 0; + bridge_bidirectional(client, guest); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + fatal_failures += 1; + if fatal_failures <= 2 { + eprintln!("vsock bridge: CHV socket gone (VM exited?): {e}"); + } + if fatal_failures >= 3 { + eprintln!("vsock bridge: CHV socket not found, stopping bridge"); + return; + } + } + Err(e) => { + if !logged_transient { + eprintln!( + "vsock bridge: guest not ready on port {port} ({e}), \ + will keep retrying..." + ); + logged_transient = true; + } + } + } + } +} + +/// Connect to a guest vsock port via cloud-hypervisor's Unix socket protocol. +/// +/// CHV exposes guest vsock through a host Unix socket. The protocol is: +/// 1. Connect to the CHV vsock Unix socket +/// 2. Send: `CONNECT \n` +/// 3. Read: `OK \n` on success +/// 4. 
The stream is now a raw bidirectional channel to the guest port
+fn chv_vsock_connect(chv_vsock_socket: &Path, port: u32) -> std::io::Result<UnixStream> {
+    let mut stream = UnixStream::connect(chv_vsock_socket)?;
+    stream.set_read_timeout(Some(Duration::from_secs(5)))?;
+    stream.set_write_timeout(Some(Duration::from_secs(5)))?;
+
+    let connect_msg = format!("CONNECT {port}\n");
+    stream.write_all(connect_msg.as_bytes())?;
+
+    let mut buf = [0u8; 64];
+    let n = stream.read(&mut buf)?;
+    let response = std::str::from_utf8(&buf[..n]).unwrap_or("");
+
+    if !response.starts_with("OK") {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::ConnectionRefused,
+            format!("CHV vsock CONNECT rejected: {}", response.trim()),
+        ));
+    }
+
+    stream.set_read_timeout(None)?;
+    stream.set_write_timeout(None)?;
+    Ok(stream)
+}
+
+/// Spawn two threads that copy data between two Unix streams.
+fn bridge_bidirectional(client: UnixStream, guest: UnixStream) {
+    let Ok(mut client_r) = client.try_clone() else {
+        return;
+    };
+    let mut client_w = client;
+    let Ok(mut guest_r) = guest.try_clone() else {
+        return;
+    };
+    let mut guest_w = guest;
+
+    std::thread::spawn(move || {
+        let _ = std::io::copy(&mut client_r, &mut guest_w);
+    });
+    std::thread::spawn(move || {
+        let _ = std::io::copy(&mut guest_r, &mut client_w);
+    });
+}
+
+// ── CHV host networking ─────────────────────────────────────────────────
+
+/// Parse a DNS server from resolv.conf content.
+///
+/// Returns the first non-`127.x.x.x` nameserver, or `8.8.8.8` if none found.
+/// Extracted from [`host_dns_server`] for testability.
+fn parse_dns_server(content: &str) -> String {
+    content
+        .lines()
+        .filter(|line| line.starts_with("nameserver"))
+        .filter_map(|line| line.split_whitespace().nth(1))
+        .find(|ip| !ip.starts_with("127."))
+        .map(String::from)
+        .unwrap_or_else(|| "8.8.8.8".to_string())
+}
+
+/// Read the host's primary DNS server.
+///
+/// Checks `/etc/resolv.conf` first. 
If every nameserver there is a loopback +/// address (e.g. systemd-resolved's `127.0.0.53`), falls back to the +/// upstream resolv.conf at `/run/systemd/resolve/resolv.conf` which +/// contains the real upstream nameservers. Final fallback is `8.8.8.8`. +fn host_dns_server() -> String { + for path in &["/etc/resolv.conf", "/run/systemd/resolve/resolv.conf"] { + if let Ok(content) = std::fs::read_to_string(path) { + let server = parse_dns_server(&content); + if server != "8.8.8.8" { + return server; + } + } + } + "8.8.8.8".to_string() +} + +/// Run a command, returning an error if it fails. +fn run_cmd(cmd: &str, args: &[&str]) -> Result<(), VmError> { + let output = std::process::Command::new(cmd) + .args(args) + .output() + .map_err(|e| VmError::HostSetup(format!("{cmd}: {e}")))?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(VmError::HostSetup(format!( + "{cmd} {}: {}", + args.join(" "), + stderr.trim() + ))); + } + + Ok(()) +} + +/// Set up host-side networking so the CHV guest can reach the internet. +/// +/// 1. Enable IP forwarding (saving the original value for teardown) +/// 2. MASQUERADE outbound traffic from the VM subnet +/// 3. Allow forwarding to/from the VM subnet +/// +/// Returns the original value of `ip_forward` so the caller can restore it. 
+fn setup_chv_host_networking() -> Result<String, VmError> {
+    let original_ip_forward = std::fs::read_to_string("/proc/sys/net/ipv4/ip_forward")
+        .map(|s| s.trim().to_string())
+        .unwrap_or_else(|_| "0".to_string());
+
+    std::fs::write("/proc/sys/net/ipv4/ip_forward", "1")
+        .map_err(|e| VmError::HostSetup(format!("enable IP forwarding: {e}")))?;
+
+    run_cmd(
+        "iptables",
+        &[
+            "-t",
+            "nat",
+            "-A",
+            "POSTROUTING",
+            "-s",
+            CHV_TAP_SUBNET,
+            "!",
+            "-d",
+            CHV_TAP_SUBNET,
+            "-j",
+            "MASQUERADE",
+        ],
+    )?;
+
+    run_cmd(
+        "iptables",
+        &["-A", "FORWARD", "-s", CHV_TAP_SUBNET, "-j", "ACCEPT"],
+    )?;
+
+    run_cmd(
+        "iptables",
+        &["-A", "FORWARD", "-d", CHV_TAP_SUBNET, "-j", "ACCEPT"],
+    )?;
+
+    eprintln!("host networking: IP forwarding + NAT masquerade for {CHV_TAP_SUBNET}");
+    Ok(original_ip_forward)
+}
+
+/// Remove the iptables rules added by [`setup_chv_host_networking`] and
+/// restore the original `ip_forward` sysctl value.
+fn teardown_chv_host_networking(original_ip_forward: &str) {
+    let _ = run_cmd(
+        "iptables",
+        &[
+            "-t",
+            "nat",
+            "-D",
+            "POSTROUTING",
+            "-s",
+            CHV_TAP_SUBNET,
+            "!",
+            "-d",
+            CHV_TAP_SUBNET,
+            "-j",
+            "MASQUERADE",
+        ],
+    );
+    let _ = run_cmd(
+        "iptables",
+        &["-D", "FORWARD", "-s", CHV_TAP_SUBNET, "-j", "ACCEPT"],
+    );
+    let _ = run_cmd(
+        "iptables",
+        &["-D", "FORWARD", "-d", CHV_TAP_SUBNET, "-j", "ACCEPT"],
+    );
+    if original_ip_forward != "1" {
+        let _ = std::fs::write("/proc/sys/net/ipv4/ip_forward", original_ip_forward);
+    }
+    eprintln!("host networking: cleaned up iptables rules, restored ip_forward={original_ip_forward}");
+}
+
+/// Start a background TCP proxy that forwards `127.0.0.1:{host_port}`
+/// to `{guest_ip}:{guest_port}`.
+///
+/// Each accepted connection spawns two threads for bidirectional copy.
+/// The listener thread runs until the process exits.
+fn start_tcp_port_forwarder( + host_port: u16, + guest_ip: &str, + guest_port: u16, +) -> Result<(), VmError> { + use std::net::{TcpListener, TcpStream}; + + let listener = TcpListener::bind(("127.0.0.1", host_port)) + .map_err(|e| VmError::HostSetup(format!("bind port forwarder on :{host_port}: {e}")))?; + + let guest_addr = format!("{guest_ip}:{guest_port}"); + eprintln!("port forwarder: 127.0.0.1:{host_port} -> {guest_addr}"); + + std::thread::spawn(move || { + for stream in listener.incoming() { + let client = match stream { + Ok(s) => s, + Err(_) => continue, + }; + + let addr = guest_addr.clone(); + std::thread::spawn(move || { + if let Ok(remote) = TcpStream::connect(&addr) { + forward_tcp_bidirectional(client, remote); + } + }); + } + }); + + Ok(()) +} + +/// Copy data bidirectionally between two TCP streams until either side closes. +fn forward_tcp_bidirectional(client: std::net::TcpStream, remote: std::net::TcpStream) { + let Ok(mut client_r) = client.try_clone() else { + return; + }; + let mut client_w = client; + let Ok(mut remote_r) = remote.try_clone() else { + return; + }; + let mut remote_w = remote; + + std::thread::spawn(move || { + let _ = std::io::copy(&mut client_r, &mut remote_w); + }); + std::thread::spawn(move || { + let _ = std::io::copy(&mut remote_r, &mut client_w); + }); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn http_request_format_with_body() { + let payload = r#"{"cpus":{"boot_vcpus":4}}"#; + let request = format!( + "PUT /api/v1/vm.create HTTP/1.1\r\n\ + Host: localhost\r\n\ + Content-Type: application/json\r\n\ + Content-Length: {}\r\n\ + Connection: close\r\n\ + \r\n\ + {payload}", + payload.len(), + ); + assert!(request.contains("Content-Length: 25")); + assert!(request.contains("boot_vcpus")); + } + + #[test] + fn http_request_format_without_body() { + let request = format!( + "GET /api/v1/vm.info HTTP/1.1\r\n\ + Host: localhost\r\n\ + Connection: close\r\n\ + \r\n" + ); + assert!(request.contains("GET 
/api/v1/vm.info")); + assert!(!request.contains("Content-Length")); + } + + #[test] + fn build_payload_includes_vfio_device() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 4, + mem_mib: 8192, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::None, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: true, + vfio_device: Some("0000:41:00.0".into()), + backend: crate::VmBackendChoice::CloudHypervisor, + }; + + let backend = CloudHypervisorBackend { + chv_binary: "/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + config.vfio_device.as_deref(), + Path::new("/tmp/virtiofsd.sock"), + None, + false, + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + assert!( + payload.contains("0000:41:00.0"), + "payload should contain VFIO device" + ); + assert!( + payload.contains("boot_vcpus"), + "payload should contain vcpus config" + ); + assert!( + payload.contains("GPU_ENABLED=true"), + "payload should contain GPU_ENABLED in cmdline" + ); + } + + #[test] + fn build_payload_without_vfio() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 2, + mem_mib: 4096, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::None, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: false, + vfio_device: None, + backend: crate::VmBackendChoice::Auto, + }; + + let backend = CloudHypervisorBackend { + chv_binary: 
"/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + false, + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + assert!( + !payload.contains("devices"), + "payload without VFIO should not have devices key" + ); + assert!( + !payload.contains("GPU_ENABLED"), + "payload should not contain GPU_ENABLED" + ); + } + + #[test] + fn build_payload_with_tap_net_includes_ip_and_cmdline() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 4, + mem_mib: 8192, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec!["30051:30051".into()], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::Gvproxy { + binary: "/usr/bin/gvproxy".into(), + }, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: true, + vfio_device: Some("0000:41:00.0".into()), + backend: crate::VmBackendChoice::CloudHypervisor, + }; + + let backend = CloudHypervisorBackend { + chv_binary: "/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + config.vfio_device.as_deref(), + Path::new("/tmp/virtiofsd.sock"), + None, + true, // use_tap_net + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + assert!( + payload.contains("192.168.249.1"), + "net should contain TAP host IP" + ); + assert!( + payload.contains("255.255.255.0"), + "net should contain TAP netmask" + ); + assert!( + payload.contains("VM_NET_IP=192.168.249.2"), + "cmdline should contain guest IP" + ); + assert!( + payload.contains("VM_NET_GW=192.168.249.1"), + 
"cmdline should contain gateway IP" + ); + assert!( + payload.contains("VM_NET_DNS="), + "cmdline should contain DNS server" + ); + } + + #[test] + fn build_payload_tap_net_false_omits_net_and_vm_net_vars() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 2, + mem_mib: 4096, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::None, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: false, + vfio_device: None, + backend: crate::VmBackendChoice::Auto, + }; + + let backend = CloudHypervisorBackend { + chv_binary: "/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + false, + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + assert!( + !payload.contains("\"net\""), + "no-tap payload should not contain net section" + ); + assert!( + !payload.contains("VM_NET_IP"), + "no-tap payload should not contain VM_NET_IP" + ); + assert!( + !payload.contains("VM_NET_GW"), + "no-tap payload should not contain VM_NET_GW" + ); + assert!( + !payload.contains("VM_NET_DNS"), + "no-tap payload should not contain VM_NET_DNS" + ); + } + + #[test] + fn build_payload_tap_net_has_correct_mac_ip_mask() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 2, + mem_mib: 4096, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::Gvproxy { + binary: "/usr/bin/gvproxy".into(), + }, + reset: false, + gateway_name: 
"test".into(), + state_disk: None, + gpu_enabled: false, + vfio_device: None, + backend: crate::VmBackendChoice::CloudHypervisor, + }; + + let backend = CloudHypervisorBackend { + chv_binary: "/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + true, + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + let json: serde_json::Value = serde_json::from_str(&payload).unwrap(); + let net = &json["net"][0]; + assert_eq!(net["mac"], "5a:94:ef:e4:0c:ee"); + assert_eq!(net["ip"], "192.168.249.1"); + assert_eq!(net["mask"], "255.255.255.0"); + } + + #[test] + fn build_payload_vfio_and_tap_net_coexist() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 4, + mem_mib: 8192, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::Gvproxy { + binary: "/usr/bin/gvproxy".into(), + }, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: true, + vfio_device: Some("0000:41:00.0".into()), + backend: crate::VmBackendChoice::CloudHypervisor, + }; + + let backend = CloudHypervisorBackend { + chv_binary: "/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + config.vfio_device.as_deref(), + Path::new("/tmp/virtiofsd.sock"), + None, + true, + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + let json: serde_json::Value = serde_json::from_str(&payload).unwrap(); + assert!( + json["devices"].is_array(), + "devices section should exist for VFIO" 
+ ); + assert!(json["net"].is_array(), "net section should exist for TAP"); + assert!( + json["devices"][0]["path"] + .as_str() + .unwrap() + .contains("0000:41:00.0"), + "VFIO device path should be present" + ); + assert_eq!(json["net"][0]["ip"], "192.168.249.1"); + } + + // ── parse_dns_server tests ────────────────────────────────────────── + + #[test] + fn parse_dns_server_returns_first_non_loopback() { + let content = "nameserver 10.0.0.1\nnameserver 8.8.8.8\n"; + assert_eq!(parse_dns_server(content), "10.0.0.1"); + } + + #[test] + fn parse_dns_server_skips_systemd_resolved() { + let content = "nameserver 127.0.0.53\nnameserver 1.1.1.1\n"; + assert_eq!(parse_dns_server(content), "1.1.1.1"); + } + + #[test] + fn parse_dns_server_skips_all_loopback_variants() { + let content = "nameserver 127.0.0.1\nnameserver 127.0.0.53\nnameserver 172.16.0.1\n"; + assert_eq!(parse_dns_server(content), "172.16.0.1"); + } + + #[test] + fn parse_dns_server_falls_back_when_only_loopback() { + let content = "nameserver 127.0.0.1\nnameserver 127.0.0.53\n"; + assert_eq!(parse_dns_server(content), "8.8.8.8"); + } + + #[test] + fn parse_dns_server_handles_empty_content() { + assert_eq!(parse_dns_server(""), "8.8.8.8"); + } + + #[test] + fn parse_dns_server_ignores_comments_and_other_lines() { + let content = "# Generated by NetworkManager\nsearch example.com\nnameserver 10.1.2.3\n"; + assert_eq!(parse_dns_server(content), "10.1.2.3"); + } + + // ── shell_escape tests ────────────────────────────────────────────── + + #[test] + fn shell_escape_empty_string() { + assert_eq!(shell_escape(""), "''"); + } + + #[test] + fn shell_escape_simple_string() { + assert_eq!(shell_escape("hello"), "hello"); + } + + #[test] + fn shell_escape_string_with_single_quotes() { + assert_eq!(shell_escape("it's"), "'it'\\''s'"); + } + + #[test] + fn shell_escape_string_with_spaces() { + assert_eq!(shell_escape("hello world"), "'hello world'"); + } + + #[test] + fn shell_escape_string_with_double_quotes() { + 
assert_eq!(shell_escape(r#"say "hi""#), r#"'say "hi"'"#); + } + + #[test] + fn shell_escape_string_with_backslash() { + assert_eq!(shell_escape("path\\to"), "'path\\to'"); + } +} diff --git a/crates/openshell-vm/src/backend/libkrun.rs b/crates/openshell-vm/src/backend/libkrun.rs new file mode 100644 index 000000000..1f077563a --- /dev/null +++ b/crates/openshell-vm/src/backend/libkrun.rs @@ -0,0 +1,469 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! libkrun hypervisor backend. +//! +//! Implements [`VmBackend`] using the libkrun C API for lightweight microVMs. +//! This is the original backend — on macOS it uses Hypervisor.framework, +//! on Linux it uses KVM. + +use std::ffi::CString; +use std::path::Path; +use std::time::Instant; + +use super::{VmBackend, setup_gvproxy_port_forwarding, start_gvproxy}; +use crate::exec::{clear_vm_runtime_state, write_vm_runtime_state}; +use crate::{ + GvproxyGuard, NetBackend, StateDiskConfig, VmConfig, VmError, VsockPort, bootstrap_gateway, + c_string_array, check, ffi, gateway_host_port, health, path_to_cstring, vm_rootfs_key, +}; + +/// libkrun hypervisor backend. +pub struct LibkrunBackend; + +impl VmBackend for LibkrunBackend { + fn launch(&self, config: &VmConfig) -> Result { + launch_libkrun(config) + } +} + +/// VM context wrapping the libkrun FFI context ID. 
+struct VmContext { + krun: &'static ffi::LibKrun, + ctx_id: u32, +} + +impl VmContext { + fn create(log_level: u32) -> Result { + let krun = ffi::libkrun()?; + unsafe { + check( + (krun.krun_init_log)( + ffi::KRUN_LOG_TARGET_DEFAULT, + crate::clamp_log_level(log_level), + ffi::KRUN_LOG_STYLE_AUTO, + ffi::KRUN_LOG_OPTION_NO_ENV, + ), + "krun_init_log", + )?; + } + + let ctx_id = unsafe { (krun.krun_create_ctx)() }; + if ctx_id < 0 { + return Err(VmError::Krun { + func: "krun_create_ctx", + code: ctx_id, + }); + } + + Ok(Self { + krun, + ctx_id: ctx_id as u32, + }) + } + + fn set_vm_config(&self, vcpus: u8, mem_mib: u32) -> Result<(), VmError> { + unsafe { + check( + (self.krun.krun_set_vm_config)(self.ctx_id, vcpus, mem_mib), + "krun_set_vm_config", + ) + } + } + + fn set_root(&self, rootfs: &Path) -> Result<(), VmError> { + let rootfs_c = path_to_cstring(rootfs)?; + unsafe { + check( + (self.krun.krun_set_root)(self.ctx_id, rootfs_c.as_ptr()), + "krun_set_root", + ) + } + } + + fn add_state_disk(&self, state_disk: &StateDiskConfig) -> Result<(), VmError> { + let Some(add_disk3) = self.krun.krun_add_disk3 else { + return Err(VmError::HostSetup( + "libkrun runtime does not expose krun_add_disk3; rebuild the VM runtime with block support" + .to_string(), + )); + }; + + let block_id_c = CString::new(state_disk.block_id.as_str())?; + let disk_path_c = path_to_cstring(&state_disk.path)?; + unsafe { + check( + add_disk3( + self.ctx_id, + block_id_c.as_ptr(), + disk_path_c.as_ptr(), + ffi::KRUN_DISK_FORMAT_RAW, + false, + false, + crate::state_disk_sync_mode(), + ), + "krun_add_disk3", + ) + } + } + + fn set_workdir(&self, workdir: &str) -> Result<(), VmError> { + let workdir_c = CString::new(workdir)?; + unsafe { + check( + (self.krun.krun_set_workdir)(self.ctx_id, workdir_c.as_ptr()), + "krun_set_workdir", + ) + } + } + + fn disable_implicit_vsock(&self) -> Result<(), VmError> { + unsafe { + check( + (self.krun.krun_disable_implicit_vsock)(self.ctx_id), + 
"krun_disable_implicit_vsock", + ) + } + } + + fn add_vsock(&self, tsi_features: u32) -> Result<(), VmError> { + unsafe { + check( + (self.krun.krun_add_vsock)(self.ctx_id, tsi_features), + "krun_add_vsock", + ) + } + } + + #[cfg(target_os = "macos")] + fn add_net_unixgram( + &self, + socket_path: &Path, + mac: &[u8; 6], + features: u32, + flags: u32, + ) -> Result<(), VmError> { + let sock_c = path_to_cstring(socket_path)?; + unsafe { + check( + (self.krun.krun_add_net_unixgram)( + self.ctx_id, + sock_c.as_ptr(), + -1, + mac.as_ptr(), + features, + flags, + ), + "krun_add_net_unixgram", + ) + } + } + + #[allow(dead_code)] + fn add_net_unixstream( + &self, + socket_path: &Path, + mac: &[u8; 6], + features: u32, + ) -> Result<(), VmError> { + let sock_c = path_to_cstring(socket_path)?; + unsafe { + check( + (self.krun.krun_add_net_unixstream)( + self.ctx_id, + sock_c.as_ptr(), + -1, + mac.as_ptr(), + features, + 0, + ), + "krun_add_net_unixstream", + ) + } + } + + fn set_port_map(&self, port_map: &[String]) -> Result<(), VmError> { + let port_strs: Vec<&str> = port_map.iter().map(String::as_str).collect(); + let (_port_owners, port_ptrs) = c_string_array(&port_strs)?; + unsafe { + check( + (self.krun.krun_set_port_map)(self.ctx_id, port_ptrs.as_ptr()), + "krun_set_port_map", + ) + } + } + + fn add_vsock_port(&self, port: &VsockPort) -> Result<(), VmError> { + let socket_c = path_to_cstring(&port.socket_path)?; + unsafe { + check( + (self.krun.krun_add_vsock_port2)( + self.ctx_id, + port.port, + socket_c.as_ptr(), + port.listen, + ), + "krun_add_vsock_port2", + ) + } + } + + fn set_console_output(&self, path: &Path) -> Result<(), VmError> { + let console_c = path_to_cstring(path)?; + unsafe { + check( + (self.krun.krun_set_console_output)(self.ctx_id, console_c.as_ptr()), + "krun_set_console_output", + ) + } + } + + fn set_exec(&self, exec_path: &str, args: &[String], env: &[String]) -> Result<(), VmError> { + let exec_c = CString::new(exec_path)?; + let argv_strs: 
Vec<&str> = args.iter().map(String::as_str).collect(); + let (_argv_owners, argv_ptrs) = c_string_array(&argv_strs)?; + let env_strs: Vec<&str> = env.iter().map(String::as_str).collect(); + let (_env_owners, env_ptrs) = c_string_array(&env_strs)?; + + unsafe { + check( + (self.krun.krun_set_exec)( + self.ctx_id, + exec_c.as_ptr(), + argv_ptrs.as_ptr(), + env_ptrs.as_ptr(), + ), + "krun_set_exec", + ) + } + } + + fn start_enter(&self) -> i32 { + unsafe { (self.krun.krun_start_enter)(self.ctx_id) } + } +} + +impl Drop for VmContext { + fn drop(&mut self) { + unsafe { + let ret = (self.krun.krun_free_ctx)(self.ctx_id); + if ret < 0 { + eprintln!( + "warning: krun_free_ctx({}) failed with code {ret}", + self.ctx_id + ); + } + } + } +} + +/// Launch a VM using the libkrun backend. +/// +/// This contains the VM-specific configuration, networking, fork/exec, +/// signal forwarding, bootstrap, and cleanup logic that was previously +/// inline in `lib.rs::launch()`. +#[allow(clippy::similar_names)] +fn launch_libkrun(config: &VmConfig) -> Result { + let launch_start = Instant::now(); + + let vm = VmContext::create(config.log_level)?; + vm.set_vm_config(config.vcpus, config.mem_mib)?; + vm.set_root(&config.rootfs)?; + if let Some(state_disk) = &config.state_disk { + vm.add_state_disk(state_disk)?; + } + vm.set_workdir(&config.workdir)?; + + let mut gvproxy_guard: Option = None; + let mut gvproxy_api_sock: Option = None; + + match &config.net { + NetBackend::Tsi => {} + NetBackend::None => { + vm.disable_implicit_vsock()?; + vm.add_vsock(0)?; + eprintln!("Networking: disabled (no TSI, no virtio-net)"); + } + NetBackend::Gvproxy { .. 
} => { + let gvproxy_setup = start_gvproxy(config, launch_start)?; + + vm.disable_implicit_vsock()?; + vm.add_vsock(0)?; + let mac: [u8; 6] = [0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee]; + + const NET_FEATURE_CSUM: u32 = 1 << 0; + const NET_FEATURE_GUEST_CSUM: u32 = 1 << 1; + const NET_FEATURE_GUEST_TSO4: u32 = 1 << 7; + const NET_FEATURE_GUEST_UFO: u32 = 1 << 10; + const NET_FEATURE_HOST_TSO4: u32 = 1 << 11; + const NET_FEATURE_HOST_UFO: u32 = 1 << 14; + const COMPAT_NET_FEATURES: u32 = NET_FEATURE_CSUM + | NET_FEATURE_GUEST_CSUM + | NET_FEATURE_GUEST_TSO4 + | NET_FEATURE_GUEST_UFO + | NET_FEATURE_HOST_TSO4 + | NET_FEATURE_HOST_UFO; + + #[cfg(target_os = "linux")] + vm.add_net_unixstream(&gvproxy_setup.net_sock, &mac, COMPAT_NET_FEATURES)?; + #[cfg(target_os = "macos")] + { + const NET_FLAG_VFKIT: u32 = 1 << 0; + vm.add_net_unixgram( + &gvproxy_setup.net_sock, + &mac, + COMPAT_NET_FEATURES, + NET_FLAG_VFKIT, + )?; + } + + eprintln!( + "Networking: gvproxy (virtio-net) [{:.1}s]", + launch_start.elapsed().as_secs_f64() + ); + gvproxy_api_sock = Some(gvproxy_setup.api_sock); + gvproxy_guard = Some(gvproxy_setup.guard); + } + } + + if !config.port_map.is_empty() && matches!(config.net, NetBackend::Tsi) { + vm.set_port_map(&config.port_map)?; + } + + for vsock_port in &config.vsock_ports { + if let Some(parent) = vsock_port.socket_path.parent() { + std::fs::create_dir_all(parent).map_err(|e| { + VmError::RuntimeState(format!("create vsock socket dir {}: {e}", parent.display())) + })?; + } + let _ = std::fs::remove_file(&vsock_port.socket_path); + vm.add_vsock_port(vsock_port)?; + } + + let console_log = config.console_output.clone().unwrap_or_else(|| { + config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .join(format!("{}-console.log", vm_rootfs_key(&config.rootfs))) + }); + vm.set_console_output(&console_log)?; + + let mut env: Vec = if config.env.is_empty() { + vec![ + "HOME=/root", + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + 
"TERM=xterm", + ] + .into_iter() + .map(ToOwned::to_owned) + .collect() + } else { + config.env.clone() + }; + if let Some(state_disk) = &config.state_disk + && !env + .iter() + .any(|entry| entry.starts_with("OPENSHELL_VM_STATE_DISK_DEVICE=")) + { + env.push(format!( + "OPENSHELL_VM_STATE_DISK_DEVICE={}", + state_disk.guest_device + )); + } + if config.gpu_enabled { + env.push("GPU_ENABLED=true".to_string()); + } + vm.set_exec(&config.exec_path, &config.args, &env)?; + + // Fork and enter the VM + let boot_start = Instant::now(); + eprintln!("Booting microVM..."); + + let pid = unsafe { libc::fork() }; + match pid { + -1 => Err(VmError::Fork(std::io::Error::last_os_error().to_string())), + 0 => { + let ret = vm.start_enter(); + eprintln!("krun_start_enter failed: {ret}"); + std::process::exit(1); + } + _ => { + if config.exec_path == "/srv/openshell-vm-init.sh" { + let gvproxy_pid = gvproxy_guard.as_ref().and_then(GvproxyGuard::id); + if let Err(err) = + write_vm_runtime_state(&config.rootfs, pid, &console_log, gvproxy_pid, false) + { + unsafe { + libc::kill(pid, libc::SIGTERM); + } + drop(gvproxy_guard); + clear_vm_runtime_state(&config.rootfs); + return Err(err); + } + } + eprintln!( + "VM started (child pid {pid}) [{:.1}s]", + boot_start.elapsed().as_secs_f64() + ); + for pm in &config.port_map { + let host_port = pm.split(':').next().unwrap_or(pm); + eprintln!(" port {pm} -> http://localhost:{host_port}"); + } + eprintln!("Console output: {}", console_log.display()); + + if let Some(ref api_sock) = gvproxy_api_sock { + setup_gvproxy_port_forwarding(api_sock, &config.port_map)?; + } + + if config.exec_path == "/srv/openshell-vm-init.sh" && !config.port_map.is_empty() { + let gateway_port = gateway_host_port(config); + bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port)?; + health::wait_for_gateway_ready(gateway_port, &config.gateway_name)?; + } + + eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); + eprintln!("Press Ctrl+C 
to stop."); + + unsafe { + libc::signal( + libc::SIGINT, + crate::forward_signal as *const () as libc::sighandler_t, + ); + libc::signal( + libc::SIGTERM, + crate::forward_signal as *const () as libc::sighandler_t, + ); + crate::CHILD_PID.store(pid, std::sync::atomic::Ordering::Relaxed); + } + + let mut status: libc::c_int = 0; + unsafe { + libc::waitpid(pid, &raw mut status, 0); + } + + if config.exec_path == "/srv/openshell-vm-init.sh" { + clear_vm_runtime_state(&config.rootfs); + } + if let Some(mut guard) = gvproxy_guard + && let Some(mut child) = guard.disarm() + { + let _ = child.kill(); + let _ = child.wait(); + eprintln!("gvproxy stopped"); + } + + if libc::WIFEXITED(status) { + let code = libc::WEXITSTATUS(status); + eprintln!("VM exited with code {code}"); + return Ok(code); + } else if libc::WIFSIGNALED(status) { + let sig = libc::WTERMSIG(status); + eprintln!("VM killed by signal {sig}"); + return Ok(128 + sig); + } + + Ok(status) + } + } +} diff --git a/crates/openshell-vm/src/backend/mod.rs b/crates/openshell-vm/src/backend/mod.rs new file mode 100644 index 000000000..9c2167fc5 --- /dev/null +++ b/crates/openshell-vm/src/backend/mod.rs @@ -0,0 +1,208 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! VM backend abstraction layer. +//! +//! Defines the [`VmBackend`] trait that all hypervisor backends implement, +//! and shared infrastructure (gvproxy startup, networking helpers) used by +//! both the libkrun and cloud-hypervisor backends. + +pub mod cloud_hypervisor; +pub mod libkrun; + +use std::path::{Path, PathBuf}; +use std::time::Instant; + +use crate::{ + GvproxyGuard, NetBackend, VmConfig, VmError, gvproxy_expose, gvproxy_socket_dir, + kill_stale_gvproxy, kill_stale_gvproxy_by_port, pick_gvproxy_ssh_port, vm_rootfs_key, +}; + +/// Trait implemented by each hypervisor backend (libkrun, cloud-hypervisor). 
+pub trait VmBackend { + /// Launch a VM with the given configuration. + /// + /// Returns the VM exit code. + fn launch(&self, config: &VmConfig) -> Result; +} + +/// Result of starting a gvproxy instance, used by both backends. +pub(crate) struct GvproxySetup { + pub(crate) guard: GvproxyGuard, + pub(crate) api_sock: PathBuf, + pub(crate) net_sock: PathBuf, +} + +/// Start gvproxy for the given configuration. +/// +/// Shared between libkrun and cloud-hypervisor backends. Handles stale +/// process cleanup, socket setup, and process spawning with exponential +/// backoff waiting for the network socket. +pub(crate) fn start_gvproxy( + config: &VmConfig, + launch_start: Instant, +) -> Result { + let binary = match &config.net { + NetBackend::Gvproxy { binary } => binary, + _ => { + return Err(VmError::HostSetup( + "start_gvproxy called without Gvproxy net backend".into(), + )); + } + }; + + if !binary.exists() { + return Err(VmError::BinaryNotFound { + path: binary.display().to_string(), + hint: "Install Podman Desktop or place gvproxy in PATH".to_string(), + }); + } + + let run_dir = config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .to_path_buf(); + let rootfs_key = vm_rootfs_key(&config.rootfs); + let sock_base = gvproxy_socket_dir(&config.rootfs)?; + let net_sock = sock_base.with_extension("v"); + let api_sock = sock_base.with_extension("a"); + + kill_stale_gvproxy(&config.rootfs); + for pm in &config.port_map { + if let Some(host_port) = pm.split(':').next().and_then(|p| p.parse::().ok()) { + kill_stale_gvproxy_by_port(host_port); + } + } + + let _ = std::fs::remove_file(&net_sock); + let _ = std::fs::remove_file(&api_sock); + let krun_sock = sock_base.with_extension("v-krun.sock"); + let _ = std::fs::remove_file(&krun_sock); + + eprintln!("Starting gvproxy: {}", binary.display()); + let ssh_port = pick_gvproxy_ssh_port()?; + let gvproxy_log = run_dir.join(format!("{rootfs_key}-gvproxy.log")); + let gvproxy_log_file = 
std::fs::File::create(&gvproxy_log) + .map_err(|e| VmError::Fork(format!("failed to create gvproxy log: {e}")))?; + + #[cfg(target_os = "linux")] + let (gvproxy_net_flag, gvproxy_net_url) = + ("-listen-qemu", format!("unix://{}", net_sock.display())); + #[cfg(target_os = "macos")] + let (gvproxy_net_flag, gvproxy_net_url) = ( + "-listen-vfkit", + format!("unixgram://{}", net_sock.display()), + ); + + let child = std::process::Command::new(binary) + .arg(gvproxy_net_flag) + .arg(&gvproxy_net_url) + .arg("-listen") + .arg(format!("unix://{}", api_sock.display())) + .arg("-ssh-port") + .arg(ssh_port.to_string()) + .stdout(std::process::Stdio::null()) + .stderr(gvproxy_log_file) + .spawn() + .map_err(|e| VmError::Fork(format!("failed to start gvproxy: {e}")))?; + + eprintln!( + "gvproxy started (pid {}, ssh port {}) [{:.1}s]", + child.id(), + ssh_port, + launch_start.elapsed().as_secs_f64() + ); + + { + let deadline = Instant::now() + std::time::Duration::from_secs(5); + let mut interval = std::time::Duration::from_millis(5); + while !net_sock.exists() { + if Instant::now() >= deadline { + return Err(VmError::Fork( + "gvproxy socket did not appear within 5s".to_string(), + )); + } + std::thread::sleep(interval); + interval = (interval * 2).min(std::time::Duration::from_millis(100)); + } + } + + Ok(GvproxySetup { + guard: GvproxyGuard::new(child), + api_sock, + net_sock, + }) +} + +/// Set up port forwarding via the gvproxy HTTP API. +/// +/// Translates `host:guest` port map entries into gvproxy expose calls. 
+pub(crate) fn setup_gvproxy_port_forwarding( + api_sock: &Path, + port_map: &[String], +) -> Result<(), VmError> { + let fwd_start = Instant::now(); + { + let deadline = Instant::now() + std::time::Duration::from_secs(2); + let mut interval = std::time::Duration::from_millis(5); + while !api_sock.exists() { + if Instant::now() >= deadline { + eprintln!("warning: gvproxy API socket not ready after 2s, attempting anyway"); + break; + } + std::thread::sleep(interval); + interval = (interval * 2).min(std::time::Duration::from_millis(200)); + } + } + + let guest_ip = "192.168.127.2"; + + for pm in port_map { + let parts: Vec<&str> = pm.split(':').collect(); + let (host_port, guest_port) = match parts.len() { + 2 => (parts[0], parts[1]), + 1 => (parts[0], parts[0]), + _ => { + eprintln!(" skipping invalid port mapping: {pm}"); + continue; + } + }; + + let expose_body = format!( + r#"{{"local":":{host_port}","remote":"{guest_ip}:{guest_port}","protocol":"tcp"}}"# + ); + + let mut expose_ok = false; + let mut retry_interval = std::time::Duration::from_millis(100); + let expose_deadline = Instant::now() + std::time::Duration::from_secs(10); + loop { + match gvproxy_expose(api_sock, &expose_body) { + Ok(()) => { + eprintln!(" port {host_port} -> {guest_ip}:{guest_port}"); + expose_ok = true; + break; + } + Err(e) => { + if Instant::now() >= expose_deadline { + eprintln!(" port {host_port}: {e} (retries exhausted)"); + break; + } + std::thread::sleep(retry_interval); + retry_interval = (retry_interval * 2).min(std::time::Duration::from_secs(1)); + } + } + } + if !expose_ok { + return Err(VmError::HostSetup(format!( + "failed to forward port {host_port} via gvproxy" + ))); + } + } + eprintln!( + "Port forwarding ready [{:.1}s]", + fwd_start.elapsed().as_secs_f64() + ); + + Ok(()) +} diff --git a/crates/openshell-vm/src/exec.rs b/crates/openshell-vm/src/exec.rs index 6195556e1..1f8ad03fe 100644 --- a/crates/openshell-vm/src/exec.rs +++ b/crates/openshell-vm/src/exec.rs @@ 
-48,6 +48,22 @@ fn safe_remove_dir_all(path: &Path) -> Result { pub const VM_EXEC_VSOCK_PORT: u32 = 10_777; +/// How to connect to the VM exec agent. +/// +/// libkrun bridges each guest vsock port to a host Unix socket via +/// `krun_add_vsock_port2`. cloud-hypervisor uses standard vhost-vsock +/// with CID-based addressing — the host connects via `AF_VSOCK` or a +/// vsock-proxy/socat bridge. +#[derive(Debug, Clone)] +pub enum VsockConnectMode { + /// Connect via a host Unix socket (libkrun per-port bridging). + UnixSocket(PathBuf), + /// Connect via a vsock proxy bridge (cloud-hypervisor). + /// The path points to a socat-bridged Unix socket that forwards + /// to guest CID 3, port [`VM_EXEC_VSOCK_PORT`]. + VsockBridge(PathBuf), +} + const VM_STATE_NAME: &str = "vm-state.json"; const VM_LOCK_NAME: &str = "vm.lock"; const KUBECONFIG_ENV: &str = "KUBECONFIG=/etc/rancher/k3s/k3s.yaml"; @@ -72,6 +88,10 @@ pub struct VmRuntimeState { /// PID of the gvproxy process (if networking uses gvproxy). #[serde(default, skip_serializing_if = "Option::is_none")] pub gvproxy_pid: Option, + /// Whether this VM uses vsock-bridge mode (cloud-hypervisor) vs + /// Unix socket mode (libkrun). Defaults to false for backward compat. 
+ #[serde(default, skip_serializing_if = "std::ops::Not::not")] + pub vsock_bridge: bool, } #[derive(Debug, Serialize)] @@ -132,6 +152,7 @@ pub fn write_vm_runtime_state( pid: i32, console_log: &Path, gvproxy_pid: Option, + vsock_bridge: bool, ) -> Result<(), VmError> { let state = VmRuntimeState { pid, @@ -141,6 +162,7 @@ pub fn write_vm_runtime_state( console_log: console_log.to_path_buf(), started_at_ms: now_ms()?, gvproxy_pid, + vsock_bridge, }; let path = vm_state_path(rootfs); let bytes = serde_json::to_vec_pretty(&state) @@ -471,10 +493,21 @@ pub fn ensure_vm_not_running(rootfs: &Path) -> Result<(), VmError> { pub fn exec_running_vm(options: VmExecOptions) -> Result { let state = load_vm_runtime_state(options.rootfs.as_deref())?; - let mut stream = UnixStream::connect(&state.socket_path).map_err(|e| { + + let connect_mode = if state.vsock_bridge { + VsockConnectMode::VsockBridge(state.socket_path.clone()) + } else { + VsockConnectMode::UnixSocket(state.socket_path.clone()) + }; + + let socket_path = match &connect_mode { + VsockConnectMode::UnixSocket(p) | VsockConnectMode::VsockBridge(p) => p, + }; + + let mut stream = UnixStream::connect(socket_path).map_err(|e| { VmError::Exec(format!( "connect to VM exec socket {}: {e}", - state.socket_path.display() + socket_path.display() )) })?; let mut writer = stream diff --git a/crates/openshell-vm/src/gpu_passthrough.rs b/crates/openshell-vm/src/gpu_passthrough.rs new file mode 100644 index 000000000..b835bca89 --- /dev/null +++ b/crates/openshell-vm/src/gpu_passthrough.rs @@ -0,0 +1,1959 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Host-side NVIDIA GPU VFIO readiness probing for VM passthrough. +//! +//! This module scans Linux sysfs (`/sys/bus/pci/devices`) for NVIDIA GPUs +//! (vendor ID `0x10de`), checks their driver binding, and verifies IOMMU +//! 
group cleanliness — the prerequisites for passing a physical GPU into +//! a cloud-hypervisor VM via VFIO. +//! +//! Returns per-device readiness for multi-GPU hosts. +//! +//! On non-Linux platforms, probing returns an empty list. + +use std::fmt; +use std::path::PathBuf; +use std::time::Duration; + +/// Per-device readiness state for NVIDIA GPU VFIO passthrough. +/// +/// Each variant represents a distinct readiness state for a single PCI device. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum HostNvidiaVfioReadiness { + /// The current platform does not support VFIO passthrough (non-Linux). + UnsupportedPlatform, + + /// No PCI device with NVIDIA vendor ID (`0x10de`) was found. + NoNvidiaDevice, + + /// An NVIDIA device exists but is bound to the nvidia (or other non-VFIO) driver. + BoundToNvidia, + + /// An NVIDIA device is bound to `vfio-pci` and its IOMMU group is clean — ready for passthrough. + VfioBoundReady, + + /// An NVIDIA device is bound to `vfio-pci` but its IOMMU group contains + /// devices not bound to `vfio-pci`, which prevents safe passthrough. + VfioBoundDirtyGroup, + + /// Some NVIDIA devices are bound to `vfio-pci` while others use + /// a different driver (mixed fleet). 
+ MixedVfioAndOther, +} + +impl fmt::Display for HostNvidiaVfioReadiness { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::UnsupportedPlatform => write!( + f, + "VFIO passthrough is not supported on this platform (Linux required)" + ), + Self::NoNvidiaDevice => write!(f, "no NVIDIA PCI device found"), + Self::BoundToNvidia => { + write!(f, "NVIDIA device found but not bound to vfio-pci driver") + } + Self::VfioBoundReady => write!( + f, + "NVIDIA device bound to vfio-pci and IOMMU group is clean" + ), + Self::VfioBoundDirtyGroup => write!( + f, + "NVIDIA device bound to vfio-pci but IOMMU group contains non-VFIO devices" + ), + Self::MixedVfioAndOther => write!( + f, + "some NVIDIA devices are on vfio-pci while others use a different driver" + ), + } + } +} + +const NVIDIA_VENDOR_ID: &str = "0x10de"; + +#[cfg(target_os = "linux")] +const SYSFS_WRITE_TIMEOUT: Duration = Duration::from_secs(10); + +#[cfg(target_os = "linux")] +fn sysfs_write_with_timeout( + path: &std::path::Path, + data: &str, + timeout: Duration, +) -> Result<(), std::io::Error> { + use std::process::{Command, Stdio}; + use std::thread; + + let mut child = Command::new("sh") + .arg("-c") + .arg(format!( + r#"printf '%s' '{}' > '{}'"#, + data.replace('\'', "'\\''"), + path.display().to_string().replace('\'', "'\\''") + )) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|e| { + std::io::Error::new( + e.kind(), + format!( + "failed to spawn sysfs write subprocess for {}: {e}", + path.display() + ), + ) + })?; + + let poll_interval = Duration::from_millis(100); + let start = std::time::Instant::now(); + + loop { + match child.try_wait() { + Ok(Some(status)) => { + if status.success() { + return Ok(()); + } + let mut stderr_buf = String::new(); + if let Some(mut stderr) = child.stderr.take() { + use std::io::Read; + let _ = stderr.read_to_string(&mut stderr_buf); + } + let hint = if stderr_buf.contains("Permission 
denied") { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!( + "sysfs write to {} failed (exit {}){hint}: {stderr_buf}", + path.display(), + status.code().unwrap_or(-1), + ), + )); + } + Ok(None) => { + if start.elapsed() > timeout { + let pid = child.id(); + let _ = child.kill(); + // CRITICAL: Do NOT call child.wait() here. If the child + // is stuck in uninterruptible sleep (D-state) — which is + // the nvidia unbind deadlock scenario — wait() will block + // the parent indefinitely, making it unkillable too. + // + // Dropping the Child struct closes pipe handles but does + // NOT wait. The zombie child is reparented to init and + // reaped when/if it eventually exits. + drop(child); + return Err(std::io::Error::new( + std::io::ErrorKind::TimedOut, + format!( + "sysfs write to {} timed out after {:.0}s (subprocess pid {pid}) — \ + possible nvidia driver deadlock. The subprocess may still be \ + stuck in kernel space; a reboot may be required to clear it.", + path.display(), + timeout.as_secs_f64(), + ), + )); + } + thread::sleep(poll_interval); + } + Err(e) => return Err(e), + } + } +} + +/// Validates that `addr` matches the PCI BDF format `DDDD:BB:DD.F`. +fn validate_pci_addr(addr: &str) -> Result<(), std::io::Error> { + let bytes = addr.as_bytes(); + let valid = bytes.len() == 12 + && bytes[4] == b':' + && bytes[7] == b':' + && bytes[10] == b'.' + && bytes[..4].iter().all(|b| b.is_ascii_hexdigit()) + && bytes[5..7].iter().all(|b| b.is_ascii_hexdigit()) + && bytes[8..10].iter().all(|b| b.is_ascii_hexdigit()) + && bytes[11].is_ascii_digit(); + if valid { + Ok(()) + } else { + Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("invalid PCI address '{addr}': expected DDDD:BB:DD.F format"), + )) + } +} + +/// Probe the host for NVIDIA GPU VFIO readiness by scanning Linux sysfs. 
+/// +/// Returns a per-device list of `(pci_address, readiness)` tuples for every +/// NVIDIA GPU found. On non-Linux platforms the list is empty. +/// +/// On Linux, walks `/sys/bus/pci/devices/` and for each device: +/// 1. Reads `vendor` to check for NVIDIA (`0x10de`). +/// 2. Reads the `driver` symlink to determine which kernel driver is bound. +/// 3. If bound to `vfio-pci`, inspects the `iommu_group/devices/` directory +/// to verify all group members are also on `vfio-pci`. +pub fn probe_host_nvidia_vfio_readiness() -> Vec<(String, HostNvidiaVfioReadiness)> { + #[cfg(not(target_os = "linux"))] + { + Vec::new() + } + + #[cfg(target_os = "linux")] + { + probe_linux_sysfs() + } +} + +#[cfg(target_os = "linux")] +fn probe_linux_sysfs() -> Vec<(String, HostNvidiaVfioReadiness)> { + use std::fs; + use std::path::Path; + + let pci_devices = Path::new("/sys/bus/pci/devices"); + let entries = match fs::read_dir(pci_devices) { + Ok(e) => e, + Err(_) => return Vec::new(), + }; + + let mut results = Vec::new(); + + for entry in entries.filter_map(Result::ok) { + let dev_path = entry.path(); + + let vendor = match fs::read_to_string(dev_path.join("vendor")) { + Ok(v) => v.trim().to_lowercase(), + Err(_) => continue, + }; + + if vendor != NVIDIA_VENDOR_ID { + continue; + } + + let pci_addr = entry.file_name().to_string_lossy().to_string(); + + let driver_link = dev_path.join("driver"); + let driver_name = fs::read_link(&driver_link).ok().and_then(|target| { + target + .file_name() + .map(|name| name.to_string_lossy().to_string()) + }); + + let state = match driver_name.as_deref() { + Some("vfio-pci") => { + let iommu_group_devices = dev_path.join("iommu_group/devices"); + let group_clean = match fs::read_dir(&iommu_group_devices) { + Ok(group_entries) => group_entries.filter_map(Result::ok).all(|ge| { + let peer_path = iommu_group_devices.join(ge.file_name()).join("driver"); + fs::read_link(&peer_path) + .ok() + .and_then(|t| t.file_name().map(|n| 
n.to_string_lossy().to_string())) + .as_deref() + == Some("vfio-pci") + }), + Err(_) => false, + }; + + if group_clean { + HostNvidiaVfioReadiness::VfioBoundReady + } else { + HostNvidiaVfioReadiness::VfioBoundDirtyGroup + } + } + _ => HostNvidiaVfioReadiness::BoundToNvidia, + }; + + results.push((pci_addr, state)); + } + + results +} + +/// Returns whether any NVIDIA GPU is fully available for VM passthrough. +/// +/// Requires `OPENSHELL_VM_GPU_E2E=1` to activate probing. When the env var +/// is unset or not `"1"`, returns `false` unconditionally so non-GPU CI +/// runners are never affected. +/// +/// When activated, checks two conditions: +/// 1. At least one NVIDIA device reports [`VfioBoundReady`]. +/// 2. The cloud-hypervisor binary exists in the runtime bundle. +pub fn nvidia_gpu_available_for_vm_passthrough() -> bool { + if std::env::var("OPENSHELL_VM_GPU_E2E").as_deref() != Ok("1") { + return false; + } + + let has_vfio_ready = probe_host_nvidia_vfio_readiness() + .iter() + .any(|(_, state)| *state == HostNvidiaVfioReadiness::VfioBoundReady); + + if !has_vfio_ready { + return false; + } + + let chv_exists = crate::configured_runtime_dir() + .map(|dir| dir.join("cloud-hypervisor").is_file()) + .unwrap_or(false); + + chv_exists +} + +/// Sysfs root path, defaulting to "/" in production and a temp dir in tests. 
+#[derive(Debug, Clone)] +pub(crate) struct SysfsRoot(PathBuf); + +impl Default for SysfsRoot { + fn default() -> Self { + Self(PathBuf::from("/")) + } +} + +impl SysfsRoot { + #[cfg(test)] + pub fn new(root: PathBuf) -> Self { + Self(root) + } + + pub fn sys_bus_pci_devices(&self) -> PathBuf { + self.0.join("sys/bus/pci/devices") + } + + pub fn sys_class_drm(&self) -> PathBuf { + self.0.join("sys/class/drm") + } + + pub fn sys_module(&self, module: &str) -> PathBuf { + self.0.join("sys/module").join(module) + } + + pub fn sys_bus_pci_drivers(&self, driver: &str) -> PathBuf { + self.0.join("sys/bus/pci/drivers").join(driver) + } + + pub fn sys_kernel_iommu_groups(&self) -> PathBuf { + self.0.join("sys/kernel/iommu_groups") + } + + fn is_real_sysfs(&self) -> bool { + self.0 == std::path::Path::new("/") + } + + #[cfg(target_os = "linux")] + fn write_sysfs(&self, path: &std::path::Path, data: &str) -> Result<(), std::io::Error> { + if self.is_real_sysfs() { + sysfs_write_with_timeout(path, data, SYSFS_WRITE_TIMEOUT) + } else { + std::fs::write(path, data).map_err(|e| { + std::io::Error::new(e.kind(), format!("failed to write {}: {e}", path.display())) + }) + } + } +} + +#[cfg(target_os = "linux")] +pub(crate) fn check_display_attached(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + use std::fs; + + let drm_dir = sysfs.sys_class_drm(); + let entries = match fs::read_dir(&drm_dir) { + Ok(e) => e, + Err(_) => return false, + }; + + for entry in entries.filter_map(Result::ok) { + let name = entry.file_name().to_string_lossy().to_string(); + if !name.starts_with("card") || name.contains('-') { + continue; + } + + let card_dir = entry.path(); + let device_link = card_dir.join("device"); + + let target = match fs::read_link(&device_link) { + Ok(t) => t, + Err(_) => continue, + }; + if !target.to_string_lossy().ends_with(pci_addr) { + continue; + } + + let boot_vga_path = card_dir.join("device").join("boot_vga"); + if let Ok(val) = fs::read_to_string(&boot_vga_path) { + if 
val.trim() == "1" { + return true; + } + } + + if let Ok(sub_entries) = fs::read_dir(&card_dir) { + for sub in sub_entries.filter_map(Result::ok) { + let sub_name = sub.file_name().to_string_lossy().to_string(); + if sub_name.starts_with(&format!("{name}-")) { + if let Ok(status) = fs::read_to_string(sub.path().join("status")) { + if status.trim() == "connected" { + return true; + } + } + } + } + } + } + + false +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn check_display_attached(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +#[cfg(target_os = "linux")] +/// Checks whether any process on the host has an open handle to an NVIDIA GPU +/// device (`/dev/nvidia*`). This is a host-wide check across ALL NVIDIA GPUs, +/// not scoped to a single PCI address. Returns a list of (pid, comm) pairs. +pub(crate) fn check_active_gpu_processes() -> std::io::Result> { + use std::fs; + + let mut result = Vec::new(); + + let proc_dir = match fs::read_dir("/proc") { + Ok(d) => d, + Err(e) => { + return Err(std::io::Error::new( + e.kind(), + format!( + "cannot scan /proc for active GPU processes: {e} — \ + refusing to unbind (fail-closed)" + ), + )); + } + }; + + for proc_entry in proc_dir.filter_map(Result::ok) { + let pid: u32 = match proc_entry.file_name().to_string_lossy().parse() { + Ok(p) => p, + Err(_) => continue, + }; + + let fd_dir = proc_entry.path().join("fd"); + let fds = match fs::read_dir(&fd_dir) { + Ok(d) => d, + Err(_) => continue, + }; + + for fd_entry in fds.filter_map(Result::ok) { + if let Ok(target) = fs::read_link(fd_entry.path()) { + if target.to_string_lossy().starts_with("/dev/nvidia") { + let comm = fs::read_to_string(format!("/proc/{pid}/comm")) + .unwrap_or_default() + .trim() + .to_string(); + result.push((pid, comm)); + break; + } + } + } + } + + Ok(result) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn check_active_gpu_processes() -> std::io::Result> { + Ok(vec![]) +} + +#[cfg(target_os = "linux")] +pub(crate) fn 
check_iommu_enabled(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + let iommu_groups = sysfs.sys_kernel_iommu_groups(); + if !iommu_groups.is_dir() { + return false; + } + sysfs + .sys_bus_pci_devices() + .join(pci_addr) + .join("iommu_group") + .exists() +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn check_iommu_enabled(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +#[cfg(target_os = "linux")] +pub(crate) fn check_vfio_modules_loaded(sysfs: &SysfsRoot) -> bool { + sysfs.sys_module("vfio_pci").is_dir() && sysfs.sys_module("vfio_iommu_type1").is_dir() +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn check_vfio_modules_loaded(_sysfs: &SysfsRoot) -> bool { + false +} + +#[cfg(target_os = "linux")] +pub(crate) fn check_sysfs_permissions(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + use nix::unistd::{AccessFlags, access}; + + let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); + let driver_override = dev_dir.join("driver_override"); + let unbind = dev_dir.join("driver/unbind"); + let bind = sysfs.sys_bus_pci_drivers("vfio-pci").join("bind"); + + let writable = |path: &std::path::Path| -> bool { access(path, AccessFlags::W_OK).is_ok() }; + + let unbind_ok = !unbind.exists() || writable(&unbind); + writable(&driver_override) && unbind_ok && writable(&bind) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn check_sysfs_permissions(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +#[cfg(target_os = "linux")] +pub(crate) fn current_driver(sysfs: &SysfsRoot, pci_addr: &str) -> Option { + let driver_link = sysfs.sys_bus_pci_devices().join(pci_addr).join("driver"); + std::fs::read_link(&driver_link) + .ok() + .and_then(|target| target.file_name().map(|n| n.to_string_lossy().to_string())) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn current_driver(_sysfs: &SysfsRoot, _pci_addr: &str) -> Option { + None +} + +/// Nvidia kernel modules that hold internal references to GPU devices and can +/// prevent a clean unbind. 
Unloaded in order (most-dependent first). +#[cfg(target_os = "linux")] +const NVIDIA_SUBMODULES: &[&str] = &["nvidia_uvm", "nvidia_drm", "nvidia_modeset"]; + +/// Timeout for nvidia prep commands (nvidia-smi, modprobe). These commands +/// can wedge if the nvidia driver is in a bad state. +#[cfg(target_os = "linux")] +const NVIDIA_PREP_TIMEOUT: Duration = Duration::from_secs(15); + +/// Run a command with a timeout. Returns `Some(ExitStatus)` on success, +/// `None` on timeout or spawn failure. On timeout, kills the child and +/// drops it without calling `wait()` (same D-state safety as sysfs writes). +#[cfg(target_os = "linux")] +fn run_with_timeout( + mut cmd: std::process::Command, + timeout: Duration, +) -> Option { + use std::thread; + + let mut child = match cmd.spawn() { + Ok(c) => c, + Err(_) => return None, + }; + + let poll_interval = Duration::from_millis(100); + let start = std::time::Instant::now(); + + loop { + match child.try_wait() { + Ok(Some(status)) => return Some(status), + Ok(None) => { + if start.elapsed() > timeout { + let _ = child.kill(); + drop(child); + return None; + } + thread::sleep(poll_interval); + } + Err(_) => return None, + } + } +} + +/// Best-effort preparation of the nvidia driver before a raw sysfs unbind. +/// +/// Reduces the probability of the nvidia unbind deadlock by: +/// 1. Disabling persistence mode (nvidia-persistenced holds device refs). +/// 2. Unloading nvidia submodules that keep internal references open. +/// +/// All commands run with a timeout — if `nvidia-smi` or `modprobe` hangs +/// (which can happen when the nvidia driver is in a bad state), the parent +/// process is not blocked. Failures are logged but not fatal. +#[cfg(target_os = "linux")] +fn nvidia_pre_unbind_prep(pci_addr: &str) { + use std::process::{Command, Stdio}; + + // 1. Disable persistence mode via nvidia-smi (if available). 
+ let mut cmd = Command::new("nvidia-smi"); + cmd.args(["-i", pci_addr, "-pm", "0"]) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()); + match run_with_timeout(cmd, NVIDIA_PREP_TIMEOUT) { + Some(s) if s.success() => { + eprintln!("GPU {pci_addr}: disabled nvidia persistence mode"); + } + None => { + eprintln!( + "GPU {pci_addr}: nvidia-smi timed out after {:.0}s — skipping persistence mode", + NVIDIA_PREP_TIMEOUT.as_secs_f64() + ); + } + _ => {} + } + + // 2. Unload nvidia submodules that hold device references. + // This is best-effort — modules may be in use by other GPUs. + for module in NVIDIA_SUBMODULES { + let mut cmd = Command::new("modprobe"); + cmd.args(["-r", module]) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()); + match run_with_timeout(cmd, NVIDIA_PREP_TIMEOUT) { + Some(s) if s.success() => { + eprintln!("GPU {pci_addr}: unloaded {module}"); + } + None => { + eprintln!( + "GPU {pci_addr}: modprobe -r {module} timed out after {:.0}s", + NVIDIA_PREP_TIMEOUT.as_secs_f64() + ); + } + _ => {} + } + } +} + +#[cfg(target_os = "linux")] +pub(crate) fn bind_gpu_to_vfio( + sysfs: &SysfsRoot, + pci_addr: &str, +) -> Result { + validate_pci_addr(pci_addr)?; + let drv = current_driver(sysfs, pci_addr); + + if drv.as_deref() == Some("vfio-pci") { + return Ok("vfio-pci".to_string()); + } + + let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); + + if drv.is_some() { + let is_nvidia = drv.as_deref() == Some("nvidia"); + if is_nvidia && sysfs.is_real_sysfs() { + nvidia_pre_unbind_prep(pci_addr); + + // nvidia_pre_unbind_prep may cascade-remove the nvidia module when + // all submodules are unloaded, which automatically unbinds the device. + // Re-check before attempting the sysfs unbind write. 
+ if current_driver(sysfs, pci_addr).is_none() { + eprintln!("GPU {pci_addr}: device already unbound after nvidia module cleanup"); + } else if current_driver(sysfs, pci_addr).as_deref() == Some("vfio-pci") { + return Ok("vfio-pci".to_string()); + } + } + + // Only attempt the sysfs unbind if a driver is still bound. + if current_driver(sysfs, pci_addr).is_some() { + let unbind = dev_dir.join("driver/unbind"); + let unbind_result = sysfs.write_sysfs(&unbind, pci_addr); + + if let Err(ref e) = unbind_result { + if e.kind() == std::io::ErrorKind::TimedOut { + // The nvidia unbind deadlock can complete the unbind at the + // hardware level while the syscall never returns to userspace. + // Check if the device is actually unbound despite the timeout. + if current_driver(sysfs, pci_addr).is_none() { + eprintln!( + "GPU {pci_addr}: sysfs unbind timed out but device is unbound — \ + continuing (zombie subprocess may linger until reboot)" + ); + } else { + return Err(std::io::Error::new( + std::io::ErrorKind::TimedOut, + format!( + "Failed to unbind {pci_addr}: timed out and device is still \ + bound to {}. 
A reboot may be required.", + drv.as_deref().unwrap_or("unknown"), + ), + )); + } + } else { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to unbind device at {path}{hint}", + path = unbind.display() + ), + )); + } + } + } + } + + let driver_override = dev_dir.join("driver_override"); + if let Err(e) = sysfs.write_sysfs(&driver_override, "vfio-pci") { + if let Some(ref orig) = drv { + let orig_bind = sysfs.sys_bus_pci_drivers(orig).join("bind"); + let _ = sysfs.write_sysfs(&orig_bind, pci_addr); + } + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to write driver_override at {path}{hint}", + path = driver_override.display() + ), + )); + } + + let vfio_bind = sysfs.sys_bus_pci_drivers("vfio-pci").join("bind"); + if let Err(e) = sysfs.write_sysfs(&vfio_bind, pci_addr) { + let _ = sysfs.write_sysfs(&driver_override, ""); + if let Some(ref orig) = drv { + let orig_bind = sysfs.sys_bus_pci_drivers(orig).join("bind"); + let _ = sysfs.write_sysfs(&orig_bind, pci_addr); + } + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to bind to vfio-pci at {path}{hint} — is the vfio-pci module loaded?", + path = vfio_bind.display() + ), + )); + } + + Ok(drv.unwrap_or_default()) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn bind_gpu_to_vfio( + _sysfs: &SysfsRoot, + _pci_addr: &str, +) -> Result { + Ok(String::new()) +} + +#[cfg(target_os = "linux")] +pub(crate) fn rebind_gpu_to_original( + sysfs: &SysfsRoot, + pci_addr: &str, + original_driver: &str, +) -> Result<(), std::io::Error> { + validate_pci_addr(pci_addr)?; + let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); + + if current_driver(sysfs, 
pci_addr).is_some() { + let unbind = dev_dir.join("driver/unbind"); + sysfs.write_sysfs(&unbind, pci_addr).map_err(|e| { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + std::io::Error::new( + e.kind(), + format!( + "Failed to unbind device at {path}{hint}", + path = unbind.display() + ), + ) + })?; + } + + let driver_override = dev_dir.join("driver_override"); + sysfs.write_sysfs(&driver_override, "").map_err(|e| { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + std::io::Error::new( + e.kind(), + format!( + "Failed to clear driver_override at {path}{hint}", + path = driver_override.display() + ), + ) + })?; + + if !original_driver.is_empty() && original_driver != "none" { + let bind = sysfs.sys_bus_pci_drivers(original_driver).join("bind"); + sysfs.write_sysfs(&bind, pci_addr).map_err(|e| { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + std::io::Error::new( + e.kind(), + format!( + "Failed to rebind to {original_driver} at {path}{hint}", + path = bind.display() + ), + ) + })?; + } else { + let rescan = sysfs.0.join("sys/bus/pci/rescan"); + let _ = sysfs.write_sysfs(&rescan, "1"); + } + + Ok(()) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn rebind_gpu_to_original( + _sysfs: &SysfsRoot, + _pci_addr: &str, + _original_driver: &str, +) -> Result<(), std::io::Error> { + Ok(()) +} + +#[cfg(target_os = "linux")] +pub(crate) fn iommu_group_peers( + sysfs: &SysfsRoot, + pci_addr: &str, +) -> Result, std::io::Error> { + validate_pci_addr(pci_addr)?; + let iommu_devices = sysfs + .sys_bus_pci_devices() + .join(pci_addr) + .join("iommu_group/devices"); + + let entries = match std::fs::read_dir(&iommu_devices) { + Ok(e) => e, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(vec![]), + Err(e) => return Err(e), + }; + + let mut peers = Vec::new(); + for entry in 
entries.filter_map(Result::ok) { + let name = entry.file_name().to_string_lossy().to_string(); + if name != pci_addr { + peers.push(name); + } + } + Ok(peers) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn iommu_group_peers( + _sysfs: &SysfsRoot, + _pci_addr: &str, +) -> Result, std::io::Error> { + Ok(vec![]) +} + +#[cfg(target_os = "linux")] +pub(crate) fn bind_iommu_group_peers( + sysfs: &SysfsRoot, + pci_addr: &str, +) -> Result, std::io::Error> { + let peers = iommu_group_peers(sysfs, pci_addr)?; + let mut restore_list = Vec::new(); + + for peer in peers { + match bind_gpu_to_vfio(sysfs, &peer) { + Ok(original) => { + if original != "vfio-pci" { + restore_list.push((peer, original)); + } + } + Err(e) => { + let _ = rebind_iommu_group_peers(sysfs, &restore_list); + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to bind IOMMU peer {peer}: {e}. Rolled back {} peer(s).", + restore_list.len() + ), + )); + } + } + } + + Ok(restore_list) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn bind_iommu_group_peers( + _sysfs: &SysfsRoot, + _pci_addr: &str, +) -> Result, std::io::Error> { + Ok(vec![]) +} + +#[cfg(target_os = "linux")] +pub(crate) fn rebind_iommu_group_peers( + sysfs: &SysfsRoot, + peers: &[(String, String)], +) -> Result<(), std::io::Error> { + let mut first_err = None; + for (peer_addr, original_driver) in peers { + if let Err(e) = rebind_gpu_to_original(sysfs, peer_addr, original_driver) { + if first_err.is_none() { + first_err = Some(e); + } + } + } + match first_err { + Some(e) => Err(e), + None => Ok(()), + } +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn rebind_iommu_group_peers( + _sysfs: &SysfsRoot, + _peers: &[(String, String)], +) -> Result<(), std::io::Error> { + Ok(()) +} + +#[cfg(target_os = "linux")] +fn is_iommu_group_clean(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + let peers = match iommu_group_peers(sysfs, pci_addr) { + Ok(p) => p, + Err(_) => return false, + }; + peers + .iter() + .all(|peer| 
current_driver(sysfs, peer).as_deref() == Some("vfio-pci")) +} + +#[cfg(not(target_os = "linux"))] +fn is_iommu_group_clean(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +/// Captures the bind state for a GPU so it can be restored on shutdown. +#[derive(Debug)] +pub struct GpuBindState { + /// PCI address of the GPU that was bound. + pub pci_addr: String, + /// Driver the GPU was on before binding (e.g. "nvidia"). + pub original_driver: String, + /// IOMMU group peers that were rebound, with their original drivers. + pub peer_binds: Vec<(String, String)>, + /// Whether this instance performed the bind (false if GPU was already on vfio-pci). + pub did_bind: bool, +} + +impl GpuBindState { + /// Restore the GPU and its IOMMU peers to their original drivers. + pub fn restore(&self) -> Result<(), std::io::Error> { + self.restore_with_sysfs(&SysfsRoot::default()) + } + + pub(crate) fn restore_with_sysfs(&self, sysfs: &SysfsRoot) -> Result<(), std::io::Error> { + if !self.did_bind { + return Ok(()); + } + eprintln!( + "GPU: rebinding {} to {}", + self.pci_addr, self.original_driver + ); + rebind_gpu_to_original(sysfs, &self.pci_addr, &self.original_driver)?; + rebind_iommu_group_peers(sysfs, &self.peer_binds)?; + Ok(()) + } +} + +/// RAII guard that restores GPU driver binding when dropped. +/// +/// Ensures the GPU is rebound to its original driver on normal exit, +/// early return (?), or panic. Cannot protect against SIGKILL. +pub struct GpuBindGuard { + state: Option, +} + +impl GpuBindGuard { + pub fn new(state: GpuBindState) -> Self { + Self { state: Some(state) } + } + + /// Take the state out, preventing restore on drop. + pub fn disarm(&mut self) -> Option { + self.state.take() + } + + /// Get the PCI address of the bound GPU, if any. 
+ pub fn pci_addr(&self) -> Option<&str> { + self.state.as_ref().map(|s| s.pci_addr.as_str()) + } +} + +impl Drop for GpuBindGuard { + fn drop(&mut self) { + if let Some(ref state) = self.state { + eprintln!( + "GPU: restoring {} to {} (cleanup)", + state.pci_addr, state.original_driver + ); + if let Err(e) = state.restore() { + eprintln!("GPU: restore failed: {e}"); + } + } + } +} + +/// Prepare a GPU for VFIO passthrough: run safety checks, select, and bind. +/// +/// When `requested_bdf` is Some, targets that specific device. +/// When None (auto mode), selects the best available GPU. +/// +/// All safety checks are hard failures — if any check fails, this returns +/// an error and does not bind anything. +pub fn prepare_gpu_for_passthrough( + requested_bdf: Option<&str>, +) -> Result { + prepare_gpu_with_sysfs(&SysfsRoot::default(), requested_bdf) +} + +pub(crate) fn prepare_gpu_with_sysfs( + sysfs: &SysfsRoot, + requested_bdf: Option<&str>, +) -> Result { + match requested_bdf { + Some(bdf) => prepare_specific_gpu(sysfs, bdf), + None => prepare_auto_gpu(sysfs), + } +} + +fn prepare_specific_gpu(sysfs: &SysfsRoot, bdf: &str) -> Result { + validate_pci_addr(bdf)?; + + let dev_dir = sysfs.sys_bus_pci_devices().join(bdf); + if !dev_dir.exists() { + return Err(std::io::Error::new( + std::io::ErrorKind::NotFound, + format!("PCI device {bdf} not found in sysfs"), + )); + } + + let vendor = std::fs::read_to_string(dev_dir.join("vendor")) + .map(|v| v.trim().to_lowercase()) + .unwrap_or_default(); + if vendor != NVIDIA_VENDOR_ID { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("PCI device {bdf} is not an NVIDIA device (vendor: {vendor})"), + )); + } + let class = std::fs::read_to_string(dev_dir.join("class")) + .map(|c| c.trim().to_lowercase()) + .unwrap_or_default(); + if !class.starts_with("0x03") { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("PCI device {bdf} is not a GPU (class: {class})"), + )); 
+ } + + if current_driver(sysfs, bdf).as_deref() == Some("vfio-pci") && is_iommu_group_clean(sysfs, bdf) + { + return Ok(GpuBindState { + pci_addr: bdf.to_string(), + original_driver: "vfio-pci".to_string(), + peer_binds: vec![], + did_bind: false, + }); + } + + if check_display_attached(sysfs, bdf) { + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!("GPU {bdf}: has active display outputs"), + )); + } + + let procs = check_active_gpu_processes().map_err(|e| { + std::io::Error::new( + e.kind(), + format!("GPU {bdf}: cannot verify GPU is idle — {e}"), + ) + })?; + if !procs.is_empty() { + let desc: Vec = procs + .iter() + .map(|(pid, comm)| format!("{pid} ({comm})")) + .collect(); + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!("GPU {bdf}: in use by PIDs: {}", desc.join(", ")), + )); + } + + if !check_iommu_enabled(sysfs, bdf) { + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!("GPU {bdf}: IOMMU not enabled or device has no IOMMU group"), + )); + } + + if !check_vfio_modules_loaded(sysfs) { + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!("GPU {bdf}: VFIO kernel modules not loaded"), + )); + } + + if !check_sysfs_permissions(sysfs, bdf) { + return Err(std::io::Error::new( + std::io::ErrorKind::PermissionDenied, + format!("GPU {bdf}: insufficient sysfs permissions — run as root"), + )); + } + + let original_driver = bind_gpu_to_vfio(sysfs, bdf)?; + let peer_binds = match bind_iommu_group_peers(sysfs, bdf) { + Ok(peers) => peers, + Err(e) => { + let _ = rebind_gpu_to_original(sysfs, bdf, &original_driver); + return Err(e); + } + }; + + Ok(GpuBindState { + pci_addr: bdf.to_string(), + original_driver, + peer_binds, + did_bind: true, + }) +} + +fn prepare_auto_gpu(sysfs: &SysfsRoot) -> Result { + let pci_dir = sysfs.sys_bus_pci_devices(); + let entries = std::fs::read_dir(&pci_dir).map_err(|e| { + std::io::Error::new(e.kind(), format!("cannot read {}: {e}", pci_dir.display())) 
+ })?; + + let mut nvidia_addrs = Vec::new(); + for entry in entries.filter_map(Result::ok) { + let dev_path = entry.path(); + let vendor = match std::fs::read_to_string(dev_path.join("vendor")) { + Ok(v) => v.trim().to_lowercase(), + Err(_) => continue, + }; + let class = match std::fs::read_to_string(dev_path.join("class")) { + Ok(c) => c.trim().to_lowercase(), + Err(_) => continue, + }; + if vendor == NVIDIA_VENDOR_ID && class.starts_with("0x03") { + nvidia_addrs.push(entry.file_name().to_string_lossy().to_string()); + } + } + + if nvidia_addrs.is_empty() { + return Err(std::io::Error::new( + std::io::ErrorKind::NotFound, + "no NVIDIA PCI device found", + )); + } + + nvidia_addrs.sort(); + + for addr in &nvidia_addrs { + if current_driver(sysfs, addr).as_deref() == Some("vfio-pci") + && is_iommu_group_clean(sysfs, addr) + { + return Ok(GpuBindState { + pci_addr: addr.clone(), + original_driver: "vfio-pci".to_string(), + peer_binds: vec![], + did_bind: false, + }); + } + } + + let mut blocked: Vec<(String, String)> = Vec::new(); + let active_procs = check_active_gpu_processes() + .map_err(|e| std::io::Error::new(e.kind(), format!("cannot verify GPUs are idle — {e}")))?; + + for addr in &nvidia_addrs { + if current_driver(sysfs, addr).as_deref() == Some("vfio-pci") { + blocked.push((addr.clone(), "IOMMU group not clean".to_string())); + continue; + } + + if check_display_attached(sysfs, addr) { + blocked.push((addr.clone(), "has active display outputs".to_string())); + continue; + } + + if !active_procs.is_empty() { + let desc: Vec = active_procs + .iter() + .map(|(pid, comm)| format!("{pid} ({comm})")) + .collect(); + blocked.push((addr.clone(), format!("in use by PIDs: {}", desc.join(", ")))); + continue; + } + + if !check_iommu_enabled(sysfs, addr) { + blocked.push((addr.clone(), "IOMMU not enabled".to_string())); + continue; + } + + if !check_vfio_modules_loaded(sysfs) { + blocked.push((addr.clone(), "VFIO modules not loaded".to_string())); + continue; + } + + 
if !check_sysfs_permissions(sysfs, addr) { + blocked.push((addr.clone(), "insufficient sysfs permissions".to_string())); + continue; + } + + eprintln!("GPU: binding {addr} for VFIO passthrough"); + let original_driver = bind_gpu_to_vfio(sysfs, addr)?; + let peer_binds = match bind_iommu_group_peers(sysfs, addr) { + Ok(peers) => peers, + Err(e) => { + let _ = rebind_gpu_to_original(sysfs, addr, &original_driver); + return Err(e); + } + }; + + return Ok(GpuBindState { + pci_addr: addr.clone(), + original_driver, + peer_binds, + did_bind: true, + }); + } + + let mut msg = + String::from("GPU passthrough blocked by safety checks.\n\n Detected devices:\n"); + for (addr, reason) in &blocked { + msg.push_str(&format!(" {addr}: {reason}\n")); + } + msg.push_str("\n No GPU is available for passthrough."); + + Err(std::io::Error::new(std::io::ErrorKind::Other, msg)) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use std::path::Path; + + #[test] + fn passthrough_gate_is_false_without_env_var() { + // SAFETY: test runs single-threaded; no other thread reads this var. + unsafe { std::env::remove_var("OPENSHELL_VM_GPU_E2E") }; + assert!( + !nvidia_gpu_available_for_vm_passthrough(), + "gate must return false when OPENSHELL_VM_GPU_E2E is unset" + ); + } + + #[test] + fn probe_returns_no_device_or_readiness_on_typical_ci() { + let results = probe_host_nvidia_vfio_readiness(); + + #[cfg(not(target_os = "linux"))] + assert!(results.is_empty(), "non-Linux should return empty Vec"); + + #[cfg(target_os = "linux")] + { + // CI machines typically have no NVIDIA GPU bound to vfio-pci. + // Accept an empty list or any per-device readiness state. 
+ for (addr, state) in &results { + assert!(!addr.is_empty(), "PCI address should not be empty"); + assert!( + matches!( + state, + HostNvidiaVfioReadiness::BoundToNvidia + | HostNvidiaVfioReadiness::VfioBoundReady + | HostNvidiaVfioReadiness::VfioBoundDirtyGroup + ), + "unexpected per-device readiness state for {addr}: {state:?}" + ); + } + } + } + + #[test] + fn display_impl_is_meaningful() { + let states = [ + HostNvidiaVfioReadiness::UnsupportedPlatform, + HostNvidiaVfioReadiness::NoNvidiaDevice, + HostNvidiaVfioReadiness::BoundToNvidia, + HostNvidiaVfioReadiness::VfioBoundReady, + HostNvidiaVfioReadiness::VfioBoundDirtyGroup, + HostNvidiaVfioReadiness::MixedVfioAndOther, + ]; + for state in &states { + let msg = format!("{state}"); + assert!(!msg.is_empty(), "Display for {state:?} should not be empty"); + } + } + + fn mock_pci_device(root: &Path, pci_addr: &str, vendor: &str, driver: Option<&str>) { + use std::fs; + let dev_dir = root.join("sys/bus/pci/devices").join(pci_addr); + fs::create_dir_all(&dev_dir).unwrap(); + fs::write(dev_dir.join("vendor"), vendor).unwrap(); + fs::write(dev_dir.join("class"), "0x030000").unwrap(); + if let Some(drv) = driver { + let driver_dir = root.join("sys/bus/pci/drivers").join(drv); + fs::create_dir_all(&driver_dir).unwrap(); + #[cfg(unix)] + std::os::unix::fs::symlink(&driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(dev_dir.join("driver_override"), "").unwrap(); + } + + fn mock_drm_card(root: &Path, card: &str, pci_addr: &str, outputs: &[(&str, &str)]) { + use std::fs; + let card_dir = root.join("sys/class/drm").join(card); + fs::create_dir_all(&card_dir).unwrap(); + #[cfg(unix)] + std::os::unix::fs::symlink( + root.join("sys/bus/pci/devices").join(pci_addr), + card_dir.join("device"), + ) + .unwrap(); + for (output, status) in outputs { + let out_dir = card_dir.join(format!("{card}-{output}")); + fs::create_dir_all(&out_dir).unwrap(); + fs::write(out_dir.join("status"), status).unwrap(); + } + } + + fn 
mock_iommu_group(root: &Path, group_id: u32, members: &[&str]) { + use std::fs; + let group_dir = root.join(format!("sys/kernel/iommu_groups/{group_id}/devices")); + fs::create_dir_all(&group_dir).unwrap(); + for member in members { + let dev_dir = root.join("sys/bus/pci/devices").join(member); + fs::create_dir_all(&dev_dir).unwrap(); + #[cfg(unix)] + { + let iommu_group_target = root.join(format!("sys/kernel/iommu_groups/{group_id}")); + let _ = + std::os::unix::fs::symlink(&iommu_group_target, dev_dir.join("iommu_group")); + let _ = std::os::unix::fs::symlink(&dev_dir, group_dir.join(member)); + } + } + } + + #[test] + #[cfg(target_os = "linux")] + fn display_attached_detects_active_framebuffer() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + assert!(check_display_attached(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_attached_false_on_headless() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "disconnected")], + ); + assert!(!check_display_attached(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_attached_false_no_drm_card() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + assert!(!check_display_attached(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_check_fails_without_groups_dir() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", 
"0x10de", None); + assert!(!check_iommu_enabled(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_check_passes_with_group() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_iommu_group(root.path(), 15, &["0000:41:00.0"]); + assert!(check_iommu_enabled(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn vfio_modules_loaded_true() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + assert!(check_vfio_modules_loaded(&sysfs)); + } + + #[test] + #[cfg(target_os = "linux")] + fn vfio_modules_missing() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + assert!(!check_vfio_modules_loaded(&sysfs)); + } + + #[test] + #[cfg(target_os = "linux")] + fn permissions_writable() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let bind_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&bind_dir).unwrap(); + fs::write(bind_dir.join("bind"), "").unwrap(); + assert!(check_sysfs_permissions(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn permissions_readonly_driver_override() { + use std::os::unix::fs::PermissionsExt; + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let driver_override = root + .path() + .join("sys/bus/pci/devices/0000:41:00.0/driver_override"); + 
fs::set_permissions(&driver_override, fs::Permissions::from_mode(0o444)).unwrap(); + assert!(!check_sysfs_permissions(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn permissions_readonly_bind() { + use std::os::unix::fs::PermissionsExt; + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let bind_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&bind_dir).unwrap(); + let bind_path = bind_dir.join("bind"); + fs::write(&bind_path, "").unwrap(); + fs::set_permissions(&bind_path, fs::Permissions::from_mode(0o444)).unwrap(); + assert!(!check_sysfs_permissions(&sysfs, "0000:41:00.0")); + } + + fn mock_bindable_gpu(root: &Path, pci_addr: &str) { + mock_pci_device(root, pci_addr, "0x10de", Some("nvidia")); + let drv_unbind = root.join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + let vfio_dir = root.join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + fs::write(vfio_dir.join("bind"), "").unwrap(); + mock_iommu_group(root, 15, &[pci_addr]); + } + + #[test] + #[cfg(target_os = "linux")] + fn bind_gpu_writes_correct_sysfs_paths() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + + bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + + let unbind_content = + fs::read_to_string(root.path().join("sys/bus/pci/drivers/nvidia/unbind")).unwrap(); + assert_eq!(unbind_content, "0000:41:00.0"); + + let override_content = fs::read_to_string( + root.path() + .join("sys/bus/pci/devices/0000:41:00.0/driver_override"), + ) + .unwrap(); + assert_eq!(override_content, "vfio-pci"); + + let bind_content = + fs::read_to_string(root.path().join("sys/bus/pci/drivers/vfio-pci/bind")).unwrap(); + assert_eq!(bind_content, "0000:41:00.0"); + } + + #[test] + 
#[cfg(target_os = "linux")] + fn bind_returns_original_driver() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + + let result = bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!(result, "nvidia"); + } + + #[test] + #[cfg(target_os = "linux")] + fn bind_noop_when_already_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("vfio-pci")); + let vfio_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + fs::write(vfio_dir.join("bind"), "").unwrap(); + + let nvidia_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::create_dir_all(nvidia_unbind.parent().unwrap()).unwrap(); + fs::write(&nvidia_unbind, "").unwrap(); + + let result = bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!(result, "vfio-pci"); + + let unbind_content = fs::read_to_string(&nvidia_unbind).unwrap(); + assert_eq!( + unbind_content, "", + "nvidia unbind should NOT have been written" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn rebind_clears_driver_override() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.0"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + 
rebind_gpu_to_original(&sysfs, "0000:41:00.0", "nvidia").unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!(override_content, ""); + } + + #[test] + #[cfg(target_os = "linux")] + fn rebind_writes_to_original_driver_bind() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.0"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + rebind_gpu_to_original(&sysfs, "0000:41:00.0", "nvidia").unwrap(); + + let bind_content = fs::read_to_string(nvidia_dir.join("bind")).unwrap(); + assert_eq!(bind_content, "0000:41:00.0"); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_peers_listed_correctly() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_pci_device(root.path(), "0000:41:00.1", "0x10de", None); + mock_iommu_group(root.path(), 15, &["0000:41:00.0", "0000:41:00.1"]); + + let peers = iommu_group_peers(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!(peers, vec!["0000:41:00.1"]); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_peers_bound_together() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + mock_pci_device(root.path(), "0000:41:00.1", "0x10de", 
Some("nvidia")); + let drv_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + mock_iommu_group(root.path(), 15, &["0000:41:00.0", "0000:41:00.1"]); + + let restore = bind_iommu_group_peers(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!( + restore, + vec![("0000:41:00.1".to_string(), "nvidia".to_string())] + ); + + let override_content = fs::read_to_string( + root.path() + .join("sys/bus/pci/devices/0000:41:00.1/driver_override"), + ) + .unwrap(); + assert_eq!(override_content, "vfio-pci"); + } + + #[test] + #[cfg(target_os = "linux")] + fn peer_restore_rebinds_to_original() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + mock_pci_device(root.path(), "0000:41:00.1", "0x10de", Some("nvidia")); + let drv_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + mock_iommu_group(root.path(), 15, &["0000:41:00.0", "0000:41:00.1"]); + + let restore = bind_iommu_group_peers(&sysfs, "0000:41:00.0").unwrap(); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.1"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + rebind_iommu_group_peers(&sysfs, &restore).unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!(override_content, ""); + } + + fn mock_multi_gpu_host(root: &Path) { + // GPU 0: on nvidia, has display attached + mock_bindable_gpu(root, "0000:41:00.0"); + mock_drm_card(root, "card0", 
"0000:41:00.0", &[("DP-1", "connected")]); + + // GPU 1: on nvidia, idle (no display, no processes) + mock_bindable_gpu(root, "0000:42:00.0"); + + // GPU 2: already on vfio-pci, clean IOMMU group + mock_pci_device(root, "0000:43:00.0", "0x10de", Some("vfio-pci")); + mock_iommu_group(root, 17, &["0000:43:00.0"]); + + fs::create_dir_all(root.join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.join("sys/module/vfio_iommu_type1")).unwrap(); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_prefers_already_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_multi_gpu_host(root.path()); + + let state = prepare_gpu_with_sysfs(&sysfs, None).unwrap(); + assert_eq!(state.pci_addr, "0000:43:00.0"); + assert!(!state.did_bind); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_selects_idle_gpu_when_no_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + mock_iommu_group(root.path(), 15, &["0000:41:00.0"]); + + mock_pci_device(root.path(), "0000:42:00.0", "0x10de", Some("nvidia")); + mock_iommu_group(root.path(), 16, &["0000:42:00.0"]); + + let drv_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + let vfio_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + fs::write(vfio_dir.join("bind"), "").unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, None).unwrap(); + assert_eq!(state.pci_addr, "0000:42:00.0"); + assert!(state.did_bind); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_fails_when_all_blocked() { 
+ let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + mock_iommu_group(root.path(), 15, &["0000:41:00.0"]); + + mock_pci_device(root.path(), "0000:42:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card1", + "0000:42:00.0", + &[("HDMI-1", "connected")], + ); + mock_iommu_group(root.path(), 16, &["0000:42:00.0"]); + + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let err = prepare_gpu_with_sysfs(&sysfs, None).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("display"), + "error should mention display: {msg}" + ); + assert!( + msg.contains("0000:41:00.0"), + "error should list first GPU: {msg}" + ); + assert!( + msg.contains("0000:42:00.0"), + "error should list second GPU: {msg}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_fails_on_empty_host() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + fs::create_dir_all(root.path().join("sys/bus/pci/devices")).unwrap(); + + let err = prepare_gpu_with_sysfs(&sysfs, None).unwrap_err(); + assert!( + err.to_string().contains("no NVIDIA PCI device found"), + "unexpected error: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_binds_target() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap(); + assert_eq!(state.pci_addr, "0000:41:00.0"); + 
assert!(state.did_bind); + assert_eq!(state.original_driver, "nvidia"); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_validates_format() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + let err = prepare_gpu_with_sysfs(&sysfs, Some("invalid")).unwrap_err(); + assert!( + err.to_string().contains("invalid PCI address"), + "unexpected error: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_fails_display_check() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + + let err = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap_err(); + assert!( + err.to_string().contains("display"), + "error should mention display: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_fails_iommu_check() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + + let err = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap_err(); + assert!( + err.to_string().contains("IOMMU"), + "error should mention IOMMU: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn restore_round_trips() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap(); + assert!(state.did_bind); + assert_eq!(state.original_driver, "nvidia"); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.0"); + let 
vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + state.restore_with_sysfs(&sysfs).unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!(override_content, ""); + + let bind_content = fs::read_to_string(nvidia_dir.join("bind")).unwrap(); + assert_eq!(bind_content, "0000:41:00.0"); + } + + #[test] + #[cfg(target_os = "linux")] + fn restore_noop_when_did_not_bind() { + let state = GpuBindState { + pci_addr: "0000:43:00.0".to_string(), + original_driver: "vfio-pci".to_string(), + peer_binds: vec![], + did_bind: false, + }; + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + state.restore_with_sysfs(&sysfs).unwrap(); + } + + #[test] + fn guard_has_pci_addr() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: true, + }; + let guard = GpuBindGuard::new(state); + assert_eq!(guard.pci_addr(), Some("0000:41:00.0")); + } + + #[test] + fn guard_disarm_returns_state() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: true, + }; + let mut guard = GpuBindGuard::new(state); + let taken = guard.disarm(); + assert!(taken.is_some()); + assert_eq!(guard.pci_addr(), None); + } + + #[test] + fn guard_disarm_prevents_double_restore() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: true, + }; + let mut guard = 
GpuBindGuard::new(state); + let _ = guard.disarm(); + let second = guard.disarm(); + assert!(second.is_none()); + } + + #[test] + fn guard_drop_noop_when_did_not_bind() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: false, + }; + let guard = GpuBindGuard::new(state); + drop(guard); + } + + #[test] + fn guard_drop_on_panic_is_safe() { + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: false, + }; + let _guard = GpuBindGuard::new(state); + panic!("test panic"); + })); + assert!(result.is_err()); + } +} diff --git a/crates/openshell-vm/src/lib.rs b/crates/openshell-vm/src/lib.rs index 2b78a7669..9b70b32cf 100644 --- a/crates/openshell-vm/src/lib.rs +++ b/crates/openshell-vm/src/lib.rs @@ -14,9 +14,11 @@ #![allow(unsafe_code)] +pub mod backend; mod embedded; mod exec; mod ffi; +pub mod gpu_passthrough; mod health; use std::ffi::CString; @@ -25,9 +27,10 @@ use std::ptr; use std::time::Instant; pub use exec::{ - VM_EXEC_VSOCK_PORT, VmExecOptions, VmRuntimeState, acquire_rootfs_lock, clear_vm_runtime_state, - ensure_vm_not_running, exec_capture, exec_running_vm, recover_corrupt_kine_db, - reset_runtime_state, vm_exec_socket_path, vm_state_path, write_vm_runtime_state, + VM_EXEC_VSOCK_PORT, VmExecOptions, VmRuntimeState, VsockConnectMode, acquire_rootfs_lock, + clear_vm_runtime_state, ensure_vm_not_running, exec_capture, exec_running_vm, + recover_corrupt_kine_db, reset_runtime_state, vm_exec_socket_path, vm_state_path, + write_vm_runtime_state, }; // ── Error type ───────────────────────────────────────────────────────── @@ -45,6 +48,19 @@ pub enum VmError { )] RootfsNotFound { path: String }, + /// The GPU rootfs directory does not exist. 
+ #[error( + "GPU rootfs not found: {path}\n\ + The --gpu flag requires a rootfs built with GPU support (NVIDIA drivers,\n\ + nvidia-container-toolkit, and GPU manifests).\n\ + Build one with:\n\ + \x20 ./crates/openshell-vm/scripts/build-rootfs.sh --gpu \n\ + Then either:\n\ + \x20 - Copy it to: {path}\n\ + \x20 - Or use: openshell-vm --gpu --rootfs <path>" + )] + GpuRootfsNotFound { path: String }, + /// A path contained invalid UTF-8. #[error("path is not valid UTF-8: {0}")] InvalidPath(String), @@ -98,6 +114,18 @@ fn check(ret: i32, func: &'static str) -> Result<(), VmError> { // ── Configuration ────────────────────────────────────────────────────── +/// Hypervisor backend selection. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum VmBackendChoice { + /// Auto-select: cloud-hypervisor when a VFIO device is configured, libkrun otherwise. + #[default] + Auto, + /// Force the libkrun backend. + Libkrun, + /// Force the cloud-hypervisor backend (even without GPU/VFIO). + CloudHypervisor, +} + /// Networking backend for the microVM. #[derive(Debug, Clone)] pub enum NetBackend { @@ -202,6 +230,16 @@ pub struct VmConfig { /// Optional host-backed raw block image for mutable guest state. pub state_disk: Option<StateDiskConfig>, + + /// Whether GPU passthrough is enabled for this VM. + pub gpu_enabled: bool, + + /// VFIO PCI device address for GPU passthrough (e.g. `0000:41:00.0`). + /// When set, the cloud-hypervisor backend is used instead of libkrun. + pub vfio_device: Option<String>, + + /// Hypervisor backend override. Defaults to [`VmBackendChoice::Auto`]. + pub backend: VmBackendChoice, } impl VmConfig { @@ -245,6 +283,9 @@ impl VmConfig { reset: false, gateway_name: format!("{GATEWAY_NAME_PREFIX}-default"), state_disk: Some(state_disk), + gpu_enabled: false, + vfio_device: None, + backend: VmBackendChoice::Auto, } } } @@ -277,6 +318,38 @@ impl VmConfig { .join("rootfs")) } +/// Resolve the GPU rootfs path for a named instance. 
+/// +/// Layout: `$XDG_DATA_HOME/openshell/openshell-vm/{version}/instances/{name}/rootfs-gpu` +/// +/// The GPU rootfs is built separately with `build-rootfs.sh --gpu` and is +/// never embedded (too large with NVIDIA drivers). If it doesn't exist, +/// callers should return [`VmError::GpuRootfsNotFound`]. +pub fn named_gpu_rootfs_dir(instance_name: &str) -> Result<PathBuf, VmError> { + let name = sanitize_instance_name(instance_name)?; + let base = openshell_bootstrap::paths::openshell_vm_base_dir() + .map_err(|e| VmError::RuntimeState(format!("resolve openshell-vm base dir: {e}")))?; + Ok(base + .join(env!("CARGO_PKG_VERSION")) + .join("instances") + .join(name) + .join("rootfs-gpu")) +} + +/// Ensure a GPU rootfs exists for the named instance. +/// +/// Unlike [`ensure_named_rootfs`], there is no embedded GPU rootfs to +/// extract — the user must pre-build it with `build-rootfs.sh --gpu`. +pub fn ensure_gpu_rootfs(instance_name: &str) -> Result<PathBuf, VmError> { + let gpu_rootfs = named_gpu_rootfs_dir(instance_name)?; + if gpu_rootfs.is_dir() { + return Ok(gpu_rootfs); + } + Err(VmError::GpuRootfsNotFound { + path: gpu_rootfs.display().to_string(), + }) +} + /// Ensure a named instance rootfs exists, extracting from the embedded /// rootfs tarball on first use. /// @@ -365,7 +438,7 @@ fn sanitize_instance_name(name: &str) -> Result { /// Build a null-terminated C string array from a slice of strings. /// /// Returns both the `CString` owners (to keep them alive) and the pointer array. 
-fn c_string_array(strings: &[&str]) -> Result<(Vec<CString>, Vec<*const libc::c_char>), VmError> { +pub(crate) fn c_string_array( + strings: &[&str], +) -> Result<(Vec<CString>, Vec<*const libc::c_char>), VmError> { let owned: Vec<CString> = strings .iter() .map(|s| CString::new(*s)) @@ -570,7 +645,7 @@ fn extract_json_string(json: &str, key: &str) -> Option<String> { map.get(key)?.as_str().map(ToOwned::to_owned) } -fn clamp_log_level(level: u32) -> u32 { +pub(crate) fn clamp_log_level(level: u32) -> u32 { match level { 0 => ffi::KRUN_LOG_LEVEL_OFF, 1 => ffi::KRUN_LOG_LEVEL_ERROR, @@ -581,258 +656,29 @@ fn clamp_log_level(level: u32) -> u32 { } } -struct VmContext { - krun: &'static ffi::LibKrun, - ctx_id: u32, -} - -impl VmContext { - fn create(log_level: u32) -> Result<Self, VmError> { - let krun = ffi::libkrun()?; - unsafe { - check( - (krun.krun_init_log)( - ffi::KRUN_LOG_TARGET_DEFAULT, - clamp_log_level(log_level), - ffi::KRUN_LOG_STYLE_AUTO, - ffi::KRUN_LOG_OPTION_NO_ENV, - ), - "krun_init_log", - )?; - } - - let ctx_id = unsafe { (krun.krun_create_ctx)() }; - if ctx_id < 0 { - return Err(VmError::Krun { - func: "krun_create_ctx", - code: ctx_id, - }); - } - - Ok(Self { - krun, - ctx_id: ctx_id as u32, - }) - } - - fn set_vm_config(&self, vcpus: u8, mem_mib: u32) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_set_vm_config)(self.ctx_id, vcpus, mem_mib), - "krun_set_vm_config", - ) - } - } - - fn set_root(&self, rootfs: &Path) -> Result<(), VmError> { - let rootfs_c = path_to_cstring(rootfs)?; - unsafe { - check( - (self.krun.krun_set_root)(self.ctx_id, rootfs_c.as_ptr()), - "krun_set_root", - ) - } - } - - fn add_state_disk(&self, state_disk: &StateDiskConfig) -> Result<(), VmError> { - let Some(add_disk3) = self.krun.krun_add_disk3 else { - return Err(VmError::HostSetup( - "libkrun runtime does not expose krun_add_disk3; rebuild the VM runtime with block support" - .to_string(), - )); - }; - - let block_id_c = CString::new(state_disk.block_id.as_str())?; - let disk_path_c = 
path_to_cstring(&state_disk.path)?; - unsafe { - check( - add_disk3( - self.ctx_id, - block_id_c.as_ptr(), - disk_path_c.as_ptr(), - ffi::KRUN_DISK_FORMAT_RAW, - false, - false, - state_disk_sync_mode(), - ), - "krun_add_disk3", - ) - } - } - - fn set_workdir(&self, workdir: &str) -> Result<(), VmError> { - let workdir_c = CString::new(workdir)?; - unsafe { - check( - (self.krun.krun_set_workdir)(self.ctx_id, workdir_c.as_ptr()), - "krun_set_workdir", - ) - } - } - - fn disable_implicit_vsock(&self) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_disable_implicit_vsock)(self.ctx_id), - "krun_disable_implicit_vsock", - ) - } - } - - fn add_vsock(&self, tsi_features: u32) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_add_vsock)(self.ctx_id, tsi_features), - "krun_add_vsock", - ) - } - } - - #[cfg(target_os = "macos")] - fn add_net_unixgram( - &self, - socket_path: &Path, - mac: &[u8; 6], - features: u32, - flags: u32, - ) -> Result<(), VmError> { - let sock_c = path_to_cstring(socket_path)?; - unsafe { - check( - (self.krun.krun_add_net_unixgram)( - self.ctx_id, - sock_c.as_ptr(), - -1, - mac.as_ptr(), - features, - flags, - ), - "krun_add_net_unixgram", - ) - } - } - - #[allow(dead_code)] // FFI binding for future use (e.g. 
Linux networking) - fn add_net_unixstream( - &self, - socket_path: &Path, - mac: &[u8; 6], - features: u32, - ) -> Result<(), VmError> { - let sock_c = path_to_cstring(socket_path)?; - unsafe { - check( - (self.krun.krun_add_net_unixstream)( - self.ctx_id, - sock_c.as_ptr(), - -1, - mac.as_ptr(), - features, - 0, - ), - "krun_add_net_unixstream", - ) - } - } - - fn set_port_map(&self, port_map: &[String]) -> Result<(), VmError> { - let port_strs: Vec<&str> = port_map.iter().map(String::as_str).collect(); - let (_port_owners, port_ptrs) = c_string_array(&port_strs)?; - unsafe { - check( - (self.krun.krun_set_port_map)(self.ctx_id, port_ptrs.as_ptr()), - "krun_set_port_map", - ) - } - } - - fn add_vsock_port(&self, port: &VsockPort) -> Result<(), VmError> { - let socket_c = path_to_cstring(&port.socket_path)?; - unsafe { - check( - (self.krun.krun_add_vsock_port2)( - self.ctx_id, - port.port, - socket_c.as_ptr(), - port.listen, - ), - "krun_add_vsock_port2", - ) - } - } - - fn set_console_output(&self, path: &Path) -> Result<(), VmError> { - let console_c = path_to_cstring(path)?; - unsafe { - check( - (self.krun.krun_set_console_output)(self.ctx_id, console_c.as_ptr()), - "krun_set_console_output", - ) - } - } - - fn set_exec(&self, exec_path: &str, args: &[String], env: &[String]) -> Result<(), VmError> { - let exec_c = CString::new(exec_path)?; - let argv_strs: Vec<&str> = args.iter().map(String::as_str).collect(); - let (_argv_owners, argv_ptrs) = c_string_array(&argv_strs)?; - let env_strs: Vec<&str> = env.iter().map(String::as_str).collect(); - let (_env_owners, env_ptrs) = c_string_array(&env_strs)?; - - unsafe { - check( - (self.krun.krun_set_exec)( - self.ctx_id, - exec_c.as_ptr(), - argv_ptrs.as_ptr(), - env_ptrs.as_ptr(), - ), - "krun_set_exec", - ) - } - } - - fn start_enter(&self) -> i32 { - unsafe { (self.krun.krun_start_enter)(self.ctx_id) } - } -} - -impl Drop for VmContext { - fn drop(&mut self) { - unsafe { - let ret = 
(self.krun.krun_free_ctx)(self.ctx_id); - if ret < 0 { - eprintln!( - "warning: krun_free_ctx({}) failed with code {ret}", - self.ctx_id - ); - } - } - } -} - /// RAII guard that kills and waits on a gvproxy child process when dropped. /// /// This prevents orphaned gvproxy processes when early `?` returns in the /// launch function cause the child to be dropped before cleanup code runs. /// Call [`GvproxyGuard::disarm`] to take ownership of the child when it /// should outlive the guard (i.e., after a successful fork). -struct GvproxyGuard { +pub(crate) struct GvproxyGuard { child: Option, } impl GvproxyGuard { - fn new(child: std::process::Child) -> Self { + pub(crate) fn new(child: std::process::Child) -> Self { Self { child: Some(child) } } /// Take the child out of the guard, preventing it from being killed on drop. /// Use this after the launch is successful and the parent will manage cleanup. - fn disarm(&mut self) -> Option { + pub(crate) fn disarm(&mut self) -> Option { self.child.take() } /// Get the child's PID without disarming. - fn id(&self) -> Option { + pub(crate) fn id(&self) -> Option { self.child.as_ref().map(std::process::Child::id) } } @@ -852,7 +698,7 @@ impl Drop for GvproxyGuard { /// /// Sends a raw HTTP/1.1 POST request over the unix socket to avoid /// depending on `curl` being installed on the host. -fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { +pub(crate) fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { use std::io::{Read, Write}; use std::os::unix::net::UnixStream; @@ -908,7 +754,7 @@ fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { /// runtime state. If the state file was deleted (e.g. the user ran /// `rm -rf` on the data directory), we fall back to killing any gvproxy /// process holding the target ports. 
-fn kill_stale_gvproxy(rootfs: &Path) { +pub(crate) fn kill_stale_gvproxy(rootfs: &Path) { kill_stale_gvproxy_by_state(rootfs); } @@ -929,7 +775,7 @@ fn kill_stale_gvproxy_by_state(rootfs: &Path) { /// /// Used as a fallback when the VM state file is missing (e.g. after the /// user deleted the data directory while a VM was running). -fn kill_stale_gvproxy_by_port(port: u16) { +pub(crate) fn kill_stale_gvproxy_by_port(port: u16) { // Use lsof to find PIDs listening on the target port. let output = std::process::Command::new("lsof") .args(["-ti", &format!(":{port}")]) @@ -1009,7 +855,7 @@ fn is_process_named(_pid: libc::pid_t, _expected: &str) -> bool { false } -fn vm_rootfs_key(rootfs: &Path) -> String { +pub(crate) fn vm_rootfs_key(rootfs: &Path) -> String { let name = rootfs .file_name() .and_then(|part| part.to_str()) @@ -1078,7 +924,7 @@ fn ensure_state_disk_image(state_disk: &StateDiskConfig) -> Result<(), VmError> Ok(()) } -fn state_disk_sync_mode() -> u32 { +pub(crate) fn state_disk_sync_mode() -> u32 { #[cfg(target_os = "macos")] { ffi::KRUN_SYNC_RELAXED @@ -1154,7 +1000,7 @@ fn secure_socket_base(subdir: &str) -> Result { Ok(dir) } -fn gvproxy_socket_dir(rootfs: &Path) -> Result { +pub(crate) fn gvproxy_socket_dir(rootfs: &Path) -> Result { let dir = secure_socket_base("ovm-gv")?; // macOS unix socket path limit is tight (~104 bytes). Keep paths very short. @@ -1162,7 +1008,30 @@ fn gvproxy_socket_dir(rootfs: &Path) -> Result { Ok(dir.join(id)) } -fn gateway_host_port(config: &VmConfig) -> u16 { +/// Validate that a VFIO PCI address matches the BDF format `DDDD:BB:DD.F`. +/// +/// Rejects strings containing `/`, `..`, or non-hex characters to prevent +/// path traversal when the address is interpolated into sysfs paths. +fn validate_vfio_address(addr: &str) -> Result<(), VmError> { + let bytes = addr.as_bytes(); + if bytes.len() == 12 + && bytes[4] == b':' + && bytes[7] == b':' + && bytes[10] == b'.' 
+ && bytes[..4].iter().all(u8::is_ascii_hexdigit) + && bytes[5..7].iter().all(u8::is_ascii_hexdigit) + && bytes[8..10].iter().all(u8::is_ascii_hexdigit) + && bytes[11].is_ascii_digit() + && bytes[11] <= b'7' + { + return Ok(()); + } + Err(VmError::HostSetup(format!( + "invalid VFIO PCI address '{addr}': expected BDF format DDDD:BB:DD.F (e.g. 0000:41:00.0)" + ))) +} + +pub(crate) fn gateway_host_port(config: &VmConfig) -> u16 { config .port_map .first() @@ -1171,7 +1040,7 @@ fn gateway_host_port(config: &VmConfig) -> u16 { .unwrap_or(DEFAULT_GATEWAY_PORT) } -fn pick_gvproxy_ssh_port() -> Result { +pub(crate) fn pick_gvproxy_ssh_port() -> Result { let listener = std::net::TcpListener::bind(("127.0.0.1", 0)) .map_err(|e| VmError::HostSetup(format!("allocate gvproxy ssh port on localhost: {e}")))?; let port = listener @@ -1182,7 +1051,7 @@ fn pick_gvproxy_ssh_port() -> Result { Ok(port) } -fn path_to_cstring(path: &Path) -> Result { +pub(crate) fn path_to_cstring(path: &Path) -> Result { let s = path .to_str() .ok_or_else(|| VmError::InvalidPath(path.display().to_string()))?; @@ -1277,11 +1146,22 @@ pub fn launch(config: &VmConfig) -> Result { state_disk.path.display() ))); } - if let Some(state_disk) = &config.state_disk { + let fresh_state_disk = if let Some(state_disk) = &config.state_disk { + let existed_before = state_disk.path.is_file(); ensure_state_disk_image(state_disk)?; + !existed_before + } else { + false + }; + + // When the state disk is freshly created (deleted by user, --reset, or + // first boot), the VM will generate new PKI. Clear any cached host-side + // mTLS certs so `bootstrap_gateway` runs the cold-boot PKI fetch path + // instead of using stale certs that won't match the new VM CA. 
+ if fresh_state_disk || config.reset { + clear_warm_boot_certs(&config.gateway_name); } - let launch_start = Instant::now(); eprintln!("rootfs: {}", config.rootfs.display()); if let Some(state_disk) = &config.state_disk { eprintln!( @@ -1292,8 +1172,34 @@ pub fn launch(config: &VmConfig) -> Result { } eprintln!("vm: {} vCPU(s), {} MiB RAM", config.vcpus, config.mem_mib); - // The runtime is embedded in the binary and extracted on first use. - // Can be overridden via OPENSHELL_VM_RUNTIME_DIR for development. + raise_nofile_limit(); + + // ── Dispatch to the appropriate backend ───────────────────────── + + let use_chv = match config.backend { + VmBackendChoice::CloudHypervisor => true, + VmBackendChoice::Libkrun => false, + VmBackendChoice::Auto => config.gpu_enabled || config.vfio_device.is_some(), + }; + + if use_chv { + #[cfg(not(target_os = "linux"))] + return Err(VmError::HostSetup( + "cloud-hypervisor backend requires Linux with KVM".into(), + )); + + #[cfg(target_os = "linux")] + { + if let Some(ref addr) = config.vfio_device { + validate_vfio_address(addr)?; + } + let chv_backend = backend::cloud_hypervisor::CloudHypervisorBackend::new()?; + return backend::VmBackend::launch(&chv_backend, config); + } + } + + // libkrun path: resolve the embedded runtime bundle and load libkrun. + // Cloud-hypervisor resolves its own binaries in CloudHypervisorBackend::new(). let runtime_gvproxy = resolve_runtime_bundle()?; let runtime_dir = runtime_gvproxy.parent().ok_or_else(|| { VmError::HostSetup(format!( @@ -1302,413 +1208,12 @@ pub fn launch(config: &VmConfig) -> Result { )) })?; configure_runtime_loader_env(runtime_dir)?; - raise_nofile_limit(); - // ── Log runtime provenance ───────────────────────────────────── - // After configuring the loader, trigger library loading so that - // provenance is captured before we proceed with VM configuration. 
let _ = ffi::libkrun()?; log_runtime_provenance(runtime_dir); - // ── Configure the microVM ────────────────────────────────────── - - let vm = VmContext::create(config.log_level)?; - vm.set_vm_config(config.vcpus, config.mem_mib)?; - vm.set_root(&config.rootfs)?; - if let Some(state_disk) = &config.state_disk { - vm.add_state_disk(state_disk)?; - } - vm.set_workdir(&config.workdir)?; - - // Networking setup — use a drop guard so gvproxy is killed if we - // return early via `?` before reaching the parent's cleanup code. - let mut gvproxy_guard: Option = None; - let mut gvproxy_api_sock: Option = None; - - match &config.net { - NetBackend::Tsi => { - // Default TSI — no special setup needed. - } - NetBackend::None => { - vm.disable_implicit_vsock()?; - vm.add_vsock(0)?; - eprintln!("Networking: disabled (no TSI, no virtio-net)"); - } - NetBackend::Gvproxy { binary } => { - if !binary.exists() { - return Err(VmError::BinaryNotFound { - path: binary.display().to_string(), - hint: "Install Podman Desktop or place gvproxy in PATH".to_string(), - }); - } - - // Create temp socket paths - let run_dir = config - .rootfs - .parent() - .unwrap_or(&config.rootfs) - .to_path_buf(); - let rootfs_key = vm_rootfs_key(&config.rootfs); - let sock_base = gvproxy_socket_dir(&config.rootfs)?; - let net_sock = sock_base.with_extension("v"); - let api_sock = sock_base.with_extension("a"); - - // Kill any stale gvproxy process from a previous run. - // First try via the saved PID in the state file, then fall - // back to killing any gvproxy holding our target ports (covers - // the case where the state file was deleted). - kill_stale_gvproxy(&config.rootfs); - for pm in &config.port_map { - if let Some(host_port) = pm.split(':').next().and_then(|p| p.parse::().ok()) { - kill_stale_gvproxy_by_port(host_port); - } - } - - // Clean stale sockets (including the -krun.sock file that - // libkrun creates as its datagram endpoint on macOS). 
- let _ = std::fs::remove_file(&net_sock); - let _ = std::fs::remove_file(&api_sock); - let krun_sock = sock_base.with_extension("v-krun.sock"); - let _ = std::fs::remove_file(&krun_sock); - - // Start gvproxy - eprintln!("Starting gvproxy: {}", binary.display()); - let ssh_port = pick_gvproxy_ssh_port()?; - let gvproxy_log = run_dir.join(format!("{rootfs_key}-gvproxy.log")); - let gvproxy_log_file = std::fs::File::create(&gvproxy_log) - .map_err(|e| VmError::Fork(format!("failed to create gvproxy log: {e}")))?; - - // On Linux, gvproxy uses QEMU mode (SOCK_STREAM) since the vfkit - // unixgram scheme is macOS/vfkit-specific. On macOS, use vfkit mode. - #[cfg(target_os = "linux")] - let (gvproxy_net_flag, gvproxy_net_url) = - ("-listen-qemu", format!("unix://{}", net_sock.display())); - #[cfg(target_os = "macos")] - let (gvproxy_net_flag, gvproxy_net_url) = ( - "-listen-vfkit", - format!("unixgram://{}", net_sock.display()), - ); - - let child = std::process::Command::new(binary) - .arg(gvproxy_net_flag) - .arg(&gvproxy_net_url) - .arg("-listen") - .arg(format!("unix://{}", api_sock.display())) - .arg("-ssh-port") - .arg(ssh_port.to_string()) - .stdout(std::process::Stdio::null()) - .stderr(gvproxy_log_file) - .spawn() - .map_err(|e| VmError::Fork(format!("failed to start gvproxy: {e}")))?; - - eprintln!( - "gvproxy started (pid {}, ssh port {}) [{:.1}s]", - child.id(), - ssh_port, - launch_start.elapsed().as_secs_f64() - ); - - // Wait for the socket to appear (exponential backoff: 5ms → 100ms). 
- { - let deadline = Instant::now() + std::time::Duration::from_secs(5); - let mut interval = std::time::Duration::from_millis(5); - while !net_sock.exists() { - if Instant::now() >= deadline { - return Err(VmError::Fork( - "gvproxy socket did not appear within 5s".to_string(), - )); - } - std::thread::sleep(interval); - interval = (interval * 2).min(std::time::Duration::from_millis(100)); - } - } - - // Disable implicit TSI and add virtio-net via gvproxy - vm.disable_implicit_vsock()?; - vm.add_vsock(0)?; - // This MAC matches gvproxy's default static DHCP lease for - // 192.168.127.2. Using a different MAC can cause the gVisor - // network stack to misroute or drop packets. - let mac: [u8; 6] = [0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee]; - - // COMPAT_NET_FEATURES from libkrun.h - const NET_FEATURE_CSUM: u32 = 1 << 0; - const NET_FEATURE_GUEST_CSUM: u32 = 1 << 1; - const NET_FEATURE_GUEST_TSO4: u32 = 1 << 7; - const NET_FEATURE_GUEST_UFO: u32 = 1 << 10; - const NET_FEATURE_HOST_TSO4: u32 = 1 << 11; - const NET_FEATURE_HOST_UFO: u32 = 1 << 14; - const COMPAT_NET_FEATURES: u32 = NET_FEATURE_CSUM - | NET_FEATURE_GUEST_CSUM - | NET_FEATURE_GUEST_TSO4 - | NET_FEATURE_GUEST_UFO - | NET_FEATURE_HOST_TSO4 - | NET_FEATURE_HOST_UFO; - - // On Linux use unixstream (SOCK_STREAM) to connect to gvproxy's - // QEMU listener. On macOS use unixgram (SOCK_DGRAM) with the vfkit - // magic byte for the vfkit listener. 
- #[cfg(target_os = "linux")] - vm.add_net_unixstream(&net_sock, &mac, COMPAT_NET_FEATURES)?; - #[cfg(target_os = "macos")] - { - const NET_FLAG_VFKIT: u32 = 1 << 0; - vm.add_net_unixgram(&net_sock, &mac, COMPAT_NET_FEATURES, NET_FLAG_VFKIT)?; - } - - eprintln!( - "Networking: gvproxy (virtio-net) [{:.1}s]", - launch_start.elapsed().as_secs_f64() - ); - gvproxy_guard = Some(GvproxyGuard::new(child)); - gvproxy_api_sock = Some(api_sock); - } - } - - // Port mapping (TSI only) - if !config.port_map.is_empty() && matches!(config.net, NetBackend::Tsi) { - vm.set_port_map(&config.port_map)?; - } - - for vsock_port in &config.vsock_ports { - if let Some(parent) = vsock_port.socket_path.parent() { - std::fs::create_dir_all(parent).map_err(|e| { - VmError::RuntimeState(format!("create vsock socket dir {}: {e}", parent.display())) - })?; - } - // libkrun returns EEXIST if the socket file is already present from a - // previous run. Remove any stale socket before registering the port. - let _ = std::fs::remove_file(&vsock_port.socket_path); - vm.add_vsock_port(vsock_port)?; - } - - // Console output - let console_log = config.console_output.clone().unwrap_or_else(|| { - config - .rootfs - .parent() - .unwrap_or(&config.rootfs) - .join(format!("{}-console.log", vm_rootfs_key(&config.rootfs))) - }); - vm.set_console_output(&console_log)?; - - // envp: use provided env or minimal defaults - let mut env: Vec = if config.env.is_empty() { - vec![ - "HOME=/root", - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", - "TERM=xterm", - ] - .into_iter() - .map(ToOwned::to_owned) - .collect() - } else { - config.env.clone() - }; - if let Some(state_disk) = &config.state_disk - && !env - .iter() - .any(|entry| entry.starts_with("OPENSHELL_VM_STATE_DISK_DEVICE=")) - { - env.push(format!( - "OPENSHELL_VM_STATE_DISK_DEVICE={}", - state_disk.guest_device - )); - } - vm.set_exec(&config.exec_path, &config.args, &env)?; - - // ── Fork and enter the VM 
────────────────────────────────────── - // - // krun_start_enter() never returns — it calls exit() when the guest - // process exits. We fork so the parent can monitor and report. - - let boot_start = Instant::now(); - eprintln!("Booting microVM..."); - - let pid = unsafe { libc::fork() }; - match pid { - -1 => Err(VmError::Fork(std::io::Error::last_os_error().to_string())), - 0 => { - // Child process: enter the VM (never returns on success) - let ret = vm.start_enter(); - eprintln!("krun_start_enter failed: {ret}"); - std::process::exit(1); - } - _ => { - // Parent: wait for child - if config.exec_path == "/srv/openshell-vm-init.sh" { - let gvproxy_pid = gvproxy_guard.as_ref().and_then(GvproxyGuard::id); - if let Err(err) = - write_vm_runtime_state(&config.rootfs, pid, &console_log, gvproxy_pid) - { - unsafe { - libc::kill(pid, libc::SIGTERM); - } - // Guard drop will kill gvproxy automatically - drop(gvproxy_guard); - clear_vm_runtime_state(&config.rootfs); - return Err(err); - } - } - eprintln!( - "VM started (child pid {pid}) [{:.1}s]", - boot_start.elapsed().as_secs_f64() - ); - for pm in &config.port_map { - let host_port = pm.split(':').next().unwrap_or(pm); - eprintln!(" port {pm} -> http://localhost:{host_port}"); - } - eprintln!("Console output: {}", console_log.display()); - - // Set up gvproxy port forwarding via its HTTP API. - // The port_map entries use the same "host:guest" format - // as TSI, but here we translate them into gvproxy expose - // calls targeting the guest IP (192.168.127.2). - // - // Instead of a fixed 500ms sleep, poll the API socket with - // exponential backoff (5ms → 200ms, ~1s total budget). - if let Some(ref api_sock) = gvproxy_api_sock { - let fwd_start = Instant::now(); - // Wait for the API socket to appear (it lags slightly - // behind the vfkit data socket). 
- { - let deadline = Instant::now() + std::time::Duration::from_secs(2); - let mut interval = std::time::Duration::from_millis(5); - while !api_sock.exists() { - if Instant::now() >= deadline { - eprintln!( - "warning: gvproxy API socket not ready after 2s, attempting anyway" - ); - break; - } - std::thread::sleep(interval); - interval = (interval * 2).min(std::time::Duration::from_millis(200)); - } - } - - let guest_ip = "192.168.127.2"; - - for pm in &config.port_map { - let parts: Vec<&str> = pm.split(':').collect(); - let (host_port, guest_port) = match parts.len() { - 2 => (parts[0], parts[1]), - 1 => (parts[0], parts[0]), - _ => { - eprintln!(" skipping invalid port mapping: {pm}"); - continue; - } - }; - - let expose_body = format!( - r#"{{"local":":{host_port}","remote":"{guest_ip}:{guest_port}","protocol":"tcp"}}"# - ); - - // Retry with exponential backoff — gvproxy's internal - // netstack may not be ready immediately after socket creation. - let mut expose_ok = false; - let mut retry_interval = std::time::Duration::from_millis(100); - let expose_deadline = Instant::now() + std::time::Duration::from_secs(10); - loop { - match gvproxy_expose(api_sock, &expose_body) { - Ok(()) => { - eprintln!(" port {host_port} -> {guest_ip}:{guest_port}"); - expose_ok = true; - break; - } - Err(e) => { - if Instant::now() >= expose_deadline { - eprintln!(" port {host_port}: {e} (retries exhausted)"); - break; - } - std::thread::sleep(retry_interval); - retry_interval = - (retry_interval * 2).min(std::time::Duration::from_secs(1)); - } - } - } - if !expose_ok { - return Err(VmError::HostSetup(format!( - "failed to forward port {host_port} via gvproxy" - ))); - } - } - eprintln!( - "Port forwarding ready [{:.1}s]", - fwd_start.elapsed().as_secs_f64() - ); - } - - // Bootstrap the OpenShell control plane and wait for the - // service to be reachable. Only for the gateway preset, and - // only when port forwarding is configured (i.e. 
the gateway - // is reachable from the host). During rootfs pre-init builds, - // no --port is specified so there is nothing to health-check - // — the build script has its own kubectl-based readiness - // checks inside the VM. - if config.exec_path == "/srv/openshell-vm-init.sh" && !config.port_map.is_empty() { - // Bootstrap stores host-side metadata and mTLS creds. - // With pre-baked rootfs (Path 1) this reads PKI directly - // from virtio-fs — no kubectl or port forwarding needed. - // Cold boot (Path 2) writes secret manifests into the - // k3s auto-deploy directory via virtio-fs. - let gateway_port = gateway_host_port(config); - bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port)?; - - // Wait for the gRPC health check to pass. This ensures - // the service is fully operational, not just accepting - // TCP connections. The health check confirms the full - // path (gvproxy → kube-proxy nftables → pod:8080) and - // that the gRPC service is responding to requests. - health::wait_for_gateway_ready(gateway_port, &config.gateway_name)?; - } - - eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); - eprintln!("Press Ctrl+C to stop."); - - // Forward signals to child - unsafe { - libc::signal( - libc::SIGINT, - forward_signal as *const () as libc::sighandler_t, - ); - libc::signal( - libc::SIGTERM, - forward_signal as *const () as libc::sighandler_t, - ); - CHILD_PID.store(pid, std::sync::atomic::Ordering::Relaxed); - } - - let mut status: libc::c_int = 0; - unsafe { - libc::waitpid(pid, &raw mut status, 0); - } - - // Clean up gvproxy — disarm the guard and do explicit cleanup - // so we can print the "stopped" message. 
- if config.exec_path == "/srv/openshell-vm-init.sh" { - clear_vm_runtime_state(&config.rootfs); - } - if let Some(mut guard) = gvproxy_guard - && let Some(mut child) = guard.disarm() - { - let _ = child.kill(); - let _ = child.wait(); - eprintln!("gvproxy stopped"); - } - - if libc::WIFEXITED(status) { - let code = libc::WEXITSTATUS(status); - eprintln!("VM exited with code {code}"); - return Ok(code); - } else if libc::WIFSIGNALED(status) { - let sig = libc::WTERMSIG(status); - eprintln!("VM killed by signal {sig}"); - return Ok(128 + sig); - } - - Ok(status) - } - } + let libkrun_backend = backend::libkrun::LibkrunBackend; + backend::VmBackend::launch(&libkrun_backend, config) } // ── Post-boot bootstrap ──────────────────────────────────────────────── @@ -1727,7 +1232,11 @@ const DEFAULT_GATEWAY_PORT: u16 = 30051; /// 2. **First boot / post-reset**: poll the exec agent to `cat` each PEM file /// from `/opt/openshell/pki/` until the files exist (PKI generation has /// finished), then store them in `~/.config/openshell/gateways//mtls/`. -fn bootstrap_gateway(rootfs: &Path, gateway_name: &str, gateway_port: u16) -> Result<(), VmError> { +pub(crate) fn bootstrap_gateway( + rootfs: &Path, + gateway_name: &str, + gateway_port: u16, +) -> Result<(), VmError> { let bootstrap_start = Instant::now(); let metadata = openshell_bootstrap::GatewayMetadata { @@ -1921,6 +1430,31 @@ fn is_warm_boot(gateway_name: &str) -> bool { true } +/// Remove cached mTLS certs from the host so the next `bootstrap_gateway` +/// call treats this as a cold boot and fetches fresh PKI from the VM. +/// +/// Called when the state disk is freshly created or `--reset` is used, +/// since the VM will generate new PKI that won't match stale host certs. 
+fn clear_warm_boot_certs(gateway_name: &str) { + let Ok(home) = std::env::var("HOME") else { + return; + }; + let config_base = + std::env::var("XDG_CONFIG_HOME").unwrap_or_else(|_| format!("{home}/.config")); + let mtls_dir = PathBuf::from(&config_base) + .join("openshell/gateways") + .join(gateway_name) + .join("mtls"); + + if mtls_dir.is_dir() { + if let Err(e) = std::fs::remove_dir_all(&mtls_dir) { + eprintln!("Warning: failed to clear stale mTLS certs: {e}"); + } else { + eprintln!("Cleared stale host mTLS certs"); + } + } +} + /// Compare the CA cert on the rootfs (authoritative source) against the /// host-side copy. If they differ, re-copy all client certs from the rootfs. /// @@ -1956,9 +1490,9 @@ fn sync_host_certs_if_stale( Ok(()) } -static CHILD_PID: std::sync::atomic::AtomicI32 = std::sync::atomic::AtomicI32::new(0); +pub(crate) static CHILD_PID: std::sync::atomic::AtomicI32 = std::sync::atomic::AtomicI32::new(0); -extern "C" fn forward_signal(_sig: libc::c_int) { +pub(crate) extern "C" fn forward_signal(_sig: libc::c_int) { let pid = CHILD_PID.load(std::sync::atomic::Ordering::Relaxed); if pid > 0 { unsafe { diff --git a/crates/openshell-vm/src/main.rs b/crates/openshell-vm/src/main.rs index bb9d854b1..1b3aa6423 100644 --- a/crates/openshell-vm/src/main.rs +++ b/crates/openshell-vm/src/main.rs @@ -92,6 +92,16 @@ struct Cli { /// unclean shutdown. #[arg(long)] reset: bool, + + /// Enable GPU passthrough. Optionally specify a PCI address + /// (e.g. `0000:41:00.0`). Uses cloud-hypervisor backend with VFIO. + #[arg(long, num_args = 0..=1, default_missing_value = "auto")] + gpu: Option, + + /// Hypervisor backend: "auto" (default), "libkrun", or "cloud-hypervisor". + /// Auto selects cloud-hypervisor when --gpu is set, libkrun otherwise. 
+ #[arg(long, default_value = "auto")] + backend: String, } #[derive(Subcommand)] @@ -196,12 +206,16 @@ fn run(cli: Cli) -> Result> { return Err("openshell-vm exec requires a command when stdin is not a TTY".into()); } } + let exec_rootfs = if let Some(explicit) = cli.rootfs { + explicit + } else if cli.gpu.is_some() { + openshell_vm::named_gpu_rootfs_dir(&cli.name)? + } else { + openshell_vm::named_rootfs_dir(&cli.name)? + }; return Ok(openshell_vm::exec_running_vm( openshell_vm::VmExecOptions { - rootfs: Some( - cli.rootfs - .unwrap_or(openshell_vm::named_rootfs_dir(&cli.name)?), - ), + rootfs: Some(exec_rootfs), command, workdir, env, @@ -223,12 +237,59 @@ fn run(cli: Cli) -> Result> { } }; - let rootfs = cli - .rootfs - .map_or_else(|| openshell_vm::ensure_named_rootfs(&cli.name), Ok)?; + let rootfs = if let Some(explicit) = cli.rootfs { + Ok(explicit) + } else if cli.gpu.is_some() { + openshell_vm::ensure_gpu_rootfs(&cli.name) + } else { + openshell_vm::ensure_named_rootfs(&cli.name) + }?; let gateway_name = openshell_vm::gateway_name(&cli.name)?; + let (gpu_enabled, vfio_device, _gpu_guard) = match cli.gpu { + Some(ref addr) if addr != "auto" => { + let state = openshell_vm::gpu_passthrough::prepare_gpu_for_passthrough(Some(addr))?; + let bdf = state.pci_addr.clone(); + ( + true, + Some(bdf), + Some(openshell_vm::gpu_passthrough::GpuBindGuard::new(state)), + ) + } + Some(_) => { + let state = openshell_vm::gpu_passthrough::prepare_gpu_for_passthrough(None)?; + let bdf = state.pci_addr.clone(); + ( + true, + Some(bdf), + Some(openshell_vm::gpu_passthrough::GpuBindGuard::new(state)), + ) + } + None => (false, None, None), + }; + + let backend_choice = match cli.backend.as_str() { + "cloud-hypervisor" | "chv" => openshell_vm::VmBackendChoice::CloudHypervisor, + "libkrun" => { + if gpu_enabled { + return Err( + "--backend libkrun is incompatible with --gpu (libkrun does not support \ + VFIO passthrough). Use --backend auto or --backend cloud-hypervisor." 
+ .into(), + ); + } + openshell_vm::VmBackendChoice::Libkrun + } + "auto" => openshell_vm::VmBackendChoice::Auto, + other => { + return Err(format!( + "unknown --backend: {other} (expected: auto, libkrun, cloud-hypervisor)" + ) + .into()); + } + }; + let mut config = if let Some(exec_path) = cli.exec { openshell_vm::VmConfig { rootfs, @@ -246,6 +307,9 @@ fn run(cli: Cli) -> Result> { reset: cli.reset, gateway_name, state_disk: None, + gpu_enabled, + vfio_device, + backend: backend_choice, } } else { let mut c = openshell_vm::VmConfig::gateway(rootfs); @@ -261,6 +325,9 @@ fn run(cli: Cli) -> Result> { c.net = net_backend; c.reset = cli.reset; c.gateway_name = gateway_name; + c.gpu_enabled = gpu_enabled; + c.vfio_device = vfio_device; + c.backend = backend_choice; if state_disk_disabled() { c.state_disk = None; } diff --git a/crates/openshell-vm/tests/gpu_passthrough_implementation.rs b/crates/openshell-vm/tests/gpu_passthrough_implementation.rs new file mode 100644 index 000000000..4985ba39b --- /dev/null +++ b/crates/openshell-vm/tests/gpu_passthrough_implementation.rs @@ -0,0 +1,114 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Integration tests for GPU passthrough on real hardware. +//! +//! Gated by `OPENSHELL_VM_GPU_E2E=1`. On machines without a real GPU, +//! all tests early-return and pass. 
+ +use openshell_vm::gpu_passthrough::{ + GpuBindGuard, HostNvidiaVfioReadiness, prepare_gpu_for_passthrough, + probe_host_nvidia_vfio_readiness, +}; + +fn gpu_e2e_enabled() -> bool { + std::env::var("OPENSHELL_VM_GPU_E2E").as_deref() == Ok("1") +} + +#[test] +fn nvidia_gpu_passthrough_is_available() { + if !gpu_e2e_enabled() { + eprintln!("OPENSHELL_VM_GPU_E2E not set — skipping GPU passthrough gate test"); + return; + } + assert!( + openshell_vm::gpu_passthrough::nvidia_gpu_available_for_vm_passthrough(), + "GPU passthrough gate returned false on a GPU CI runner — \ + check VFIO binding and cloud-hypervisor runtime bundle" + ); +} + +#[test] +fn bind_and_rebind_real_gpu() { + if !gpu_e2e_enabled() { + return; + } + + let state = prepare_gpu_for_passthrough(None).expect("should find and bind a GPU"); + + let results = probe_host_nvidia_vfio_readiness(); + let (_, readiness) = results + .iter() + .find(|(a, _)| a == &state.pci_addr) + .expect("bound GPU should appear in probe"); + assert_eq!(*readiness, HostNvidiaVfioReadiness::VfioBoundReady); + + state.restore().expect("restore should succeed"); + + let results = probe_host_nvidia_vfio_readiness(); + let (_, readiness) = results + .iter() + .find(|(a, _)| a == &state.pci_addr) + .expect("restored GPU should appear in probe"); + assert_eq!(*readiness, HostNvidiaVfioReadiness::BoundToNvidia); +} + +#[test] +fn safety_checks_pass_on_ci_gpu() { + if !gpu_e2e_enabled() { + return; + } + + // `prepare_gpu_for_passthrough` runs all safety checks internally + // (display-attached, IOMMU enabled, VFIO modules loaded, sysfs + // permissions). Success here validates that the CI GPU is headless, + // IOMMU is on, and VFIO modules are loaded. 
+ let state = prepare_gpu_for_passthrough(None) + .expect("all safety checks should pass on a headless CI GPU"); + assert!(!state.pci_addr.is_empty()); + + state.restore().expect("restore should succeed"); +} + +#[test] +fn guard_restores_on_drop_real_gpu() { + if !gpu_e2e_enabled() { + return; + } + + let state = prepare_gpu_for_passthrough(None).expect("should find and bind a GPU"); + let pci_addr = state.pci_addr.clone(); + + let guard = GpuBindGuard::new(state); + drop(guard); + + let output = std::process::Command::new("nvidia-smi") + .arg("--query-gpu=pci.bus_id") + .arg("--format=csv,noheader") + .output() + .expect("nvidia-smi should be available after guard drop"); + assert!( + output.status.success(), + "nvidia-smi failed after guard drop" + ); + + let stdout = String::from_utf8_lossy(&output.stdout); + let normalized_addr = pci_addr.to_uppercase(); + assert!( + stdout.to_uppercase().contains(&normalized_addr), + "nvidia-smi should list the restored GPU {pci_addr}, got: {stdout}" + ); +} + +#[test] +fn auto_select_finds_ci_gpu() { + if !gpu_e2e_enabled() { + return; + } + + let state = prepare_gpu_for_passthrough(None).expect("auto-select should find a GPU on CI"); + assert!(!state.pci_addr.is_empty()); + assert!(state.did_bind); + + state.restore().expect("restore should succeed"); +} diff --git a/crates/openshell-vm/tests/vm_boot_smoke.rs b/crates/openshell-vm/tests/vm_boot_smoke.rs new file mode 100644 index 000000000..ffdb16595 --- /dev/null +++ b/crates/openshell-vm/tests/vm_boot_smoke.rs @@ -0,0 +1,151 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Non-GPU cloud-hypervisor boot smoke test. +//! +//! Boots a cloud-hypervisor VM **without** VFIO/GPU passthrough and verifies +//! the kernel boots and init runs. This catches backend regressions on regular +//! CI runners that lack GPU hardware. +//! +//! 
Gated on `OPENSHELL_VM_BACKEND=cloud-hypervisor` — skipped when the env +//! var is absent or set to a different backend. +//! +//! Requires the VM runtime bundle (cloud-hypervisor, vmlinux, virtiofsd, +//! rootfs) to be installed. Set `OPENSHELL_VM_RUNTIME_DIR` or run +//! `mise run vm:bundle-runtime` first. +//! +//! Run explicitly: +//! +//! ```sh +//! OPENSHELL_VM_BACKEND=cloud-hypervisor cargo test -p openshell-vm --test vm_boot_smoke +//! ``` + +#![allow(unsafe_code)] + +use std::process::{Command, Stdio}; +use std::time::Duration; + +const GATEWAY: &str = env!("CARGO_BIN_EXE_openshell-vm"); + +fn runtime_bundle_dir() -> std::path::PathBuf { + std::path::Path::new(GATEWAY) + .parent() + .expect("openshell-vm binary has no parent") + .join("openshell-vm.runtime") +} + +fn skip_unless_chv() -> bool { + if std::env::var("OPENSHELL_VM_BACKEND").as_deref() != Ok("cloud-hypervisor") { + eprintln!("OPENSHELL_VM_BACKEND != cloud-hypervisor — skipping"); + return true; + } + false +} + +fn require_bundle() { + let bundle = runtime_bundle_dir(); + if !bundle.is_dir() { + panic!( + "VM runtime bundle not found at {}. Run `mise run vm:bundle-runtime` first.", + bundle.display() + ); + } +} + +#[test] +fn cloud_hypervisor_exec_exits_cleanly() { + if skip_unless_chv() { + return; + } + require_bundle(); + + // Boot with --exec /bin/true --net none. The cloud-hypervisor backend + // wraps the exec command in a script that calls `poweroff -f` after + // completion, causing a clean ACPI shutdown. + let mut child = Command::new(GATEWAY) + .args([ + "--backend", + "cloud-hypervisor", + "--net", + "none", + "--exec", + "/bin/true", + ]) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .expect("failed to start openshell-vm"); + + // The VM should boot, run /bin/true, and exit within ~5s. + // Give 30s for slow CI. 
+ let timeout = Duration::from_secs(30); + let start = std::time::Instant::now(); + + loop { + match child.try_wait() { + Ok(Some(status)) => { + assert!( + status.success(), + "cloud-hypervisor --exec /bin/true exited with {status}" + ); + return; + } + Ok(None) => { + if start.elapsed() > timeout { + let _ = unsafe { libc::kill(child.id() as i32, libc::SIGKILL) }; + let _ = child.wait(); + panic!("cloud-hypervisor VM did not exit within {timeout:?}"); + } + std::thread::sleep(Duration::from_millis(500)); + } + Err(e) => panic!("error waiting for openshell-vm: {e}"), + } + } +} + +#[test] +fn cloud_hypervisor_boots_without_gpu() { + if skip_unless_chv() { + return; + } + require_bundle(); + + // Full gateway boot requires TAP networking (root/CAP_NET_ADMIN). + // Skip unless running as root. + if !nix_is_root() { + eprintln!("skipping full gateway boot — requires root for TAP networking"); + return; + } + + let mut child = Command::new(GATEWAY) + .args(["--backend", "cloud-hypervisor"]) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .expect("failed to start openshell-vm"); + + let addr: std::net::SocketAddr = ([127, 0, 0, 1], 30051).into(); + let timeout = Duration::from_secs(180); + let start = std::time::Instant::now(); + let mut reachable = false; + + while start.elapsed() < timeout { + if std::net::TcpStream::connect_timeout(&addr, Duration::from_secs(1)).is_ok() { + reachable = true; + break; + } + std::thread::sleep(Duration::from_secs(2)); + } + + let _ = unsafe { libc::kill(child.id() as i32, libc::SIGTERM) }; + let _ = child.wait(); + + assert!( + reachable, + "cloud-hypervisor VM service on port 30051 not reachable within {timeout:?}" + ); +} + +fn nix_is_root() -> bool { + unsafe { libc::geteuid() == 0 } +} diff --git a/tasks/scripts/vm/build-cloud-hypervisor.sh b/tasks/scripts/vm/build-cloud-hypervisor.sh new file mode 100755 index 000000000..af0c913b1 --- /dev/null +++ b/tasks/scripts/vm/build-cloud-hypervisor.sh @@ -0,0 +1,75 @@ 
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Fetch the cloud-hypervisor and virtiofsd binaries for GPU passthrough.
+#
+# These are only needed on Linux for VFIO GPU passthrough via the
+# cloud-hypervisor backend. cloud-hypervisor is downloaded pre-built from
+# its GitHub release page; virtiofsd is built from its GitLab source tarball.
+#
+# Usage:
+#   ./build-cloud-hypervisor.sh [--output-dir <dir>]
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${SCRIPT_DIR}/_lib.sh"
+ROOT="$(vm_lib_root)"
+
+source "${ROOT}/crates/openshell-vm/pins.env" 2>/dev/null || true
+
+CLOUD_HYPERVISOR_VERSION="${CLOUD_HYPERVISOR_VERSION:-v42.0}"
+VIRTIOFSD_VERSION="${VIRTIOFSD_VERSION:-v1.13.0}"
+OUTPUT_DIR="${ROOT}/target/libkrun-build"
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --output-dir) OUTPUT_DIR="$2"; shift 2 ;;
+    *) echo "Unknown argument: $1" >&2; exit 1 ;;
+  esac
+done
+
+if [ "$(uname -s)" != "Linux" ]; then
+  echo "Error: cloud-hypervisor GPU passthrough is Linux-only" >&2
+  exit 1
+fi
+
+mkdir -p "$OUTPUT_DIR"
+
+HOST_ARCH="$(uname -m)"
+case "$HOST_ARCH" in
+  aarch64) CHV_ARCH="aarch64"; VIRTIOFSD_ARCH="aarch64" ;;
+  x86_64) CHV_ARCH="x86_64"; VIRTIOFSD_ARCH="x86_64" ;;
+  *) echo "Error: Unsupported architecture: ${HOST_ARCH}" >&2; exit 1 ;;
+esac
+
+echo "==> Downloading cloud-hypervisor ${CLOUD_HYPERVISOR_VERSION} for ${HOST_ARCH}..."
+CHV_URL="https://github.com/cloud-hypervisor/cloud-hypervisor/releases/download/${CLOUD_HYPERVISOR_VERSION}/cloud-hypervisor-static" +if [ "$CHV_ARCH" = "aarch64" ]; then + CHV_URL="https://github.com/cloud-hypervisor/cloud-hypervisor/releases/download/${CLOUD_HYPERVISOR_VERSION}/cloud-hypervisor-static-aarch64" +fi + +curl -fsSL -o "${OUTPUT_DIR}/cloud-hypervisor" "$CHV_URL" +chmod +x "${OUTPUT_DIR}/cloud-hypervisor" +echo " Downloaded: cloud-hypervisor" + +echo "==> Building virtiofsd ${VIRTIOFSD_VERSION} from source..." +VIRTIOFSD_SRC="$(mktemp -d)" +VIRTIOFSD_TARBALL_URL="https://gitlab.com/virtio-fs/virtiofsd/-/archive/${VIRTIOFSD_VERSION}/virtiofsd-${VIRTIOFSD_VERSION}.tar.gz" +curl -fsSL "$VIRTIOFSD_TARBALL_URL" | tar -xzf - -C "$VIRTIOFSD_SRC" --strip-components=1 +rm -f "${VIRTIOFSD_SRC}/Cargo.lock" + +CARGO_CMD="cargo" +if command -v mise &>/dev/null; then + CARGO_CMD="mise exec -- cargo" +fi +$CARGO_CMD build --release --manifest-path "${VIRTIOFSD_SRC}/Cargo.toml" +cp "${VIRTIOFSD_SRC}/target/release/virtiofsd" "${OUTPUT_DIR}/virtiofsd" +chmod +x "${OUTPUT_DIR}/virtiofsd" +rm -rf "$VIRTIOFSD_SRC" +echo " Built: virtiofsd" + +echo "" +echo "==> GPU passthrough binaries ready in ${OUTPUT_DIR}" +ls -lah "${OUTPUT_DIR}/cloud-hypervisor" "${OUTPUT_DIR}/virtiofsd" 2>/dev/null || true diff --git a/tasks/scripts/vm/build-libkrun.sh b/tasks/scripts/vm/build-libkrun.sh index 9e2217f50..621332366 100755 --- a/tasks/scripts/vm/build-libkrun.sh +++ b/tasks/scripts/vm/build-libkrun.sh @@ -239,6 +239,18 @@ make -j"$(nproc)" cp libkrunfw.so* "$OUTPUT_DIR/" echo " Built: $(ls "$OUTPUT_DIR"/libkrunfw.so* | xargs -n1 basename | tr '\n' ' ')" +# Copy vmlinux kernel image for cloud-hypervisor GPU passthrough. +# This is the uncompressed kernel built by libkrunfw's kernel build. 
+if [ -f "${KERNEL_SOURCES}/vmlinux" ]; then + cp "${KERNEL_SOURCES}/vmlinux" "$OUTPUT_DIR/vmlinux" + echo " Copied vmlinux for cloud-hypervisor GPU passthrough" +elif [ -f "vmlinux" ]; then + cp "vmlinux" "$OUTPUT_DIR/vmlinux" + echo " Copied vmlinux for cloud-hypervisor GPU passthrough" +else + echo " Warning: vmlinux not found in kernel build tree (GPU passthrough will not be available)" >&2 +fi + cd "$BUILD_DIR" # ── Build libkrun (VMM) ───────────────────────────────────────────────── diff --git a/tasks/scripts/vm/download-kernel-runtime.sh b/tasks/scripts/vm/download-kernel-runtime.sh index 8f0427af9..5e60d3c75 100755 --- a/tasks/scripts/vm/download-kernel-runtime.sh +++ b/tasks/scripts/vm/download-kernel-runtime.sh @@ -81,11 +81,11 @@ DOWNLOAD_DIR="${ROOT}/target/vm-runtime-download" mkdir -p "$DOWNLOAD_DIR" "$OUTPUT_DIR" echo "==> Downloading ${TARBALL_NAME} from ${RELEASE_TAG}..." +rm -f "${DOWNLOAD_DIR}/${TARBALL_NAME}" gh release download "${RELEASE_TAG}" \ --repo "${REPO}" \ --pattern "${TARBALL_NAME}" \ - --dir "${DOWNLOAD_DIR}" \ - --clobber + --dir "${DOWNLOAD_DIR}" if [ ! -f "${DOWNLOAD_DIR}/${TARBALL_NAME}" ]; then echo "Error: Download failed — ${TARBALL_NAME} not found." 
>&2 diff --git a/tasks/scripts/vm/package-vm-runtime.sh b/tasks/scripts/vm/package-vm-runtime.sh index f97eec870..8b09c91ba 100755 --- a/tasks/scripts/vm/package-vm-runtime.sh +++ b/tasks/scripts/vm/package-vm-runtime.sh @@ -84,6 +84,13 @@ case "$PLATFORM" in versioned="$(ls "${PACKAGE_DIR}"/libkrunfw.so.5.* 2>/dev/null | head -n1 || true)" [ -n "$versioned" ] && cp "$versioned" "${PACKAGE_DIR}/libkrunfw.so.5" fi + # GPU passthrough binaries (optional — only included if present) + for gpu_bin in cloud-hypervisor vmlinux virtiofsd; do + if [ -f "${BUILD_DIR}/${gpu_bin}" ]; then + cp "${BUILD_DIR}/${gpu_bin}" "${PACKAGE_DIR}/" + echo " Included GPU passthrough binary: ${gpu_bin}" + fi + done ;; darwin-aarch64) cp "${BUILD_DIR}/libkrun.dylib" "${PACKAGE_DIR}/" diff --git a/tasks/scripts/vm/sync-vm-rootfs.sh b/tasks/scripts/vm/sync-vm-rootfs.sh index 727a9dd18..2c22e360b 100755 --- a/tasks/scripts/vm/sync-vm-rootfs.sh +++ b/tasks/scripts/vm/sync-vm-rootfs.sh @@ -141,6 +141,22 @@ fi patch_vm_helmchart "${MANIFEST_DST}/openshell-helmchart.yaml" patch_vm_helmchart "${ROOTFS_DIR}/var/lib/rancher/k3s/server/manifests/openshell-helmchart.yaml" +# ── GPU manifests ────────────────────────────────────────────────────── +# Only sync if the rootfs was built with --gpu (sentinel file present). +GPU_MANIFEST_SRC="${ROOT}/crates/openshell-vm/scripts/gpu-manifests" +GPU_MANIFEST_DST="${ROOTFS_DIR}/opt/openshell/gpu-manifests" +if [ -f "${ROOTFS_DIR}/opt/openshell/.rootfs-gpu" ] && [ -d "${GPU_MANIFEST_SRC}" ]; then + mkdir -p "${GPU_MANIFEST_DST}" + for manifest in "${GPU_MANIFEST_SRC}"/*.yaml; do + [ -f "$manifest" ] || continue + base=$(basename "$manifest") + if ! 
cmp -s "$manifest" "${GPU_MANIFEST_DST}/${base}" 2>/dev/null; then + cp "$manifest" "${GPU_MANIFEST_DST}/${base}" + echo " updated: /opt/openshell/gpu-manifests/${base}" + fi + done +fi + # ── Gateway image tarball ────────────────────────────────────────────── # The VM rootfs airgap-imports openshell/gateway:dev from k3s/agent/images/. # Keep that tarball in sync with the local Docker image so `mise run e2e:vm`