diff --git a/.github/workflows/test-gpu.yml b/.github/workflows/test-gpu.yml index df953b5d3..6dd98b1cd 100644 --- a/.github/workflows/test-gpu.yml +++ b/.github/workflows/test-gpu.yml @@ -22,7 +22,7 @@ jobs: - id: get_pr_info if: github.event_name == 'push' continue-on-error: true - uses: nv-gha-runners/get-pr-info@main + uses: nv-gha-runners/get-pr-info@090577647b8ddc4e06e809e264f7881650ecdccf - id: gate shell: bash diff --git a/Cargo.lock b/Cargo.lock index e4057f75c..d347ff86c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3028,6 +3028,7 @@ dependencies = [ "openshell-prover", "openshell-providers", "openshell-tui", + "openshell-vm", "owo-colors", "prost-types", "rcgen", @@ -3288,6 +3289,7 @@ dependencies = [ "serde", "serde_json", "tar", + "tempfile", "thiserror 2.0.18", "tokio", "tokio-rustls", diff --git a/architecture/README.md b/architecture/README.md index 570fce660..45457d37c 100644 --- a/architecture/README.md +++ b/architecture/README.md @@ -301,4 +301,6 @@ This opens an interactive SSH session into the sandbox, with all provider creden | [Inference Routing](inference-routing.md) | Transparent interception and sandbox-local routing of AI inference API calls to configured backends. | | [System Architecture](system-architecture.md) | Top-level system architecture diagram with all deployable components and communication flows. | | [Gateway Settings Channel](gateway-settings.md) | Runtime settings channel: two-tier key-value configuration, global policy override, settings registry, CLI/TUI commands. | +| [Custom VM Runtime](custom-vm-runtime.md) | Dual-backend VM runtime (libkrun / cloud-hypervisor), kernel configuration, and build pipeline. | +| [VM GPU Passthrough](vm-gpu-passthrough.md) | VFIO GPU passthrough for VMs: host preparation, safety checks, nvidia driver hardening, and troubleshooting. | | [TUI](tui.md) | Terminal user interface for sandbox interaction. 
| diff --git a/architecture/custom-vm-runtime.md b/architecture/custom-vm-runtime.md index ce4d0bf39..6dac41064 100644 --- a/architecture/custom-vm-runtime.md +++ b/architecture/custom-vm-runtime.md @@ -1,18 +1,31 @@ -# Custom libkrunfw VM Runtime +# Custom VM Runtime > Status: Experimental and work in progress (WIP). VM support is under active development and may change. ## Overview -The OpenShell gateway VM uses [libkrun](https://github.com/containers/libkrun) to boot a -lightweight microVM with Apple Hypervisor.framework (macOS) or KVM (Linux). The kernel -is embedded inside `libkrunfw`, a companion library that packages a pre-built Linux kernel. +The OpenShell gateway VM supports two hypervisor backends: -The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, netfilter, or -conntrack support. This is insufficient for Kubernetes pod networking. +- **libkrun** (default) — lightweight VMM using Apple Hypervisor.framework (macOS) or KVM + (Linux). The kernel is embedded inside `libkrunfw`. Uses virtio-MMIO device transport and + gvproxy for user-space networking. +- **cloud-hypervisor** — Linux-only KVM-based VMM used for GPU passthrough (VFIO). Uses + virtio-PCI device transport, TAP networking, and requires a separate `vmlinux` kernel and + `virtiofsd` for rootfs access. + +Backend selection is automatic: `--gpu` selects cloud-hypervisor, otherwise libkrun is used. +The `--backend` flag provides explicit control (`auto`, `libkrun`, `cloud-hypervisor`). + +When `--gpu` is passed, `openshell-vm` automatically binds an eligible GPU to `vfio-pci` +and restores it to the original driver on shutdown. See +[vm-gpu-passthrough.md](vm-gpu-passthrough.md) for the full lifecycle description. + +Both backends share the same guest kernel (built from a single `openshell.kconfig` fragment) +and rootfs. -The custom libkrunfw runtime adds bridge CNI, iptables/nftables, and conntrack support to -the VM kernel, enabling standard Kubernetes networking. 
+The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, netfilter, or +conntrack support. This is insufficient for Kubernetes pod networking. The custom kconfig +adds bridge CNI, iptables/nftables, conntrack, and cloud-hypervisor compatibility. ## Architecture @@ -20,10 +33,11 @@ the VM kernel, enabling standard Kubernetes networking. graph TD subgraph Host["Host (macOS / Linux)"] BIN[openshell-vm binary] - EMB["Embedded runtime (zstd-compressed)\nlibkrun · libkrunfw · gvproxy"] + EMB["Embedded runtime (zstd-compressed)\nlibkrun · libkrunfw · gvproxy · rootfs"] CACHE["~/.local/share/openshell/vm-runtime/{version}/"] PROV[Runtime provenance logging] GVP[gvproxy networking proxy] + CHV_BIN["cloud-hypervisor · virtiofsd · vmlinux\n(GPU runtime bundle)"] BIN --> EMB BIN -->|extracts to| CACHE @@ -44,8 +58,9 @@ graph TD INIT --> VAL --> CNI --> EXECA --> PKI --> K3S end - BIN -- "fork + krun_start_enter" --> INIT - GVP -- "virtio-net" --> Guest + BIN -- "libkrun: fork + krun_start_enter" --> INIT + BIN -- "CHV: cloud-hypervisor API + virtiofsd" --> INIT + GVP -- "virtio-net (libkrun only)" --> Guest ``` ## Embedded Runtime @@ -67,9 +82,23 @@ these to XDG cache directories with progress bars: └── ... ``` -This eliminates the need for separate bundles or downloads - a single ~120MB binary -provides everything needed to run the VM. Old cache versions are automatically -cleaned up when a new version is extracted. +When using cloud-hypervisor, an additional runtime bundle is required alongside the +binary: + +``` +target/debug/openshell-vm.runtime/ (or alongside the installed binary) +├── cloud-hypervisor # CHV binary +├── virtiofsd # virtio-fs daemon +└── vmlinux # extracted guest kernel +``` + +This bundle is built with `mise run vm:bundle-runtime` and is separate from the +embedded runtime because CHV and virtiofsd are Linux-only and not embedded in the +self-extracting binary. 
+ +This eliminates the need for separate bundles or downloads for the default (libkrun) +path — a single ~120MB binary provides everything needed. Old cache versions are +automatically cleaned up when a new version is extracted. ### Hybrid Approach @@ -86,6 +115,31 @@ mise run vm:rootfs # Full rootfs (~2GB, includes images) mise run vm:build # Rebuild binary with full rootfs ``` +## Backend Comparison + +| | libkrun (default) | cloud-hypervisor | +|---|---|---| +| Platforms | macOS (Hypervisor.framework), Linux (KVM) | Linux (KVM) only | +| Device transport | virtio-MMIO | virtio-PCI | +| Networking | gvproxy (user-space, no root needed) | TAP (requires root/CAP_NET_ADMIN) | +| Rootfs delivery | In-process (krun API) | virtiofsd (virtio-fs daemon) | +| Kernel delivery | Embedded in libkrunfw | Separate `vmlinux` file | +| Console | virtio-console (`hvc0`) | 8250 UART (`ttyS0`) | +| Shutdown | Automatic on PID 1 exit | ACPI poweroff (`poweroff -f`) | +| GPU passthrough | Not supported | VFIO PCI passthrough | +| `--exec` mode | Direct init replacement | Wrapper script with ACPI shutdown | +| CLI flag | `--backend libkrun` | `--backend cloud-hypervisor` or `--gpu` | + +### Exec mode differences + +With libkrun, when `--exec <command>` is used, the command replaces the init process and +the VM exits when PID 1 exits. + +With cloud-hypervisor, the VM does not automatically exit when PID 1 terminates. A +wrapper init script is dynamically written to the guest rootfs that mounts necessary +filesystems, executes the user command, captures the exit code, and calls +`poweroff -f` to trigger an ACPI shutdown that cloud-hypervisor detects. + ## Network Profile The VM uses the bridge CNI profile, which requires a custom libkrunfw with bridge and @@ -100,6 +154,26 @@ fast with an actionable error if they are missing. 
- Service VIPs: functional (ClusterIP, NodePort) - hostNetwork workarounds: not required +### Networking by backend + +- **libkrun**: Uses gvproxy for user-space virtio-net networking. No root privileges + needed. Port forwarding is handled via gvproxy configuration. +- **cloud-hypervisor**: Uses TAP networking (requires root or CAP_NET_ADMIN). When + `--net none` is passed, networking is disabled entirely (useful for `--exec` mode + tests). gvproxy is not used with cloud-hypervisor. + +## Guest Init Script + +The init script (`openshell-vm-init.sh`) runs as PID 1 in the guest. After mounting essential filesystems, it performs: + +1. **Kernel cmdline parsing** — exports environment variables passed via the kernel command line (`GPU_ENABLED`, `OPENSHELL_VM_STATE_DISK_DEVICE`, `VM_NET_IP`, `VM_NET_GW`, `VM_NET_DNS`). This runs after `/proc` is mounted so `/proc/cmdline` is available. + +2. **Cgroup v2 controller enablement** — enables `cpu`, `cpuset`, `memory`, `pids`, and `io` controllers in the root cgroup hierarchy (`cgroup.subtree_control`). k3s/kubelet requires these controllers; the `cpu` controller depends on `CONFIG_CGROUP_SCHED` in the kernel. + +3. **Networking** — detects `eth0` and attempts DHCP (via `udhcpc`). On failure, falls back to static IP configuration using `VM_NET_IP` and `VM_NET_GW` from the kernel cmdline (set by the CHV backend for TAP networking). DNS is configured from `VM_NET_DNS` if set, overriding any stale `/etc/resolv.conf` entries. + +4. **Capability validation** — verifies required kernel features (bridge networking, netfilter, cgroups) and fails fast with actionable errors if missing. 
+ ## Runtime Provenance At boot, the openshell-vm binary logs provenance metadata about the loaded runtime bundle: @@ -128,21 +202,35 @@ graph LR BUILD_M["Build libkrunfw.dylib + libkrun.dylib"] end + subgraph CHV["Linux CI (build-cloud-hypervisor.sh)"] + BUILD_CHV["Build cloud-hypervisor + virtiofsd"] + end + subgraph Output["target/libkrun-build/"] LIB_SO["libkrunfw.so + libkrun.so\n(Linux)"] LIB_DY["libkrunfw.dylib + libkrun.dylib\n(macOS)"] + CHV_OUT["cloud-hypervisor + virtiofsd\n(Linux)"] + VMLINUX["vmlinux\n(extracted from libkrunfw)"] end KCONF --> BUILD_L BUILD_L --> LIB_SO + BUILD_L --> VMLINUX KCONF --> BUILD_M BUILD_M --> LIB_DY + BUILD_CHV --> CHV_OUT ``` +The `vmlinux` kernel is extracted from the libkrunfw build and reused by cloud-hypervisor. +Both backends boot the same kernel — the kconfig fragment includes drivers for both +virtio-MMIO (libkrun) and virtio-PCI (CHV) transports. + ## Kernel Config Fragment The `openshell.kconfig` fragment enables these kernel features on top of the stock -libkrunfw kernel: +libkrunfw kernel. A single kernel binary is shared by both libkrun and cloud-hypervisor — +backend-specific drivers coexist safely (the kernel probes whichever transport the +hypervisor provides). 
| Feature | Key Configs | Purpose | |---------|-------------|---------| @@ -158,11 +246,18 @@ libkrunfw kernel: | IP forwarding | `CONFIG_IP_ADVANCED_ROUTER`, `CONFIG_IP_MULTIPLE_TABLES` | Pod-to-pod routing | | IPVS | `CONFIG_IP_VS`, `CONFIG_IP_VS_RR`, `CONFIG_IP_VS_NFCT` | kube-proxy IPVS mode (optional) | | Traffic control | `CONFIG_NET_SCH_HTB`, `CONFIG_NET_CLS_CGROUP` | Kubernetes QoS | -| Cgroups | `CONFIG_CGROUPS`, `CONFIG_CGROUP_DEVICE`, `CONFIG_MEMCG`, `CONFIG_CGROUP_PIDS` | Container resource limits | +| Cgroups | `CONFIG_CGROUPS`, `CONFIG_CGROUP_DEVICE`, `CONFIG_CGROUP_CPUACCT`, `CONFIG_MEMCG`, `CONFIG_CGROUP_PIDS`, `CONFIG_CGROUP_FREEZER` | Container resource limits | +| Cgroup CPU | `CONFIG_CGROUP_SCHED`, `CONFIG_FAIR_GROUP_SCHED`, `CONFIG_CFS_BANDWIDTH` | cgroup v2 `cpu` controller for k3s/kubelet | | TUN/TAP | `CONFIG_TUN` | CNI plugin support | | Dummy interface | `CONFIG_DUMMY` | Fallback networking | | Landlock | `CONFIG_SECURITY_LANDLOCK` | Filesystem sandboxing support | | Seccomp filter | `CONFIG_SECCOMP_FILTER` | Syscall filtering support | +| PCI / GPU | `CONFIG_PCI`, `CONFIG_PCI_MSI`, `CONFIG_DRM` | GPU passthrough via VFIO | +| Kernel modules | `CONFIG_MODULES`, `CONFIG_MODULE_UNLOAD` | Loading NVIDIA drivers in guest | +| virtio-PCI transport | `CONFIG_VIRTIO_PCI` | cloud-hypervisor device bus (libkrun uses MMIO) | +| Serial console | `CONFIG_SERIAL_8250`, `CONFIG_SERIAL_8250_CONSOLE` | cloud-hypervisor console (`ttyS0`) | +| ACPI | `CONFIG_ACPI` | cloud-hypervisor power management / clean shutdown | +| x2APIC | `CONFIG_X86_X2APIC` | Multi-vCPU support (CHV uses x2APIC MADT entries) | See `crates/openshell-vm/runtime/kernel/openshell.kconfig` for the full fragment with inline comments explaining why each option is needed. 
@@ -189,13 +284,21 @@ The standalone `openshell-vm` binary supports `openshell-vm exec -- `openshell-vm exec` also injects `KUBECONFIG=/etc/rancher/k3s/k3s.yaml` by default so kubectl-style commands work the same way they would inside the VM shell. +### Vsock by backend + +- **libkrun**: Uses libkrun's built-in vsock port mapping, which transparently + bridges the guest vsock port to a host Unix socket. +- **cloud-hypervisor**: Uses a vsock exec bridge — a host-side process that + connects an AF_VSOCK socket to a Unix domain socket, providing the same + interface to the exec agent. + ## Build Commands ```bash # One-time setup: download pre-built runtime (~30s) mise run vm:setup -# Build and run +# Build and run (libkrun, default) mise run vm # Build embedded binary with base rootfs (~120MB, recommended) @@ -210,6 +313,13 @@ mise run vm:build # Rebuild binary FROM_SOURCE=1 mise run vm:setup # Build runtime from source mise run vm:build # Then build embedded binary +# Build cloud-hypervisor runtime bundle (Linux only) +mise run vm:bundle-runtime # Builds CHV + virtiofsd + extracts vmlinux + +# Run with cloud-hypervisor backend +openshell-vm --backend cloud-hypervisor # Requires runtime bundle +openshell-vm --gpu # Auto-selects CHV with GPU passthrough + # Wipe everything and start over mise run vm:clean ``` @@ -221,20 +331,23 @@ rolling `vm-dev` GitHub Release: ### Kernel Runtime (`release-vm-kernel.yml`) -Builds the custom libkrunfw (kernel firmware), libkrun (VMM), and gvproxy for all -supported platforms. Runs on-demand or when the kernel config / pinned versions change. +Builds the custom libkrunfw (kernel firmware), libkrun (VMM), gvproxy, cloud-hypervisor, +and virtiofsd for all supported platforms. Runs on-demand or when the kernel config / +pinned versions change. 
| Platform | Runner | Build Method | |----------|--------|-------------| -| Linux ARM64 | `build-arm64` (self-hosted) | Native `build-libkrun.sh` | -| Linux x86_64 | `build-amd64` (self-hosted) | Native `build-libkrun.sh` | -| macOS ARM64 | `macos-latest-xlarge` (GitHub-hosted) | `build-libkrun-macos.sh` | +| Linux ARM64 | `build-arm64` (self-hosted) | `build-libkrun.sh` + `build-cloud-hypervisor.sh` | +| Linux x86_64 | `build-amd64` (self-hosted) | `build-libkrun.sh` + `build-cloud-hypervisor.sh` | +| macOS ARM64 | `macos-latest-xlarge` (GitHub-hosted) | `build-libkrun-macos.sh` (no CHV) | -Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw, gvproxy, and -provenance metadata. +Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw, gvproxy, +and provenance metadata. Linux artifacts additionally include cloud-hypervisor, +virtiofsd, and the extracted `vmlinux` kernel. Each platform builds its own libkrunfw and libkrun natively. The kernel inside -libkrunfw is always Linux regardless of host platform. +libkrunfw is always Linux regardless of host platform. cloud-hypervisor and virtiofsd +are Linux-only (macOS does not support VFIO/KVM passthrough). ### VM Binary (`release-vm-dev.yml`) diff --git a/architecture/vm-gpu-passthrough.md b/architecture/vm-gpu-passthrough.md new file mode 100644 index 000000000..c15fd668b --- /dev/null +++ b/architecture/vm-gpu-passthrough.md @@ -0,0 +1,413 @@ +# VM GPU Passthrough + +> Status: Experimental and work in progress (WIP). GPU passthrough for the VM backend is under active development. + +## Overview + +OpenShell's VM backend can pass a physical NVIDIA GPU into a microVM using VFIO (Virtual Function I/O). This gives the guest direct access to GPU hardware, enabling CUDA workloads and `nvidia-smi` inside sandboxes without virtualization overhead. + +GPU passthrough uses cloud-hypervisor (instead of the default libkrun backend) to attach a VFIO device to the VM. 
The guest sees a real PCI GPU device and loads standard NVIDIA drivers. + +## Architecture + +``` +Host │ Guest (microVM) +──────────────────────────────│─────────────────────────── + NVIDIA GPU (PCI BDF addr) │ nvidia driver + CUDA + ↕ bound to vfio-pci │ ↕ + /dev/vfio/ │ /dev/nvidia* + ↕ │ ↕ + cloud-hypervisor (VFIO) ────│→ PCI device visible + ↕ │ ↕ + TAP networking │ k3s + device plugin + virtiofsd (rootfs) │ ↕ + │ sandbox pods (nvidia.com/gpu) +``` + +### Backend selection + +| Flag | Backend | GPU attached? | +|------|---------|---------------| +| (none) | libkrun | No | +| `--gpu` | cloud-hypervisor | Yes (auto-detect and bind) | +| `--gpu 0000:41:00.0` | cloud-hypervisor | Yes (specific PCI device) | +| `--backend cloud-hypervisor` | cloud-hypervisor | No (force CHV without GPU) | + +Auto mode (`--backend auto`, the default) selects cloud-hypervisor when `--gpu` is used or a VFIO PCI address is configured. Otherwise libkrun is used. + +### Automatic GPU binding + +When `--gpu` is passed (with or without a specific PCI address), the launcher automatically prepares the GPU for VFIO passthrough: + +1. **Probe** — scans `/sys/bus/pci/devices` for NVIDIA devices (vendor `0x10de`). +2. **Safety checks** — for each candidate GPU, verifies it is safe to claim (see below). If any check fails, the launcher refuses to proceed and exits with an actionable error. +3. **Bind** — unbinds the selected GPU from the `nvidia` driver and binds it to `vfio-pci`. Also binds any IOMMU group peers to `vfio-pci` for group cleanliness. +4. **Launch** — starts cloud-hypervisor with the VFIO device attached and sets `GPU_ENABLED=true` in the guest kernel cmdline. +5. **Rebind on shutdown** — when the VM exits (clean shutdown, Ctrl+C, or crash), the launcher rebinds the GPU back to the `nvidia` driver and clears `driver_override`, restoring host GPU access. Cleanup is guaranteed by a `GpuBindGuard` RAII guard that calls restore on drop, covering normal exit, early return, and panic. 
Only `SIGKILL` (kill -9) bypasses the guard — see Troubleshooting below for manual recovery. + +When a specific PCI address is given (`--gpu 0000:41:00.0`), the launcher targets that exact device. When `--gpu` is used without an address (`auto` mode), the launcher selects the best available GPU using the multi-GPU selection strategy. + +### Safety checks + +All safety checks are hard failures — if any check fails, the launcher prints an error and exits without binding. There is no `--force` override. + +| Check | What it detects | Failure behavior | +|-------|----------------|------------------| +| **Display attached** | GPU drives an active DRM framebuffer or is the primary rendering device | Error: "GPU 0000:xx:xx.x has active display outputs — cannot passthrough without losing host display" | +| **Active processes** | Processes holding `/dev/nvidia*` file descriptors (CUDA jobs, monitoring) | Error: "GPU 0000:xx:xx.x is in use by PID(s) — stop these processes first" | +| **IOMMU enabled** | `/sys/kernel/iommu_groups/` exists and the GPU has a group assignment | Error: "IOMMU is not enabled — add intel_iommu=on or amd_iommu=on to kernel cmdline" | +| **VFIO modules loaded** | `vfio-pci` and `vfio_iommu_type1` kernel modules are loaded | Error: "vfio-pci kernel module not loaded — run: sudo modprobe vfio-pci" | +| **Permissions** | Write access to sysfs bind/unbind and `/dev/vfio/` | Error: "insufficient permissions — run as root or with CAP_NET_ADMIN" | + +### Multi-GPU selection (`--gpu` auto mode) + +On hosts with multiple NVIDIA GPUs, the launcher selects a GPU using this priority: + +1. **Already on vfio-pci** with a clean IOMMU group — use immediately (no rebind needed). +2. **Idle (no processes, no display)** — preferred for binding. +3. **Skip** GPUs with active displays or running processes. + +If no GPU passes all safety checks, the launcher fails with per-device status listing what blocked each GPU. 
+ +## Host preparation + +The launcher handles GPU driver binding automatically. The host only needs IOMMU and VFIO kernel modules configured. + +### 1. Enable IOMMU + +IOMMU must be enabled in both BIOS/UEFI and the Linux kernel. + +**Intel systems:** + +```shell +# Add to kernel command line (e.g. /etc/default/grub GRUB_CMDLINE_LINUX) +intel_iommu=on iommu=pt +``` + +**AMD systems:** + +```shell +# AMD IOMMU is usually enabled by default; verify or add: +amd_iommu=on iommu=pt +``` + +After editing, run `update-grub` (or equivalent) and reboot. Verify IOMMU is active: + +```shell +dmesg | grep -i iommu +# Should show: "DMAR: IOMMU enabled" or "AMD-Vi: AMD IOMMUv2" +``` + +### 2. Load VFIO kernel modules + +```shell +sudo modprobe vfio-pci +sudo modprobe vfio_iommu_type1 + +# Persist across reboots +echo "vfio-pci" | sudo tee /etc/modules-load.d/vfio-pci.conf +echo "vfio_iommu_type1" | sudo tee /etc/modules-load.d/vfio_iommu_type1.conf +``` + +### 3. Device permissions + +The launcher needs root (or `CAP_NET_ADMIN`) to bind/unbind GPU drivers and configure TAP networking: + +```shell +# Option A: run as root (simplest) +sudo openshell-vm --gpu + +# Option B: set udev rules for /dev/vfio/ access (still needs sysfs write via root) +echo 'SUBSYSTEM=="vfio", OWNER="root", GROUP="kvm", MODE="0660"' | \ + sudo tee /etc/udev/rules.d/99-vfio.rules +sudo udevadm control --reload-rules +sudo usermod -aG kvm $USER +``` + +### What the launcher does automatically + +When `--gpu` is passed, the launcher performs the following steps that previously required manual intervention: + +1. **Identifies NVIDIA GPUs** via sysfs (`/sys/bus/pci/devices/*/vendor`) +2. **Runs safety checks** — display, active processes, IOMMU, VFIO modules (see Safety checks above) +3. **Unbinds from nvidia** — writes to `/sys/bus/pci/devices//driver/unbind` +4. **Sets driver override** — writes `vfio-pci` to `/sys/bus/pci/devices//driver_override` +5. 
**Binds to vfio-pci** — writes to `/sys/bus/pci/drivers/vfio-pci/bind` +6. **Handles IOMMU group peers** — binds other devices in the same IOMMU group to `vfio-pci` +7. **On shutdown** — reverses all bindings, clears `driver_override`, rebinds to `nvidia` + +## Single-GPU caveats + +When the host has only one NVIDIA GPU: + +- **Display-attached GPUs are blocked.** The safety checks detect if the GPU drives an active display (DRM framebuffer). If so, the launcher refuses to bind it — this prevents accidentally killing the host desktop. On headless data center servers (the typical deployment), this check passes and the GPU is bound automatically. +- **Recovery is automatic.** When the VM exits (clean shutdown, Ctrl+C, or process crash), the launcher rebinds the GPU to the `nvidia` driver and clears `driver_override`. No manual intervention is needed. +- **Process check.** If CUDA processes are using the GPU (visible via `/dev/nvidia*` file descriptors), the launcher refuses to unbind. Stop those processes first. + +## Supported GPUs + +GPU passthrough is validated with NVIDIA data center GPUs. Consumer GPUs may work but are not officially supported (NVIDIA restricts GeForce passthrough in some driver versions). + +| GPU | Architecture | Compute Capability | Status | +|-----|-------------|-------------------|--------| +| A100 | Ampere | 8.0 | Supported | +| A30 | Ampere | 8.0 | Supported | +| H100 | Hopper | 9.0 | Supported | +| H200 | Hopper | 9.0 | Supported | +| L40 | Ada Lovelace | 8.9 | Supported | +| L40S | Ada Lovelace | 8.9 | Supported | +| L4 | Ada Lovelace | 8.9 | Supported | + +## CLI usage + +### Auto-select GPU + +```shell +# openshell-vm binary (VM backend directly) +sudo openshell-vm --gpu + +# openshell CLI (gateway deployment — requires VM backend) +OPENSHELL_GATEWAY_BACKEND=vm sudo openshell gateway start --gpu +``` + +> **Note:** The default gateway backend is Docker (containers). GPU passthrough +> requires the VM backend. 
Set `OPENSHELL_GATEWAY_BACKEND=vm` (or `microvm`) +> to use the VM path with `openshell gateway start`. + +### Specific PCI address (multi-GPU hosts) + +```shell +sudo openshell-vm --gpu 0000:41:00.0 +``` + +### Backend selection + +The `--backend` flag controls hypervisor selection independently of `--gpu`: + +```shell +sudo openshell-vm --gpu # auto: selects cloud-hypervisor +sudo openshell-vm --backend cloud-hypervisor # explicit CHV, no GPU +sudo openshell-vm --backend libkrun # explicit libkrun (no GPU support) +``` + +The `chv` alias is accepted as shorthand for `cloud-hypervisor`. + +### Diagnostics + +When `--gpu` is passed, the launcher runs safety checks before unbinding. If +checks fail, it exits with an actionable error: + +```text +$ sudo openshell-vm --gpu +GPU passthrough blocked by safety checks. + + Detected devices: + 0000:41:00.0: has active display outputs + 0000:42:00.0: in use by PIDs: 12345 (python3), 12400 (nvidia-smi) + + No GPU is available for passthrough. +``` + +On a headless server with an idle GPU, the pre-unbind preparation runs first: + +```text +$ sudo openshell-vm --gpu +GPU 0000:41:00.0: disabled nvidia persistence mode +GPU 0000:41:00.0: unloaded nvidia_uvm +GPU 0000:41:00.0: unloaded nvidia_drm +GPU 0000:41:00.0: unloaded nvidia_modeset +GPU 0000:41:00.0: device already unbound after nvidia module cleanup +GPU: binding 0000:41:00.0 for VFIO passthrough +``` + +On shutdown (Ctrl+C or VM exit), the original driver is restored: + +```text +^C +GPU: restoring 0000:41:00.0 (cleanup) +GPU: rebinding 0000:41:00.0 to nvidia +``` + +## VM Networking (Cloud Hypervisor) + +Cloud Hypervisor uses TAP-based networking instead of the gvproxy user-mode networking used by the libkrun backend. This has several implications for connectivity and port forwarding. 
+ +### Network topology + +``` +Host Guest (microVM) +───────────────────────────────────── ────────────────────────── + eth0 (or primary NIC) eth0 (virtio-net) + ↕ ↕ + iptables MASQUERADE ←── NAT ──→ 192.168.249.2/24 + ↕ ↕ default gw 192.168.249.1 + vmtap0 (TAP device) ↕ + 192.168.249.1/24 ←─── L2 bridge ──→ (kernel routes) + ↕ + 127.0.0.1:{port} ←── TCP proxy ──→ {port} (k3s NodePort) +``` + +### How it works + +The CHV backend configures networking in three layers: + +**1. TAP device and guest IP assignment** + +Cloud Hypervisor creates a TAP device on the host side with IP `192.168.249.1/24`. The guest is assigned `192.168.249.2/24` via kernel command line parameters (`VM_NET_IP`, `VM_NET_GW`, `VM_NET_DNS`). The init script reads these from `/proc/cmdline` and uses them as the static fallback when DHCP is unavailable (CHV does not run a DHCP server). + +**2. Host-side NAT and IP forwarding** + +After booting the VM, the launcher: +- Enables IP forwarding (`/proc/sys/net/ipv4/ip_forward`) +- Adds iptables MASQUERADE rules for the `192.168.249.0/24` subnet +- Adds FORWARD rules to allow traffic to/from the VM + +This gives the guest internet access through the host. Rules are cleaned up on VM shutdown. + +**3. TCP port forwarding** + +Unlike gvproxy (which provides built-in port forwarding), CHV TAP networking requires explicit port forwarding. The launcher starts a userspace TCP proxy for each port mapping (e.g., `30051:30051`). The proxy binds to `127.0.0.1:{host_port}` and forwards connections to `192.168.249.2:{guest_port}`. + +### DNS resolution + +The launcher detects the host's upstream DNS server using a two-step lookup: + +1. Reads `/etc/resolv.conf` and picks the first nameserver that does not start with `127.` (skipping systemd-resolved's `127.0.0.53` stub and other loopback addresses). +2. If all nameservers in `/etc/resolv.conf` are loopback, falls back to `/run/systemd/resolve/resolv.conf` (the upstream resolv.conf maintained by systemd-resolved). 
+3. If no non-loopback nameserver is found in either file, falls back to `8.8.8.8`. + +The resolved DNS server is passed to the guest via `VM_NET_DNS=<ip>` on the kernel command line. The init script writes it to `/etc/resolv.conf` inside the guest, unconditionally overriding any stale entries from previous boot cycles. + +### Key constants + +| Constant | Value | Purpose | |----------|-------|---------| | `CHV_TAP_HOST_IP` | `192.168.249.1` | Host side of the TAP device | | `CHV_TAP_GUEST_IP` | `192.168.249.2` | Guest static IP | | `CHV_TAP_SUBNET` | `192.168.249.0/24` | Subnet for iptables rules | | `CHV_TAP_NETMASK` | `255.255.255.0` | Subnet mask in VM payload | + +### Differences from libkrun/gvproxy networking + +| Feature | libkrun + gvproxy | CHV + TAP | |---------|------------------|-----------| | Network mode | User-mode (SLIRP-like) | Kernel TAP device | | DHCP | Built-in (gvproxy) | None (static IP via cmdline) | | Guest IP | `192.168.127.2/24` | `192.168.249.2/24` | | Port forwarding | Built-in (gvproxy `-forward`) | Userspace TCP proxy | | Privileges | Unprivileged | Root or `CAP_NET_ADMIN` | | NAT | Handled by gvproxy | iptables MASQUERADE | | DNS | gvproxy provides | Host resolver passed via cmdline | + +### Troubleshooting networking + +**"lookup registry-1.docker.io: Try again" (DNS failure)** + +The VM cannot resolve DNS. 
Check: + +```shell +# Verify the host DNS is non-loopback +grep nameserver /etc/resolv.conf +# If only 127.0.0.53 (systemd-resolved), find the upstream: +resolvectl status | grep 'DNS Servers' + +# Verify iptables rules are in place +sudo iptables -t nat -L POSTROUTING -n -v | grep 192.168.249 +sudo iptables -L FORWARD -n -v | grep 192.168.249 + +# Verify IP forwarding is enabled +cat /proc/sys/net/ipv4/ip_forward +``` + +**Gateway health check fails (port 30051 unreachable)** + +The TCP port forwarder may not have started, or the guest service is not yet listening: + +```shell +# Check if the port forwarder is bound on the host +ss -tlnp | grep 30051 + +# Check if the guest is reachable +ping -c1 192.168.249.2 +``` + +### Host mTLS cache and state disk + +The launcher caches mTLS certificates on the host after the first successful boot (warm boot path). If the state disk is deleted or `--reset` is used, the VM generates new PKI that won't match the cached certs. The launcher detects this — when the state disk is freshly created or reset, it clears the stale host mTLS cache and runs the cold-boot PKI fetch path. This prevents `transport error` failures on the gateway health check after a state disk reset. + +## Troubleshooting + +### "no NVIDIA PCI device found" + +The host has no NVIDIA GPU installed, or the PCI device is not visible: + +```shell +lspci -nn | grep -i nvidia +# If empty, the GPU is not detected at the PCI level +``` + +### "has active display outputs" + +The GPU drives a DRM framebuffer or is the boot VGA device. This is a hard safety check — the launcher will not unbind a display GPU. Options: + +- Use a different GPU for the monitor (iGPU, secondary card) +- Stop the display manager first: `sudo systemctl stop gdm` +- On headless servers, this should not occur — verify with `ls /sys/class/drm/card*/device` + +### "in use by PIDs: ..." + +Active processes hold `/dev/nvidia*` file descriptors. 
The check is host-wide +(across all NVIDIA GPUs, not per-device). The launcher lists the PIDs and +process names. Stop those processes before retrying. + +### "IOMMU not enabled or device has no IOMMU group" + +IOMMU must be enabled in both BIOS/UEFI and kernel cmdline. See Host Preparation above. + +### "VFIO kernel modules not loaded" + +```shell +sudo modprobe vfio-pci +sudo modprobe vfio_iommu_type1 +``` + +### "insufficient sysfs permissions — run as root" + +The launcher needs root to write to sysfs bind/unbind paths. Run with `sudo`. + +### GPU not rebound after crash + +If the launcher process is killed with `SIGKILL` (kill -9), the cleanup handler cannot run and the GPU remains on `vfio-pci`. Manually rebind: + +```shell +PCI_ADDR="0000:41:00.0" +echo "$PCI_ADDR" | sudo tee /sys/bus/pci/devices/$PCI_ADDR/driver/unbind +echo "" | sudo tee /sys/bus/pci/devices/$PCI_ADDR/driver_override +echo "$PCI_ADDR" | sudo tee /sys/bus/pci/drivers/nvidia/bind +``` + +### nvidia driver unbind deadlock (kernel bug) + +Some nvidia driver versions deadlock in their sysfs `unbind` handler — the `write()` syscall to `/sys/bus/pci/drivers/nvidia/unbind` never returns. When this happens, the subprocess enters uninterruptible sleep (D state) and becomes unkillable even by `SIGKILL`. The GPU's PCI subsystem state is corrupted and all subsequent PCI operations on the device hang. Only a host reboot clears this state. + +This is a kernel/nvidia driver bug, not an openshell-vm issue. Three mitigation layers are in place: + +1. **Pre-unbind preparation**: Before the raw sysfs unbind, the launcher disables nvidia persistence mode (`nvidia-smi -pm 0`) and unloads nvidia submodules (`nvidia_uvm`, `nvidia_drm`, `nvidia_modeset`) via `modprobe -r`. This often cascade-removes the base nvidia module entirely, unbinding the device automatically without ever touching the dangerous sysfs path. + +2. 
**Subprocess isolation with timeout**: All sysfs writes (and the nvidia prep commands) run in a subprocess with a timeout (10s for sysfs, 15s for prep). On timeout, the subprocess is killed and dropped without calling `wait()` — preventing the parent process from being dragged into D-state. + +3. **Post-timeout verification**: If the unbind subprocess times out but the device is actually unbound at the hardware level (which the nvidia bug can cause — the operation completes but the syscall never returns), the launcher detects this and continues with the VFIO bind. + +If you hit this issue repeatedly, check for nvidia driver updates or file a bug with NVIDIA. + +### VM boots but `nvidia-smi` fails inside guest + +- Verify the GPU rootfs includes NVIDIA drivers: `chroot /path/to/rootfs which nvidia-smi` +- Check that NVIDIA kernel modules load: `openshell-vm exec -- lsmod | grep nvidia` +- Inspect dmesg for NVIDIA driver errors: `openshell-vm exec -- dmesg | grep -i nvidia` + +## Related + +- [Custom VM Runtime](custom-vm-runtime.md) — building and customizing the libkrun VM runtime +- [System Architecture](system-architecture.md) — overall OpenShell architecture +- Implementation: [`crates/openshell-vm/src/gpu_passthrough.rs`](../crates/openshell-vm/src/gpu_passthrough.rs) diff --git a/crates/openshell-cli/Cargo.toml b/crates/openshell-cli/Cargo.toml index b3a006fdd..dd8f83bb8 100644 --- a/crates/openshell-cli/Cargo.toml +++ b/crates/openshell-cli/Cargo.toml @@ -21,6 +21,7 @@ openshell-policy = { path = "../openshell-policy" } openshell-providers = { path = "../openshell-providers" } openshell-prover = { path = "../openshell-prover" } openshell-tui = { path = "../openshell-tui" } +openshell-vm = { path = "../openshell-vm" } serde = { workspace = true } serde_json = { workspace = true } prost-types = { workspace = true } diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 292922411..05d1fb7c1 100644 --- 
a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -807,18 +807,21 @@ enum GatewayCommands { #[arg(long, env = "OPENSHELL_REGISTRY_TOKEN")] registry_token: Option, - /// Enable NVIDIA GPU passthrough. + /// Enable NVIDIA GPU support for the gateway cluster. /// - /// Passes all host GPUs into the cluster container and deploys the - /// NVIDIA k8s-device-plugin so Kubernetes workloads can request - /// `nvidia.com/gpu` resources. Requires NVIDIA drivers and the - /// NVIDIA Container Toolkit on the host. + /// **Docker path (default):** passes GPUs into the gateway container via + /// the NVIDIA Container Toolkit — CDI when the daemon supports it, else + /// Docker's `--gpus all` — and deploys the NVIDIA device plugin. Use + /// `--gpu` or `--gpu auto` only; PCI addresses are not valid CDI device + /// names on this path. /// - /// When enabled, OpenShell auto-selects CDI when the Docker daemon has - /// CDI enabled and falls back to Docker's NVIDIA GPU request path - /// (`--gpus all`) otherwise. - #[arg(long)] - gpu: bool, + /// **MicroVM path:** set `OPENSHELL_GATEWAY_BACKEND=vm` for deployments + /// that use the VM gateway. Then you may pass `--gpu` / `--gpu auto` for + /// VFIO auto-select, or `--gpu 0000:41:00.0` (PCI BDF) for a specific GPU. + /// Requires IOMMU and the GPU bound to `vfio-pci`. See + /// `architecture/vm-gpu-passthrough.md`. + #[arg(long, num_args = 0..=1, default_missing_value = "auto")] + gpu: Option, }, /// Stop the gateway (preserves state). @@ -1129,10 +1132,9 @@ enum SandboxCommands { /// Request GPU resources for the sandbox. /// /// When no gateway is running, auto-bootstrap starts a GPU-enabled - /// gateway using the same automatic injection selection as - /// `openshell gateway start --gpu`. GPU intent is also inferred - /// automatically for known GPU-designated image names such as - /// `nvidia-gpu`. 
+ /// gateway using the Docker NVIDIA path (`--gpu auto`), same as + /// `openshell gateway start --gpu` without the microVM backend. GPU + /// intent is also inferred for known GPU image names (e.g. `nvidia-gpu`). #[arg(long)] gpu: bool, @@ -1655,12 +1657,11 @@ async fn main() -> Result<()> { registry_token, gpu, } => { - let gpu = if gpu { - vec!["auto".to_string()] - } else { - vec![] + let gpu = match gpu { + Some(val) => vec![val], + None => vec![], }; - run::gateway_admin_deploy( + let _gpu_guard = run::gateway_admin_deploy( &name, remote.as_deref(), ssh_key.as_deref(), diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index c41b53518..247f41d11 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1434,7 +1434,9 @@ pub async fn gateway_admin_deploy( registry_username: Option<&str>, registry_token: Option<&str>, gpu: Vec, -) -> Result<()> { +) -> Result> { + let (gpu, gpu_guard) = prepare_gateway_deploy_gpu(gpu, remote.as_deref())?; + let location = if remote.is_some() { "remote" } else { "local" }; // Build remote options once so we can reuse them for the existence check @@ -1457,7 +1459,7 @@ pub async fn gateway_admin_deploy( "{} Gateway '{name}' is already running.", "✓".green().bold() ); - return Ok(()); + return Ok(gpu_guard); } } } @@ -1518,7 +1520,7 @@ pub async fn gateway_admin_deploy( save_active_gateway(name)?; eprintln!("{} Active gateway set to '{name}'", "✓".green().bold()); - Ok(()) + Ok(gpu_guard) } /// Resolve the remote SSH destination for a gateway. @@ -5193,6 +5195,126 @@ fn format_timestamp_ms(ms: i64) -> String { } } +/// Environment variable selecting the gateway deployment backend for GPU checks. +/// +/// VFIO sysfs probes apply only to the microVM (`openshell-vm`) deploy path. +/// The default `openshell gateway start` flow uses Docker with the NVIDIA +/// Container Toolkit; leave this unset for that path. 
+const OPENSHELL_GATEWAY_BACKEND_ENV: &str = "OPENSHELL_GATEWAY_BACKEND"; + +fn gateway_deploy_uses_vm_backend() -> bool { + std::env::var(OPENSHELL_GATEWAY_BACKEND_ENV) + .ok() + .map(|v| { + matches!( + v.trim().to_ascii_lowercase().as_str(), + "vm" | "microvm" | "openshell-vm" + ) + }) + .unwrap_or(false) +} + +/// Heuristic: value looks like a PCI domain:bus:dev.fn address (Linux sysfs BDF). +fn looks_like_pci_bdf(s: &str) -> bool { + let s = s.trim(); + let rest = if let Some((prefix, after_colon)) = s.split_once(':') { + if prefix.len() == 4 && prefix.chars().all(|c| c.is_ascii_hexdigit()) { + after_colon + } else { + s + } + } else { + return false; + }; + + let Some((bus, dev_fn)) = rest.split_once(':') else { + return false; + }; + if bus.len() != 2 || !bus.chars().all(|c| c.is_ascii_hexdigit()) { + return false; + } + let Some((dev, func)) = dev_fn.split_once('.') else { + return false; + }; + if dev.len() != 2 || !dev.chars().all(|c| c.is_ascii_hexdigit()) { + return false; + } + if func.len() != 1 || !func.chars().all(|c| ('0'..='7').contains(&c)) { + return false; + } + true +} + +/// Validate `--gpu` for `gateway start`, run VFIO checks only for the VM deploy path, +/// and normalize Docker-path requests to CDI-compatible `auto`. 
+fn prepare_gateway_deploy_gpu( + gpu: Vec, + remote: Option<&str>, +) -> Result<( + Vec, + Option, +)> { + if gpu.is_empty() { + return Ok((gpu, None)); + } + + if gateway_deploy_uses_vm_backend() { + if remote.is_none() { + let guard = check_gpu_readiness(&gpu)?; + let selected_bdf = guard.pci_addr().unwrap_or("auto").to_string(); + let updated_gpu = vec![selected_bdf]; + return Ok((updated_gpu, Some(guard))); + } else { + eprintln!( + "{} Local VFIO GPU probe skipped (--remote): GPU readiness is checked on the remote host during deployment.", + "ℹ".cyan().bold() + ); + } + return Ok((gpu, None)); + } + + let Some(first) = gpu.first() else { + return Ok((gpu, None)); + }; + if first.as_str() != "auto" { + if looks_like_pci_bdf(first) { + return Err(miette!( + "PCI address GPU selection ({first}) is only supported for the microVM gateway backend.\n\n\ + `openshell gateway start` uses Docker by default (NVIDIA Container Toolkit / CDI, or Docker `--gpus all`). \ + Use `--gpu` or `--gpu auto` for that path.\n\n\ + For VFIO passthrough, set {}=vm and follow architecture/vm-gpu-passthrough.md.", + OPENSHELL_GATEWAY_BACKEND_ENV, + )); + } + return Err(miette!( + "Unrecognized --gpu value `{first}` for Docker gateway deploy. Use `--gpu` or `--gpu auto`.", + )); + } + + Ok((vec!["auto".to_string()], None)) +} + +/// Bind a GPU for VFIO passthrough and return an RAII guard that restores it on drop. 
+fn check_gpu_readiness(gpu: &[String]) -> Result { + use openshell_vm::gpu_passthrough::{GpuBindGuard, prepare_gpu_for_passthrough}; + + let requested_addr = gpu + .first() + .filter(|v| v.as_str() != "auto") + .map(|v| v.as_str()); + + let bind_state = prepare_gpu_for_passthrough(requested_addr).map_err(|e| miette!("{e}"))?; + + eprintln!( + "{} GPU {} bound to vfio-pci (was: {})", + "✓".green().bold(), + bind_state.pci_addr, + bind_state.original_driver, + ); + + Ok(GpuBindGuard::new(bind_state)) +} + #[cfg(test)] mod tests { use super::{ @@ -5416,6 +5538,16 @@ mod tests { assert!(sandbox_should_persist(false, Some(&spec))); } + #[test] + fn looks_like_pci_bdf_recognizes_sysfs_addresses() { + assert!(super::looks_like_pci_bdf("0000:41:00.0")); + assert!(super::looks_like_pci_bdf("41:00.0")); + assert!(super::looks_like_pci_bdf(" 0a:1f.7 ")); + assert!(!super::looks_like_pci_bdf("auto")); + assert!(!super::looks_like_pci_bdf("nvidia.com/gpu=all")); + assert!(!super::looks_like_pci_bdf("00:00.8")); // invalid function + } + #[test] fn image_requests_gpu_matches_known_gpu_image_names() { for image in [ diff --git a/crates/openshell-vm/Cargo.toml b/crates/openshell-vm/Cargo.toml index 7d74b3139..388e42351 100644 --- a/crates/openshell-vm/Cargo.toml +++ b/crates/openshell-vm/Cargo.toml @@ -46,5 +46,8 @@ tokio-rustls = { workspace = true } [build-dependencies] zstd = "0.13" +[dev-dependencies] +tempfile = "3" + [lints] workspace = true diff --git a/crates/openshell-vm/build.rs b/crates/openshell-vm/build.rs index 33fab9a78..f448ed0bc 100644 --- a/crates/openshell-vm/build.rs +++ b/crates/openshell-vm/build.rs @@ -12,7 +12,7 @@ //! Environment: //! `OPENSHELL_VM_RUNTIME_COMPRESSED_DIR` - Path to compressed artifacts -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::{env, fs}; fn main() { @@ -116,7 +116,7 @@ fn main() { /// Generate stub (empty) resource files so the build can complete. 
/// The embedded module will fail at runtime if these stubs are used. -fn generate_stub_resources(out_dir: &PathBuf) { +fn generate_stub_resources(out_dir: &Path) { let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); let (libkrun_name, libkrunfw_name) = match target_os.as_str() { diff --git a/crates/openshell-vm/pins.env b/crates/openshell-vm/pins.env index b3d802292..d44f044c8 100644 --- a/crates/openshell-vm/pins.env +++ b/crates/openshell-vm/pins.env @@ -42,3 +42,33 @@ GVPROXY_VERSION="${GVPROXY_VERSION:-v0.8.8}" # Repo: https://github.com/containers/libkrunfw # Pinned: 2026-03-27 (main branch HEAD at time of pinning) LIBKRUNFW_REF="${LIBKRUNFW_REF:-463f717bbdd916e1352a025b6fb2456e882b0b39}" + +# ── cloud-hypervisor (GPU passthrough VMM) ────────────────────────────── +# Repo: https://github.com/cloud-hypervisor/cloud-hypervisor +CLOUD_HYPERVISOR_VERSION="${CLOUD_HYPERVISOR_VERSION:-v42.0}" + +# ── virtiofsd (virtio-fs daemon for cloud-hypervisor rootfs) ──────────── +# Repo: https://gitlab.com/virtio-fs/virtiofsd +VIRTIOFSD_VERSION="${VIRTIOFSD_VERSION:-v1.13.0}" + +# ── NVIDIA GPU support (GPU rootfs variant) ──────────────────────────── +# Driver branch: 570.x (open kernel modules, data-center/workstation) +# +# Compatibility matrix: +# Minimum driver version: 570 (NVIDIA 570.x open kernel modules) +# Minimum compute capability: sm_75 (Turing and newer; open kernel modules) +# Supported architectures: Turing (T4, RTX 20xx), +# Ampere (A100, A10, RTX 30xx), +# Hopper (H100, H200), Ada Lovelace (L40S), +# Blackwell (B100, B200) +# Guest architecture: x86_64 only (NVIDIA does not publish +# aarch64 data-center drivers in APT form) +# Host requirements: IOMMU enabled, GPU bound to vfio-pci driver, +# host driver version >= guest driver version +# +# The 570.x branch uses the open kernel module flavour +# (nvidia-headless-570-open), required for data-center GPUs (Turing+). 
+# Consumer GPUs (GeForce) may work but are not officially supported +# for VFIO passthrough. +NVIDIA_DRIVER_VERSION="${NVIDIA_DRIVER_VERSION:-570}" +NVIDIA_CONTAINER_TOOLKIT_VERSION="${NVIDIA_CONTAINER_TOOLKIT_VERSION:-1.17.5}" diff --git a/crates/openshell-vm/runtime/kernel/openshell.kconfig b/crates/openshell-vm/runtime/kernel/openshell.kconfig index b5f0330af..5ce14a683 100644 --- a/crates/openshell-vm/runtime/kernel/openshell.kconfig +++ b/crates/openshell-vm/runtime/kernel/openshell.kconfig @@ -115,6 +115,10 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PIDS=y CONFIG_MEMCG=y +CONFIG_CGROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_CGROUP_FREEZER=y # ── Disable kernel headers archive (avoids cpio issues in CI) ────────── # CONFIG_IKHEADERS is not set @@ -126,3 +130,29 @@ CONFIG_POSIX_MQUEUE_SYSCTL=y # ── Security features required by the sandbox runtime ─────────────────── CONFIG_SECURITY_LANDLOCK=y CONFIG_SECCOMP_FILTER=y + +# ── PCI / GPU passthrough (harmless for non-GPU boots) ────────────────── +CONFIG_PCI=y +CONFIG_PCI_MSI=y +CONFIG_DRM=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y + +# ── cloud-hypervisor support ──────────────────────────────────────────── +# CHV uses virtio-PCI transport (libkrun uses virtio-MMIO). Both drivers +# coexist safely — the kernel probes whichever transport the hypervisor +# provides. +CONFIG_VIRTIO_PCI=y + +# Serial console for cloud-hypervisor (8250/16550 UART). libkrun uses +# virtio-console which is already enabled in the base config. +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y + +# ACPI support for cloud-hypervisor power management. Required for +# `poweroff -f` to trigger a clean ACPI shutdown that CHV detects. +CONFIG_ACPI=y + +# x2APIC support — Cloud Hypervisor uses x2APIC MADT entries for +# multi-vCPU VMs. Without this, only the bootstrap CPU is activated. 
+CONFIG_X86_X2APIC=y diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh index d43046d4f..99a301f85 100755 --- a/crates/openshell-vm/scripts/build-rootfs.sh +++ b/crates/openshell-vm/scripts/build-rootfs.sh @@ -18,11 +18,16 @@ # - NO pre-initialized k3s state (cold start on first boot) # First boot will be slower (~30-60s) as k3s initializes and pulls images. # +# With --gpu, installs NVIDIA driver packages and the nvidia-container-toolkit +# into the rootfs, producing a GPU-capable variant. The launcher selects this +# rootfs when `--gpu` is passed. Only supported on x86_64 (NVIDIA does not +# publish aarch64 data-center drivers for Ubuntu in this packaging form). +# # Supports aarch64 and x86_64 guest architectures. The target architecture # is auto-detected from the host but can be overridden with --arch. # # Usage: -# ./build-rootfs.sh [--base] [--arch aarch64|x86_64] [output_dir] +# ./build-rootfs.sh [--base] [--gpu] [--arch aarch64|x86_64] [output_dir] # # If output_dir is omitted, the rootfs is built under target/rootfs-build. # @@ -43,12 +48,15 @@ fi # ── Argument parsing ─────────────────────────────────────────────────── BASE_ONLY=false +GPU_BUILD=false GUEST_ARCH="" POSITIONAL_ARGS=() while [[ $# -gt 0 ]]; do case "$1" in --base) BASE_ONLY=true; shift ;; + --gpu) + GPU_BUILD=true; shift ;; --arch) GUEST_ARCH="$2"; shift 2 ;; *) @@ -90,6 +98,14 @@ case "$GUEST_ARCH" in ;; esac +# GPU builds are only supported on x86_64 — NVIDIA does not publish +# aarch64 data-center driver packages in the same APT repository. +if [ "$GPU_BUILD" = true ] && [ "$GUEST_ARCH" != "x86_64" ]; then + echo "ERROR: --gpu is only supported for x86_64 guest architecture." >&2 + echo " Current arch: ${GUEST_ARCH}" >&2 + exit 1 +fi + # Project root (two levels up from crates/openshell-vm/scripts/) PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." 
&& pwd)" DEFAULT_ROOTFS="${PROJECT_ROOT}/target/rootfs-build" @@ -125,6 +141,9 @@ if [ "$BASE_ONLY" = true ]; then echo " k3s version: ${K3S_VERSION}" echo " Output: ${ROOTFS_DIR}" echo " Mode: base (no pre-loaded images, cold start)" + if [ "$GPU_BUILD" = true ]; then + echo " GPU: yes (NVIDIA driver ${NVIDIA_DRIVER_VERSION}, toolkit ${NVIDIA_CONTAINER_TOOLKIT_VERSION})" + fi else echo "==> Building openshell-vm rootfs" echo " Guest arch: ${GUEST_ARCH}" @@ -132,6 +151,9 @@ else echo " Images: ${SERVER_IMAGE}, ${COMMUNITY_SANDBOX_IMAGE}" echo " Output: ${ROOTFS_DIR}" echo " Mode: full (pre-loaded images, pre-initialized)" + if [ "$GPU_BUILD" = true ]; then + echo " GPU: yes (NVIDIA driver ${NVIDIA_DRIVER_VERSION}, toolkit ${NVIDIA_CONTAINER_TOOLKIT_VERSION})" + fi fi echo "" @@ -222,8 +244,55 @@ fi docker rm -f "${CONTAINER_NAME}" 2>/dev/null || true echo "==> Building base image..." -docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ - --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" -f - . <<'DOCKERFILE' +if [ "$GPU_BUILD" = true ]; then + docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ + --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" \ + --build-arg "NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}" \ + --build-arg "NVIDIA_CONTAINER_TOOLKIT_VERSION=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \ + -f - . <<'DOCKERFILE' +ARG BASE_IMAGE +FROM ${BASE_IMAGE} +ARG NVIDIA_DRIVER_VERSION +ARG NVIDIA_CONTAINER_TOOLKIT_VERSION +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + e2fsprogs \ + iptables \ + iproute2 \ + python3 \ + busybox-static \ + sqlite3 \ + util-linux \ + zstd \ + gnupg \ + curl \ + && rm -rf /var/lib/apt/lists/* +# busybox-static provides udhcpc for DHCP inside the VM. 
+RUN mkdir -p /usr/share/udhcpc && \ + ln -sf /bin/busybox /sbin/udhcpc +RUN mkdir -p /var/lib/rancher/k3s /etc/rancher/k3s +# ── NVIDIA driver and container toolkit ────────────────────────────── +# Add the NVIDIA package repository and install the open kernel module +# flavour of the driver plus nvidia-container-toolkit. The open modules +# are required for data-center GPUs (Turing+ / compute capability >= 7.0). +RUN curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + > /etc/apt/sources.list.d/nvidia-container-toolkit.list +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + nvidia-headless-${NVIDIA_DRIVER_VERSION}-open \ + nvidia-utils-${NVIDIA_DRIVER_VERSION} \ + nvidia-container-toolkit=${NVIDIA_CONTAINER_TOOLKIT_VERSION}-1 \ + && rm -rf /var/lib/apt/lists/* +# Configure the NVIDIA container runtime as the default for containerd. +RUN nvidia-ctk runtime configure --runtime=containerd --set-as-default +DOCKERFILE +else + docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ + --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" -f - . <<'DOCKERFILE' ARG BASE_IMAGE FROM ${BASE_IMAGE} RUN apt-get update && \ @@ -243,6 +312,7 @@ RUN mkdir -p /usr/share/udhcpc && \ ln -sf /bin/busybox /sbin/udhcpc RUN mkdir -p /var/lib/rancher/k3s /etc/rancher/k3s DOCKERFILE +fi # Create a container and export the filesystem echo "==> Creating container..." @@ -363,6 +433,28 @@ for manifest in openshell-helmchart.yaml agent-sandbox.yaml; do fi done +# ── Inject GPU manifests (when building GPU rootfs) ─────────────────── +# These are deployed by openshell-vm-init.sh when GPU_ENABLED=true. 
+GPU_MANIFEST_SRC="${SCRIPT_DIR}/gpu-manifests" +GPU_MANIFEST_DEST="${ROOTFS_DIR}/opt/openshell/gpu-manifests" +if [ "$GPU_BUILD" = true ] && [ -d "${GPU_MANIFEST_SRC}" ]; then + echo "==> Injecting GPU manifests..." + mkdir -p "${GPU_MANIFEST_DEST}" + GPU_MANIFEST_COPIED=0 + for manifest in "${GPU_MANIFEST_SRC}"/*.yaml; do + [ -f "$manifest" ] || continue + cp "$manifest" "${GPU_MANIFEST_DEST}/" + echo " $(basename "$manifest")" + GPU_MANIFEST_COPIED=$((GPU_MANIFEST_COPIED + 1)) + done + # Sentinel only when at least one manifest was staged (empty glob must not create it). + if [ "$GPU_MANIFEST_COPIED" -gt 0 ]; then + echo "gpu" > "${ROOTFS_DIR}/opt/openshell/.rootfs-gpu" + else + echo "WARNING: No GPU manifests (*.yaml) found in ${GPU_MANIFEST_SRC}; not writing .rootfs-gpu sentinel." >&2 + fi +fi + # ── Base mode: mark rootfs type and skip pre-loading ─────────────────── if [ "$BASE_ONLY" = true ]; then @@ -384,10 +476,33 @@ if [ "$BASE_ONLY" = true ]; then exit 1 fi + if [ "$GPU_BUILD" = true ]; then + echo "==> Verifying GPU components in rootfs..." + if [ ! -f "${ROOTFS_DIR}/usr/bin/nvidia-smi" ]; then + echo "ERROR: nvidia-smi not found in rootfs." + exit 1 + fi + if [ ! -f "${ROOTFS_DIR}/opt/openshell/.rootfs-gpu" ]; then + echo "ERROR: GPU sentinel file not found in rootfs." + exit 1 + fi + echo " nvidia-smi: found" + # nvidia-container-runtime is installed via nvidia-container-toolkit. + if ls "${ROOTFS_DIR}"/usr/bin/nvidia-container-runtime* >/dev/null 2>&1; then + echo " nvidia-container-runtime: found" + else + echo "WARNING: nvidia-container-runtime not found — GPU pods may not work." 
+ fi + fi + echo "" echo "==> Base rootfs ready at: ${ROOTFS_DIR}" echo " Size: $(du -sh "${ROOTFS_DIR}" | cut -f1)" - echo " Type: base (cold start, images pulled on demand)" + if [ "$GPU_BUILD" = true ]; then + echo " Type: base + GPU (cold start, NVIDIA driver ${NVIDIA_DRIVER_VERSION})" + else + echo " Type: base (cold start, images pulled on demand)" + fi echo "" echo "Note: First boot will take ~30-60s as k3s initializes." echo " Container images will be pulled from registries on first use." @@ -475,6 +590,15 @@ for manifest in "${MANIFEST_DEST}"/*.yaml; do cp "$manifest" "${INIT_MANIFESTS}/" done +# GPU manifests: same pre-init path as other auto-deploy manifests so k3s +# sees them during cluster bake (not only under /opt/openshell/gpu-manifests). +if [ "$GPU_BUILD" = true ] && [ -d "${GPU_MANIFEST_DEST}" ]; then + for manifest in "${GPU_MANIFEST_DEST}"/*.yaml; do + [ -f "$manifest" ] || continue + cp "$manifest" "${INIT_MANIFESTS}/" + done +fi + # Patch HelmChart for local images and VM settings. HELMCHART="${INIT_MANIFESTS}/openshell-helmchart.yaml" if [ -f "$HELMCHART" ]; then @@ -741,10 +865,28 @@ if [ ! -x "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" ]; then exit 1 fi +# ── GPU verification (full mode) ────────────────────────────────────── +if [ "$GPU_BUILD" = true ]; then + echo "==> Verifying GPU components in rootfs..." + if [ ! -f "${ROOTFS_DIR}/usr/bin/nvidia-smi" ]; then + echo "ERROR: nvidia-smi not found in rootfs." + exit 1 + fi + echo " nvidia-smi: found" + if ls "${ROOTFS_DIR}"/usr/bin/nvidia-container-runtime* >/dev/null 2>&1; then + echo " nvidia-container-runtime: found" + else + echo "WARNING: nvidia-container-runtime not found — GPU pods may not work." 
+ fi +fi + echo "" echo "==> Rootfs ready at: ${ROOTFS_DIR}" echo " Size: $(du -sh "${ROOTFS_DIR}" | cut -f1)" echo " Pre-initialized: $(cat "${ROOTFS_DIR}/opt/openshell/.initialized" 2>/dev/null || echo 'no')" +if [ "$GPU_BUILD" = true ]; then + echo " GPU: NVIDIA driver ${NVIDIA_DRIVER_VERSION}, toolkit ${NVIDIA_CONTAINER_TOOLKIT_VERSION}" +fi # Show k3s data size K3S_DATA="${ROOTFS_DIR}/var/lib/rancher/k3s" diff --git a/crates/openshell-vm/scripts/gpu-manifests/README.md b/crates/openshell-vm/scripts/gpu-manifests/README.md new file mode 100644 index 000000000..c72deb1aa --- /dev/null +++ b/crates/openshell-vm/scripts/gpu-manifests/README.md @@ -0,0 +1,41 @@ +# GPU Rootfs Manifests + +These Kubernetes manifests are injected into the VM rootfs when +`build-rootfs.sh --gpu` is used. During a **full** rootfs build they are +also copied into the k3s auto-deploy manifest directory so they are +applied at pre-init time. + +**Phase 2:** deployment from `openshell-vm-init.sh` when +`GPU_ENABLED=true` is not implemented yet; that path will copy or +reconcile these manifests at VM boot. + +## NVIDIA Driver Compatibility + +| Property | Value | +|---|---| +| Driver branch | 570.x (open kernel modules) | +| Minimum compute capability | sm_75 (Turing and newer) | +| Container toolkit | nvidia-container-toolkit 1.17.x | +| Device plugin Helm chart | 0.18.2 | + +### Why open kernel modules? + +The 570.x open kernel modules are required for data-center GPUs +(Turing, Ampere, Hopper, Blackwell). They are the +NVIDIA-recommended driver for passthrough and container workloads. +GPUs prior to Turing (sm_75, e.g. Volta) are **not supported** +with open modules — use the proprietary driver branch if needed. 
+ +### Host requirements + +- IOMMU enabled in BIOS and kernel (`intel_iommu=on` or `amd_iommu=on`) +- GPU bound to `vfio-pci` driver on the host +- `/dev/vfio/vfio` and `/dev/vfio/<iommu-group>` accessible +- Host NVIDIA driver version >= 570 (must match or exceed guest driver) + +### Files + +- `nvidia-device-plugin.yaml` — HelmChart CR that deploys the NVIDIA + k8s-device-plugin via the k3s Helm controller. +- `nvidia-runtime-class.yaml` — RuntimeClass object so pods can use + `runtimeClassName: nvidia`. diff --git a/crates/openshell-vm/scripts/gpu-manifests/nvidia-device-plugin.yaml b/crates/openshell-vm/scripts/gpu-manifests/nvidia-device-plugin.yaml new file mode 100644 index 000000000..c1cbeaa8a --- /dev/null +++ b/crates/openshell-vm/scripts/gpu-manifests/nvidia-device-plugin.yaml @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# HelmChart CR for auto-deploying the NVIDIA k8s-device-plugin via k3s Helm controller. +# +# This manifest is copied into /var/lib/rancher/k3s/server/manifests/ by the +# VM init script when GPU_ENABLED=true. It is the VM-specific equivalent of +# deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml used by the +# Docker-based gateway. +# +# The chart installs: +# - NVIDIA device plugin DaemonSet (advertises nvidia.com/gpu resources) +# +# NFD and GFD are disabled; the device plugin's default nodeAffinity +# (which requires nvidia.com/gpu.present=true) is overridden to empty +# so it schedules on any node without requiring NFD/GFD labels. +# +# CDI injection mode: the device plugin uses deviceListStrategy=cdi-cri so that +# devices are injected via CDI hooks before container start. Sandbox pods only +# need the nvidia.com/gpu resource request — no runtimeClassName is required. 
+# +# k3s auto-detects nvidia-container-runtime on PATH and registers the "nvidia" +# RuntimeClass automatically; nvidia-runtime-class.yaml is still shipped as a fallback. + +apiVersion: helm.cattle.io/v1 +kind: HelmChart +metadata: + name: nvidia-device-plugin + namespace: kube-system +spec: + repo: https://nvidia.github.io/k8s-device-plugin + chart: nvidia-device-plugin + version: "0.18.2" + targetNamespace: nvidia-device-plugin + createNamespace: true + valuesContent: |- + runtimeClassName: nvidia + deviceListStrategy: cdi-cri + deviceIDStrategy: index + cdi: + nvidiaHookPath: /usr/bin/nvidia-cdi-hook + nvidiaDriverRoot: "/" + gfd: + enabled: false + nfd: + enabled: false + affinity: null diff --git a/crates/openshell-vm/scripts/gpu-manifests/nvidia-runtime-class.yaml b/crates/openshell-vm/scripts/gpu-manifests/nvidia-runtime-class.yaml new file mode 100644 index 000000000..fe2ccbd6e --- /dev/null +++ b/crates/openshell-vm/scripts/gpu-manifests/nvidia-runtime-class.yaml @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# RuntimeClass for NVIDIA GPU workloads. +# Deployed alongside the device plugin when GPU_ENABLED=true. +# Pods requesting nvidia.com/gpu resources should set +# runtimeClassName: nvidia to use the NVIDIA container runtime. +--- +apiVersion: node.k8s.io/v1 +kind: RuntimeClass +metadata: + name: nvidia +handler: nvidia diff --git a/crates/openshell-vm/scripts/openshell-vm-init.sh b/crates/openshell-vm/scripts/openshell-vm-init.sh index 1cb686a31..222bcc641 100755 --- a/crates/openshell-vm/scripts/openshell-vm-init.sh +++ b/crates/openshell-vm/scripts/openshell-vm-init.sh @@ -46,6 +46,31 @@ mkdir -p /sys/fs/cgroup mount -t cgroup2 cgroup2 /sys/fs/cgroup 2>/dev/null & wait + +# ── Parse kernel cmdline for env vars (cloud-hypervisor path) ──────── +# cloud-hypervisor passes environment variables via kernel cmdline +# (KEY=VALUE tokens). 
These are not automatically exported to init. +# Must run after /proc is mounted. +if [ -f /proc/cmdline ]; then + for token in $(cat /proc/cmdline); do + case "$token" in + GPU_ENABLED=*|OPENSHELL_VM_STATE_DISK_DEVICE=*|VM_NET_IP=*|VM_NET_GW=*|VM_NET_DNS=*) + export "$token" + ;; + esac + done +fi + +# Enable cgroup v2 controllers in the root cgroup hierarchy. +# k3s/kubelet requires cpu, cpuset, memory, and pids controllers. +# The kernel must have CONFIG_CGROUP_SCHED=y for the cpu controller. +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + for ctrl in cpu cpuset memory pids io; do + if grep -qw "$ctrl" /sys/fs/cgroup/cgroup.controllers; then + echo "+$ctrl" > /sys/fs/cgroup/cgroup.subtree_control 2>/dev/null || true + fi + done +fi + ts "filesystems mounted" # ── Networking ────────────────────────────────────────────────────────── @@ -97,20 +122,26 @@ DHCP_SCRIPT # -n: exit if no lease, -T 1: 1s between retries, -t 3: 3 retries # -A 1: wait 1s before first retry (aggressive for local gvproxy) if ! udhcpc -i eth0 -f -q -n -T 1 -t 3 -A 1 -s "$UDHCPC_SCRIPT" 2>&1; then - ts "WARNING: DHCP failed, falling back to static config" - ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true - ip route add default via 192.168.127.1 2>/dev/null || true + STATIC_IP="${VM_NET_IP:-192.168.127.2}" + STATIC_GW="${VM_NET_GW:-192.168.127.1}" + ts "WARNING: DHCP failed, falling back to static config ($STATIC_IP gw $STATIC_GW)" + ip addr add "${STATIC_IP}/24" dev eth0 2>/dev/null || true + ip route add default via "$STATIC_GW" 2>/dev/null || true fi else - # Fallback to static config if no DHCP client available. 
- ts "no DHCP client, using static config" - ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true - ip route add default via 192.168.127.1 2>/dev/null || true + STATIC_IP="${VM_NET_IP:-192.168.127.2}" + STATIC_GW="${VM_NET_GW:-192.168.127.1}" + ts "no DHCP client, using static config ($STATIC_IP gw $STATIC_GW)" + ip addr add "${STATIC_IP}/24" dev eth0 2>/dev/null || true + ip route add default via "$STATIC_GW" 2>/dev/null || true fi - # Ensure DNS is configured. DHCP should have set /etc/resolv.conf, - # but if it didn't (or static fallback was used), provide a default. - if [ ! -s /etc/resolv.conf ]; then + # Ensure DNS is configured. When VM_NET_DNS is set (TAP networking), + # always use it — the rootfs may have a stale resolv.conf from a + # previous gvproxy run that points to an unreachable gateway. + if [ -n "${VM_NET_DNS:-}" ]; then + echo "nameserver $VM_NET_DNS" > /etc/resolv.conf + elif [ ! -s /etc/resolv.conf ]; then echo "nameserver 8.8.8.8" > /etc/resolv.conf echo "nameserver 8.8.4.4" >> /etc/resolv.conf fi @@ -366,6 +397,35 @@ if [ "$_caps_ok" = false ]; then exit 1 fi +# ── GPU: NVIDIA driver and device plugin ───────────────────────────── +# When the VM is launched with --gpu, the Rust launcher passes +# GPU_ENABLED=true. Load the NVIDIA kernel modules, verify the device +# is visible via nvidia-smi, and confirm that the container runtime is +# available before k3s starts. + +if [ "${GPU_ENABLED:-false}" = "true" ]; then + ts "GPU mode enabled — loading NVIDIA drivers" + + modprobe nvidia || { echo "FATAL: failed to load nvidia kernel module" >&2; exit 1; } + modprobe nvidia_uvm || { echo "FATAL: failed to load nvidia_uvm kernel module" >&2; exit 1; } + modprobe nvidia_modeset || { echo "FATAL: failed to load nvidia_modeset kernel module" >&2; exit 1; } + ts "NVIDIA kernel modules loaded" + + if ! 
nvidia-smi > /dev/null 2>&1; then + echo "FATAL: GPU_ENABLED=true but nvidia-smi failed — GPU not visible to guest" >&2 + echo "Check: VFIO passthrough, IOMMU groups, guest kernel modules" >&2 + exit 1 + fi + ts "nvidia-smi: $(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)" + + if command -v nvidia-container-runtime >/dev/null 2>&1; then + ts "nvidia-container-runtime: $(command -v nvidia-container-runtime)" + else + echo "FATAL: nvidia-container-runtime not found — GPU pods will fail" >&2 + exit 1 + fi +fi + # ── Deploy bundled manifests (cold boot only) ─────────────────────────── # On pre-initialized rootfs, manifests are already in place from the # build-time k3s boot. Skip this entirely for fast startup. @@ -411,6 +471,29 @@ else ts "skipping manifest deploy (pre-initialized)" fi +# ── GPU manifests (device plugin, runtime class) ───────────────────── +# Deployed on every boot (not just cold boot) so the device plugin is +# always present when GPU_ENABLED=true. Mirrors cluster-entrypoint.sh. +if [ "${GPU_ENABLED:-false}" = "true" ]; then + GPU_MANIFESTS="/opt/openshell/gpu-manifests" + if [ ! -d "$GPU_MANIFESTS" ]; then + echo "FATAL: GPU_ENABLED=true but GPU manifests directory missing: $GPU_MANIFESTS" >&2 + exit 1 + fi + mkdir -p "$K3S_MANIFESTS" + _gpu_manifest_deployed=false + for manifest in "$GPU_MANIFESTS"/*.yaml; do + [ -f "$manifest" ] || continue + _gpu_manifest_deployed=true + cp "$manifest" "$K3S_MANIFESTS/" + ts "deployed GPU manifest: $(basename "$manifest")" + done + if [ "$_gpu_manifest_deployed" = false ]; then + echo "FATAL: GPU_ENABLED=true but no YAML manifests found in $GPU_MANIFESTS" >&2 + exit 1 + fi +fi + # Patch manifests for VM deployment constraints. 
HELMCHART="$K3S_MANIFESTS/openshell-helmchart.yaml" if [ -f "$HELMCHART" ]; then @@ -737,7 +820,7 @@ K3S_ARGS=( --node-ip="$NODE_IP" --kube-apiserver-arg=bind-address=0.0.0.0 --resolv-conf=/etc/resolv.conf - --tls-san=localhost,127.0.0.1,10.0.2.15,192.168.127.2 + --tls-san="localhost,127.0.0.1,10.0.2.15,192.168.127.2,$NODE_IP" --flannel-backend=none --snapshotter=overlayfs --kube-proxy-arg=proxy-mode=nftables diff --git a/crates/openshell-vm/src/backend/cloud_hypervisor.rs b/crates/openshell-vm/src/backend/cloud_hypervisor.rs new file mode 100644 index 000000000..869b1747d --- /dev/null +++ b/crates/openshell-vm/src/backend/cloud_hypervisor.rs @@ -0,0 +1,1476 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! cloud-hypervisor backend for GPU passthrough VMs. +//! +//! Uses the cloud-hypervisor REST API over a Unix socket to manage VMs +//! with VFIO device passthrough. This backend is Linux-only and requires +//! a separate kernel image (`vmlinux`) and `virtiofsd` for the root +//! filesystem. + +use std::io::{Read, Write}; +use std::os::unix::net::UnixStream; +use std::path::{Path, PathBuf}; +use std::time::{Duration, Instant}; + +use super::VmBackend; +use crate::exec::{ + VM_EXEC_VSOCK_PORT, clear_vm_runtime_state, vm_exec_socket_path, write_vm_runtime_state, +}; +use crate::{NetBackend, VmConfig, VmError, vm_rootfs_key}; + +/// cloud-hypervisor hypervisor backend for GPU passthrough. +pub struct CloudHypervisorBackend { + /// Path to the cloud-hypervisor binary. + chv_binary: PathBuf, + /// Path to the vmlinux kernel image. + vmlinux: PathBuf, + /// Path to the virtiofsd binary. + virtiofsd: PathBuf, +} + +impl CloudHypervisorBackend { + /// Create a new cloud-hypervisor backend, validating required binaries. 
+    pub fn new() -> Result<Self, VmError> {
+        let runtime_dir = crate::configured_runtime_dir()?;
+
+        let chv_binary = runtime_dir.join("cloud-hypervisor");
+        if !chv_binary.is_file() {
+            return Err(VmError::BinaryNotFound {
+                path: chv_binary.display().to_string(),
+                hint: "GPU passthrough requires cloud-hypervisor. Run the GPU build pipeline or set OPENSHELL_VM_RUNTIME_DIR".to_string(),
+            });
+        }
+
+        let vmlinux = runtime_dir.join("vmlinux");
+        if !vmlinux.is_file() {
+            return Err(VmError::BinaryNotFound {
+                path: vmlinux.display().to_string(),
+                hint: "GPU passthrough requires a vmlinux kernel. Run the GPU build pipeline"
+                    .to_string(),
+            });
+        }
+
+        let virtiofsd = runtime_dir.join("virtiofsd");
+        if !virtiofsd.is_file() {
+            return Err(VmError::BinaryNotFound {
+                path: virtiofsd.display().to_string(),
+                hint: "GPU passthrough requires virtiofsd. Run the GPU build pipeline".to_string(),
+            });
+        }
+
+        Ok(Self {
+            chv_binary,
+            vmlinux,
+            virtiofsd,
+        })
+    }
+}
+
+impl VmBackend for CloudHypervisorBackend {
+    fn launch(&self, config: &VmConfig) -> Result<i32, VmError> {
+        launch_cloud_hypervisor(self, config)
+    }
+}
+
+// ── REST API client ─────────────────────────────────────────────────────
+
+/// Send a raw HTTP/1.1 request over a Unix socket and return the response body.
+///
+/// Parses the response headers to determine Content-Length so we read exactly
+/// the right number of bytes without relying on EOF or Connection: close.
+fn http_request_unix(
+    socket_path: &Path,
+    method: &str,
+    path: &str,
+    body: Option<&str>,
+) -> Result<(u16, String), String> {
+    use std::io::BufRead;
+
+    let stream = UnixStream::connect(socket_path)
+        .map_err(|e| format!("connect to cloud-hypervisor API: {e}"))?;
+
+    stream
+        .set_read_timeout(Some(Duration::from_secs(30)))
+        .map_err(|e| format!("set read timeout: {e}"))?;
+
+    let request = if let Some(body) = body {
+        format!(
+            "{method} {path} HTTP/1.1\r\n\
+             Host: localhost\r\n\
+             Content-Type: application/json\r\n\
+             Content-Length: {}\r\n\
+             \r\n\
+             {body}",
+            body.len(),
+        )
+    } else {
+        format!(
+            "{method} {path} HTTP/1.1\r\n\
+             Host: localhost\r\n\
+             \r\n"
+        )
+    };
+
+    {
+        let mut writer = &stream;
+        writer
+            .write_all(request.as_bytes())
+            .map_err(|e| format!("write to cloud-hypervisor API: {e}"))?;
+    }
+
+    let mut reader = std::io::BufReader::new(&stream);
+
+    // Read status line
+    let mut status_line = String::new();
+    reader
+        .read_line(&mut status_line)
+        .map_err(|e| format!("read status line: {e}"))?;
+
+    let status_code = status_line
+        .split_whitespace()
+        .nth(1)
+        .and_then(|code| code.parse::<u16>().ok())
+        .unwrap_or(0);
+
+    // Read headers to find Content-Length
+    let mut content_length: usize = 0;
+    loop {
+        let mut header_line = String::new();
+        reader
+            .read_line(&mut header_line)
+            .map_err(|e| format!("read header: {e}"))?;
+        if header_line.trim().is_empty() {
+            break;
+        }
+        if let Some(val) = header_line
+            .strip_prefix("Content-Length:")
+            .or_else(|| header_line.strip_prefix("content-length:"))
+        {
+            if let Ok(len) = val.trim().parse::<usize>() {
+                content_length = len;
+            }
+        }
+    }
+
+    // Read body based on Content-Length
+    let mut body_bytes = vec![0u8; content_length];
+    if content_length > 0 {
+        reader
+            .read_exact(&mut body_bytes)
+            .map_err(|e| format!("read body ({content_length} bytes): {e}"))?;
+    }
+
+    let body_str = String::from_utf8_lossy(&body_bytes).to_string();
+    Ok((status_code, body_str))
+}
+
+/// Wait for a Unix 
socket to appear on the filesystem. +fn wait_for_socket(socket_path: &Path, label: &str, timeout: Duration) -> Result<(), VmError> { + let deadline = Instant::now() + timeout; + let mut interval = Duration::from_millis(10); + + while !socket_path.exists() { + if Instant::now() >= deadline { + return Err(VmError::HostSetup(format!( + "{label} socket did not appear within {}s: {}", + timeout.as_secs(), + socket_path.display(), + ))); + } + std::thread::sleep(interval); + interval = (interval * 2).min(Duration::from_millis(200)); + } + + Ok(()) +} + +/// Create the VM via the cloud-hypervisor REST API. +fn api_vm_create(socket_path: &Path, payload: &str) -> Result<(), VmError> { + let (status, body) = http_request_unix(socket_path, "PUT", "/api/v1/vm.create", Some(payload)) + .map_err(|e| VmError::HostSetup(format!("vm.create: {e}")))?; + + if status >= 200 && status < 300 { + Ok(()) + } else { + Err(VmError::HostSetup(format!( + "vm.create returned HTTP {status}: {body}" + ))) + } +} + +/// Boot the VM. +fn api_vm_boot(socket_path: &Path) -> Result<(), VmError> { + let (status, body) = http_request_unix(socket_path, "PUT", "/api/v1/vm.boot", None) + .map_err(|e| VmError::HostSetup(format!("vm.boot: {e}")))?; + + if status >= 200 && status < 300 { + Ok(()) + } else { + Err(VmError::HostSetup(format!( + "vm.boot returned HTTP {status}: {body}" + ))) + } +} + +/// Request a graceful shutdown. +fn api_vm_shutdown(socket_path: &Path) -> Result<(), VmError> { + let (status, body) = http_request_unix(socket_path, "PUT", "/api/v1/vm.shutdown", None) + .map_err(|e| VmError::HostSetup(format!("vm.shutdown: {e}")))?; + + if status >= 200 && status < 300 { + Ok(()) + } else { + Err(VmError::HostSetup(format!( + "vm.shutdown returned HTTP {status}: {body}" + ))) + } +} + +/// Query VM info/status. 
+#[allow(dead_code)]
+fn api_vm_info(socket_path: &Path) -> Result<String, VmError> {
+    let (status, body) = http_request_unix(socket_path, "GET", "/api/v1/vm.info", None)
+        .map_err(|e| VmError::HostSetup(format!("vm.info: {e}")))?;
+
+    if status >= 200 && status < 300 {
+        Ok(body)
+    } else {
+        Err(VmError::HostSetup(format!(
+            "vm.info returned HTTP {status}: {body}"
+        )))
+    }
+}
+
+/// Delete the VM.
+#[allow(dead_code)]
+fn api_vm_delete(socket_path: &Path) -> Result<(), VmError> {
+    let (status, body) = http_request_unix(socket_path, "PUT", "/api/v1/vm.delete", None)
+        .map_err(|e| VmError::HostSetup(format!("vm.delete: {e}")))?;
+
+    if status >= 200 && status < 300 {
+        Ok(())
+    } else {
+        Err(VmError::HostSetup(format!(
+            "vm.delete returned HTTP {status}: {body}"
+        )))
+    }
+}
+
+// ── Build the VM create payload ─────────────────────────────────────────
+
+fn build_vm_create_payload(
+    backend: &CloudHypervisorBackend,
+    config: &VmConfig,
+    effective_exec_path: &str,
+    vfio_device: Option<&str>,
+    virtiofsd_sock: &Path,
+    state_disk_path: Option<&Path>,
+    use_tap_net: bool,
+    vsock_sock: &Path,
+    console_log: &Path,
+) -> Result<String, VmError> {
+    let mem_bytes = u64::from(config.mem_mib) * 1024 * 1024;
+
+    let mut cmdline_parts = vec![
+        "console=ttyS0".to_string(),
+        "root=rootfs".to_string(),
+        "rootfstype=virtiofs".to_string(),
+        "rw".to_string(),
+        "panic=-1".to_string(),
+        format!("init={effective_exec_path}"),
+    ];
+
+    // Pass environment variables via kernel cmdline. Unrecognised kernel
+    // parameters are forwarded to init as env vars. Only simple KEY=VALUE
+    // pairs without spaces are safe (cmdline is space-delimited, ~4096 B).
+ if config.gpu_enabled && config.vfio_device.is_some() { + cmdline_parts.push("GPU_ENABLED=true".to_string()); + } + if let Some(state_disk) = &config.state_disk { + cmdline_parts.push(format!( + "OPENSHELL_VM_STATE_DISK_DEVICE={}", + state_disk.guest_device + )); + } + for var in &config.env { + if var.contains('=') && !var.contains(' ') && !var.contains('"') { + cmdline_parts.push(var.clone()); + } + } + + if use_tap_net { + cmdline_parts.push(format!("VM_NET_IP={CHV_TAP_GUEST_IP}")); + cmdline_parts.push(format!("VM_NET_GW={CHV_TAP_HOST_IP}")); + cmdline_parts.push(format!("VM_NET_DNS={}", host_dns_server())); + } + + let cmdline = cmdline_parts.join(" "); + + let mut payload = serde_json::json!({ + "cpus": { + "boot_vcpus": config.vcpus, + "max_vcpus": config.vcpus, + }, + "memory": { + "size": mem_bytes, + "shared": true, + }, + "payload": { + "kernel": backend.vmlinux.display().to_string(), + "cmdline": cmdline, + }, + "fs": [{ + "tag": "rootfs", + "socket": virtiofsd_sock.display().to_string(), + "num_queues": 1, + "queue_size": 1024, + }], + "vsock": { + "cid": VSOCK_GUEST_CID, + "socket": vsock_sock.display().to_string(), + }, + "serial": { + "mode": "File", + "file": console_log.display().to_string(), + }, + "console": { + "mode": "Off", + }, + }); + + if let Some(disk_path) = state_disk_path { + payload["disks"] = serde_json::json!([{ + "path": disk_path.display().to_string(), + "readonly": false, + }]); + } + + // Cloud-hypervisor uses TAP devices for networking (requires root or + // CAP_NET_ADMIN). The gvproxy QEMU-style socket protocol is not + // compatible with CHV's NetConfig. GPU passthrough already requires + // elevated privileges, so TAP access is expected. 
+ if use_tap_net { + payload["net"] = serde_json::json!([{ + "mac": "5a:94:ef:e4:0c:ee", + "ip": CHV_TAP_HOST_IP, + "mask": CHV_TAP_NETMASK, + }]); + } + + if let Some(vfio_path) = vfio_device { + payload["devices"] = serde_json::json!([{ + "path": format!("/sys/bus/pci/devices/{vfio_path}/"), + }]); + } + + serde_json::to_string(&payload) + .map_err(|e| VmError::HostSetup(format!("serialize vm.create payload: {e}"))) +} + +// ── Launch ────────────────────────────────────────────────────────────── + +#[allow(clippy::similar_names)] +fn launch_cloud_hypervisor( + backend: &CloudHypervisorBackend, + config: &VmConfig, +) -> Result { + let launch_start = Instant::now(); + + let run_dir = config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .to_path_buf(); + let rootfs_key = vm_rootfs_key(&config.rootfs); + + // Unix domain sockets are limited to 108 characters (SUN_LEN). + // Instance rootfs paths can be deeply nested, so place sockets + // under /tmp to stay within the limit. + let sock_dir = PathBuf::from(format!("/tmp/ovm-chv-{}", std::process::id())); + std::fs::create_dir_all(&sock_dir).map_err(|e| { + VmError::HostSetup(format!("create socket dir {}: {e}", sock_dir.display())) + })?; + + let api_sock_path = sock_dir.join("api.sock"); + let vsock_sock_path = sock_dir.join("vsock.sock"); + let virtiofsd_sock_path = sock_dir.join("virtiofsd.sock"); + let console_log = config + .console_output + .clone() + .unwrap_or_else(|| run_dir.join(format!("{rootfs_key}-console.log"))); + + // Clean stale sockets + let _ = std::fs::remove_file(&api_sock_path); + let _ = std::fs::remove_file(&vsock_sock_path); + let _ = std::fs::remove_file(&virtiofsd_sock_path); + + // Start virtiofsd for the rootfs + eprintln!("Starting virtiofsd: {}", backend.virtiofsd.display()); + let virtiofsd_log = run_dir.join(format!("{rootfs_key}-virtiofsd.log")); + let virtiofsd_log_file = std::fs::File::create(&virtiofsd_log) + .map_err(|e| VmError::Fork(format!("create virtiofsd log: 
{e}")))?; + + let mut virtiofsd_child = std::process::Command::new(&backend.virtiofsd) + .arg(format!("--socket-path={}", virtiofsd_sock_path.display())) + .arg(format!("--shared-dir={}", config.rootfs.display())) + .arg("--cache=always") + .stdout(std::process::Stdio::null()) + .stderr(virtiofsd_log_file) + .spawn() + .map_err(|e| VmError::Fork(format!("start virtiofsd: {e}")))?; + + eprintln!( + "virtiofsd started (pid {}) [{:.1}s]", + virtiofsd_child.id(), + launch_start.elapsed().as_secs_f64() + ); + + // Wait for virtiofsd socket + wait_for_socket(&virtiofsd_sock_path, "virtiofsd", Duration::from_secs(5))?; + + // CHV uses TAP networking (requires root/CAP_NET_ADMIN). The gvproxy + // QEMU-style socket protocol is not compatible with cloud-hypervisor's + // NetConfig. GPU passthrough already requires elevated privileges. + let use_tap_net = !matches!(config.net, NetBackend::None); + + // For --exec mode: wrap the command so the VM powers off after it exits. + // Unlike libkrun (which exits when init terminates), cloud-hypervisor + // keeps running after PID 1 exits (kernel panics). A wrapper init script + // runs the command then calls `poweroff -f` for a clean ACPI shutdown. 
+ let is_exec_mode = config.exec_path != "/srv/openshell-vm-init.sh"; + let wrapper_path = config.rootfs.join("tmp/chv-exec-wrapper.sh"); + let effective_exec_path; + if is_exec_mode { + let args_str = config + .args + .iter() + .map(|a| shell_escape(a)) + .collect::>() + .join(" "); + + let env_str = config + .env + .iter() + .map(|v| format!("export {}", shell_escape(v))) + .collect::>() + .join("\n"); + + let wrapper = format!( + "#!/bin/sh\n\ + mount -t proc proc /proc 2>/dev/null\n\ + mount -t sysfs sysfs /sys 2>/dev/null\n\ + mount -t devtmpfs devtmpfs /dev 2>/dev/null\n\ + {env_str}\n\ + cd {workdir}\n\ + {exec} {args}\n\ + RC=$?\n\ + # Trigger ACPI power-off so cloud-hypervisor exits cleanly.\n\ + # The rootfs may not have a `poweroff` binary, so try multiple methods.\n\ + if command -v poweroff >/dev/null 2>&1; then\n\ + poweroff -f\n\ + elif [ -x /usr/bin/busybox ]; then\n\ + /usr/bin/busybox poweroff -f\n\ + else\n\ + echo o > /proc/sysrq-trigger\n\ + fi\n\ + exit $RC\n", + env_str = env_str, + workdir = shell_escape(&config.workdir), + exec = shell_escape(&config.exec_path), + args = args_str, + ); + + if let Some(parent) = wrapper_path.parent() { + std::fs::create_dir_all(parent) + .map_err(|e| VmError::HostSetup(format!("create wrapper dir: {e}")))?; + } + std::fs::write(&wrapper_path, &wrapper) + .map_err(|e| VmError::HostSetup(format!("write exec wrapper: {e}")))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let _ = std::fs::set_permissions(&wrapper_path, std::fs::Permissions::from_mode(0o755)); + } + effective_exec_path = "/tmp/chv-exec-wrapper.sh".to_string(); + } else { + effective_exec_path = config.exec_path.clone(); + } + + // Start cloud-hypervisor process + eprintln!( + "Starting cloud-hypervisor: {}", + backend.chv_binary.display() + ); + + let chv_log = run_dir.join(format!("{rootfs_key}-cloud-hypervisor.log")); + let chv_log_file = std::fs::File::create(&chv_log) + .map_err(|e| VmError::Fork(format!("create 
cloud-hypervisor log: {e}")))?; + + let mut chv_child = std::process::Command::new(&backend.chv_binary) + .arg("--api-socket") + .arg(&api_sock_path) + .stdout(std::process::Stdio::null()) + .stderr(chv_log_file) + .spawn() + .map_err(|e| VmError::Fork(format!("start cloud-hypervisor: {e}")))?; + + let chv_pid = chv_child.id() as i32; + eprintln!( + "cloud-hypervisor started (pid {chv_pid}) [{:.1}s]", + launch_start.elapsed().as_secs_f64() + ); + + // Wait for API socket + wait_for_socket(&api_sock_path, "cloud-hypervisor", Duration::from_secs(10))?; + + // Build and send VM create payload + let state_disk_path = config.state_disk.as_ref().map(|sd| sd.path.as_path()); + let payload = build_vm_create_payload( + backend, + config, + &effective_exec_path, + config.vfio_device.as_deref(), + &virtiofsd_sock_path, + state_disk_path, + use_tap_net, + &vsock_sock_path, + &console_log, + )?; + + api_vm_create(&api_sock_path, &payload)?; + eprintln!("VM created [{:.1}s]", launch_start.elapsed().as_secs_f64()); + + api_vm_boot(&api_sock_path)?; + let boot_start = Instant::now(); + eprintln!("VM booting [{:.1}s]", launch_start.elapsed().as_secs_f64()); + + // Set up host-side networking for TAP (NAT, IP forwarding, masquerade) + // so the guest can reach the internet through the host. 
+    let mut original_ip_forward: Option<String> = None;
+    if use_tap_net {
+        match setup_chv_host_networking() {
+            Ok(orig) => original_ip_forward = Some(orig),
+            Err(e) => {
+                eprintln!("WARNING: host networking setup failed: {e}");
+                eprintln!(" The VM may not have internet access.");
+            }
+        }
+    }
+
+    // Write runtime state (vsock_bridge: true — CHV uses AF_VSOCK bridging)
+    if config.exec_path == "/srv/openshell-vm-init.sh" {
+        if let Err(err) = write_vm_runtime_state(&config.rootfs, chv_pid, &console_log, None, true)
+        {
+            let _ = api_vm_shutdown(&api_sock_path);
+            let _ = chv_child.kill();
+            let _ = chv_child.wait();
+            let _ = virtiofsd_child.kill();
+            let _ = virtiofsd_child.wait();
+            if let Some(ref orig) = original_ip_forward {
+                teardown_chv_host_networking(orig);
+            }
+            clear_vm_runtime_state(&config.rootfs);
+            return Err(err);
+        }
+    }
+
+    // CHV TAP networking doesn't provide built-in port forwarding like
+    // gvproxy. Start a TCP proxy for each port mapping so the host can
+    // reach guest services (e.g., the gateway health check on :30051).
+    if use_tap_net {
+        for pm in &config.port_map {
+            let parts: Vec<&str> = pm.split(':').collect();
+            if parts.len() == 2 {
+                if let (Ok(hp), Ok(gp)) = (parts[0].parse::<u16>(), parts[1].parse::<u16>()) {
+                    start_tcp_port_forwarder(hp, CHV_TAP_GUEST_IP, gp)?;
+                }
+            }
+        }
+    }
+
+    for pm in &config.port_map {
+        let host_port = pm.split(':').next().unwrap_or(pm);
+        eprintln!(" port {pm} -> http://localhost:{host_port}");
+    }
+    eprintln!("Console output: {}", console_log.display());
+
+    // Start vsock exec bridge (exec Unix socket → CHV vsock Unix socket).
+    // The bridge allows `openshell-vm exec` and bootstrap to communicate
+    // with the guest exec agent over the standard exec socket path.
+    let exec_socket = vm_exec_socket_path(&config.rootfs);
+    start_vsock_exec_bridge(&exec_socket, &vsock_sock_path, VM_EXEC_VSOCK_PORT)?;
+
+    // Gateway bootstrap and health check (mirrors libkrun backend).
+ if config.exec_path == "/srv/openshell-vm-init.sh" && !config.port_map.is_empty() { + let gateway_port = crate::gateway_host_port(config); + crate::bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port)?; + crate::health::wait_for_gateway_ready(gateway_port, &config.gateway_name)?; + } + + eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); + eprintln!("Press Ctrl+C to stop."); + + // Signal forwarding: SIGINT/SIGTERM -> graceful shutdown + unsafe { + libc::signal( + libc::SIGINT, + crate::forward_signal as *const () as libc::sighandler_t, + ); + libc::signal( + libc::SIGTERM, + crate::forward_signal as *const () as libc::sighandler_t, + ); + crate::CHILD_PID.store(chv_pid, std::sync::atomic::Ordering::Relaxed); + } + + // Wait for cloud-hypervisor to exit + let status = chv_child + .wait() + .map_err(|e| VmError::HostSetup(format!("wait for cloud-hypervisor: {e}")))?; + + // Clean up host networking rules + if let Some(ref orig) = original_ip_forward { + teardown_chv_host_networking(orig); + } + + // Cleanup + if config.exec_path == "/srv/openshell-vm-init.sh" { + clear_vm_runtime_state(&config.rootfs); + } + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + eprintln!("virtiofsd stopped"); + + // Clean up sockets and wrapper + let _ = std::fs::remove_dir_all(&sock_dir); + let _ = std::fs::remove_file(&exec_socket); + if is_exec_mode { + let _ = std::fs::remove_file(&wrapper_path); + } + + let code = status.code().unwrap_or(1); + eprintln!("VM exited with code {code}"); + Ok(code) +} + +/// Escape a string for use in a shell script. Wraps in single quotes. 
+fn shell_escape(s: &str) -> String {
+    if s.is_empty() {
+        return "''".to_string();
+    }
+    if !s.contains('\'') && !s.contains(' ') && !s.contains('"') && !s.contains('\\') {
+        return s.to_string();
+    }
+    format!("'{}'", s.replace('\'', "'\\''"))
+}
+
+// ── Vsock exec bridge ───────────────────────────────────────────────────
+
+/// Guest CID assigned in the cloud-hypervisor vsock config.
+const VSOCK_GUEST_CID: u32 = 3;
+
+// ── CHV TAP networking constants ────────────────────────────────────────
+// cloud-hypervisor defaults to 192.168.249.1/24 on the host side of the
+// TAP device. The guest uses .2 with the host as its gateway.
+
+const CHV_TAP_HOST_IP: &str = "192.168.249.1";
+const CHV_TAP_GUEST_IP: &str = "192.168.249.2";
+const CHV_TAP_SUBNET: &str = "192.168.249.0/24";
+const CHV_TAP_NETMASK: &str = "255.255.255.0";
+
+/// Start a background bridge: exec Unix socket → CHV vsock Unix socket.
+///
+/// cloud-hypervisor exposes guest vsock via a host-side Unix socket with a
+/// text protocol: connect to the socket, send `CONNECT <port>\n`, read
+/// back `OK <port>\n`, then the stream is a raw bidirectional channel to
+/// the guest vsock port. This is different from kernel `AF_VSOCK` (which
+/// `vhost-vsock` uses) — CHV manages its own transport.
+///
+/// This bridge creates a Unix socket at `exec_socket` and, for each
+/// incoming connection, opens a connection to the CHV vsock socket,
+/// performs the CONNECT handshake, and forwards data bidirectionally.
+fn start_vsock_exec_bridge( + exec_socket: &Path, + chv_vsock_socket: &Path, + guest_port: u32, +) -> Result<(), VmError> { + use std::os::unix::net::UnixListener; + + if let Some(parent) = exec_socket.parent() { + std::fs::create_dir_all(parent).map_err(|e| { + VmError::HostSetup(format!("create exec bridge dir {}: {e}", parent.display())) + })?; + } + let _ = std::fs::remove_file(exec_socket); + + let listener = UnixListener::bind(exec_socket).map_err(|e| { + VmError::HostSetup(format!( + "bind vsock exec bridge {}: {e}", + exec_socket.display() + )) + })?; + + let chv_vsock = chv_vsock_socket.to_path_buf(); + eprintln!( + "vsock exec bridge: {} → {} port {}", + exec_socket.display(), + chv_vsock.display(), + guest_port, + ); + + std::thread::spawn(move || { + vsock_bridge_accept_loop(listener, &chv_vsock, guest_port); + }); + + Ok(()) +} + +/// Accept loop for the vsock bridge background thread. +/// +/// "CONNECT rejected" (empty response) is normal during boot — the guest +/// exec agent isn't listening yet. We keep retrying those indefinitely +/// since the bootstrap caller has its own 120s timeout. Only fatal errors +/// (socket gone = VM died) cause the bridge to give up. 
+fn vsock_bridge_accept_loop( + listener: std::os::unix::net::UnixListener, + chv_vsock_socket: &Path, + port: u32, +) { + let mut fatal_failures: u32 = 0; + let mut logged_transient = false; + + for stream in listener.incoming() { + let client = match stream { + Ok(s) => s, + Err(e) => { + eprintln!("vsock bridge: accept: {e}"); + continue; + } + }; + + match chv_vsock_connect(chv_vsock_socket, port) { + Ok(guest) => { + fatal_failures = 0; + bridge_bidirectional(client, guest); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + fatal_failures += 1; + if fatal_failures <= 2 { + eprintln!("vsock bridge: CHV socket gone (VM exited?): {e}"); + } + if fatal_failures >= 3 { + eprintln!("vsock bridge: CHV socket not found, stopping bridge"); + return; + } + } + Err(e) => { + if !logged_transient { + eprintln!( + "vsock bridge: guest not ready on port {port} ({e}), \ + will keep retrying..." + ); + logged_transient = true; + } + } + } + } +} + +/// Connect to a guest vsock port via cloud-hypervisor's Unix socket protocol. +/// +/// CHV exposes guest vsock through a host Unix socket. The protocol is: +/// 1. Connect to the CHV vsock Unix socket +/// 2. Send: `CONNECT \n` +/// 3. Read: `OK \n` on success +/// 4. 
The stream is now a raw bidirectional channel to the guest port
+fn chv_vsock_connect(chv_vsock_socket: &Path, port: u32) -> std::io::Result<UnixStream> {
+    let mut stream = UnixStream::connect(chv_vsock_socket)?;
+    stream.set_read_timeout(Some(Duration::from_secs(5)))?;
+    stream.set_write_timeout(Some(Duration::from_secs(5)))?;
+
+    let connect_msg = format!("CONNECT {port}\n");
+    stream.write_all(connect_msg.as_bytes())?;
+
+    let mut buf = [0u8; 64];
+    let n = stream.read(&mut buf)?;
+    let response = std::str::from_utf8(&buf[..n]).unwrap_or("");
+
+    if !response.starts_with("OK") {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::ConnectionRefused,
+            format!("CHV vsock CONNECT rejected: {}", response.trim()),
+        ));
+    }
+
+    stream.set_read_timeout(None)?;
+    stream.set_write_timeout(None)?;
+    Ok(stream)
+}
+
+/// Spawn two threads that copy data between two Unix streams.
+fn bridge_bidirectional(client: UnixStream, guest: UnixStream) {
+    let Ok(mut client_r) = client.try_clone() else {
+        return;
+    };
+    let mut client_w = client;
+    let Ok(mut guest_r) = guest.try_clone() else {
+        return;
+    };
+    let mut guest_w = guest;
+
+    std::thread::spawn(move || {
+        let _ = std::io::copy(&mut client_r, &mut guest_w);
+    });
+    std::thread::spawn(move || {
+        let _ = std::io::copy(&mut guest_r, &mut client_w);
+    });
+}
+
+// ── CHV host networking ─────────────────────────────────────────────────
+
+/// Parse a DNS server from resolv.conf content.
+///
+/// Returns the first non-`127.x.x.x` nameserver, or `8.8.8.8` if none found.
+/// Extracted from [`host_dns_server`] for testability.
+fn parse_dns_server(content: &str) -> String {
+    content
+        .lines()
+        .filter(|line| line.starts_with("nameserver"))
+        .filter_map(|line| line.split_whitespace().nth(1))
+        .find(|ip| !ip.starts_with("127."))
+        .map(String::from)
+        .unwrap_or_else(|| "8.8.8.8".to_string())
+}
+
+/// Read the host's primary DNS server.
+///
+/// Checks `/etc/resolv.conf` first. 
If every nameserver there is a loopback +/// address (e.g. systemd-resolved's `127.0.0.53`), falls back to the +/// upstream resolv.conf at `/run/systemd/resolve/resolv.conf` which +/// contains the real upstream nameservers. Final fallback is `8.8.8.8`. +fn host_dns_server() -> String { + for path in &["/etc/resolv.conf", "/run/systemd/resolve/resolv.conf"] { + if let Ok(content) = std::fs::read_to_string(path) { + let server = parse_dns_server(&content); + if server != "8.8.8.8" { + return server; + } + } + } + "8.8.8.8".to_string() +} + +/// Run a command, returning an error if it fails. +fn run_cmd(cmd: &str, args: &[&str]) -> Result<(), VmError> { + let output = std::process::Command::new(cmd) + .args(args) + .output() + .map_err(|e| VmError::HostSetup(format!("{cmd}: {e}")))?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(VmError::HostSetup(format!( + "{cmd} {}: {}", + args.join(" "), + stderr.trim() + ))); + } + + Ok(()) +} + +/// Set up host-side networking so the CHV guest can reach the internet. +/// +/// 1. Enable IP forwarding (saving the original value for teardown) +/// 2. MASQUERADE outbound traffic from the VM subnet +/// 3. Allow forwarding to/from the VM subnet +/// +/// Returns the original value of `ip_forward` so the caller can restore it. 
+fn setup_chv_host_networking() -> Result<String, VmError> {
+    let original_ip_forward = std::fs::read_to_string("/proc/sys/net/ipv4/ip_forward")
+        .map(|s| s.trim().to_string())
+        .unwrap_or_else(|_| "0".to_string());
+
+    std::fs::write("/proc/sys/net/ipv4/ip_forward", "1")
+        .map_err(|e| VmError::HostSetup(format!("enable IP forwarding: {e}")))?;
+
+    run_cmd(
+        "iptables",
+        &[
+            "-t",
+            "nat",
+            "-A",
+            "POSTROUTING",
+            "-s",
+            CHV_TAP_SUBNET,
+            "!",
+            "-d",
+            CHV_TAP_SUBNET,
+            "-j",
+            "MASQUERADE",
+        ],
+    )?;
+
+    run_cmd(
+        "iptables",
+        &["-A", "FORWARD", "-s", CHV_TAP_SUBNET, "-j", "ACCEPT"],
+    )?;
+
+    run_cmd(
+        "iptables",
+        &["-A", "FORWARD", "-d", CHV_TAP_SUBNET, "-j", "ACCEPT"],
+    )?;
+
+    eprintln!("host networking: IP forwarding + NAT masquerade for {CHV_TAP_SUBNET}");
+    Ok(original_ip_forward)
+}
+
+/// Remove the iptables rules added by [`setup_chv_host_networking`] and
+/// restore the original `ip_forward` sysctl value.
+fn teardown_chv_host_networking(original_ip_forward: &str) {
+    let _ = run_cmd(
+        "iptables",
+        &[
+            "-t",
+            "nat",
+            "-D",
+            "POSTROUTING",
+            "-s",
+            CHV_TAP_SUBNET,
+            "!",
+            "-d",
+            CHV_TAP_SUBNET,
+            "-j",
+            "MASQUERADE",
+        ],
+    );
+    let _ = run_cmd(
+        "iptables",
+        &["-D", "FORWARD", "-s", CHV_TAP_SUBNET, "-j", "ACCEPT"],
+    );
+    let _ = run_cmd(
+        "iptables",
+        &["-D", "FORWARD", "-d", CHV_TAP_SUBNET, "-j", "ACCEPT"],
+    );
+    if original_ip_forward != "1" {
+        let _ = std::fs::write("/proc/sys/net/ipv4/ip_forward", original_ip_forward);
+    }
+    eprintln!("host networking: cleaned up iptables rules, restored ip_forward={original_ip_forward}");
+}
+
+/// Start a background TCP proxy that forwards `127.0.0.1:{host_port}`
+/// to `{guest_ip}:{guest_port}`.
+///
+/// Each accepted connection spawns two threads for bidirectional copy.
+/// The listener thread runs until the process exits.
+fn start_tcp_port_forwarder( + host_port: u16, + guest_ip: &str, + guest_port: u16, +) -> Result<(), VmError> { + use std::net::{TcpListener, TcpStream}; + + let listener = TcpListener::bind(("127.0.0.1", host_port)) + .map_err(|e| VmError::HostSetup(format!("bind port forwarder on :{host_port}: {e}")))?; + + let guest_addr = format!("{guest_ip}:{guest_port}"); + eprintln!("port forwarder: 127.0.0.1:{host_port} -> {guest_addr}"); + + std::thread::spawn(move || { + for stream in listener.incoming() { + let client = match stream { + Ok(s) => s, + Err(_) => continue, + }; + + let addr = guest_addr.clone(); + std::thread::spawn(move || { + if let Ok(remote) = TcpStream::connect(&addr) { + forward_tcp_bidirectional(client, remote); + } + }); + } + }); + + Ok(()) +} + +/// Copy data bidirectionally between two TCP streams until either side closes. +fn forward_tcp_bidirectional(client: std::net::TcpStream, remote: std::net::TcpStream) { + let Ok(mut client_r) = client.try_clone() else { + return; + }; + let mut client_w = client; + let Ok(mut remote_r) = remote.try_clone() else { + return; + }; + let mut remote_w = remote; + + std::thread::spawn(move || { + let _ = std::io::copy(&mut client_r, &mut remote_w); + }); + std::thread::spawn(move || { + let _ = std::io::copy(&mut remote_r, &mut client_w); + }); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn http_request_format_with_body() { + let payload = r#"{"cpus":{"boot_vcpus":4}}"#; + let request = format!( + "PUT /api/v1/vm.create HTTP/1.1\r\n\ + Host: localhost\r\n\ + Content-Type: application/json\r\n\ + Content-Length: {}\r\n\ + Connection: close\r\n\ + \r\n\ + {payload}", + payload.len(), + ); + assert!(request.contains("Content-Length: 25")); + assert!(request.contains("boot_vcpus")); + } + + #[test] + fn http_request_format_without_body() { + let request = format!( + "GET /api/v1/vm.info HTTP/1.1\r\n\ + Host: localhost\r\n\ + Connection: close\r\n\ + \r\n" + ); + assert!(request.contains("GET 
/api/v1/vm.info")); + assert!(!request.contains("Content-Length")); + } + + #[test] + fn build_payload_includes_vfio_device() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 4, + mem_mib: 8192, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::None, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: true, + vfio_device: Some("0000:41:00.0".into()), + backend: crate::VmBackendChoice::CloudHypervisor, + }; + + let backend = CloudHypervisorBackend { + chv_binary: "/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + config.vfio_device.as_deref(), + Path::new("/tmp/virtiofsd.sock"), + None, + false, + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + assert!( + payload.contains("0000:41:00.0"), + "payload should contain VFIO device" + ); + assert!( + payload.contains("boot_vcpus"), + "payload should contain vcpus config" + ); + assert!( + payload.contains("GPU_ENABLED=true"), + "payload should contain GPU_ENABLED in cmdline" + ); + } + + #[test] + fn build_payload_without_vfio() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 2, + mem_mib: 4096, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::None, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: false, + vfio_device: None, + backend: crate::VmBackendChoice::Auto, + }; + + let backend = CloudHypervisorBackend { + chv_binary: 
"/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + false, + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + assert!( + !payload.contains("devices"), + "payload without VFIO should not have devices key" + ); + assert!( + !payload.contains("GPU_ENABLED"), + "payload should not contain GPU_ENABLED" + ); + } + + #[test] + fn build_payload_with_tap_net_includes_ip_and_cmdline() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 4, + mem_mib: 8192, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec!["30051:30051".into()], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::Gvproxy { + binary: "/usr/bin/gvproxy".into(), + }, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: true, + vfio_device: Some("0000:41:00.0".into()), + backend: crate::VmBackendChoice::CloudHypervisor, + }; + + let backend = CloudHypervisorBackend { + chv_binary: "/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + config.vfio_device.as_deref(), + Path::new("/tmp/virtiofsd.sock"), + None, + true, // use_tap_net + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + assert!( + payload.contains("192.168.249.1"), + "net should contain TAP host IP" + ); + assert!( + payload.contains("255.255.255.0"), + "net should contain TAP netmask" + ); + assert!( + payload.contains("VM_NET_IP=192.168.249.2"), + "cmdline should contain guest IP" + ); + assert!( + payload.contains("VM_NET_GW=192.168.249.1"), + 
"cmdline should contain gateway IP" + ); + assert!( + payload.contains("VM_NET_DNS="), + "cmdline should contain DNS server" + ); + } + + #[test] + fn build_payload_tap_net_false_omits_net_and_vm_net_vars() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 2, + mem_mib: 4096, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::None, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: false, + vfio_device: None, + backend: crate::VmBackendChoice::Auto, + }; + + let backend = CloudHypervisorBackend { + chv_binary: "/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + false, + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + assert!( + !payload.contains("\"net\""), + "no-tap payload should not contain net section" + ); + assert!( + !payload.contains("VM_NET_IP"), + "no-tap payload should not contain VM_NET_IP" + ); + assert!( + !payload.contains("VM_NET_GW"), + "no-tap payload should not contain VM_NET_GW" + ); + assert!( + !payload.contains("VM_NET_DNS"), + "no-tap payload should not contain VM_NET_DNS" + ); + } + + #[test] + fn build_payload_tap_net_has_correct_mac_ip_mask() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 2, + mem_mib: 4096, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::Gvproxy { + binary: "/usr/bin/gvproxy".into(), + }, + reset: false, + gateway_name: 
"test".into(), + state_disk: None, + gpu_enabled: false, + vfio_device: None, + backend: crate::VmBackendChoice::CloudHypervisor, + }; + + let backend = CloudHypervisorBackend { + chv_binary: "/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + true, + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + let json: serde_json::Value = serde_json::from_str(&payload).unwrap(); + let net = &json["net"][0]; + assert_eq!(net["mac"], "5a:94:ef:e4:0c:ee"); + assert_eq!(net["ip"], "192.168.249.1"); + assert_eq!(net["mask"], "255.255.255.0"); + } + + #[test] + fn build_payload_vfio_and_tap_net_coexist() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 4, + mem_mib: 8192, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::Gvproxy { + binary: "/usr/bin/gvproxy".into(), + }, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: true, + vfio_device: Some("0000:41:00.0".into()), + backend: crate::VmBackendChoice::CloudHypervisor, + }; + + let backend = CloudHypervisorBackend { + chv_binary: "/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + config.vfio_device.as_deref(), + Path::new("/tmp/virtiofsd.sock"), + None, + true, + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + let json: serde_json::Value = serde_json::from_str(&payload).unwrap(); + assert!( + json["devices"].is_array(), + "devices section should exist for VFIO" 
+ ); + assert!(json["net"].is_array(), "net section should exist for TAP"); + assert!( + json["devices"][0]["path"] + .as_str() + .unwrap() + .contains("0000:41:00.0"), + "VFIO device path should be present" + ); + assert_eq!(json["net"][0]["ip"], "192.168.249.1"); + } + + // ── parse_dns_server tests ────────────────────────────────────────── + + #[test] + fn parse_dns_server_returns_first_non_loopback() { + let content = "nameserver 10.0.0.1\nnameserver 8.8.8.8\n"; + assert_eq!(parse_dns_server(content), "10.0.0.1"); + } + + #[test] + fn parse_dns_server_skips_systemd_resolved() { + let content = "nameserver 127.0.0.53\nnameserver 1.1.1.1\n"; + assert_eq!(parse_dns_server(content), "1.1.1.1"); + } + + #[test] + fn parse_dns_server_skips_all_loopback_variants() { + let content = "nameserver 127.0.0.1\nnameserver 127.0.0.53\nnameserver 172.16.0.1\n"; + assert_eq!(parse_dns_server(content), "172.16.0.1"); + } + + #[test] + fn parse_dns_server_falls_back_when_only_loopback() { + let content = "nameserver 127.0.0.1\nnameserver 127.0.0.53\n"; + assert_eq!(parse_dns_server(content), "8.8.8.8"); + } + + #[test] + fn parse_dns_server_handles_empty_content() { + assert_eq!(parse_dns_server(""), "8.8.8.8"); + } + + #[test] + fn parse_dns_server_ignores_comments_and_other_lines() { + let content = "# Generated by NetworkManager\nsearch example.com\nnameserver 10.1.2.3\n"; + assert_eq!(parse_dns_server(content), "10.1.2.3"); + } + + // ── shell_escape tests ────────────────────────────────────────────── + + #[test] + fn shell_escape_empty_string() { + assert_eq!(shell_escape(""), "''"); + } + + #[test] + fn shell_escape_simple_string() { + assert_eq!(shell_escape("hello"), "hello"); + } + + #[test] + fn shell_escape_string_with_single_quotes() { + assert_eq!(shell_escape("it's"), "'it'\\''s'"); + } + + #[test] + fn shell_escape_string_with_spaces() { + assert_eq!(shell_escape("hello world"), "'hello world'"); + } + + #[test] + fn shell_escape_string_with_double_quotes() { + 
assert_eq!(shell_escape(r#"say "hi""#), r#"'say "hi"'"#); + } + + #[test] + fn shell_escape_string_with_backslash() { + assert_eq!(shell_escape("path\\to"), "'path\\to'"); + } +} diff --git a/crates/openshell-vm/src/backend/libkrun.rs b/crates/openshell-vm/src/backend/libkrun.rs new file mode 100644 index 000000000..1f077563a --- /dev/null +++ b/crates/openshell-vm/src/backend/libkrun.rs @@ -0,0 +1,469 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! libkrun hypervisor backend. +//! +//! Implements [`VmBackend`] using the libkrun C API for lightweight microVMs. +//! This is the original backend — on macOS it uses Hypervisor.framework, +//! on Linux it uses KVM. + +use std::ffi::CString; +use std::path::Path; +use std::time::Instant; + +use super::{VmBackend, setup_gvproxy_port_forwarding, start_gvproxy}; +use crate::exec::{clear_vm_runtime_state, write_vm_runtime_state}; +use crate::{ + GvproxyGuard, NetBackend, StateDiskConfig, VmConfig, VmError, VsockPort, bootstrap_gateway, + c_string_array, check, ffi, gateway_host_port, health, path_to_cstring, vm_rootfs_key, +}; + +/// libkrun hypervisor backend. +pub struct LibkrunBackend; + +impl VmBackend for LibkrunBackend { + fn launch(&self, config: &VmConfig) -> Result { + launch_libkrun(config) + } +} + +/// VM context wrapping the libkrun FFI context ID. 
+struct VmContext { + krun: &'static ffi::LibKrun, + ctx_id: u32, +} + +impl VmContext { + fn create(log_level: u32) -> Result { + let krun = ffi::libkrun()?; + unsafe { + check( + (krun.krun_init_log)( + ffi::KRUN_LOG_TARGET_DEFAULT, + crate::clamp_log_level(log_level), + ffi::KRUN_LOG_STYLE_AUTO, + ffi::KRUN_LOG_OPTION_NO_ENV, + ), + "krun_init_log", + )?; + } + + let ctx_id = unsafe { (krun.krun_create_ctx)() }; + if ctx_id < 0 { + return Err(VmError::Krun { + func: "krun_create_ctx", + code: ctx_id, + }); + } + + Ok(Self { + krun, + ctx_id: ctx_id as u32, + }) + } + + fn set_vm_config(&self, vcpus: u8, mem_mib: u32) -> Result<(), VmError> { + unsafe { + check( + (self.krun.krun_set_vm_config)(self.ctx_id, vcpus, mem_mib), + "krun_set_vm_config", + ) + } + } + + fn set_root(&self, rootfs: &Path) -> Result<(), VmError> { + let rootfs_c = path_to_cstring(rootfs)?; + unsafe { + check( + (self.krun.krun_set_root)(self.ctx_id, rootfs_c.as_ptr()), + "krun_set_root", + ) + } + } + + fn add_state_disk(&self, state_disk: &StateDiskConfig) -> Result<(), VmError> { + let Some(add_disk3) = self.krun.krun_add_disk3 else { + return Err(VmError::HostSetup( + "libkrun runtime does not expose krun_add_disk3; rebuild the VM runtime with block support" + .to_string(), + )); + }; + + let block_id_c = CString::new(state_disk.block_id.as_str())?; + let disk_path_c = path_to_cstring(&state_disk.path)?; + unsafe { + check( + add_disk3( + self.ctx_id, + block_id_c.as_ptr(), + disk_path_c.as_ptr(), + ffi::KRUN_DISK_FORMAT_RAW, + false, + false, + crate::state_disk_sync_mode(), + ), + "krun_add_disk3", + ) + } + } + + fn set_workdir(&self, workdir: &str) -> Result<(), VmError> { + let workdir_c = CString::new(workdir)?; + unsafe { + check( + (self.krun.krun_set_workdir)(self.ctx_id, workdir_c.as_ptr()), + "krun_set_workdir", + ) + } + } + + fn disable_implicit_vsock(&self) -> Result<(), VmError> { + unsafe { + check( + (self.krun.krun_disable_implicit_vsock)(self.ctx_id), + 
"krun_disable_implicit_vsock", + ) + } + } + + fn add_vsock(&self, tsi_features: u32) -> Result<(), VmError> { + unsafe { + check( + (self.krun.krun_add_vsock)(self.ctx_id, tsi_features), + "krun_add_vsock", + ) + } + } + + #[cfg(target_os = "macos")] + fn add_net_unixgram( + &self, + socket_path: &Path, + mac: &[u8; 6], + features: u32, + flags: u32, + ) -> Result<(), VmError> { + let sock_c = path_to_cstring(socket_path)?; + unsafe { + check( + (self.krun.krun_add_net_unixgram)( + self.ctx_id, + sock_c.as_ptr(), + -1, + mac.as_ptr(), + features, + flags, + ), + "krun_add_net_unixgram", + ) + } + } + + #[allow(dead_code)] + fn add_net_unixstream( + &self, + socket_path: &Path, + mac: &[u8; 6], + features: u32, + ) -> Result<(), VmError> { + let sock_c = path_to_cstring(socket_path)?; + unsafe { + check( + (self.krun.krun_add_net_unixstream)( + self.ctx_id, + sock_c.as_ptr(), + -1, + mac.as_ptr(), + features, + 0, + ), + "krun_add_net_unixstream", + ) + } + } + + fn set_port_map(&self, port_map: &[String]) -> Result<(), VmError> { + let port_strs: Vec<&str> = port_map.iter().map(String::as_str).collect(); + let (_port_owners, port_ptrs) = c_string_array(&port_strs)?; + unsafe { + check( + (self.krun.krun_set_port_map)(self.ctx_id, port_ptrs.as_ptr()), + "krun_set_port_map", + ) + } + } + + fn add_vsock_port(&self, port: &VsockPort) -> Result<(), VmError> { + let socket_c = path_to_cstring(&port.socket_path)?; + unsafe { + check( + (self.krun.krun_add_vsock_port2)( + self.ctx_id, + port.port, + socket_c.as_ptr(), + port.listen, + ), + "krun_add_vsock_port2", + ) + } + } + + fn set_console_output(&self, path: &Path) -> Result<(), VmError> { + let console_c = path_to_cstring(path)?; + unsafe { + check( + (self.krun.krun_set_console_output)(self.ctx_id, console_c.as_ptr()), + "krun_set_console_output", + ) + } + } + + fn set_exec(&self, exec_path: &str, args: &[String], env: &[String]) -> Result<(), VmError> { + let exec_c = CString::new(exec_path)?; + let argv_strs: 
Vec<&str> = args.iter().map(String::as_str).collect(); + let (_argv_owners, argv_ptrs) = c_string_array(&argv_strs)?; + let env_strs: Vec<&str> = env.iter().map(String::as_str).collect(); + let (_env_owners, env_ptrs) = c_string_array(&env_strs)?; + + unsafe { + check( + (self.krun.krun_set_exec)( + self.ctx_id, + exec_c.as_ptr(), + argv_ptrs.as_ptr(), + env_ptrs.as_ptr(), + ), + "krun_set_exec", + ) + } + } + + fn start_enter(&self) -> i32 { + unsafe { (self.krun.krun_start_enter)(self.ctx_id) } + } +} + +impl Drop for VmContext { + fn drop(&mut self) { + unsafe { + let ret = (self.krun.krun_free_ctx)(self.ctx_id); + if ret < 0 { + eprintln!( + "warning: krun_free_ctx({}) failed with code {ret}", + self.ctx_id + ); + } + } + } +} + +/// Launch a VM using the libkrun backend. +/// +/// This contains the VM-specific configuration, networking, fork/exec, +/// signal forwarding, bootstrap, and cleanup logic that was previously +/// inline in `lib.rs::launch()`. +#[allow(clippy::similar_names)] +fn launch_libkrun(config: &VmConfig) -> Result { + let launch_start = Instant::now(); + + let vm = VmContext::create(config.log_level)?; + vm.set_vm_config(config.vcpus, config.mem_mib)?; + vm.set_root(&config.rootfs)?; + if let Some(state_disk) = &config.state_disk { + vm.add_state_disk(state_disk)?; + } + vm.set_workdir(&config.workdir)?; + + let mut gvproxy_guard: Option = None; + let mut gvproxy_api_sock: Option = None; + + match &config.net { + NetBackend::Tsi => {} + NetBackend::None => { + vm.disable_implicit_vsock()?; + vm.add_vsock(0)?; + eprintln!("Networking: disabled (no TSI, no virtio-net)"); + } + NetBackend::Gvproxy { .. 
} => { + let gvproxy_setup = start_gvproxy(config, launch_start)?; + + vm.disable_implicit_vsock()?; + vm.add_vsock(0)?; + let mac: [u8; 6] = [0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee]; + + const NET_FEATURE_CSUM: u32 = 1 << 0; + const NET_FEATURE_GUEST_CSUM: u32 = 1 << 1; + const NET_FEATURE_GUEST_TSO4: u32 = 1 << 7; + const NET_FEATURE_GUEST_UFO: u32 = 1 << 10; + const NET_FEATURE_HOST_TSO4: u32 = 1 << 11; + const NET_FEATURE_HOST_UFO: u32 = 1 << 14; + const COMPAT_NET_FEATURES: u32 = NET_FEATURE_CSUM + | NET_FEATURE_GUEST_CSUM + | NET_FEATURE_GUEST_TSO4 + | NET_FEATURE_GUEST_UFO + | NET_FEATURE_HOST_TSO4 + | NET_FEATURE_HOST_UFO; + + #[cfg(target_os = "linux")] + vm.add_net_unixstream(&gvproxy_setup.net_sock, &mac, COMPAT_NET_FEATURES)?; + #[cfg(target_os = "macos")] + { + const NET_FLAG_VFKIT: u32 = 1 << 0; + vm.add_net_unixgram( + &gvproxy_setup.net_sock, + &mac, + COMPAT_NET_FEATURES, + NET_FLAG_VFKIT, + )?; + } + + eprintln!( + "Networking: gvproxy (virtio-net) [{:.1}s]", + launch_start.elapsed().as_secs_f64() + ); + gvproxy_api_sock = Some(gvproxy_setup.api_sock); + gvproxy_guard = Some(gvproxy_setup.guard); + } + } + + if !config.port_map.is_empty() && matches!(config.net, NetBackend::Tsi) { + vm.set_port_map(&config.port_map)?; + } + + for vsock_port in &config.vsock_ports { + if let Some(parent) = vsock_port.socket_path.parent() { + std::fs::create_dir_all(parent).map_err(|e| { + VmError::RuntimeState(format!("create vsock socket dir {}: {e}", parent.display())) + })?; + } + let _ = std::fs::remove_file(&vsock_port.socket_path); + vm.add_vsock_port(vsock_port)?; + } + + let console_log = config.console_output.clone().unwrap_or_else(|| { + config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .join(format!("{}-console.log", vm_rootfs_key(&config.rootfs))) + }); + vm.set_console_output(&console_log)?; + + let mut env: Vec = if config.env.is_empty() { + vec![ + "HOME=/root", + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + 
"TERM=xterm", + ] + .into_iter() + .map(ToOwned::to_owned) + .collect() + } else { + config.env.clone() + }; + if let Some(state_disk) = &config.state_disk + && !env + .iter() + .any(|entry| entry.starts_with("OPENSHELL_VM_STATE_DISK_DEVICE=")) + { + env.push(format!( + "OPENSHELL_VM_STATE_DISK_DEVICE={}", + state_disk.guest_device + )); + } + if config.gpu_enabled { + env.push("GPU_ENABLED=true".to_string()); + } + vm.set_exec(&config.exec_path, &config.args, &env)?; + + // Fork and enter the VM + let boot_start = Instant::now(); + eprintln!("Booting microVM..."); + + let pid = unsafe { libc::fork() }; + match pid { + -1 => Err(VmError::Fork(std::io::Error::last_os_error().to_string())), + 0 => { + let ret = vm.start_enter(); + eprintln!("krun_start_enter failed: {ret}"); + std::process::exit(1); + } + _ => { + if config.exec_path == "/srv/openshell-vm-init.sh" { + let gvproxy_pid = gvproxy_guard.as_ref().and_then(GvproxyGuard::id); + if let Err(err) = + write_vm_runtime_state(&config.rootfs, pid, &console_log, gvproxy_pid, false) + { + unsafe { + libc::kill(pid, libc::SIGTERM); + } + drop(gvproxy_guard); + clear_vm_runtime_state(&config.rootfs); + return Err(err); + } + } + eprintln!( + "VM started (child pid {pid}) [{:.1}s]", + boot_start.elapsed().as_secs_f64() + ); + for pm in &config.port_map { + let host_port = pm.split(':').next().unwrap_or(pm); + eprintln!(" port {pm} -> http://localhost:{host_port}"); + } + eprintln!("Console output: {}", console_log.display()); + + if let Some(ref api_sock) = gvproxy_api_sock { + setup_gvproxy_port_forwarding(api_sock, &config.port_map)?; + } + + if config.exec_path == "/srv/openshell-vm-init.sh" && !config.port_map.is_empty() { + let gateway_port = gateway_host_port(config); + bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port)?; + health::wait_for_gateway_ready(gateway_port, &config.gateway_name)?; + } + + eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); + eprintln!("Press Ctrl+C 
to stop."); + + unsafe { + libc::signal( + libc::SIGINT, + crate::forward_signal as *const () as libc::sighandler_t, + ); + libc::signal( + libc::SIGTERM, + crate::forward_signal as *const () as libc::sighandler_t, + ); + crate::CHILD_PID.store(pid, std::sync::atomic::Ordering::Relaxed); + } + + let mut status: libc::c_int = 0; + unsafe { + libc::waitpid(pid, &raw mut status, 0); + } + + if config.exec_path == "/srv/openshell-vm-init.sh" { + clear_vm_runtime_state(&config.rootfs); + } + if let Some(mut guard) = gvproxy_guard + && let Some(mut child) = guard.disarm() + { + let _ = child.kill(); + let _ = child.wait(); + eprintln!("gvproxy stopped"); + } + + if libc::WIFEXITED(status) { + let code = libc::WEXITSTATUS(status); + eprintln!("VM exited with code {code}"); + return Ok(code); + } else if libc::WIFSIGNALED(status) { + let sig = libc::WTERMSIG(status); + eprintln!("VM killed by signal {sig}"); + return Ok(128 + sig); + } + + Ok(status) + } + } +} diff --git a/crates/openshell-vm/src/backend/mod.rs b/crates/openshell-vm/src/backend/mod.rs new file mode 100644 index 000000000..9c2167fc5 --- /dev/null +++ b/crates/openshell-vm/src/backend/mod.rs @@ -0,0 +1,208 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! VM backend abstraction layer. +//! +//! Defines the [`VmBackend`] trait that all hypervisor backends implement, +//! and shared infrastructure (gvproxy startup, networking helpers) used by +//! both the libkrun and cloud-hypervisor backends. + +pub mod cloud_hypervisor; +pub mod libkrun; + +use std::path::{Path, PathBuf}; +use std::time::Instant; + +use crate::{ + GvproxyGuard, NetBackend, VmConfig, VmError, gvproxy_expose, gvproxy_socket_dir, + kill_stale_gvproxy, kill_stale_gvproxy_by_port, pick_gvproxy_ssh_port, vm_rootfs_key, +}; + +/// Trait implemented by each hypervisor backend (libkrun, cloud-hypervisor). 
+pub trait VmBackend { + /// Launch a VM with the given configuration. + /// + /// Returns the VM exit code. + fn launch(&self, config: &VmConfig) -> Result; +} + +/// Result of starting a gvproxy instance, used by both backends. +pub(crate) struct GvproxySetup { + pub(crate) guard: GvproxyGuard, + pub(crate) api_sock: PathBuf, + pub(crate) net_sock: PathBuf, +} + +/// Start gvproxy for the given configuration. +/// +/// Shared between libkrun and cloud-hypervisor backends. Handles stale +/// process cleanup, socket setup, and process spawning with exponential +/// backoff waiting for the network socket. +pub(crate) fn start_gvproxy( + config: &VmConfig, + launch_start: Instant, +) -> Result { + let binary = match &config.net { + NetBackend::Gvproxy { binary } => binary, + _ => { + return Err(VmError::HostSetup( + "start_gvproxy called without Gvproxy net backend".into(), + )); + } + }; + + if !binary.exists() { + return Err(VmError::BinaryNotFound { + path: binary.display().to_string(), + hint: "Install Podman Desktop or place gvproxy in PATH".to_string(), + }); + } + + let run_dir = config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .to_path_buf(); + let rootfs_key = vm_rootfs_key(&config.rootfs); + let sock_base = gvproxy_socket_dir(&config.rootfs)?; + let net_sock = sock_base.with_extension("v"); + let api_sock = sock_base.with_extension("a"); + + kill_stale_gvproxy(&config.rootfs); + for pm in &config.port_map { + if let Some(host_port) = pm.split(':').next().and_then(|p| p.parse::().ok()) { + kill_stale_gvproxy_by_port(host_port); + } + } + + let _ = std::fs::remove_file(&net_sock); + let _ = std::fs::remove_file(&api_sock); + let krun_sock = sock_base.with_extension("v-krun.sock"); + let _ = std::fs::remove_file(&krun_sock); + + eprintln!("Starting gvproxy: {}", binary.display()); + let ssh_port = pick_gvproxy_ssh_port()?; + let gvproxy_log = run_dir.join(format!("{rootfs_key}-gvproxy.log")); + let gvproxy_log_file = 
std::fs::File::create(&gvproxy_log) + .map_err(|e| VmError::Fork(format!("failed to create gvproxy log: {e}")))?; + + #[cfg(target_os = "linux")] + let (gvproxy_net_flag, gvproxy_net_url) = + ("-listen-qemu", format!("unix://{}", net_sock.display())); + #[cfg(target_os = "macos")] + let (gvproxy_net_flag, gvproxy_net_url) = ( + "-listen-vfkit", + format!("unixgram://{}", net_sock.display()), + ); + + let child = std::process::Command::new(binary) + .arg(gvproxy_net_flag) + .arg(&gvproxy_net_url) + .arg("-listen") + .arg(format!("unix://{}", api_sock.display())) + .arg("-ssh-port") + .arg(ssh_port.to_string()) + .stdout(std::process::Stdio::null()) + .stderr(gvproxy_log_file) + .spawn() + .map_err(|e| VmError::Fork(format!("failed to start gvproxy: {e}")))?; + + eprintln!( + "gvproxy started (pid {}, ssh port {}) [{:.1}s]", + child.id(), + ssh_port, + launch_start.elapsed().as_secs_f64() + ); + + { + let deadline = Instant::now() + std::time::Duration::from_secs(5); + let mut interval = std::time::Duration::from_millis(5); + while !net_sock.exists() { + if Instant::now() >= deadline { + return Err(VmError::Fork( + "gvproxy socket did not appear within 5s".to_string(), + )); + } + std::thread::sleep(interval); + interval = (interval * 2).min(std::time::Duration::from_millis(100)); + } + } + + Ok(GvproxySetup { + guard: GvproxyGuard::new(child), + api_sock, + net_sock, + }) +} + +/// Set up port forwarding via the gvproxy HTTP API. +/// +/// Translates `host:guest` port map entries into gvproxy expose calls. 
+pub(crate) fn setup_gvproxy_port_forwarding( + api_sock: &Path, + port_map: &[String], +) -> Result<(), VmError> { + let fwd_start = Instant::now(); + { + let deadline = Instant::now() + std::time::Duration::from_secs(2); + let mut interval = std::time::Duration::from_millis(5); + while !api_sock.exists() { + if Instant::now() >= deadline { + eprintln!("warning: gvproxy API socket not ready after 2s, attempting anyway"); + break; + } + std::thread::sleep(interval); + interval = (interval * 2).min(std::time::Duration::from_millis(200)); + } + } + + let guest_ip = "192.168.127.2"; + + for pm in port_map { + let parts: Vec<&str> = pm.split(':').collect(); + let (host_port, guest_port) = match parts.len() { + 2 => (parts[0], parts[1]), + 1 => (parts[0], parts[0]), + _ => { + eprintln!(" skipping invalid port mapping: {pm}"); + continue; + } + }; + + let expose_body = format!( + r#"{{"local":":{host_port}","remote":"{guest_ip}:{guest_port}","protocol":"tcp"}}"# + ); + + let mut expose_ok = false; + let mut retry_interval = std::time::Duration::from_millis(100); + let expose_deadline = Instant::now() + std::time::Duration::from_secs(10); + loop { + match gvproxy_expose(api_sock, &expose_body) { + Ok(()) => { + eprintln!(" port {host_port} -> {guest_ip}:{guest_port}"); + expose_ok = true; + break; + } + Err(e) => { + if Instant::now() >= expose_deadline { + eprintln!(" port {host_port}: {e} (retries exhausted)"); + break; + } + std::thread::sleep(retry_interval); + retry_interval = (retry_interval * 2).min(std::time::Duration::from_secs(1)); + } + } + } + if !expose_ok { + return Err(VmError::HostSetup(format!( + "failed to forward port {host_port} via gvproxy" + ))); + } + } + eprintln!( + "Port forwarding ready [{:.1}s]", + fwd_start.elapsed().as_secs_f64() + ); + + Ok(()) +} diff --git a/crates/openshell-vm/src/exec.rs b/crates/openshell-vm/src/exec.rs index 6195556e1..1f8ad03fe 100644 --- a/crates/openshell-vm/src/exec.rs +++ b/crates/openshell-vm/src/exec.rs @@ 
-48,6 +48,22 @@ fn safe_remove_dir_all(path: &Path) -> Result { pub const VM_EXEC_VSOCK_PORT: u32 = 10_777; +/// How to connect to the VM exec agent. +/// +/// libkrun bridges each guest vsock port to a host Unix socket via +/// `krun_add_vsock_port2`. cloud-hypervisor uses standard vhost-vsock +/// with CID-based addressing — the host connects via `AF_VSOCK` or a +/// vsock-proxy/socat bridge. +#[derive(Debug, Clone)] +pub enum VsockConnectMode { + /// Connect via a host Unix socket (libkrun per-port bridging). + UnixSocket(PathBuf), + /// Connect via a vsock proxy bridge (cloud-hypervisor). + /// The path points to a socat-bridged Unix socket that forwards + /// to guest CID 3, port [`VM_EXEC_VSOCK_PORT`]. + VsockBridge(PathBuf), +} + const VM_STATE_NAME: &str = "vm-state.json"; const VM_LOCK_NAME: &str = "vm.lock"; const KUBECONFIG_ENV: &str = "KUBECONFIG=/etc/rancher/k3s/k3s.yaml"; @@ -72,6 +88,10 @@ pub struct VmRuntimeState { /// PID of the gvproxy process (if networking uses gvproxy). #[serde(default, skip_serializing_if = "Option::is_none")] pub gvproxy_pid: Option, + /// Whether this VM uses vsock-bridge mode (cloud-hypervisor) vs + /// Unix socket mode (libkrun). Defaults to false for backward compat. 
+ #[serde(default, skip_serializing_if = "std::ops::Not::not")] + pub vsock_bridge: bool, } #[derive(Debug, Serialize)] @@ -132,6 +152,7 @@ pub fn write_vm_runtime_state( pid: i32, console_log: &Path, gvproxy_pid: Option, + vsock_bridge: bool, ) -> Result<(), VmError> { let state = VmRuntimeState { pid, @@ -141,6 +162,7 @@ pub fn write_vm_runtime_state( console_log: console_log.to_path_buf(), started_at_ms: now_ms()?, gvproxy_pid, + vsock_bridge, }; let path = vm_state_path(rootfs); let bytes = serde_json::to_vec_pretty(&state) @@ -471,10 +493,21 @@ pub fn ensure_vm_not_running(rootfs: &Path) -> Result<(), VmError> { pub fn exec_running_vm(options: VmExecOptions) -> Result { let state = load_vm_runtime_state(options.rootfs.as_deref())?; - let mut stream = UnixStream::connect(&state.socket_path).map_err(|e| { + + let connect_mode = if state.vsock_bridge { + VsockConnectMode::VsockBridge(state.socket_path.clone()) + } else { + VsockConnectMode::UnixSocket(state.socket_path.clone()) + }; + + let socket_path = match &connect_mode { + VsockConnectMode::UnixSocket(p) | VsockConnectMode::VsockBridge(p) => p, + }; + + let mut stream = UnixStream::connect(socket_path).map_err(|e| { VmError::Exec(format!( "connect to VM exec socket {}: {e}", - state.socket_path.display() + socket_path.display() )) })?; let mut writer = stream diff --git a/crates/openshell-vm/src/gpu_passthrough.rs b/crates/openshell-vm/src/gpu_passthrough.rs new file mode 100644 index 000000000..b835bca89 --- /dev/null +++ b/crates/openshell-vm/src/gpu_passthrough.rs @@ -0,0 +1,1959 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Host-side NVIDIA GPU VFIO readiness probing for VM passthrough. +//! +//! This module scans Linux sysfs (`/sys/bus/pci/devices`) for NVIDIA GPUs +//! (vendor ID `0x10de`), checks their driver binding, and verifies IOMMU +//! 
group cleanliness — the prerequisites for passing a physical GPU into +//! a cloud-hypervisor VM via VFIO. +//! +//! Returns per-device readiness for multi-GPU hosts. +//! +//! On non-Linux platforms, probing returns an empty list. + +use std::fmt; +use std::path::PathBuf; +use std::time::Duration; + +/// Per-device readiness state for NVIDIA GPU VFIO passthrough. +/// +/// Each variant represents a distinct readiness state for a single PCI device. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum HostNvidiaVfioReadiness { + /// The current platform does not support VFIO passthrough (non-Linux). + UnsupportedPlatform, + + /// No PCI device with NVIDIA vendor ID (`0x10de`) was found. + NoNvidiaDevice, + + /// An NVIDIA device exists but is bound to the nvidia (or other non-VFIO) driver. + BoundToNvidia, + + /// An NVIDIA device is bound to `vfio-pci` and its IOMMU group is clean — ready for passthrough. + VfioBoundReady, + + /// An NVIDIA device is bound to `vfio-pci` but its IOMMU group contains + /// devices not bound to `vfio-pci`, which prevents safe passthrough. + VfioBoundDirtyGroup, + + /// Some NVIDIA devices are bound to `vfio-pci` while others use + /// a different driver (mixed fleet). 
+ MixedVfioAndOther, +} + +impl fmt::Display for HostNvidiaVfioReadiness { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::UnsupportedPlatform => write!( + f, + "VFIO passthrough is not supported on this platform (Linux required)" + ), + Self::NoNvidiaDevice => write!(f, "no NVIDIA PCI device found"), + Self::BoundToNvidia => { + write!(f, "NVIDIA device found but not bound to vfio-pci driver") + } + Self::VfioBoundReady => write!( + f, + "NVIDIA device bound to vfio-pci and IOMMU group is clean" + ), + Self::VfioBoundDirtyGroup => write!( + f, + "NVIDIA device bound to vfio-pci but IOMMU group contains non-VFIO devices" + ), + Self::MixedVfioAndOther => write!( + f, + "some NVIDIA devices are on vfio-pci while others use a different driver" + ), + } + } +} + +const NVIDIA_VENDOR_ID: &str = "0x10de"; + +#[cfg(target_os = "linux")] +const SYSFS_WRITE_TIMEOUT: Duration = Duration::from_secs(10); + +#[cfg(target_os = "linux")] +fn sysfs_write_with_timeout( + path: &std::path::Path, + data: &str, + timeout: Duration, +) -> Result<(), std::io::Error> { + use std::process::{Command, Stdio}; + use std::thread; + + let mut child = Command::new("sh") + .arg("-c") + .arg(format!( + r#"printf '%s' '{}' > '{}'"#, + data.replace('\'', "'\\''"), + path.display().to_string().replace('\'', "'\\''") + )) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|e| { + std::io::Error::new( + e.kind(), + format!( + "failed to spawn sysfs write subprocess for {}: {e}", + path.display() + ), + ) + })?; + + let poll_interval = Duration::from_millis(100); + let start = std::time::Instant::now(); + + loop { + match child.try_wait() { + Ok(Some(status)) => { + if status.success() { + return Ok(()); + } + let mut stderr_buf = String::new(); + if let Some(mut stderr) = child.stderr.take() { + use std::io::Read; + let _ = stderr.read_to_string(&mut stderr_buf); + } + let hint = if stderr_buf.contains("Permission 
denied") { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!( + "sysfs write to {} failed (exit {}){hint}: {stderr_buf}", + path.display(), + status.code().unwrap_or(-1), + ), + )); + } + Ok(None) => { + if start.elapsed() > timeout { + let pid = child.id(); + let _ = child.kill(); + // CRITICAL: Do NOT call child.wait() here. If the child + // is stuck in uninterruptible sleep (D-state) — which is + // the nvidia unbind deadlock scenario — wait() will block + // the parent indefinitely, making it unkillable too. + // + // Dropping the Child struct closes pipe handles but does + // NOT wait. The zombie child is reparented to init and + // reaped when/if it eventually exits. + drop(child); + return Err(std::io::Error::new( + std::io::ErrorKind::TimedOut, + format!( + "sysfs write to {} timed out after {:.0}s (subprocess pid {pid}) — \ + possible nvidia driver deadlock. The subprocess may still be \ + stuck in kernel space; a reboot may be required to clear it.", + path.display(), + timeout.as_secs_f64(), + ), + )); + } + thread::sleep(poll_interval); + } + Err(e) => return Err(e), + } + } +} + +/// Validates that `addr` matches the PCI BDF format `DDDD:BB:DD.F`. +fn validate_pci_addr(addr: &str) -> Result<(), std::io::Error> { + let bytes = addr.as_bytes(); + let valid = bytes.len() == 12 + && bytes[4] == b':' + && bytes[7] == b':' + && bytes[10] == b'.' + && bytes[..4].iter().all(|b| b.is_ascii_hexdigit()) + && bytes[5..7].iter().all(|b| b.is_ascii_hexdigit()) + && bytes[8..10].iter().all(|b| b.is_ascii_hexdigit()) + && bytes[11].is_ascii_digit(); + if valid { + Ok(()) + } else { + Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("invalid PCI address '{addr}': expected DDDD:BB:DD.F format"), + )) + } +} + +/// Probe the host for NVIDIA GPU VFIO readiness by scanning Linux sysfs. 
+/// +/// Returns a per-device list of `(pci_address, readiness)` tuples for every +/// NVIDIA GPU found. On non-Linux platforms the list is empty. +/// +/// On Linux, walks `/sys/bus/pci/devices/` and for each device: +/// 1. Reads `vendor` to check for NVIDIA (`0x10de`). +/// 2. Reads the `driver` symlink to determine which kernel driver is bound. +/// 3. If bound to `vfio-pci`, inspects the `iommu_group/devices/` directory +/// to verify all group members are also on `vfio-pci`. +pub fn probe_host_nvidia_vfio_readiness() -> Vec<(String, HostNvidiaVfioReadiness)> { + #[cfg(not(target_os = "linux"))] + { + Vec::new() + } + + #[cfg(target_os = "linux")] + { + probe_linux_sysfs() + } +} + +#[cfg(target_os = "linux")] +fn probe_linux_sysfs() -> Vec<(String, HostNvidiaVfioReadiness)> { + use std::fs; + use std::path::Path; + + let pci_devices = Path::new("/sys/bus/pci/devices"); + let entries = match fs::read_dir(pci_devices) { + Ok(e) => e, + Err(_) => return Vec::new(), + }; + + let mut results = Vec::new(); + + for entry in entries.filter_map(Result::ok) { + let dev_path = entry.path(); + + let vendor = match fs::read_to_string(dev_path.join("vendor")) { + Ok(v) => v.trim().to_lowercase(), + Err(_) => continue, + }; + + if vendor != NVIDIA_VENDOR_ID { + continue; + } + + let pci_addr = entry.file_name().to_string_lossy().to_string(); + + let driver_link = dev_path.join("driver"); + let driver_name = fs::read_link(&driver_link).ok().and_then(|target| { + target + .file_name() + .map(|name| name.to_string_lossy().to_string()) + }); + + let state = match driver_name.as_deref() { + Some("vfio-pci") => { + let iommu_group_devices = dev_path.join("iommu_group/devices"); + let group_clean = match fs::read_dir(&iommu_group_devices) { + Ok(group_entries) => group_entries.filter_map(Result::ok).all(|ge| { + let peer_path = iommu_group_devices.join(ge.file_name()).join("driver"); + fs::read_link(&peer_path) + .ok() + .and_then(|t| t.file_name().map(|n| 
n.to_string_lossy().to_string())) + .as_deref() + == Some("vfio-pci") + }), + Err(_) => false, + }; + + if group_clean { + HostNvidiaVfioReadiness::VfioBoundReady + } else { + HostNvidiaVfioReadiness::VfioBoundDirtyGroup + } + } + _ => HostNvidiaVfioReadiness::BoundToNvidia, + }; + + results.push((pci_addr, state)); + } + + results +} + +/// Returns whether any NVIDIA GPU is fully available for VM passthrough. +/// +/// Requires `OPENSHELL_VM_GPU_E2E=1` to activate probing. When the env var +/// is unset or not `"1"`, returns `false` unconditionally so non-GPU CI +/// runners are never affected. +/// +/// When activated, checks two conditions: +/// 1. At least one NVIDIA device reports [`VfioBoundReady`]. +/// 2. The cloud-hypervisor binary exists in the runtime bundle. +pub fn nvidia_gpu_available_for_vm_passthrough() -> bool { + if std::env::var("OPENSHELL_VM_GPU_E2E").as_deref() != Ok("1") { + return false; + } + + let has_vfio_ready = probe_host_nvidia_vfio_readiness() + .iter() + .any(|(_, state)| *state == HostNvidiaVfioReadiness::VfioBoundReady); + + if !has_vfio_ready { + return false; + } + + let chv_exists = crate::configured_runtime_dir() + .map(|dir| dir.join("cloud-hypervisor").is_file()) + .unwrap_or(false); + + chv_exists +} + +/// Sysfs root path, defaulting to "/" in production and a temp dir in tests. 
+#[derive(Debug, Clone)] +pub(crate) struct SysfsRoot(PathBuf); + +impl Default for SysfsRoot { + fn default() -> Self { + Self(PathBuf::from("/")) + } +} + +impl SysfsRoot { + #[cfg(test)] + pub fn new(root: PathBuf) -> Self { + Self(root) + } + + pub fn sys_bus_pci_devices(&self) -> PathBuf { + self.0.join("sys/bus/pci/devices") + } + + pub fn sys_class_drm(&self) -> PathBuf { + self.0.join("sys/class/drm") + } + + pub fn sys_module(&self, module: &str) -> PathBuf { + self.0.join("sys/module").join(module) + } + + pub fn sys_bus_pci_drivers(&self, driver: &str) -> PathBuf { + self.0.join("sys/bus/pci/drivers").join(driver) + } + + pub fn sys_kernel_iommu_groups(&self) -> PathBuf { + self.0.join("sys/kernel/iommu_groups") + } + + fn is_real_sysfs(&self) -> bool { + self.0 == std::path::Path::new("/") + } + + #[cfg(target_os = "linux")] + fn write_sysfs(&self, path: &std::path::Path, data: &str) -> Result<(), std::io::Error> { + if self.is_real_sysfs() { + sysfs_write_with_timeout(path, data, SYSFS_WRITE_TIMEOUT) + } else { + std::fs::write(path, data).map_err(|e| { + std::io::Error::new(e.kind(), format!("failed to write {}: {e}", path.display())) + }) + } + } +} + +#[cfg(target_os = "linux")] +pub(crate) fn check_display_attached(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + use std::fs; + + let drm_dir = sysfs.sys_class_drm(); + let entries = match fs::read_dir(&drm_dir) { + Ok(e) => e, + Err(_) => return false, + }; + + for entry in entries.filter_map(Result::ok) { + let name = entry.file_name().to_string_lossy().to_string(); + if !name.starts_with("card") || name.contains('-') { + continue; + } + + let card_dir = entry.path(); + let device_link = card_dir.join("device"); + + let target = match fs::read_link(&device_link) { + Ok(t) => t, + Err(_) => continue, + }; + if !target.to_string_lossy().ends_with(pci_addr) { + continue; + } + + let boot_vga_path = card_dir.join("device").join("boot_vga"); + if let Ok(val) = fs::read_to_string(&boot_vga_path) { + if 
val.trim() == "1" { + return true; + } + } + + if let Ok(sub_entries) = fs::read_dir(&card_dir) { + for sub in sub_entries.filter_map(Result::ok) { + let sub_name = sub.file_name().to_string_lossy().to_string(); + if sub_name.starts_with(&format!("{name}-")) { + if let Ok(status) = fs::read_to_string(sub.path().join("status")) { + if status.trim() == "connected" { + return true; + } + } + } + } + } + } + + false +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn check_display_attached(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +#[cfg(target_os = "linux")] +/// Checks whether any process on the host has an open handle to an NVIDIA GPU +/// device (`/dev/nvidia*`). This is a host-wide check across ALL NVIDIA GPUs, +/// not scoped to a single PCI address. Returns a list of (pid, comm) pairs. +pub(crate) fn check_active_gpu_processes() -> std::io::Result> { + use std::fs; + + let mut result = Vec::new(); + + let proc_dir = match fs::read_dir("/proc") { + Ok(d) => d, + Err(e) => { + return Err(std::io::Error::new( + e.kind(), + format!( + "cannot scan /proc for active GPU processes: {e} — \ + refusing to unbind (fail-closed)" + ), + )); + } + }; + + for proc_entry in proc_dir.filter_map(Result::ok) { + let pid: u32 = match proc_entry.file_name().to_string_lossy().parse() { + Ok(p) => p, + Err(_) => continue, + }; + + let fd_dir = proc_entry.path().join("fd"); + let fds = match fs::read_dir(&fd_dir) { + Ok(d) => d, + Err(_) => continue, + }; + + for fd_entry in fds.filter_map(Result::ok) { + if let Ok(target) = fs::read_link(fd_entry.path()) { + if target.to_string_lossy().starts_with("/dev/nvidia") { + let comm = fs::read_to_string(format!("/proc/{pid}/comm")) + .unwrap_or_default() + .trim() + .to_string(); + result.push((pid, comm)); + break; + } + } + } + } + + Ok(result) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn check_active_gpu_processes() -> std::io::Result> { + Ok(vec![]) +} + +#[cfg(target_os = "linux")] +pub(crate) fn 
check_iommu_enabled(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + let iommu_groups = sysfs.sys_kernel_iommu_groups(); + if !iommu_groups.is_dir() { + return false; + } + sysfs + .sys_bus_pci_devices() + .join(pci_addr) + .join("iommu_group") + .exists() +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn check_iommu_enabled(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +#[cfg(target_os = "linux")] +pub(crate) fn check_vfio_modules_loaded(sysfs: &SysfsRoot) -> bool { + sysfs.sys_module("vfio_pci").is_dir() && sysfs.sys_module("vfio_iommu_type1").is_dir() +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn check_vfio_modules_loaded(_sysfs: &SysfsRoot) -> bool { + false +} + +#[cfg(target_os = "linux")] +pub(crate) fn check_sysfs_permissions(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + use nix::unistd::{AccessFlags, access}; + + let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); + let driver_override = dev_dir.join("driver_override"); + let unbind = dev_dir.join("driver/unbind"); + let bind = sysfs.sys_bus_pci_drivers("vfio-pci").join("bind"); + + let writable = |path: &std::path::Path| -> bool { access(path, AccessFlags::W_OK).is_ok() }; + + let unbind_ok = !unbind.exists() || writable(&unbind); + writable(&driver_override) && unbind_ok && writable(&bind) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn check_sysfs_permissions(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +#[cfg(target_os = "linux")] +pub(crate) fn current_driver(sysfs: &SysfsRoot, pci_addr: &str) -> Option { + let driver_link = sysfs.sys_bus_pci_devices().join(pci_addr).join("driver"); + std::fs::read_link(&driver_link) + .ok() + .and_then(|target| target.file_name().map(|n| n.to_string_lossy().to_string())) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn current_driver(_sysfs: &SysfsRoot, _pci_addr: &str) -> Option { + None +} + +/// Nvidia kernel modules that hold internal references to GPU devices and can +/// prevent a clean unbind. 
Unloaded in order (most-dependent first). +#[cfg(target_os = "linux")] +const NVIDIA_SUBMODULES: &[&str] = &["nvidia_uvm", "nvidia_drm", "nvidia_modeset"]; + +/// Timeout for nvidia prep commands (nvidia-smi, modprobe). These commands +/// can wedge if the nvidia driver is in a bad state. +#[cfg(target_os = "linux")] +const NVIDIA_PREP_TIMEOUT: Duration = Duration::from_secs(15); + +/// Run a command with a timeout. Returns `Some(ExitStatus)` on success, +/// `None` on timeout or spawn failure. On timeout, kills the child and +/// drops it without calling `wait()` (same D-state safety as sysfs writes). +#[cfg(target_os = "linux")] +fn run_with_timeout( + mut cmd: std::process::Command, + timeout: Duration, +) -> Option { + use std::thread; + + let mut child = match cmd.spawn() { + Ok(c) => c, + Err(_) => return None, + }; + + let poll_interval = Duration::from_millis(100); + let start = std::time::Instant::now(); + + loop { + match child.try_wait() { + Ok(Some(status)) => return Some(status), + Ok(None) => { + if start.elapsed() > timeout { + let _ = child.kill(); + drop(child); + return None; + } + thread::sleep(poll_interval); + } + Err(_) => return None, + } + } +} + +/// Best-effort preparation of the nvidia driver before a raw sysfs unbind. +/// +/// Reduces the probability of the nvidia unbind deadlock by: +/// 1. Disabling persistence mode (nvidia-persistenced holds device refs). +/// 2. Unloading nvidia submodules that keep internal references open. +/// +/// All commands run with a timeout — if `nvidia-smi` or `modprobe` hangs +/// (which can happen when the nvidia driver is in a bad state), the parent +/// process is not blocked. Failures are logged but not fatal. +#[cfg(target_os = "linux")] +fn nvidia_pre_unbind_prep(pci_addr: &str) { + use std::process::{Command, Stdio}; + + // 1. Disable persistence mode via nvidia-smi (if available). 
+ let mut cmd = Command::new("nvidia-smi"); + cmd.args(["-i", pci_addr, "-pm", "0"]) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()); + match run_with_timeout(cmd, NVIDIA_PREP_TIMEOUT) { + Some(s) if s.success() => { + eprintln!("GPU {pci_addr}: disabled nvidia persistence mode"); + } + None => { + eprintln!( + "GPU {pci_addr}: nvidia-smi timed out after {:.0}s — skipping persistence mode", + NVIDIA_PREP_TIMEOUT.as_secs_f64() + ); + } + _ => {} + } + + // 2. Unload nvidia submodules that hold device references. + // This is best-effort — modules may be in use by other GPUs. + for module in NVIDIA_SUBMODULES { + let mut cmd = Command::new("modprobe"); + cmd.args(["-r", module]) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()); + match run_with_timeout(cmd, NVIDIA_PREP_TIMEOUT) { + Some(s) if s.success() => { + eprintln!("GPU {pci_addr}: unloaded {module}"); + } + None => { + eprintln!( + "GPU {pci_addr}: modprobe -r {module} timed out after {:.0}s", + NVIDIA_PREP_TIMEOUT.as_secs_f64() + ); + } + _ => {} + } + } +} + +#[cfg(target_os = "linux")] +pub(crate) fn bind_gpu_to_vfio( + sysfs: &SysfsRoot, + pci_addr: &str, +) -> Result { + validate_pci_addr(pci_addr)?; + let drv = current_driver(sysfs, pci_addr); + + if drv.as_deref() == Some("vfio-pci") { + return Ok("vfio-pci".to_string()); + } + + let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); + + if drv.is_some() { + let is_nvidia = drv.as_deref() == Some("nvidia"); + if is_nvidia && sysfs.is_real_sysfs() { + nvidia_pre_unbind_prep(pci_addr); + + // nvidia_pre_unbind_prep may cascade-remove the nvidia module when + // all submodules are unloaded, which automatically unbinds the device. + // Re-check before attempting the sysfs unbind write. 
+ if current_driver(sysfs, pci_addr).is_none() { + eprintln!("GPU {pci_addr}: device already unbound after nvidia module cleanup"); + } else if current_driver(sysfs, pci_addr).as_deref() == Some("vfio-pci") { + return Ok("vfio-pci".to_string()); + } + } + + // Only attempt the sysfs unbind if a driver is still bound. + if current_driver(sysfs, pci_addr).is_some() { + let unbind = dev_dir.join("driver/unbind"); + let unbind_result = sysfs.write_sysfs(&unbind, pci_addr); + + if let Err(ref e) = unbind_result { + if e.kind() == std::io::ErrorKind::TimedOut { + // The nvidia unbind deadlock can complete the unbind at the + // hardware level while the syscall never returns to userspace. + // Check if the device is actually unbound despite the timeout. + if current_driver(sysfs, pci_addr).is_none() { + eprintln!( + "GPU {pci_addr}: sysfs unbind timed out but device is unbound — \ + continuing (zombie subprocess may linger until reboot)" + ); + } else { + return Err(std::io::Error::new( + std::io::ErrorKind::TimedOut, + format!( + "Failed to unbind {pci_addr}: timed out and device is still \ + bound to {}. 
A reboot may be required.", + drv.as_deref().unwrap_or("unknown"), + ), + )); + } + } else { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to unbind device at {path}{hint}", + path = unbind.display() + ), + )); + } + } + } + } + + let driver_override = dev_dir.join("driver_override"); + if let Err(e) = sysfs.write_sysfs(&driver_override, "vfio-pci") { + if let Some(ref orig) = drv { + let orig_bind = sysfs.sys_bus_pci_drivers(orig).join("bind"); + let _ = sysfs.write_sysfs(&orig_bind, pci_addr); + } + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to write driver_override at {path}{hint}", + path = driver_override.display() + ), + )); + } + + let vfio_bind = sysfs.sys_bus_pci_drivers("vfio-pci").join("bind"); + if let Err(e) = sysfs.write_sysfs(&vfio_bind, pci_addr) { + let _ = sysfs.write_sysfs(&driver_override, ""); + if let Some(ref orig) = drv { + let orig_bind = sysfs.sys_bus_pci_drivers(orig).join("bind"); + let _ = sysfs.write_sysfs(&orig_bind, pci_addr); + } + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to bind to vfio-pci at {path}{hint} — is the vfio-pci module loaded?", + path = vfio_bind.display() + ), + )); + } + + Ok(drv.unwrap_or_default()) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn bind_gpu_to_vfio( + _sysfs: &SysfsRoot, + _pci_addr: &str, +) -> Result { + Ok(String::new()) +} + +#[cfg(target_os = "linux")] +pub(crate) fn rebind_gpu_to_original( + sysfs: &SysfsRoot, + pci_addr: &str, + original_driver: &str, +) -> Result<(), std::io::Error> { + validate_pci_addr(pci_addr)?; + let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); + + if current_driver(sysfs, 
pci_addr).is_some() { + let unbind = dev_dir.join("driver/unbind"); + sysfs.write_sysfs(&unbind, pci_addr).map_err(|e| { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + std::io::Error::new( + e.kind(), + format!( + "Failed to unbind device at {path}{hint}", + path = unbind.display() + ), + ) + })?; + } + + let driver_override = dev_dir.join("driver_override"); + sysfs.write_sysfs(&driver_override, "").map_err(|e| { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + std::io::Error::new( + e.kind(), + format!( + "Failed to clear driver_override at {path}{hint}", + path = driver_override.display() + ), + ) + })?; + + if !original_driver.is_empty() && original_driver != "none" { + let bind = sysfs.sys_bus_pci_drivers(original_driver).join("bind"); + sysfs.write_sysfs(&bind, pci_addr).map_err(|e| { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + std::io::Error::new( + e.kind(), + format!( + "Failed to rebind to {original_driver} at {path}{hint}", + path = bind.display() + ), + ) + })?; + } else { + let rescan = sysfs.0.join("sys/bus/pci/rescan"); + let _ = sysfs.write_sysfs(&rescan, "1"); + } + + Ok(()) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn rebind_gpu_to_original( + _sysfs: &SysfsRoot, + _pci_addr: &str, + _original_driver: &str, +) -> Result<(), std::io::Error> { + Ok(()) +} + +#[cfg(target_os = "linux")] +pub(crate) fn iommu_group_peers( + sysfs: &SysfsRoot, + pci_addr: &str, +) -> Result, std::io::Error> { + validate_pci_addr(pci_addr)?; + let iommu_devices = sysfs + .sys_bus_pci_devices() + .join(pci_addr) + .join("iommu_group/devices"); + + let entries = match std::fs::read_dir(&iommu_devices) { + Ok(e) => e, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(vec![]), + Err(e) => return Err(e), + }; + + let mut peers = Vec::new(); + for entry in 
entries.filter_map(Result::ok) { + let name = entry.file_name().to_string_lossy().to_string(); + if name != pci_addr { + peers.push(name); + } + } + Ok(peers) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn iommu_group_peers( + _sysfs: &SysfsRoot, + _pci_addr: &str, +) -> Result, std::io::Error> { + Ok(vec![]) +} + +#[cfg(target_os = "linux")] +pub(crate) fn bind_iommu_group_peers( + sysfs: &SysfsRoot, + pci_addr: &str, +) -> Result, std::io::Error> { + let peers = iommu_group_peers(sysfs, pci_addr)?; + let mut restore_list = Vec::new(); + + for peer in peers { + match bind_gpu_to_vfio(sysfs, &peer) { + Ok(original) => { + if original != "vfio-pci" { + restore_list.push((peer, original)); + } + } + Err(e) => { + let _ = rebind_iommu_group_peers(sysfs, &restore_list); + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to bind IOMMU peer {peer}: {e}. Rolled back {} peer(s).", + restore_list.len() + ), + )); + } + } + } + + Ok(restore_list) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn bind_iommu_group_peers( + _sysfs: &SysfsRoot, + _pci_addr: &str, +) -> Result, std::io::Error> { + Ok(vec![]) +} + +#[cfg(target_os = "linux")] +pub(crate) fn rebind_iommu_group_peers( + sysfs: &SysfsRoot, + peers: &[(String, String)], +) -> Result<(), std::io::Error> { + let mut first_err = None; + for (peer_addr, original_driver) in peers { + if let Err(e) = rebind_gpu_to_original(sysfs, peer_addr, original_driver) { + if first_err.is_none() { + first_err = Some(e); + } + } + } + match first_err { + Some(e) => Err(e), + None => Ok(()), + } +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn rebind_iommu_group_peers( + _sysfs: &SysfsRoot, + _peers: &[(String, String)], +) -> Result<(), std::io::Error> { + Ok(()) +} + +#[cfg(target_os = "linux")] +fn is_iommu_group_clean(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + let peers = match iommu_group_peers(sysfs, pci_addr) { + Ok(p) => p, + Err(_) => return false, + }; + peers + .iter() + .all(|peer| 
current_driver(sysfs, peer).as_deref() == Some("vfio-pci")) +} + +#[cfg(not(target_os = "linux"))] +fn is_iommu_group_clean(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +/// Captures the bind state for a GPU so it can be restored on shutdown. +#[derive(Debug)] +pub struct GpuBindState { + /// PCI address of the GPU that was bound. + pub pci_addr: String, + /// Driver the GPU was on before binding (e.g. "nvidia"). + pub original_driver: String, + /// IOMMU group peers that were rebound, with their original drivers. + pub peer_binds: Vec<(String, String)>, + /// Whether this instance performed the bind (false if GPU was already on vfio-pci). + pub did_bind: bool, +} + +impl GpuBindState { + /// Restore the GPU and its IOMMU peers to their original drivers. + pub fn restore(&self) -> Result<(), std::io::Error> { + self.restore_with_sysfs(&SysfsRoot::default()) + } + + pub(crate) fn restore_with_sysfs(&self, sysfs: &SysfsRoot) -> Result<(), std::io::Error> { + if !self.did_bind { + return Ok(()); + } + eprintln!( + "GPU: rebinding {} to {}", + self.pci_addr, self.original_driver + ); + rebind_gpu_to_original(sysfs, &self.pci_addr, &self.original_driver)?; + rebind_iommu_group_peers(sysfs, &self.peer_binds)?; + Ok(()) + } +} + +/// RAII guard that restores GPU driver binding when dropped. +/// +/// Ensures the GPU is rebound to its original driver on normal exit, +/// early return (?), or panic. Cannot protect against SIGKILL. +pub struct GpuBindGuard { + state: Option, +} + +impl GpuBindGuard { + pub fn new(state: GpuBindState) -> Self { + Self { state: Some(state) } + } + + /// Take the state out, preventing restore on drop. + pub fn disarm(&mut self) -> Option { + self.state.take() + } + + /// Get the PCI address of the bound GPU, if any. 
+ pub fn pci_addr(&self) -> Option<&str> { + self.state.as_ref().map(|s| s.pci_addr.as_str()) + } +} + +impl Drop for GpuBindGuard { + fn drop(&mut self) { + if let Some(ref state) = self.state { + eprintln!( + "GPU: restoring {} to {} (cleanup)", + state.pci_addr, state.original_driver + ); + if let Err(e) = state.restore() { + eprintln!("GPU: restore failed: {e}"); + } + } + } +} + +/// Prepare a GPU for VFIO passthrough: run safety checks, select, and bind. +/// +/// When `requested_bdf` is Some, targets that specific device. +/// When None (auto mode), selects the best available GPU. +/// +/// All safety checks are hard failures — if any check fails, this returns +/// an error and does not bind anything. +pub fn prepare_gpu_for_passthrough( + requested_bdf: Option<&str>, +) -> Result { + prepare_gpu_with_sysfs(&SysfsRoot::default(), requested_bdf) +} + +pub(crate) fn prepare_gpu_with_sysfs( + sysfs: &SysfsRoot, + requested_bdf: Option<&str>, +) -> Result { + match requested_bdf { + Some(bdf) => prepare_specific_gpu(sysfs, bdf), + None => prepare_auto_gpu(sysfs), + } +} + +fn prepare_specific_gpu(sysfs: &SysfsRoot, bdf: &str) -> Result { + validate_pci_addr(bdf)?; + + let dev_dir = sysfs.sys_bus_pci_devices().join(bdf); + if !dev_dir.exists() { + return Err(std::io::Error::new( + std::io::ErrorKind::NotFound, + format!("PCI device {bdf} not found in sysfs"), + )); + } + + let vendor = std::fs::read_to_string(dev_dir.join("vendor")) + .map(|v| v.trim().to_lowercase()) + .unwrap_or_default(); + if vendor != NVIDIA_VENDOR_ID { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("PCI device {bdf} is not an NVIDIA device (vendor: {vendor})"), + )); + } + let class = std::fs::read_to_string(dev_dir.join("class")) + .map(|c| c.trim().to_lowercase()) + .unwrap_or_default(); + if !class.starts_with("0x03") { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("PCI device {bdf} is not a GPU (class: {class})"), + )); 
+ } + + if current_driver(sysfs, bdf).as_deref() == Some("vfio-pci") && is_iommu_group_clean(sysfs, bdf) + { + return Ok(GpuBindState { + pci_addr: bdf.to_string(), + original_driver: "vfio-pci".to_string(), + peer_binds: vec![], + did_bind: false, + }); + } + + if check_display_attached(sysfs, bdf) { + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!("GPU {bdf}: has active display outputs"), + )); + } + + let procs = check_active_gpu_processes().map_err(|e| { + std::io::Error::new( + e.kind(), + format!("GPU {bdf}: cannot verify GPU is idle — {e}"), + ) + })?; + if !procs.is_empty() { + let desc: Vec = procs + .iter() + .map(|(pid, comm)| format!("{pid} ({comm})")) + .collect(); + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!("GPU {bdf}: in use by PIDs: {}", desc.join(", ")), + )); + } + + if !check_iommu_enabled(sysfs, bdf) { + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!("GPU {bdf}: IOMMU not enabled or device has no IOMMU group"), + )); + } + + if !check_vfio_modules_loaded(sysfs) { + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!("GPU {bdf}: VFIO kernel modules not loaded"), + )); + } + + if !check_sysfs_permissions(sysfs, bdf) { + return Err(std::io::Error::new( + std::io::ErrorKind::PermissionDenied, + format!("GPU {bdf}: insufficient sysfs permissions — run as root"), + )); + } + + let original_driver = bind_gpu_to_vfio(sysfs, bdf)?; + let peer_binds = match bind_iommu_group_peers(sysfs, bdf) { + Ok(peers) => peers, + Err(e) => { + let _ = rebind_gpu_to_original(sysfs, bdf, &original_driver); + return Err(e); + } + }; + + Ok(GpuBindState { + pci_addr: bdf.to_string(), + original_driver, + peer_binds, + did_bind: true, + }) +} + +fn prepare_auto_gpu(sysfs: &SysfsRoot) -> Result { + let pci_dir = sysfs.sys_bus_pci_devices(); + let entries = std::fs::read_dir(&pci_dir).map_err(|e| { + std::io::Error::new(e.kind(), format!("cannot read {}: {e}", pci_dir.display())) 
+ })?; + + let mut nvidia_addrs = Vec::new(); + for entry in entries.filter_map(Result::ok) { + let dev_path = entry.path(); + let vendor = match std::fs::read_to_string(dev_path.join("vendor")) { + Ok(v) => v.trim().to_lowercase(), + Err(_) => continue, + }; + let class = match std::fs::read_to_string(dev_path.join("class")) { + Ok(c) => c.trim().to_lowercase(), + Err(_) => continue, + }; + if vendor == NVIDIA_VENDOR_ID && class.starts_with("0x03") { + nvidia_addrs.push(entry.file_name().to_string_lossy().to_string()); + } + } + + if nvidia_addrs.is_empty() { + return Err(std::io::Error::new( + std::io::ErrorKind::NotFound, + "no NVIDIA PCI device found", + )); + } + + nvidia_addrs.sort(); + + for addr in &nvidia_addrs { + if current_driver(sysfs, addr).as_deref() == Some("vfio-pci") + && is_iommu_group_clean(sysfs, addr) + { + return Ok(GpuBindState { + pci_addr: addr.clone(), + original_driver: "vfio-pci".to_string(), + peer_binds: vec![], + did_bind: false, + }); + } + } + + let mut blocked: Vec<(String, String)> = Vec::new(); + let active_procs = check_active_gpu_processes() + .map_err(|e| std::io::Error::new(e.kind(), format!("cannot verify GPUs are idle — {e}")))?; + + for addr in &nvidia_addrs { + if current_driver(sysfs, addr).as_deref() == Some("vfio-pci") { + blocked.push((addr.clone(), "IOMMU group not clean".to_string())); + continue; + } + + if check_display_attached(sysfs, addr) { + blocked.push((addr.clone(), "has active display outputs".to_string())); + continue; + } + + if !active_procs.is_empty() { + let desc: Vec = active_procs + .iter() + .map(|(pid, comm)| format!("{pid} ({comm})")) + .collect(); + blocked.push((addr.clone(), format!("in use by PIDs: {}", desc.join(", ")))); + continue; + } + + if !check_iommu_enabled(sysfs, addr) { + blocked.push((addr.clone(), "IOMMU not enabled".to_string())); + continue; + } + + if !check_vfio_modules_loaded(sysfs) { + blocked.push((addr.clone(), "VFIO modules not loaded".to_string())); + continue; + } + + 
if !check_sysfs_permissions(sysfs, addr) { + blocked.push((addr.clone(), "insufficient sysfs permissions".to_string())); + continue; + } + + eprintln!("GPU: binding {addr} for VFIO passthrough"); + let original_driver = bind_gpu_to_vfio(sysfs, addr)?; + let peer_binds = match bind_iommu_group_peers(sysfs, addr) { + Ok(peers) => peers, + Err(e) => { + let _ = rebind_gpu_to_original(sysfs, addr, &original_driver); + return Err(e); + } + }; + + return Ok(GpuBindState { + pci_addr: addr.clone(), + original_driver, + peer_binds, + did_bind: true, + }); + } + + let mut msg = + String::from("GPU passthrough blocked by safety checks.\n\n Detected devices:\n"); + for (addr, reason) in &blocked { + msg.push_str(&format!(" {addr}: {reason}\n")); + } + msg.push_str("\n No GPU is available for passthrough."); + + Err(std::io::Error::new(std::io::ErrorKind::Other, msg)) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use std::path::Path; + + #[test] + fn passthrough_gate_is_false_without_env_var() { + // SAFETY: test runs single-threaded; no other thread reads this var. + unsafe { std::env::remove_var("OPENSHELL_VM_GPU_E2E") }; + assert!( + !nvidia_gpu_available_for_vm_passthrough(), + "gate must return false when OPENSHELL_VM_GPU_E2E is unset" + ); + } + + #[test] + fn probe_returns_no_device_or_readiness_on_typical_ci() { + let results = probe_host_nvidia_vfio_readiness(); + + #[cfg(not(target_os = "linux"))] + assert!(results.is_empty(), "non-Linux should return empty Vec"); + + #[cfg(target_os = "linux")] + { + // CI machines typically have no NVIDIA GPU bound to vfio-pci. + // Accept an empty list or any per-device readiness state. 
+ for (addr, state) in &results { + assert!(!addr.is_empty(), "PCI address should not be empty"); + assert!( + matches!( + state, + HostNvidiaVfioReadiness::BoundToNvidia + | HostNvidiaVfioReadiness::VfioBoundReady + | HostNvidiaVfioReadiness::VfioBoundDirtyGroup + ), + "unexpected per-device readiness state for {addr}: {state:?}" + ); + } + } + } + + #[test] + fn display_impl_is_meaningful() { + let states = [ + HostNvidiaVfioReadiness::UnsupportedPlatform, + HostNvidiaVfioReadiness::NoNvidiaDevice, + HostNvidiaVfioReadiness::BoundToNvidia, + HostNvidiaVfioReadiness::VfioBoundReady, + HostNvidiaVfioReadiness::VfioBoundDirtyGroup, + HostNvidiaVfioReadiness::MixedVfioAndOther, + ]; + for state in &states { + let msg = format!("{state}"); + assert!(!msg.is_empty(), "Display for {state:?} should not be empty"); + } + } + + fn mock_pci_device(root: &Path, pci_addr: &str, vendor: &str, driver: Option<&str>) { + use std::fs; + let dev_dir = root.join("sys/bus/pci/devices").join(pci_addr); + fs::create_dir_all(&dev_dir).unwrap(); + fs::write(dev_dir.join("vendor"), vendor).unwrap(); + fs::write(dev_dir.join("class"), "0x030000").unwrap(); + if let Some(drv) = driver { + let driver_dir = root.join("sys/bus/pci/drivers").join(drv); + fs::create_dir_all(&driver_dir).unwrap(); + #[cfg(unix)] + std::os::unix::fs::symlink(&driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(dev_dir.join("driver_override"), "").unwrap(); + } + + fn mock_drm_card(root: &Path, card: &str, pci_addr: &str, outputs: &[(&str, &str)]) { + use std::fs; + let card_dir = root.join("sys/class/drm").join(card); + fs::create_dir_all(&card_dir).unwrap(); + #[cfg(unix)] + std::os::unix::fs::symlink( + root.join("sys/bus/pci/devices").join(pci_addr), + card_dir.join("device"), + ) + .unwrap(); + for (output, status) in outputs { + let out_dir = card_dir.join(format!("{card}-{output}")); + fs::create_dir_all(&out_dir).unwrap(); + fs::write(out_dir.join("status"), status).unwrap(); + } + } + + fn 
mock_iommu_group(root: &Path, group_id: u32, members: &[&str]) { + use std::fs; + let group_dir = root.join(format!("sys/kernel/iommu_groups/{group_id}/devices")); + fs::create_dir_all(&group_dir).unwrap(); + for member in members { + let dev_dir = root.join("sys/bus/pci/devices").join(member); + fs::create_dir_all(&dev_dir).unwrap(); + #[cfg(unix)] + { + let iommu_group_target = root.join(format!("sys/kernel/iommu_groups/{group_id}")); + let _ = + std::os::unix::fs::symlink(&iommu_group_target, dev_dir.join("iommu_group")); + let _ = std::os::unix::fs::symlink(&dev_dir, group_dir.join(member)); + } + } + } + + #[test] + #[cfg(target_os = "linux")] + fn display_attached_detects_active_framebuffer() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + assert!(check_display_attached(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_attached_false_on_headless() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "disconnected")], + ); + assert!(!check_display_attached(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_attached_false_no_drm_card() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + assert!(!check_display_attached(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_check_fails_without_groups_dir() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", 
"0x10de", None); + assert!(!check_iommu_enabled(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_check_passes_with_group() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_iommu_group(root.path(), 15, &["0000:41:00.0"]); + assert!(check_iommu_enabled(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn vfio_modules_loaded_true() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + assert!(check_vfio_modules_loaded(&sysfs)); + } + + #[test] + #[cfg(target_os = "linux")] + fn vfio_modules_missing() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + assert!(!check_vfio_modules_loaded(&sysfs)); + } + + #[test] + #[cfg(target_os = "linux")] + fn permissions_writable() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let bind_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&bind_dir).unwrap(); + fs::write(bind_dir.join("bind"), "").unwrap(); + assert!(check_sysfs_permissions(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn permissions_readonly_driver_override() { + use std::os::unix::fs::PermissionsExt; + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let driver_override = root + .path() + .join("sys/bus/pci/devices/0000:41:00.0/driver_override"); + 
fs::set_permissions(&driver_override, fs::Permissions::from_mode(0o444)).unwrap(); + assert!(!check_sysfs_permissions(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn permissions_readonly_bind() { + use std::os::unix::fs::PermissionsExt; + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let bind_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&bind_dir).unwrap(); + let bind_path = bind_dir.join("bind"); + fs::write(&bind_path, "").unwrap(); + fs::set_permissions(&bind_path, fs::Permissions::from_mode(0o444)).unwrap(); + assert!(!check_sysfs_permissions(&sysfs, "0000:41:00.0")); + } + + fn mock_bindable_gpu(root: &Path, pci_addr: &str) { + mock_pci_device(root, pci_addr, "0x10de", Some("nvidia")); + let drv_unbind = root.join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + let vfio_dir = root.join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + fs::write(vfio_dir.join("bind"), "").unwrap(); + mock_iommu_group(root, 15, &[pci_addr]); + } + + #[test] + #[cfg(target_os = "linux")] + fn bind_gpu_writes_correct_sysfs_paths() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + + bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + + let unbind_content = + fs::read_to_string(root.path().join("sys/bus/pci/drivers/nvidia/unbind")).unwrap(); + assert_eq!(unbind_content, "0000:41:00.0"); + + let override_content = fs::read_to_string( + root.path() + .join("sys/bus/pci/devices/0000:41:00.0/driver_override"), + ) + .unwrap(); + assert_eq!(override_content, "vfio-pci"); + + let bind_content = + fs::read_to_string(root.path().join("sys/bus/pci/drivers/vfio-pci/bind")).unwrap(); + assert_eq!(bind_content, "0000:41:00.0"); + } + + #[test] + 
#[cfg(target_os = "linux")] + fn bind_returns_original_driver() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + + let result = bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!(result, "nvidia"); + } + + #[test] + #[cfg(target_os = "linux")] + fn bind_noop_when_already_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("vfio-pci")); + let vfio_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + fs::write(vfio_dir.join("bind"), "").unwrap(); + + let nvidia_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::create_dir_all(nvidia_unbind.parent().unwrap()).unwrap(); + fs::write(&nvidia_unbind, "").unwrap(); + + let result = bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!(result, "vfio-pci"); + + let unbind_content = fs::read_to_string(&nvidia_unbind).unwrap(); + assert_eq!( + unbind_content, "", + "nvidia unbind should NOT have been written" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn rebind_clears_driver_override() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.0"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + 
rebind_gpu_to_original(&sysfs, "0000:41:00.0", "nvidia").unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!(override_content, ""); + } + + #[test] + #[cfg(target_os = "linux")] + fn rebind_writes_to_original_driver_bind() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.0"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + rebind_gpu_to_original(&sysfs, "0000:41:00.0", "nvidia").unwrap(); + + let bind_content = fs::read_to_string(nvidia_dir.join("bind")).unwrap(); + assert_eq!(bind_content, "0000:41:00.0"); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_peers_listed_correctly() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_pci_device(root.path(), "0000:41:00.1", "0x10de", None); + mock_iommu_group(root.path(), 15, &["0000:41:00.0", "0000:41:00.1"]); + + let peers = iommu_group_peers(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!(peers, vec!["0000:41:00.1"]); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_peers_bound_together() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + mock_pci_device(root.path(), "0000:41:00.1", "0x10de", 
Some("nvidia")); + let drv_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + mock_iommu_group(root.path(), 15, &["0000:41:00.0", "0000:41:00.1"]); + + let restore = bind_iommu_group_peers(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!( + restore, + vec![("0000:41:00.1".to_string(), "nvidia".to_string())] + ); + + let override_content = fs::read_to_string( + root.path() + .join("sys/bus/pci/devices/0000:41:00.1/driver_override"), + ) + .unwrap(); + assert_eq!(override_content, "vfio-pci"); + } + + #[test] + #[cfg(target_os = "linux")] + fn peer_restore_rebinds_to_original() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + mock_pci_device(root.path(), "0000:41:00.1", "0x10de", Some("nvidia")); + let drv_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + mock_iommu_group(root.path(), 15, &["0000:41:00.0", "0000:41:00.1"]); + + let restore = bind_iommu_group_peers(&sysfs, "0000:41:00.0").unwrap(); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.1"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + rebind_iommu_group_peers(&sysfs, &restore).unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!(override_content, ""); + } + + fn mock_multi_gpu_host(root: &Path) { + // GPU 0: on nvidia, has display attached + mock_bindable_gpu(root, "0000:41:00.0"); + mock_drm_card(root, "card0", 
"0000:41:00.0", &[("DP-1", "connected")]); + + // GPU 1: on nvidia, idle (no display, no processes) + mock_bindable_gpu(root, "0000:42:00.0"); + + // GPU 2: already on vfio-pci, clean IOMMU group + mock_pci_device(root, "0000:43:00.0", "0x10de", Some("vfio-pci")); + mock_iommu_group(root, 17, &["0000:43:00.0"]); + + fs::create_dir_all(root.join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.join("sys/module/vfio_iommu_type1")).unwrap(); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_prefers_already_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_multi_gpu_host(root.path()); + + let state = prepare_gpu_with_sysfs(&sysfs, None).unwrap(); + assert_eq!(state.pci_addr, "0000:43:00.0"); + assert!(!state.did_bind); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_selects_idle_gpu_when_no_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + mock_iommu_group(root.path(), 15, &["0000:41:00.0"]); + + mock_pci_device(root.path(), "0000:42:00.0", "0x10de", Some("nvidia")); + mock_iommu_group(root.path(), 16, &["0000:42:00.0"]); + + let drv_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + let vfio_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + fs::write(vfio_dir.join("bind"), "").unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, None).unwrap(); + assert_eq!(state.pci_addr, "0000:42:00.0"); + assert!(state.did_bind); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_fails_when_all_blocked() { 
+ let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + mock_iommu_group(root.path(), 15, &["0000:41:00.0"]); + + mock_pci_device(root.path(), "0000:42:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card1", + "0000:42:00.0", + &[("HDMI-1", "connected")], + ); + mock_iommu_group(root.path(), 16, &["0000:42:00.0"]); + + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let err = prepare_gpu_with_sysfs(&sysfs, None).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("display"), + "error should mention display: {msg}" + ); + assert!( + msg.contains("0000:41:00.0"), + "error should list first GPU: {msg}" + ); + assert!( + msg.contains("0000:42:00.0"), + "error should list second GPU: {msg}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_fails_on_empty_host() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + fs::create_dir_all(root.path().join("sys/bus/pci/devices")).unwrap(); + + let err = prepare_gpu_with_sysfs(&sysfs, None).unwrap_err(); + assert!( + err.to_string().contains("no NVIDIA PCI device found"), + "unexpected error: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_binds_target() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap(); + assert_eq!(state.pci_addr, "0000:41:00.0"); + 
assert!(state.did_bind); + assert_eq!(state.original_driver, "nvidia"); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_validates_format() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + let err = prepare_gpu_with_sysfs(&sysfs, Some("invalid")).unwrap_err(); + assert!( + err.to_string().contains("invalid PCI address"), + "unexpected error: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_fails_display_check() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + + let err = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap_err(); + assert!( + err.to_string().contains("display"), + "error should mention display: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_fails_iommu_check() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + + let err = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap_err(); + assert!( + err.to_string().contains("IOMMU"), + "error should mention IOMMU: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn restore_round_trips() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap(); + assert!(state.did_bind); + assert_eq!(state.original_driver, "nvidia"); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.0"); + let 
vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + state.restore_with_sysfs(&sysfs).unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!(override_content, ""); + + let bind_content = fs::read_to_string(nvidia_dir.join("bind")).unwrap(); + assert_eq!(bind_content, "0000:41:00.0"); + } + + #[test] + #[cfg(target_os = "linux")] + fn restore_noop_when_did_not_bind() { + let state = GpuBindState { + pci_addr: "0000:43:00.0".to_string(), + original_driver: "vfio-pci".to_string(), + peer_binds: vec![], + did_bind: false, + }; + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + state.restore_with_sysfs(&sysfs).unwrap(); + } + + #[test] + fn guard_has_pci_addr() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: true, + }; + let guard = GpuBindGuard::new(state); + assert_eq!(guard.pci_addr(), Some("0000:41:00.0")); + } + + #[test] + fn guard_disarm_returns_state() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: true, + }; + let mut guard = GpuBindGuard::new(state); + let taken = guard.disarm(); + assert!(taken.is_some()); + assert_eq!(guard.pci_addr(), None); + } + + #[test] + fn guard_disarm_prevents_double_restore() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: true, + }; + let mut guard = 
GpuBindGuard::new(state); + let _ = guard.disarm(); + let second = guard.disarm(); + assert!(second.is_none()); + } + + #[test] + fn guard_drop_noop_when_did_not_bind() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: false, + }; + let guard = GpuBindGuard::new(state); + drop(guard); + } + + #[test] + fn guard_drop_on_panic_is_safe() { + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: false, + }; + let _guard = GpuBindGuard::new(state); + panic!("test panic"); + })); + assert!(result.is_err()); + } +} diff --git a/crates/openshell-vm/src/lib.rs b/crates/openshell-vm/src/lib.rs index 2b78a7669..9b70b32cf 100644 --- a/crates/openshell-vm/src/lib.rs +++ b/crates/openshell-vm/src/lib.rs @@ -14,9 +14,11 @@ #![allow(unsafe_code)] +pub mod backend; mod embedded; mod exec; mod ffi; +pub mod gpu_passthrough; mod health; use std::ffi::CString; @@ -25,9 +27,10 @@ use std::ptr; use std::time::Instant; pub use exec::{ - VM_EXEC_VSOCK_PORT, VmExecOptions, VmRuntimeState, acquire_rootfs_lock, clear_vm_runtime_state, - ensure_vm_not_running, exec_capture, exec_running_vm, recover_corrupt_kine_db, - reset_runtime_state, vm_exec_socket_path, vm_state_path, write_vm_runtime_state, + VM_EXEC_VSOCK_PORT, VmExecOptions, VmRuntimeState, VsockConnectMode, acquire_rootfs_lock, + clear_vm_runtime_state, ensure_vm_not_running, exec_capture, exec_running_vm, + recover_corrupt_kine_db, reset_runtime_state, vm_exec_socket_path, vm_state_path, + write_vm_runtime_state, }; // ── Error type ───────────────────────────────────────────────────────── @@ -45,6 +48,19 @@ pub enum VmError { )] RootfsNotFound { path: String }, + /// The GPU rootfs directory does not exist. 
+ #[error( + "GPU rootfs not found: {path}\n\ + The --gpu flag requires a rootfs built with GPU support (NVIDIA drivers,\n\ + nvidia-container-toolkit, and GPU manifests).\n\ + Build one with:\n\ + \x20 ./crates/openshell-vm/scripts/build-rootfs.sh --gpu \n\ + Then either:\n\ + \x20 - Copy it to: {path}\n\ + \x20 - Or use: openshell-vm --gpu --rootfs <path>" + )] + GpuRootfsNotFound { path: String }, + /// A path contained invalid UTF-8. #[error("path is not valid UTF-8: {0}")] InvalidPath(String), @@ -98,6 +114,18 @@ fn check(ret: i32, func: &'static str) -> Result<(), VmError> { // ── Configuration ────────────────────────────────────────────────────── +/// Hypervisor backend selection. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum VmBackendChoice { + /// Auto-select: cloud-hypervisor when a VFIO device is configured, libkrun otherwise. + #[default] + Auto, + /// Force the libkrun backend. + Libkrun, + /// Force the cloud-hypervisor backend (even without GPU/VFIO). + CloudHypervisor, +} + /// Networking backend for the microVM. #[derive(Debug, Clone)] pub enum NetBackend { @@ -202,6 +230,16 @@ pub struct VmConfig { /// Optional host-backed raw block image for mutable guest state. pub state_disk: Option<StateDiskConfig>, + + /// Whether GPU passthrough is enabled for this VM. + pub gpu_enabled: bool, + + /// VFIO PCI device address for GPU passthrough (e.g. `0000:41:00.0`). + /// When set, the cloud-hypervisor backend is used instead of libkrun. + pub vfio_device: Option<String>, + + /// Hypervisor backend override. Defaults to [`VmBackendChoice::Auto`]. + pub backend: VmBackendChoice, } impl VmConfig { @@ -245,6 +283,9 @@ impl VmConfig { reset: false, gateway_name: format!("{GATEWAY_NAME_PREFIX}-default"), state_disk: Some(state_disk), + gpu_enabled: false, + vfio_device: None, + backend: VmBackendChoice::Auto, } } } @@ -277,6 +318,38 @@ impl VmConfig { .join("rootfs")) } +/// Resolve the GPU rootfs path for a named instance. 
+/// +/// Layout: `$XDG_DATA_HOME/openshell/openshell-vm/{version}/instances/{name}/rootfs-gpu` +/// +/// The GPU rootfs is built separately with `build-rootfs.sh --gpu` and is +/// never embedded (too large with NVIDIA drivers). If it doesn't exist, +/// callers should return [`VmError::GpuRootfsNotFound`]. +pub fn named_gpu_rootfs_dir(instance_name: &str) -> Result<PathBuf, VmError> { + let name = sanitize_instance_name(instance_name)?; + let base = openshell_bootstrap::paths::openshell_vm_base_dir() + .map_err(|e| VmError::RuntimeState(format!("resolve openshell-vm base dir: {e}")))?; + Ok(base + .join(env!("CARGO_PKG_VERSION")) + .join("instances") + .join(name) + .join("rootfs-gpu")) +} + +/// Ensure a GPU rootfs exists for the named instance. +/// +/// Unlike [`ensure_named_rootfs`], there is no embedded GPU rootfs to +/// extract — the user must pre-build it with `build-rootfs.sh --gpu`. +pub fn ensure_gpu_rootfs(instance_name: &str) -> Result<PathBuf, VmError> { + let gpu_rootfs = named_gpu_rootfs_dir(instance_name)?; + if gpu_rootfs.is_dir() { + return Ok(gpu_rootfs); + } + Err(VmError::GpuRootfsNotFound { + path: gpu_rootfs.display().to_string(), + }) +} + /// Ensure a named instance rootfs exists, extracting from the embedded /// rootfs tarball on first use. /// @@ -365,7 +438,7 @@ fn sanitize_instance_name(name: &str) -> Result { /// Build a null-terminated C string array from a slice of strings. /// /// Returns both the `CString` owners (to keep them alive) and the pointer array. 
-fn c_string_array(strings: &[&str]) -> Result<(Vec<CString>, Vec<*const libc::c_char>), VmError> { +pub(crate) fn c_string_array( + strings: &[&str], +) -> Result<(Vec<CString>, Vec<*const libc::c_char>), VmError> { let owned: Vec<CString> = strings .iter() .map(|s| CString::new(*s)) @@ -570,7 +645,7 @@ fn extract_json_string(json: &str, key: &str) -> Option<String> { map.get(key)?.as_str().map(ToOwned::to_owned) } -fn clamp_log_level(level: u32) -> u32 { +pub(crate) fn clamp_log_level(level: u32) -> u32 { match level { 0 => ffi::KRUN_LOG_LEVEL_OFF, 1 => ffi::KRUN_LOG_LEVEL_ERROR, @@ -581,258 +656,29 @@ fn clamp_log_level(level: u32) -> u32 { } } -struct VmContext { - krun: &'static ffi::LibKrun, - ctx_id: u32, -} - -impl VmContext { - fn create(log_level: u32) -> Result<Self, VmError> { - let krun = ffi::libkrun()?; - unsafe { - check( - (krun.krun_init_log)( - ffi::KRUN_LOG_TARGET_DEFAULT, - clamp_log_level(log_level), - ffi::KRUN_LOG_STYLE_AUTO, - ffi::KRUN_LOG_OPTION_NO_ENV, - ), - "krun_init_log", - )?; - } - - let ctx_id = unsafe { (krun.krun_create_ctx)() }; - if ctx_id < 0 { - return Err(VmError::Krun { - func: "krun_create_ctx", - code: ctx_id, - }); - } - - Ok(Self { - krun, - ctx_id: ctx_id as u32, - }) - } - - fn set_vm_config(&self, vcpus: u8, mem_mib: u32) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_set_vm_config)(self.ctx_id, vcpus, mem_mib), - "krun_set_vm_config", - ) - } - } - - fn set_root(&self, rootfs: &Path) -> Result<(), VmError> { - let rootfs_c = path_to_cstring(rootfs)?; - unsafe { - check( - (self.krun.krun_set_root)(self.ctx_id, rootfs_c.as_ptr()), - "krun_set_root", - ) - } - } - - fn add_state_disk(&self, state_disk: &StateDiskConfig) -> Result<(), VmError> { - let Some(add_disk3) = self.krun.krun_add_disk3 else { - return Err(VmError::HostSetup( - "libkrun runtime does not expose krun_add_disk3; rebuild the VM runtime with block support" - .to_string(), - )); - }; - - let block_id_c = CString::new(state_disk.block_id.as_str())?; - let disk_path_c = 
path_to_cstring(&state_disk.path)?; - unsafe { - check( - add_disk3( - self.ctx_id, - block_id_c.as_ptr(), - disk_path_c.as_ptr(), - ffi::KRUN_DISK_FORMAT_RAW, - false, - false, - state_disk_sync_mode(), - ), - "krun_add_disk3", - ) - } - } - - fn set_workdir(&self, workdir: &str) -> Result<(), VmError> { - let workdir_c = CString::new(workdir)?; - unsafe { - check( - (self.krun.krun_set_workdir)(self.ctx_id, workdir_c.as_ptr()), - "krun_set_workdir", - ) - } - } - - fn disable_implicit_vsock(&self) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_disable_implicit_vsock)(self.ctx_id), - "krun_disable_implicit_vsock", - ) - } - } - - fn add_vsock(&self, tsi_features: u32) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_add_vsock)(self.ctx_id, tsi_features), - "krun_add_vsock", - ) - } - } - - #[cfg(target_os = "macos")] - fn add_net_unixgram( - &self, - socket_path: &Path, - mac: &[u8; 6], - features: u32, - flags: u32, - ) -> Result<(), VmError> { - let sock_c = path_to_cstring(socket_path)?; - unsafe { - check( - (self.krun.krun_add_net_unixgram)( - self.ctx_id, - sock_c.as_ptr(), - -1, - mac.as_ptr(), - features, - flags, - ), - "krun_add_net_unixgram", - ) - } - } - - #[allow(dead_code)] // FFI binding for future use (e.g. 
Linux networking) - fn add_net_unixstream( - &self, - socket_path: &Path, - mac: &[u8; 6], - features: u32, - ) -> Result<(), VmError> { - let sock_c = path_to_cstring(socket_path)?; - unsafe { - check( - (self.krun.krun_add_net_unixstream)( - self.ctx_id, - sock_c.as_ptr(), - -1, - mac.as_ptr(), - features, - 0, - ), - "krun_add_net_unixstream", - ) - } - } - - fn set_port_map(&self, port_map: &[String]) -> Result<(), VmError> { - let port_strs: Vec<&str> = port_map.iter().map(String::as_str).collect(); - let (_port_owners, port_ptrs) = c_string_array(&port_strs)?; - unsafe { - check( - (self.krun.krun_set_port_map)(self.ctx_id, port_ptrs.as_ptr()), - "krun_set_port_map", - ) - } - } - - fn add_vsock_port(&self, port: &VsockPort) -> Result<(), VmError> { - let socket_c = path_to_cstring(&port.socket_path)?; - unsafe { - check( - (self.krun.krun_add_vsock_port2)( - self.ctx_id, - port.port, - socket_c.as_ptr(), - port.listen, - ), - "krun_add_vsock_port2", - ) - } - } - - fn set_console_output(&self, path: &Path) -> Result<(), VmError> { - let console_c = path_to_cstring(path)?; - unsafe { - check( - (self.krun.krun_set_console_output)(self.ctx_id, console_c.as_ptr()), - "krun_set_console_output", - ) - } - } - - fn set_exec(&self, exec_path: &str, args: &[String], env: &[String]) -> Result<(), VmError> { - let exec_c = CString::new(exec_path)?; - let argv_strs: Vec<&str> = args.iter().map(String::as_str).collect(); - let (_argv_owners, argv_ptrs) = c_string_array(&argv_strs)?; - let env_strs: Vec<&str> = env.iter().map(String::as_str).collect(); - let (_env_owners, env_ptrs) = c_string_array(&env_strs)?; - - unsafe { - check( - (self.krun.krun_set_exec)( - self.ctx_id, - exec_c.as_ptr(), - argv_ptrs.as_ptr(), - env_ptrs.as_ptr(), - ), - "krun_set_exec", - ) - } - } - - fn start_enter(&self) -> i32 { - unsafe { (self.krun.krun_start_enter)(self.ctx_id) } - } -} - -impl Drop for VmContext { - fn drop(&mut self) { - unsafe { - let ret = 
(self.krun.krun_free_ctx)(self.ctx_id); - if ret < 0 { - eprintln!( - "warning: krun_free_ctx({}) failed with code {ret}", - self.ctx_id - ); - } - } - } -} - /// RAII guard that kills and waits on a gvproxy child process when dropped. /// /// This prevents orphaned gvproxy processes when early `?` returns in the /// launch function cause the child to be dropped before cleanup code runs. /// Call [`GvproxyGuard::disarm`] to take ownership of the child when it /// should outlive the guard (i.e., after a successful fork). -struct GvproxyGuard { +pub(crate) struct GvproxyGuard { child: Option, } impl GvproxyGuard { - fn new(child: std::process::Child) -> Self { + pub(crate) fn new(child: std::process::Child) -> Self { Self { child: Some(child) } } /// Take the child out of the guard, preventing it from being killed on drop. /// Use this after the launch is successful and the parent will manage cleanup. - fn disarm(&mut self) -> Option { + pub(crate) fn disarm(&mut self) -> Option { self.child.take() } /// Get the child's PID without disarming. - fn id(&self) -> Option { + pub(crate) fn id(&self) -> Option { self.child.as_ref().map(std::process::Child::id) } } @@ -852,7 +698,7 @@ impl Drop for GvproxyGuard { /// /// Sends a raw HTTP/1.1 POST request over the unix socket to avoid /// depending on `curl` being installed on the host. -fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { +pub(crate) fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { use std::io::{Read, Write}; use std::os::unix::net::UnixStream; @@ -908,7 +754,7 @@ fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { /// runtime state. If the state file was deleted (e.g. the user ran /// `rm -rf` on the data directory), we fall back to killing any gvproxy /// process holding the target ports. 
-fn kill_stale_gvproxy(rootfs: &Path) { +pub(crate) fn kill_stale_gvproxy(rootfs: &Path) { kill_stale_gvproxy_by_state(rootfs); } @@ -929,7 +775,7 @@ fn kill_stale_gvproxy_by_state(rootfs: &Path) { /// /// Used as a fallback when the VM state file is missing (e.g. after the /// user deleted the data directory while a VM was running). -fn kill_stale_gvproxy_by_port(port: u16) { +pub(crate) fn kill_stale_gvproxy_by_port(port: u16) { // Use lsof to find PIDs listening on the target port. let output = std::process::Command::new("lsof") .args(["-ti", &format!(":{port}")]) @@ -1009,7 +855,7 @@ fn is_process_named(_pid: libc::pid_t, _expected: &str) -> bool { false } -fn vm_rootfs_key(rootfs: &Path) -> String { +pub(crate) fn vm_rootfs_key(rootfs: &Path) -> String { let name = rootfs .file_name() .and_then(|part| part.to_str()) @@ -1078,7 +924,7 @@ fn ensure_state_disk_image(state_disk: &StateDiskConfig) -> Result<(), VmError> Ok(()) } -fn state_disk_sync_mode() -> u32 { +pub(crate) fn state_disk_sync_mode() -> u32 { #[cfg(target_os = "macos")] { ffi::KRUN_SYNC_RELAXED @@ -1154,7 +1000,7 @@ fn secure_socket_base(subdir: &str) -> Result { Ok(dir) } -fn gvproxy_socket_dir(rootfs: &Path) -> Result { +pub(crate) fn gvproxy_socket_dir(rootfs: &Path) -> Result { let dir = secure_socket_base("ovm-gv")?; // macOS unix socket path limit is tight (~104 bytes). Keep paths very short. @@ -1162,7 +1008,30 @@ fn gvproxy_socket_dir(rootfs: &Path) -> Result { Ok(dir.join(id)) } -fn gateway_host_port(config: &VmConfig) -> u16 { +/// Validate that a VFIO PCI address matches the BDF format `DDDD:BB:DD.F`. +/// +/// Rejects strings containing `/`, `..`, or non-hex characters to prevent +/// path traversal when the address is interpolated into sysfs paths. +fn validate_vfio_address(addr: &str) -> Result<(), VmError> { + let bytes = addr.as_bytes(); + if bytes.len() == 12 + && bytes[4] == b':' + && bytes[7] == b':' + && bytes[10] == b'.' 
+ && bytes[..4].iter().all(u8::is_ascii_hexdigit) + && bytes[5..7].iter().all(u8::is_ascii_hexdigit) + && bytes[8..10].iter().all(u8::is_ascii_hexdigit) + && bytes[11].is_ascii_digit() + && bytes[11] <= b'7' + { + return Ok(()); + } + Err(VmError::HostSetup(format!( + "invalid VFIO PCI address '{addr}': expected BDF format DDDD:BB:DD.F (e.g. 0000:41:00.0)" + ))) +} + +pub(crate) fn gateway_host_port(config: &VmConfig) -> u16 { config .port_map .first() @@ -1171,7 +1040,7 @@ fn gateway_host_port(config: &VmConfig) -> u16 { .unwrap_or(DEFAULT_GATEWAY_PORT) } -fn pick_gvproxy_ssh_port() -> Result { +pub(crate) fn pick_gvproxy_ssh_port() -> Result { let listener = std::net::TcpListener::bind(("127.0.0.1", 0)) .map_err(|e| VmError::HostSetup(format!("allocate gvproxy ssh port on localhost: {e}")))?; let port = listener @@ -1182,7 +1051,7 @@ fn pick_gvproxy_ssh_port() -> Result { Ok(port) } -fn path_to_cstring(path: &Path) -> Result { +pub(crate) fn path_to_cstring(path: &Path) -> Result { let s = path .to_str() .ok_or_else(|| VmError::InvalidPath(path.display().to_string()))?; @@ -1277,11 +1146,22 @@ pub fn launch(config: &VmConfig) -> Result { state_disk.path.display() ))); } - if let Some(state_disk) = &config.state_disk { + let fresh_state_disk = if let Some(state_disk) = &config.state_disk { + let existed_before = state_disk.path.is_file(); ensure_state_disk_image(state_disk)?; + !existed_before + } else { + false + }; + + // When the state disk is freshly created (deleted by user, --reset, or + // first boot), the VM will generate new PKI. Clear any cached host-side + // mTLS certs so `bootstrap_gateway` runs the cold-boot PKI fetch path + // instead of using stale certs that won't match the new VM CA. 
+ if fresh_state_disk || config.reset { + clear_warm_boot_certs(&config.gateway_name); } - let launch_start = Instant::now(); eprintln!("rootfs: {}", config.rootfs.display()); if let Some(state_disk) = &config.state_disk { eprintln!( @@ -1292,8 +1172,34 @@ pub fn launch(config: &VmConfig) -> Result { } eprintln!("vm: {} vCPU(s), {} MiB RAM", config.vcpus, config.mem_mib); - // The runtime is embedded in the binary and extracted on first use. - // Can be overridden via OPENSHELL_VM_RUNTIME_DIR for development. + raise_nofile_limit(); + + // ── Dispatch to the appropriate backend ───────────────────────── + + let use_chv = match config.backend { + VmBackendChoice::CloudHypervisor => true, + VmBackendChoice::Libkrun => false, + VmBackendChoice::Auto => config.gpu_enabled || config.vfio_device.is_some(), + }; + + if use_chv { + #[cfg(not(target_os = "linux"))] + return Err(VmError::HostSetup( + "cloud-hypervisor backend requires Linux with KVM".into(), + )); + + #[cfg(target_os = "linux")] + { + if let Some(ref addr) = config.vfio_device { + validate_vfio_address(addr)?; + } + let chv_backend = backend::cloud_hypervisor::CloudHypervisorBackend::new()?; + return backend::VmBackend::launch(&chv_backend, config); + } + } + + // libkrun path: resolve the embedded runtime bundle and load libkrun. + // Cloud-hypervisor resolves its own binaries in CloudHypervisorBackend::new(). let runtime_gvproxy = resolve_runtime_bundle()?; let runtime_dir = runtime_gvproxy.parent().ok_or_else(|| { VmError::HostSetup(format!( @@ -1302,413 +1208,12 @@ pub fn launch(config: &VmConfig) -> Result { )) })?; configure_runtime_loader_env(runtime_dir)?; - raise_nofile_limit(); - // ── Log runtime provenance ───────────────────────────────────── - // After configuring the loader, trigger library loading so that - // provenance is captured before we proceed with VM configuration. 
let _ = ffi::libkrun()?; log_runtime_provenance(runtime_dir); - // ── Configure the microVM ────────────────────────────────────── - - let vm = VmContext::create(config.log_level)?; - vm.set_vm_config(config.vcpus, config.mem_mib)?; - vm.set_root(&config.rootfs)?; - if let Some(state_disk) = &config.state_disk { - vm.add_state_disk(state_disk)?; - } - vm.set_workdir(&config.workdir)?; - - // Networking setup — use a drop guard so gvproxy is killed if we - // return early via `?` before reaching the parent's cleanup code. - let mut gvproxy_guard: Option = None; - let mut gvproxy_api_sock: Option = None; - - match &config.net { - NetBackend::Tsi => { - // Default TSI — no special setup needed. - } - NetBackend::None => { - vm.disable_implicit_vsock()?; - vm.add_vsock(0)?; - eprintln!("Networking: disabled (no TSI, no virtio-net)"); - } - NetBackend::Gvproxy { binary } => { - if !binary.exists() { - return Err(VmError::BinaryNotFound { - path: binary.display().to_string(), - hint: "Install Podman Desktop or place gvproxy in PATH".to_string(), - }); - } - - // Create temp socket paths - let run_dir = config - .rootfs - .parent() - .unwrap_or(&config.rootfs) - .to_path_buf(); - let rootfs_key = vm_rootfs_key(&config.rootfs); - let sock_base = gvproxy_socket_dir(&config.rootfs)?; - let net_sock = sock_base.with_extension("v"); - let api_sock = sock_base.with_extension("a"); - - // Kill any stale gvproxy process from a previous run. - // First try via the saved PID in the state file, then fall - // back to killing any gvproxy holding our target ports (covers - // the case where the state file was deleted). - kill_stale_gvproxy(&config.rootfs); - for pm in &config.port_map { - if let Some(host_port) = pm.split(':').next().and_then(|p| p.parse::().ok()) { - kill_stale_gvproxy_by_port(host_port); - } - } - - // Clean stale sockets (including the -krun.sock file that - // libkrun creates as its datagram endpoint on macOS). 
- let _ = std::fs::remove_file(&net_sock); - let _ = std::fs::remove_file(&api_sock); - let krun_sock = sock_base.with_extension("v-krun.sock"); - let _ = std::fs::remove_file(&krun_sock); - - // Start gvproxy - eprintln!("Starting gvproxy: {}", binary.display()); - let ssh_port = pick_gvproxy_ssh_port()?; - let gvproxy_log = run_dir.join(format!("{rootfs_key}-gvproxy.log")); - let gvproxy_log_file = std::fs::File::create(&gvproxy_log) - .map_err(|e| VmError::Fork(format!("failed to create gvproxy log: {e}")))?; - - // On Linux, gvproxy uses QEMU mode (SOCK_STREAM) since the vfkit - // unixgram scheme is macOS/vfkit-specific. On macOS, use vfkit mode. - #[cfg(target_os = "linux")] - let (gvproxy_net_flag, gvproxy_net_url) = - ("-listen-qemu", format!("unix://{}", net_sock.display())); - #[cfg(target_os = "macos")] - let (gvproxy_net_flag, gvproxy_net_url) = ( - "-listen-vfkit", - format!("unixgram://{}", net_sock.display()), - ); - - let child = std::process::Command::new(binary) - .arg(gvproxy_net_flag) - .arg(&gvproxy_net_url) - .arg("-listen") - .arg(format!("unix://{}", api_sock.display())) - .arg("-ssh-port") - .arg(ssh_port.to_string()) - .stdout(std::process::Stdio::null()) - .stderr(gvproxy_log_file) - .spawn() - .map_err(|e| VmError::Fork(format!("failed to start gvproxy: {e}")))?; - - eprintln!( - "gvproxy started (pid {}, ssh port {}) [{:.1}s]", - child.id(), - ssh_port, - launch_start.elapsed().as_secs_f64() - ); - - // Wait for the socket to appear (exponential backoff: 5ms → 100ms). 
- { - let deadline = Instant::now() + std::time::Duration::from_secs(5); - let mut interval = std::time::Duration::from_millis(5); - while !net_sock.exists() { - if Instant::now() >= deadline { - return Err(VmError::Fork( - "gvproxy socket did not appear within 5s".to_string(), - )); - } - std::thread::sleep(interval); - interval = (interval * 2).min(std::time::Duration::from_millis(100)); - } - } - - // Disable implicit TSI and add virtio-net via gvproxy - vm.disable_implicit_vsock()?; - vm.add_vsock(0)?; - // This MAC matches gvproxy's default static DHCP lease for - // 192.168.127.2. Using a different MAC can cause the gVisor - // network stack to misroute or drop packets. - let mac: [u8; 6] = [0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee]; - - // COMPAT_NET_FEATURES from libkrun.h - const NET_FEATURE_CSUM: u32 = 1 << 0; - const NET_FEATURE_GUEST_CSUM: u32 = 1 << 1; - const NET_FEATURE_GUEST_TSO4: u32 = 1 << 7; - const NET_FEATURE_GUEST_UFO: u32 = 1 << 10; - const NET_FEATURE_HOST_TSO4: u32 = 1 << 11; - const NET_FEATURE_HOST_UFO: u32 = 1 << 14; - const COMPAT_NET_FEATURES: u32 = NET_FEATURE_CSUM - | NET_FEATURE_GUEST_CSUM - | NET_FEATURE_GUEST_TSO4 - | NET_FEATURE_GUEST_UFO - | NET_FEATURE_HOST_TSO4 - | NET_FEATURE_HOST_UFO; - - // On Linux use unixstream (SOCK_STREAM) to connect to gvproxy's - // QEMU listener. On macOS use unixgram (SOCK_DGRAM) with the vfkit - // magic byte for the vfkit listener. 
- #[cfg(target_os = "linux")] - vm.add_net_unixstream(&net_sock, &mac, COMPAT_NET_FEATURES)?; - #[cfg(target_os = "macos")] - { - const NET_FLAG_VFKIT: u32 = 1 << 0; - vm.add_net_unixgram(&net_sock, &mac, COMPAT_NET_FEATURES, NET_FLAG_VFKIT)?; - } - - eprintln!( - "Networking: gvproxy (virtio-net) [{:.1}s]", - launch_start.elapsed().as_secs_f64() - ); - gvproxy_guard = Some(GvproxyGuard::new(child)); - gvproxy_api_sock = Some(api_sock); - } - } - - // Port mapping (TSI only) - if !config.port_map.is_empty() && matches!(config.net, NetBackend::Tsi) { - vm.set_port_map(&config.port_map)?; - } - - for vsock_port in &config.vsock_ports { - if let Some(parent) = vsock_port.socket_path.parent() { - std::fs::create_dir_all(parent).map_err(|e| { - VmError::RuntimeState(format!("create vsock socket dir {}: {e}", parent.display())) - })?; - } - // libkrun returns EEXIST if the socket file is already present from a - // previous run. Remove any stale socket before registering the port. - let _ = std::fs::remove_file(&vsock_port.socket_path); - vm.add_vsock_port(vsock_port)?; - } - - // Console output - let console_log = config.console_output.clone().unwrap_or_else(|| { - config - .rootfs - .parent() - .unwrap_or(&config.rootfs) - .join(format!("{}-console.log", vm_rootfs_key(&config.rootfs))) - }); - vm.set_console_output(&console_log)?; - - // envp: use provided env or minimal defaults - let mut env: Vec = if config.env.is_empty() { - vec![ - "HOME=/root", - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", - "TERM=xterm", - ] - .into_iter() - .map(ToOwned::to_owned) - .collect() - } else { - config.env.clone() - }; - if let Some(state_disk) = &config.state_disk - && !env - .iter() - .any(|entry| entry.starts_with("OPENSHELL_VM_STATE_DISK_DEVICE=")) - { - env.push(format!( - "OPENSHELL_VM_STATE_DISK_DEVICE={}", - state_disk.guest_device - )); - } - vm.set_exec(&config.exec_path, &config.args, &env)?; - - // ── Fork and enter the VM 
────────────────────────────────────── - // - // krun_start_enter() never returns — it calls exit() when the guest - // process exits. We fork so the parent can monitor and report. - - let boot_start = Instant::now(); - eprintln!("Booting microVM..."); - - let pid = unsafe { libc::fork() }; - match pid { - -1 => Err(VmError::Fork(std::io::Error::last_os_error().to_string())), - 0 => { - // Child process: enter the VM (never returns on success) - let ret = vm.start_enter(); - eprintln!("krun_start_enter failed: {ret}"); - std::process::exit(1); - } - _ => { - // Parent: wait for child - if config.exec_path == "/srv/openshell-vm-init.sh" { - let gvproxy_pid = gvproxy_guard.as_ref().and_then(GvproxyGuard::id); - if let Err(err) = - write_vm_runtime_state(&config.rootfs, pid, &console_log, gvproxy_pid) - { - unsafe { - libc::kill(pid, libc::SIGTERM); - } - // Guard drop will kill gvproxy automatically - drop(gvproxy_guard); - clear_vm_runtime_state(&config.rootfs); - return Err(err); - } - } - eprintln!( - "VM started (child pid {pid}) [{:.1}s]", - boot_start.elapsed().as_secs_f64() - ); - for pm in &config.port_map { - let host_port = pm.split(':').next().unwrap_or(pm); - eprintln!(" port {pm} -> http://localhost:{host_port}"); - } - eprintln!("Console output: {}", console_log.display()); - - // Set up gvproxy port forwarding via its HTTP API. - // The port_map entries use the same "host:guest" format - // as TSI, but here we translate them into gvproxy expose - // calls targeting the guest IP (192.168.127.2). - // - // Instead of a fixed 500ms sleep, poll the API socket with - // exponential backoff (5ms → 200ms, ~1s total budget). - if let Some(ref api_sock) = gvproxy_api_sock { - let fwd_start = Instant::now(); - // Wait for the API socket to appear (it lags slightly - // behind the vfkit data socket). 
- { - let deadline = Instant::now() + std::time::Duration::from_secs(2); - let mut interval = std::time::Duration::from_millis(5); - while !api_sock.exists() { - if Instant::now() >= deadline { - eprintln!( - "warning: gvproxy API socket not ready after 2s, attempting anyway" - ); - break; - } - std::thread::sleep(interval); - interval = (interval * 2).min(std::time::Duration::from_millis(200)); - } - } - - let guest_ip = "192.168.127.2"; - - for pm in &config.port_map { - let parts: Vec<&str> = pm.split(':').collect(); - let (host_port, guest_port) = match parts.len() { - 2 => (parts[0], parts[1]), - 1 => (parts[0], parts[0]), - _ => { - eprintln!(" skipping invalid port mapping: {pm}"); - continue; - } - }; - - let expose_body = format!( - r#"{{"local":":{host_port}","remote":"{guest_ip}:{guest_port}","protocol":"tcp"}}"# - ); - - // Retry with exponential backoff — gvproxy's internal - // netstack may not be ready immediately after socket creation. - let mut expose_ok = false; - let mut retry_interval = std::time::Duration::from_millis(100); - let expose_deadline = Instant::now() + std::time::Duration::from_secs(10); - loop { - match gvproxy_expose(api_sock, &expose_body) { - Ok(()) => { - eprintln!(" port {host_port} -> {guest_ip}:{guest_port}"); - expose_ok = true; - break; - } - Err(e) => { - if Instant::now() >= expose_deadline { - eprintln!(" port {host_port}: {e} (retries exhausted)"); - break; - } - std::thread::sleep(retry_interval); - retry_interval = - (retry_interval * 2).min(std::time::Duration::from_secs(1)); - } - } - } - if !expose_ok { - return Err(VmError::HostSetup(format!( - "failed to forward port {host_port} via gvproxy" - ))); - } - } - eprintln!( - "Port forwarding ready [{:.1}s]", - fwd_start.elapsed().as_secs_f64() - ); - } - - // Bootstrap the OpenShell control plane and wait for the - // service to be reachable. Only for the gateway preset, and - // only when port forwarding is configured (i.e. 
the gateway - // is reachable from the host). During rootfs pre-init builds, - // no --port is specified so there is nothing to health-check - // — the build script has its own kubectl-based readiness - // checks inside the VM. - if config.exec_path == "/srv/openshell-vm-init.sh" && !config.port_map.is_empty() { - // Bootstrap stores host-side metadata and mTLS creds. - // With pre-baked rootfs (Path 1) this reads PKI directly - // from virtio-fs — no kubectl or port forwarding needed. - // Cold boot (Path 2) writes secret manifests into the - // k3s auto-deploy directory via virtio-fs. - let gateway_port = gateway_host_port(config); - bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port)?; - - // Wait for the gRPC health check to pass. This ensures - // the service is fully operational, not just accepting - // TCP connections. The health check confirms the full - // path (gvproxy → kube-proxy nftables → pod:8080) and - // that the gRPC service is responding to requests. - health::wait_for_gateway_ready(gateway_port, &config.gateway_name)?; - } - - eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); - eprintln!("Press Ctrl+C to stop."); - - // Forward signals to child - unsafe { - libc::signal( - libc::SIGINT, - forward_signal as *const () as libc::sighandler_t, - ); - libc::signal( - libc::SIGTERM, - forward_signal as *const () as libc::sighandler_t, - ); - CHILD_PID.store(pid, std::sync::atomic::Ordering::Relaxed); - } - - let mut status: libc::c_int = 0; - unsafe { - libc::waitpid(pid, &raw mut status, 0); - } - - // Clean up gvproxy — disarm the guard and do explicit cleanup - // so we can print the "stopped" message. 
- if config.exec_path == "/srv/openshell-vm-init.sh" { - clear_vm_runtime_state(&config.rootfs); - } - if let Some(mut guard) = gvproxy_guard - && let Some(mut child) = guard.disarm() - { - let _ = child.kill(); - let _ = child.wait(); - eprintln!("gvproxy stopped"); - } - - if libc::WIFEXITED(status) { - let code = libc::WEXITSTATUS(status); - eprintln!("VM exited with code {code}"); - return Ok(code); - } else if libc::WIFSIGNALED(status) { - let sig = libc::WTERMSIG(status); - eprintln!("VM killed by signal {sig}"); - return Ok(128 + sig); - } - - Ok(status) - } - } + let libkrun_backend = backend::libkrun::LibkrunBackend; + backend::VmBackend::launch(&libkrun_backend, config) } // ── Post-boot bootstrap ──────────────────────────────────────────────── @@ -1727,7 +1232,11 @@ const DEFAULT_GATEWAY_PORT: u16 = 30051; /// 2. **First boot / post-reset**: poll the exec agent to `cat` each PEM file /// from `/opt/openshell/pki/` until the files exist (PKI generation has /// finished), then store them in `~/.config/openshell/gateways//mtls/`. -fn bootstrap_gateway(rootfs: &Path, gateway_name: &str, gateway_port: u16) -> Result<(), VmError> { +pub(crate) fn bootstrap_gateway( + rootfs: &Path, + gateway_name: &str, + gateway_port: u16, +) -> Result<(), VmError> { let bootstrap_start = Instant::now(); let metadata = openshell_bootstrap::GatewayMetadata { @@ -1921,6 +1430,31 @@ fn is_warm_boot(gateway_name: &str) -> bool { true } +/// Remove cached mTLS certs from the host so the next `bootstrap_gateway` +/// call treats this as a cold boot and fetches fresh PKI from the VM. +/// +/// Called when the state disk is freshly created or `--reset` is used, +/// since the VM will generate new PKI that won't match stale host certs. 
+fn clear_warm_boot_certs(gateway_name: &str) { + let Ok(home) = std::env::var("HOME") else { + return; + }; + let config_base = + std::env::var("XDG_CONFIG_HOME").unwrap_or_else(|_| format!("{home}/.config")); + let mtls_dir = PathBuf::from(&config_base) + .join("openshell/gateways") + .join(gateway_name) + .join("mtls"); + + if mtls_dir.is_dir() { + if let Err(e) = std::fs::remove_dir_all(&mtls_dir) { + eprintln!("Warning: failed to clear stale mTLS certs: {e}"); + } else { + eprintln!("Cleared stale host mTLS certs"); + } + } +} + /// Compare the CA cert on the rootfs (authoritative source) against the /// host-side copy. If they differ, re-copy all client certs from the rootfs. /// @@ -1956,9 +1490,9 @@ fn sync_host_certs_if_stale( Ok(()) } -static CHILD_PID: std::sync::atomic::AtomicI32 = std::sync::atomic::AtomicI32::new(0); +pub(crate) static CHILD_PID: std::sync::atomic::AtomicI32 = std::sync::atomic::AtomicI32::new(0); -extern "C" fn forward_signal(_sig: libc::c_int) { +pub(crate) extern "C" fn forward_signal(_sig: libc::c_int) { let pid = CHILD_PID.load(std::sync::atomic::Ordering::Relaxed); if pid > 0 { unsafe { diff --git a/crates/openshell-vm/src/main.rs b/crates/openshell-vm/src/main.rs index bb9d854b1..1b3aa6423 100644 --- a/crates/openshell-vm/src/main.rs +++ b/crates/openshell-vm/src/main.rs @@ -92,6 +92,16 @@ struct Cli { /// unclean shutdown. #[arg(long)] reset: bool, + + /// Enable GPU passthrough. Optionally specify a PCI address + /// (e.g. `0000:41:00.0`). Uses cloud-hypervisor backend with VFIO. + #[arg(long, num_args = 0..=1, default_missing_value = "auto")] + gpu: Option, + + /// Hypervisor backend: "auto" (default), "libkrun", or "cloud-hypervisor". + /// Auto selects cloud-hypervisor when --gpu is set, libkrun otherwise. 
+ #[arg(long, default_value = "auto")] + backend: String, } #[derive(Subcommand)] @@ -196,12 +206,16 @@ fn run(cli: Cli) -> Result> { return Err("openshell-vm exec requires a command when stdin is not a TTY".into()); } } + let exec_rootfs = if let Some(explicit) = cli.rootfs { + explicit + } else if cli.gpu.is_some() { + openshell_vm::named_gpu_rootfs_dir(&cli.name)? + } else { + openshell_vm::named_rootfs_dir(&cli.name)? + }; return Ok(openshell_vm::exec_running_vm( openshell_vm::VmExecOptions { - rootfs: Some( - cli.rootfs - .unwrap_or(openshell_vm::named_rootfs_dir(&cli.name)?), - ), + rootfs: Some(exec_rootfs), command, workdir, env, @@ -223,12 +237,59 @@ fn run(cli: Cli) -> Result> { } }; - let rootfs = cli - .rootfs - .map_or_else(|| openshell_vm::ensure_named_rootfs(&cli.name), Ok)?; + let rootfs = if let Some(explicit) = cli.rootfs { + Ok(explicit) + } else if cli.gpu.is_some() { + openshell_vm::ensure_gpu_rootfs(&cli.name) + } else { + openshell_vm::ensure_named_rootfs(&cli.name) + }?; let gateway_name = openshell_vm::gateway_name(&cli.name)?; + let (gpu_enabled, vfio_device, _gpu_guard) = match cli.gpu { + Some(ref addr) if addr != "auto" => { + let state = openshell_vm::gpu_passthrough::prepare_gpu_for_passthrough(Some(addr))?; + let bdf = state.pci_addr.clone(); + ( + true, + Some(bdf), + Some(openshell_vm::gpu_passthrough::GpuBindGuard::new(state)), + ) + } + Some(_) => { + let state = openshell_vm::gpu_passthrough::prepare_gpu_for_passthrough(None)?; + let bdf = state.pci_addr.clone(); + ( + true, + Some(bdf), + Some(openshell_vm::gpu_passthrough::GpuBindGuard::new(state)), + ) + } + None => (false, None, None), + }; + + let backend_choice = match cli.backend.as_str() { + "cloud-hypervisor" | "chv" => openshell_vm::VmBackendChoice::CloudHypervisor, + "libkrun" => { + if gpu_enabled { + return Err( + "--backend libkrun is incompatible with --gpu (libkrun does not support \ + VFIO passthrough). Use --backend auto or --backend cloud-hypervisor." 
+ .into(), + ); + } + openshell_vm::VmBackendChoice::Libkrun + } + "auto" => openshell_vm::VmBackendChoice::Auto, + other => { + return Err(format!( + "unknown --backend: {other} (expected: auto, libkrun, cloud-hypervisor)" + ) + .into()); + } + }; + let mut config = if let Some(exec_path) = cli.exec { openshell_vm::VmConfig { rootfs, @@ -246,6 +307,9 @@ fn run(cli: Cli) -> Result> { reset: cli.reset, gateway_name, state_disk: None, + gpu_enabled, + vfio_device, + backend: backend_choice, } } else { let mut c = openshell_vm::VmConfig::gateway(rootfs); @@ -261,6 +325,9 @@ fn run(cli: Cli) -> Result> { c.net = net_backend; c.reset = cli.reset; c.gateway_name = gateway_name; + c.gpu_enabled = gpu_enabled; + c.vfio_device = vfio_device; + c.backend = backend_choice; if state_disk_disabled() { c.state_disk = None; } diff --git a/crates/openshell-vm/tests/gpu_passthrough_implementation.rs b/crates/openshell-vm/tests/gpu_passthrough_implementation.rs new file mode 100644 index 000000000..4985ba39b --- /dev/null +++ b/crates/openshell-vm/tests/gpu_passthrough_implementation.rs @@ -0,0 +1,114 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Integration tests for GPU passthrough on real hardware. +//! +//! Gated by `OPENSHELL_VM_GPU_E2E=1`. On machines without a real GPU, +//! all tests early-return and pass. 
+ +use openshell_vm::gpu_passthrough::{ + GpuBindGuard, HostNvidiaVfioReadiness, prepare_gpu_for_passthrough, + probe_host_nvidia_vfio_readiness, +}; + +fn gpu_e2e_enabled() -> bool { + std::env::var("OPENSHELL_VM_GPU_E2E").as_deref() == Ok("1") +} + +#[test] +fn nvidia_gpu_passthrough_is_available() { + if !gpu_e2e_enabled() { + eprintln!("OPENSHELL_VM_GPU_E2E not set — skipping GPU passthrough gate test"); + return; + } + assert!( + openshell_vm::gpu_passthrough::nvidia_gpu_available_for_vm_passthrough(), + "GPU passthrough gate returned false on a GPU CI runner — \ + check VFIO binding and cloud-hypervisor runtime bundle" + ); +} + +#[test] +fn bind_and_rebind_real_gpu() { + if !gpu_e2e_enabled() { + return; + } + + let state = prepare_gpu_for_passthrough(None).expect("should find and bind a GPU"); + + let results = probe_host_nvidia_vfio_readiness(); + let (_, readiness) = results + .iter() + .find(|(a, _)| a == &state.pci_addr) + .expect("bound GPU should appear in probe"); + assert_eq!(*readiness, HostNvidiaVfioReadiness::VfioBoundReady); + + state.restore().expect("restore should succeed"); + + let results = probe_host_nvidia_vfio_readiness(); + let (_, readiness) = results + .iter() + .find(|(a, _)| a == &state.pci_addr) + .expect("restored GPU should appear in probe"); + assert_eq!(*readiness, HostNvidiaVfioReadiness::BoundToNvidia); +} + +#[test] +fn safety_checks_pass_on_ci_gpu() { + if !gpu_e2e_enabled() { + return; + } + + // `prepare_gpu_for_passthrough` runs all safety checks internally + // (display-attached, IOMMU enabled, VFIO modules loaded, sysfs + // permissions). Success here validates that the CI GPU is headless, + // IOMMU is on, and VFIO modules are loaded. 
+ let state = prepare_gpu_for_passthrough(None) + .expect("all safety checks should pass on a headless CI GPU"); + assert!(!state.pci_addr.is_empty()); + + state.restore().expect("restore should succeed"); +} + +#[test] +fn guard_restores_on_drop_real_gpu() { + if !gpu_e2e_enabled() { + return; + } + + let state = prepare_gpu_for_passthrough(None).expect("should find and bind a GPU"); + let pci_addr = state.pci_addr.clone(); + + let guard = GpuBindGuard::new(state); + drop(guard); + + let output = std::process::Command::new("nvidia-smi") + .arg("--query-gpu=pci.bus_id") + .arg("--format=csv,noheader") + .output() + .expect("nvidia-smi should be available after guard drop"); + assert!( + output.status.success(), + "nvidia-smi failed after guard drop" + ); + + let stdout = String::from_utf8_lossy(&output.stdout); + let normalized_addr = pci_addr.to_uppercase(); + assert!( + stdout.to_uppercase().contains(&normalized_addr), + "nvidia-smi should list the restored GPU {pci_addr}, got: {stdout}" + ); +} + +#[test] +fn auto_select_finds_ci_gpu() { + if !gpu_e2e_enabled() { + return; + } + + let state = prepare_gpu_for_passthrough(None).expect("auto-select should find a GPU on CI"); + assert!(!state.pci_addr.is_empty()); + assert!(state.did_bind); + + state.restore().expect("restore should succeed"); +} diff --git a/crates/openshell-vm/tests/vm_boot_smoke.rs b/crates/openshell-vm/tests/vm_boot_smoke.rs new file mode 100644 index 000000000..ffdb16595 --- /dev/null +++ b/crates/openshell-vm/tests/vm_boot_smoke.rs @@ -0,0 +1,151 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Non-GPU cloud-hypervisor boot smoke test. +//! +//! Boots a cloud-hypervisor VM **without** VFIO/GPU passthrough and verifies +//! the kernel boots and init runs. This catches backend regressions on regular +//! CI runners that lack GPU hardware. +//! +//! 
Gated on `OPENSHELL_VM_BACKEND=cloud-hypervisor` — skipped when the env +//! var is absent or set to a different backend. +//! +//! Requires the VM runtime bundle (cloud-hypervisor, vmlinux, virtiofsd, +//! rootfs) to be installed. Set `OPENSHELL_VM_RUNTIME_DIR` or run +//! `mise run vm:bundle-runtime` first. +//! +//! Run explicitly: +//! +//! ```sh +//! OPENSHELL_VM_BACKEND=cloud-hypervisor cargo test -p openshell-vm --test vm_boot_smoke +//! ``` + +#![allow(unsafe_code)] + +use std::process::{Command, Stdio}; +use std::time::Duration; + +const GATEWAY: &str = env!("CARGO_BIN_EXE_openshell-vm"); + +fn runtime_bundle_dir() -> std::path::PathBuf { + std::path::Path::new(GATEWAY) + .parent() + .expect("openshell-vm binary has no parent") + .join("openshell-vm.runtime") +} + +fn skip_unless_chv() -> bool { + if std::env::var("OPENSHELL_VM_BACKEND").as_deref() != Ok("cloud-hypervisor") { + eprintln!("OPENSHELL_VM_BACKEND != cloud-hypervisor — skipping"); + return true; + } + false +} + +fn require_bundle() { + let bundle = runtime_bundle_dir(); + if !bundle.is_dir() { + panic!( + "VM runtime bundle not found at {}. Run `mise run vm:bundle-runtime` first.", + bundle.display() + ); + } +} + +#[test] +fn cloud_hypervisor_exec_exits_cleanly() { + if skip_unless_chv() { + return; + } + require_bundle(); + + // Boot with --exec /bin/true --net none. The cloud-hypervisor backend + // wraps the exec command in a script that calls `poweroff -f` after + // completion, causing a clean ACPI shutdown. + let mut child = Command::new(GATEWAY) + .args([ + "--backend", + "cloud-hypervisor", + "--net", + "none", + "--exec", + "/bin/true", + ]) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .expect("failed to start openshell-vm"); + + // The VM should boot, run /bin/true, and exit within ~5s. + // Give 30s for slow CI. 
+ let timeout = Duration::from_secs(30); + let start = std::time::Instant::now(); + + loop { + match child.try_wait() { + Ok(Some(status)) => { + assert!( + status.success(), + "cloud-hypervisor --exec /bin/true exited with {status}" + ); + return; + } + Ok(None) => { + if start.elapsed() > timeout { + let _ = unsafe { libc::kill(child.id() as i32, libc::SIGKILL) }; + let _ = child.wait(); + panic!("cloud-hypervisor VM did not exit within {timeout:?}"); + } + std::thread::sleep(Duration::from_millis(500)); + } + Err(e) => panic!("error waiting for openshell-vm: {e}"), + } + } +} + +#[test] +fn cloud_hypervisor_boots_without_gpu() { + if skip_unless_chv() { + return; + } + require_bundle(); + + // Full gateway boot requires TAP networking (root/CAP_NET_ADMIN). + // Skip unless running as root. + if !nix_is_root() { + eprintln!("skipping full gateway boot — requires root for TAP networking"); + return; + } + + let mut child = Command::new(GATEWAY) + .args(["--backend", "cloud-hypervisor"]) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .expect("failed to start openshell-vm"); + + let addr: std::net::SocketAddr = ([127, 0, 0, 1], 30051).into(); + let timeout = Duration::from_secs(180); + let start = std::time::Instant::now(); + let mut reachable = false; + + while start.elapsed() < timeout { + if std::net::TcpStream::connect_timeout(&addr, Duration::from_secs(1)).is_ok() { + reachable = true; + break; + } + std::thread::sleep(Duration::from_secs(2)); + } + + let _ = unsafe { libc::kill(child.id() as i32, libc::SIGTERM) }; + let _ = child.wait(); + + assert!( + reachable, + "cloud-hypervisor VM service on port 30051 not reachable within {timeout:?}" + ); +} + +fn nix_is_root() -> bool { + unsafe { libc::geteuid() == 0 } +} diff --git a/tasks/scripts/vm/build-cloud-hypervisor.sh b/tasks/scripts/vm/build-cloud-hypervisor.sh new file mode 100755 index 000000000..af0c913b1 --- /dev/null +++ b/tasks/scripts/vm/build-cloud-hypervisor.sh @@ -0,0 +1,75 @@ 
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Fetch the cloud-hypervisor and virtiofsd binaries for GPU passthrough.
+#
+# These are only needed on Linux for VFIO GPU passthrough via the
+# cloud-hypervisor backend. cloud-hypervisor is downloaded pre-built from
+# its GitHub release page; virtiofsd is built from its GitLab source tarball.
+#
+# Usage:
+#   ./build-cloud-hypervisor.sh [--output-dir <dir>]
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${SCRIPT_DIR}/_lib.sh"
+ROOT="$(vm_lib_root)"
+
+source "${ROOT}/crates/openshell-vm/pins.env" 2>/dev/null || true
+
+CLOUD_HYPERVISOR_VERSION="${CLOUD_HYPERVISOR_VERSION:-v42.0}"
+VIRTIOFSD_VERSION="${VIRTIOFSD_VERSION:-v1.13.0}"
+OUTPUT_DIR="${ROOT}/target/libkrun-build"
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --output-dir) OUTPUT_DIR="$2"; shift 2 ;;
+    *) echo "Unknown argument: $1" >&2; exit 1 ;;
+  esac
+done
+
+if [ "$(uname -s)" != "Linux" ]; then
+  echo "Error: cloud-hypervisor GPU passthrough is Linux-only" >&2
+  exit 1
+fi
+
+mkdir -p "$OUTPUT_DIR"
+
+HOST_ARCH="$(uname -m)"
+case "$HOST_ARCH" in
+  aarch64) CHV_ARCH="aarch64"; VIRTIOFSD_ARCH="aarch64" ;;
+  x86_64) CHV_ARCH="x86_64"; VIRTIOFSD_ARCH="x86_64" ;;
+  *) echo "Error: Unsupported architecture: ${HOST_ARCH}" >&2; exit 1 ;;
+esac
+
+echo "==> Downloading cloud-hypervisor ${CLOUD_HYPERVISOR_VERSION} for ${HOST_ARCH}..."
+CHV_URL="https://github.com/cloud-hypervisor/cloud-hypervisor/releases/download/${CLOUD_HYPERVISOR_VERSION}/cloud-hypervisor-static" +if [ "$CHV_ARCH" = "aarch64" ]; then + CHV_URL="https://github.com/cloud-hypervisor/cloud-hypervisor/releases/download/${CLOUD_HYPERVISOR_VERSION}/cloud-hypervisor-static-aarch64" +fi + +curl -fsSL -o "${OUTPUT_DIR}/cloud-hypervisor" "$CHV_URL" +chmod +x "${OUTPUT_DIR}/cloud-hypervisor" +echo " Downloaded: cloud-hypervisor" + +echo "==> Building virtiofsd ${VIRTIOFSD_VERSION} from source..." +VIRTIOFSD_SRC="$(mktemp -d)" +VIRTIOFSD_TARBALL_URL="https://gitlab.com/virtio-fs/virtiofsd/-/archive/${VIRTIOFSD_VERSION}/virtiofsd-${VIRTIOFSD_VERSION}.tar.gz" +curl -fsSL "$VIRTIOFSD_TARBALL_URL" | tar -xzf - -C "$VIRTIOFSD_SRC" --strip-components=1 +rm -f "${VIRTIOFSD_SRC}/Cargo.lock" + +CARGO_CMD="cargo" +if command -v mise &>/dev/null; then + CARGO_CMD="mise exec -- cargo" +fi +$CARGO_CMD build --release --manifest-path "${VIRTIOFSD_SRC}/Cargo.toml" +cp "${VIRTIOFSD_SRC}/target/release/virtiofsd" "${OUTPUT_DIR}/virtiofsd" +chmod +x "${OUTPUT_DIR}/virtiofsd" +rm -rf "$VIRTIOFSD_SRC" +echo " Built: virtiofsd" + +echo "" +echo "==> GPU passthrough binaries ready in ${OUTPUT_DIR}" +ls -lah "${OUTPUT_DIR}/cloud-hypervisor" "${OUTPUT_DIR}/virtiofsd" 2>/dev/null || true diff --git a/tasks/scripts/vm/build-libkrun.sh b/tasks/scripts/vm/build-libkrun.sh index 9e2217f50..621332366 100755 --- a/tasks/scripts/vm/build-libkrun.sh +++ b/tasks/scripts/vm/build-libkrun.sh @@ -239,6 +239,18 @@ make -j"$(nproc)" cp libkrunfw.so* "$OUTPUT_DIR/" echo " Built: $(ls "$OUTPUT_DIR"/libkrunfw.so* | xargs -n1 basename | tr '\n' ' ')" +# Copy vmlinux kernel image for cloud-hypervisor GPU passthrough. +# This is the uncompressed kernel built by libkrunfw's kernel build. 
+if [ -f "${KERNEL_SOURCES}/vmlinux" ]; then + cp "${KERNEL_SOURCES}/vmlinux" "$OUTPUT_DIR/vmlinux" + echo " Copied vmlinux for cloud-hypervisor GPU passthrough" +elif [ -f "vmlinux" ]; then + cp "vmlinux" "$OUTPUT_DIR/vmlinux" + echo " Copied vmlinux for cloud-hypervisor GPU passthrough" +else + echo " Warning: vmlinux not found in kernel build tree (GPU passthrough will not be available)" >&2 +fi + cd "$BUILD_DIR" # ── Build libkrun (VMM) ───────────────────────────────────────────────── diff --git a/tasks/scripts/vm/download-kernel-runtime.sh b/tasks/scripts/vm/download-kernel-runtime.sh index 8f0427af9..5e60d3c75 100755 --- a/tasks/scripts/vm/download-kernel-runtime.sh +++ b/tasks/scripts/vm/download-kernel-runtime.sh @@ -81,11 +81,11 @@ DOWNLOAD_DIR="${ROOT}/target/vm-runtime-download" mkdir -p "$DOWNLOAD_DIR" "$OUTPUT_DIR" echo "==> Downloading ${TARBALL_NAME} from ${RELEASE_TAG}..." +rm -f "${DOWNLOAD_DIR}/${TARBALL_NAME}" gh release download "${RELEASE_TAG}" \ --repo "${REPO}" \ --pattern "${TARBALL_NAME}" \ - --dir "${DOWNLOAD_DIR}" \ - --clobber + --dir "${DOWNLOAD_DIR}" if [ ! -f "${DOWNLOAD_DIR}/${TARBALL_NAME}" ]; then echo "Error: Download failed — ${TARBALL_NAME} not found." 
>&2 diff --git a/tasks/scripts/vm/package-vm-runtime.sh b/tasks/scripts/vm/package-vm-runtime.sh index f97eec870..8b09c91ba 100755 --- a/tasks/scripts/vm/package-vm-runtime.sh +++ b/tasks/scripts/vm/package-vm-runtime.sh @@ -84,6 +84,13 @@ case "$PLATFORM" in versioned="$(ls "${PACKAGE_DIR}"/libkrunfw.so.5.* 2>/dev/null | head -n1 || true)" [ -n "$versioned" ] && cp "$versioned" "${PACKAGE_DIR}/libkrunfw.so.5" fi + # GPU passthrough binaries (optional — only included if present) + for gpu_bin in cloud-hypervisor vmlinux virtiofsd; do + if [ -f "${BUILD_DIR}/${gpu_bin}" ]; then + cp "${BUILD_DIR}/${gpu_bin}" "${PACKAGE_DIR}/" + echo " Included GPU passthrough binary: ${gpu_bin}" + fi + done ;; darwin-aarch64) cp "${BUILD_DIR}/libkrun.dylib" "${PACKAGE_DIR}/" diff --git a/tasks/scripts/vm/sync-vm-rootfs.sh b/tasks/scripts/vm/sync-vm-rootfs.sh index 727a9dd18..2c22e360b 100755 --- a/tasks/scripts/vm/sync-vm-rootfs.sh +++ b/tasks/scripts/vm/sync-vm-rootfs.sh @@ -141,6 +141,22 @@ fi patch_vm_helmchart "${MANIFEST_DST}/openshell-helmchart.yaml" patch_vm_helmchart "${ROOTFS_DIR}/var/lib/rancher/k3s/server/manifests/openshell-helmchart.yaml" +# ── GPU manifests ────────────────────────────────────────────────────── +# Only sync if the rootfs was built with --gpu (sentinel file present). +GPU_MANIFEST_SRC="${ROOT}/crates/openshell-vm/scripts/gpu-manifests" +GPU_MANIFEST_DST="${ROOTFS_DIR}/opt/openshell/gpu-manifests" +if [ -f "${ROOTFS_DIR}/opt/openshell/.rootfs-gpu" ] && [ -d "${GPU_MANIFEST_SRC}" ]; then + mkdir -p "${GPU_MANIFEST_DST}" + for manifest in "${GPU_MANIFEST_SRC}"/*.yaml; do + [ -f "$manifest" ] || continue + base=$(basename "$manifest") + if ! 
cmp -s "$manifest" "${GPU_MANIFEST_DST}/${base}" 2>/dev/null; then + cp "$manifest" "${GPU_MANIFEST_DST}/${base}" + echo " updated: /opt/openshell/gpu-manifests/${base}" + fi + done +fi + # ── Gateway image tarball ────────────────────────────────────────────── # The VM rootfs airgap-imports openshell/gateway:dev from k3s/agent/images/. # Keep that tarball in sync with the local Docker image so `mise run e2e:vm`