From 078ef34a87993c0b277d4bbd7050f0d609c01919 Mon Sep 17 00:00:00 2001
From: Michail Resvanis
Date: Mon, 7 Jul 2025 12:16:35 +0200
Subject: [PATCH] Add support for fabric manager shared-nvswitch mode

The changes include:

- add the `FABRIC_MANAGER_FABRIC_MODE` env var that configures FM with
  either full-passthrough (0) or shared-nvswitch (1) fabric mode. It
  defaults to 0.
- build the `fmpm` fabric manager client and install it to /usr/bin in
  the driver image.
- when fabric manager mode is set to 0 no changes to the flow, i.e.
  execute the fabric manager daemon with its default configuration.
- when fabric manager mode is set to 1:
  - edit the fabric manager configuration file and set `FABRIC_MODE=1`.
  - configure fabric manager to use a UNIX socket
    (/run/nvidia-fabricmanager/fmpm.sock) instead of TCP.
  - persist mapping of physical GPU module IDs to their PCIe address by
    creating a JSON file on disk (the physical GPU module IDs are
    available through nvidia-smi).
  - disable `nvidia-persistenced`, as the GPU devices should be unbound
    from the NVIDIA driver and bound to vfio-pci (a step executed by the
    vfio-manager).

Signed-off-by: Michail Resvanis
---
 rhel9/Dockerfile    |  18 ++++++-
 rhel9/nvidia-driver | 128 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 124 insertions(+), 22 deletions(-)

diff --git a/rhel9/Dockerfile b/rhel9/Dockerfile
index ad21b07d..91262751 100644
--- a/rhel9/Dockerfile
+++ b/rhel9/Dockerfile
@@ -53,6 +53,10 @@ ENV DISABLE_VGPU_VERSION_CHECK=$DISABLE_VGPU_VERSION_CHECK
 # Avoid dependency of container-toolkit for driver container
 ENV NVIDIA_VISIBLE_DEVICES=void
 
+# Fabric manager fabric mode, default is 0 (full-passthrough)
+ARG FABRIC_MANAGER_FABRIC_MODE=0
+ENV FABRIC_MANAGER_FABRIC_MODE=$FABRIC_MANAGER_FABRIC_MODE
+
 ADD install.sh /tmp/
 
 RUN NVIDIA_GPGKEY_SUM=d0664fbbdb8c32356d45de36c5984617217b2d0bef41b93ccecd326ba3b80c87 && \
@@ -74,7 +78,19 @@ RUN if [ "$DRIVER_TYPE" != "vgpu" ]; then \
     cd drivers && \
     DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64} && \
     curl -fSsl -O $BASE_URL/$DRIVER_VERSION/NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run && \
-    chmod +x NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run; fi
+    chmod +x NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run && \
+    versionArray=(${DRIVER_VERSION//./ }) && \
+    DRIVER_BRANCH=${versionArray[0]} && \
+    dnf install git -y && \
+    dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
+    dnf module enable -y nvidia-driver:${DRIVER_BRANCH}-dkms && \
+    dnf install -y nvidia-fabric-manager-${DRIVER_VERSION}-1 nvidia-fabric-manager-devel-${DRIVER_VERSION}-1 libnvidia-nscq-${DRIVER_BRANCH}-${DRIVER_VERSION}-1 jsoncpp-devel gcc-c++ make && \
+    git clone https://github.com/mresvanis/Fabric-Manager-Client.git && \
+    cd Fabric-Manager-Client && \
+    git checkout fix-ignoring-unix-socket && \
+    make fmpm && \
+    cp fmpm /usr/bin/ && \
+    chmod +x /usr/bin/fmpm; fi
 
 # Fetch the installer, fabricmanager, libnvidia-nscq, libnvsdm, imex packages
 RUN sh /tmp/install.sh extrapkgsinstall
diff --git a/rhel9/nvidia-driver b/rhel9/nvidia-driver
index 2ac431b3..8d4fe36a 100755
--- a/rhel9/nvidia-driver
+++ b/rhel9/nvidia-driver
@@ -20,6 +20,7 @@ RHEL_MAJOR_VERSION=9
 RHEL_MINOR_VERSION=${RHEL_MINOR_VERSION:-""}
 KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
 MODPROBE_CONFIG_DIR="/etc/modprobe.d"
+FABRIC_MANAGER_FABRIC_MODE=${FABRIC_MANAGER_FABRIC_MODE:-0}
 
 DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64}
 echo "DRIVER_ARCH is $DRIVER_ARCH"
@@ -305,6 +306,99 @@ _ensure_nvlink5_prerequisites() (
     done
 )
 
+# Rewrite the fabric manager configuration file for shared-nvswitch mode.
+# Arguments: $1 - fabric manager configuration file, edited in place
+#            $2 - UNIX socket path written to the *UNIX_SOCKET_PATH keys
+# Does nothing unless FABRIC_MANAGER_FABRIC_MODE is "1".
+_configure_fabric_manager_config() {
+    local fm_config_file="$1"
+    local fmpm_socket_path="$2"
+
+    if [ "${FABRIC_MANAGER_FABRIC_MODE}" = "1" ]; then
+        echo "Updating NVIDIA fabric manager configuration to fabric mode ${FABRIC_MANAGER_FABRIC_MODE}..."
+        sed -i "s/^FABRIC_MODE=.*/FABRIC_MODE=${FABRIC_MANAGER_FABRIC_MODE}/g" "$fm_config_file"
+
+        echo "Updating NVIDIA fabric manager configuration to use a UNIX socket instead of TCP: ${fmpm_socket_path}"
+        sed -i "s|^UNIX_SOCKET_PATH=.*|UNIX_SOCKET_PATH=${fmpm_socket_path}|g" "$fm_config_file"
+        sed -i "s|^FM_CMD_UNIX_SOCKET_PATH=.*|FM_CMD_UNIX_SOCKET_PATH=${fmpm_socket_path}|g" "$fm_config_file"
+    fi
+}
+
+# Configure and start the fabric manager daemon when NVSwitches are detected.
+# Arguments: $1 - UNIX socket path handed to _configure_fabric_manager_config
+#                 (falls back to /run/nvidia-fabricmanager/fmpm.sock when empty)
+# Returns:   1 if _ensure_nvlink5_prerequisites fails on an NVLink5+ system
+_setup_fabric_manager() {
+    local fmpm_socket_path="${1:-/run/nvidia-fabricmanager/fmpm.sock}"
+    local fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
+
+    if _assert_nvlink5_system; then
+        _ensure_nvlink5_prerequisites || return 1
+
+        _configure_fabric_manager_config "${fm_config_file}" "${fmpm_socket_path}"
+
+        local fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
+        local nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
+        local nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
+
+        echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
+
+        /usr/bin/nvidia-fabricmanager-start.sh --mode start \
+            --fm-config-file "$fm_config_file" \
+            --fm-pid-file "$fm_pid_file" \
+            --nvlsm-config-file "$nvlsm_config_file" \
+            --nvlsm-pid-file "$nvlsm_pid_file"
+
+    # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
+    elif _assert_nvswitch_system; then
+        _configure_fabric_manager_config "${fm_config_file}" "${fmpm_socket_path}"
+
+        echo "Starting NVIDIA fabric manager daemon..."
+        nv-fabricmanager -c "$fm_config_file"
+    fi
+}
+
+# Capture GPU PCI address to physical module ID mapping and persist to JSON file.
+# Reads the "Module Id" / "Bus Id" lines of `nvidia-smi -q` and writes a
+# {"<pci address>": "<module id>", ...} object to
+# /run/nvidia-fabricmanager/gpu-pci-module-mapping.json.
+_capture_gpu_mapping() {
+    local gpu_mapping json_entries module_id pci_id line
+
+    echo "Capturing GPU PCI to Module ID mapping..."
+    if command -v nvidia-smi >/dev/null 2>&1; then
+        # A no-match grep exits 1; '|| true' keeps that from aborting the script
+        # when errexit is enabled, so the warning branch below stays reachable.
+        gpu_mapping=$(nvidia-smi -q | grep -E "(Module|Bus).*Id" || true)
+        if [ -n "$gpu_mapping" ]; then
+            echo "$gpu_mapping"
+            # Parse and convert to JSON format
+            json_entries=""
+            module_id=""
+            while IFS= read -r line; do
+                if [[ "$line" =~ Module\ Id.*:\ ([0-9]+) ]]; then
+                    module_id="${BASH_REMATCH[1]}"
+                elif [[ "$line" =~ Bus\ Id.*:\ ([0-9A-Fa-f:\.]+) ]] && [ -n "$module_id" ]; then
+                    pci_id="${BASH_REMATCH[1]}"
+                    if [ -n "$json_entries" ]; then
+                        json_entries="${json_entries},"
+                    fi
+                    json_entries="${json_entries}\"${pci_id}\": \"${module_id}\""
+                    module_id=""
+                fi
+            done <<< "$gpu_mapping"
+
+            mkdir -p /run/nvidia-fabricmanager
+            echo "{${json_entries}}" > /run/nvidia-fabricmanager/gpu-pci-module-mapping.json
+            echo "GPU mapping saved to /run/nvidia-fabricmanager/gpu-pci-module-mapping.json"
+        else
+            echo "Warning: Could not retrieve GPU PCI to Module ID mapping"
+        fi
+    else
+        echo "Warning: nvidia-smi not available for GPU mapping"
+    fi
+}
+
 # For each kernel module configuration file mounted into the container,
 # parse the file contents and extract the custom module parameters that
 # are to be passed as input to 'modprobe'.
@@ -380,6 +474,7 @@ _load_driver() {
     local nv_fw_search_path="$RUN_DIR/driver/lib/firmware"
     local set_fw_path="true"
     local fw_path_config_file="/sys/module/firmware_class/parameters/path"
+    local fmpm_socket_path="/run/nvidia-fabricmanager/fmpm.sock"
     for param in "${NVIDIA_MODULE_PARAMS[@]}"; do
         if [[ "$param" == "NVreg_EnableGpuFirmware=0" ]]; then
             set_fw_path="false"
@@ -696,8 +791,12 @@ _start_vgpu_topology_daemon() {
 }
 
 _start_daemons() {
-    echo "Starting NVIDIA persistence daemon..."
-    nvidia-persistenced --persistence-mode
+    if [ "${FABRIC_MANAGER_FABRIC_MODE}" = "1" ]; then
+        echo "Skipping NVIDIA persistence daemon..."
+    else
+        echo "Starting NVIDIA persistence daemon..."
+        nvidia-persistenced --persistence-mode
+    fi
 
     if [ "${DRIVER_TYPE}" = "vgpu" ]; then
         echo "Copying gridd.conf..."
@@ -715,25 +814,7 @@ _start_daemons() {
         _start_vgpu_topology_daemon
     fi
 
-    if _assert_nvlink5_system; then
-        _ensure_nvlink5_prerequisites || return 1
-        echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
-
-        fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
-        fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
-        nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
-        nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
-        /usr/bin/nvidia-fabricmanager-start.sh --mode start \
-            --fm-config-file $fm_config_file \
-            --fm-pid-file $fm_pid_file \
-            --nvlsm-config-file $nvlsm_config_file \
-            --nvlsm-pid-file $nvlsm_pid_file
-
-    # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
-    elif _assert_nvswitch_system; then
-        echo "Starting NVIDIA fabric manager daemon..."
-        nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
-    fi
+    _setup_fabric_manager "${fmpm_socket_path:-}"
 }
 
 _store_driver_digest() {
@@ -817,6 +898,11 @@ _build() {
 
 _load() {
     _load_driver
+
+    if [ "${FABRIC_MANAGER_FABRIC_MODE}" = "1" ]; then
+        _capture_gpu_mapping
+    fi
+
     _mount_rootfs
     _write_kernel_update_hook
     _store_driver_digest