| 17 Apr 2025 |
ereslibre | +1, let us know if you run into any issues when enabling CDI :) | 19:31:30 |
| 18 Apr 2025 |
connor (burnt/out) (UTC-8) | SomeoneSerge (UTC+U[-12,12]) I removed all the module system stuff from https://github.com/connorbaker/cuda-packages | 11:24:48 |
luke-skywalker | ereslibre: I got it to run with docker but still struggling to get it to run with containerd and k8s-device-plugin. | 20:46:45 |
ereslibre | In reply to @luke-skywalker:matrix.org ereslibre: I got it to run with docker but still struggling to get it to run with containerd and k8s-device-plugin. Interesting. If you feel like it, please open an issue and we can follow up. I did not try to run CDI with either of those. | 20:48:38 |
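A quick way to sanity-check the CDI side on the Docker host (a minimal sketch, assuming Docker >= 25 with its CDI feature enabled and the NVIDIA CDI spec generated by nvidia-container-toolkit; the CUDA image tag is only an example):

```
nvidia-ctk cdi list        # should list nvidia.com/gpu=all, nvidia.com/gpu=0, ...
docker run --rm --device nvidia.com/gpu=all \
  nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
```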
| 20 Apr 2025 |
SomeoneSerge (back on matrix) | Updated https://github.com/NVIDIA/build-system-archive-import-examples/issues/5 to reflect a preference for the .note.dlopen section over eager loading | 09:34:53 |
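For context, `.note.dlopen` is the FDO/systemd "dlopen metadata" ELF note: a JSON payload describing libraries a binary may dlopen() at runtime, rather than DT_NEEDED entries that force eager loading. A rough way to inspect it (the library path is a placeholder):

```
readelf --notes ./libexample.so              # listed note sections include ".note.dlopen" if present
objdump -s -j .note.dlopen ./libexample.so   # raw dump of the JSON payload
```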
| @techyporcupine:matrix.org left the room. | 18:15:53 |
| 21 Apr 2025 |
luke-skywalker | Redacted or Malformed Event | 13:54:54 |
SomeoneSerge (back on matrix) | @luke-skywalker:matrix.org: the moderation bot is configured to drop all media in nixos spaces because there was a spam campaign disseminating csam matrix-wide, it's an unfortunate situation but the mods don't really have any other tools at their disposal | 19:48:15 |
| 22 Apr 2025 |
| jaredmontoya joined the room. | 09:32:38 |
luke-skywalker | My suspicion at this point is that something is looking for a specific path resolution for either the NVIDIA driver/libraries or the container runtime. 🤷‍♂️
| 11:00:58 |
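One way to test that suspicion is to look at which host paths the toolkit actually resolves (a sketch, assuming the NixOS convention of driver userspace libraries under /run/opengl-driver and nvidia-ctk on the PATH):

```
ls -l /run/opengl-driver/lib/libnvidia-ml.so*              # driver userspace libraries on NixOS
ls /var/run/cdi /etc/cdi 2>/dev/null                       # where generated CDI specs usually live
nvidia-ctk cdi generate --output=/dev/stdout | head -n 40  # host paths the toolkit would inject
```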
luke-skywalker | ereslibre: where exactly should I open an issue? On the nixpkgs GitHub? If so, how do I indicate that it's about the nvidia-container-toolkit?
So far, the attached config got me:
1. the Docker (+ Compose) runtime working with CUDA workloads
2. containerd directly running CUDA containers
3. RKE2 running with a `config.toml` that points to all needed runtimes in the Nix store.
```nix
hardware.nvidia-container-toolkit = {
  enable = true;
  # package = pkgs.nvidia-container-toolkit;
  # Use UUID for device naming - better for multi-GPU setups
  device-name-strategy = "uuid"; # one of "index", "uuid", "type-index"
  # Mount additional directories for compatibility
  mount-nvidia-docker-1-directories = true;
  # Mount NVIDIA executables into container
  mount-nvidia-executables = true;
};
hardware.nvidia = {
  modesetting.enable = true;
  nvidiaPersistenced = true;
};
services.rke2 = {
  enable = true;
  role = "server";
  nodeName = "workstation-0";
  cni = "canal";
  # Set the node IP directly
  nodeIP = "${systemProfile.network.staticIP}";
  debug = true;
  # Set cluster CIDR ranges properly
  extraFlags = [
    "--kubelet-arg=cgroup-driver=systemd"
    "--cluster-cidr=10.42.0.0/16"
    "--service-cidr=10.43.0.0/16"
    "--disable-cloud-controller" # Disable cloud controller for bare metal
    # "--kubelet-arg=feature-gates=DevicePlugins=true" # Add this for device plugins
  ];
  disable = [ "traefik" ]; # "servicelb"
  # environmentVars = {
  #   NVIDIA_VISIBLE_DEVICES = "all";
  #   NVIDIA_DRIVER_CAPABILITIES = "all";
  #   # Set NVIDIA driver root to the standard location
  #   # NVIDIA_DRIVER_ROOT = "/usr/lib/nvidia";
  #   # Home directory for RKE2
  #   HOME = "/root";
  # };
};
```
/var/lib/rancher/rke2/agent/etc/containerd/config.toml:
```toml
# File generated by rke2. DO NOT EDIT. Use config.toml.tmpl instead.
version = 3
root = "/var/lib/rancher/rke2/agent/containerd"
state = "/run/k3s/containerd"
[grpc]
address = "/run/k3s/containerd/containerd.sock"
[plugins.'io.containerd.internal.v1.opt']
path = "/var/lib/rancher/rke2/agent/containerd"
[plugins.'io.containerd.grpc.v1.cri']
stream_server_address = "127.0.0.1"
stream_server_port = "10010"
[plugins.'io.containerd.cri.v1.runtime']
enable_selinux = false
enable_unprivileged_ports = true
enable_unprivileged_icmp = true
device_ownership_from_security_context = false
[plugins.'io.containerd.cri.v1.images']
snapshotter = "overlayfs"
disable_snapshot_annotations = true
[plugins.'io.containerd.cri.v1.images'.pinned_images]
sandbox = "index.docker.io/rancher/mirrored-pause:3.6"
[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v2"
[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.runc.options]
SystemdCgroup = true
[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.runhcs-wcow-process]
runtime_type = "io.containerd.runhcs.v1"
[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.'nvidia']
runtime_type = "io.containerd.runc.v2"
[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.'nvidia'.options]
BinaryName = "/var/lib/rancher/rke2/data/v1.31.7-rke2r1-7f85e977b85d/bin/nvidia-container-runtime"
SystemdCgroup = true
[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.'nvidia-cdi']
runtime_type = "io.containerd.runc.v2"
[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.'nvidia-cdi'.options]
BinaryName = "/var/lib/rancher/rke2/data/v1.31.7-rke2r1-7f85e977b85d/bin/nvidia-container-runtime.cdi"
SystemdCgroup = true
[plugins.'io.containerd.cri.v1.images'.registry]
config_path = "/var/lib/rancher/rke2/agent/etc/containerd/certs.d"
```

```
lsmod | grep nvidia
nvidia_drm 139264 81
nvidia_modeset 1830912 26 nvidia_drm
nvidia_uvm 3817472 2
nvidia 97120256 533 nvidia_uvm,nvidia_modeset
video 81920 2 asus_wmi,nvidia_modeset
drm_ttm_helper 20480 2 nvidia_drm
```
However, when trying to deploy the NVIDIA device plugin (via the RKE2 operator, as a plain DaemonSet, or as the Helm chart from the nvidia-device-plugin repo), it fails to detect the CUDA environment, for example by complaining about the auto strategy.
| 11:08:12 |
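The "auto strategy" complaint from the device plugin typically appears when the plugin container cannot load NVML, which on RKE2 often means the plugin pod itself is not running under the `nvidia` runtime defined in the config.toml above. One thing worth trying (a sketch, not verified here; the `nvdp` repo alias and the chart's `runtimeClassName` value follow the upstream k8s-device-plugin docs and are assumptions, not from this chat):

```
# RuntimeClass whose handler matches the 'nvidia' runtime name in config.toml
cat <<'EOF' | kubectl apply -f -
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
EOF

# deploy the device plugin so its own pods run under that runtime
helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
helm upgrade -i nvdp nvdp/nvidia-device-plugin \
  --namespace nvidia-device-plugin --create-namespace \
  --set runtimeClassName=nvidia
```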