Skip to content

Commit

Permalink
Replace nvidia-persistenced service with parallelcluster_nvidia servi…
Browse files Browse the repository at this point in the history
…ce to avoid conflicts with DLAMI

The parallelcluster_nvidia service ensures the creation of the block device /dev/nvidia0 and is needed by the slurmd service.

parallelcluster_nvidia starts nvidia-persistenced, or runs nvidia-smi if nvidia-persistenced is already running, to avoid a race condition with other services.

Signed-off-by: Francesco Giordano <giordafr@amazon.it>
  • Loading branch information
francesco-giordano committed Jun 29, 2023
1 parent a5d6e8d commit 363d9f0
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 14 deletions.
25 changes: 17 additions & 8 deletions cookbooks/aws-parallelcluster-config/recipes/nvidia.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,23 @@
group 'root'
mode '0644'
end
# Install nvidia_persistenced. See https://download.nvidia.com/XFree86/Linux-x86_64/396.51/README/nvidia-persistenced.html
bash 'Install nvidia_persistenced' do
cwd '/usr/share/doc/NVIDIA_GLX-1.0/samples'
user 'root'

# Install ParallelCluster nvidia service.
# The service ensures the creation of the block device /dev/nvidia0 after reboot, and it is needed by the slurmd service:
# cookbooks/aws-parallelcluster-slurm/templates/default/slurm/compute/slurmd_nvidia_persistenced.conf.erb
#
# The service starts nvidia-persistenced, or runs nvidia-smi if nvidia-persistenced is already running, to avoid a race condition with other services
template '/etc/systemd/system/parallelcluster_nvidia.service' do
source 'nvidia/parallelcluster_nvidia_service.erb'
owner 'root'
group 'root'
code <<-NVIDIA
tar -xf nvidia-persistenced-init.tar.bz2
./nvidia-persistenced-init/install.sh
NVIDIA
mode '0644'
action :create
variables(is_nvidia_persistenced_running: is_process_running('nvidia-persistenced'))
end

# Enable the parallelcluster_nvidia unit and start it.
# `supports restart: false` declares to Chef that this service has no restart capability.
service 'parallelcluster_nvidia' do
  supports(restart: false)
  action [:enable, :start]
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# This systemd service file is designed to trigger the creation of the device block file /dev/nvidia0.
# The service starts nvidia-persistenced if it was not already running when this file was rendered;
# otherwise it executes the command nvidia-smi once.

[Unit]
Description=ParallelCluster NVIDIA Daemon
Wants=syslog.target

[Service]
<% if @is_nvidia_persistenced_running -%>
# nvidia-smi runs to completion, so the unit is a oneshot: systemd waits for the command to exit
# before starting units ordered After= this one, and RemainAfterExit keeps the unit active afterwards.
Type=oneshot
ExecStart=/usr/bin/nvidia-smi
RemainAfterExit=yes
<% else -%>
Type=forking
ExecStart=/usr/bin/nvidia-persistenced --user root
ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced
<% end -%>

[Install]
WantedBy=multi-user.target
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
action :create
end

# Add systemd dependency between slurmd and nvidia-persistenced for NVIDIA GPU nodes
# Add systemd dependency between slurmd and parallelcluster_nvidia for NVIDIA GPU nodes
if graphic_instance? && nvidia_installed?
directory '/etc/systemd/system/slurmd.service.d' do
user 'root'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[Unit]
After=nvidia-persistenced.service
Wants=nvidia-persistenced.service
After=parallelcluster_nvidia.service
Wants=parallelcluster_nvidia.service
10 changes: 10 additions & 0 deletions libraries/helpers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,16 @@ def ignore_failure(lookup)
end
end

#
# Check if a process is running
#
# Lists the process table with `ps aux` and looks for a line that contains
# +process_name+. The name is matched literally (no shell or regex
# interpretation), and lines that are themselves `grep ... <name>` invocations
# are ignored.
#
# @param process_name [String] text to look for in the `ps aux` output
# @return [Boolean] true when at least one matching line is found
#
def is_process_running(process_name)
  # Run ps on its own: the name is never interpolated into a shell command,
  # so quotes or metacharacters in it cannot alter what gets executed.
  ps = Mixlib::ShellOut.new('ps aux')
  ps.run_command

  name = process_name.to_s
  grep_invocation = /grep .*#{Regexp.escape(name)}/

  ps.stdout.each_line.any? do |line|
    line.include?(name) && !line.match?(grep_invocation)
  end
end

#
# Check if the instance has a GPU
#
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
its('content') { should include("uvm") }
end

describe service('nvidia-persistenced') do
describe service('parallelcluster_nvidia') do
it { should be_enabled }
it { should be_running }
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@

describe 'Check slurmd systemd "after" dependencies'
describe command('systemctl list-dependencies --after --plain slurmd.service') do
its('stdout') { should include "nvidia-persistenced.service" }
its('stdout') { should include "parallelcluster_nvidia.service" }
end
describe 'Check slurmd systemd requirement dependencies'
describe command('systemctl list-dependencies --plain slurmd.service') do
its('stdout') { should include "nvidia-persistenced.service" }
its('stdout') { should include "parallelcluster_nvidia.service" }
end
end

0 comments on commit 363d9f0

Please sign in to comment.