Skip to content

Commit 79b8821

Browse files
authored
refactor slurm get_cluster_info and add a test case (#917)
* refactor slurm get_cluster_info and add a test case
* add preceding , back to sinfo call
* actually fix sinfo call for gres
1 parent e1f2b3b commit 79b8821

File tree

5 files changed

+2389
-8
lines changed

5 files changed

+2389
-8
lines changed

lib/ood_core/job/adapters/slurm.rb

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -121,19 +121,28 @@ def initialize(cluster: nil, bin: nil, conf: nil, bin_overrides: {}, submit_host
121121
# Get a ClusterInfo object containing information about the given cluster
#
# Issues three `sinfo` calls, in this order:
#   1. `-aho %F/%C` for the aggregate node and CPU counts, parsed as eight
#      '/'-separated integers and labelled with {#sinfo_headers}.
#   2. `-o %G` to find the longest GRES string; that length (+2) is used as
#      the column width for the per-node listing.
#   3. `-ahNO nodehost,gres:<w>,gresused:<w>,statelong` for the per-node GRES
#      columns that the GPU totals are summed from.
# Nodes whose line mentions maint, drain or down are left out of the GPU counts.
# @return [ClusterInfo] object containing cluster details
def get_cluster_info
  node_cpu_info = call("sinfo", "-aho %F/%C").strip.split('/').map(&:to_i)

  # `max` is nil when sinfo prints no lines at all; `to_i` keeps the width
  # arithmetic from raising NoMethodError in that case.
  gres_length = call("sinfo", "-o %G").lines.map { |line| line.strip.length }.max.to_i + 2
  gres_lines = call("sinfo", "-ahNO", "nodehost,gres:#{gres_length},gresused:#{gres_length},statelong")
               .lines.uniq.reject { |line| line.match?(/maint|drain|down/i) }.map(&:split)

  # Name the positional sinfo fields instead of indexing them by magic number.
  node_info = sinfo_headers.zip(node_cpu_info).to_h

  # After `split`, each row is [nodehost, gres, gresused, statelong].
  ClusterInfo.new(
    active_nodes: node_info['nodes_allocated'],
    total_nodes: node_info['nodes_total'],
    active_processors: node_info['cpus_allocated'],
    total_processors: node_info['cpus_total'],
    active_gpus: gres_lines.sum { |line| Slurm.gpus_from_gres(line[2]) },
    total_gpus: gres_lines.sum { |line| Slurm.gpus_from_gres(line[1]) }
  )
end
136138

139+
# Labels for the eight integers parsed from `sinfo -aho %F/%C`, in output
# order: the four node counts followed by the four CPU counts. They are
# zipped against the parsed numbers in get_cluster_info to build a lookup hash.
# @return [Array<String>] field names, one per '/'-separated sinfo value
def sinfo_headers
  %w[
    nodes_allocated nodes_idle nodes_other nodes_total
    cpus_allocated cpus_idle cpus_other cpus_total
  ]
end
145+
137146
# Get a list of hashes detailing each of the jobs on the batch server
138147
# @example Status info for all jobs
139148
# my_batch.get_jobs
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
281/16/1/298/25608/11524/244/37376
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
GRES
2+
gpu:a100:3(S:2,4,7),nsight:no_consume:1,pfsdir:scratch:no_consume:1,gpfs:scratch:no_consume:1,gpfs:ess:no_consume:1,vis:no_consume:1,overlay:apps:no_consume:1,overlay:apps_src:no_consume:1,bind:apps:no_consume:1,bind:apps_src:no_consume:1
3+
gpu:a100:2(S:2,7),nsight:no_consume:1,pfsdir:scratch:no_consume:1,gpfs:scratch:no_consume:1,gpfs:ess:no_consume:1,vis:no_consume:1,overlay:apps:no_consume:1,overlay:apps_src:no_consume:1,bind:apps:no_consume:1,bind:apps_src:no_consume:1
4+
gpu:a100:4(S:0-3),nsight:no_consume:1,pfsdir:scratch:no_consume:1,gpfs:scratch:no_consume:1,gpfs:ess:no_consume:1,vis:no_consume:1,overlay:apps:no_consume:1,overlay:apps_src:no_consume:1,bind:apps:no_consume:1,bind:apps_src:no_consume:1
5+
pfsdir:scratch:no_consume:1,gpfs:scratch:no_consume:1,gpfs:ess:no_consume:1,overlay:apps:no_consume:1,overlay:apps_src:no_consume:1,bind:apps:no_consume:1,bind:apps_src:no_consume:1

0 commit comments

Comments
 (0)