-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathseffs
More file actions
executable file
·188 lines (165 loc) · 8.31 KB
/
seffs
File metadata and controls
executable file
·188 lines (165 loc) · 8.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/usr/bin/env elvish
#
# report seff-style memory and CPU efficiency for jobs listed by `sacct --json`;
# run with --help for the supported filter options
use flag
use math
use str
# Binary size units (powers of 1024), the same units ZFS uses.
var kibi = 1024
# mebi, gibi and tebi are 1024 raised to the 2nd, 3rd and 4th power
var mebi gibi tebi = (each {|exponent| math:pow $kibi $exponent } [2 3 4])
# Job states that this script reports as failures.
# The map is used as a set: only the keys matter (every value is $true), and
# membership is tested with has-key. sacct-selected-jobs selects these states
# when called with &fail=$true.
var failed-job-states = [
&BOOT_FAIL=$true # Job terminated due to launch failure, typically due to a hardware failure (e.g. unable to boot the node or block and the job can not be requeued).
&DEADLINE=$true # Job terminated on deadline.
&FAILED=$true # Job terminated with non-zero exit code or other failure condition.
&NODE_FAIL=$true # Job terminated due to failure of one or more allocated nodes.
&OUT_OF_MEMORY=$true # Job experienced out of memory error.
&PREEMPTED=$true # Job terminated due to preemption.
&REVOKED=$true # Sibling was removed from cluster due to other cluster starting the job.
&SUSPENDED=$true # Job has an allocation, but execution has been suspended and CPUs have been released for other jobs.
&TIMEOUT=$true # Job terminated upon reaching its time limit.
]
# Job states that this script reports as successful completions.
# Used as a set (keys only); sacct-selected-jobs selects these states unless
# it is called with &fail=$true.
var completed-job-states = [
&COMPLETED=$true # Job has terminated all processes on all nodes with an exit code of zero.
]
# Remaining job states, which are neither in the failed set nor in the
# completed set. Nothing in the visible script references this map, so jobs in
# these states are never selected for the report.
var other-job-states = [
&CANCELLED=$true # Job was explicitly cancelled by the user or system administrator. The job may or may not have been initiated.
&PENDING=$true # Job is awaiting resource allocation.
&RUNNING=$true # Job currently has an allocation.
&REQUEUED=$true # Job was requeued.
&RESIZING=$true # Job is about to change size.
]
# Run `sacct --json` with the given extra arguments and write its JSON to stdout.
#
# If sacct fails, a warning goes to stderr and an empty job list is written
# instead, so downstream JSON parsing still has a document to read.
fn sacct-json {|@args|
  # Force an ISO-style time format for the duration of the call, to work
  # around the bad SLURM_TIME_FORMAT currently on eRI:
  #   /etc/profile.d/nesi.sh:export SLURM_TIME_FORMAT="%b %d %H:%M"
  var iso-time-format = "%Y-%m-%dT%H:%M:%S"
  with E:SLURM_TIME_FORMAT = $iso-time-format {
    try {
      sacct --json $@args
    } catch err {
      # the sacct command may fail, so fall back to an empty result
      echo "warning: sacct failed, use --days and --end to narrow search scope" >&2
      echo '{ "jobs": [] }'
    }
  }
}
# Output the job records from sacct whose current state is in the selected set.
#
# days  - number of days to report on
# end   - how many days ago the reporting window ends (0 means today)
# &fail - when true select the failed states, otherwise the completed ones
#
# Each queried date is echoed to stderr as progress output.
fn sacct-selected-jobs {|days end &fail=$false|
  var wanted-states = $completed-job-states
  if $fail {
    set wanted-states = $failed-job-states
  }
  var is-wanted = {|job| has-key $wanted-states $job[state][current] }

  # Query one day at a time, because sacct really doesn't like big queries,
  # and with --json it seems to ignore all other filter parameters.
  var now = (num (date +%s))
  for day [(range $days 0)] {
    # day counts down from $days to 1, so the last iteration is $end days ago
    var days-back = (- (+ $day $end) 1)
    var day-date = (date +%Y-%m-%d -d @(- $now (* 86400 $days-back)))
    echo >&2 $day-date
    sacct-json --starttime $day-date --endtime $day-date"T23:59:59" | from-json | keep-if $is-wanted (one)[jobs]
  }
}
# Format a byte count for display, rounded to a whole number of K, M or G.
#
# n  - size in bytes
# &p - when true, output $n unchanged
#
# Sizes below one kibibyte are shown as "0"; everything from one gibibyte
# upwards is shown in G.
fn format-size {|n &p=$false|
  if $p {
    put $n
    return
  }
  if (< $n $kibi) {
    put "0" # so small we don't care
    return
  }
  # pick the output template and the matching divisor for the size range
  var template divisor = (
    if (< $n $mebi) {
      put "%dK" $kibi
    } elif (< $n $gibi) {
      put "%dM" $mebi
    } else {
      put "%dG" $gibi
    })
  put (printf $template (math:round (/ $n $divisor)))
}
# Format a duration in seconds as HH:MM:SS.
#
# n  - duration in seconds
# &p - when true, output $n unchanged
#
# The duration is rounded to whole seconds before it is split into fields, so
# a fractional value just below a minute or hour boundary carries over
# (59.5 becomes 00:01:00) instead of producing a seconds field of 60.
fn format-seconds {|n &p=$false|
  if $p {
    put $n
  } else {
    var total = (math:round $n)
    var hours = (math:floor (/ $total 3600))
    var rest = (- $total (* $hours 3600))
    var mins = (math:floor (/ $rest 60))
    var secs = (- $rest (* $mins 60))
    put (printf "%02d:%02d:%02d" $hours $mins $secs)
  }
}
# Format a Unix timestamp as 'YYYY-MM-DD HH:MM:SS' using the external date command.
#
# epoch-secs - seconds since the Unix epoch
# &p         - when true, output $epoch-secs unchanged
#
# The default for &p must be the boolean $false: the bareword `false` is a
# non-empty string, which Elvish treats as true, so it would disable the
# formatting whenever &p was left out.
fn format-timestamp {|epoch-secs &p=$false|
  if $p {
    put $epoch-secs
  } else {
    put (date +'%Y-%m-%d %H:%M:%S' -d @$epoch-secs)
  }
}
# Build the report row for a single job record decoded from `sacct --json`.
#
# job - one element of the "jobs" list
# &p  - when true, sizes, durations and the start time are left as raw values
#       instead of being formatted for display
#
# Outputs one map with the keys id, state, status, rc, user, name, comment,
# mem-used, vmem-used, mem-req, mem-eff, cpu-req, cpu-time, cpu-eff, start and
# elapsed. Jobs without any steps produce no output at all. Usage figures are
# read from the first step only.
fn sacct-job-format {|job &p=$false|
  # NOTE(review): multiplying by $mebi assumes sacct reports required memory in MiB - confirm
  var mem-req = (* $mebi $job[required][memory])
  var cpu-req = $job[required][CPUs]
  var exit-code = $job[exit_code]
  if (== 0 (count $job[steps])) {
    # do nothing
  } else {
    var step0 = $job[steps][0]
    var max-requested-tres = $step0[tres][requested][max]
    var elapsed = $step0[time][elapsed]
    var start = $step0[time][start]
    var cpu-time = $step0[time][total][seconds]
    # take the count of the first TRES entry of the wanted type, or 0 if there is none
    var mem-used = (put $max-requested-tres | keep-if {|x| ==s $x[type] mem} (all) | each {|tres| put $tres[count] } | put (all) (num 0) | take 1) # 0 is the default
    var vmem-used = (put $max-requested-tres | keep-if {|x| ==s $x[type] vmem} (all) | each {|tres| put $tres[count] } | put (all) (num 0) | take 1) # 0 is the default
    # var cpu-used = (put $max-requested-tres | keep-if {|x| ==s $x[type] cpu} (all) | each {|tres| put $tres[count] } | put (all) (num 0) | take 1) # 0 is the default
    # Efficiencies are whole percentages. Both are reported as 0 when the
    # denominator is zero, so a job with no memory requirement or no elapsed
    # CPU time cannot cause a division by zero.
    var mem-eff = (if (> $mem-req 0) { put (math:round (* 100 (/ $mem-used $mem-req))) } else { put 0 })
    var cpu-elapsed = (* $cpu-req $elapsed)
    var cpu-eff = (if (> $cpu-elapsed 0) { put (math:round (* 100 (/ $cpu-time $cpu-elapsed))) } else { put 0 })
    put [
      &id=$job[job_id]
      &state=$job[state][current]
      &status=$exit-code[status]
      &rc=$exit-code[return_code]
      &user=$job[user]
      &name=$job[name]
      &comment=$job[comment][job]
      &mem-used=(format-size $mem-used &p=$p)
      &vmem-used=(format-size $vmem-used &p=$p)
      &mem-req=(format-size $mem-req &p=$p)
      &mem-eff=$mem-eff
      &cpu-req=$cpu-req
      &cpu-time=(format-seconds $cpu-time &p=$p)
      &cpu-eff=$cpu-eff
      &start=(format-timestamp $start &p=$p)
      &elapsed=(format-seconds $elapsed &p=$p)
    ]
  }
}
# Output a closure that filters a stream of job records.
#
# &user    - keep jobs whose user contains this text ($nil keeps all)
# &name    - keep jobs whose name contains this text ($nil keeps all)
# &comment - keep jobs whose comment contains this text ($nil or "" keeps all)
#
# The returned closure reads jobs from its input and writes the ones that pass
# all three checks to its output.
fn filter-jobs {|&user=$nil &name=$nil &comment=$nil|
  var user-matches = {|job| or (eq $user $nil) (str:contains $job[user] $user) }
  var name-matches = {|job| or (eq $name $nil) (str:contains $job[name] $name) }
  var comment-matches = {|job|
    # a job without a comment is treated as having an empty one
    or (eq $comment $nil) (==s $comment "") (var job-comment = (or $job[comment][job] ""); str:contains $job-comment $comment)
  }
  put { keep-if $user-matches | keep-if $name-matches | keep-if $comment-matches }
}
# Serialise the values arriving on the input as one JSON document.
# A single input value is emitted as it is; any other number of values is
# emitted as a JSON list.
fn values-to-json {
  var collected = [(all)]
  var payload = (if (== 1 (count $collected)) { put $collected[0] } else { put $collected })
  put $payload | to-json
}
# Display JSON read from stdin using the external `nu` (Nushell) command.
#
# keys - optional column names; when given, the nu pipeline selects just those
#        columns (`select -i`), otherwise the parsed JSON is shown as is
fn jtv {|@keys|
  var nu_cmd = 'from json'
  if (> (count $keys) 0) {
    set nu_cmd = 'from json | select -i '(str:join ' ' $keys)
  }
  # run nu without config file, history, standard library or plugin config
  nu --no-config-file --no-history --no-std-lib --plugin-config /dev/null --stdin -c $nu_cmd
}
# Display the values arriving on the input as a table, optionally limited to
# the given columns: the values are converted to JSON and handed to jtv.
fn tv {|@keys|
  values-to-json | jtv $@keys
}
# Entry point: turn the parsed command-line options into a job report.
#
# raw-opts - list of parsed options as produced by flag:parse-getopt; each
#            element is read through [spec][long] (long option name) and
#            [arg] (option argument)
#
# With --help the usage text is printed. Otherwise jobs are fetched with
# sacct-selected-jobs, narrowed with filter-jobs, converted to report rows with
# sacct-job-format and shown as a table with the columns chosen below.
fn main {|raw-opts|
  # map from long option name to its argument, for the options actually given
  var opts = (each {|opt| put [$opt[spec][long] $opt[arg]] } $raw-opts | make-map)
  var help = (has-key $opts help)
  var all = (has-key $opts all)
  var fail = (has-key $opts fail)
  var p = (has-key $opts parsable)
  var show-comment = (has-key $opts comment)
  # --all disables the user filter; otherwise --user wins over the current user
  var user = (if $all { put $nil } elif (has-key $opts user) { put $opts[user] } else { put $E:USER })
  var days = (if (has-key $opts days) { put (num $opts[days]) } else { put (num 1) })
  var end = (if (has-key $opts end) { put (num $opts[end]) } else { put (num 0) })
  var name = (if (has-key $opts name) { put $opts[name] } else { put $nil })
  var comment = (if $show-comment { put $opts[comment] } else { put $nil })
  if $help {
    echo 'usage: seffs <options>
-h | --help - show this message
-a | --all - show jobs for all users, otherwise just current user
-u | --user <username> - show jobs for specified user
-n | --name <job-name> - show jobs whose name contains the specified text, otherwise all
-c | --comment <comment-text> - show jobs whose comment contains the specified text; -c "" to show all comments
-d | --days <n> - show jobs for last <n> days, otherwise just today
--end <n> - set job filter endtime for <n> days ago, with --days counting back from this
-f | --fail - show failed jobs, otherwise just successfully completed ones
-p | --parsable - show sizes in bytes and times in seconds
'
  } else {
    # optional columns: state details only for failed jobs, the user only when
    # listing all users, efficiencies only for completed jobs, and the comment
    # only when a comment filter was given
    var state-field = (if $fail { put [state status rc] } else { put [] })
    var user-field = (if $all { put [user] } else { put [] })
    var mem-eff-field = (if $fail { put [] } else { put [mem-eff] })
    var cpu-eff-field = (if $fail { put [] } else { put [cpu-eff] })
    var comment-field = (if $show-comment { put [comment] } else { put [] })
    var display-fields = [id $@state-field $@user-field name $@comment-field start elapsed mem-req mem-used vmem-used $@mem-eff-field cpu-req cpu-time $@cpu-eff-field]
    sacct-selected-jobs $days $end &fail=$fail |
      (filter-jobs &user=$user &name=$name &comment=$comment) |
      each {|job| sacct-job-format $job &p=$p } |
      put [(all)] |
      tv $@display-fields
  }
}
# Command-line options understood by the script, in flag:parse-getopt form.
var option-specs = [
  [&short=h &long=help]
  [&short=u &long=user &arg-required=$true]
  [&short=a &long=all]
  [&short=f &long=fail]
  [&short=n &long=name &arg-required=$true]
  [&short=c &long=comment &arg-required=$true]
  [&short=d &long=days &arg-required=$true]
  [&long=end &arg-required=$true]
  [&short=p &long=parsable]
]

# Parse the script arguments; the leftover non-option arguments are captured in
# $args but not used afterwards. Then hand the parsed options to main.
var raw-opts args = (flag:parse-getopt $args $option-specs)
main $raw-opts