#!/usr/bin/env bash
# Do "ps aux" on a set of nodes belonging to a single job, but exclude system processes.
export usage="$0 [-c columns | -h] jobid"
# Author: [email protected]
# Homepage: https://github.com/OleHolmNielsen/Slurm_tools/
# Requires ClusterShell with configuration for parallel commands on Slurm jobs,
# see https://wiki.fysik.dtu.dk/niflheim/SLURM#clustershell
# and https://clustershell.readthedocs.io/en/latest/intro.html
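#
# Example invocations (the jobids shown are placeholders for illustration):
#   psjob 123456          Show processes of running job 123456
#   psjob -c 200 123456   Same, but with a 200-column wide ps output
#   psjob 123456_7        Show processes of task 7 of job array 123456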
# Default width of the ps output in columns:
export columns=100
# Set enable_gpustat=1 if the gpustat command has been installed on the GPU nodes (0 otherwise)
enable_gpustat=1
# Parse command options
while getopts "c:h" options; do
    case $options in
        c ) export columns=$OPTARG
            if ! [[ $columns =~ ^[0-9]+$ ]]
            then
                echo "ERROR: Number of columns ($columns) must be a positive integer"
                exit 1
            fi
            if [[ $columns -lt 5 ]] || [[ $columns -gt 1024 ]]
            then
                echo "ERROR: Unreasonable number of columns: $columns (must be between 5 and 1024)"
                exit 1
            fi
            ;;
        h | * ) echo "Usage: $usage"
            exit 1;;
    esac
done
shift $((OPTIND-1))
CLUSH="/usr/bin/clush"
PS="/bin/ps"
PSFLAGS="-o pid,nlwp,state,user,start,cputime,%cpu,rssize,command --columns $columns"
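# For reference, these ps flags produce a header like:
#   PID NLWP S USER     STARTED     TIME %CPU   RSS COMMAND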
if test $# -ne 1
then
    echo "ERROR: No jobid given"
    echo "Usage: $usage"
    exit 1
fi
# Check that jobid is a number (possibly with _ separating jobid and arraytaskid)
if ! [[ ${1} =~ ^[0-9_]+$ ]]
then
    echo "ERROR: <jobid> must be a number or jobid_arraytaskid"
    echo "Usage: $usage"
    exit 1
fi
jobid=$1
# Check if this jobid can be inquired successfully.
JOBSTATE="`squeue -h -O State:. -j $jobid`"
if test "$?" != "0"
then
echo Error inquiring about job $jobid
exit 1
fi
# Detect job arrays by counting the number of words in JOBSTATE
words=( $JOBSTATE )
if [[ ${#words[@]} -gt 1 ]]
then
    echo "ERROR: The job $jobid is a job array with multiple jobs. Please select only one of the array jobs:"
    squeue -O JobArrayID,ArrayJobID,ArrayTaskID,JobID,StartTime,TimeUsed,TimeLimit -j $jobid
    exit 1
fi
if test "$JOBSTATE" != "RUNNING"
then
echo The job $jobid is not running, it has state=$JOBSTATE
exit 1
fi
# For an array job, get the individual jobid corresponding to the array job:
realjobid=`squeue -h -O JobID: -j $jobid`
# Print some job information
if test "$realjobid" = "$jobid"
then
arrayprint=""
else
# For array jobs
arrayprint=",ArrayJobID:13,ArrayTaskID:14"
fi
squeue -O JobID:10,Partition:14,NumNodes:6,NumTasks:6,UserName:10$arrayprint,StartTime,TimeUsed:14,TimeLimit:14,tres-alloc: -j $jobid
# Print the NodeList
NODELIST="`squeue -h -O NodeList: -j $jobid`"
echo NODELIST: $NODELIST
bar="===================================================="
echo "$bar"
echo "Process list from 'ps' on each node in the job:"
# Execute parallel shell on the job nodes:
# The "scontrol listpids" lists all processes belonging to a Slurm job $realjobid
# Count also the number of processes (numprocs) and threads (numthreads in $2=nlwp) when $1 is a number
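# For illustration, "scontrol listpids <jobid>" prints a header plus one row
# per PID, e.g. (values made up):
#   PID      JOBID    STEPID   LOCALID GLOBALID
#   12345    1000001  batch    0       0
# so the filter int($1)>0 keeps the PID rows and drops the header.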
$CLUSH -bw $NODELIST "$PS $PSFLAGS \$(scontrol listpids $realjobid | awk 'int(\$1)>0 {print \$1}') | grep -v ' root ' | awk '{print \$0; if(\$1~/^[0-9]+\$/) {numprocs++; numthreads+=\$2}} END {printf(\"Total: %d processes and %d threads\n\", numprocs, numthreads)}'; echo -n Uptime:; uptime"
# Check for job nodes with GPU gres
gpunodes=`mktemp`
sinfo -h -N -n $NODELIST -O NodeHost,Gres | uniq | grep -i GPU > $gpunodes
if [[ -s $gpunodes ]]
then
    echo "$bar"
    echo "Nodes in this job with GPU Generic Resources (Gres):"
    cat $gpunodes
fi
rm -f $gpunodes
# Check for GPU jobs
# Using gpustat tool from https://github.com/wookayin/gpustat
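# gpustat flags used below: -u (show user), -p (show PID), -c (show command name)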
if [[ $enable_gpustat -gt 0 && "`squeue -h -O tres-alloc: -j $jobid`" =~ "gres/gpu" ]]
then
    echo
    echo "Running GPU tasks:"
    echo "Node: ID GPU-type | Temp GPU% | Mem / Tot | user:process/PID(Mem)"
    # color="--color"
    GPUNAME_WIDTH="--gpuname-width 24"
    color=""
    # The "/bin/true" catches the case of grep returning an error code
    # Sort the output on node names using "sort -k 1"
    $CLUSH -w $NODELIST "gpustat -upc $GPUNAME_WIDTH $color | grep \$(scontrol listpids $realjobid | awk 'int(\$1)>0 {print \"-e \" \$1}') || /bin/true" | sort -k 1
fi
# If using JobContainerType=job_container/tmpfs then print also disk space usage
# See also https://support.schedmd.com/show_bug.cgi?id=11183
# Viewing mounts on a node from outside the job: findmnt -l -o target,source,fstype,propagation
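# For illustration, "scontrol show config" prints a line like:
#   JobContainerType        = job_container/tmpfs
# so awk's $3 is the configured value.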
JobContainerType=`scontrol show config | grep JobContainerType | awk '{print $3}'`
if [[ "$JobContainerType" == "job_container/tmpfs" ]]
then
echo "$bar"
# echo "Slurm job_container temporary scratch spaces:"
# grep -i '^BasePath' /etc/slurm/job_container.conf
basepath=`grep -i '^BasePath' /etc/slurm/job_container.conf | awk '{split($1,a,"="); print a[2]}'`
echo "Scratch disk usage for JobID $realjobid:"
echo "Node: Usage Scratch folder"
# Sort output on nodenames using "sort -k 1"
$CLUSH -w $NODELIST "findmnt -l -o target -t xfs | grep $realjobid | xargs du -sh" | sort -k 1
echo
echo "Scratch disks on JobID $realjobid compute nodes:"
echo "Node: Size Used Avail Use% Mounted on"
# Sort output on nodenames (key 1)
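    # Note: clush prefixes each output line with "nodename:", so in the awk
    # below $1 is the node name and the df fields proper start at $2.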
$CLUSH -w $NODELIST "df -Ph $basepath | tail -n+2" | awk '{printf("%-10s %6s %6s %6s %6s %-20s\n", $1, $3, $4, $5, $6, $7)}' | sort -k 1
fi
echo "$bar"
# $CLUSH -bw $NODELIST "$PS $PSFLAGS \$(scontrol listpids $realjobid | tail -n+3 | awk '{print \$1}') | grep -v ' root '"
# You could also use: $CLUSH -bw@sj:$realjobid ...