-
Notifications
You must be signed in to change notification settings - Fork 98
/
Copy pathsratelimit
executable file
·92 lines (86 loc) · 2.66 KB
/
sratelimit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env bash
# Summary of "RPC rate limit exceeded" lines from slurmctld.log looking like:
# [2023-10-04T16:53:13.995] RPC rate limit exceeded by uid 208213 with REQUEST_JOB_STEP_CREATE, telling to back off
# The "rl_enable" feature was introduced in Slurm 23.02.
# Author: [email protected]
# Homepage: https://github.com/OleHolmNielsen/Slurm_tools/
# The slurmctld log file: the -f option selects a different log file
# (gzip-compressed .gz files are accepted as well).
LOG=/var/log/slurm/slurmctld.log
# verbose=1 (set by -v) asks for the detailed per-user report.
verbose=0

# Print the command synopsis on stdout; callers redirect it for error cases.
usage() {
  echo "Usage:"
  echo "$0 [-f slurmctld-logfile] [-v]"
}

# Parse command options:
#   -f FILE  read FILE instead of the default log (must be a non-empty regular file)
#   -v       verbose report
#   -h       print usage and exit successfully
# Any other option prints usage on stderr and exits with status 1.
while getopts "f:vh" options; do
  case "$options" in
    f)
      LOG=$OPTARG
      # -s alone also accepts directories, so test for a regular file first
      # to make the check agree with the message below.
      if [[ ! -f "$LOG" || ! -s "$LOG" ]]; then
        echo "Slurmctld log file $LOG is not a regular file or is empty" >&2
        exit 1
      fi
      ;;
    v)
      verbose=1
      ;;
    h)
      # An explicit help request is not an error.
      usage
      exit 0
      ;;
    *)
      # Unknown option or missing argument: getopts has already complained.
      usage >&2
      exit 1
      ;;
  esac
done
# Drop the parsed options so that "$@" holds only the remaining arguments.
shift $((OPTIND-1))
# Check for rl_enable in the running Slurm configuration.
# The configuration is captured first so that a failing scontrol (command
# missing, slurmctld not responding) is reported as such instead of being
# mistaken for "rl_enable is not configured".
if ! slurm_config=$(scontrol show config); then
  echo "ERROR: Unable to read the Slurm configuration with: scontrol show config" >&2
  exit 1
fi
if ! grep -q rl_enable <<< "$slurm_config"; then
  # NOTE: rl_enable is only available from Slurm 23.02
  echo "ERROR: The RPC rate limit parameter rl_enable is not configured in slurm.conf" >&2
  exit 1
fi
# Summarize the "RPC rate limit exceeded" lines in $LOG per UID.
# Uses the shell variables LOG (log file path) and verbose (0 or 1).
# The sorted traversal below relies on gawk (PROCINFO["sorted_in"]).
if [[ -r "$LOG" ]]; then
  # Use zgrep so that .gz files can be read as well.
  # $LOG is quoted everywhere so that paths containing whitespace work.
  zgrep "RPC rate limit exceeded by" "$LOG" | awk -v LOG="$LOG" -v verbose="$verbose" '
  {
    # Fields: $1 = [timestamp], $8 = UID following the words "by uid".
    # The client IP-address is added in 23.11 as $12, see
    # https://bugs.schedmd.com/show_bug.cgi?id=17988 (not used in this summary).
    u = $8
    # Only count lines whose UID field is purely numeric: the value is pasted
    # into a shell command in the END block, so anything else is skipped.
    if (u !~ /^[0-9]+$/) next
    # First and most recent RPC rate limit timestamps
    if (timestamp0 == "") timestamp0 = $1
    timestamp = $1
    line[u] = $0        # Last matching line seen for this UID
    nlines[u]++         # Number of matching lines for this UID
  } END {
    # Sort arrays by element values (largest line count first):
    # https://www.gnu.org/software/gawk/manual/html_node/Controlling-Scanning.html
    PROCINFO["sorted_in"] = "@val_num_desc"
    if (length(nlines) > 0) {
      printf("RPC rate limit exceeded lines:\n")
      for (u in nlines) {
        # Get the username of UID=u from the passwd database.
        # It stays empty when the UID has no passwd entry.
        username = ""
        query = "getent passwd " u
        # Compare with > 0 so that a getline error (-1) ends the loop
        while ((query | getline entry) > 0) {
          split(entry, pw, ":")   # Split password line into fields
          username = pw[1]
        }
        close(query)
        if (verbose > 0) {
          printf("User %s (uid %u) has %u \"RPC rate limit exceeded\" lines. The last line is:\n",
            username, u, nlines[u])
          printf(" %s\n", line[u])
          printf("Print all lines by: zgrep \"RPC rate limit exceeded by uid %u\" %s\n", u, LOG)
          printf("\n")
        } else {
          printf("User %8s (uid %10u) lines: %u\n",
            username, u, nlines[u])
        }
      }
      printf("First and last RPC rate limit timestamps were %s and %s\n", timestamp0, timestamp)
    } else {
      print "NOTICE: No RPC rate limit exceeded lines were found in the log file"
    }
  }'
else
  echo "Slurmctld log file $LOG is unreadable" >&2
fi
# Show the log file details in all cases; for an unreadable file the listing
# shows its ownership and permissions.
echo "The log file $LOG is:"
ls -l -- "$LOG"