Skip to content

Commit

Permalink
observability: CPU utilization
Browse files Browse the repository at this point in the history
* add 'out-of-cpu' (red) and 'low-cpu' (yellow) alerts
* extremely high, high, and normal watermarks, respectively: (80%, 70%, and 50%)

Signed-off-by: Alex Aizman <[email protected]>
  • Loading branch information
alex-aizman committed Dec 4, 2024
1 parent b939d6d commit afef76b
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 13 deletions.
18 changes: 13 additions & 5 deletions cmn/cos/node_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ const (
Resilvering // warning
ResilverInterrupted // warning
NodeRestarted // warning (powercycle, crash)
OOS // red alert (see IsRed below)
OOM // red alert
OOS // out of space; red alert (see IsRed below)
OOM // out of memory; red alert
MaintenanceMode // warning
LowCapacity // (used > high); warning: OOS possible soon..
LowMemory // ditto OOM
Expand All @@ -36,20 +36,22 @@ const (
CertificateExpired // red --/--
CertificateInvalid // red --/--
KeepAliveErrors // warning (new keep-alive errors during the last 5m)
OOCPU // out of CPU; red
LowCPU // warning
)

// IsOK reports whether the flags equal exactly NodeStarted|ClusterStarted,
// i.e. both of those bits are set and no other bit is.
func (f NodeStateFlags) IsOK() bool {
	const healthy = NodeStarted | ClusterStarted
	return f == healthy
}

// IsRed reports whether at least one of the red-alert flags tested below is set.
// NOTE(review): this span is a rendered diff without +/- markers: the first `return`
// line is the pre-commit version and the second one (which adds the OOCPU check)
// is its replacement; only one of the two exists in the actual file.
func (f NodeStateFlags) IsRed() bool {
return f.IsSet(OOS) || f.IsSet(OOM) || f.IsSet(DiskFault) || f.IsSet(NoMountpaths) || f.IsSet(NumGoroutines) ||
return f.IsSet(OOS) || f.IsSet(OOM) || f.IsSet(OOCPU) || f.IsSet(DiskFault) || f.IsSet(NoMountpaths) || f.IsSet(NumGoroutines) ||
f.IsSet(CertificateExpired)
}

// IsWarn reports whether at least one of the warning-level flags tested below is set.
// NOTE(review): this span is a rendered diff without +/- markers: the line with
// `LowCapacity`/`LowMemory` appears twice - first in its pre-commit form, then in
// the post-commit form that additionally checks LowCPU.
func (f NodeStateFlags) IsWarn() bool {
return f.IsSet(Rebalancing) || f.IsSet(RebalanceInterrupted) ||
f.IsSet(Resilvering) || f.IsSet(ResilverInterrupted) ||
f.IsSet(NodeRestarted) || f.IsSet(MaintenanceMode) ||
f.IsSet(LowCapacity) || f.IsSet(LowMemory) ||
f.IsSet(LowCapacity) || f.IsSet(LowMemory) || f.IsSet(LowCPU) ||
f.IsSet(CertWillSoonExpire)
}

Expand All @@ -68,7 +70,7 @@ func (f NodeStateFlags) String() string {
return "ok"
}

var sb []string
sb := make([]string, 0, 4)
if f&VoteInProgress == VoteInProgress {
sb = append(sb, "vote-in-progress")
}
Expand Down Expand Up @@ -133,6 +135,12 @@ func (f NodeStateFlags) String() string {
if f&KeepAliveErrors == KeepAliveErrors {
sb = append(sb, "keep-alive-errors")
}
if f&OOCPU == OOCPU {
sb = append(sb, "out-of-cpu")
}
if f&LowCPU == LowCPU {
sb = append(sb, "low-cpu")
}

l := len(sb)
switch l {
Expand Down
62 changes: 57 additions & 5 deletions stats/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -482,33 +482,85 @@ waitStartup:

// StartedUp returns the value currently loaded from r.startedUp.
func (r *runner) StartedUp() bool {
	started := r.startedUp.Load()
	return started
}

// - check OOM, and
// - check OOM and OOCPU
// - set NodeStateFlags with both capacity and memory flags
//
// The method takes caller-provided `set` and `clr` bits, extends them with
// OOM/LowMemory bits derived from mm.Pressure(&r.mem), passes the result through
// _load (CPU alerts), and applies the outcome via r.SetClrFlag(NodeAlerts, ...).
// Log messages are emitted only when the corresponding flag is not yet present in
// the current node state (or, for "back to normal", when it still is).
//
// NOTE(review): this span is a rendered diff without +/- markers; removed and added
// lines are interleaved (old `_mem` signature followed by the new `_memload` one,
// old unconditional statements followed by their guarded replacements, and the old
// SetClrFlag call followed by the new `_load` + SetClrFlag sequence).
func (r *runner) _mem(mm *memsys.MMSA, set, clr cos.NodeStateFlags) {
func (r *runner) _memload(mm *memsys.MMSA, set, clr cos.NodeStateFlags) {
// return value deliberately discarded; presumably the call refreshes r.mem - TODO confirm
_ = r.mem.Get()
pressure := mm.Pressure(&r.mem)

flags := r.nodeStateFlags() // current/old

// memory, first
switch {
case pressure >= memsys.PressureExtreme:
// raise OOM (and drop LowMemory) only on transition into the extreme state
if !flags.IsSet(cos.OOM) {
set |= cos.OOM
clr |= cos.LowMemory
nlog.Errorln(mm.Str(&r.mem))
}
// invoked on every pass while pressure stays extreme, not just on transition
oom.FreeToOS(true)
case pressure >= memsys.PressureHigh:
set |= cos.LowMemory
// OOM is cleared unconditionally; LowMemory is raised (and logged) on transition only
clr |= cos.OOM
if !flags.IsSet(cos.LowMemory) {
set |= cos.LowMemory
nlog.Warningln(mm.Str(&r.mem))
}
default:
clr |= cos.OOM | cos.LowMemory
// NOTE(review): the branches above never leave OOM and LowMemory raised together;
// if IsSet requires ALL bits of the mask, this condition never holds and the
// alerts are never cleared - confirm IsSet semantics (any-bit vs all-bits).
if flags.IsSet(cos.LowMemory | cos.OOM) {
clr |= cos.OOM | cos.LowMemory
nlog.Infoln(mm.Name, "back to normal")
}
}
r.SetClrFlag(NodeAlerts, set, clr)

// load, second
// _load receives the pre-update `flags` together with the accumulated set/clr
nset, nclr := _load(flags, set, clr)

r.SetClrFlag(NodeAlerts, nset, nclr)
}

// _load evaluates CPU utilization (load average) and folds the resulting alert
// bits into the caller's set/clr pair.
//
// The value returned by sys.MaxLoad() is compared against watermarks derived from
// runtime.NumCPU():
//   - load <  50% of CPUs: clear OOCPU and LowCPU if either one is currently set
//   - load >= 80% of CPUs (but not less than 1): raise OOCPU (red), clear LowCPU
//   - load >= 70% of CPUs: raise LowCPU (warning), clear OOCPU
//   - otherwise, [50%, 70%): leave the CPU flags as they are (hysteresis interval)
//
// flags is the node's current (old) state; it is consulted so that flags are
// raised and messages are logged on state transitions only.
// Returns the updated (set, clr).
//
// - notice hardcoded watermarks: (80%, 70%, 50%); TODO config
// - compare with `fs.ThrottlePct`
func _load(flags, set, clr cos.NodeStateFlags) (cos.NodeStateFlags, cos.NodeStateFlags) {
	const tag = "CPU utilization:"
	var (
		load  = sys.MaxLoad()
		cpus  = runtime.NumCPU()
		fcpus = float64(cpus)
	)
	// ok
	// The 50% watermark is computed in floating point: an integer `cpus>>1` would
	// truncate to 0 on a single-CPU host (making "back to normal" unreachable) and
	// would sit below 50% for any odd number of CPUs.
	if load < fcpus*0.5 {
		// Test each flag on its own: OOCPU and LowCPU are never raised together
		// (raising one clears the other), so a combined-mask check is only correct
		// if IsSet has any-bit semantics.
		if flags.IsSet(cos.LowCPU) || flags.IsSet(cos.OOCPU) {
			clr |= cos.OOCPU | cos.LowCPU
			nlog.Infoln(tag, "back to normal")
		}
		return set, clr
	}
	// extreme
	oocpu := max(fcpus*0.8, 1) // 80%
	if load >= oocpu {
		if !flags.IsSet(cos.OOCPU) {
			set |= cos.OOCPU
			clr |= cos.LowCPU
			nlog.Errorln(tag, "extremely high [", load, cpus, "]")
		}
		return set, clr
	}
	// high
	highcpu := fcpus * 0.7 // 70%
	if load >= highcpu {
		clr |= cos.OOCPU
		if !flags.IsSet(cos.LowCPU) {
			set |= cos.LowCPU
			nlog.Warningln(tag, "high [", load, cpus, "]")
		}
	}

	// [50%, 70%) is, effectively, hysteresis interval

	return set, clr
}

func (r *runner) GetStats() *Node {
Expand Down
3 changes: 2 additions & 1 deletion stats/proxy_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ func (r *Prunner) log(now int64, uptime time.Duration, config *cmn.Config) {
r._next(config, now)
}

r._mem(r.node.PageMM(), 0, 0)
// memory and CPU alerts
r._memload(r.node.PageMM(), 0, 0)
}

func (r *Prunner) statsTime(newval time.Duration) {
Expand Down
4 changes: 2 additions & 2 deletions stats/target_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -632,8 +632,8 @@ func (r *Trunner) log(now int64, uptime time.Duration, config *cmn.Config) {
clr |= cos.NodeRestarted
}

// 7. separately, memory w/ set/clr flags cumulative
r._mem(r.t.PageMM(), set, clr)
// 7. separately, memory and CPU alerts
r._memload(r.t.PageMM(), set, clr)
}

func (r *Trunner) _cap(config *cmn.Config, now int64, verbose bool) (set, clr cos.NodeStateFlags) {
Expand Down

0 comments on commit afef76b

Please sign in to comment.