diff --git a/cmn/cos/node_state.go b/cmn/cos/node_state.go index ed5fde87bd..4fe8676da4 100644 --- a/cmn/cos/node_state.go +++ b/cmn/cos/node_state.go @@ -24,8 +24,8 @@ const ( Resilvering // warning ResilverInterrupted // warning NodeRestarted // warning (powercycle, crash) - OOS // red alert (see IsRed below) - OOM // red alert + OOS // out of space; red alert (see IsRed below) + OOM // out of memory; red alert MaintenanceMode // warning LowCapacity // (used > high); warning: OOS possible soon.. LowMemory // ditto OOM @@ -36,12 +36,14 @@ const ( CertificateExpired // red --/-- CertificateInvalid // red --/-- KeepAliveErrors // warning (new keep-alive errors during the last 5m) + OOCPU // out of CPU; red + LowCPU // warning ) func (f NodeStateFlags) IsOK() bool { return f == NodeStarted|ClusterStarted } func (f NodeStateFlags) IsRed() bool { - return f.IsSet(OOS) || f.IsSet(OOM) || f.IsSet(DiskFault) || f.IsSet(NoMountpaths) || f.IsSet(NumGoroutines) || + return f.IsSet(OOS) || f.IsSet(OOM) || f.IsSet(OOCPU) || f.IsSet(DiskFault) || f.IsSet(NoMountpaths) || f.IsSet(NumGoroutines) || f.IsSet(CertificateExpired) } @@ -49,7 +51,7 @@ func (f NodeStateFlags) IsWarn() bool { return f.IsSet(Rebalancing) || f.IsSet(RebalanceInterrupted) || f.IsSet(Resilvering) || f.IsSet(ResilverInterrupted) || f.IsSet(NodeRestarted) || f.IsSet(MaintenanceMode) || - f.IsSet(LowCapacity) || f.IsSet(LowMemory) || + f.IsSet(LowCapacity) || f.IsSet(LowMemory) || f.IsSet(LowCPU) || f.IsSet(CertWillSoonExpire) } @@ -68,7 +70,7 @@ func (f NodeStateFlags) String() string { return "ok" } - var sb []string + sb := make([]string, 0, 4) if f&VoteInProgress == VoteInProgress { sb = append(sb, "vote-in-progress") } @@ -133,6 +135,12 @@ func (f NodeStateFlags) String() string { if f&KeepAliveErrors == KeepAliveErrors { sb = append(sb, "keep-alive-errors") } + if f&OOCPU == OOCPU { + sb = append(sb, "out-of-cpu") + } + if f&LowCPU == LowCPU { + sb = append(sb, "low-cpu") + } l := len(sb) switch l { diff --git a/stats/common.go b/stats/common.go index 33afe79936..a8283a266c 100644 --- a/stats/common.go +++ b/stats/common.go @@ -482,33 +482,85 @@ waitStartup: func (r *runner) StartedUp() bool { return r.startedUp.Load() } -// - check OOM, and +// - check OOM and OOCPU // - set NodeStateFlags with both capacity and memory flags -func (r *runner) _mem(mm *memsys.MMSA, set, clr cos.NodeStateFlags) { +func (r *runner) _memload(mm *memsys.MMSA, set, clr cos.NodeStateFlags) { _ = r.mem.Get() pressure := mm.Pressure(&r.mem) flags := r.nodeStateFlags() // current/old + + // memory, first switch { case pressure >= memsys.PressureExtreme: if !flags.IsSet(cos.OOM) { set |= cos.OOM + clr |= cos.LowMemory nlog.Errorln(mm.Str(&r.mem)) } oom.FreeToOS(true) case pressure >= memsys.PressureHigh: - set |= cos.LowMemory clr |= cos.OOM if !flags.IsSet(cos.LowMemory) { + set |= cos.LowMemory nlog.Warningln(mm.Str(&r.mem)) } default: - clr |= cos.OOM | cos.LowMemory if flags.IsSet(cos.LowMemory | cos.OOM) { + clr |= cos.OOM | cos.LowMemory nlog.Infoln(mm.Name, "back to normal") } } - r.SetClrFlag(NodeAlerts, set, clr) + + // load, second + nset, nclr := _load(flags, set, clr) + + r.SetClrFlag(NodeAlerts, nset, nclr) +} + +// CPU utilization, load average +// - notice hardcoded watermarks: (80%, 70%, 50%); TODO config +// - compare with `fs.ThrottlePct` +func _load(flags, set, clr cos.NodeStateFlags) (cos.NodeStateFlags, cos.NodeStateFlags) { + const tag = "CPU utilization:" + var ( + load = sys.MaxLoad() + cpus = runtime.NumCPU() + ) + // ok + if load < float64(cpus>>1) { // 50% + if flags.IsSet(cos.LowCPU | cos.OOCPU) { + clr |= cos.OOCPU | cos.LowCPU + nlog.Infoln(tag, "back to normal") + } + return set, clr + } + // extreme + var ( + fcpus = float64(cpus) + oocpu = max(fcpus*0.8, 1) // 80% + ) + if load >= oocpu { + if !flags.IsSet(cos.OOCPU) { + set |= cos.OOCPU + clr |= cos.LowCPU + nlog.Errorln(tag, "extremely high [", load, cpus, "]") + } + return set, clr + } + // high + highcpu := fcpus * 0.7 // 70% + if load >= highcpu { + clr |= cos.OOCPU + if !flags.IsSet(cos.LowCPU) { + set |= cos.LowCPU + nlog.Warningln(tag, "high [", load, cpus, "]") + } + } + + // (50%, 70%) is, effectively, hysteresis interval + + return set, clr } func (r *runner) GetStats() *Node { diff --git a/stats/proxy_stats.go b/stats/proxy_stats.go index 9b5a17e5e0..fd2b127d02 100644 --- a/stats/proxy_stats.go +++ b/stats/proxy_stats.go @@ -83,7 +83,8 @@ func (r *Prunner) log(now int64, uptime time.Duration, config *cmn.Config) { r._next(config, now) } - r._mem(r.node.PageMM(), 0, 0) + // memory and CPU alerts + r._memload(r.node.PageMM(), 0, 0) } func (r *Prunner) statsTime(newval time.Duration) { diff --git a/stats/target_stats.go b/stats/target_stats.go index 00ac0a2562..8915cbc6d7 100644 --- a/stats/target_stats.go +++ b/stats/target_stats.go @@ -632,8 +632,8 @@ func (r *Trunner) log(now int64, uptime time.Duration, config *cmn.Config) { clr |= cos.NodeRestarted } - // 7. separately, memory w/ set/clr flags cumulative - r._mem(r.t.PageMM(), set, clr) + // 7. separately, memory and CPU alerts + r._memload(r.t.PageMM(), set, clr) } func (r *Trunner) _cap(config *cmn.Config, now int64, verbose bool) (set, clr cos.NodeStateFlags) {