From 55db4cb4638a41e5d7cf28ba71f38666bd7aea96 Mon Sep 17 00:00:00 2001 From: Alex Aizman Date: Fri, 30 Aug 2024 09:49:41 -0400 Subject: [PATCH] tls cert (re)loader: (valid, invalid, expired) state; more alerts * part seven, prev. commit: 1fe3c80bec3f * with refactoring Signed-off-by: Alex Aizman --- ais/htcommon.go | 5 +- ais/htrun.go | 12 +-- ais/target.go | 2 +- ais/tgtcp.go | 5 +- ais/tgtfshc.go | 2 +- ais/tgtspace.go | 5 +- ais/x509.go | 6 +- cmn/{tls => certloader}/certloader.go | 133 ++++++++++++++++++-------- cmn/client.go | 4 +- cmn/cos/node_state.go | 10 +- docs/https.md | 11 +-- reb/globrun.go | 5 +- 12 files changed, 125 insertions(+), 75 deletions(-) rename cmn/{tls => certloader}/certloader.go (55%) diff --git a/ais/htcommon.go b/ais/htcommon.go index a6d0629ad8..01055d70dc 100644 --- a/ais/htcommon.go +++ b/ais/htcommon.go @@ -25,10 +25,10 @@ import ( "github.com/NVIDIA/aistore/3rdparty/golang/mux" "github.com/NVIDIA/aistore/api/apc" "github.com/NVIDIA/aistore/cmn" + "github.com/NVIDIA/aistore/cmn/certloader" "github.com/NVIDIA/aistore/cmn/cos" "github.com/NVIDIA/aistore/cmn/debug" "github.com/NVIDIA/aistore/cmn/nlog" - aistls "github.com/NVIDIA/aistore/cmn/tls" "github.com/NVIDIA/aistore/core/meta" "github.com/NVIDIA/aistore/ext/etl" "github.com/NVIDIA/aistore/memsys" @@ -597,8 +597,7 @@ func newTLS(conf *cmn.HTTPConf) (tlsConf *tls.Config, err error) { tlsConf.ClientCAs = pool } if conf.Certificate != "" && conf.CertKey != "" { - tlsConf.GetCertificate, err = aistls.GetCert() - debug.AssertNoErr(err) + tlsConf.GetCertificate, err = certloader.GetCert() } return tlsConf, err } diff --git a/ais/htrun.go b/ais/htrun.go index d05ae392d3..9296bf9697 100644 --- a/ais/htrun.go +++ b/ais/htrun.go @@ -27,13 +27,13 @@ import ( "github.com/NVIDIA/aistore/cmn" "github.com/NVIDIA/aistore/cmn/archive" "github.com/NVIDIA/aistore/cmn/atomic" + "github.com/NVIDIA/aistore/cmn/certloader" "github.com/NVIDIA/aistore/cmn/cos" "github.com/NVIDIA/aistore/cmn/debug" "github.com/NVIDIA/aistore/cmn/jsp" "github.com/NVIDIA/aistore/cmn/k8s" "github.com/NVIDIA/aistore/cmn/mono" "github.com/NVIDIA/aistore/cmn/nlog" - aistls "github.com/NVIDIA/aistore/cmn/tls" "github.com/NVIDIA/aistore/core" "github.com/NVIDIA/aistore/core/meta" "github.com/NVIDIA/aistore/memsys" @@ -104,7 +104,7 @@ func (h *htrun) ByteMM() *memsys.MMSA { return h.smm } // NOTE: currently, only 'resume' (see also: kaSuspendMsg) func (h *htrun) smapUpdatedCB(_, _ *smapX, nfl, ofl cos.BitFlags) { if ofl.IsAnySet(meta.SnodeMaintDecomm) && !nfl.IsAnySet(meta.SnodeMaintDecomm) { - h.statsT.ClrFlag(stats.NodeAlerts, cos.MaintenanceMode) + h.statsT.ClrFlag(cos.NodeAlerts, cos.MaintenanceMode) h.keepalive.ctrl(kaResumeMsg) } } @@ -194,14 +194,14 @@ func (h *htrun) ClusterStarted() bool { return h.startup.cluster.Load() > 0 } // func (h *htrun) markClusterStarted() { h.startup.cluster.Store(mono.NanoTime()) - h.statsT.SetFlag(stats.NodeAlerts, cos.ClusterStarted) + h.statsT.SetFlag(cos.NodeAlerts, cos.ClusterStarted) } func (h *htrun) NodeStarted() bool { return h.startup.node.Load() > 0 } func (h *htrun) markNodeStarted() { h.startup.node.Store(mono.NanoTime()) - h.statsT.SetFlag(stats.NodeAlerts, cos.NodeStarted) + h.statsT.SetFlag(cos.NodeAlerts, cos.NodeStarted) } func (h *htrun) regNetHandlers(networkHandlers []networkHandler) { @@ -261,7 +261,7 @@ func (h *htrun) regNetHandlers(networkHandlers []networkHandler) { func (h *htrun) init(config *cmn.Config) { // before newTLS() below & before intra-cluster clients if config.Net.HTTP.UseHTTPS { - if err := aistls.Init(config.Net.HTTP.Certificate, config.Net.HTTP.CertKey, h.statsT); err != nil { + if err := certloader.Init(config.Net.HTTP.Certificate, config.Net.HTTP.CertKey, h.statsT); err != nil { cos.ExitLog(err) } } @@ -1144,7 +1144,7 @@ func (h *htrun) statsAndStatus() (ds *stats.NodeStatus) { Snode: h.si, }, Cluster: cos.NodeStateInfo{ - Flags: cos.NodeStateFlags(h.statsT.Get(stats.NodeAlerts)), + Flags: cos.NodeStateFlags(h.statsT.Get(cos.NodeAlerts)), }, SmapVersion: smap.Version, MemCPUInfo: apc.GetMemCPU(), diff --git a/ais/target.go b/ais/target.go index 90d411c658..ff8455af2b 100644 --- a/ais/target.go +++ b/ais/target.go @@ -507,7 +507,7 @@ func (t *target) checkRestarted(config *cmn.Config) (fatalErr, writeErr error) { fatalErr = fmt.Errorf("%s: %q is in use (duplicate or overlapping run?)", t, red.inUse) return } - t.statsT.SetFlag(stats.NodeAlerts, cos.Restarted) + t.statsT.SetFlag(cos.NodeAlerts, cos.Restarted) fs.PersistMarker(fname.NodeRestartedPrev) } fatalErr, writeErr = fs.PersistMarker(fname.NodeRestartedMarker) diff --git a/ais/tgtcp.go b/ais/tgtcp.go index 8e9bdbb716..e44a9f18de 100644 --- a/ais/tgtcp.go +++ b/ais/tgtcp.go @@ -30,7 +30,6 @@ import ( "github.com/NVIDIA/aistore/nl" "github.com/NVIDIA/aistore/reb" "github.com/NVIDIA/aistore/res" - "github.com/NVIDIA/aistore/stats" "github.com/NVIDIA/aistore/xact" "github.com/NVIDIA/aistore/xact/xreg" jsoniter "github.com/json-iterator/go" @@ -188,13 +187,13 @@ func (t *target) daeputMsg(w http.ResponseWriter, r *http.Request) { if !t.ensureIntraControl(w, r, true /* from primary */) { return } - t.statsT.SetFlag(stats.NodeAlerts, cos.MaintenanceMode) + t.statsT.SetFlag(cos.NodeAlerts, cos.MaintenanceMode) t.termKaliveX(msg.Action, true) case apc.ActShutdownCluster, apc.ActShutdownNode: if !t.ensureIntraControl(w, r, true /* from primary */) { return } - t.statsT.SetFlag(stats.NodeAlerts, cos.MaintenanceMode) + t.statsT.SetFlag(cos.NodeAlerts, cos.MaintenanceMode) t.termKaliveX(msg.Action, false) t.shutdown(msg.Action) case apc.ActRmNodeUnsafe: diff --git a/ais/tgtfshc.go b/ais/tgtfshc.go index 7cde18134f..5064e9b25f 100644 --- a/ais/tgtfshc.go +++ b/ais/tgtfshc.go @@ -61,6 +61,6 @@ func (t *target) FSHC(err error, mi *fs.Mountpath, fqn string) { func (t *target) DisableMpath(mi *fs.Mountpath) (err error) { _, err = t.fsprg.disableMpath(mi.Path, true /*dont-resilver*/) - t.statsT.SetFlag(stats.NodeAlerts, cos.DiskFault) + t.statsT.SetFlag(cos.NodeAlerts, cos.DiskFault) return err } diff --git a/ais/tgtspace.go b/ais/tgtspace.go index 3d7e4771aa..25a8a8a438 100644 --- a/ais/tgtspace.go +++ b/ais/tgtspace.go @@ -20,7 +20,6 @@ import ( "github.com/NVIDIA/aistore/ios" "github.com/NVIDIA/aistore/nl" "github.com/NVIDIA/aistore/space" - "github.com/NVIDIA/aistore/stats" "github.com/NVIDIA/aistore/xact" "github.com/NVIDIA/aistore/xact/xreg" ) @@ -65,9 +64,9 @@ func (t *target) OOS(csRefreshed *fs.CapStatus, config *cmn.Config, tcdf *fs.Tcd } if cs.IsOOS() { - t.statsT.SetFlag(stats.NodeAlerts, cos.OOS) + t.statsT.SetFlag(cos.NodeAlerts, cos.OOS) } else { - t.statsT.SetFlag(stats.NodeAlerts, cos.LowCapacity) + t.statsT.SetFlag(cos.NodeAlerts, cos.LowCapacity) } nlog.Warningln(t.String(), "running store cleanup:", cs.String()) // run serially, cleanup first and LRU second, iff out-of-space persists diff --git a/ais/x509.go b/ais/x509.go index 34a4fcb359..67580eb7dd 100644 --- a/ais/x509.go +++ b/ais/x509.go @@ -9,14 +9,14 @@ import ( "github.com/NVIDIA/aistore/api/apc" "github.com/NVIDIA/aistore/cmn" + "github.com/NVIDIA/aistore/cmn/certloader" "github.com/NVIDIA/aistore/cmn/nlog" - aistls "github.com/NVIDIA/aistore/cmn/tls" "github.com/NVIDIA/aistore/core" "github.com/NVIDIA/aistore/core/meta" ) func (h *htrun) daeLoadX509(w http.ResponseWriter, r *http.Request) { - if err := aistls.Load(); err != nil { + if err := certloader.Load(); err != nil { h.writeErr(w, r, err) } } @@ -24,7 +24,7 @@ func (h *htrun) daeLoadX509(w http.ResponseWriter, r *http.Request) { func (p *proxy) cluLoadX509(w http.ResponseWriter, r *http.Request) { // 1. self var err error - if err = aistls.Load(); err != nil { + if err = certloader.Load(); err != nil { p.writeErr(w, r, err) return } diff --git a/cmn/tls/certloader.go b/cmn/certloader/certloader.go similarity index 55% rename from cmn/tls/certloader.go rename to cmn/certloader/certloader.go index ca5e1de8e2..abf79edd5e 100644 --- a/cmn/tls/certloader.go +++ b/cmn/certloader/certloader.go @@ -1,13 +1,12 @@ -// Package tls provides support for TLS. +// Package certloader loads and reloads X.509 certs. /* * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. */ -package tls +package certloader import ( "crypto/tls" "crypto/x509" - "errors" "fmt" "os" "strings" @@ -20,9 +19,11 @@ import ( "github.com/NVIDIA/aistore/hk" ) -// TODO: can be _expired_ with invalid (non-parseable) replacement - differentiate +const name = "tls-cert-loader" -const name = "certificate-loader" +const dfltTimeInvalid = time.Hour + +const fmtErrExpired = "%s: %s expired (valid until %v)" type ( xcert struct { @@ -34,18 +35,25 @@ type ( size int64 } certLoader struct { - xcert atomic.Pointer[xcert] + tstats cos.StatsUpdater certFile string keyFile string - tstats cos.StatsUpdater + xcert atomic.Pointer[xcert] } - GetCertCB func(_ *tls.ClientHelloInfo) (*tls.Certificate, error) + // tls.Config.GetCertificate + GetCertCB func(_ *tls.ClientHelloInfo) (*tls.Certificate, error) + + // tls.Config.GetClientCertificate GetClientCertCB func(_ *tls.CertificateRequestInfo) (*tls.Certificate, error) + + errExpired struct { + msg string + } ) var ( - loader *certLoader + gcl *certLoader ) // (htrun only) @@ -54,22 +62,44 @@ func Init(certFile, keyFile string, tstats cos.StatsUpdater) (err error) { return nil } - debug.Assert(loader == nil) - loader = &certLoader{certFile: certFile, keyFile: keyFile, tstats: tstats} - if err = loader.load(false /*compare*/); err != nil { + debug.Assert(gcl == nil) + gcl = &certLoader{certFile: certFile, keyFile: keyFile, tstats: tstats} + if err = Load(); err != nil { nlog.Errorln("FATAL:", err) - loader = nil return err } - hk.Reg(name, loader.hk, loader.hktime()) + + hk.Reg(name, gcl.hk, gcl.hktime()) return nil } -func Load() error { - return loader.load(false /*compare*/) +// via (Init, API call) +func Load() (err error) { + if err = gcl.do(false /*compare*/); err == nil { + return nil + } + if isExpired(err) { + gcl.tstats.SetFlag(cos.NodeAlerts, cos.CertificateExpired) + } else { + gcl.tstats.SetFlag(cos.NodeAlerts, cos.CertificateInvalid) + } + return err +} + +func (cl *certLoader) hk() time.Duration { + if err := cl.do(true /*compare*/); err != nil { + nlog.Errorln(err) + } + return cl.hktime() } func (cl *certLoader) hktime() (d time.Duration) { + flags := cos.NodeStateFlags(cl.tstats.Get(cos.NodeAlerts)) + if flags.IsSet(cos.CertificateExpired) || flags.IsSet(cos.CertificateInvalid) { + return dfltTimeInvalid + } + + // (still) valid const warn = "X.509 will soon expire - remains:" rem := time.Until(cl.xcert.Load().notAfter) switch { @@ -84,23 +114,38 @@ func (cl *certLoader) hktime() (d time.Duration) { d = time.Minute case rem > 0: nlog.Errorln(cl.certFile, warn, rem) - d = min(10*time.Second, rem) + d = time.Minute default: // expired cl.tstats.SetFlag(cos.NodeAlerts, cos.CertificateExpired) - d = time.Hour + d = dfltTimeInvalid } return d } +func (cl *certLoader) errorf() error { + flags := cos.NodeStateFlags(cl.tstats.Get(cos.NodeAlerts)) + switch { + case flags.IsSet(cos.CertificateInvalid): + return fmt.Errorf("%s: (%s, %s) is invalid", name, cl.certFile, cl.keyFile) + case flags.IsSet(cos.CertificateExpired): + xcert := cl.xcert.Load() + msg := fmt.Sprintf(fmtErrExpired, name, cl.certFile, xcert.notAfter) + return &errExpired{msg} + default: + return nil + } +} + func (cl *certLoader) _get() *tls.Certificate { return &cl.xcert.Load().Certificate } func (cl *certLoader) _hello(*tls.ClientHelloInfo) (*tls.Certificate, error) { return cl._get(), nil } func GetCert() (GetCertCB, error) { - if loader == nil { - return nil, errors.New(name + " is ") + debug.Assert(gcl != nil, name, " not initialized") + if err := gcl.errorf(); err != nil { + return nil, err } - return loader._hello, nil + return gcl._hello, nil } func (cl *certLoader) _info(*tls.CertificateRequestInfo) (*tls.Certificate, error) { @@ -108,20 +153,14 @@ func (cl *certLoader) _info(*tls.CertificateRequestInfo) (*tls.Certificate, erro } func GetClientCert() (GetClientCertCB, error) { - if loader == nil { - return nil, errors.New(name + " is ") - } - return loader._info, nil -} - -func (cl *certLoader) hk() time.Duration { - if err := cl.load(true /*compare*/); err != nil { - nlog.Errorln(err) + debug.Assert(gcl != nil, name, " not initialized") + if err := gcl.errorf(); err != nil { + return nil, err } - return cl.hktime() + return gcl._info, nil } -func (cl *certLoader) load(compare bool) (err error) { +func (cl *certLoader) do(compare bool) (err error) { var ( finfo os.FileInfo xcert = xcert{parent: cl} @@ -129,13 +168,13 @@ func (cl *certLoader) load(compare bool) (err error) { // 1. fstat finfo, err = os.Stat(cl.certFile) if err != nil { - return fmt.Errorf("%s: failed to fstat X.509 %q, err: %w", name, cl.certFile, err) + return fmt.Errorf("%s: failed to fstat %q, err: %w", name, cl.certFile, err) } // 2. updated? if compare { xcert := cl.xcert.Load() - debug.Assert(xcert != nil, "expecting X.509 loaded at startup") + debug.Assert(xcert != nil, "expecting X.509 loaded at startup: ", cl.certFile, ", ", cl.keyFile) if finfo.ModTime() == xcert.modTime && finfo.Size() == xcert.size { return nil } @@ -144,17 +183,17 @@ func (cl *certLoader) load(compare bool) (err error) { // 3. read and parse xcert.Certificate, err = tls.LoadX509KeyPair(cl.certFile, cl.keyFile) if err != nil { - return fmt.Errorf("%s: failed to load X.509, err: %w", name, err) + return fmt.Errorf("%s: failed to load (%s, %s), err: %w", name, cl.certFile, cl.keyFile, err) } if err = xcert.ini(finfo); err != nil { return err } - // 4. keep and log - cl.tstats.ClrFlag(cos.NodeAlerts, cos.CertificateExpired) + // 4. ok + cl.tstats.ClrFlag(cos.NodeAlerts, cos.CertificateExpired|cos.CertificateInvalid) cl.xcert.Store(&xcert) - nlog.Infoln(xcert.String()) + nlog.Infoln(xcert.String()) return nil } @@ -181,7 +220,7 @@ func (x *xcert) ini(finfo os.FileInfo) (err error) { if x.Certificate.Leaf == nil { x.Certificate.Leaf, err = x509.ParseCertificate(x.Certificate.Certificate[0]) if err != nil { - return fmt.Errorf("%s: failed to parse X.509 %q, err: %w", name, x.parent.certFile, err) + return fmt.Errorf("%s: failed to parse %q, err: %w", name, x.parent.certFile, err) } } { @@ -192,9 +231,21 @@ func (x *xcert) ini(finfo os.FileInfo) (err error) { } now := time.Now() if now.After(x.notAfter) { - err = fmt.Errorf("%s: X.509 %s expired (valid until %v)", name, x.parent.certFile, x.notAfter) + msg := fmt.Sprintf(fmtErrExpired, name, x.parent.certFile, x.notAfter) + err = &errExpired{msg} } else if now.Before(x.notBefore) { - nlog.Warningln(x.parent.certFile, "X.509 is not valid _yet_: [", x.notBefore, x.notAfter, "]") + nlog.Warningln(x.parent.certFile, "is not valid _yet_: [", x.notBefore, x.notAfter, "]") } return err } + +// +// other +// + +func (e *errExpired) Error() string { return e.msg } + +func isExpired(err error) bool { + _, ok := err.(*errExpired) + return ok +} diff --git a/cmn/client.go b/cmn/client.go index 8981bfdb10..919a6a4c8a 100644 --- a/cmn/client.go +++ b/cmn/client.go @@ -15,8 +15,8 @@ import ( "time" "github.com/NVIDIA/aistore/api/env" + "github.com/NVIDIA/aistore/cmn/certloader" "github.com/NVIDIA/aistore/cmn/cos" - aistls "github.com/NVIDIA/aistore/cmn/tls" ) type ( @@ -113,7 +113,7 @@ func NewTLS(sargs TLSArgs, intra bool) (tlsConf *tls.Config, err error) { // intra-cluster client if intra { - tlsConf.GetClientCertificate, err = aistls.GetClientCert() + tlsConf.GetClientCertificate, err = certloader.GetClientCert() return tlsConf, err } diff --git a/cmn/cos/node_state.go b/cmn/cos/node_state.go index 24b234203d..d6b2eb18f9 100644 --- a/cmn/cos/node_state.go +++ b/cmn/cos/node_state.go @@ -30,9 +30,10 @@ const ( LowCapacity // (used > high); warning: OOS possible soon.. LowMemory // ditto OOM DiskFault // red - NoMountpaths // red (TODO: reserved, not used) + NoMountpaths // red: (reserved, not used) NumGoroutines // red - CertificateExpired // red (X.509 cert expired) + CertificateExpired // red: (X.509 cert expired) + CertificateInvalid // red: (X.509 cert invalid) ) func (f NodeStateFlags) IsOK() bool { return f == NodeStarted|ClusterStarted } @@ -118,7 +119,10 @@ func (f NodeStateFlags) String() string { sb = append(sb, "high-number-of-goroutines") } if f&CertificateExpired == CertificateExpired { - sb = append(sb, "TLS-certificate-expired") + sb = append(sb, "tls-cert-expired") + } + if f&CertificateInvalid == CertificateInvalid { + sb = append(sb, "tls-cert-invalid") } l := len(sb) diff --git a/docs/https.md b/docs/https.md index 5bf04fc580..a5d58a851c 100644 --- a/docs/https.md +++ b/docs/https.md @@ -128,15 +128,14 @@ In AIStore, related functionality consists of two pieces: The scope of this latter operation may be either a selected node or entire cluster. -As far as automatic adjustment of the interval, this depends on the remaining time to expire and works approximately as follows: +As far as automatic adjustment of the polling interval, the resulting value depends on the remaining time (until expired) and works [approximately](https://github.com/NVIDIA/aistore/blob/main/cmn/certloader/cl.go) as follows: | time to expire | period to check for renewal | | -- | -- | | more than 24h | 6 hours | | more than 6h | 1 hour | | more than 1h | 10m | -| more than 10m | 1m | -| 10m or less | 10s | +| more than 1s | 1m | | `expired` | 1h | Upon initial loading, or every time when reloading, an AIS node logs a record that also shows the validity bounds, e.g.: @@ -145,15 +144,15 @@ Upon initial loading, or every time when reloading, an AIS node logs a record th I 11:05:45.753438 certloader:151 server.crt[26 Aug 24 18:18 UTC, 26 Aug 25 18:18 UTC] ``` -In addition, if certificate expires, AIS node raises the namesake alert that - as usual - will show up in Grafana dashboard, CLI `show cluster` command, or both. +In addition, if certificate fails to load or expires, AIS node raises the namesake alert that - as usual - will show up in Grafana dashboard or via CLI `show cluster`, or both. ```console $ ais show cluster PROXY MEM USED(%) MEM AVAIL LOAD AVERAGE UPTIME STATUS ALERT -p[atipJhgn][P] 0.17% 27.51GiB [0.3 0.1 0.0] - online **TLS-certificate-expired** +p[atipJhgn][P] 0.17% 27.51GiB [0.3 0.1 0.0] - online **tls-cert-expired** TARGET MEM USED(%) MEM AVAIL CAP USED(%) CAP AVAIL LOAD AVERAGE STATUS ALERT -t[NlLtPtrm] 0.16% 27.51GiB 16% 367.538GiB [0.3 0.1 0.0] online **TLS-certificate-expired** +t[NlLtPtrm] 0.16% 27.51GiB 16% 367.538GiB [0.3 0.1 0.0] online **tls-cert-expired** Summary: Proxies: 1 diff --git a/reb/globrun.go b/reb/globrun.go index e0eb195edd..639303b7ae 100644 --- a/reb/globrun.go +++ b/reb/globrun.go @@ -26,7 +26,6 @@ import ( "github.com/NVIDIA/aistore/core" "github.com/NVIDIA/aistore/core/meta" "github.com/NVIDIA/aistore/fs" - "github.com/NVIDIA/aistore/stats" "github.com/NVIDIA/aistore/transport" "github.com/NVIDIA/aistore/transport/bundle" "github.com/NVIDIA/aistore/xact" @@ -224,7 +223,7 @@ func (reb *Reb) RunRebalance(smap *meta.Smap, id int64, notif *xact.NotifXact, t onGFN() - tstats.SetFlag(stats.NodeAlerts, cos.Rebalancing) + tstats.SetFlag(cos.NodeAlerts, cos.Rebalancing) errCnt := 0 err := reb.run(rargs) @@ -240,7 +239,7 @@ func (reb *Reb) RunRebalance(smap *meta.Smap, id int64, notif *xact.NotifXact, t } reb.fini(rargs, logHdr, err) - tstats.ClrFlag(stats.NodeAlerts, cos.Rebalancing) + tstats.ClrFlag(cos.NodeAlerts, cos.Rebalancing) offGFN() }