From 56f7347a4d096cd7ecffe0838910f58b2a893047 Mon Sep 17 00:00:00 2001 From: Alex Aizman Date: Tue, 27 Aug 2024 11:36:14 -0400 Subject: [PATCH] global rebalance vs targets that are being decommissioned from the _rebalancing_ perspective, a target node that is in maintenance mode or that is being decommissioned must still be considered "active" _unless_ this target has already reached post-rebalancing (`SnodeMaintPostReb`) state Signed-off-by: Alex Aizman --- cmd/cli/go.mod | 2 +- cmd/cli/go.sum | 4 ++-- core/meta/smap.go | 16 ++++++++++++++++ reb/globrun.go | 14 +++++++------- 4 files changed, 26 insertions(+), 10 deletions(-) diff --git a/cmd/cli/go.mod b/cmd/cli/go.mod index 8f11b038ac..36b04f1d7c 100644 --- a/cmd/cli/go.mod +++ b/cmd/cli/go.mod @@ -3,7 +3,7 @@ module github.com/NVIDIA/aistore/cmd/cli go 1.22.3 require ( - github.com/NVIDIA/aistore v1.3.24-0.20240826235310-8c273cfa0d36 + github.com/NVIDIA/aistore v1.3.24-0.20240827150748-31d1a799f7e5 github.com/fatih/color v1.17.0 github.com/json-iterator/go v1.1.12 github.com/onsi/ginkgo/v2 v2.20.0 diff --git a/cmd/cli/go.sum b/cmd/cli/go.sum index 0897dfb9f0..46be79d5d4 100644 --- a/cmd/cli/go.sum +++ b/cmd/cli/go.sum @@ -1,7 +1,7 @@ code.cloudfoundry.org/bytefmt v0.0.0-20190710193110-1eb035ffe2b6/go.mod h1:wN/zk7mhREp/oviagqUXY3EwuHhWyOvAdsn5Y4CzOrc= github.com/BurntSushi/toml v1.3.2/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= -github.com/NVIDIA/aistore v1.3.24-0.20240826235310-8c273cfa0d36 h1:6WbWE3vqkTVP4i1hnHqye3yktBQaD4KDtJ0MUdcmc64= -github.com/NVIDIA/aistore v1.3.24-0.20240826235310-8c273cfa0d36/go.mod h1:si83S9r29vwIC0f0CE2Mk+25bFiaN6mmVlmuBpP4hHM= +github.com/NVIDIA/aistore v1.3.24-0.20240827150748-31d1a799f7e5 h1:ZgEB37pn2584FDlJdKPMw3AKWVfgNJL24QhPmKxRA+0= +github.com/NVIDIA/aistore v1.3.24-0.20240827150748-31d1a799f7e5/go.mod h1:si83S9r29vwIC0f0CE2Mk+25bFiaN6mmVlmuBpP4hHM= github.com/OneOfOne/xxhash v1.2.8 h1:31czK/TI9sNkxIKfaUfGlU47BAxQ0ztGgd9vPyqimf8= github.com/OneOfOne/xxhash 
v1.2.8/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdIIOT9Um7Q= github.com/VividCortex/ewma v1.1.1/go.mod h1:2Tkkvm3sRDVXaiyucHiACn4cqf7DpdyLvmxzcbUokwA= diff --git a/core/meta/smap.go b/core/meta/smap.go index 8126c2fd45..7ca32e26f1 100644 --- a/core/meta/smap.go +++ b/core/meta/smap.go @@ -461,6 +461,22 @@ func (m *Smap) HasActiveTs(except string) bool { return false } +func (m *Smap) HasPeersToRebalance(except string) bool { + for tid, t := range m.Tmap { + if tid == except { + continue + } + if !t.InMaintOrDecomm() { + return true + } + // is a "peer" if still transitioning to post-rebalance state + if !t.Flags.IsSet(SnodeMaintPostReb) { + return true + } + } + return false +} + func (m *Smap) CountActivePs() (count int) { for _, p := range m.Pmap { if !p.InMaintOrDecomm() { diff --git a/reb/globrun.go b/reb/globrun.go index 067c762940..e7c4f52db6 100644 --- a/reb/globrun.go +++ b/reb/globrun.go @@ -196,7 +196,7 @@ func (reb *Reb) RunRebalance(smap *meta.Smap, id int64, notif *xact.NotifXact, t reb.regRecv() - haveStreams := smap.HasActiveTs(core.T.SID()) + haveStreams := smap.HasPeersToRebalance(core.T.SID()) if bmd.IsEmpty() { haveStreams = false } @@ -552,16 +552,16 @@ func (reb *Reb) runNoEC(rargs *rebArgs) error { func (reb *Reb) rebWaitAck(rargs *rebArgs) (errCnt int) { var ( - cnt int - logHdr = reb.logHdr(rargs.id, rargs.smap) - sleep = rargs.config.Timeout.CplaneOperation.D() - maxwt = rargs.config.Rebalance.DestRetryTime.D() - xreb = reb.xctn() - smap = rargs.smap + cnt int + sleep = rargs.config.Timeout.CplaneOperation.D() + maxwt = rargs.config.Rebalance.DestRetryTime.D() + xreb = reb.xctn() + smap = rargs.smap ) maxwt += time.Duration(int64(time.Minute) * int64(rargs.smap.CountTargets()/10)) maxwt = min(maxwt, rargs.config.Rebalance.DestRetryTime.D()*2) reb.changeStage(rebStageWaitAck) + logHdr := reb.logHdr(rargs.id, rargs.smap) for { curwt := time.Duration(0)