From ed4d00c0e736ec61ed41099f5bc7df96fcee6bab Mon Sep 17 00:00:00 2001 From: Alex Aizman Date: Sun, 1 Dec 2024 09:27:52 -0500 Subject: [PATCH] stop rebalance with force * CLI: `ais stop --force` option * UUID cannot begin with 'g' - only rebalance can * rebalance: - always check for maintenance/decommission in progress (fix) - force to stop despite maintenance/decommission (advanced usage) * refactoring; usability; log Signed-off-by: Alex Aizman --- ais/prxclu.go | 50 ++++++++++++++++++++++++++----------- cmd/cli/cli/cluster_hdlr.go | 15 ++++++----- cmd/cli/cli/const.go | 2 +- cmd/cli/cli/job_hdlr.go | 50 +++++++++++++++++++++---------------- cmn/cos/uuid.go | 5 ++-- xact/api.go | 2 +- xact/base.go | 2 +- 7 files changed, 78 insertions(+), 48 deletions(-) diff --git a/ais/prxclu.go b/ais/prxclu.go index 2c9c02f7cfd..0366eabe897 100644 --- a/ais/prxclu.go +++ b/ais/prxclu.go @@ -1318,33 +1318,30 @@ func (p *proxy) blobdl(smap *smapX, xargs *xact.ArgsMsg, msg *apc.ActMsg) (tsi * } func (p *proxy) xstop(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) { - var ( - xargs = xact.ArgsMsg{} - ) + var xargs xact.ArgsMsg if err := cos.MorphMarshal(msg.Value, &xargs); err != nil { p.writeErrf(w, r, cmn.FmtErrMorphUnmarshal, p.si, msg.Action, msg.Value, err) return } + xargs.Kind, _ = xact.GetKindName(xargs.Kind) // display name => kind + // note: of all xaction kinds only rebalance can have a "valid rebalance ID" (see `cos.GenUUID`) + // make an exception for rebalance: assign its kind to reinforce maintenance check below + if xargs.Kind == "" && xact.IsValidRebID(xargs.ID) { + xargs.Kind = apc.ActRebalance + } + // (lso + tco) special p.lstca.abort(&xargs) if xargs.Kind == apc.ActRebalance { + // unless forced: // disallow aborting rebalance during // critical (meta.SnodeMaint => meta.SnodeMaintPostReb) and (meta.SnodeDecomm => removed) transitions - smap := p.owner.smap.get() - for _, tsi := range smap.Tmap { - if tsi.Flags.IsAnySet(meta.SnodeMaint) && !tsi.Flags.IsAnySet(meta.SnodeMaintPostReb) { - p.writeErrf(w, r, "cannot abort %s: putting %s in maintenance mode - rebalancing...", - xargs.String(), tsi.StringEx()) - return - } - if tsi.Flags.IsAnySet(meta.SnodeDecomm) { - p.writeErrf(w, r, "cannot abort %s: decommissioning %s - rebalancing...", - xargs.String(), tsi.StringEx()) - return - } + if err := p._checkMaint(&xargs); err != nil { + p.writeErr(w, r, err) + return } } @@ -1364,6 +1361,29 @@ func (p *proxy) xstop(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) { freeBcastRes(results) } +func (p *proxy) _checkMaint(xargs *xact.ArgsMsg) error { + smap := p.owner.smap.get() + for _, tsi := range smap.Tmap { + switch { + case tsi.Flags == 0: + // do nothing + case tsi.Flags.IsAnySet(meta.SnodeMaint) && !tsi.Flags.IsAnySet(meta.SnodeMaintPostReb): + warn := "cluster is currently rebalancing while " + tsi.StringEx() + " transitions to maintenance mode" + if !xargs.Force { + return fmt.Errorf("cannot abort %s: %s", xargs.String(), warn) + } + nlog.Errorln("Warning:", warn, "- proceeding anyway") + case tsi.Flags.IsAnySet(meta.SnodeDecomm): + warn := "cluster is currently rebalancing while " + tsi.StringEx() + " is being decommissioned" + if !xargs.Force { + return fmt.Errorf("cannot abort %s: %s", xargs.String(), warn) + } + nlog.Errorln("Warning:", warn, "- proceeding anyway") + } + } + return nil +} + func (p *proxy) rebalanceCluster(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) { // note operational priority over config-disabled `errRebalanceDisabled` if err := p.canRebalance(); err != nil && err != errRebalanceDisabled { diff --git a/cmd/cli/cli/cluster_hdlr.go b/cmd/cli/cli/cluster_hdlr.go index f9019567ab1..53d9b3facc7 100644 --- a/cmd/cli/cli/cluster_hdlr.go +++ b/cmd/cli/cli/cluster_hdlr.go @@ -92,13 +92,13 @@ var ( Name: commandStart, Usage: "rebalance ais cluster", Flags: clusterCmdsFlags[commandStart], - Action: startClusterRebalanceHandler, + Action: startRebHandler, } stopRebalance = cli.Command{ Name: commandStop, Usage: "stop rebalancing ais cluster", Flags: clusterCmdsFlags[commandStop], - Action: stopClusterRebalanceHandler, + Action: stopRebHandler, } clusterCmd = cli.Command{ @@ -543,11 +543,11 @@ func setPrimaryHandler(c *cli.Context) error { return err } -func startClusterRebalanceHandler(c *cli.Context) (err error) { +func startRebHandler(c *cli.Context) (err error) { return startXactionKind(c, apc.ActRebalance) } -func stopClusterRebalanceHandler(c *cli.Context) error { +func stopRebHandler(c *cli.Context) error { xargs := xact.ArgsMsg{Kind: apc.ActRebalance, OnlyRunning: true} _, snap, err := getAnyXactSnap(&xargs) if err != nil { @@ -556,12 +556,15 @@ func stopClusterRebalanceHandler(c *cli.Context) error { if snap == nil { return errors.New("rebalance is not running") } + return stopReb(c, snap.ID) +} - xargs.ID, xargs.OnlyRunning = snap.ID, false +func stopReb(c *cli.Context, xid string) error { + xargs := xact.ArgsMsg{Kind: apc.ActRebalance, ID: xid, Force: flagIsSet(c, forceFlag)} if err := xstop(&xargs); err != nil { return V(err) } - fmt.Fprintf(c.App.Writer, "Stopped %s[%s]\n", apc.ActRebalance, snap.ID) + actionDone(c, fmt.Sprintf("Stopped %s[%s]\n", apc.ActRebalance, xid)) return nil } diff --git a/cmd/cli/cli/const.go b/cmd/cli/cli/const.go index 5813392ad58..27f737828d8 100644 --- a/cmd/cli/cli/const.go +++ b/cmd/cli/cli/const.go @@ -497,7 +497,7 @@ var ( indent1 + "\t see also: 'ais bucket props show' and 'ais bucket props set')", } - forceFlag = cli.BoolFlag{Name: "force,f", Usage: "force an action"} + forceFlag = cli.BoolFlag{Name: "force,f", Usage: "force execution of the command " + advancedUsageOnly} forceClnFlag = cli.BoolFlag{ Name: forceFlag.Name, Usage: "disregard interrupted rebalance and possibly other conditions preventing full cleanup\n" + diff --git a/cmd/cli/cli/job_hdlr.go b/cmd/cli/cli/job_hdlr.go index 608d959a6f7..a54fad98296 100644 --- a/cmd/cli/cli/job_hdlr.go +++ b/cmd/cli/cli/job_hdlr.go @@ -51,6 +51,7 @@ const stopUsage = "terminate a single batch job or multiple jobs, e.g.:\n" + indent1 + "\t- 'stop tco-cysbohAGL'\t- terminate a given job identified by its unique ID;\n" + indent1 + "\t- 'stop copy-listrange'\t- terminate all multi-object copies;\n" + indent1 + "\t- 'stop copy-objects'\t- same as above (using display name);\n" + + indent1 + "\t- 'stop g731 --force'\t- forcefully abort global rebalance g731 (advanced usage only);\n" + indent1 + "\t- 'stop --all'\t- terminate all running jobs\n" + indent1 + tabHelpOpt + "." @@ -184,7 +185,7 @@ var ( Name: commandRebalance, Usage: "rebalance ais cluster", Flags: clusterCmdsFlags[commandStart], - Action: startClusterRebalanceHandler, + Action: startRebHandler, }, cleanupCmd, jobStartResilver, @@ -198,6 +199,7 @@ var ( stopCmdsFlags = []cli.Flag{ allRunningJobsFlag, regexJobsFlag, + forceFlag, yesFlag, } jobStopSub = cli.Command{ @@ -732,25 +734,27 @@ func stopJobHandler(c *cli.Context) error { actionWarn(c, warn) } - regex := parseStrFlag(c, regexJobsFlag) - - if xid != "" && (flagIsSet(c, allRunningJobsFlag) || regex != "") { - warn := fmt.Sprintf("in presence of %s argument ('%s') flags %s and %s will be ignored", - jobIDArgument, xid, qflprn(allRunningJobsFlag), qflprn(regexJobsFlag)) - actionWarn(c, warn) - } else if xid == "" && (flagIsSet(c, allRunningJobsFlag) || regex != "") { - switch name { - case cmdDownload, cmdDsort: - // regex supported - case commandRebalance: - warn := fmt.Sprintf("global rebalance is global (ignoring %s and %s flags)", - qflprn(allRunningJobsFlag), qflprn(regexJobsFlag)) - actionWarn(c, warn) - default: - if regex != "" { - warn := fmt.Sprintf("ignoring flag %s - "+NIY, qflprn(regexJobsFlag)) - actionWarn(c, warn) - } + // warn + var ( + warn string + regex = parseStrFlag(c, regexJobsFlag) + ) + switch { + case flagIsSet(c, allRunningJobsFlag) && regex != "": + warn = fmt.Sprintf("flags %s and %s", qflprn(allRunningJobsFlag), qflprn(regexJobsFlag)) + case flagIsSet(c, allRunningJobsFlag): + warn = "flag " + qflprn(allRunningJobsFlag) + case regex != "": + warn = "option " + qflprn(regexJobsFlag) + } + if warn != "" { + switch { + case xid != "": + actionWarn(c, fmt.Sprintf("ignoring %s in presence of %s argument ('%s')", warn, jobIDArgument, xid)) + case name == commandRebalance: + actionWarn(c, "global rebalance is _global_ - ignoring"+warn) + case regex != "" && name != cmdDownload && name != cmdDsort: + actionWarn(c, "ignoring "+warn+" -"+NIY) } } @@ -794,7 +798,11 @@ func stopJobHandler(c *cli.Context) error { case commandETL: return stopETLs(c, otherID /*etl name*/) case commandRebalance: - return stopClusterRebalanceHandler(c) + if xid == "" { + return stopRebHandler(c) + } else { + return stopReb(c, xid) + } } // generic xstop diff --git a/cmn/cos/uuid.go b/cmn/cos/uuid.go index 404f3fb83d4..b5bb6304bcd 100644 --- a/cmn/cos/uuid.go +++ b/cmn/cos/uuid.go @@ -57,12 +57,11 @@ func InitShortID(seed uint64) { func GenUUID() (uuid string) { var h, t string uuid = sid.MustGenerate() - if !isAlpha(uuid[0]) { + if c := uuid[0]; c == 'g' || !isAlpha(c) { // see also: `xact.RebID2S` tie := int(rtie.Add(1)) h = string(rune('A' + tie%26)) } - c := uuid[len(uuid)-1] - if c == '-' || c == '_' { + if c := uuid[len(uuid)-1]; c == '-' || c == '_' { tie := int(rtie.Add(1)) t = string(rune('a' + tie%26)) } diff --git a/xact/api.go b/xact/api.go index ea17627be92..ea65f34404b 100644 --- a/xact/api.go +++ b/xact/api.go @@ -384,7 +384,7 @@ func CheckValidUUID(id string) (err error) { func (args *ArgsMsg) String() string { var sb strings.Builder sb.Grow(128) - sb.WriteString("xargs-") + sb.WriteString("xa-") sb.WriteString(args.Kind) sb.WriteByte('[') if args.ID != "" { diff --git a/xact/base.go b/xact/base.go index d9e864dbd7a..8b85e87cbf4 100644 --- a/xact/base.go +++ b/xact/base.go @@ -416,7 +416,7 @@ func IsValidRebID(id string) (valid bool) { _, err := S2RebID(id) valid = err == nil } - return + return valid } func CompareRebIDs(someID, fltID string) int {