diff --git a/common/quotas/global/algorithm/requestweighted.go b/common/quotas/global/algorithm/requestweighted.go index 1770c36e6f3..38040a8f2c3 100644 --- a/common/quotas/global/algorithm/requestweighted.go +++ b/common/quotas/global/algorithm/requestweighted.go @@ -130,7 +130,10 @@ import ( "github.com/uber/cadence/common/clock" "github.com/uber/cadence/common/dynamicconfig" + "github.com/uber/cadence/common/log" + "github.com/uber/cadence/common/log/tag" "github.com/uber/cadence/common/metrics" + "github.com/uber/cadence/common/quotas/global/shared" ) type ( @@ -155,8 +158,9 @@ type ( impl struct { // intentionally value-typed so caller cannot mutate the fields. // manually copy data if this changes. - cfg Config - scope metrics.Scope + cfg Config + scope metrics.Scope + logger log.Logger // mut protects usage, as it is the only mutable data mut sync.Mutex @@ -266,6 +270,11 @@ type ( const ( guessNumKeys = 1024 // guesstimate at num of ratelimit keys in a cluster guessHostCount = 32 // guesstimate at num of frontend hosts in a cluster that receive traffic for each key + + // Some event-RPSes exceed 100k, but none get close to 1m. + // So, to add some buffer, 10m per second is considered "impossible". + // This is quite vague and can be changed, it essentially just serves as the logging threshold. + guessImpossibleRps = 10_000_000 ) func (p UpdateParams) Validate() error { @@ -356,11 +365,12 @@ func (c configSnapshot) missedUpdateScalar(dataAge time.Duration) PerSecond { // // This instance is effectively single-threaded, but a small sharding wrapper should allow better concurrent // throughput if needed (bound by CPU cores, as it's moderately CPU-costly). -func New(met metrics.Client, cfg Config) (RequestWeighted, error) { +func New(met metrics.Client, logger log.Logger, cfg Config) (RequestWeighted, error) { i := &impl{ - cfg: cfg, - scope: met.Scope(metrics.GlobalRatelimiterAggregator), - usage: make(map[Limit]map[Identity]requests, guessNumKeys), // start out relatively large + cfg: cfg, + scope: met.Scope(metrics.GlobalRatelimiterAggregator), + logger: logger.WithTags(tag.ComponentGlobalRatelimiter), + usage: make(map[Limit]map[Identity]requests, guessNumKeys), // start out relatively large clock: clock.NewRealTimeSource(), } @@ -394,8 +404,29 @@ func (a *impl) Update(p UpdateParams) error { var next requests prev := ih[p.ID] - aps := PerSecond(float64(req.Accepted) / float64(p.Elapsed/time.Second)) - rps := PerSecond(float64(req.Rejected) / float64(p.Elapsed/time.Second)) + + // sanity check: elapsed time should not be less than 1s, so just cap it. + // in practice this should always be safe with a >=1s configured rate, as + // the caller should not send *more* frequently than every 1s (monotonic time). + // + // but this is rather easy to trigger in tests / fuzzing, + // and extreme values lead to irrational math either way. + elapsed := math.Max(float64(p.Elapsed), float64(time.Second)) + aps := shared.SanityLogFloat(0, PerSecond(float64(req.Accepted)/(elapsed/float64(time.Second))), guessImpossibleRps, "accepted rps", a.logger) + rps := shared.SanityLogFloat(0, PerSecond(float64(req.Rejected)/(elapsed/float64(time.Second))), guessImpossibleRps, "rejected rps", a.logger) + + // zeros are not worth recording, and this also simplifies math elsewhere + // for two major reasons: + // - it prevents some divide-by-zero scenarios by simply not having actual zeros + // - it prevents weights from perpetually lowering if zeros are repeatedly sent, where they may eventually reach zero + // + // these keys will eventually gc, just leave them alone until that happens. + // currently this gc relies on the assumption that HostUsage will be called with the same set of keys "soon", + // but that is fairly easy to fix if needed. + if rps+aps == 0 { + continue + } + if prev.lastUpdate.IsZero() { initialized++ next = requests{ @@ -425,8 +456,8 @@ func (a *impl) Update(p UpdateParams) error { // TODO: max(1, actual) so this does not lead to <1 rps allowances? or maybe just 1+actual and then reduce in used-responses? // otherwise currently this may lead to rare callers getting 0.0001 rps, // and never recovering, despite steady and fair usage. - accepted: weighted(aps, prev.accepted*reduce, snap.weight), - rejected: weighted(rps, prev.rejected*reduce, snap.weight), + accepted: shared.SanityLogFloat(0, weighted(aps, prev.accepted*reduce, snap.weight), guessImpossibleRps, "weighted accepted rps", a.logger), + rejected: shared.SanityLogFloat(0, weighted(rps, prev.rejected*reduce, snap.weight), guessImpossibleRps, "weighted rejected rps", a.logger), } } } @@ -464,7 +495,10 @@ func (a *impl) getWeightsLocked(key Limit, snap configSnapshot) (weights map[Ide continue } - reduce := snap.missedUpdateScalar(age) + // should never be zero, `shouldGC` takes care of that. + reduce := shared.SanityLogFloat(0, snap.missedUpdateScalar(age), 1, "missed update", a.logger) + // similarly: should never be zero, accepted + rejected must be nonzero or they are not inserted. + // this may be reduced to very low values, but still far from == 0. actual := HostWeight((reqs.accepted + reqs.rejected) * reduce) weights[id] = actual // populate with the reduced values so it doesn't have to be calculated again @@ -480,10 +514,20 @@ func (a *impl) getWeightsLocked(key Limit, snap configSnapshot) (weights map[Ide return nil, 0, met } + // zeros anywhere here should not be possible - they are prevented from being inserted, + // and anything simply "losing weight" will only become "rather low", not zero, + // before enough passes have occurred to garbage collect it. + // + // specifically, 1rps -> 0rps takes over 1,000 halving cycles, and a single 1rps event + // during that will immediately re-set it above 0.5 and will need 1,000+ more cycles. + // + // if gc period / weight amount is set extreme enough this is "possible", + // but we are unlikely to ever cause it. for id := range ir { - // scale by the total. - // this also ensures all values are between 0 and 1 (inclusive) - weights[id] = weights[id] / total + // normalize by the total. + // this also ensures all values are between 0 and 1 (inclusive), + // though zero should be impossible. + weights[id] = shared.SanityLogFloat(0, weights[id]/total, 1, "normalized weight", a.logger) } met.Limits = 1 return weights, usedRPS, met @@ -511,11 +555,12 @@ func (a *impl) HostUsage(host Identity, limits []Limit) (usage map[Limit]HostUsa if len(hosts) > 0 { usage[lim] = HostUsage{ - // limit is known, has some usage on at least one host - Used: used, + // limit is known, has some usage on at least one host. + // usage has an "upper limit" because it is only the accepted RPS, not all requests received. + Used: shared.SanityLogFloat(0, used, guessImpossibleRps, "used rps", a.logger), // either known weight if there is info for this host, or zero if not. // zeros are interpreted as "unknown", the same as "not present". - Weight: hosts[host], + Weight: shared.SanityLogFloat(0, hosts[host], 1, "computed weight", a.logger), } } } diff --git a/common/quotas/global/algorithm/requestweighted_fuzz_test.go b/common/quotas/global/algorithm/requestweighted_fuzz_test.go new file mode 100644 index 00000000000..c81dbae4c7b --- /dev/null +++ b/common/quotas/global/algorithm/requestweighted_fuzz_test.go @@ -0,0 +1,266 @@ +// The MIT License (MIT) + +// Copyright (c) 2017-2020 Uber Technologies Inc. + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +package algorithm + +import ( + "encoding/binary" + "math" + "testing" + "time" + + "golang.org/x/exp/maps" + "golang.org/x/exp/slices" + + "github.com/uber/cadence/common/dynamicconfig" + "github.com/uber/cadence/common/log/testlogger" + "github.com/uber/cadence/common/metrics" + "github.com/uber/cadence/common/quotas/global/shared" +) + +// Calls Update and HostWeights as many times as there is sufficient data, +// to make sure multiple updates do not lead to Infs or NaNs, because floating point is hard. +// +// This is best to let run for a very long time after making changes, as odd sequences may be necessary. +// E.g. it took nearly 30 minutes to hit a case where `elapsed` had an `INT_MIN` value from the data bytes, +// which failed earlier attempts to math.Abs it with `-val` because `-INT_MIN == INT_MIN`. +// +// Your *local* fuzz corpus will help re-running to find more interesting things more quickly, +// but a cleared cache will have to start over and may take a long time. +func FuzzMultiUpdate(f *testing.F) { + f.Fuzz(func(t *testing.T, data []byte) { + accept, reject, elapsed, data, msg := ints(data) + if msg != "" { + t.Skip(msg) + } + var host, key string + if len(data) > 1 { + // single-char strings are probably best, some collision is a good thing. + host, key = string(data[0]), string(data[1]) + // data = data[2:] // leaving the data in there is fine, it'll just become part of the accept/reject ints. + } else { + t.Skip("not enough data for keys") + } + + l, obs := testlogger.NewObserved(t) // failed sanity checks will fail the fuzz test, as they use .Error + i, err := New(metrics.NewNoopMetricsClient(), l, Config{ + // TODO: fuzz test with different weights too? though only some human-friendly values are likely, like 0.25..0.75 + // extremely high or low values are kinda-intentionally allowed to misbehave since they're really not rational to set, + // but it might be a good exercise to make sure the math is reasonable even in those edge cases... + NewDataWeight: func(opts ...dynamicconfig.FilterOption) float64 { return 0.5 }, + UpdateInterval: func(opts ...dynamicconfig.FilterOption) time.Duration { return time.Second }, + DecayAfter: func(opts ...dynamicconfig.FilterOption) time.Duration { return time.Hour }, + GcAfter: func(opts ...dynamicconfig.FilterOption) time.Duration { return time.Hour }, + }) + if err != nil { + f.Fatal(err) + } + + // if it takes more than a couple seconds, fuzz considers it stuck. + // 1000 seems too long, 100 is pretty quick, probably just lower if necessary. + for iter := 0; iter < 100; iter++ { + // send as many updates as we have data for + t.Logf("accept=%d, reject=%d, elapsed=%d, host=%q, key=%q", accept, reject, elapsed, host, key) + + err = i.Update(UpdateParams{ + ID: Identity(host), + Load: map[Limit]Requests{ + Limit(key): { + Accepted: int(accept), + Rejected: int(reject), + }, + }, + Elapsed: time.Duration(int(elapsed)), + }) + if err != nil { + t.Fatal(err) + } + + u := i.(*impl).usage + + // collect and sort the layers of map keys, + // as determinism helps the fuzzing system choose branches better. + keySet := make(map[Limit]struct{}) + identSet := make(map[Identity]struct{}) + for limitKey, hostUsage := range u { + keySet[limitKey] = struct{}{} + for ident := range hostUsage { + identSet[ident] = struct{}{} + } + } + if accept+reject > 0 && len(keySet) == 0 { + t.Error("no identities") + } + if accept+reject > 0 && len(identSet) == 0 { + t.Error("no keys") + } + + keys := maps.Keys(keySet) + idents := maps.Keys(identSet) + slices.Sort(keys) + slices.Sort(idents) + + // scan for NaNs and Infs in internal data + for _, k := range keys { + hu := u[k] + for _, ident := range idents { + us := hu[ident] + if bad := fsane(us.accepted); bad != "" { + t.Errorf("%v for accepted rps:%q, key:%q", bad, k, ident) + } + if bad := fsane(us.rejected); bad != "" { + t.Errorf("%v for rejected rps:%q, key:%q", bad, k, ident) + } + } + } + + // and get all hosts and check the weight calculations for all keys too + for _, ident := range idents { + res, err := i.HostUsage(ident, keys) + if err != nil { + t.Fatal(err) + } + if len(res) == 0 { + t.Error("no results") + } + for _, k := range keys { + r, ok := res[k] + if !ok { + // currently all requested keys are expected to be returned + t.Errorf("key not found: %q", k) + } + if bad := fsane(r.Used); bad != "" { + t.Error(bad, "usage") + } + if r.Used < 0 { + t.Error("negative usage") + } + if bad := fsane(r.Weight); bad != "" { + t.Error(bad, "weight") + } + if r.Weight < 0 { + t.Error("negative weight") + } else if r.Weight > 1 { + t.Error("too much weight") + } + } + } + + // check for error logs, as this would imply sanity check violations + shared.AssertNoSanityCheckFailures(t, obs.TakeAll()) + + // refresh for the next round + accept, reject, elapsed, data, msg = ints(data) + if msg != "" { + break // not enough data for another round + } + if len(data) > 1 { + host, key = string(data[0]), string(data[1]) + // data = data[2:] // leaving the data in there is fine + } else { + break // not enough data for another round + } + } + }) +} + +// float sanity check because it's a lot to write out every time +func fsane[T ~float64](t T) string { + if math.IsNaN(float64(t)) { + return "NaN" + } + if math.IsInf(float64(t), 0) { + return "Inf" + } + return "" +} + +// helper to pull varints out of a pile of bytes. +// returns the unused portion of data, +// and returns a non-empty string if there was a problem +func ints(inData []byte) (accept, reject, elapsed int64, data []byte, err string) { + data = inData + accept, data, msg := getPositiveInt64(data) + if msg != "" { + return 0, 0, 0, nil, "not enough data" + } + reject, data, msg = getPositiveInt64(data) + if msg != "" { + return 0, 0, 0, nil, "not enough data" + } + elapsed, data, msg = getPositiveInt64(data) + if msg != "" { + return 0, 0, 0, nil, "not enough data" + } + if elapsed == 0 { + return 0, 0, 0, nil, "zero elapsed time, cannot use" + } + return accept, reject, elapsed, data, "" +} + +// helpers need helpers sometimes +func getPositiveInt64(data []byte) (int64, []byte, string) { + val, read := binary.Varint(data) + if read == 0 { + return 0, nil, "not enough data" + } + if read > 0 { + data = data[read:] // successfully read an int + } + if read < 0 { + data = data[-read:] // overflowed and stopped reading part way + } + if val < 0 { + val = -val + } + if val < 0 { + // -INT_MIN == INT_MIN, so the above check may have done nothing. + // so special case INT_MIN by rolling it over to INT_MAX. + val-- + } + // and last but not least: make sure it's below our "impossible" value when accept+reject are combined. + // partly this ensures random fuzzed rps doesn't exceed it (which is common), + // and partly it just asserts "we do not test irrational rates". + val = val % ((guessImpossibleRps - 1) / 2) + return val, data, "" +} + +// not covered by the larger fuzz test because it pins some "reasonable" values. +func FuzzMissedUpdate(f *testing.F) { + f.Fuzz(func(t *testing.T, decay, gc, age, rate int, weight float64) { + if decay <= 0 || gc <= 0 || age <= 0 || rate <= 0 { + t.Skip() + } + scalar := configSnapshot{ + weight: weight - math.Floor(weight), // 0..1, idk about negatives but it doesn't seem to break either + rate: time.Duration(rate), + decayAfter: time.Duration(decay), + gcAfter: time.Duration(gc), + }.missedUpdateScalar(time.Duration(age)) + if scalar < 0 || scalar > 1 { + t.Fail() + } + if bad := fsane(scalar); bad != "" { + t.Error(bad, "scalar") + } + }) +} diff --git a/common/quotas/global/algorithm/requestweighted_test.go b/common/quotas/global/algorithm/requestweighted_test.go index 86cd0d8126c..726dfdf3256 100644 --- a/common/quotas/global/algorithm/requestweighted_test.go +++ b/common/quotas/global/algorithm/requestweighted_test.go @@ -40,7 +40,9 @@ import ( "github.com/uber/cadence/common" "github.com/uber/cadence/common/clock" "github.com/uber/cadence/common/dynamicconfig" + "github.com/uber/cadence/common/log/testlogger" "github.com/uber/cadence/common/metrics" + "github.com/uber/cadence/common/quotas/global/shared" ) // just simplifies newForTest usage as most tests only care about rate @@ -60,11 +62,11 @@ func defaultConfig(rate time.Duration) configSnapshot { } } -func newValid(t require.TestingT, snap configSnapshot) (*impl, clock.MockedTimeSource) { +func newValid(t testlogger.TestingT, snap configSnapshot) (*impl, clock.MockedTimeSource) { return newForTest(t, snap, true) } -func newForTest(t require.TestingT, snap configSnapshot, validate bool) (*impl, clock.MockedTimeSource) { +func newForTest(t testlogger.TestingT, snap configSnapshot, validate bool) (*impl, clock.MockedTimeSource) { cfg := Config{ NewDataWeight: func(_ ...dynamicconfig.FilterOption) float64 { return snap.weight @@ -82,7 +84,11 @@ func newForTest(t require.TestingT, snap configSnapshot, validate bool) (*impl, var agg *impl if validate { - i, err := New(metrics.NewNoopMetricsClient(), cfg) + l, obs := testlogger.NewObserved(t) + t.Cleanup(func() { + shared.AssertNoSanityCheckFailures(t, obs.TakeAll()) + }) + i, err := New(metrics.NewNoopMetricsClient(), l, cfg) require.NoError(t, err) agg = i.(*impl) } else { @@ -767,7 +773,7 @@ func TestSimulate(t *testing.T) { require.NoError(t, err) // h2 has slightly over half weight due to greater historical use expectSimilarUsage(t, map[Limit]HostUsage{ - query: {Weight: 0.52}, + query: {Weight: 0.45}, start: {Weight: 0.53}, // still more starts than queries }, usage) usage, err = agg.HostUsage(h3, all) @@ -779,7 +785,7 @@ func TestSimulate(t *testing.T) { // - smaller numerator (lower calls by this host) // - smaller denominator (lower total calls) // - higher final value (smaller denominator has greater influence) - query: {Weight: 0.47}, + query: {Weight: 0.54}, start: {Weight: 0.46}, }, usage) }) @@ -804,8 +810,8 @@ func TestSimulate(t *testing.T) { // this is likely a faster shift than we want in practice, as it'll make allowed-request // behavior quite jumpy, which is why the initial weight is likely to be around 0.5. expectSimilarUsage(t, map[Limit]HostUsage{ - query: {Weight: 0.29, Used: 13.21}, // used is adjusting towards 15 - start: {Weight: 0.28, Used: 13.31}, + query: {Weight: 0.28, Used: 13.21}, // used is adjusting towards 15 + start: {Weight: 0.28, Used: 13.52}, }, usage) // and weights are flattening towards 0.33 usage, err = agg.HostUsage(h2, all) @@ -817,7 +823,7 @@ func TestSimulate(t *testing.T) { usage, err = agg.HostUsage(h3, all) require.NoError(t, err) expectSimilarUsage(t, map[Limit]HostUsage{ - query: {Weight: 0.35}, + query: {Weight: 0.37}, start: {Weight: 0.35}, }, usage) }) diff --git a/common/quotas/global/algorithm/testdata/fuzz/FuzzMissedUpdate/264c784f7bafbf5f b/common/quotas/global/algorithm/testdata/fuzz/FuzzMissedUpdate/264c784f7bafbf5f new file mode 100644 index 00000000000..017f12effe6 --- /dev/null +++ b/common/quotas/global/algorithm/testdata/fuzz/FuzzMissedUpdate/264c784f7bafbf5f @@ -0,0 +1,6 @@ +go test fuzz v1 +int(0) +int(60) +int(0) +int(0) +float64(0) diff --git a/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/00649674e28cdc32 b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/00649674e28cdc32 new file mode 100644 index 00000000000..8ae6d1ea41d --- /dev/null +++ b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/00649674e28cdc32 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("00\x0000") diff --git a/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/0bb6b094a7f63d70 b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/0bb6b094a7f63d70 new file mode 100644 index 00000000000..110c3990bb1 --- /dev/null +++ b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/0bb6b094a7f63d70 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("00000\x13\x89\x8c>\bەlw)4\xca|A##X\x1ca\xf6O-\xf3\x9a:\xfeS+\xe5\xafW\x1cn\xcbŴ\xe3<\xaf\xc6\xec\xb4$R\xb3\xd6Q\"\xb3\x82\x10g\xed$\xac\xdd\xc9\xfa\x88\xb7\xfe\xf3z&vY\n\x16\xfdT\x16\xcf\x12T\xbdv\x12\xf9\xb5~\xc3Y4@\x84\x16\xfc9$\xbd\xf1z\xff\x7f\xff\xff\x04\x04\xbe\xdfP\xee\xde[\xbb)\x86\v\xcd_\xd7\xca%-zO{\xccl\xbe`\x1d\xd2\xe8\x99o\xbbO\f\x10\r\x88\xaa\x1eG\x17\x19\x83}\xcfUR\xa4\x18<\xa4\xd6zhk\x8bV(\x98h\xee\xa2\xe4ns.\xb7}-u6\x83\xfa\x0f\a\x17\xd8ؑ\xd2 9ȗ(x\xa7\xfe-C\xe9{[\x01\x13\xbcE\x845C!)\x19\xf6\x1d۾\t\xd92\xfe<\xf3\xcaD2:]\x03\xfc\x16Q\x15\xc0\xd8(\x1f\x9bBg\xc1\xe8ev\xe4\xe7\xe7\xf4\x8c/\x18\x1dڱ\xfc\xb1\xfc,S<\"\x16\xfcN\x91\xc2g\xc63\x81\x87\xf8\x84\x02\x0e\xeb\xedբ\x99ˎ\x1a*\x9d\xff<)\xdb \xf0i\xa9L\x9c\xf5\x9f\xaf犨\xa1\xbc\xc4\xd4\xef\x8e>Z\t\x83?\xa0i\xdeFB\x83\x0e\x05\x05\xb2\xe0%~\xb0x\x96\xe2(FM:\xc8J\x98\xd1\xfd\x1e\x10\xef\x1b\x14\x92\x18\xab\x89\tU\xcc\x04\xe3\xe3C\x010z\x99TK\xf6N\b\u0601\xa5}t\x1dyQvem\x9c\x19]kM\xa8=\xb9:\xfd\x8e\x11\xe8\x81D\x1a%M)\xfa\xa6~k@˽\x94 U\x8d\xfbF\x17O\x905\xcc\xf2\xffMi\xa3\xdeeV\x9b\xbb\xec6\xa9\x939E\xc3\xfa\f\xbd\x92\b#,1Q\x89\xe8\x15u9\xb3{\xed\x10z\xc9!\xdfs\xe7s\x18O0\x1a\xe9\x8d\xf3\x95:\"X\x86ڰ\x19\xcb-\xfc\x0f\xa1EK`\x1fRew;/\x80\x7fP\xe1G&R\xc76\xdc,\x1a?\xe8\xcf\xc9\xeaR\xf5l\x86\xa6\xd3\tL\x14>\x87.B\x1a\xbd?Tڧ8\xa6S\xf7\xfb\x84\xf6*\xe3\xb8\xf7\x13\"c85\x93Б$\xc4{\xb9\x85J\xc0\xa9\xaam\xc5\xfc\b\xe6K\x90v\x15'J\x83\x0fK!k\xcaᚲJgw\xee\xffGYt\b\x94\xbd\x9f\xa9vG\xf0\x98\xe37\x1c\xde\x1a\xeb)n7\x8b(3H\xd1\v\xfc\x9c\xd0K\xacCd\"\xe2\xcd^i\xab\xffA\xd8[\xad\xdda\xba6\xf4\xa6\xd5\xd8`̓\xaa\x04}?%:\xe1\x9b6r2|b\xaf\x82AP\xdc\xc2\x1ct\xdc\x0e") diff --git a/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/356e28f5914a0f16 b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/356e28f5914a0f16 new file mode 100644 index 00000000000..d08ef920e1a --- /dev/null +++ b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/356e28f5914a0f16 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("00000000000000000000000000000000") diff --git a/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/449388c309f148fd b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/449388c309f148fd new file mode 100644 index 00000000000..f0258de24bb --- /dev/null +++ b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/449388c309f148fd @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\x9e\x9e\x9e\x9e\x9e\x9e\x9e\x9e\x9e0\x9e\x9e\x9e\x9e\x9e\x9e\x9e\x9e\x9e0000") diff --git a/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/582528ddfad69eb5 b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/582528ddfad69eb5 new file mode 100644 index 00000000000..a96f5599e6b --- /dev/null +++ b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/582528ddfad69eb5 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("0") diff --git a/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/754d7f941db60f1d b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/754d7f941db60f1d new file mode 100644 index 00000000000..53af33575cc --- /dev/null +++ b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/754d7f941db60f1d @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("0\xc8\xc8\xc80000") diff --git a/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/84c7bfae679f54a7 b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/84c7bfae679f54a7 new file mode 100644 index 00000000000..0ef635ae250 --- /dev/null +++ b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/84c7bfae679f54a7 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("00000000000000000000000000000001") diff --git a/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/9abdd061069970e6 b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/9abdd061069970e6 new file mode 100644 index 00000000000..91440f2b895 --- /dev/null +++ b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/9abdd061069970e6 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("00100") diff --git a/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/c50ed3d1a22fe00d b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/c50ed3d1a22fe00d new file mode 100644 index 00000000000..7d6a3a52eb7 --- /dev/null +++ b/common/quotas/global/algorithm/testdata/fuzz/FuzzMultiUpdate/c50ed3d1a22fe00d @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("00000000000000000000000000000000000000000000000000000000000000\bPG\x12\r\x14<\x9e2\x8d\xb2\f\xdc:\xe4\xd1\xf0G\x93z\xb6O\xbe\xec\xc41\xf9\x9eq|\x1c\f\x03\x83D\x90\x18\xfd\x96\xb6Z\x1a\xf0\xc4\xc9\"\x14\x15\xca\xcaw\x04\xd7\xda\xce%\x92&\x04#\x8d_e\xc1\xa0\xc4\x1e\x82\x0e\xe05\x1a҃4\xfe1-\x00}\xb4\xb1e\\~Ġd\x83\x8b\x93\x9fl@25\xeam\x19\xb4\xd5\x1d\x9c\x1b6\x7f\x96\xc0\x94\xa4\xd2\xdf\xfc\x00\x0e\xb2AOCm\xe6L\xdc~\x1e\xeb\x9a}\x1co\xa1\xc5u/29\x80JL9\xf2\xe8+\"\xe6_\x99'\xaajJt\x99\x12\x04\xfb\xff&p\xac3\x10g<\x1f\xfeX\x87P\xd8\x17\x13ڃ\xa9&\x8akQ\xf3\x13=!\xa5\x8d\xa4)\r\bC^E\x83'ʔ\x00\xac\x10w\xc3\xd9ӧ\x1e[A\xffuEQn\x8c*ar\x8b49\xc3\xce\x03u\xf3\nD\xc5A\xa8\x98\x14\xba.\x98A\x8b\x1f=<\xa2\x9a\b\\\x00(>h\xbc\x1d\xbd+/Y\x88\xb2h\x15\xf5\xd6z\x8e`\xd2[x Ϣ\xacP3h\xba\x9bWs\a\xf4'-\xb3\x97\x12g}\xcf\xea\xcarD\x12=\xc5\x16\xd9\xedâ[s\x9f\x17uX\xd5\xfa\x9ee\x85\xd0\xe0X\xe6\x1eS\xf1\x80E\xd79\xfaQ=\xc5\xe22\x85\xb1j\x7fy[+eK{\xb0\xaa'\xe8fA\x7f\x1a\xae\x8dMDŽ\xa3\x93\x13\x10͗{Qcs\xfe\x8c\xe6n\xc9\xed\xe9!|\f/W-k>}$\xed\x9c[\x990\x18\bE4|\t;\x0e\xba\x03\xa9J\xf5\x15u\xc6=\xb4\xc6WCI\xb4\xfb\xab!\x8eϽ\xb1\xad\x8e\x13\xd1\xecω\xcej\xd4\"\x8d2z\x86t4on\x189%`\xb8\xad\xbc\xa0\xae$\x81ѾU\x1fh\xec\xe4n\a\x10䖸\x883\xa9}\x94\xe6\xac:\xa7\xfbBy^\x1b\xa2\"'(e\xfcK\xa4\x01!D\xb2\xe4)\x98Y/{\x8d\x82\x97\xef\v\xe6X\xe3\x04տ\xb4\x95\xee\x8bY\x9f\x93a \xbfIǜ\xb0\x17]\xc1\xd8T\xbb\xd3\x05\xb9\x8eP\xee?\xa5Gx\x80G.&\x84'n\x93, \xdd\x13b\xf1\xb72V/8\xe0Fb\x10a\xe7\x94\xdb\x03k\x82i\xf46f=+\xce\x13E\xd2.b\xb1\x84ʴ\x0f\xf9\xac\xf8ʑ\xef+J\xe0\x7f&\xe4\xa02\xa2\x04\xea\x17k\xb3T\xb9\x8c\xd1І_o\xbby|\xa9\x96L\xc3F\xc7c\x03\xf7\xf6\x9d\x86\xe0'\x1e\\\x1aXZ\a\x9d\x9d\x01\xf3\xee\x1bf:W\x8c\x83\xacX\xb7\fs@{r\xa1\x94\xb7\xb9gwX/\x1c\x05\xb4np\xbfM\x1bL\x91\x89\xa9\xf7\x8bP\x8e\x99BA\xf3ԞU>\rZ\xb4S\xa6\xf2d\xf2\xdbo\xca\x1a%\"7\xf7\xf6\xb0\xc8\xcb\x11\x1b\xcb\x06Cq\x85\x7fW\xe2\xc1\xe99\xf7\xc6? rate.Limit(target) { + // should never exceed whole-cluster target + t.Error("boosted beyond configured limit") + } + if boosted < 0 { + // should never become negative. + // + // the ratelimiter treats negatives as zero, so this is "fine", + // but it's likely a sign of flawed logic. + t.Error("boosted is negative") + } + if math.IsNaN(float64(boosted)) { + t.Error("boosted is NaN") + } + if math.IsInf(float64(boosted), 0) { + t.Error("boosted is inf") + } + }) +} + +func anyInvalid(f ...float64) bool { + for _, v := range f { + if math.IsNaN(v) { + return true + } + if math.IsInf(v, 0) { + return true + } + } + return false +} diff --git a/common/quotas/global/collection/testdata/fuzz/FuzzBoostRPS/1b0805e3169f0ae7 b/common/quotas/global/collection/testdata/fuzz/FuzzBoostRPS/1b0805e3169f0ae7 new file mode 100644 index 00000000000..258f67f912d --- /dev/null +++ b/common/quotas/global/collection/testdata/fuzz/FuzzBoostRPS/1b0805e3169f0ae7 @@ -0,0 +1,5 @@ +go test fuzz v1 +float64(-95) +float64(-158) +float64(-52.111111111111114) +float64(0) diff --git a/common/quotas/global/collection/testdata/fuzz/FuzzBoostRPS/216cd14a71215fe2 b/common/quotas/global/collection/testdata/fuzz/FuzzBoostRPS/216cd14a71215fe2 new file mode 100644 index 00000000000..0c8f9a53882 --- /dev/null +++ b/common/quotas/global/collection/testdata/fuzz/FuzzBoostRPS/216cd14a71215fe2 @@ -0,0 +1,5 @@ +go test fuzz v1 +float64(-95) +float64(0) +float64(0) +float64(0) diff --git a/common/quotas/global/rpc/client.go b/common/quotas/global/rpc/client.go index 72c2acf3249..2f74c5737a5 100644 --- a/common/quotas/global/rpc/client.go +++ b/common/quotas/global/rpc/client.go @@ -29,7 +29,9 @@ package rpc import ( "context" + "errors" "fmt" + "math" "sync" "time" @@ -82,6 +84,10 @@ type ( // Currently, there are no truly fatal errors so this API does not return `error`. // Even in the presence of errors, some successful data may have been loaded, // and that will be part of the UpdateResult struct. + // + // As part of the contract of this call, UpdateResult will not ever contain + // NaN, Inf, or negative values, as these imply fatal calculation problems. + // It checks internally and will return an error rather than any value. Update(ctx context.Context, period time.Duration, load map[shared.GlobalKey]Calls) UpdateResult } @@ -197,6 +203,25 @@ func (c *client) updateSinglePeer(ctx context.Context, peer history.Peer, period // deserialization errors should never happen return nil, &SerializationError{err} } + + // and check the values. + // - weights must be 0..1 inclusive + // - used RPS must be non-negative + // - NaNs and Infs are always rejected + // + // any failure is fatal to the whole request, that host cannot be trusted, + // and any affected limiters should switch to their fallbacks if the issue + // persists long enough. + for key, entry := range resp { + if msg := shared.SanityCheckFloat(0, entry.Weight, 1, "weight"); msg != "" { + return nil, fmt.Errorf("bad value for key %q: from host: %q: %w", key, peer, errors.New(msg)) + } + // no upper bound, but true infs are not allowed + if msg := shared.SanityCheckFloat(0, entry.UsedRPS, math.Inf(1), "used rps"); msg != "" { + return nil, fmt.Errorf("bad value for key %q: from host: %q %w", key, peer, errors.New(msg)) + } + } + return resp, nil } diff --git a/common/quotas/global/rpc/client_test.go b/common/quotas/global/rpc/client_test.go index ebe50db521b..de8f4d7869f 100644 --- a/common/quotas/global/rpc/client_test.go +++ b/common/quotas/global/rpc/client_test.go @@ -26,6 +26,7 @@ import ( "context" "errors" "fmt" + "math" "reflect" "testing" "time" @@ -52,6 +53,14 @@ func TestClient(t *testing.T) { return impl.(*client), hc, pr } + encode := func(t *testing.T, w map[algorithm.Limit]algorithm.HostUsage) (*types.RatelimitUpdateResponse, error) { + a, err := AggregatorWeightsToAny(w) + assert.NoError(t, err, "should be impossible: could not encode aggregator response to any") + return &types.RatelimitUpdateResponse{ + Any: a, + }, nil + } + t.Run("valid request", func(t *testing.T) { c, hc, pr := setup(t) data := map[shared.GlobalKey]Calls{ @@ -69,13 +78,6 @@ func TestClient(t *testing.T) { "b": {Weight: 0.2, UsedRPS: used}, "c": {Weight: 0.3, UsedRPS: used}, } - encode := func(w map[algorithm.Limit]algorithm.HostUsage) (*types.RatelimitUpdateResponse, error) { - a, err := AggregatorWeightsToAny(w) - assert.NoError(t, err, "should be impossible: could not encode aggregator response to any") - return &types.RatelimitUpdateResponse{ - Any: a, - }, nil - } stringKeys := make([]string, 0, len(data)) for k := range data { stringKeys = append(stringKeys, string(k)) @@ -86,10 +88,10 @@ func TestClient(t *testing.T) { }, nil) hc.EXPECT(). RatelimitUpdate(gomock.Any(), matchrequest{t, []string{"a", "c"}}, matchyarpc{t, yarpc.WithShardKey("agg-1")}). - Return(encode(map[algorithm.Limit]algorithm.HostUsage{"a": {Weight: 0.1, Used: algorithm.PerSecond(used)}, "c": {Weight: 0.3, Used: algorithm.PerSecond(used)}})) + Return(encode(t, map[algorithm.Limit]algorithm.HostUsage{"a": {Weight: 0.1, Used: algorithm.PerSecond(used)}, "c": {Weight: 0.3, Used: algorithm.PerSecond(used)}})) hc.EXPECT(). RatelimitUpdate(gomock.Any(), matchrequest{t, []string{"b"}}, matchyarpc{t, yarpc.WithShardKey("agg-2")}). - Return(encode(map[algorithm.Limit]algorithm.HostUsage{"b": {Weight: 0.2, Used: algorithm.PerSecond(used)}})) + Return(encode(t, map[algorithm.Limit]algorithm.HostUsage{"b": {Weight: 0.2, Used: algorithm.PerSecond(used)}})) result := c.Update(context.Background(), time.Second, data) assert.NoError(t, result.Err) @@ -103,6 +105,24 @@ func TestClient(t *testing.T) { assert.ErrorContains(t, res.Err, "unable to shard") assert.ErrorIs(t, res.Err, err) // should contain the cause }) + t.Run("bad math", func(t *testing.T) { + // the client should validate responses to make sure they do not contain + // known-to-be-invalid data. + c, hc, pr := setup(t) + pr.EXPECT().GlobalRatelimitPeers([]string{"a"}). + Return(map[history.Peer][]string{"agg-1": {"a"}}, nil) + hc.EXPECT().RatelimitUpdate(gomock.Any(), gomock.Any(), gomock.Any()). + Return(encode(t, map[algorithm.Limit]algorithm.HostUsage{"a": { + Weight: algorithm.HostWeight(math.NaN()), // invalid value from agg, detected after deserializing + Used: 1, + }})) + + result := c.Update(context.Background(), time.Second, map[shared.GlobalKey]Calls{"a": {10, 20}}) + assert.Empty(t, result.Weights, "bad data should lead to no usable response") + assert.ErrorContains(t, result.Err, `bad value for key "a"`, "failure should mention the key") + assert.ErrorContains(t, result.Err, "is NaN", "failure should mention the bad value type") + assert.ErrorContains(t, result.Err, "potentially fatal error during ratelimit update requests", "failure should look alarming") + }) } func stringy[T ~string](in []T) []string { diff --git a/common/quotas/global/shared/sanity.go b/common/quotas/global/shared/sanity.go new file mode 100644 index 00000000000..21a391f1e21 --- /dev/null +++ b/common/quotas/global/shared/sanity.go @@ -0,0 +1,114 @@ +// The MIT License (MIT) + +// Copyright (c) 2017-2020 Uber Technologies Inc. + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +package shared + +import ( + "fmt" + "math" + "strings" + + "go.uber.org/zap" + "go.uber.org/zap/zaptest/observer" + + "github.com/uber/cadence/common/log" + "github.com/uber/cadence/common/log/tag" +) + +// SanityCheckFloat checks bounds and numerically-valid-ness, and returns an error string if any check fails. +// +// This is the "lower level" of SanityLogFloat, when invalid means behavior needs to change, +// and invalid data is an unavoidable possibility. +// +// If invalid data should *always* be avoided by earlier code, use SanityLogFloat instead to +// do a paranoid log of violations without affecting behavior. +func SanityCheckFloat[T ~float64](lower, actual, upper T, what string) (msg string) { + if math.IsNaN(float64(actual)) { + return what + " is NaN, should be impossible" + } else if math.IsInf(float64(actual), 0) { + return what + " is Inf, should be impossible" + } else if actual < lower { + return what + " is below lower bound, should be impossible" + } else if upper < actual { + return what + " is above upper bound, should be impossible" + } + return "" +} + +// SanityLogFloat ensures values are within sane bounds (inclusive) and not NaN or Inf, +// and logs an error if these expectations are violated. +// This should only be used when a value is believed to *always* be valid, not +// when it needs to be clamped to a reasonable range. +// +// This largely exists because [math.Max] and similar retain NaN/Inf instead of +// treating them as error states, and that (and other issues) caused problems. +// There are a lot of potential gotchas with floats, and without good fuzzing +// you're kinda unlikely to trigger them all, so this helps us be as paranoid as +// we like with small enough cost to ignore (while also documenting expectations). +// +// The "actual" value is returned no matter what, to make this easier to use in +// a logic chain without temp vars. +// +// Violations are intentionally NOT fatal, nor is a "reasonable" value returned, +// as they are not expected to occur and code paths involved are not built with +// that assumption. Consider it "undefined behavior", this log exists only to +// help troubleshoot the cause later if it happens. +// +// --- +// +// In tests, always assert that this is not ever triggered, by using the following: +// +// logger, obs := logtest.NewObserved(t) +// t.Cleanup(func() { AssertNoSanityCheckFailures(t, obs) }) +// // your test +func SanityLogFloat[T ~float64](lower, actual, upper T, what string, logger log.Logger) T { + msg := SanityCheckFloat(lower, actual, upper, what) + if msg != "" { + logger.Error(msg, + tag.Dynamic("lower", lower), + tag.Dynamic("actual", actual), + tag.Dynamic("upper", upper), + tag.Dynamic("actual_type", fmt.Sprintf("%T", actual))) + } + + return actual +} + +// small and easier to mock in tests, as no other methods are needed +type testingt interface { + Errorf(string, ...any) +} + +func AssertNoSanityCheckFailures(t testingt, entries []observer.LoggedEntry) { + logged := false + for _, entry := range entries { + if entry.Level.Enabled(zap.ErrorLevel) && strings.Contains(entry.Message, "impossible") { + t.Errorf("sanity check log detected: %v: %v", entry.Message, entry.Context) + if logged { + t.Errorf("ignoring any remaining sanity check failures, comment this out to get all logs") + // produces 2 logs and then gives up, as large or fuzz tests can trigger thousands and that's very slow + break + } + logged = true + } + } +} diff --git a/common/quotas/global/shared/sanity_test.go b/common/quotas/global/shared/sanity_test.go new file mode 100644 index 00000000000..6407508a894 --- /dev/null +++ b/common/quotas/global/shared/sanity_test.go @@ -0,0 +1,100 @@ +// The MIT License (MIT) + +// Copyright (c) 2017-2020 Uber Technologies Inc. + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +package shared + +import ( + "fmt" + "math" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/uber/cadence/common/log/testlogger" +) + +func TestSanityChecks(t *testing.T) { + tests := map[string]struct { + lower, actual, upper float64 + expected string + }{ + "nan": {lower: 0, actual: math.NaN(), upper: 0, expected: "is NaN"}, + "pos inf": {lower: 0, actual: math.Inf(1), upper: 0, expected: "is Inf"}, + "neg inf": {lower: 0, actual: math.Inf(-1), upper: 0, expected: "is Inf"}, + "too low": {lower: 5, actual: 1, upper: 10, expected: "below lower bound"}, + "too high": {lower: 0, actual: 10, upper: 5, expected: "above upper bound"}, + + "inclusive lower": {lower: 0, actual: 0, upper: 10, expected: ""}, + "valid": {lower: 0, actual: 5, upper: 10, expected: ""}, + "inclusive upper": {lower: 0, actual: 10, upper: 10, expected: ""}, + } + for name, test := range tests { + name, test := name, test + t.Run(name, func(t *testing.T) { + msg := SanityCheckFloat(test.lower, test.actual, test.upper, "test") + if test.expected == "" { + assert.Empty(t, msg, "should not have failed validation") + } else { + assert.Containsf(t, msg, test.expected, "wrong or missing error case") + } + }) + } + + t.Run("logs", func(t *testing.T) { + l, obs := testlogger.NewObserved(t) + _ = SanityLogFloat[float64](5, 1, 10, "test", l) + logs := obs.TakeAll() + require.Len(t, logs, 1, "should have made an error log") + assert.Contains(t, logs[0].Message, "below lower bound", "should log the issue") + assert.Contains(t, logs[0].Message, "test", "should mention the 'what'") + assert.NotZero(t, logs[0].ContextMap()["actual"], "should log the actual value") + }) + t.Run("assert helper", func(t *testing.T) { + l, obs := testlogger.NewObserved(t) + _ = SanityLogFloat[float64](5, 1, 10, "test", l) + logs := obs.TakeAll() + require.Len(t, logs, 1, "should have made an error log") + + errobs := &errorfobserver{} + AssertNoSanityCheckFailures(errobs, logs) + require.Len(t, errobs.logs, 1, "should Errorf that there was a log") + assert.Contains(t, errobs.logs[0], "sanity check log detected") + assert.Contains(t, errobs.logs[0], "below lower bound", "should contain the original message") + + errobs = &errorfobserver{} // clear the logs + tripleLog := append(logs, logs[0], logs[0]) // fake having three logs, details don't matter + AssertNoSanityCheckFailures(errobs, tripleLog) + require.Len(t, errobs.logs, 3, "should have two Errorfs and one giving up") + assert.Contains(t, errobs.logs[0], "sanity check log detected") + assert.Contains(t, errobs.logs[1], "sanity check log detected") + assert.Contains(t, errobs.logs[2], "ignoring any remaining sanity check failures") + }) +} + +type errorfobserver struct { + logs []string +} + +func (o *errorfobserver) Errorf(format string, args ...interface{}) { + o.logs = append(o.logs, fmt.Sprintf(format, args...)) +} diff --git a/service/history/handler/handler_test.go b/service/history/handler/handler_test.go index 21aa570cfc6..533b3e48f40 100644 --- a/service/history/handler/handler_test.go +++ b/service/history/handler/handler_test.go @@ -3621,6 +3621,7 @@ func TestRatelimitUpdate(t *testing.T) { require.NoError(t, err) alg, err := algorithm.New( metrics.NewNoopMetricsClient(), + testlogger.New(t), algorithm.Config{ NewDataWeight: func(opts ...dynamicconfig.FilterOption) float64 { return 0.5 }, UpdateInterval: func(opts ...dynamicconfig.FilterOption) time.Duration { return 3 * time.Second }, diff --git a/service/history/resource/resource.go b/service/history/resource/resource.go index be1e6306dfd..df7e4866c03 100644 --- a/service/history/resource/resource.go +++ b/service/history/resource/resource.go @@ -136,6 +136,7 @@ func New( ) ratelimitAlgorithm, err := algorithm.New( params.MetricsClient, + params.Logger, algorithm.Config{ NewDataWeight: config.GlobalRatelimiterNewDataWeight, UpdateInterval: config.GlobalRatelimiterUpdateInterval,