Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: checker handle source unavailable #1134

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 10 additions & 17 deletions checker/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,24 +132,17 @@ func (triggerChecker *TriggerChecker) handleFetchError(checkData moira.CheckData
triggerChecker.trigger.ClusterKey(),
)
}
case remote.ErrRemoteTriggerResponse:
timeSinceLastSuccessfulCheck := checkData.Timestamp - checkData.LastSuccessfulCheckTimestamp
if timeSinceLastSuccessfulCheck >= triggerChecker.ttl {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Убрал это условие, т.к. оно было очень странным:

  • если "повезло" и источник оказался недоступен в момент проверки триггера, то триггер перейдёт в EXCEPTION, если с момента проверки прошло более ttl секунд (это которые "Set NODATA if has no value for ttl seconds"). Плюс к этому здесь генерилось 2 NotificationEvent, в случае выполнения условия.
  • иначе триггер состояние не поменяет + к этому, если источник остаётся не доступным, то триггер вообще не будет проверяться (т.к. если источник недоступен, то триггеры не шедулятся на проверку), т.е. не перейдёт в EXCEPTION

Если я что-то не учёл/не правильно понял, сигнализируйте)

checkData.State = moira.StateEXCEPTION
checkData.Message = fmt.Sprintf("Remote server unavailable. Trigger is not checked for %d seconds", timeSinceLastSuccessfulCheck)

var comparingErr error
checkData, comparingErr = triggerChecker.compareTriggerStates(checkData)
if comparingErr != nil {
triggerChecker.logger.Error().
Error(comparingErr).
String(moira.LogFieldNameTriggerID, triggerChecker.triggerID).
Msg("cannot compare trigger states")
}
}
case remote.ErrRemoteUnavailable:
checkData.State = moira.StateEXCEPTION
checkData.Message = fmt.Sprintf("Remote server unavailable. Trigger is not checked since: %v", checkData.LastSuccessfulCheckTimestamp)

logTriggerCheckException(triggerChecker.logger, triggerChecker.triggerID, err)
case local.ErrUnknownFunction, local.ErrEvalExpr:
triggerChecker.logger.Error().
String(moira.LogFieldNameTriggerID, triggerChecker.triggerID).
String("trigger.cluster_id", triggerChecker.trigger.ClusterId.String()).
String("trigger.source", triggerChecker.trigger.TriggerSource.String()).
Error(err).
Msg("Trigger check failed")
case local.ErrUnknownFunction, local.ErrEvalExpr, remote.ErrRemoteTriggerResponse:
checkData.State = moira.StateEXCEPTION
checkData.Message = err.Error()
logTriggerCheckException(triggerChecker.logger, triggerChecker.triggerID, err)
Expand Down
241 changes: 241 additions & 0 deletions checker/check_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package checker

import (
"errors"
"fmt"
"math"
"testing"
Expand All @@ -12,6 +13,7 @@ import (
logging "github.com/moira-alert/moira/logging/zerolog_adapter"
metricSource "github.com/moira-alert/moira/metric_source"
"github.com/moira-alert/moira/metric_source/local"
"github.com/moira-alert/moira/metric_source/remote"
"go.uber.org/mock/gomock"

"github.com/moira-alert/moira/metrics"
Expand Down Expand Up @@ -1924,3 +1926,242 @@ func TestTriggerChecker_handlePrepareError(t *testing.T) {
})
})
}

func TestTriggerChecker_handleFetchError(t *testing.T) {
Convey("Test handleFetchError", t, func() {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
dataBase := mock_moira_alert.NewMockDatabase(mockCtrl)
logger, _ := logging.GetLogger("Test")

metric := "some.metric"
testTime := time.Date(2022, time.June, 6, 10, 0, 0, 0, time.UTC).Unix()

trigger := &moira.Trigger{
ID: "test trigger",
TriggerSource: moira.GraphiteLocal,
ClusterId: moira.DefaultCluster,
}
triggerChecker := TriggerChecker{
triggerID: trigger.ID,
from: testTime - 50,
until: testTime,
trigger: trigger,
database: dataBase,
logger: logger,
ttlState: moira.TTLStateNODATA,
lastCheck: &moira.CheckData{
State: moira.StateOK,
Timestamp: testTime - 10,
Metrics: map[string]moira.MetricState{
metric: {
State: moira.StateOK,
Timestamp: testTime - 41,
},
},
},
}

Convey("with ErrTriggerHasEmptyTargets, ErrTriggerHasOnlyWildcards", func() {
errorList := []error{
ErrTriggerHasEmptyTargets{},
ErrTriggerHasOnlyWildcards{},
}

Convey("when triggerChecker.ttl == 0", func() {
triggerChecker.ttl = 0

for i, givenErr := range errorList {
Convey(fmt.Sprintf("Case %v: %T", i+1, givenErr), func() {
expectedCheckData := moira.CheckData{
Score: int64(1_000),
Metrics: triggerChecker.lastCheck.Metrics,
State: triggerChecker.ttlState.ToTriggerState(),
Timestamp: triggerChecker.until,
Message: givenErr.Error(),
MetricsToTargetRelation: map[string]string{},
}

dataBase.EXPECT().SetTriggerLastCheck(
triggerChecker.triggerID,
&expectedCheckData,
triggerChecker.trigger.ClusterKey(),
).Return(nil).Times(1)

err := triggerChecker.handleFetchError(newCheckData(triggerChecker.lastCheck, triggerChecker.until), givenErr)
So(err, ShouldBeNil)
})
}
})

Convey("when triggerChecker.ttl != 0", func() {
triggerChecker.ttl = 600

for i, givenErr := range errorList {
Convey(fmt.Sprintf("Case %v: %T", i+1, givenErr), func() {
expectedCheckData := moira.CheckData{
Score: int64(1_000),
Metrics: triggerChecker.lastCheck.Metrics,
State: triggerChecker.ttlState.ToTriggerState(),
Timestamp: triggerChecker.until,
EventTimestamp: triggerChecker.until,
Message: givenErr.Error(),
MetricsToTargetRelation: map[string]string{},
}

dataBase.EXPECT().PushNotificationEvent(
&moira.NotificationEvent{
IsTriggerEvent: true,
TriggerID: triggerChecker.triggerID,
State: triggerChecker.ttlState.ToTriggerState(),
OldState: triggerChecker.lastCheck.State,
Timestamp: triggerChecker.until,
Metric: triggerChecker.trigger.Name,
},
true,
).Return(nil).Times(1)
dataBase.EXPECT().SetTriggerLastCheck(
triggerChecker.triggerID,
&expectedCheckData,
triggerChecker.trigger.ClusterKey(),
).Return(nil).Times(1)

err := triggerChecker.handleFetchError(newCheckData(triggerChecker.lastCheck, triggerChecker.until), givenErr)
So(err, ShouldBeNil)
})
}
})
})

Convey("with graphite remote unavailable", func() {
givenErr := remote.ErrRemoteUnavailable{
InternalError: errors.New("some err"),
}

triggerChecker.ttl = 10
triggerChecker.lastCheck.LastSuccessfulCheckTimestamp = triggerChecker.until - 20

expectedCheckData := moira.CheckData{
Score: int64(100_000),
Metrics: triggerChecker.lastCheck.Metrics,
State: moira.StateEXCEPTION,
Timestamp: triggerChecker.until,
EventTimestamp: triggerChecker.until,
LastSuccessfulCheckTimestamp: triggerChecker.lastCheck.LastSuccessfulCheckTimestamp,
Message: fmt.Sprintf(
"Remote server unavailable. Trigger is not checked since: %v",
triggerChecker.lastCheck.LastSuccessfulCheckTimestamp),
MetricsToTargetRelation: map[string]string{},
}

dataBase.EXPECT().PushNotificationEvent(
&moira.NotificationEvent{
IsTriggerEvent: true,
TriggerID: triggerChecker.triggerID,
State: moira.StateEXCEPTION,
OldState: triggerChecker.lastCheck.State,
Timestamp: triggerChecker.until,
Metric: triggerChecker.trigger.Name,
},
true,
).Return(nil).Times(1)
dataBase.EXPECT().SetTriggerLastCheck(
triggerChecker.triggerID,
&expectedCheckData,
triggerChecker.trigger.ClusterKey(),
).Return(nil).Times(1)

err := triggerChecker.handleFetchError(newCheckData(triggerChecker.lastCheck, triggerChecker.until), givenErr)
So(err, ShouldBeNil)
})

Convey("with bad functions, problems in expressions, etc", func() {
errorsList := []error{
local.ErrorUnknownFunction(errors.New("unknown func \"dance\"")),
local.ErrorEvalExpression(errors.New("eval expr"), "badExpr(dancing.mops"),
remote.ErrRemoteTriggerResponse{
InternalError: errors.New("user write bad target"),
Target: "bad target",
},
}

for i, givenErr := range errorsList {
Convey(fmt.Sprintf("Case %v: %T", i+1, givenErr), func() {
expectedCheckData := moira.CheckData{
Score: int64(100_000),
Metrics: triggerChecker.lastCheck.Metrics,
State: moira.StateEXCEPTION,
Timestamp: triggerChecker.until,
EventTimestamp: triggerChecker.until,
LastSuccessfulCheckTimestamp: triggerChecker.lastCheck.LastSuccessfulCheckTimestamp,
Message: givenErr.Error(),
MetricsToTargetRelation: map[string]string{},
}

dataBase.EXPECT().PushNotificationEvent(
&moira.NotificationEvent{
IsTriggerEvent: true,
TriggerID: triggerChecker.triggerID,
State: moira.StateEXCEPTION,
OldState: triggerChecker.lastCheck.State,
Timestamp: triggerChecker.until,
Metric: triggerChecker.trigger.Name,
},
true,
).Return(nil).Times(1)
dataBase.EXPECT().SetTriggerLastCheck(
triggerChecker.triggerID,
&expectedCheckData,
triggerChecker.trigger.ClusterKey(),
).Return(nil).Times(1)

err := triggerChecker.handleFetchError(newCheckData(triggerChecker.lastCheck, triggerChecker.until), givenErr)
So(err, ShouldBeNil)
})
}
})

Convey("with undefined err", func() {
givenErr := errors.New("some undefined error")

checkMetrics, err := metrics.ConfigureCheckerMetrics(
metrics.NewDummyRegistry(),
[]moira.ClusterKey{moira.DefaultLocalCluster},
).GetCheckMetricsBySource(moira.DefaultLocalCluster)
So(err, ShouldBeNil)

triggerChecker.metrics = checkMetrics

expectedCheckData := moira.CheckData{
Score: int64(100_000),
Metrics: triggerChecker.lastCheck.Metrics,
State: moira.StateEXCEPTION,
Timestamp: triggerChecker.until,
EventTimestamp: triggerChecker.until,
LastSuccessfulCheckTimestamp: triggerChecker.lastCheck.LastSuccessfulCheckTimestamp,
Message: givenErr.Error(),
MetricsToTargetRelation: map[string]string{},
}

dataBase.EXPECT().PushNotificationEvent(
&moira.NotificationEvent{
IsTriggerEvent: true,
TriggerID: triggerChecker.triggerID,
State: moira.StateEXCEPTION,
OldState: triggerChecker.lastCheck.State,
Timestamp: triggerChecker.until,
Metric: triggerChecker.trigger.Name,
},
true,
).Return(nil).Times(1)
dataBase.EXPECT().SetTriggerLastCheck(
triggerChecker.triggerID,
&expectedCheckData,
triggerChecker.trigger.ClusterKey(),
).Return(nil).Times(1)

err = triggerChecker.handleFetchError(newCheckData(triggerChecker.lastCheck, triggerChecker.until), givenErr)
So(err, ShouldBeNil)
})
})
}
8 changes: 8 additions & 0 deletions metric_source/local/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@ type ErrEvalExpr struct {
target string
}

// ErrorEvalExpression creates ErrEvalExpr with given err and target.
func ErrorEvalExpression(err error, target string) ErrEvalExpr {
return ErrEvalExpr{
internalError: err,
target: target,
}
}

// Error is implementation of golang error interface for ErrEvalExpr struct.
func (err ErrEvalExpr) Error() string {
return fmt.Sprintf("failed to evaluate target '%s': %s", err.target, err.internalError.Error())
Expand Down
5 changes: 1 addition & 4 deletions metric_source/local/eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,7 @@ func (eval *evaluator) Eval(
} else if isErrUnknownFunction(err) {
err = ErrorUnknownFunction(err)
} else {
err = ErrEvalExpr{
target: exp.ToString(),
internalError: err,
}
err = ErrorEvalExpression(err, exp.ToString())
}
}

Expand Down
Loading