diff --git a/checker/config.go b/checker/config.go
index 8c9675e78..2b875376e 100644
--- a/checker/config.go
+++ b/checker/config.go
@@ -18,6 +18,8 @@ type Config struct {
 	LogTriggersToLevel map[string]string
 	MetricEventPopBatchSize int64
 	MetricEventPopDelay time.Duration
+	// CriticalTimeOfCheck is the trigger check duration above which a warning is logged; a non-positive value disables the warning.
+	CriticalTimeOfCheck time.Duration
 }
 
 // SourceCheckConfig represents check parameters for a single metric source
diff --git a/checker/worker/trigger_handler.go b/checker/worker/trigger_handler.go
index 817b6e23b..6faae40a3 100644
--- a/checker/worker/trigger_handler.go
+++ b/checker/worker/trigger_handler.go
@@ -62,6 +62,19 @@ func (manager *WorkerManager) handleTriggerInLock(triggerID string, metrics *met
 	defer metrics.TriggersCheckTime.UpdateSince(startedAt)
 
 	err = manager.checkTrigger(triggerID)
+
+	// Warn about checks that ran longer than the configured threshold. A
+	// non-positive threshold disables the warning; without this guard a zero
+	// value would make every single check log as "too long".
+	timeSince := time.Since(startedAt)
+	if threshold := manager.Config.CriticalTimeOfCheck; threshold > 0 && timeSince > threshold {
+		manager.Logger.Warning().
+			String("trigger_id", triggerID).
+			Error(err).
+			String("time_of_check", timeSince.String()).
+			Msg("It took too long to check trigger")
+	}
+
 	return err
 }
 
diff --git a/cmd/checker/config.go b/cmd/checker/config.go
index d05be0f71..f035e3f25 100644
--- a/cmd/checker/config.go
+++ b/cmd/checker/config.go
@@ -46,6 +46,8 @@ type checkerConfig struct {
 	MetricEventPopBatchSize int `yaml:"metric_event_pop_batch_size"`
 	// Metric event pop operation delay
 	MetricEventPopDelay string `yaml:"metric_event_pop_delay"`
+	// Trigger check duration above which the check is logged as taking too long
+	CriticalTimeOfCheck string `yaml:"critical_time_of_check"`
 }
 
 func handleParallelChecks(parallelChecks *int) bool {
@@ -119,6 +121,7 @@ func (config *config) getSettings(logger moira.Logger) *checker.Config {
 		LogTriggersToLevel: logTriggersToLevel,
 		MetricEventPopBatchSize: int64(config.Checker.MetricEventPopBatchSize),
 		MetricEventPopDelay: to.Duration(config.Checker.MetricEventPopDelay),
+		CriticalTimeOfCheck: to.Duration(config.Checker.CriticalTimeOfCheck),
 	}
 }
 
@@ -138,6 +141,7 @@ func getDefault() config {
 			NoDataCheckInterval: "60s",
 			LazyTriggersCheckInterval: "10m",
 			StopCheckingInterval: "30s",
+			CriticalTimeOfCheck: "1h",
 		},
 		Telemetry: cmd.TelemetryConfig{
 			Listen: ":8092",