From f6979f7f0586c08cbe84087a59c49b6aee47f876 Mon Sep 17 00:00:00 2001 From: Lukasz Mierzwa Date: Tue, 31 Oct 2023 12:21:02 +0000 Subject: [PATCH] Check other Prometheus servers for missing metrics --- cmd/pint/scan.go | 2 + cmd/pint/tests/0037_disable_checks.txt | 2 +- cmd/pint/tests/0039_prom_selected_path.txt | 2 + cmd/pint/tests/0103_file_disable.txt | 2 +- cmd/pint/tests/0115_file_disable_tag.txt | 2 +- cmd/pint/tests/0144_discovery_filepath.txt | 8 +++ .../tests/0145_discovery_filepath_dup.txt | 6 +++ cmd/pint/tests/0149_discovery_prom.txt | 4 ++ .../tests/0150_discovery_prom_dup_tags.txt | 2 + .../tests/0152_discovery_prom_dup_uptime.txt | 4 ++ .../tests/0155_discovery_prom_dup_include.txt | 2 + .../tests/0156_discovery_prom_dup_exclude.txt | 2 + cmd/pint/tests/0157_series_other_servers.txt | 49 +++++++++++++++++++ docs/changelog.md | 8 +++ internal/checks/promql_series.go | 44 ++++++++++++++++- internal/config/prometheus.go | 6 ++- internal/promapi/prometheus.go | 6 +++ 17 files changed, 145 insertions(+), 6 deletions(-) create mode 100644 cmd/pint/tests/0157_series_other_servers.txt diff --git a/cmd/pint/scan.go b/cmd/pint/scan.go index 79010fd9..61a3f009 100644 --- a/cmd/pint/scan.go +++ b/cmd/pint/scan.go @@ -16,6 +16,7 @@ import ( "github.com/cloudflare/pint/internal/config" "github.com/cloudflare/pint/internal/discovery" "github.com/cloudflare/pint/internal/output" + "github.com/cloudflare/pint/internal/promapi" "github.com/cloudflare/pint/internal/reporter" ) @@ -76,6 +77,7 @@ func checkRules(ctx context.Context, workers int, gen *config.PrometheusGenerato results := make(chan reporter.Report, workers*5) wg := sync.WaitGroup{} + ctx = context.WithValue(ctx, promapi.AllPrometheusServers, gen.Servers()) for _, s := range cfg.Check { settings, _ := s.Decode() key := checks.SettingsKey(s.Name) diff --git a/cmd/pint/tests/0037_disable_checks.txt b/cmd/pint/tests/0037_disable_checks.txt index b1f81cc0..fbd26caf 100644 --- 
a/cmd/pint/tests/0037_disable_checks.txt +++ b/cmd/pint/tests/0037_disable_checks.txt @@ -7,9 +7,9 @@ level=INFO msg="Loading configuration file" path=.pint.hcl level=INFO msg="Finding all rules to check" paths=["rules"] level=DEBUG msg="File parsed" path=rules/0001.yml rules=3 level=INFO msg="Configured new Prometheus server" name=prom uris=1 tags=[] include=[] exclude=[] +level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1 workers=16 level=DEBUG msg="Generated all Prometheus servers" count=1 level=DEBUG msg="Found alerting rule" path=rules/0001.yml alert=default-for lines=1-3 -level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1 workers=16 level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/template","promql/fragile","promql/regexp","promql/vector_matching(prom)","rule/duplicate(prom)","labels/conflict(prom)"] path=rules/0001.yml rule=default-for level=DEBUG msg="Found recording rule" path=rules/0001.yml record=sum-job lines=5-6 level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/template","promql/fragile","promql/regexp","promql/vector_matching(prom)","rule/duplicate(prom)","labels/conflict(prom)","promql/aggregate(job:true)"] path=rules/0001.yml rule=sum-job diff --git a/cmd/pint/tests/0039_prom_selected_path.txt b/cmd/pint/tests/0039_prom_selected_path.txt index 60cd6d06..bf8d8fe7 100644 --- a/cmd/pint/tests/0039_prom_selected_path.txt +++ b/cmd/pint/tests/0039_prom_selected_path.txt @@ -7,6 +7,7 @@ level=INFO msg="Loading configuration file" path=.pint.hcl level=INFO msg="Finding all rules to check" paths=["rules"] level=DEBUG msg="File parsed" path=rules/0001.yml rules=3 level=INFO msg="Configured new Prometheus server" name=disabled uris=1 tags=[] include=["^invalid/.+$"] exclude=["^invalid/rules/.+$"] +level=DEBUG msg="Starting query workers" name=disabled uri=http://127.0.0.1:123 workers=16 level=DEBUG msg="Generated all Prometheus servers" count=1 level=DEBUG 
msg="Found alerting rule" path=rules/0001.yml alert=first lines=1-3 level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp"] path=rules/0001.yml rule=first @@ -18,6 +19,7 @@ rules/0001.yml:6 Warning: job label is required and should be preserved when agg 6 | expr: sum(bar) level=INFO msg="Problems found" Warning=1 +level=DEBUG msg="Stopping query workers" name=disabled uri=http://127.0.0.1:123 -- rules/0001.yml -- - alert: first expr: foo > 1 diff --git a/cmd/pint/tests/0103_file_disable.txt b/cmd/pint/tests/0103_file_disable.txt index 3f039248..5a6cff3e 100644 --- a/cmd/pint/tests/0103_file_disable.txt +++ b/cmd/pint/tests/0103_file_disable.txt @@ -7,9 +7,9 @@ level=INFO msg="Loading configuration file" path=.pint.hcl level=INFO msg="Finding all rules to check" paths=["rules"] level=DEBUG msg="File parsed" path=rules/0001.yml rules=1 level=INFO msg="Configured new Prometheus server" name=prom uris=1 tags=[] include=[] exclude=[] +level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1:7103 workers=16 level=DEBUG msg="Generated all Prometheus servers" count=1 level=DEBUG msg="Found recording rule" path=rules/0001.yml record=colo:test1 lines=9-10 -level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1:7103 workers=16 level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/vector_matching(prom)","labels/conflict(prom)","alerts/external_labels(prom)"] path=rules/0001.yml rule=colo:test1 level=DEBUG msg="Stopping query workers" name=prom uri=http://127.0.0.1:7103 -- rules/0001.yml -- diff --git a/cmd/pint/tests/0115_file_disable_tag.txt b/cmd/pint/tests/0115_file_disable_tag.txt index 2d5c9ebe..ece034b0 100644 --- a/cmd/pint/tests/0115_file_disable_tag.txt +++ b/cmd/pint/tests/0115_file_disable_tag.txt @@ -7,9 +7,9 @@ level=INFO 
msg="Loading configuration file" path=.pint.hcl level=INFO msg="Finding all rules to check" paths=["rules"] level=DEBUG msg="File parsed" path=rules/0001.yml rules=1 level=INFO msg="Configured new Prometheus server" name=prom uris=1 tags=["foo","bar"] include=[] exclude=[] +level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1:7103 workers=16 level=DEBUG msg="Generated all Prometheus servers" count=1 level=DEBUG msg="Found recording rule" path=rules/0001.yml record=colo:test1 lines="6 8" -level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1:7103 workers=16 level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","alerts/external_labels(prom)"] path=rules/0001.yml rule=colo:test1 level=DEBUG msg="Stopping query workers" name=prom uri=http://127.0.0.1:7103 -- rules/0001.yml -- diff --git a/cmd/pint/tests/0144_discovery_filepath.txt b/cmd/pint/tests/0144_discovery_filepath.txt index b65abf67..ecf37fe0 100644 --- a/cmd/pint/tests/0144_discovery_filepath.txt +++ b/cmd/pint/tests/0144_discovery_filepath.txt @@ -20,10 +20,18 @@ level=DEBUG msg="Path discovery match" match=^(?P\w+).ya?ml$ path=prom2.ym level=DEBUG msg="Extracted regexp variables" regexp=^(?P\w+).ya?ml$ vars={"name":"prom2"} level=DEBUG msg="Rendered Prometheus server" name=prom2 uri=https://prom2.example.com headers=["X-Host"] timeout=5s concurrency=16 rateLimit=100 uptime=up tags=["name/prom2"] required=true level=INFO msg="Configured new Prometheus server" name=prom1 uris=2 tags=["name/prom1"] include=[] exclude=["^.*$"] +level=DEBUG msg="Starting query workers" name=prom1 uri=https://prom1.example.com workers=16 +level=DEBUG msg="Starting query workers" name=prom1 uri=https://prom1-backup.example.com workers=16 level=INFO msg="Configured new Prometheus server" name=prom2 uris=2 tags=["name/prom2"] include=[] exclude=["^.*$"] +level=DEBUG msg="Starting query workers" 
name=prom2 uri=https://prom2.example.com workers=16 +level=DEBUG msg="Starting query workers" name=prom2 uri=https://prom2-backup.example.com workers=16 level=DEBUG msg="Generated all Prometheus servers" count=2 level=DEBUG msg="Found recording rule" path=rules/0001.yml record=sum:up lines=4-5 level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp"] path=rules/0001.yml rule=sum:up +level=DEBUG msg="Stopping query workers" name=prom1 uri=https://prom1.example.com +level=DEBUG msg="Stopping query workers" name=prom1 uri=https://prom1-backup.example.com +level=DEBUG msg="Stopping query workers" name=prom2 uri=https://prom2.example.com +level=DEBUG msg="Stopping query workers" name=prom2 uri=https://prom2-backup.example.com -- rules/0001.yml -- groups: - name: foo diff --git a/cmd/pint/tests/0145_discovery_filepath_dup.txt b/cmd/pint/tests/0145_discovery_filepath_dup.txt index f32b82d7..95937691 100644 --- a/cmd/pint/tests/0145_discovery_filepath_dup.txt +++ b/cmd/pint/tests/0145_discovery_filepath_dup.txt @@ -7,6 +7,7 @@ level=INFO msg="Loading configuration file" path=.pint.hcl level=INFO msg="Finding all rules to check" paths=["rules"] level=DEBUG msg="File parsed" path=rules/0001.yml rules=1 level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 tags=[] include=[] exclude=[] +level=DEBUG msg="Starting query workers" name=prom2 uri=https://unique.example.com workers=16 level=INFO msg="Finding Prometheus servers using file paths" dir=servers match=^(?P\w+).ya?ml$ level=DEBUG msg="Path discovery match" match=^(?P\w+).ya?ml$ path=prom1.yaml level=DEBUG msg="Extracted regexp variables" regexp=^(?P\w+).ya?ml$ vars={"name":"prom1"} @@ -21,6 +22,11 @@ level=DEBUG msg="Path discovery match" match=^(?P\w+).ya?ml$ path=prom2.ym level=DEBUG msg="Extracted regexp variables" regexp=^(?P\w+).ya?ml$ vars={"name":"prom2"} level=DEBUG msg="Rendered Prometheus server" 
name=prom2 uri=https://prom2.example.com headers=[] timeout=5s concurrency=16 rateLimit=100 uptime=up tags=["name/prom2"] required=true level=INFO msg="Configured new Prometheus server" name=prom1 uris=2 tags=["name/prom1"] include=[] exclude=[] +level=DEBUG msg="Starting query workers" name=prom1 uri=https://prom1.example.com workers=16 +level=DEBUG msg="Starting query workers" name=prom1 uri=https://prom1-backup.example.com workers=16 +level=DEBUG msg="Stopping query workers" name=prom2 uri=https://unique.example.com +level=DEBUG msg="Stopping query workers" name=prom1 uri=https://prom1.example.com +level=DEBUG msg="Stopping query workers" name=prom1 uri=https://prom1-backup.example.com level=ERROR msg="Fatal error" err="Duplicated name for Prometheus server definition: prom2" -- rules/0001.yml -- groups: diff --git a/cmd/pint/tests/0149_discovery_prom.txt b/cmd/pint/tests/0149_discovery_prom.txt index 3d7e5c92..37837341 100644 --- a/cmd/pint/tests/0149_discovery_prom.txt +++ b/cmd/pint/tests/0149_discovery_prom.txt @@ -19,9 +19,13 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.exam level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7149 level=DEBUG msg="Added new failover URI" name=prom-ha uri=https://prom2.example.com level=INFO msg="Configured new Prometheus server" name=prom-ha uris=2 tags=[] include=[] exclude=["^.*$"] +level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16 +level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom2.example.com workers=16 level=DEBUG msg="Generated all Prometheus servers" count=1 level=DEBUG msg="Found recording rule" path=rules/0001.yml record=sum:up lines=4-5 level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp"] path=rules/0001.yml rule=sum:up +level=DEBUG msg="Stopping query workers" name=prom-ha 
uri=https://prom1.example.com +level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom2.example.com -- rules/0001.yml -- groups: - name: foo diff --git a/cmd/pint/tests/0150_discovery_prom_dup_tags.txt b/cmd/pint/tests/0150_discovery_prom_dup_tags.txt index 9543ee0e..8ec7e7a7 100644 --- a/cmd/pint/tests/0150_discovery_prom_dup_tags.txt +++ b/cmd/pint/tests/0150_discovery_prom_dup_tags.txt @@ -19,6 +19,8 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.exam level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7150 level=WARN msg="Duplicated prometheus server with different tags" name=prom-ha a=["prom2"] b=["prom1"] level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 tags=["prom1"] include=[] exclude=[] +level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16 +level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom1.example.com level=ERROR msg="Fatal error" err="Duplicated name for Prometheus server definition: prom-ha" -- rules/0001.yml -- groups: diff --git a/cmd/pint/tests/0152_discovery_prom_dup_uptime.txt b/cmd/pint/tests/0152_discovery_prom_dup_uptime.txt index 8dd89d98..9f25c179 100644 --- a/cmd/pint/tests/0152_discovery_prom_dup_uptime.txt +++ b/cmd/pint/tests/0152_discovery_prom_dup_uptime.txt @@ -27,9 +27,13 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom1.exam level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.example.com headers=[] timeout=5s concurrency=16 rateLimit=100 uptime=prom2 tags=[] required=false level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7152 level=INFO msg="Configured new Prometheus server" name=prom-ha uris=2 tags=[] include=[] exclude=["^.*$"] +level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16 +level=DEBUG msg="Starting query workers" name=prom-ha 
uri=https://prom2.example.com workers=16 level=DEBUG msg="Generated all Prometheus servers" count=1 level=DEBUG msg="Found recording rule" path=rules/0001.yml record=sum:up lines=4-5 level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp"] path=rules/0001.yml rule=sum:up +level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom1.example.com +level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom2.example.com -- rules/0001.yml -- groups: - name: foo diff --git a/cmd/pint/tests/0155_discovery_prom_dup_include.txt b/cmd/pint/tests/0155_discovery_prom_dup_include.txt index 0c69eea1..4d73b6b3 100644 --- a/cmd/pint/tests/0155_discovery_prom_dup_include.txt +++ b/cmd/pint/tests/0155_discovery_prom_dup_include.txt @@ -19,6 +19,8 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.exam level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7155 level=WARN msg="Duplicated prometheus server with different include" name=prom-ha a=["^prom2$"] b=["^prom1$"] level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 tags=[] include=["^prom1$"] exclude=[] +level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16 +level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom1.example.com level=ERROR msg="Fatal error" err="Duplicated name for Prometheus server definition: prom-ha" -- rules/0001.yml -- groups: diff --git a/cmd/pint/tests/0156_discovery_prom_dup_exclude.txt b/cmd/pint/tests/0156_discovery_prom_dup_exclude.txt index 136227d7..b84162a0 100644 --- a/cmd/pint/tests/0156_discovery_prom_dup_exclude.txt +++ b/cmd/pint/tests/0156_discovery_prom_dup_exclude.txt @@ -19,6 +19,8 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.exam level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7156 
level=WARN msg="Duplicated prometheus server with different exclude" name=prom-ha a=["^prom2$"] b=["^prom1$"] level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 tags=[] include=[] exclude=["^prom1$"] +level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16 +level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom1.example.com level=ERROR msg="Fatal error" err="Duplicated name for Prometheus server definition: prom-ha" -- rules/0001.yml -- groups: diff --git a/cmd/pint/tests/0157_series_other_servers.txt b/cmd/pint/tests/0157_series_other_servers.txt new file mode 100644 index 00000000..a116710a --- /dev/null +++ b/cmd/pint/tests/0157_series_other_servers.txt @@ -0,0 +1,49 @@ +http response prometheus1 /api/v1/metadata 200 {"status":"success","data":{}} +http response prometheus1 /api/v1/status/config 200 {"status":"success","data":{"yaml":"global:\n scrape_interval: 30s\n"}} +http response prometheus1 /api/v1/status/flags 200 {"status":"success","data":{"storage.tsdb.retention.time": "1d"}} +http response prometheus1 /api/v1/query_range 200 {"status":"success","data":{"resultType":"matrix","result":[]}} +http response prometheus1 /api/v1/query 200 {"status":"success","data":{"resultType":"vector","result":[]}} +http start prometheus1 127.0.0.1:7157 + +http response prometheus2 /api/v1/metadata 200 {"status":"success","data":{}} +http response prometheus2 /api/v1/status/config 200 {"status":"success","data":{"yaml":"global:\n scrape_interval: 30s\n"}} +http response prometheus2 /api/v1/status/flags 200 {"status":"success","data":{"storage.tsdb.retention.time": "1d"}} +http response prometheus2 /api/v1/query_range 200 {"status":"success","data":{"resultType":"matrix","result":[]}} +http response prometheus2 /api/v1/query 200 {"status":"success","data":{"resultType":"vector","result":[{"metric":{},"value":[1698249632.491,"1"]}]}} +http start prometheus2 127.0.0.1:8157 + +pint.error --no-color lint 
rules +! stdout . +cmp stderr stderr.txt + +-- stderr.txt -- +level=INFO msg="Loading configuration file" path=.pint.hcl +level=INFO msg="Finding all rules to check" paths=["rules"] +level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 tags=[] include=["^rules/1.yml$"] exclude=[] +level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 tags=[] include=["^rules/2.yml$"] exclude=[] +level=WARN msg="No results for Prometheus uptime metric, you might have set uptime config option to a missing metric, please check your config" name=prom1 metric=up +level=WARN msg="Using dummy Prometheus uptime metric results with no gaps" name=prom1 metric=up +rules/1.yml:5 Bug: prometheus "prom1" at http://127.0.0.1:7157 didn't have any series for "only_on_prom2" metric in the last 1w, "only_on_prom2" was found on other prometheus servers: prom2, are you deploying this rule to the correct instance? (promql/series) + 5 | expr: only_on_prom2 == 0 + +level=INFO msg="Problems found" Bug=1 +level=ERROR msg="Fatal error" err="found 1 problem(s) with severity Bug or higher" +-- rules/1.yml -- +groups: +- name: foo + rules: + - alert: foo + expr: only_on_prom2 == 0 +-- .pint.hcl -- +prometheus "prom1" { + uri = "http://127.0.0.1:7157" + timeout = "5s" + required = true + include = [ "rules/1.yml" ] +} +prometheus "prom2" { + uri = "http://127.0.0.1:8157" + timeout = "5s" + required = true + include = [ "rules/2.yml" ] +} diff --git a/docs/changelog.md b/docs/changelog.md index ebcd2977..25b09985 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -6,6 +6,14 @@ - Added [alerts/external_labels](checks/alerts/external_labels.md) check. +### Changed + +- When [promql/series](checks/promql/series.md) finds that a time series used + by a rule is missing it will now also check other defined Prometheus servers + and add that information to the report. + This allows pint to flag rules that are most likely deployed to the wrong servers, + using missing scrape jobs. 
+
 ## v0.48.2
 
 ### Fixed
diff --git a/internal/checks/promql_series.go b/internal/checks/promql_series.go
index f2ad3c6a..e813bb43 100644
--- a/internal/checks/promql_series.go
+++ b/internal/checks/promql_series.go
@@ -262,8 +262,12 @@ func (c SeriesCheck) Check(ctx context.Context, _ string, rule parser.Rule, entr
 			text, severity := c.textAndSeverity(
 				settings,
 				bareSelector.String(),
-				fmt.Sprintf("%s didn't have any series for %q metric in the last %s",
-					promText(c.prom.Name(), trs.URI), bareSelector.String(), sinceDesc(trs.Series.From)),
+				fmt.Sprintf("%s didn't have any series for %q metric in the last %s%s",
+					promText(c.prom.Name(), trs.URI),
+					bareSelector.String(),
+					sinceDesc(trs.Series.From),
+					c.checkOtherServer(ctx, selector.String()),
+				),
 				Bug,
 			)
 			problems = append(problems, Problem{
@@ -516,6 +520,57 @@ func (c SeriesCheck) Check(ctx context.Context, _ string, rule parser.Rule, entr
 	return problems
 }
 
+// checkOtherServer looks for the given selector on all other Prometheus
+// servers stored in ctx under promapi.AllPrometheusServers. It returns a text
+// fragment naming the servers where count(query) is positive, meant to be
+// appended to the "didn't have any series" problem text, or an empty string
+// when there are no other servers or none of them has the series.
+func (c SeriesCheck) checkOtherServer(ctx context.Context, query string) string {
+	// Comma-ok assertion: a missing or unexpectedly typed context value means
+	// there is nothing to check, it must not panic the whole run.
+	servers, ok := ctx.Value(promapi.AllPrometheusServers).([]*promapi.FailoverGroup)
+	if !ok || len(servers) == 0 {
+		return ""
+	}
+
+	countQuery := fmt.Sprintf("count(%s)", query)
+	var presentProms []string
+	for _, prom := range servers {
+		// The server this check runs against is the one missing the metric,
+		// so it must never be reported as one of the "other" servers.
+		if prom.Name() == c.prom.Name() {
+			continue
+		}
+
+		slog.Debug(
+			"Checking if metric exists on any other Prometheus server",
+			slog.String("check", c.Reporter()),
+			slog.String("name", prom.Name()),
+			slog.String("selector", query),
+		)
+
+		qr, err := prom.Query(ctx, countQuery)
+		if err != nil {
+			// Best effort: a failing server must not hide the original problem.
+			continue
+		}
+
+		var series int
+		for _, s := range qr.Series {
+			series += int(s.Value)
+		}
+		if series > 0 {
+			presentProms = append(presentProms, prom.Name())
+		}
+	}
+
+	if len(presentProms) == 0 {
+		return ""
+	}
+
+	return fmt.Sprintf(", %q was found on other prometheus servers: %s, are you deploying this rule to the correct instance?", query, strings.Join(presentProms, ", "))
+}
+
 func (c SeriesCheck) queryProblem(err error, selector string, expr parser.PromQLExpr) Problem {
 	text, severity :=
textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug)
 	return Problem{
diff --git a/internal/config/prometheus.go b/internal/config/prometheus.go
index a8f3d484..497962db 100644
--- a/internal/config/prometheus.go
+++ b/internal/config/prometheus.go
@@ -198,6 +198,11 @@ type PrometheusGenerator struct {
 	cfg Config
 }
 
+// Servers returns the generator's slice of servers as-is (not a copy), callers must not modify it.
+func (pg *PrometheusGenerator) Servers() []*promapi.FailoverGroup {
+	return pg.servers
+}
+
 func (pg *PrometheusGenerator) Count() int {
 	return len(pg.servers)
 }
@@ -213,7 +218,6 @@ func (pg *PrometheusGenerator) ServersForPath(path string) []*promapi.FailoverGr
 	var servers []*promapi.FailoverGroup
 	for _, server := range pg.servers {
 		if server.IsEnabledForPath(path) {
-			server.StartWorkers(pg.metricsRegistry)
 			servers = append(servers, server)
 		}
 	}
@@ -235,6 +239,8 @@ func (pg *PrometheusGenerator) addServer(server *promapi.FailoverGroup) error {
 		slog.Any("include", server.Include()),
 		slog.Any("exclude", server.Exclude()),
 	)
+	// Start workers for every added server, even if no rule path ends up enabled for it.
+	server.StartWorkers(pg.metricsRegistry)
 	return nil
 }
 
diff --git a/internal/promapi/prometheus.go b/internal/promapi/prometheus.go
index 8e692a50..53a26ac6 100644
--- a/internal/promapi/prometheus.go
+++ b/internal/promapi/prometheus.go
@@ -18,6 +18,15 @@ import (
 	"go.uber.org/ratelimit"
 )
 
+// PrometheusContextKey is the type used for context value keys defined by this package.
+type PrometheusContextKey string
+
+const (
+	// AllPrometheusServers is the context key under which the full list of
+	// Prometheus servers ([]*FailoverGroup) is stored for checks to read.
+	AllPrometheusServers = PrometheusContextKey("allServers")
+)
+
 type QueryError struct {
 	err error
 	msg string