diff --git a/.gitignore b/.gitignore index e3f116e8..d41d1df3 100644 --- a/.gitignore +++ b/.gitignore @@ -29,4 +29,3 @@ gen # Temporary directory .tmp/* - diff --git a/chart/README.md b/chart/README.md index 2c4084dc..55ba3cf2 100644 --- a/chart/README.md +++ b/chart/README.md @@ -6,9 +6,9 @@ A Helm chart to install Sparrow ## Maintainers -| Name | Email | Url | -| ---- | ------ | --- | -| eumel8 | | | +| Name | Email | Url | +|---------|----------------------------------|---------------------------| +| eumel8 | | | | y-eight | | | ## Source Code @@ -17,50 +17,50 @@ A Helm chart to install Sparrow ## Values -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| affinity | object | `{}` | | -| env | object | `{}` | | -| extraArgs | object | `{"loaderFilePath":"/runconfig/checks.yaml","loaderType":"file"}` | extra command line start parameters see: https://github.com/caas-team/sparrow/blob/main/docs/sparrow_run.md | -| fullnameOverride | string | `""` | | -| image.pullPolicy | string | `"IfNotPresent"` | | -| image.repository | string | `"ghcr.io/caas-team/sparrow"` | | -| image.tag | string | `""` | Overrides the image tag whose default is the chart appVersion. | -| imagePullSecrets | list | `[]` | | -| ingress.annotations | object | `{}` | | -| ingress.className | string | `""` | | -| ingress.enabled | bool | `false` | | -| ingress.hosts[0].host | string | `"chart-example.local"` | | -| ingress.hosts[0].paths[0].path | string | `"/"` | | -| ingress.hosts[0].paths[0].pathType | string | `"ImplementationSpecific"` | | -| ingress.tls | list | `[]` | | -| nameOverride | string | `""` | | -| networkPolicies | object | `{"proxy":{"enabled":false}}` | define a network policy that will open egress traffic to a proxy | -| nodeSelector | object | `{}` | | -| podAnnotations | object | `{}` | | -| podLabels | object | `{}` | | -| podSecurityContext.fsGroup | int | `1000` | | -| podSecurityContext.supplementalGroups[0] | int | `1000` | | -| replicaCount | int | `1` | | -| resources | object | `{}` | | -| runtimeConfig | object | `{"health":{"interval":"20s","retry":{"count":3,"delay":"1s"},"targets":["https://www.example.com/","https://www.google.com/"],"timeout":"10s"},"latency":{"interval":"1s","retry":{"count":3,"delay":"1s"},"targets":["https://example.com/","https://google.com/"],"timeout":"3s"}}` | runtime configuration of the Sparrow see: https://github.com/caas-team/sparrow#runtime | -| securityContext.allowPrivilegeEscalation | bool | `false` | | -| securityContext.capabilities.drop[0] | string | `"ALL"` | | -| securityContext.privileged | bool | `false` | | -| securityContext.readOnlyRootFilesystem | bool | `true` | | -| securityContext.runAsGroup | int | `1000` | | -| securityContext.runAsUser | int | `1000` | | -| service.port | int | `8080` | | -| service.type | string | `"ClusterIP"` | | -| serviceAccount.annotations | object | `{}` | Annotations to add to the service account | -| serviceAccount.automount | bool | `true` | Automatically mount a ServiceAccount's API credentials? | -| serviceAccount.create | bool | `true` | Specifies whether a service account should be created | -| serviceAccount.name | string | `""` | The name of the service account to use. If not set and create is true, a name is generated using the fullname template | -| serviceMonitor | object | `{"enabled":false,"interval":"30s","labels":{},"scrapeTimeout":"5s"}` | Configure a service monitor for prometheus-operator | -| serviceMonitor.enabled | bool | `false` | Enable the serviceMonitor | -| serviceMonitor.interval | string | `"30s"` | Sets the scrape interval | -| serviceMonitor.labels | object | `{}` | Additional label added to the service Monitor | -| serviceMonitor.scrapeTimeout | string | `"5s"` | Sets the scrape timeout | -| startupConfig | object | `{}` | startup configuration of the Sparrow see: https://github.com/caas-team/sparrow/blob/main/docs/sparrow_run.md | -| tolerations | list | `[]` | | +| Key | Type | Default | Description | +|------------------------------------------|--------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------| +| affinity | object | `{}` | | +| env | object | `{}` | | +| extraArgs | object | `{"loaderFilePath":"/runconfig/checks.yaml","loaderType":"file"}` | extra command line start parameters see: https://github.com/caas-team/sparrow/blob/main/docs/sparrow_run.md | +| fullnameOverride | string | `""` | | +| image.pullPolicy | string | `"IfNotPresent"` | | +| image.repository | string | `"ghcr.io/caas-team/sparrow"` | | +| image.tag | string | `""` | Overrides the image tag whose default is the chart appVersion. | +| imagePullSecrets | list | `[]` | | +| ingress.annotations | object | `{}` | | +| ingress.className | string | `""` | | +| ingress.enabled | bool | `false` | | +| ingress.hosts[0].host | string | `"chart-example.local"` | | +| ingress.hosts[0].paths[0].path | string | `"/"` | | +| ingress.hosts[0].paths[0].pathType | string | `"ImplementationSpecific"` | | +| ingress.tls | list | `[]` | | +| nameOverride | string | `""` | | +| networkPolicies | object | `{"proxy":{"enabled":false}}` | define a network policy that will open egress traffic to a proxy | +| nodeSelector | object | `{}` | | +| podAnnotations | object | `{}` | | +| podLabels | object | `{}` | | +| podSecurityContext.fsGroup | int | `1000` | | +| podSecurityContext.supplementalGroups[0] | int | `1000` | | +| replicaCount | int | `1` | | +| resources | object | `{}` | | +| runtimeConfig | object | `{"health":{"interval":"20s","retry":{"count":3,"delay":"1s"},"targets":["https://www.example.com/","https://www.google.com/"],"timeout":"10s"},"latency":{"interval":"1s","retry":{"count":3,"delay":"1s"},"targets":["https://example.com/","https://google.com/"],"timeout":"3s"}}` | runtime configuration of the Sparrow see: https://github.com/caas-team/sparrow#runtime | +| securityContext.allowPrivilegeEscalation | bool | `false` | | +| securityContext.capabilities.drop[0] | string | `"ALL"` | | +| securityContext.privileged | bool | `false` | | +| securityContext.readOnlyRootFilesystem | bool | `true` | | +| securityContext.runAsGroup | int | `1000` | | +| securityContext.runAsUser | int | `1000` | | +| service.port | int | `8080` | | +| service.type | string | `"ClusterIP"` | | +| serviceAccount.annotations | object | `{}` | Annotations to add to the service account | +| serviceAccount.automount | bool | `true` | Automatically mount a ServiceAccount's API credentials? | +| serviceAccount.create | bool | `true` | Specifies whether a service account should be created | +| serviceAccount.name | string | `""` | The name of the service account to use. If not set and create is true, a name is generated using the fullname template | +| serviceMonitor | object | `{"enabled":false,"interval":"30s","labels":{},"scrapeTimeout":"5s"}` | Configure a service monitor for prometheus-operator | +| serviceMonitor.enabled | bool | `false` | Enable the serviceMonitor | +| serviceMonitor.interval | string | `"30s"` | Sets the scrape interval | +| serviceMonitor.labels | object | `{}` | Additional label added to the service Monitor | +| serviceMonitor.scrapeTimeout | string | `"5s"` | Sets the scrape timeout | +| startupConfig | object | `{}` | startup configuration of the Sparrow see: https://github.com/caas-team/sparrow/blob/main/docs/sparrow_run.md | +| tolerations | list | `[]` | | diff --git a/cmd/run.go b/cmd/run.go index 5287ce38..bf3ee927 100644 --- a/cmd/run.go +++ b/cmd/run.go @@ -82,8 +82,6 @@ func run() func(cmd *cobra.Command, args []string) error { log.Info("Running sparrow") if err = s.Run(ctx); err != nil { err = fmt.Errorf("error while running sparrow: %w", err) - // by this time all shutdown routines should have been called - // so we can exit here return err } diff --git a/internal/helper/retry_test.go b/internal/helper/retry_test.go index 2bccd124..28230d6c 100644 --- a/internal/helper/retry_test.go +++ b/internal/helper/retry_test.go @@ -65,7 +65,7 @@ func TestRetry(t *testing.T) { if effectorFuncCallCounter > 1 { return nil } - return fmt.Errorf("Ups sth wrong") + return fmt.Errorf("ups sth wrong") }, rc: RetryConfig{ Count: 2, @@ -81,7 +81,7 @@ func TestRetry(t *testing.T) { args: args{ effector: func(ctx context.Context) error { effectorFuncCallCounter++ - return fmt.Errorf("Ups sth wrong") + return fmt.Errorf("ups sth wrong") }, rc: RetryConfig{ Count: 2, @@ -98,7 +98,7 @@ func TestRetry(t *testing.T) { effector: func(ctx context.Context) error { effectorFuncCallCounter++ cancel() - return errors.New("Ups") + return errors.New("ups") }, rc: RetryConfig{ Count: 2, diff --git a/internal/logger/logger.go b/internal/logger/logger.go index 1e6a2e0c..1d0f427d 100644 --- a/internal/logger/logger.go +++ b/internal/logger/logger.go @@ -36,18 +36,19 @@ func NewLogger(h ...slog.Handler) *slog.Logger { if len(h) > 0 { handler = h[0] } else { - handler = slog.NewJSONHandler(os.Stderr, nil) + handler = slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{ + AddSource: true, + }) } return slog.New(handler) } // NewContextWithLogger creates a new context based on the provided parent context. -// It embeds a logger into this new context, which is a child of the logger from the parent context. -// The child logger inherits settings from the parent and is grouped under the provided childName. +// It embeds a logger into this new context. // It also returns a cancel function to cancel the new context. -func NewContextWithLogger(parent context.Context, childName string) (context.Context, context.CancelFunc) { +func NewContextWithLogger(parent context.Context) (context.Context, context.CancelFunc) { ctx, cancel := context.WithCancel(parent) - return IntoContext(ctx, FromContext(parent).WithGroup(childName)), cancel + return IntoContext(ctx, FromContext(parent)), cancel } // IntoContext embeds the provided slog.Logger into the given context and returns the modified context. @@ -68,7 +69,7 @@ func FromContext(ctx context.Context) *slog.Logger { return NewLogger() } -// Take the logger from the context and add it to the request context +// Middleware takes the logger from the context and adds it to the request context func Middleware(ctx context.Context) func(http.Handler) http.Handler { log := FromContext(ctx) return func(next http.Handler) http.Handler { diff --git a/pkg/api/routingtree.go b/pkg/api/routingtree.go index d2e92644..72d69f47 100644 --- a/pkg/api/routingtree.go +++ b/pkg/api/routingtree.go @@ -23,7 +23,7 @@ import ( "sync" ) -// creates a simple routing tree, so checks can easily create and remove handlers +// RoutingTree creates a simple routing tree, so checks can easily create and remove handlers // Maps the method to the path and the handler type RoutingTree struct { tree map[string]map[string]http.HandlerFunc diff --git a/pkg/checks/health/health.go b/pkg/checks/health/health.go index 0e3f8ab8..d64ddce3 100644 --- a/pkg/checks/health/health.go +++ b/pkg/checks/health/health.go @@ -84,7 +84,7 @@ type healthMetrics struct { // Run starts the health check func (h *Health) Run(ctx context.Context) error { - ctx, cancel := logger.NewContextWithLogger(ctx, "health") + ctx, cancel := logger.NewContextWithLogger(ctx) defer cancel() log := logger.FromContext(ctx) log.Info("Starting healthcheck", "interval", h.config.Interval.String()) @@ -193,7 +193,7 @@ func (h *Health) GetMetricCollectors() []prometheus.Collector { // check performs a health check using a retry function // to get the health status for all targets func (h *Health) check(ctx context.Context) map[string]string { - log := logger.FromContext(ctx).WithGroup("check") + log := logger.FromContext(ctx) log.Debug("Checking health") if len(h.config.Targets) == 0 { log.Debug("No targets defined") @@ -259,7 +259,10 @@ func getHealth(ctx context.Context, client *http.Client, url string) error { return err } defer func(Body io.ReadCloser) { - _ = Body.Close() + err := Body.Close() + if err != nil { + log.Error("Failed to close response body", "error", err.Error()) + } }(resp.Body) if resp.StatusCode != http.StatusOK { diff --git a/pkg/checks/latency/latency.go b/pkg/checks/latency/latency.go index 82b577f4..909520a5 100644 --- a/pkg/checks/latency/latency.go +++ b/pkg/checks/latency/latency.go @@ -86,7 +86,7 @@ type latencyMetrics struct { // Run starts the latency check func (l *Latency) Run(ctx context.Context) error { - ctx, cancel := logger.NewContextWithLogger(ctx, "latency") + ctx, cancel := logger.NewContextWithLogger(ctx) defer cancel() log := logger.FromContext(ctx) log.Info("Starting latency check", "interval", l.config.Interval.String()) @@ -114,7 +114,7 @@ func (l *Latency) Run(ctx context.Context) error { } func (l *Latency) Startup(ctx context.Context, cResult chan<- specs.Result) error { - log := logger.FromContext(ctx).WithGroup("latency") + log := logger.FromContext(ctx) log.Debug("Initializing latency check") l.CResult = cResult @@ -211,7 +211,7 @@ func (l *Latency) GetMetricCollectors() []prometheus.Collector { // check performs a latency check using a retry function // to get the latency to all targets func (l *Latency) check(ctx context.Context) map[string]LatencyResult { - log := logger.FromContext(ctx).WithGroup("check") + log := logger.FromContext(ctx) log.Debug("Checking latency") if len(l.config.Targets) == 0 { log.Debug("No targets defined") diff --git a/pkg/checks/oapi/oapi.go b/pkg/checks/oapi/oapi.go index 69c720ea..3daf1ac9 100644 --- a/pkg/checks/oapi/oapi.go +++ b/pkg/checks/oapi/oapi.go @@ -24,7 +24,7 @@ import ( "github.com/getkin/kin-openapi/openapi3gen" ) -// Takes in check perfdata and returns an openapi3.SchemaRef of a result wrapping the perfData +// OpenapiFromPerfData takes in check perfdata and returns an openapi3.SchemaRef of a result wrapping the perfData // this is a workaround, since the openapi3gen.NewSchemaRefForValue function does not work with any types func OpenapiFromPerfData[T any](data T) (*openapi3.SchemaRef, error) { checkSchema, err := openapi3gen.NewSchemaRefForValue(specs.Result{}, openapi3.Schemas{}) diff --git a/pkg/config/file.go b/pkg/config/file.go index ee7d1fda..e76061f3 100644 --- a/pkg/config/file.go +++ b/pkg/config/file.go @@ -20,6 +20,7 @@ package config import ( "context" + "fmt" "os" "gopkg.in/yaml.v3" @@ -41,22 +42,23 @@ func NewFileLoader(cfg *Config, cCfgChecks chan<- map[string]any) *FileLoader { } } -func (f *FileLoader) Run(ctx context.Context) { - log := logger.FromContext(ctx).WithGroup("FileLoader") +func (f *FileLoader) Run(ctx context.Context) error { + log := logger.FromContext(ctx) log.Info("Reading config from file", "file", f.path) // TODO refactor this to use fs.FS b, err := os.ReadFile(f.path) if err != nil { log.Error("Failed to read config file", "path", f.path, "error", err) - panic("failed to read config file " + err.Error()) + return fmt.Errorf("failed to read config file: %w", err) } var cfg RuntimeConfig if err := yaml.Unmarshal(b, &cfg); err != nil { log.Error("Failed to parse config file", "error", err) - panic("failed to parse config file: " + err.Error()) + return fmt.Errorf("failed to parse config file: %w", err) } f.c <- cfg.Checks + return nil } diff --git a/pkg/config/file_test.go b/pkg/config/file_test.go index a2c74bca..1fe81646 100644 --- a/pkg/config/file_test.go +++ b/pkg/config/file_test.go @@ -64,7 +64,13 @@ func TestFileLoader_Run(t *testing.T) { path: tt.fields.path, c: tt.fields.c, } - go f.Run(*tt.args.ctx) + go func() { + err := f.Run(*tt.args.ctx) + if err != nil { + t.Errorf("Expected no error, got %v", err) + return + } + }() (*tt.args.cancel)() config := <-tt.fields.c diff --git a/pkg/config/http.go b/pkg/config/http.go index 7557fcc6..4522f5eb 100644 --- a/pkg/config/http.go +++ b/pkg/config/http.go @@ -51,54 +51,57 @@ func NewHttpLoader(cfg *Config, cCfgChecks chan<- map[string]any) *HttpLoader { // The config is will be loaded periodically defined by the // loader interval configuration. A failed request will be retried defined // by the retry configuration -func (gl *HttpLoader) Run(ctx context.Context) { - ctx, cancel := logger.NewContextWithLogger(ctx, "httpLoader") +func (hl *HttpLoader) Run(ctx context.Context) error { + ctx, cancel := logger.NewContextWithLogger(ctx) defer cancel() log := logger.FromContext(ctx) - var runtimeCfg *RuntimeConfig - for { - getConfigRetry := helper.Retry(func(ctx context.Context) error { - var err error - runtimeCfg, err = gl.GetRuntimeConfig(ctx) - return err - }, gl.cfg.Loader.Http.RetryCfg) - - if err := getConfigRetry(ctx); err != nil { - log.Error("Could not get remote runtime configuration", "error", err) - return - } - - log.Info("Successfully got remote runtime configuration") - gl.cCfgChecks <- runtimeCfg.Checks + for { select { case <-ctx.Done(): - return - case <-time.After(gl.cfg.Loader.Interval): + return ctx.Err() + case <-time.After(hl.cfg.Loader.Interval): + getConfigRetry := helper.Retry(func(ctx context.Context) error { + var err error + runtimeCfg, err = hl.GetRuntimeConfig(ctx) + return err + }, hl.cfg.Loader.Http.RetryCfg) + + if err := getConfigRetry(ctx); err != nil { + log.Warn("Could not get remote runtime configuration", "error", err) + } + + log.Info("Successfully got remote runtime configuration") + hl.cCfgChecks <- runtimeCfg.Checks } } } // GetRuntimeConfig gets the remote runtime configuration -func (gl *HttpLoader) GetRuntimeConfig(ctx context.Context) (*RuntimeConfig, error) { - log := logger.FromContext(ctx).With("url", gl.cfg.Loader.Http.Url) +func (hl *HttpLoader) GetRuntimeConfig(ctx context.Context) (*RuntimeConfig, error) { + log := logger.FromContext(ctx).With("url", hl.cfg.Loader.Http.Url) - req, err := http.NewRequestWithContext(ctx, http.MethodGet, gl.cfg.Loader.Http.Url, http.NoBody) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, hl.cfg.Loader.Http.Url, http.NoBody) if err != nil { log.Error("Could not create http GET request", "error", err.Error()) return nil, err } - if gl.cfg.Loader.Http.Token != "" { - req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", gl.cfg.Loader.Http.Token)) + if hl.cfg.Loader.Http.Token != "" { + req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", hl.cfg.Loader.Http.Token)) } - res, err := gl.client.Do(req) + res, err := hl.client.Do(req) //nolint:bodyclose if err != nil { log.Error("Http get request failed", "error", err.Error()) return nil, err } - defer res.Body.Close() + defer func(Body io.ReadCloser) { + err = Body.Close() + if err != nil { + log.Error("Failed to close response body", "error", err.Error()) + } + }(res.Body) if res.StatusCode != http.StatusOK { log.Error("Http get request failed", "status", res.Status) diff --git a/pkg/config/loader.go b/pkg/config/loader.go index b93dbb65..683a8438 100644 --- a/pkg/config/loader.go +++ b/pkg/config/loader.go @@ -23,7 +23,7 @@ import ( ) type Loader interface { - Run(context.Context) + Run(context.Context) error } // NewLoader Get a new typed runtime configuration loader diff --git a/pkg/config/validate.go b/pkg/config/validate.go index 53d010e5..5484acd1 100644 --- a/pkg/config/validate.go +++ b/pkg/config/validate.go @@ -29,7 +29,7 @@ import ( // Validate validates the startup config func (c *Config) Validate(ctx context.Context) error { - ctx, cancel := logger.NewContextWithLogger(ctx, "configValidation") + ctx, cancel := logger.NewContextWithLogger(ctx) defer cancel() log := logger.FromContext(ctx) diff --git a/pkg/config/validate_test.go b/pkg/config/validate_test.go index dc611138..ff2c28df 100644 --- a/pkg/config/validate_test.go +++ b/pkg/config/validate_test.go @@ -28,7 +28,7 @@ import ( ) func TestConfig_Validate(t *testing.T) { - ctx, cancel := logger.NewContextWithLogger(context.Background(), "validation-test") + ctx, cancel := logger.NewContextWithLogger(context.Background()) defer cancel() type fields struct { diff --git a/pkg/db/db.go b/pkg/db/db.go index 292991bd..cc7607db 100644 --- a/pkg/db/db.go +++ b/pkg/db/db.go @@ -62,7 +62,7 @@ func (i *InMemory) Get(check string) (specs.Result, bool) { return *result, true } -// Returns a copy of the map +// List returns a copy of the map func (i *InMemory) List() map[string]specs.Result { results := make(map[string]specs.Result) i.data.Range(func(key, value any) bool { diff --git a/pkg/sparrow/api.go b/pkg/sparrow/api.go index 05c162b9..6278a918 100644 --- a/pkg/sparrow/api.go +++ b/pkg/sparrow/api.go @@ -42,11 +42,7 @@ const ( readHeaderTimeout = time.Second * 5 ) -var ( - ErrServeApi = errors.New("failed to serve api") - ErrApiContext = errors.New("api context canceled") - ErrCreateOpenapiSchema = errors.New("failed to get schema for check") -) +var ErrCreateOpenapiSchema = errors.New("failed to get schema for check") func (s *Sparrow) register(ctx context.Context) { s.router.Use(logger.Middleware(ctx)) @@ -75,17 +71,15 @@ func (s *Sparrow) register(ctx context.Context) { // // Blocks until context is done func (s *Sparrow) api(ctx context.Context) error { - log := logger.FromContext(ctx).WithGroup("api") + log := logger.FromContext(ctx) cErr := make(chan error, 1) s.register(ctx) - server := http.Server{Addr: s.cfg.Api.ListeningAddress, Handler: s.router, ReadHeaderTimeout: readHeaderTimeout} - // run http server in goroutine go func(cErr chan error) { defer close(cErr) log.Info("Serving Api", "addr", s.cfg.Api.ListeningAddress) - if err := server.ListenAndServe(); err != nil { + if err := s.server.ListenAndServe(); err != nil { log.Error("Failed to serve api", "error", err) cErr <- err } @@ -93,26 +87,31 @@ func (s *Sparrow) api(ctx context.Context) error { select { case <-ctx.Done(): - if ctx.Err() != nil { - shutdownCtx, cancel := context.WithTimeout(context.Background(), shutdownTimeout) - defer cancel() - err := server.Shutdown(shutdownCtx) - if err != nil { - log.Error("Failed to shutdown api server", "error", err) - } - log.Error("Api context canceled", "error", ctx.Err()) - return fmt.Errorf("%w: %w", ErrApiContext, ctx.Err()) - } + return fmt.Errorf("failed serving API: %w", ctx.Err()) case err := <-cErr: if errors.Is(err, http.ErrServerClosed) || err == nil { log.Info("Api server closed") return nil } - log.Error("failed to serve api", "error", err) - return fmt.Errorf("%w: %w", ErrServeApi, err) + log.Error("failed serving API", "error", err) + return fmt.Errorf("failed serving API: %w", err) } +} - return nil +// shutdownAPI gracefully shuts down the api server +// Returns an error if an error is present in the context +// or if the server cannot be shut down +func (s *Sparrow) shutdownAPI(ctx context.Context) error { + errC := ctx.Err() + log := logger.FromContext(ctx) + shutdownCtx, cancel := context.WithTimeout(context.Background(), shutdownTimeout) + defer cancel() + err := s.server.Shutdown(shutdownCtx) + if err != nil { + log.Error("Failed to shutdown api server", "error", err) + return fmt.Errorf("failed shutting down API: %w", errors.Join(errC, err)) + } + return errC } // okHandler returns a handler that will serve status ok diff --git a/pkg/sparrow/api_test.go b/pkg/sparrow/api_test.go index 91c4673b..4a5cb4a1 100644 --- a/pkg/sparrow/api_test.go +++ b/pkg/sparrow/api_test.go @@ -72,6 +72,7 @@ func TestSparrow_api_shutdownWhenContextCanceled(t *testing.T) { cfg: &config.Config{Api: config.ApiConfig{ListeningAddress: ":8080"}}, router: chi.NewRouter(), metrics: NewMetrics(), + server: &http.Server{}, //nolint:gosec } ctx, cancel := context.WithCancel(context.Background()) cancel() diff --git a/pkg/sparrow/metrics.go b/pkg/sparrow/metrics.go index 5241c7fc..8a65363b 100644 --- a/pkg/sparrow/metrics.go +++ b/pkg/sparrow/metrics.go @@ -34,7 +34,7 @@ type PrometheusMetrics struct { registry *prometheus.Registry } -// InitMetrics initializes the metrics and returns the PrometheusMetrics +// NewMetrics initializes the metrics and returns the PrometheusMetrics func NewMetrics() Metrics { registry := prometheus.NewRegistry() diff --git a/pkg/sparrow/run.go b/pkg/sparrow/run.go index 2db12da9..af6a587a 100644 --- a/pkg/sparrow/run.go +++ b/pkg/sparrow/run.go @@ -20,13 +20,14 @@ package sparrow import ( "context" - "errors" "fmt" + "net/http" "slices" + "sync" "time" "github.com/caas-team/sparrow/pkg/checks" - targets "github.com/caas-team/sparrow/pkg/sparrow/targets" + "github.com/caas-team/sparrow/pkg/sparrow/targets" "github.com/caas-team/sparrow/internal/logger" "github.com/caas-team/sparrow/pkg/api" @@ -43,16 +44,26 @@ type Sparrow struct { db db.DB // the existing checks checks map[string]checks.Check + server *http.Server metrics Metrics resultFanIn map[string]chan specs.Result - cResult chan specs.ResultDTO - cfg *config.Config - loader config.Loader + cfg *config.Config + cCfgChecks chan map[string]any - targets targets.TargetManager + // cResult is the channel where the checks send their results to + cResult chan specs.ResultDTO + // cErr is used to handle non-recoverable errors of the sparrow components + cErr chan error + // cDone is used to signal that the sparrow was shut down because of an error + cDone chan struct{} + // shutOnce is used to ensure that the shutdown function is only called once + shutOnce sync.Once + + loader config.Loader + tarMan targets.TargetManager routingTree *api.RoutingTree router chi.Router @@ -70,12 +81,15 @@ func New(cfg *config.Config) *Sparrow { cCfgChecks: make(chan map[string]any, 1), routingTree: api.NewRoutingTree(), router: chi.NewRouter(), + cErr: make(chan error, 1), + cDone: make(chan struct{}, 1), } - // Set the target manager + sparrow.server = &http.Server{Addr: cfg.Api.ListeningAddress, Handler: sparrow.router, ReadHeaderTimeout: readHeaderTimeout} + if cfg.HasTargetManager() { gm := targets.NewGitlabManager(cfg.SparrowName, cfg.TargetManager) - sparrow.targets = gm + sparrow.tarMan = gm } sparrow.loader = config.NewLoader(cfg, sparrow.cCfgChecks) @@ -85,35 +99,38 @@ func New(cfg *config.Config) *Sparrow { // Run starts the sparrow func (s *Sparrow) Run(ctx context.Context) error { - ctx, cancel := logger.NewContextWithLogger(ctx, "sparrow") + ctx, cancel := logger.NewContextWithLogger(ctx) log := logger.FromContext(ctx) defer cancel() - go s.loader.Run(ctx) - - if s.targets != nil { - go s.targets.Reconcile(ctx) - } - - // Start the api go func() { - err := s.api(ctx) - if err != nil { - log.Error("Error running api server", "error", err) + s.cErr <- s.loader.Run(ctx) + }() + go func() { + if s.tarMan != nil { + s.cErr <- s.tarMan.Reconcile(ctx) } }() + go func() { + s.cErr <- s.api(ctx) + }() for { select { - case <-ctx.Done(): - return s.shutdown(ctx) case result := <-s.cResult: go s.db.Save(result) case configChecks := <-s.cCfgChecks: - // Config got updated - // Set checks s.cfg.Checks = configChecks s.ReconcileChecks(ctx) + case <-ctx.Done(): + s.shutdown(ctx) + case err := <-s.cErr: + if err != nil { + log.Error("Non-recoverable error in sparrow component", "error", err) + s.shutdown(ctx) + } + case <-s.cDone: + return fmt.Errorf("sparrow was shut down") } } } @@ -186,10 +203,10 @@ func (s *Sparrow) updateCheckTargets(cfg any) any { } var urls []string - if s.targets == nil { + if s.tarMan == nil { return checkCfg } - gt := s.targets.GetTargets() + gt := s.tarMan.GetTargets() // filter out globalTargets that are already in the config and self for _, t := range gt { @@ -292,13 +309,24 @@ func fanInResults(checkChan chan specs.Result, cResult chan specs.ResultDTO, nam // shutdown shuts down the sparrow and all managed components gracefully. // It returns an error if one is present in the context or if any of the // components fail to shut down. -func (s *Sparrow) shutdown(ctx context.Context) error { +func (s *Sparrow) shutdown(ctx context.Context) { errC := ctx.Err() + log := logger.FromContext(ctx) ctx, cancel := context.WithTimeout(context.Background(), shutdownTimeout) defer cancel() - errS := s.targets.Shutdown(ctx) - if errS != nil { - return fmt.Errorf("failed to shutdown sparrow: %w", errors.Join(errC, errS)) - } - return errC + + s.shutOnce.Do(func() { + log.Info("Shutting down sparrow gracefully") + var errS error + if s.tarMan != nil { + errS = s.tarMan.Shutdown(ctx) + } + errA := s.shutdownAPI(ctx) + if errS != nil || errA != nil { + log.Error("Failed to shutdown gracefully", "contextError", errC, "apiError", errA, "targetError", errS) + } + + // Signal that shutdown is complete + s.cDone <- struct{}{} + }) } diff --git a/pkg/sparrow/run_test.go b/pkg/sparrow/run_test.go index 5cc42ed4..8fd20240 100644 --- a/pkg/sparrow/run_test.go +++ b/pkg/sparrow/run_test.go @@ -40,7 +40,7 @@ import ( ) func TestSparrow_ReconcileChecks(t *testing.T) { - ctx, cancel := logger.NewContextWithLogger(context.Background(), "sparrow-test") + ctx, cancel := logger.NewContextWithLogger(context.Background()) defer cancel() mockCheck := checks.CheckMock{ @@ -197,19 +197,29 @@ func Test_fanInResults(t *testing.T) { close(checkChan) } -// TestSparrow_Run tests that the Run method starts the API -func TestSparrow_Run(t *testing.T) { +// TestSparrow_Run_FullComponentStart tests that the Run method starts the API, +// loader and a targetManager all start. +func TestSparrow_Run_FullComponentStart(t *testing.T) { // create simple file loader config c := &config.Config{ Api: config.ApiConfig{ListeningAddress: ":9090"}, Loader: config.LoaderConfig{ Type: "file", + File: config.FileLoaderConfig{Path: "../config/testdata/config.yaml"}, Interval: time.Second * 1, }, + TargetManager: config.TargetManagerConfig{ + CheckInterval: time.Second * 1, + RegistrationInterval: time.Second * 1, + UnhealthyThreshold: time.Second * 1, + Gitlab: config.GitlabTargetManagerConfig{ + BaseURL: "https://gitlab.com", + Token: "my-cool-token", + ProjectID: 42, + }, + }, } - c.Loader.File.Path = ("../config/testdata/config.yaml") - // start sparrow s := New(c) ctx := context.Background() @@ -220,12 +230,45 @@ func TestSparrow_Run(t *testing.T) { } }() - t.Log("Letting API run shortly") - time.Sleep(time.Millisecond * 150) + t.Log("Running sparrow for 10ms") + time.Sleep(time.Millisecond * 10) +} + +// TestSparrow_Run_ContextCancel tests that after a context cancels the Run method +// will return an error and all started components will be shut down. +func TestSparrow_Run_ContextCancel(t *testing.T) { + // create simple file loader config + c := &config.Config{ + Api: config.ApiConfig{ListeningAddress: ":9090"}, + Loader: config.LoaderConfig{ + Type: "file", + File: config.FileLoaderConfig{Path: "../config/testdata/config.yaml"}, + Interval: time.Second * 1, + }, + } + + // start sparrow + s := New(c) + s.tarMan = &gitlabmock.MockTargetManager{} + ctx, cancel := context.WithCancel(context.Background()) + go func() { + err := s.Run(ctx) + t.Logf("Sparrow exited with error: %v", err) + if err == nil { + t.Error("Sparrow.Run() should have errored out, no error received") + } + }() + + t.Log("Running sparrow for 10ms") + time.Sleep(time.Millisecond * 10) + + t.Log("Canceling context and waiting for shutdown") + cancel() + time.Sleep(time.Millisecond * 30) } // TestSparrow_updateCheckTargets tests that the updateCheckTargets method -// updates the check targets, if they exists in the config of the checks. +// updates the check targets, if they exist in the config of the checks. func TestSparrow_updateCheckTargets(t *testing.T) { now := time.Now() gt := []specs.GlobalTarget{ @@ -333,7 +376,7 @@ func TestSparrow_updateCheckTargets(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { s := &Sparrow{ - targets: &gitlabmock.MockTargetManager{ + tarMan: &gitlabmock.MockTargetManager{ Targets: tt.globalTargets, }, cfg: &config.Config{ diff --git a/pkg/sparrow/targets/gitlab.go b/pkg/sparrow/targets/gitlab.go index 4fe03ea9..5ee0b7d3 100644 --- a/pkg/sparrow/targets/gitlab.go +++ b/pkg/sparrow/targets/gitlab.go @@ -20,6 +20,7 @@ package targets import ( "context" + "errors" "fmt" "sync" "time" @@ -40,8 +41,9 @@ const shutdownTimeout = 30 * time.Second type gitlabTargetManager struct { targets []specs.GlobalTarget mu sync.RWMutex - done chan struct{} - gitlab gitlab.Gitlab + // channel to signal the "reconcile" routine to stop + done chan struct{} + gitlab gitlab.Gitlab // the DNS name used for self-registration name string // the interval for the target reconciliation process @@ -73,7 +75,7 @@ func NewGitlabManager(name string, gtmConfig config.TargetManagerConfig) *gitlab // // The global targets are evaluated for healthiness and // unhealthy gitlabTargetManager are removed. -func (t *gitlabTargetManager) Reconcile(ctx context.Context) { +func (t *gitlabTargetManager) Reconcile(ctx context.Context) error { log := logger.FromContext(ctx) log.Info("Starting global gitlabTargetManager reconciler") @@ -86,29 +88,21 @@ func (t *gitlabTargetManager) Reconcile(ctx context.Context) { for { select { case <-ctx.Done(): - if err := ctx.Err(); err != nil { - log.Error("Context canceled", "error", err) - ctxS, cancel := context.WithTimeout(context.Background(), shutdownTimeout) - defer cancel() //nolint: gocritic // how else can we defer a cancel? - err = t.Shutdown(ctxS) - if err != nil { - log.Error("Failed to shutdown gracefully, stopping routine", "error", err) - return - } - } + log.Error("Error while reconciling gitlab targets", "err", ctx.Err()) + return ctx.Err() case <-t.done: - log.Info("Ending Reconcile routine") - return + log.Info("Gitlab target reconciliation ended") + return nil case <-checkTimer.C: err := t.refreshTargets(ctx) if err != nil { - log.Error("Failed to get global targets", "error", err) + log.Warn("Failed to get global targets", "error", err) } checkTimer.Reset(t.checkInterval) case <-registrationTimer.C: err := t.updateRegistration(ctx) if err != nil { - log.Error("Failed to register self as global target", "error", err) + log.Warn("Failed to register self as global target", "error", err) } registrationTimer.Reset(t.registrationInterval) } @@ -127,8 +121,12 @@ func (t *gitlabTargetManager) GetTargets() []specs.GlobalTarget { func (t *gitlabTargetManager) Shutdown(ctx context.Context) error { t.mu.Lock() defer t.mu.Unlock() + + errC := ctx.Err() log := logger.FromContext(ctx) log.Debug("Shut down signal received") + ctxS, cancel := context.WithTimeout(context.Background(), shutdownTimeout) + defer cancel() if t.Registered() { f := gitlab.File{ @@ -138,17 +136,17 @@ func (t *gitlabTargetManager) Shutdown(ctx context.Context) error { CommitMessage: "Unregistering global target", } f.SetFileName(fmt.Sprintf("%s.json", t.name)) - err := t.gitlab.DeleteFile(ctx, f) + err := t.gitlab.DeleteFile(ctxS, f) if err != nil { log.Error("Failed to shutdown gracefully", "error", err) - return err + return fmt.Errorf("failed to shutdown gracefully: %w", errors.Join(errC, err)) } t.registered = false } select { case t.done <- struct{}{}: - log.Info("Stopping reconcile routine") + log.Debug("Stopping gitlab reconciliation routine") default: } diff --git a/pkg/sparrow/targets/gitlab_test.go b/pkg/sparrow/targets/gitlab_test.go index 249db58f..d63cc1ad 100644 --- a/pkg/sparrow/targets/gitlab_test.go +++ b/pkg/sparrow/targets/gitlab_test.go @@ -274,7 +274,11 @@ func Test_gitlabTargetManager_Reconcile_success(t *testing.T) { gtm := mockGitlabTargetManager(glmock, "test") ctx := context.Background() go func() { - gtm.Reconcile(ctx) + err := gtm.Reconcile(ctx) + if err != nil { + t.Error("Reconcile() should not have returned an error") + return + } }() time.Sleep(time.Millisecond * 300) @@ -330,7 +334,11 @@ func Test_gitlabTargetManager_Reconcile_failure(t *testing.T) { ctx := context.Background() go func() { - gtm.Reconcile(ctx) + err := gtm.Reconcile(ctx) + if err != nil { + t.Error("Reconcile() should not have returned an error") + return + } }() time.Sleep(time.Millisecond * 300) @@ -368,8 +376,13 @@ func Test_gitlabTargetManager_Reconcile_Context_Canceled(t *testing.T) { gtm := mockGitlabTargetManager(glmock, "test") ctx, cancel := context.WithCancel(context.Background()) + go func() { - gtm.Reconcile(ctx) + err := gtm.Reconcile(ctx) + if err == nil { + t.Error("Reconcile() should have returned an error") + return + } }() time.Sleep(time.Millisecond * 250) @@ -377,8 +390,8 @@ func Test_gitlabTargetManager_Reconcile_Context_Canceled(t *testing.T) { time.Sleep(time.Millisecond * 250) gtm.mu.Lock() - if gtm.Registered() { - t.Fatalf("Reconcile() should not be registered") + if !gtm.Registered() { + t.Fatalf("Reconcile() should still be registered") } gtm.mu.Unlock() } @@ -400,7 +413,11 @@ func Test_gitlabTargetManager_Reconcile_Context_Done(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*10) defer cancel() go func() { - gtm.Reconcile(ctx) + err := gtm.Reconcile(ctx) + if err == nil { + t.Error("Reconcile() should have returned an error") + return + } }() time.Sleep(time.Millisecond * 15) @@ -428,7 +445,11 @@ func Test_gitlabTargetManager_Reconcile_Shutdown(t *testing.T) { ctx := context.Background() go func() { - gtm.Reconcile(ctx) + err := gtm.Reconcile(ctx) + if err != nil { + t.Error("Reconcile() should not have returned an error") + return + } }() time.Sleep(time.Millisecond * 250) @@ -463,7 +484,11 @@ func Test_gitlabTargetManager_Reconcile_Shutdown_Fail_Unregister(t *testing.T) { ctx := context.Background() go func() { - gtm.Reconcile(ctx) + err := gtm.Reconcile(ctx) + if err != nil { + t.Error("Reconcile() should not have returned an error") + return + } }() time.Sleep(time.Millisecond * 250) diff --git a/pkg/sparrow/targets/targetmanager.go b/pkg/sparrow/targets/targetmanager.go index 19c5bd3d..75525cdc 100644 --- a/pkg/sparrow/targets/targetmanager.go +++ b/pkg/sparrow/targets/targetmanager.go @@ -29,7 +29,7 @@ import ( type TargetManager interface { // Reconcile fetches the global targets from the configured // endpoint and updates the local state - Reconcile(ctx context.Context) + Reconcile(ctx context.Context) error // GetTargets returns the current global targets GetTargets() []specs.GlobalTarget // Shutdown shuts down the target manager diff --git a/pkg/sparrow/targets/test/mocktargets.go b/pkg/sparrow/targets/test/mocktargets.go index 0a7835e0..44ce23e2 100644 --- a/pkg/sparrow/targets/test/mocktargets.go +++ b/pkg/sparrow/targets/test/mocktargets.go @@ -12,9 +12,10 @@ type MockTargetManager struct { Targets []specs.GlobalTarget } -func (m *MockTargetManager) Reconcile(ctx context.Context) { +func (m *MockTargetManager) Reconcile(ctx context.Context) error { log := logger.FromContext(ctx) log.Info("MockReconcile called") + return nil } func (m *MockTargetManager) Shutdown(ctx context.Context) error { diff --git a/scripts/gen-docs/gen-docs.go b/scripts/gen-docs/gen-docs.go index a436456a..6180c29a 100644 --- a/scripts/gen-docs/gen-docs.go +++ b/scripts/gen-docs/gen-docs.go @@ -41,7 +41,7 @@ func execute() { rootCmd.AddCommand(NewCmdGenDocs()) if err := rootCmd.Execute(); err != nil { - fmt.Fprintln(os.Stderr, err) + _, _ = fmt.Fprintln(os.Stderr, err) os.Exit(1) } } @@ -68,7 +68,7 @@ func runGenDocs(path *string) func(cmd *cobra.Command, args []string) error { c.DisableAutoGenTag = true return func(cmd *cobra.Command, args []string) error { if err := doc.GenMarkdownTree(c, *path); err != nil { - return fmt.Errorf("Failed to generate docs: %w", err) + return fmt.Errorf("failed to generate docs: %w", err) } return nil }