diff --git a/cmd/fabric/main.go b/cmd/fabric/main.go index 7c68ef4d86..995c8e071e 100644 --- a/cmd/fabric/main.go +++ b/cmd/fabric/main.go @@ -31,6 +31,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/config" + "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/metrics/server" @@ -142,6 +143,14 @@ func run(cmd *cobra.Command, _ []string) error { return fmt.Errorf("unable to create manager: %w", err) } + // Register the healthiness probes. + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + return fmt.Errorf("unable to set up healthz probe: %w", err) + } + if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + return fmt.Errorf("unable to set up readyz probe: %w", err) + } + gwr, err := sourcedetector.NewGatewayReconciler( mgr.GetClient(), mgr.GetScheme(), diff --git a/cmd/gateway/geneve/main.go b/cmd/gateway/geneve/main.go index e8408b3925..736f6df1e8 100644 --- a/cmd/gateway/geneve/main.go +++ b/cmd/gateway/geneve/main.go @@ -25,6 +25,7 @@ import ( "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client/config" + "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/metrics/server" @@ -95,6 +96,14 @@ func run(cmd *cobra.Command, _ []string) error { return fmt.Errorf("unable to create manager: %w", err) } + // Register the healthiness probes. + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + return fmt.Errorf("unable to set up healthz probe: %w", err) + } + if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + return fmt.Errorf("unable to set up readyz probe: %w", err) + } + inr, err := geneve.NewInternalNodeReconciler( mgr.GetClient(), mgr.GetScheme(), diff --git a/cmd/gateway/main.go b/cmd/gateway/main.go index 950f8118ac..eeb23c2bd0 100644 --- a/cmd/gateway/main.go +++ b/cmd/gateway/main.go @@ -29,6 +29,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/config" + "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/metrics/server" @@ -160,6 +161,14 @@ func run(cmd *cobra.Command, _ []string) error { return fmt.Errorf("unable to create manager: %w", err) } + // Register the healthiness probes. + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + return fmt.Errorf("unable to set up healthz probe: %w", err) + } + if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + return fmt.Errorf("unable to set up readyz probe: %w", err) + } + if connoptions.EnableConnectionController { // Setup the connection controller. connr, err := connection.NewConnectionsReconciler( diff --git a/cmd/gateway/wireguard/main.go b/cmd/gateway/wireguard/main.go index 59606401ed..248ea40bea 100644 --- a/cmd/gateway/wireguard/main.go +++ b/cmd/gateway/wireguard/main.go @@ -29,6 +29,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client/config" "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/controller-runtime/pkg/metrics/server" @@ -107,6 +108,14 @@ func run(cmd *cobra.Command, _ []string) error { return fmt.Errorf("unable to create manager: %w", err) } + // Register the healthiness probes. + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + return fmt.Errorf("unable to set up healthz probe: %w", err) + } + if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + return fmt.Errorf("unable to set up readyz probe: %w", err) + } + // Setup the controller. pkr, err := wireguard.NewPublicKeysReconciler( mgr.GetClient(), diff --git a/deployments/liqo/README.md b/deployments/liqo/README.md index 6673be1ab1..723f78b004 100644 --- a/deployments/liqo/README.md +++ b/deployments/liqo/README.md @@ -86,6 +86,8 @@ | networking.enabled | bool | `true` | Use the default Liqo networking module. | | networking.fabric.config.fullMasquerade | bool | `false` | Enabe/Disable the full masquerade mode for the fabric pod. It means that all traffic will be masquerade using the first external cidr IP, instead of using the pod IP. Full masquerade is useful when the cluster nodeports uses a PodCIDR IP to masqerade the incoming traffic. IMPORTANT: Please consider that enabling this feature will masquerade the source IP of traffic towards a remote cluster, making impossible for a pod that receives the traffic to know the original source IP. | | networking.fabric.config.gatewayMasqueradeBypass | bool | `false` | Enable/Disable the masquerade bypass for the gateway pods. It means that the packets from gateway pods will not be masqueraded from the host where the pod is scheduled. This is useful in scenarios where CNIs masquerade the traffic from pod to nodes. For example this is required when using the Azure CNI or Kindnet. | +| networking.fabric.config.healthProbeBindAddressPort | string | `"8081"` | Set the port where the fabric pod will expose the health probe. To disable the health probe, set the port to 0. | +| networking.fabric.config.metricsAddressPort | string | `"8082"` | Set the port where the fabric pod will expose the metrics. To disable the metrics, set the port to 0. | | networking.fabric.config.nftablesMonitor | bool | `true` | Enable/Disable the nftables monitor for the fabric pod. It means that the fabric pod will monitor the nftables rules and will restore them in case of changes. In some cases (like K3S), this monitor can cause a huge amount of CPU usage. If you are experiencing high CPU usage, you can disable this feature. | | networking.fabric.image.name | string | `"ghcr.io/liqotech/fabric"` | Image repository for the fabric pod. | | networking.fabric.image.version | string | `""` | Custom version for the fabric image. If not specified, the global tag is used. | diff --git a/deployments/liqo/templates/liqo-fabric-daemonset.yaml b/deployments/liqo/templates/liqo-fabric-daemonset.yaml index a5da07bc8d..ae47210bc3 100644 --- a/deployments/liqo/templates/liqo-fabric-daemonset.yaml +++ b/deployments/liqo/templates/liqo-fabric-daemonset.yaml @@ -45,6 +45,8 @@ spec: - --podname=$(POD_NAME) - --nodename=$(NODE_NAME) - --geneve-port={{ .Values.networking.genevePort }} + - --health-probe-bind-address=:{{ .Values.networking.fabric.config.healthProbeBindAddressPort}} + - --metrics-address=:{{ .Values.networking.fabric.config.metricsAddressPort}} {{- if not .Values.requirements.kernel.enabled }} - --disable-kernel-version-check {{- end }} @@ -79,6 +81,16 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + {{- if and .Values.networking.fabric.config.healthProbeBindAddressPort (ne .Values.networking.fabric.config.healthProbeBindAddressPort "0") }} + ports: + - name: healthz + containerPort: {{ .Values.networking.fabric.config.healthProbeBindAddressPort }} + protocol: TCP + readinessProbe: + httpGet: + path: /readyz + port: healthz + {{- end }} hostNetwork: true {{- if .Values.networking.fabric.pod.priorityClassName }} priorityClassName: {{ .Values.networking.fabric.pod.priorityClassName }} diff --git a/deployments/liqo/templates/liqo-wireguard-gateway-client-template.yaml b/deployments/liqo/templates/liqo-wireguard-gateway-client-template.yaml index acbf2a3da9..56f6b9b451 100644 --- a/deployments/liqo/templates/liqo-wireguard-gateway-client-template.yaml +++ b/deployments/liqo/templates/liqo-wireguard-gateway-client-template.yaml @@ -78,6 +78,8 @@ spec: ports: - containerPort: 8082 name: gw-metrics + - containerPort: 8083 + name: healthz {{- end }} env: - name: NODE_NAME @@ -94,6 +96,10 @@ spec: add: - NET_ADMIN - NET_RAW + readinessProbe: + httpGet: + path: /readyz + port: healthz - name: wireguard image: {{ .Values.networking.gatewayTemplates.container.wireguard.image.name }}{{ include "liqo.suffix" $wireguardConfig }}:{{ include "liqo.version" $wireguardConfig }} imagePullPolicy: {{ .Values.pullPolicy }} @@ -116,6 +122,8 @@ spec: ports: - containerPort: 8084 name: wg-metrics + - containerPort: 8085 + name: healthz {{- end }} securityContext: capabilities: @@ -130,6 +138,10 @@ spec: mountPath: /ipc - name: wireguard-config mountPath: /etc/wireguard/keys + readinessProbe: + httpGet: + path: /readyz + port: healthz - name: geneve image: {{ .Values.networking.gatewayTemplates.container.geneve.image.name }}{{ include "liqo.suffix" $geneveConfig }}:{{ include "liqo.version" $geneveConfig }} imagePullPolicy: {{ .Values.pullPolicy }} @@ -154,6 +166,8 @@ spec: ports: - containerPort: 8086 name: gv-metrics + - containerPort: 8087 + name: healthz {{- end }} env: - name: NODE_NAME @@ -169,6 +183,10 @@ spec: add: - NET_ADMIN - NET_RAW + readinessProbe: + httpGet: + path: /readyz + port: healthz # Uncomment to set a priorityClassName # priorityClassName: "" volumes: diff --git a/deployments/liqo/templates/liqo-wireguard-gateway-server-template-eks.yaml b/deployments/liqo/templates/liqo-wireguard-gateway-server-template-eks.yaml index a08b51654e..1d81a9cdf9 100644 --- a/deployments/liqo/templates/liqo-wireguard-gateway-server-template-eks.yaml +++ b/deployments/liqo/templates/liqo-wireguard-gateway-server-template-eks.yaml @@ -106,6 +106,13 @@ spec: - containerPort: 8082 name: gw-metrics {{- end }} + ports: + - containerPort: 8083 + name: healthz + readinessProbe: + httpGet: + path: /readyz + port: healthz env: - name: NODE_NAME valueFrom: @@ -143,6 +150,13 @@ spec: - containerPort: 8084 name: wg-metrics {{- end }} + ports: + - containerPort: 8085 + name: healthz + readinessProbe: + httpGet: + path: /readyz + port: healthz securityContext: capabilities: add: @@ -172,7 +186,7 @@ spec: {{- if .Values.metrics.enabled }} - --metrics-address=:8086 {{- end }} - - --health-probe-bind-address=:8086 + - --health-probe-bind-address=:8087 volumeMounts: - name: ipc mountPath: /ipc @@ -181,6 +195,13 @@ spec: - containerPort: 8086 name: gv-metrics {{- end }} + ports: + - containerPort: 8087 + name: healthz + readinessProbe: + httpGet: + path: /readyz + port: healthz env: - name: NODE_NAME valueFrom: diff --git a/deployments/liqo/templates/liqo-wireguard-gateway-server-template.yaml b/deployments/liqo/templates/liqo-wireguard-gateway-server-template.yaml index d4a248548e..85b61f99f5 100644 --- a/deployments/liqo/templates/liqo-wireguard-gateway-server-template.yaml +++ b/deployments/liqo/templates/liqo-wireguard-gateway-server-template.yaml @@ -80,9 +80,9 @@ spec: {{- include "liqo.concatenateMap" $d | nindent 16 }} {{- end }} {{- if .Values.metrics.enabled }} - - --metrics-address=:8084 + - --metrics-address=:8082 {{- end }} - - --health-probe-bind-address=:8085 + - --health-probe-bind-address=:8083 - --ping-enabled=true - --ping-loss-threshold={{ .Values.networking.gatewayTemplates.ping.lossThreshold }} - --ping-interval={{ .Values.networking.gatewayTemplates.ping.interval }} @@ -96,9 +96,15 @@ spec: mountPath: /ipc {{- if .Values.metrics.enabled }} ports: - - containerPort: 8084 + - containerPort: 8082 name: gw-metrics + - containerPort: 8083 + name: healthz {{- end }} + readinessProbe: + httpGet: + path: /readyz + port: healthz env: - name: NODE_NAME valueFrom: @@ -127,15 +133,21 @@ spec: - --mtu={{"{{ .Spec.MTU }}"}} - --listen-port={{"{{ .Spec.Endpoint.Port }}"}} {{- if .Values.metrics.enabled }} - - --metrics-address=:8082 + - --metrics-address=:8084 {{- end }} - - --health-probe-bind-address=:8083 + - --health-probe-bind-address=:8085 - --implementation={{ .Values.networking.gatewayTemplates.wireguard.implementation }} {{- if .Values.metrics.enabled }} ports: - - containerPort: 8082 + - containerPort: 8084 name: wg-metrics + - containerPort: 8085 + name: healthz {{- end }} + readinessProbe: + httpGet: + path: /readyz + port: healthz securityContext: capabilities: add: @@ -173,7 +185,13 @@ spec: ports: - containerPort: 8086 name: gv-metrics + - containerPort: 8087 + name: healthz {{- end }} + readinessProbe: + httpGet: + path: /readyz + port: healthz env: - name: NODE_NAME valueFrom: diff --git a/deployments/liqo/values.yaml b/deployments/liqo/values.yaml index 36da845086..cbf2b32935 100644 --- a/deployments/liqo/values.yaml +++ b/deployments/liqo/values.yaml @@ -124,6 +124,12 @@ networking: # In some cases (like K3S), this monitor can cause a huge amount of CPU usage. # If you are experiencing high CPU usage, you can disable this feature. nftablesMonitor: true + # -- Set the port where the fabric pod will expose the health probe. + # To disable the health probe, set the port to 0. + healthProbeBindAddressPort: "8081" + # -- Set the port where the fabric pod will expose the metrics. + # To disable the metrics, set the port to 0. + metricsAddressPort: "8082" authentication: # -- Enable/Disable the authentication module. diff --git a/pkg/gateway/flags.go b/pkg/gateway/flags.go index 46f670f3e2..e3ba3ae8c8 100644 --- a/pkg/gateway/flags.go +++ b/pkg/gateway/flags.go @@ -108,7 +108,7 @@ func InitFlags(flagset *pflag.FlagSet, opts *Options) { "RetryPeriod for the leader election") flagset.StringVar(&opts.MetricsAddress, FlagNameMetricsAddress.String(), "0", "Address for the metrics endpoint") - flagset.StringVar(&opts.ProbeAddr, FlagNameProbeAddr.String(), ":8081", "Address for the health probe endpoint") + flagset.StringVar(&opts.ProbeAddr, FlagNameProbeAddr.String(), "0", "Address for the health probe endpoint") flagset.BoolVar(&opts.DisableKernelVersionCheck, FlagNameDisableKernelVersionCheck.String(), false, "Disable the kernel version check") flagset.Var(&opts.MinimumKernelVersion, FlagNameMinimumKernelVersion.String(), "Minimum kernel version required by Liqo")