diff --git a/gcpdiag/lint/gke/eol.yaml b/gcpdiag/lint/gke/eol.yaml index 5b51a42fa..3ab333d95 100644 --- a/gcpdiag/lint/gke/eol.yaml +++ b/gcpdiag/lint/gke/eol.yaml @@ -30,3 +30,9 @@ regular_avail: 2024-07-30 stable_avail: 2024-08-13 eol: 2025-09-30 +'1.31': + oss_release: 2024-08-13 + rapid_avail: 2024-08-20 + regular_avail: 2024-09 + stable_avail: 2024-10 + eol: 2025-Q4 diff --git a/gcpdiag/queries/lb.py b/gcpdiag/queries/lb.py index 9e674c70b..3b769b5ff 100644 --- a/gcpdiag/queries/lb.py +++ b/gcpdiag/queries/lb.py @@ -108,8 +108,17 @@ def region(self): return None @property - def used_by(self) -> List[str]: - return self._resource_data.get('usedBy', []) + def used_by_refs(self) -> List[str]: + """Returns usedBy references relative to the Compute v1 API root.""" + used_by = [] + for x in self._resource_data.get('usedBy', []): + reference = x.get('reference') + if reference: + match = re.match(r'https://www\.googleapis\.com/compute/v1/(.*)', + reference) + if match: + used_by.append(match.group(1)) + return used_by @caching.cached_api_call(in_memory=True) @@ -167,8 +176,11 @@ def _generate_health_response_callback( def health_response_callback(request_id, response, exception): del request_id, exception - for health_status in response.get('healthStatus', []): - backend_heath_statuses.append(BackendHealth(health_status, group)) + + # None is returned when backend type doesn't support health check + if response is not None: + for health_status in response.get('healthStatus', []): + backend_heath_statuses.append(BackendHealth(health_status, group)) return health_response_callback diff --git a/gcpdiag/runbook/lb/snapshots/unhealthy_backends.txt b/gcpdiag/runbook/lb/snapshots/unhealthy_backends.txt index d6c56f06a..48ccfad00 100644 --- a/gcpdiag/runbook/lb/snapshots/unhealthy_backends.txt +++ b/gcpdiag/runbook/lb/snapshots/unhealthy_backends.txt @@ -91,6 +91,15 @@ lb/unhealthy-backends: Load Balancer Unhealthy Backends Analyzer. 
This configuration can be problematic unless you have configured the load balancer to use a different port for health checks purposefully. +[AUTOMATED STEP]: Checks if firewall rules are configured correctly. + + - gcpdiag-lb2-aaaa/web-backend-service [FAIL] + [REASON] + The health checks are currently failing due to a misconfigured firewall. This is preventing Google Cloud probers from connecting to your backends, causing the load balancer to consider them unhealthy. + + [REMEDIATION] + Update your firewall rules to allow inbound traffic from the Google Cloud health check IP ranges (found at https://cloud.google.com/load-balancing/docs/health-check-concepts#ip-ranges) to your backends. + [END]: Finalizing unhealthy backends diagnostics... @@ -182,6 +191,12 @@ lb/unhealthy-backends: Load Balancer Unhealthy Backends Analyzer. [REASON] The load balancer is performing health checks on the same port that it is using for serving traffic. This is the standard configuration. +[AUTOMATED STEP]: Checks if firewall rules are configured correctly. + + - gcpdiag-lb2-aaaa/backend-service-2 [OK] + [REASON] + Firewalls are correctly configured and are not blocking the health check probes. + [END]: Finalizing unhealthy backends diagnostics... diff --git a/gcpdiag/runbook/lb/templates/unhealthy_backends.jinja b/gcpdiag/runbook/lb/templates/unhealthy_backends.jinja index 067259794..da2c8ec0d 100644 --- a/gcpdiag/runbook/lb/templates/unhealthy_backends.jinja +++ b/gcpdiag/runbook/lb/templates/unhealthy_backends.jinja @@ -40,6 +40,18 @@ This configuration can be problematic unless you have configured the load balanc The load balancer is performing health checks on the same port that it is using for serving traffic. This is the standard configuration. {% endblock port_mismatch_success_reason %} +{% block firewall_rules_failure_reason %} +The health checks are currently failing due to a misconfigured firewall. 
This is preventing Google Cloud probers from connecting to your backends, causing the load balancer to consider them unhealthy. +{% endblock firewall_rules_failure_reason %} + +{% block firewall_rules_failure_remediation %} +Update your firewall rules to allow inbound traffic from the Google Cloud health check IP ranges (found at https://cloud.google.com/load-balancing/docs/health-check-concepts#ip-ranges) to your backends. +{% endblock firewall_rules_failure_remediation %} + +{% block firewall_rules_success_reason %} +Firewalls are correctly configured and are not blocking the health check probes. +{% endblock firewall_rules_success_reason %} + {% block past_hc_success_uncertain_remediation %} Check the logs and monitoring metrics for those instances, focusing on the given timeframes to see if there were any errors, crashes, or resource exhaustion issues that coincide with the unhealthy transition. You can also inspect any application-specific logs for errors or warnings around the timestamp. 
{% endblock past_hc_success_uncertain_remediation %} diff --git a/gcpdiag/runbook/lb/unhealthy_backends.py b/gcpdiag/runbook/lb/unhealthy_backends.py index 424c115be..9ee70c920 100644 --- a/gcpdiag/runbook/lb/unhealthy_backends.py +++ b/gcpdiag/runbook/lb/unhealthy_backends.py @@ -94,11 +94,8 @@ def build_tree(self): port_check = ValidateBackendServicePortConfiguration() self.add_step(parent=start, child=port_check) - # TODO: restore this step once Recommender API returns - # correct load balancer url - - # firewall_check = VerifyFirewallRules() - # self.add_step(parent=start, child=firewall_check) + firewall_check = VerifyFirewallRules() + self.add_step(parent=start, child=firewall_check) # Ending your runbook self.add_end(UnhealthyBackendsEnd()) @@ -119,9 +116,8 @@ def execute(self): return # Early exit if Compute API is disabled try: - op.info( - f'name: {op.get(flags.BACKEND_SERVICE_NAME)}, region: {op.get(flags.REGION)}' - ) + op.info(f'name: {op.get(flags.BACKEND_SERVICE_NAME)}, region:' + f' {op.get(flags.REGION)}') backend_service = lb.get_backend_service( op.context.project_id, op.get(flags.BACKEND_SERVICE_NAME), @@ -130,10 +126,10 @@ def execute(self): except googleapiclient.errors.HttpError: op.add_skipped( proj, - reason= - (f'Backend service {op.get(flags.BACKEND_SERVICE_NAME)} does not exist in scope' - f' {op.get(flags.REGION, "global")} or project' - f' {op.get(flags.PROJECT_ID)}'), + reason=( + f'Backend service {op.get(flags.BACKEND_SERVICE_NAME)} does not' + f' exist in scope {op.get(flags.REGION, "global")} or project' + f' {op.get(flags.PROJECT_ID)}'), ) return # Early exit if load balancer doesn't exist @@ -183,21 +179,49 @@ def execute(self): ) -# TODO: restore this steps once Recommender API returns correct load balancer url -# class VerifyFirewallRules(runbook.Step): -# """Checks if firewall rules are configured correctly.""" +class VerifyFirewallRules(runbook.Step): + """Checks if firewall rules are configured correctly.""" + + template = 
'unhealthy_backends::firewall_rules' -# def execute(self): -# """Checks if firewall rules are configured correctly.""" -# backend_service = lb.get_backend_service( -# op.context.project_id, -# op.get(flags.BACKEND_SERVICE_NAME), -# op.get(flags.REGION), -# ) -# insights = lb.get_lb_insights_for_a_project(op.context.project_id) -# for insight in insights: -# if insight.is_firewall_rule_insight: -# op.info(insight.details) + def execute(self): + """Checks if firewall rules are configured correctly.""" + backend_service = lb.get_backend_service( + op.context.project_id, + op.get(flags.BACKEND_SERVICE_NAME), + op.get(flags.REGION), + ) + + used_by_refs = backend_service.used_by_refs + insights = lb.get_lb_insights_for_a_project(op.context.project_id) + for insight in insights: + if insight.is_firewall_rule_insight and insight.details.get( + 'loadBalancerUri'): + # network load balancers (backend service is central resource): + if insight.details.get('loadBalancerUri').endswith( + backend_service.full_path): + op.add_failed( + resource=backend_service, + reason=op.prep_msg( + op.FAILURE_REASON, + insight=insight.description, + ), + remediation=op.prep_msg(op.FAILURE_REMEDIATION), + ) + return # Stop at the first matching insight + for ref in used_by_refs: + # application load balancers (url map is central resource): + if insight.details.get('loadBalancerUri').endswith(ref): + op.add_failed( + resource=backend_service, + reason=op.prep_msg( + op.FAILURE_REASON, + insight=insight.description, + ), + remediation=op.prep_msg(op.FAILURE_REMEDIATION), + ) + return # Stop at the first matching insight + op.add_ok(backend_service, reason=op.prep_msg(op.SUCCESS_REASON)) class ValidateBackendServicePortConfiguration(runbook.Step): @@ -367,10 +391,10 @@ def execute(self): else: op.add_skipped( resource=backend_service, - reason= - (f'Unsupported resource type {resource_type} for group {group} ' - f'in backend service {op.get(flags.BACKEND_SERVICE_NAME)} in scope' - f' 
{op.get(flags.REGION, "global")}'), + reason=(f'Unsupported resource type {resource_type} for group' + f' {group} in backend service' + f' {op.get(flags.BACKEND_SERVICE_NAME)} in scope' + f' {op.get(flags.REGION, "global")}'), ) continue serial_log_entries = logs.realtime_query( @@ -599,10 +623,10 @@ def execute(self): else: op.add_skipped( resource=group, - reason= - (f'Unsupported resource type {resource_type} for group {group} ' - f'in backend service {op.get(flags.BACKEND_SERVICE_NAME)} in scope' - f' {op.get(flags.REGION, "global")}'), + reason=(f'Unsupported resource type {resource_type} for group' + f' {group} in backend service' + f' {op.get(flags.BACKEND_SERVICE_NAME)} in scope' + f' {op.get(flags.REGION, "global")}'), ) continue diff --git a/test-data/lb2/json-dumps/lb-insights.json b/test-data/lb2/json-dumps/lb-insights.json index 4e8430421..18ba1952e 100644 --- a/test-data/lb2/json-dumps/lb-insights.json +++ b/test-data/lb2/json-dumps/lb-insights.json @@ -19,8 +19,8 @@ } ] }, - "lastRefreshTime": "2024-08-12T11:55:04.317647338Z", - "observationPeriod": "0s", + "lastRefreshTime": "2024-08-22T07:02:04.248202514Z", + "observationPeriod": "846419.930555176s", "stateInfo": { "state": "ACTIVE" }, @@ -29,7 +29,7 @@ "//cloudresourcemanager.googleapis.com/projects/gcpdiag-lb2-aaaa" ], "insightSubtype": "HEALTH_CHECK_PORT_MISMATCH", - "etag": "\"8d1b2881668276eb\"", + "etag": "\"ae64eeb84d1c424c\"", "severity": "LOW" }, { @@ -38,6 +38,8 @@ "content": { "forwardingRuleUri": "//compute.googleapis.com/projects/gcpdiag-lb2-aaaa/global/forwardingRules/http-content-rule", "loadBalancerType": "EXTERNAL_HTTP_PROXY", + "loadBalancerName": "web-map-http", + "loadBalancerUri": "//compute.googleapis.com/projects/gcpdiag-lb2-aaaa/global/urlMaps/web-map-http", "healthCheckFirewallInfo": { "misconfiguredInstanceUris": [ "//compute.googleapis.com/projects/gcpdiag-lb2-aaaa/zones/us-east1-b/instances/vm-m5q1", @@ -51,8 +53,8 @@ ] } }, - "lastRefreshTime": 
"2024-08-12T11:55:04.459978451Z", - "observationPeriod": "777.078654069s", + "lastRefreshTime": "2024-08-22T07:02:04.282633461Z", + "observationPeriod": "847196.901309079s", "stateInfo": { "state": "ACTIVE" }, @@ -61,7 +63,7 @@ "//compute.googleapis.com/projects/gcpdiag-lb2-aaaa/global/forwardingRules/http-content-rule" ], "insightSubtype": "HEALTH_CHECK_FIREWALL_NOT_CONFIGURED", - "etag": "\"1da24f29cfd9fa23\"", + "etag": "\"41757c97aab16f73\"", "severity": "MEDIUM" } ] diff --git a/website/content/en/runbook/diagnostic-trees/lb/unhealthy-backends.md b/website/content/en/runbook/diagnostic-trees/lb/unhealthy-backends.md index dca21d31d..42ec163ba 100644 --- a/website/content/en/runbook/diagnostic-trees/lb/unhealthy-backends.md +++ b/website/content/en/runbook/diagnostic-trees/lb/unhealthy-backends.md @@ -76,6 +76,8 @@ gcpdiag runbook --help - [Validate Backend Service Port Configuration](/runbook/steps/lb/validate-backend-service-port-configuration) + - [Verify Firewall Rules](/runbook/steps/lb/verify-firewall-rules) + - [Unhealthy Backends End](/runbook/steps/lb/unhealthy-backends-end) diff --git a/website/content/en/runbook/steps/lb/verify-firewall-rules.md b/website/content/en/runbook/steps/lb/verify-firewall-rules.md index 10df9255a..6f65268cb 100644 --- a/website/content/en/runbook/steps/lb/verify-firewall-rules.md +++ b/website/content/en/runbook/steps/lb/verify-firewall-rules.md @@ -14,6 +14,18 @@ description: > None +### Failure Reason + +The health checks are currently failing due to a misconfigured firewall. This is preventing Google Cloud probers from connecting to your backends, causing the load balancer to consider them unhealthy. + +### Failure Remediation + +Update your firewall rules to allow inbound traffic from the Google Cloud health check IP ranges (found at https://cloud.google.com/load-balancing/docs/health-check-concepts#ip-ranges) to your backends. 
+ +### Success Reason + +Firewalls are correctly configured and are not blocking the health check probes. +