Skip to content

Commit

Permalink
Health check troubleshooting runbook: Verify firewall rules step
Browse files Browse the repository at this point in the history
Change-Id: I4e37a93cd9540e3dadb4e3954db7afbb350dfc0f
GitOrigin-RevId: a6c01a2658b4b76c9fd2e3e05e47da3cd6f022d8
  • Loading branch information
Natalia Kaczor authored and copybara-github committed Aug 23, 2024
1 parent bf1caa7 commit d7cb461
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 44 deletions.
6 changes: 6 additions & 0 deletions gcpdiag/lint/gke/eol.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,9 @@
regular_avail: 2024-07-30
stable_avail: 2024-08-13
eol: 2025-09-30
'1.31':
oss_release: 2024-08-13
rapid_avail: 2024-08-20
regular_avail: 2024-09
stable_avail: 2024-10
eol: 2025-Q4
19 changes: 15 additions & 4 deletions gcpdiag/queries/lb.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,16 @@ def region(self):
return None

@property
def used_by(self) -> List[str]:
return self._resource_data.get('usedBy', [])
def used_by_refs(self) -> List[str]:
  """Returns the resources that reference this one, as relative paths.

  Every entry of the resource's `usedBy` list that carries a `reference` URL
  starting with the Compute v1 API base URL is returned with that base URL
  stripped. Entries without a reference, or whose reference points elsewhere,
  are skipped.

  Returns:
    The part of each matching reference URL that follows `compute/v1/`, in
    the order the entries appear in `usedBy`.
  """
  refs = []
  for user in self._resource_data.get('usedBy', []):
    reference = user.get('reference')
    if not reference:
      continue
    # The dots are escaped so that only the literal API host matches; an
    # unescaped '.' would also accept look-alike hosts such as
    # 'wwwXgoogleapis.com'.
    match = re.match(r'https://www\.googleapis\.com/compute/v1/(.*)',
                     reference)
    if match:
      refs.append(match.group(1))
  return refs


@caching.cached_api_call(in_memory=True)
Expand Down Expand Up @@ -167,8 +175,11 @@ def _generate_health_response_callback(

def health_response_callback(request_id, response, exception):
  """Records the backend health statuses contained in one response.

  Each entry of the response's `healthStatus` list is wrapped in a
  `BackendHealth` together with `group` and appended to
  `backend_heath_statuses`.
  NOTE(review): `group` and `backend_heath_statuses` are presumably bound by
  the enclosing function, which is not fully visible here -- confirm.

  Args:
    request_id: Unused.
    response: Mapping that may contain a `healthStatus` list, or None.
    exception: Unused.
  """
  del request_id, exception

  # None is returned when backend type doesn't support health check
  if response is None:
    return
  # Iterate exactly once, and only after the None guard: an unguarded loop
  # would raise on a None response and a second loop would duplicate entries.
  for health_status in response.get('healthStatus', []):
    backend_heath_statuses.append(BackendHealth(health_status, group))

return health_response_callback

Expand Down
15 changes: 15 additions & 0 deletions gcpdiag/runbook/lb/snapshots/unhealthy_backends.txt
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,15 @@ lb/unhealthy-backends: Load Balancer Unhealthy Backends Analyzer.

This configuration can be problematic unless you have configured the load balancer to use a different port for health checks purposefully.

[AUTOMATED STEP]: Checks if firewall rules are configured correctly.

- gcpdiag-lb2-aaaa/web-backend-service [FAIL]
[REASON]
The health checks are currently failing due to a misconfigured firewall. This is preventing Google Cloud probers from connecting to your backends, causing the load balancer to consider them unhealthy.

[REMEDIATION]
Update your firewall rules to allow inbound traffic from the Google Cloud health check IP ranges (found at https://cloud.google.com/load-balancing/docs/health-check-concepts#ip-ranges) to your backends.

[END]: Finalizing unhealthy backends diagnostics...


Expand Down Expand Up @@ -182,6 +191,12 @@ lb/unhealthy-backends: Load Balancer Unhealthy Backends Analyzer.
[REASON]
The load balancer is performing health checks on the same port that it is using for serving traffic. This is the standard configuration.

[AUTOMATED STEP]: Checks if firewall rules are configured correctly.

- gcpdiag-lb2-aaaa/backend-service-2 [OK]
[REASON]
Firewalls are correctly configured and are not blocking the health check probes.

[END]: Finalizing unhealthy backends diagnostics...


12 changes: 12 additions & 0 deletions gcpdiag/runbook/lb/templates/unhealthy_backends.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,18 @@ This configuration can be problematic unless you have configured the load balanc
The load balancer is performing health checks on the same port that it is using for serving traffic. This is the standard configuration.
{% endblock port_mismatch_success_reason %}

{% block firewall_rules_failure_reason %}
The health checks are currently failing due to a misconfigured firewall. This is preventing Google Cloud probers from connecting to your backends, causing the load balancer to consider them unhealthy.
{% endblock firewall_rules_failure_reason %}

{% block firewall_rules_failure_remediation %}
Update your firewall rules to allow inbound traffic from the Google Cloud health check IP ranges (found at https://cloud.google.com/load-balancing/docs/health-check-concepts#ip-ranges) to your backends.
{% endblock firewall_rules_failure_remediation %}

{% block firewall_rules_success_reason %}
Firewalls are correctly configured and are not blocking the health check probes.
{% endblock firewall_rules_success_reason %}

{% block past_hc_success_uncertain_remediation %}
Check the logs and monitoring metrics for those instances, focusing on the given timeframes to see if there were any errors, crashes, or resource exhaustion issues that coincide with the unhealthy transition. You can also inspect any application-specific logs for errors or warnings around the timestamp.
{% endblock past_hc_success_uncertain_remediation %}
Expand Down
92 changes: 58 additions & 34 deletions gcpdiag/runbook/lb/unhealthy_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,8 @@ def build_tree(self):
port_check = ValidateBackendServicePortConfiguration()
self.add_step(parent=start, child=port_check)

# TODO: restore this step once Recommender API returns
# correct load balancer url

# firewall_check = VerifyFirewallRules()
# self.add_step(parent=start, child=firewall_check)
firewall_check = VerifyFirewallRules()
self.add_step(parent=start, child=firewall_check)

# Ending your runbook
self.add_end(UnhealthyBackendsEnd())
Expand All @@ -119,9 +116,8 @@ def execute(self):
return # Early exit if Compute API is disabled

try:
op.info(
f'name: {op.get(flags.BACKEND_SERVICE_NAME)}, region: {op.get(flags.REGION)}'
)
op.info(f'name: {op.get(flags.BACKEND_SERVICE_NAME)}, region:'
f' {op.get(flags.REGION)}')
backend_service = lb.get_backend_service(
op.context.project_id,
op.get(flags.BACKEND_SERVICE_NAME),
Expand All @@ -130,10 +126,10 @@ def execute(self):
except googleapiclient.errors.HttpError:
op.add_skipped(
proj,
reason=
(f'Backend service {op.get(flags.BACKEND_SERVICE_NAME)} does not exist in scope'
f' {op.get(flags.REGION, "global")} or project'
f' {op.get(flags.PROJECT_ID)}'),
reason=(
f'Backend service {op.get(flags.BACKEND_SERVICE_NAME)} does not'
f' exist in scope {op.get(flags.REGION, "global")} or project'
f' {op.get(flags.PROJECT_ID)}'),
)
return # Early exit if load balancer doesn't exist

Expand Down Expand Up @@ -183,21 +179,49 @@ def execute(self):
)


# TODO: restore this steps once Recommender API returns correct load balancer url
# class VerifyFirewallRules(runbook.Step):
# """Checks if firewall rules are configured correctly."""
class VerifyFirewallRules(runbook.Step):
"""Checks if firewall rules are configured correctly."""

template = 'unhealthy_backends::firewall_rules'

# def execute(self):
# """Checks if firewall rules are configured correctly."""
# backend_service = lb.get_backend_service(
# op.context.project_id,
# op.get(flags.BACKEND_SERVICE_NAME),
# op.get(flags.REGION),
# )
# insights = lb.get_lb_insights_for_a_project(op.context.project_id)
# for insight in insights:
# if insight.is_firewall_rule_insight:
# op.info(insight.details)
def execute(self):
  """Checks if firewall rules are configured correctly.

  Looks up the backend service named by the runbook flags and scans the
  project's load balancer insights. The step fails on the first firewall rule
  insight whose `loadBalancerUri` ends with either the backend service's own
  path or one of the resources that reference it; if no insight matches, the
  step is reported as OK.
  """
  backend_service = lb.get_backend_service(
      op.context.project_id,
      op.get(flags.BACKEND_SERVICE_NAME),
      op.get(flags.REGION),
  )

  # URI suffixes that identify this load balancer in an insight:
  # - network load balancers (backend service is central resource)
  # - application load balancers (url map is central resource)
  lb_suffixes = (backend_service.full_path, *backend_service.used_by_refs)

  insights = lb.get_lb_insights_for_a_project(op.context.project_id)
  for insight in insights:
    if not insight.is_firewall_rule_insight:
      continue
    lb_uri = insight.details.get('loadBalancerUri')
    # str.endswith accepts a tuple, so one call covers every suffix.
    if lb_uri and lb_uri.endswith(lb_suffixes):
      op.add_failed(
          resource=backend_service,
          reason=op.prep_msg(
              op.FAILURE_REASON,
              # Both load balancer flavours now pass the same value.
              insight=insight.description,
          ),
          remediation=op.prep_msg(op.FAILURE_REMEDIATION),
      )
      # One matching insight is enough; returning here (rather than just
      # leaving the loop) also skips the OK report below.
      return
  op.add_ok(backend_service, reason=op.prep_msg(op.SUCCESS_REASON))


class ValidateBackendServicePortConfiguration(runbook.Step):
Expand Down Expand Up @@ -367,10 +391,10 @@ def execute(self):
else:
op.add_skipped(
resource=backend_service,
reason=
(f'Unsupported resource type {resource_type} for group {group} '
f'in backend service {op.get(flags.BACKEND_SERVICE_NAME)} in scope'
f' {op.get(flags.REGION, "global")}'),
reason=(f'Unsupported resource type {resource_type} for group'
f' {group} in backend service'
f' {op.get(flags.BACKEND_SERVICE_NAME)} in scope'
f' {op.get(flags.REGION, "global")}'),
)
continue
serial_log_entries = logs.realtime_query(
Expand Down Expand Up @@ -599,10 +623,10 @@ def execute(self):
else:
op.add_skipped(
resource=group,
reason=
(f'Unsupported resource type {resource_type} for group {group} '
f'in backend service {op.get(flags.BACKEND_SERVICE_NAME)} in scope'
f' {op.get(flags.REGION, "global")}'),
reason=(f'Unsupported resource type {resource_type} for group'
f' {group} in backend service'
f' {op.get(flags.BACKEND_SERVICE_NAME)} in scope'
f' {op.get(flags.REGION, "global")}'),
)
continue

Expand Down
14 changes: 8 additions & 6 deletions test-data/lb2/json-dumps/lb-insights.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
}
]
},
"lastRefreshTime": "2024-08-12T11:55:04.317647338Z",
"observationPeriod": "0s",
"lastRefreshTime": "2024-08-22T07:02:04.248202514Z",
"observationPeriod": "846419.930555176s",
"stateInfo": {
"state": "ACTIVE"
},
Expand All @@ -29,7 +29,7 @@
"//cloudresourcemanager.googleapis.com/projects/gcpdiag-lb2-aaaa"
],
"insightSubtype": "HEALTH_CHECK_PORT_MISMATCH",
"etag": "\"8d1b2881668276eb\"",
"etag": "\"ae64eeb84d1c424c\"",
"severity": "LOW"
},
{
Expand All @@ -38,6 +38,8 @@
"content": {
"forwardingRuleUri": "//compute.googleapis.com/projects/gcpdiag-lb2-aaaa/global/forwardingRules/http-content-rule",
"loadBalancerType": "EXTERNAL_HTTP_PROXY",
"loadBalancerName": "web-map-http",
"loadBalancerUri": "//compute.googleapis.com/projects/gcpdiag-lb2-aaaa/global/urlMaps/web-map-http",
"healthCheckFirewallInfo": {
"misconfiguredInstanceUris": [
"//compute.googleapis.com/projects/gcpdiag-lb2-aaaa/zones/us-east1-b/instances/vm-m5q1",
Expand All @@ -51,8 +53,8 @@
]
}
},
"lastRefreshTime": "2024-08-12T11:55:04.459978451Z",
"observationPeriod": "777.078654069s",
"lastRefreshTime": "2024-08-22T07:02:04.282633461Z",
"observationPeriod": "847196.901309079s",
"stateInfo": {
"state": "ACTIVE"
},
Expand All @@ -61,7 +63,7 @@
"//compute.googleapis.com/projects/gcpdiag-lb2-aaaa/global/forwardingRules/http-content-rule"
],
"insightSubtype": "HEALTH_CHECK_FIREWALL_NOT_CONFIGURED",
"etag": "\"1da24f29cfd9fa23\"",
"etag": "\"41757c97aab16f73\"",
"severity": "MEDIUM"
}
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ gcpdiag runbook --help

- [Validate Backend Service Port Configuration](/runbook/steps/lb/validate-backend-service-port-configuration)

- [Verify Firewall Rules](/runbook/steps/lb/verify-firewall-rules)

- [Unhealthy Backends End](/runbook/steps/lb/unhealthy-backends-end)


Expand Down
12 changes: 12 additions & 0 deletions website/content/en/runbook/steps/lb/verify-firewall-rules.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,18 @@ description: >

None

### Failure Reason

The health checks are currently failing due to a misconfigured firewall. This is preventing Google Cloud probers from connecting to your backends, causing the load balancer to consider them unhealthy.

### Failure Remediation

Update your firewall rules to allow inbound traffic from the Google Cloud health check IP ranges (found at https://cloud.google.com/load-balancing/docs/health-check-concepts#ip-ranges) to your backends.

### Success Reason

Firewalls are correctly configured and are not blocking the health check probes.



<!--
Expand Down

0 comments on commit d7cb461

Please sign in to comment.