Workaround for stuck loki-ingester pods #128

Merged · 1 commit · Apr 10, 2024
11 changes: 11 additions & 0 deletions class/defaults.yml
@@ -122,6 +122,17 @@ parameters:
limits:
memory: 1.5Gi

images:
kubectl:
registry: quay.io
repository: appuio/oc
tag: v4.14

workaround:
ingester_fix:
schedule: '0,30 * * * *'
sleep_time: 2m

openshift4_elasticsearch_operator:
targetNamespaces:
- ${openshift4_logging:namespace}
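The new schedule and sleep time can be tuned per cluster through the component's hierarchy parameters. A minimal sketch (the values below are examples, not the defaults):

parameters:
  openshift4_logging:
    workaround:
      ingester_fix:
        # run the check hourly instead of every 30 minutes
        schedule: '0 * * * *'
        # wait five minutes before re-checking a suspect pod
        sleep_time: 5m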
26 changes: 3 additions & 23 deletions component/loki.libsonnet
@@ -3,6 +3,7 @@ local com = import 'lib/commodore.libjsonnet';
local kap = import 'lib/kapitan.libjsonnet';
local kube = import 'lib/kube.libjsonnet';
local po = import 'lib/patch-operator.libsonnet';
local workaround = import 'loki_workaround.libsonnet';

// The hiera parameters for the component
local inv = kap.inventory();
@@ -116,36 +117,15 @@ local aggregate_loki_log_access = kube.ClusterRole('syn:loki:cluster-reader') {
],
};

// Generate missing metrics SA token for Loki Operator.
//
// The ServiceMonitor for the Loki Operator references a SA token secret
// called `loki-operator-controller-manager-metrics-token` which doesn't exist
// on the cluster after the operator is installed or upgraded to 5.8.5 via
// OLM.
local operator_metrics_sa_token =
kube.Secret('loki-operator-controller-manager-metrics-token') {
metadata+: {
// Loki operator is deployed in openshift-operators-redhat
namespace: 'openshift-operators-redhat',
annotations+: {
'kubernetes.io/service-account.name': 'loki-operator-controller-manager-metrics-reader',
// disable argocd prune/delete so removing the workaround should be
// fairly easy in case the Loki Operator OLM install fixes the issue.
'argocd.argoproj.io/sync-options': 'Prune=false,Delete=false',
},
},
data:: {},
type: 'kubernetes.io/service-account-token',
};

// Define outputs below
if loki.enabled then
{
'50_loki_stack': lokistack,
'50_loki_logstore': logstore,
'50_loki_netpol': [ netpol_viewplugin, netpol_lokigateway ],
'50_loki_rbac': [ aggregate_loki_log_access ],
'50_loki_operator_metrics_token': [ operator_metrics_sa_token ],
'50_loki_operator_metrics_token': workaround.missing_metrics_token,
'50_loki_ingester_fix': workaround.ingester_stuck,
}
else
std.trace(
137 changes: 137 additions & 0 deletions component/loki_workaround.libsonnet
@@ -0,0 +1,137 @@
local com = import 'lib/commodore.libjsonnet';
local kap = import 'lib/kapitan.libjsonnet';
local kube = import 'lib/kube.libjsonnet';
local po = import 'lib/patch-operator.libsonnet';

// The hiera parameters for the component
local inv = kap.inventory();
local params = inv.parameters.openshift4_logging;


// Generate missing metrics SA token for Loki Operator.
//
// The ServiceMonitor for the Loki Operator references a SA token secret
// called `loki-operator-controller-manager-metrics-token` which doesn't exist
// on the cluster after the operator is installed or upgraded to 5.8.5 via
// OLM.
local missing_metrics_token =
kube.Secret('loki-operator-controller-manager-metrics-token') {
metadata+: {
// Loki operator is deployed in openshift-operators-redhat
namespace: 'openshift-operators-redhat',
annotations+: {
'kubernetes.io/service-account.name': 'loki-operator-controller-manager-metrics-reader',
// disable argocd prune/delete so removing the workaround should be
// fairly easy in case the Loki Operator OLM install fixes the issue.
'argocd.argoproj.io/sync-options': 'Prune=false,Delete=false',
},
},
data:: {},
type: 'kubernetes.io/service-account-token',
};


// Workaround for stuck loki-ingester.
// To be removed once the upstream issue is fixed.

local ingester_stuck = [
kube.ServiceAccount('loki-ingester-check') {
metadata+: {
namespace: params.namespace,
},
},
kube.Role('loki-ingester-check') {
metadata+: {
namespace: params.namespace,
},
rules: [ {
apiGroups: [ '' ],
resources: [ 'pods', 'pods/exec' ],
verbs: [
'get',
'list',
'watch',
'create',
'delete',
'patch',
'update',
],
} ],
},
kube.RoleBinding('loki-ingester-check') {
metadata+: {
namespace: params.namespace,
},
roleRef: {
apiGroup: 'rbac.authorization.k8s.io',
kind: 'Role',
name: 'loki-ingester-check',
},
subjects: [ {
kind: 'ServiceAccount',
name: 'loki-ingester-check',
} ],
},
kube.ConfigMap('loki-ingester-check') {
metadata+: {
namespace: params.namespace,
},
data: {
'wal-check.sh': importstr 'workaround-scripts/wal-check.sh',
},
},
kube.CronJob('loki-ingester-check') {
metadata+: {
namespace: params.namespace,
},
spec: {
schedule: params.workaround.ingester_fix.schedule,
concurrencyPolicy: 'Forbid',
failedJobsHistoryLimit: 0,
jobTemplate: {
spec: {
activeDeadlineSeconds: 360,
backoffLimit: 1,
template: {
spec: {
containers: [ {
name: 'check-pod',
image: '%(registry)s/%(repository)s:%(tag)s' % params.images.kubectl,
imagePullPolicy: 'IfNotPresent',
command: [ '/usr/local/bin/wal-check.sh' ],
env: [ {
name: 'SLEEP_TIME',
value: params.workaround.ingester_fix.sleep_time,
} ],
ports: [],
stdin: false,
tty: false,
volumeMounts: [ {
mountPath: '/usr/local/bin/wal-check.sh',
name: 'wal-check',
readOnly: true,
subPath: 'wal-check.sh',
} ],
} ],
nodeSelector: { 'node-role.kubernetes.io/infra': '' },
restartPolicy: 'Never',
serviceAccountName: 'loki-ingester-check',
volumes: [ {
name: 'wal-check',
configMap: {
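// 364 decimal corresponds to mode 0554 (r-xr-xr--), so the mounted wal-check.sh is readable and executable.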
defaultMode: 364,
name: 'loki-ingester-check',
},
} ],
},
},
},
},
},
},
];

{
missing_metrics_token: [ missing_metrics_token ],
ingester_stuck: ingester_stuck,
}
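With the defaults from class/defaults.yml above, the image template used by the CronJob container resolves to the appuio/oc image. A small Jsonnet sketch of the formatting, with the values copied from the defaults:

local images = { registry: 'quay.io', repository: 'appuio/oc', tag: 'v4.14' };
// evaluates to 'quay.io/appuio/oc:v4.14'
'%(registry)s/%(repository)s:%(tag)s' % images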
50 changes: 50 additions & 0 deletions component/workaround-scripts/wal-check.sh
@@ -0,0 +1,50 @@
#!/bin/bash

set -e -o pipefail

# Check whether the pod is stuck (phase Running but containers not ready).
function check_pod() {
POD_NAME="loki-ingester-${1}"
echo "checking POD ${POD_NAME}"
PHASE=$(kubectl -n openshift-logging get po "${POD_NAME}" -oyaml | yq '.status.phase')
if [ "${PHASE}" != "Running" ]; then
return 0
fi
READY=$(kubectl -n openshift-logging get po "${POD_NAME}" -oyaml | yq '.status.conditions[] | select(.type == "ContainersReady") | .status')
if [ "${READY}" == "True" ]; then
return 0
fi
return 1
}

# Check the pod's WAL directory; if the checkpoint has no matching WAL segment, remove it and delete the pod.
function check_dir() {
shopt -s extglob
POD_NAME="loki-ingester-${1}"
echo "checking DIR ${POD_NAME}"
DIR_CHP=$(kubectl -n openshift-logging exec -i ${POD_NAME} -- ls /tmp/wal | grep -o "^checkpoint\.[0-9]*$")
PATTERN=$(echo ${DIR_CHP} | sed 's/[^0-9]*//g')
DIR_WAL=$(kubectl -n openshift-logging exec -i ${POD_NAME} -- ls /tmp/wal | grep -o "^0*${PATTERN}$" || exit 0)
if [ -z "${DIR_WAL}" ]; then
kubectl -n openshift-logging exec -i ${POD_NAME} -- rm -rf /tmp/wal/${DIR_CHP}
kubectl -n openshift-logging delete po ${POD_NAME}
fi
}

# Check if a pod stays in a stuck state for longer than ${SLEEP_TIME}.
# Only fix 1 pod at a time and exit immediately once a pod has been fixed.
function fix_pod() {
if ! check_pod $1; then
echo "stuck POD, waiting ${SLEEP_TIME}"
sleep ${SLEEP_TIME}
if ! check_pod $1; then
check_dir $1
exit 0
fi
fi
}

fix_pod 0
fix_pod 1

exit 0
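To exercise the workaround outside the regular schedule, the CronJob can be triggered manually once the manifests are deployed; a sketch (the job name is arbitrary):

kubectl -n openshift-logging create job --from=cronjob/loki-ingester-check loki-ingester-check-manual
kubectl -n openshift-logging logs -f job/loki-ingester-check-manual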