Skip to content

Commit

Permalink
Workaround for stuck loki-ingester pods
Browse files Browse the repository at this point in the history
Sometimes loki-ingester pods get stuck when they are not restarted
cleanly. This workaround checks for ingester pods with long startup
times and removes checkpoint directories that refer to non-existing
WAL indexes in the pod's `/tmp/wal` directory.
  • Loading branch information
DebakelOrakel committed Apr 10, 2024
1 parent 84b2501 commit fde4ebc
Show file tree
Hide file tree
Showing 4 changed files with 632 additions and 0 deletions.
185 changes: 185 additions & 0 deletions component/loki.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,190 @@ local operator_metrics_sa_token =
type: 'kubernetes.io/service-account-token',
};

// Workaround for stuck loki-ingester.
// To be removed, once upstream is fixed.
// -----------------------------------------------------------------------------

local workaround_ingester = [
{
apiVersion: 'v1',
kind: 'ServiceAccount',
metadata: {
labels: {
name: 'loki-ingester-check',
},
name: 'loki-ingester-check',
namespace: 'openshift-logging',
},
},
{
apiVersion: 'rbac.authorization.k8s.io/v1',
kind: 'Role',
metadata: {
labels: {
name: 'loki-ingester-check',
},
name: 'loki-ingester-check',
namespace: 'openshift-logging',
},
rules: [ {
apiGroups: [ '' ],
resources: [ 'pods', 'pods/exec' ],
verbs: [
'get',
'list',
'watch',
'create',
'delete',
'patch',
'update',
],
} ],
},
{
apiVersion: 'rbac.authorization.k8s.io/v1',
kind: 'RoleBinding',
metadata: {
labels: {
name: 'loki-ingester-check',
},
name: 'loki-ingester-check',
namespace: 'openshift-logging',
},
roleRef: {
apiGroup: 'rbac.authorization.k8s.io',
kind: 'Role',
name: 'loki-ingester-check',
},
subjects: [ {
kind: 'ServiceAccount',
name: 'loki-ingester-check',
} ],
},
{
apiVersion: 'v1',
kind: 'ConfigMap',
metadata: {
labels: {
name: 'loki-ingester-check',
},
name: 'loki-ingester-check',
namespace: 'openshift-logging',
},
data: {
'wal-check.sh': |||
#!/bin/bash
function check_pod() {
POD_NAME="loki-ingester-${1}"
echo "checking POD ${POD_NAME}"
PHASE=$(kubectl -n openshift-logging get po ${POD_NAME} -oyaml | yq '.status.phase')
if [ ${PHASE} != "Running" ]; then
return 0
fi
READY=$(kubectl -n openshift-logging get po ${POD_NAME} -oyaml | yq '.status.conditions[] | select(.type == "ContainersReady") | .status')
if [ ${READY} == "True" ]; then
return 0
fi
return 1
}
function check_dir() {
shopt -s extglob
POD_NAME="loki-ingester-${1}"
echo "checking DIR ${POD_NAME}"
DIR_CHP=$(kubectl -n openshift-logging exec -i ${POD_NAME} -- ls /tmp/wal | grep -o "^checkpoint\.[0-9]*$")
PATTERN=$(echo ${DIR_CHP} | sed 's/[^0-9]*//g')
DIR_WAL=$(kubectl -n openshift-logging exec -i ${POD_NAME} -- ls /tmp/wal | grep -o "^0*${PATTERN}$")
if [ -z $DIR_WAL ]; then
kubectl -n openshift-logging exec -i ${POD_NAME} -- rm -rf /tmp/wal/${DIR_CHP}
kubectl -n openshift-logging delete po ${POD_NAME}
fi
}
function fix_pod() {
check_pod $1
STATE=$?
if [ ${STATE} -gt 0 ]; then
echo "stuck POD, waiting ${SLEEP_TIME}"
sleep ${SLEEP_TIME}
check_pod $1
STATE=$?
fi
if [ ${STATE} -gt 0 ]; then
check_dir $1
exit 0
fi
}
fix_pod 0
fix_pod 1
exit 0
|||,
},
},
{
apiVersion: 'batch/v1',
kind: 'CronJob',
metadata: {
labels: {
name: 'loki-ingester-check',
},
name: 'loki-ingester-check',
namespace: 'openshift-logging',
},
spec: {
schedule: '0,30 * * * *',
concurrencyPolicy: 'Forbid',
failedJobsHistoryLimit: 0,
jobTemplate: {
spec: {
activeDeadlineSeconds: 360,
backoffLimit: 1,
template: {
spec: {
containers: [ {
name: 'check-pod',
image: 'quay.io/appuio/oc:v4.14',
imagePullPolicy: 'IfNotPresent',
command: [ '/usr/local/bin/wal-check.sh' ],
env: [ {
name: 'SLEEP_TIME',
value: '2m',
} ],
ports: [],
stdin: false,
tty: false,
volumeMounts: [ {
mountPath: '/usr/local/bin/wal-check.sh',
name: 'wal-check',
readOnly: true,
subPath: 'wal-check.sh',
} ],
} ],
nodeSelector: { 'node-role.kubernetes.io/infra': '' },
restartPolicy: 'Never',
serviceAccountName: 'loki-ingester-check',
volumes: [ {
name: 'wal-check',
configMap: {
defaultMode: 364,
name: 'loki-ingester-check',
},
} ],
},
},
},
},
},
},
];
// -----------------------------------------------------------------------------

// Define outputs below
if loki.enabled then
{
Expand All @@ -146,6 +330,7 @@ if loki.enabled then
'50_loki_netpol': [ netpol_viewplugin, netpol_lokigateway ],
'50_loki_rbac': [ aggregate_loki_log_access ],
'50_loki_operator_metrics_token': [ operator_metrics_sa_token ],
'50_loki_ingester_fix': workaround_ingester,
}
else
std.trace(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
name: loki-ingester-check
name: loki-ingester-check
namespace: openshift-logging
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
name: loki-ingester-check
name: loki-ingester-check
namespace: openshift-logging
rules:
- apiGroups:
- ''
resources:
- pods
- pods/exec
verbs:
- get
- list
- watch
- create
- delete
- patch
- update
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
name: loki-ingester-check
name: loki-ingester-check
namespace: openshift-logging
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: loki-ingester-check
subjects:
- kind: ServiceAccount
name: loki-ingester-check
---
apiVersion: v1
data:
wal-check.sh: |
#!/bin/bash
function check_pod() {
POD_NAME="loki-ingester-${1}"
echo "checking POD ${POD_NAME}"
PHASE=$(kubectl -n openshift-logging get po ${POD_NAME} -oyaml | yq '.status.phase')
if [ ${PHASE} != "Running" ]; then
return 0
fi
READY=$(kubectl -n openshift-logging get po ${POD_NAME} -oyaml | yq '.status.conditions[] | select(.type == "ContainersReady") | .status')
if [ ${READY} == "True" ]; then
return 0
fi
return 1
}
function check_dir() {
shopt -s extglob
POD_NAME="loki-ingester-${1}"
echo "checking DIR ${POD_NAME}"
DIR_CHP=$(kubectl -n openshift-logging exec -i ${POD_NAME} -- ls /tmp/wal | grep -o "^checkpoint\.[0-9]*$")
PATTERN=$(echo ${DIR_CHP} | sed 's/[^0-9]*//g')
DIR_WAL=$(kubectl -n openshift-logging exec -i ${POD_NAME} -- ls /tmp/wal | grep -o "^0*${PATTERN}$")
if [ -z $DIR_WAL ]; then
kubectl -n openshift-logging exec -i ${POD_NAME} -- rm -rf /tmp/wal/${DIR_CHP}
kubectl -n openshift-logging delete po ${POD_NAME}
fi
}
function fix_pod() {
check_pod $1
STATE=$?
if [ ${STATE} -gt 0 ]; then
echo "stuck POD, waiting ${SLEEP_TIME}"
sleep ${SLEEP_TIME}
check_pod $1
STATE=$?
fi
if [ ${STATE} -gt 0 ]; then
check_dir $1
exit 0
fi
}
fix_pod 0
fix_pod 1
exit 0
kind: ConfigMap
metadata:
labels:
name: loki-ingester-check
name: loki-ingester-check
namespace: openshift-logging
---
apiVersion: batch/v1
kind: CronJob
metadata:
labels:
name: loki-ingester-check
name: loki-ingester-check
namespace: openshift-logging
spec:
concurrencyPolicy: Forbid
failedJobsHistoryLimit: 0
jobTemplate:
spec:
activeDeadlineSeconds: 360
backoffLimit: 1
template:
spec:
containers:
- command:
- /usr/local/bin/wal-check.sh
env:
- name: SLEEP_TIME
value: 2m
image: quay.io/appuio/oc:v4.14
imagePullPolicy: IfNotPresent
name: check-pod
ports: []
stdin: false
tty: false
volumeMounts:
- mountPath: /usr/local/bin/wal-check.sh
name: wal-check
readOnly: true
subPath: wal-check.sh
nodeSelector:
node-role.kubernetes.io/infra: ''
restartPolicy: Never
serviceAccountName: loki-ingester-check
volumes:
- configMap:
defaultMode: 364
name: loki-ingester-check
name: wal-check
schedule: 0,30 * * * *
Loading

0 comments on commit fde4ebc

Please sign in to comment.