diff --git a/.github/workflows/build-test-dev.yml b/.github/workflows/build-test-dev.yml
index 0812ed3..bf8b94b 100644
--- a/.github/workflows/build-test-dev.yml
+++ b/.github/workflows/build-test-dev.yml
@@ -39,7 +39,7 @@ jobs:
- name: Upload Test coverage Reports
if: ${{ always() }}
- uses: actions/upload-artifact@v2
+ uses: actions/upload-artifact@v3
with:
name: code-coverage-report
path: |
diff --git a/.github/workflows/prep-release.yml b/.github/workflows/prep-release.yml
index 7a154d8..6083856 100644
--- a/.github/workflows/prep-release.yml
+++ b/.github/workflows/prep-release.yml
@@ -4,7 +4,7 @@ on:
release_tag:
description: 'Release tag'
required: true
- default: '1.0.0-dev'
+ default: '1.0.1-dev'
prep_internal_release:
# Need to distinguish between internal and external releases
# Internal release: Will use default internal location for created images (ghcr.io) and will tag and push operator candidate there
diff --git a/Dockerfile b/Dockerfile
index 9a2a284..870d386 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -23,8 +23,8 @@ FROM registry.access.redhat.com/ubi8/ubi-minimal:latest
LABEL name="solace/pubsubplus-eventbroker-operator"
LABEL vendor="Solace Corporation"
-LABEL version="1.0.0"
-LABEL release="1.0.0"
+LABEL version="1.0.1"
+LABEL release="1.0.1"
LABEL summary="Solace PubSub+ Event Broker Kubernetes Operator"
LABEL description="The Solace PubSub+ Event Broker Kubernetes Operator deploys and manages the lifecycle of PubSub+ Event Brokers"
diff --git a/Makefile b/Makefile
index 495bd91..4642dd3 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@
# To re-generate a bundle for another specific version without changing the standard setup, you can:
# - use the VERSION as arg of the bundle target (e.g make bundle VERSION=0.0.2)
# - use environment variables to overwrite this value (e.g export VERSION=0.0.2)
-VERSION ?= 1.0.0-dev
+VERSION ?= 1.0.1-dev
# API_VERSION defines the API version for the PubSubPlusEventBroker CRD
API_VERSION ?= v1beta1
diff --git a/bundle/manifests/pubsubplus-eventbroker-operator.clusterserviceversion.yaml b/bundle/manifests/pubsubplus-eventbroker-operator.clusterserviceversion.yaml
index 4cbd579..356145d 100644
--- a/bundle/manifests/pubsubplus-eventbroker-operator.clusterserviceversion.yaml
+++ b/bundle/manifests/pubsubplus-eventbroker-operator.clusterserviceversion.yaml
@@ -20,8 +20,8 @@ metadata:
certified: "true"
com.redhat.delivery.operator.bundle: "true"
com.redhat.openshift.versions: v4.10
- containerImage: docker.io/solace/pubsubplus-eventbroker-operator:1.0.0
- createdAt: "2023-04-19T16:00:24Z"
+ containerImage: docker.io/solace/pubsubplus-eventbroker-operator:1.0.1
+ createdAt: "2023-09-13T10:40:30Z"
description: The Solace PubSub+ Event Broker Operator deploys and manages the
lifecycle of PubSub+ Event Brokers
operators.openshift.io/valid-subscription: '[]'
@@ -29,7 +29,7 @@ metadata:
operators.operatorframework.io/project_layout: go.kubebuilder.io/v3
repository: https://github.com/SolaceProducts/pubsubplus-kubernetes-quickstart
support: Solace Products
- name: pubsubplus-eventbroker-operator.v1.0.0
+ name: pubsubplus-eventbroker-operator.v1.0.1
namespace: placeholder
spec:
apiservicedefinitions: {}
@@ -296,7 +296,7 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.annotations['olm.targetNamespaces']
- image: docker.io/solace/pubsubplus-eventbroker-operator:1.0.0
+ image: docker.io/solace/pubsubplus-eventbroker-operator:1.0.1
imagePullPolicy: Always
livenessProbe:
httpGet:
@@ -411,4 +411,4 @@ spec:
provider:
name: Solace Corporation
url: www.solace.com
- version: 1.0.0
+ version: 1.0.1
diff --git a/bundle/manifests/pubsubplus.solace.com_pubsubpluseventbrokers.yaml b/bundle/manifests/pubsubplus.solace.com_pubsubpluseventbrokers.yaml
index 9cacbb3..a7b5141 100644
--- a/bundle/manifests/pubsubplus.solace.com_pubsubpluseventbrokers.yaml
+++ b/bundle/manifests/pubsubplus.solace.com_pubsubpluseventbrokers.yaml
@@ -4,7 +4,7 @@ metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.11.3
labels:
- app.kubernetes.io/version: v1.0.0
+ app.kubernetes.io/version: v1.0.1
name: pubsubpluseventbrokers.pubsubplus.solace.com
spec:
group: pubsubplus.solace.com
diff --git a/bundle/metadata/annotations.yaml b/bundle/metadata/annotations.yaml
index d4ab5aa..fb59147 100644
--- a/bundle/metadata/annotations.yaml
+++ b/bundle/metadata/annotations.yaml
@@ -15,4 +15,4 @@ annotations:
operators.operatorframework.io.test.config.v1: tests/scorecard/
# Required by RedHat certification
- com.redhat.openshift.versions: "v4.10"
+ com.redhat.openshift.versions: "v4.11"
diff --git a/ci/whitesource/whitesource-agent.config b/ci/whitesource/whitesource-agent.config
index 7945775..1a776b4 100644
--- a/ci/whitesource/whitesource-agent.config
+++ b/ci/whitesource/whitesource-agent.config
@@ -45,7 +45,7 @@ projectVersion=
projectToken=
productName=pubsubplus-kubernetes-operator
-productVersion=v1.0.0
+productVersion=v1.0.1
productToken=
updateType=OVERRIDE
#requesterEmail=user@provider.com
diff --git a/controllers/brokerscripts/init.sh b/controllers/brokerscripts/init.sh
index 96f3256..36e7d09 100644
--- a/controllers/brokerscripts/init.sh
+++ b/controllers/brokerscripts/init.sh
@@ -15,15 +15,15 @@ if [ "${BROKER_TLS_ENABLED}" = "true" ]; then
cat /mnt/disks/certs/server/${BROKER_CERT_FILENAME} /mnt/disks/certs/server/${BROKER_CERTKEY_FILENAME} > /dev/shm/server.cert
export tls_servercertificate_filepath="/dev/shm/server.cert"
fi
+# Deal with the fact we cannot accept "-" in router names
+export routername=$(echo $(hostname) | sed 's/-//g')
if [ "${BROKER_REDUNDANCY}" = "true" ]; then
IFS='-' read -ra host_array <<< $(hostname)
is_monitor=$([ ${host_array[-2]} = "m" ] && echo 1 || echo 0)
is_backup=$([ ${host_array[-2]} = "b" ] && echo 1 || echo 0)
namespace=$(echo $STATEFULSET_NAMESPACE)
service=${BROKERSERVICES_NAME}
- # Deal with the fact we cannot accept "-" in broker names
service_name=$(echo ${service} | sed 's/-//g')
- export routername=$(echo $(hostname) | sed 's/-//g')
export redundancy_enable=yes
export configsync_enable=yes
export redundancy_authentication_presharedkey_key=$(cat /mnt/disks/secrets/presharedauthkey/preshared_auth_key | base64)
@@ -37,18 +37,18 @@ if [ "${BROKER_REDUNDANCY}" = "true" ]; then
# Non Monitor Nodes
if [ "${is_monitor}" = "0" ]; then
- case ${is_backup} in
- 0)
- export nodetype=message_routing
- export redundancy_matelink_connectvia=${service}-b-0.${service}-discovery.${namespace}.svc
- export redundancy_activestandbyrole=primary
- ;;
- 1)
- export nodetype=message_routing
- export redundancy_matelink_connectvia=${service}-p-0.${service}-discovery.${namespace}.svc
- export redundancy_activestandbyrole=backup
- ;;
- esac
+ case ${is_backup} in
+ 0)
+ export nodetype=message_routing
+ export redundancy_matelink_connectvia=${service}-b-0.${service}-discovery.${namespace}.svc
+ export redundancy_activestandbyrole=primary
+ ;;
+ 1)
+ export nodetype=message_routing
+ export redundancy_matelink_connectvia=${service}-p-0.${service}-discovery.${namespace}.svc
+ export redundancy_activestandbyrole=backup
+ ;;
+ esac
else
export nodetype=monitoring
fi
diff --git a/controllers/brokerscripts/readiness_check.sh b/controllers/brokerscripts/readiness_check.sh
index f80ecf7..6e4827c 100644
--- a/controllers/brokerscripts/readiness_check.sh
+++ b/controllers/brokerscripts/readiness_check.sh
@@ -19,28 +19,39 @@ set_label () {
#Prevent overdriving Kubernetes infra, don't set activity state to same as previous state
previous_state=$(get_label "active")
if [ "${2}" = "${previous_state}" ]; then
- #echo "$(date) INFO: ${APP}-Current and Previous state match (${2}), not updating pod label"
- :
+ #echo "$(date) INFO: ${APP}-Current and Previous state match (${2}), not updating pod label"
+ :
else
- echo "$(date) INFO: ${APP}-Updating pod label using K8s API from ${previous_state} to ${2}"
- echo "[{\"op\": \"add\", \"path\": \"/metadata/labels/${1}\", \"value\": \"${2}\" }]" > /tmp/patch_label.json
- K8S=https://kubernetes.default.svc.cluster.local:$KUBERNETES_SERVICE_PORT
- KUBE_TOKEN=$(&2
- rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
+ echo "$(date) INFO: ${APP}-Updating pod label using K8s API from ${previous_state} to ${2}"
+ echo "[{\"op\": \"add\", \"path\": \"/metadata/labels/${1}\", \"value\": \"${2}\" }]" > /tmp/patch_label.json
+ K8S=https://kubernetes.default.svc.cluster.local:$KUBERNETES_SERVICE_PORT
+ KUBE_TOKEN=$(&2
+ rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
+ fi
fi
fi
- fi
+}
+
+
+# Print one property of the first remote config-sync table's source-router, as extracted by semp_query.sh
+get_router_remote_config_state() {
+  # Params: $1 is the source-router property to return (the caller in this script passes "name")
+  routerresults=$(/mnt/disks/solace/semp_query.sh -n admin -p "${password}" -u http://localhost:8080 \
+    -q "" \
+    -v "/rpc-reply/rpc/show/config-sync/database/remote/tables/table[1]/source-router/${1}")
+  routerremotesync_result=$(echo "${routerresults}" | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
+  echo $routerremotesync_result  # left unquoted so whitespace in the value is still collapsed as before
}
# Main logic: note that there are no re-tries here, if check fails then return not ready.
@@ -50,157 +61,159 @@ if [ "${BROKER_REDUNDANCY}" = "true" ]; then
is_monitor=$([ ${host_array[-2]} = "m" ] && echo 1 || echo 0)
is_backup=$([ ${host_array[-2]} = "b" ] && echo 1 || echo 0)
password=$(cat /mnt/disks/secrets/admin/username_admin_password)
- # For update (includes SolOS upgrade) purposes, additional checks are required for readiness state when the pod has been started
- # This is an update if the LASTVERSION_FILE with K8s controller-revision-hash exists and contents differ from current value
- LASTVERSION_FILE=/var/lib/solace/var/lastConfigRevisionBeforeReboot
- if [ ! -f ${LASTVERSION_FILE} ] || [[ $(cat ${LASTVERSION_FILE}) != $(get_label "controller-revision-hash") ]] ; then
- echo "$(date) INFO: ${APP}-Initial startup or Upgrade detected, running additional checks..."
- # Check redundancy
- echo "$(date) INFO: ${APP}-Running checks. Redundancy state check started..."
- results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
- -q "" \
- -v "/rpc-reply/rpc/show/redundancy/redundancy-status")
- redundancystatus_results=$(echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
- if [ "${redundancystatus_results}" != "Up" ]; then
- echo "$(date) INFO: ${APP}-Redundancy state is not yet up."
- rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
- fi
- # Additionally check config-sync status for non-monitoring nodes
- echo "$(date) INFO: ${APP}-Running checks. Config-sync state check started..."
- if [ "${is_monitor}" = "0" ]; then
- results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
- -q "" \
- -v "/rpc-reply/rpc/show/config-sync/status/oper-status")
- confsyncstatus_results=$(echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
- if [ "${confsyncstatus_results}" != "Up" ]; then
- echo "$(date) INFO: ${APP}-Config-sync state is not yet up."
- rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
- fi
- fi
- fi
- # Record current version in LASTVERSION_FILE
- echo $(get_label "controller-revision-hash") > ${LASTVERSION_FILE}
# For monitor node just check for redundancy; active label will never be set
if [ "${is_monitor}" = "1" ]; then
- # Check redundancy
- echo "$(date) INFO: ${APP}-Running checks. Redundancy state check started..."
- results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
- -q "" \
- -v "/rpc-reply/rpc/show/redundancy/redundancy-status")
- redundancystatus_results=$(echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
- if [ "${redundancystatus_results}" != "Up" ]; then
- echo "$(date) INFO: ${APP}-Redundancy state is not yet up."
- rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
- fi
- if [ ! -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then
- echo "$(date) INFO: ${APP}-All nodes online, monitor node is redundancy ready"
- touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}
- echo "$(date) INFO: ${APP}-Server status check complete for this broker node"
- exit 1
- fi
- exit 0
+ # Check redundancy
+ results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ -q "" \
+ -v "/rpc-reply/rpc/show/redundancy/redundancy-status")
+ redundancystatus_results=$(echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
+ if [ "${redundancystatus_results}" != "Up" ]; then
+ echo "$(date) INFO: ${APP}-Waiting for redundancy up, redundancy state is not yet up."
+ rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
+ fi
+ if [ ! -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then
+ echo "$(date) INFO: ${APP}-All nodes online, monitor node is redundancy ready"
+ touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}
+ echo "$(date) INFO: ${APP}-Server status check complete for this broker node"
+ exit 1
+ fi
+ exit 0
fi # End Monitor Node
+ # From here on, only message routing nodes are handled.
# For Primary or Backup nodes set both service readiness (active label) and k8s readiness (exit return value)
health_result=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:5550/health-check/guaranteed-active)
case "${health_result}" in
- "200")
- if [ ! -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then
- echo "$(date) INFO: ${APP}-HA Event Broker health check reported 200, message spool is up"
- touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}
- echo "$(date) INFO: ${APP}-Server status check complete for this broker node"
- echo "$(date) INFO: ${APP}-Changing pod label to active"
- #exit 1 Removing as this may delay activity switch by 5 seconds
- fi
- set_label "active" "true"
- exit 0
- ;;
- "503")
- if [[ $(get_label "active") = "true" ]]; then echo "$(date) INFO: ${APP}-HA Event Broker health check reported 503"; fi
- set_label "active" "false"
- # Further check is required to determine readiness
- ;;
- *)
- echo "$(date) WARN: ${APP}-HA Event Broker health check reported unexpected ${health_result}"
- set_label "active" "false"
- echo "$(date) INFO: ${APP}-Changing pod label to inactive"
- rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
+ "200")
+ if [ ! -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then
+ echo "$(date) INFO: ${APP}-HA Event Broker health check reported 200, message spool is up"
+ touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}
+ echo "$(date) INFO: ${APP}-Server status check complete for this broker node"
+ echo "$(date) INFO: ${APP}-Changing pod label to active"
+ #exit 1 Removing as this may delay activity switch by 5 seconds
+ fi
+ set_label "active" "true"
+ exit 0
+ ;;
+ "503")
+ if [[ $(get_label "active") = "true" ]]; then echo "$(date) INFO: ${APP}-HA Event Broker health check reported 503"; fi
+ set_label "active" "false"
+ # Further check is required to determine readiness
+ ;;
+ *)
+ echo "$(date) WARN: ${APP}-HA Event Broker health check reported unexpected ${health_result}"
+ set_label "active" "false"
+ echo "$(date) INFO: ${APP}-Changing pod label to inactive"
+ rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
esac
# At this point analyzing readiness after health check returned 503 - checking if Event Broker is Standby
case "${is_backup}" in
- "0")
- config_role="primary"
- ;;
- "1")
- config_role="backup"
- ;;
+ "0")
+ config_role="primary"
+ ;;
+ "1")
+ config_role="backup"
+ ;;
esac
online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
- -q "" \
- -v "/rpc-reply/rpc/show/redundancy/virtual-routers/${config_role}/status/activity[text()]")
+ -q "" \
+ -v "/rpc-reply/rpc/show/redundancy/virtual-routers/${config_role}/status/activity[text()]")
local_activity=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
case "${local_activity}" in
- "Mate Active")
- # Check redundancy
- results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
- -q "" \
- -v "/rpc-reply/rpc/show/redundancy/redundancy-status")
- redundancystatus_results=$(echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
- if [ "${redundancystatus_results}" != "Up" ]; then
- echo "$(date) INFO: ${APP}-Running checks.Redundancy state is not yet up."
- rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
- fi
- # Additionally check config-sync status for non-monitoring nodes
- if [ "${node_ordinal}" != "2" ]; then
- results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
- -q "" \
- -v "/rpc-reply/rpc/show/config-sync/status/oper-status")
- confsyncstatus_results=$(echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
- if [ "${confsyncstatus_results}" != "Up" ]; then
- echo "$(date) INFO: ${APP}-Running checks. Config-sync state is not yet up."
+ "Mate Active")
+ # Check redundancy
+ results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ -q "" \
+ -v "/rpc-reply/rpc/show/redundancy/redundancy-status")
+ redundancystatus_results=$(echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
+ if [ "${redundancystatus_results}" != "Up" ]; then
+ echo "$(date) INFO: ${APP}-Running checks.Redundancy state is not yet up."
+ rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
+ fi
+ # Check config-sync status
+ results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ -q "" \
+ -v "/rpc-reply/rpc/show/config-sync/status/oper-status")
+ confsyncstatus_results=$(echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
+ if [ "${confsyncstatus_results}" != "Up" ]; then
+
+ # Additional check to confirm config-sync
+ echo "$(date) INFO: ${APP}-Checking Config-sync Setup. Starting additional checks to confirm config-sync locally..."
+
+ messagevpn_result=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ -q "" \
+ -v "count(/rpc-reply/rpc/show/config-sync/database/local/tables/table)")
+ messagevpn_total=$(echo ${messagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
+
+ # Count message_vpns in-sync and compare with total
+ localmessagevpn_result=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ -q "" \
+ -v "count(//table[sync-state='In-Sync'])")
+ local_messagevpn_total_insync=$(echo ${localmessagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
+ if ! [ "$messagevpn_total" -eq "$local_messagevpn_total_insync" ]; then  # fail closed: an empty or non-numeric count makes the test error out, which must also mean "not ready"
+   echo "$(date) INFO: ${APP}-Config-sync state is not in-sync locally."
+   rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
+ fi
+
+ echo "$(date) INFO: ${APP}-Checking Config-sync Setup. Remote config-sync state check starting..."
+ vpnremotehamate_result=$(get_router_remote_config_state "name")
+
+ remote_messagevpn_result=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ -q "" \
+ -v "count(//table/source-router[name='$vpnremotehamate_result'])")
+ remote_messagevpn_total=$(echo ${remote_messagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
+
+ #Count message_vpns in-sync, not stale and compare with total
+ remotemessagevpn_result=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ -q "" \
+ -v "count(//table/source-router[name='$vpnremotehamate_result' and sync-state='In-Sync' and stale='No'])")
+ remote_messagevpn_total_insync=$(echo ${remotemessagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
+ if ! [ "$remote_messagevpn_total" -eq "$remote_messagevpn_total_insync" ]; then  # fail closed: an empty or non-numeric count makes the test error out, which must also mean "not ready"
+   echo "$(date) INFO: ${APP}-Config-sync state is not in-sync for remote."
+   rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
+ fi
+ fi
+ # Pass readiness check
+ if [ ! -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then
+ echo "$(date) INFO: ${APP}-Redundancy is up and node is Mate Active"
+ touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}
+ echo "$(date) INFO: ${APP}-Server status check complete for this broker node"
+ exit 1
+ fi
+ exit 0
+ ;;
+ *)
+ echo "$(date) WARN: ${APP}-Health check returned 503 and local activity state is: ${local_activity}, failing readiness check."
rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
- fi
- fi
- # Pass readiness check
- if [ ! -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then
- echo "$(date) INFO: ${APP}-Redundancy is up and node is mate Active"
- touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}
- echo "$(date) INFO: ${APP}-Server status check complete for this broker node"
- exit 1
- fi
- exit 0
- ;;
- *)
- echo "$(date) WARN: ${APP}-Health check returned 503 and local activity state is: ${local_activity}, failing readiness check."
- rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
- ;;
+ ;;
esac
else
# nonHA config
health_result=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:5550/health-check/guaranteed-active)
case "${health_result}" in
- "200")
- if [ ! -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then
- echo "$(date) INFO: ${APP}-nonHA Event Broker health check reported 200, message spool is up"
- touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}
- echo "$(date) INFO: ${APP}-Server status check complete for this broker node"
- echo "$(date) INFO: ${APP}-Changing pod label to active"
- exit 1
- fi
- set_label "active" "true"
- exit 0
- ;;
- "503")
- if [[ $(get_label "active") = "true" ]]; then echo "$(date) INFO: ${APP}-nonHA Event Broker health check reported 503, message spool is down"; fi
- set_label "active" "false"
- echo "$(date) INFO: ${APP}-Changing pod label to inactive"
- # Fail readiness check
- rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
- ;;
- *)
- echo "$(date) WARN: ${APP}-nonHA Event Broker health check reported ${health_result}"
- set_label "active" "false"
- echo "$(date) INFO: ${APP}-Changing pod label to inactive"
- # Fail readiness check
- rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
+ "200")
+ if [ ! -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then
+ echo "$(date) INFO: ${APP}-nonHA Event Broker health check reported 200, message spool is up"
+ touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}
+ echo "$(date) INFO: ${APP}-Server status check complete for this broker node"
+ echo "$(date) INFO: ${APP}-Changing pod label to active"
+ exit 1
+ fi
+ set_label "active" "true"
+ exit 0
+ ;;
+ "503")
+ if [[ $(get_label "active") = "true" ]]; then echo "$(date) INFO: ${APP}-nonHA Event Broker health check reported 503, message spool is down"; fi
+ set_label "active" "false"
+ echo "$(date) INFO: ${APP}-Changing pod label to inactive"
+ # Fail readiness check
+ rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
+ ;;
+ *)
+ echo "$(date) WARN: ${APP}-nonHA Event Broker health check reported ${health_result}"
+ set_label "active" "false"
+ echo "$(date) INFO: ${APP}-Changing pod label to inactive"
+ # Fail readiness check
+ rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
esac
fi
\ No newline at end of file
diff --git a/controllers/brokerscripts/semp_query.sh b/controllers/brokerscripts/semp_query.sh
index 60ca37e..7b8887c 100644
--- a/controllers/brokerscripts/semp_query.sh
+++ b/controllers/brokerscripts/semp_query.sh
@@ -39,7 +39,7 @@ if [[ ${url} = "" || ${username} = "" || ${password} = "" ]]; then
echo 'missing parameter'
exit 1
fi
-if [ "$(curl --write-out '%{http_code}' --silent --output /dev/null -u ${username}:${password} ${url}/SEMP)" != "200" ] ; then
+if [ "$(curl --write-out '%{http_code}' --silent --output /dev/null -u ${username}:${password} ${url}/SEMP -d '')" != "200" ] ; then
echo "management host is not responding"
exit 1
fi
diff --git a/controllers/brokerscripts/startup-broker.sh b/controllers/brokerscripts/startup-broker.sh
index 891aa74..f9c5cbd 100644
--- a/controllers/brokerscripts/startup-broker.sh
+++ b/controllers/brokerscripts/startup-broker.sh
@@ -7,12 +7,13 @@ echo "$(date) INFO: ${APP}-PubSub+ broker node starting. HA flags: HA_configured
echo "$(date) INFO: ${APP}-Waiting for management API to become available"
password=$(cat /mnt/disks/secrets/admin/username_admin_password)
INITIAL_STARTUP_FILE=/var/lib/solace/var/k8s_initial_startup_marker
-loop_guard=120
+loop_guard=60
pause=10
count=0
-while [ ${count} -lt ${loop_guard} ]; do
+# Wait for Solace Management API
+while [ ${count} -lt ${loop_guard} ]; do
if /mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 -t ; then
- break
+ break
fi
run_time=$((${count} * ${pause}))
((count++))
@@ -26,158 +27,205 @@ fi
if [ "${BROKER_TLS_ENABLED}" = "true" ]; then
rm /dev/shm/server.cert # remove as soon as possible
cert_results=$(curl --write-out '%{http_code}' --silent --output /dev/null -k -X PATCH -u admin:${password} https://localhost:1943/SEMP/v2/config/ \
- -H "content-type: application/json" \
- -d "{\"tlsServerCertContent\":\"$(cat /mnt/disks/certs/server/${BROKER_CERT_FILENAME} /mnt/disks/certs/server/${BROKER_CERTKEY_FILENAME} | awk '{printf "%s\\n", $0}')\"}")
+ -H "content-type: application/json" \
+ -d "{\"tlsServerCertContent\":\"$(cat /mnt/disks/certs/server/${BROKER_CERT_FILENAME} /mnt/disks/certs/server/${BROKER_CERTKEY_FILENAME} | awk '{printf "%s\\n", $0}')\"}")
if [ "${cert_results}" != "200" ]; then
- echo "$(date) ERROR: ${APP}-Unable to set the server certificate, exiting" >&2
- exit 1
+ echo "$(date) ERROR: ${APP}-Unable to set the server certificate, exiting" >&2
+ exit 1
fi
echo "$(date) INFO: ${APP}-Server certificate has been configured"
- # Future improvement: enable CA configuration from secret ca.crt
fi
if [ "${BROKER_REDUNDANCY}" = "true" ]; then
+ # Print one property of the first remote config-sync table's source-router, as extracted by semp_query.sh
+ get_router_remote_config_state() {
+   # Params: $1 is the source-router property to return
+   routerresults=$(/mnt/disks/solace/semp_query.sh -n admin -p "${password}" -u http://localhost:8080 \
+     -q "" \
+     -v "/rpc-reply/rpc/show/config-sync/database/remote/tables/table[1]/source-router/${1}")
+   routerremotesync_result=$(echo "${routerresults}" | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
+   echo $routerremotesync_result  # left unquoted so whitespace in the value is still collapsed as before
+ }
# for non-monitor nodes setup redundancy and config-sync
if [ "${is_monitor}" = "0" ]; then
- resync_step_required=""
- role=""
- count=0
- while [ ${count} -lt ${loop_guard} ]; do
- role_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
- -q "" \
- -v "/rpc-reply/rpc/show/redundancy/active-standby-role[text()]")
- run_time=$((${count} * ${pause}))
- case "$(echo ${role_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)" in
- "Primary")
- role="primary"
- break
- ;;
- "Backup")
- role="backup"
- break
- ;;
- esac
- ((count++))
- echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, got ${role_results} for this node's active-standby role"
- sleep ${pause}
- done
- if [ ${count} -eq ${loop_guard} ]; then
- echo "$(date) ERROR: ${APP}-Could not determine this node's active-standby role" >&2
- exit 1
- fi
- # Determine local activity
- count=0
- echo "$(date) INFO: ${APP}-Management API is up, determined that this node's active-standby role is: ${role}"
- while [ ${count} -lt ${loop_guard} ]; do
- online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
- -q "" \
- -v "/rpc-reply/rpc/show/redundancy/virtual-routers/${role}/status/activity[text()]")
- local_activity=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
- run_time=$((${count} * ${pause}))
- case "${local_activity}" in
- "Local Active")
- echo "$(date) INFO: ${APP}-Node activity status is Local Active, after ${run_time} seconds"
- # We should only be here on new cluster create, if not this is an indication of unexpected HA procedures
- if [[ ! -e ${INITIAL_STARTUP_FILE} ]]; then
- # Need to issue assert master to get back into sync only one time when the PubSub+ Event Broker starts the first time
- echo "$(date) INFO: ${APP}-Broker initial startup detected. This node will assert config-sync configuration over its mate"
- resync_step_required="true"
- else
- echo "$(date) WARN: ${APP}-Unexpected state: this is not an initial startup of the broker and this node reports Local Active. Normally expected nodes are Mate Active after restart"
- fi
+ resync_step_required=""
+ role=""
+ count=0
+ # Determine node's primary or backup role
+ while [ ${count} -lt ${loop_guard} ]; do
+ role_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ -q "" \
+ -v "/rpc-reply/rpc/show/redundancy/active-standby-role[text()]")
+ run_time=$((${count} * ${pause}))
+ case "$(echo ${role_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)" in
+ "Primary")
+ role="primary"
break
;;
- "Mate Active")
- echo "$(date) INFO: ${APP}-Node activity status is Mate Active, after ${run_time} seconds"
+ "Backup")
+ role="backup"
break
;;
- esac
- ((count++))
- echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, Local activity state is: ${local_activity}"
- sleep ${pause}
- done
- if [ ${count} -eq ${loop_guard} ]; then
- echo "$(date) ERROR: ${APP}-Local activity state never become Local Active or Mate Active" >&2
- exit 1
- fi
- # If we need to assert master, then we need to wait for mate to reconcile
- if [ "${resync_step_required}" = "true" ]; then
+ esac
+ ((count++))
+ echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, got ${role_results} for this node's primary or backup role"
+ sleep ${pause}
+ done
+ if [ ${count} -eq ${loop_guard} ]; then
+ echo "$(date) ERROR: ${APP}-Could not determine this node's primary or backup role" >&2
+ exit 1
+ fi
+ echo "$(date) INFO: ${APP}-Management API is up, determined that this node's role is: ${role}"
+ # Determine activity (local or mate active)
count=0
- echo "$(date) INFO: ${APP}-Waiting for mate activity state to be 'Standby'"
while [ ${count} -lt ${loop_guard} ]; do
- online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
- -q "" \
- -v "/rpc-reply/rpc/show/redundancy/virtual-routers/${role}/status/detail/priority-reported-by-mate/summary[text()]")
- mate_activity=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
- run_time=$((${count} * ${pause}))
- case "${mate_activity}" in
- "Standby")
- echo "$(date) INFO: ${APP}-Activity state reported by mate is Standby, after ${run_time} seconds"
- break
- ;;
- esac
- ((count++))
- echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, Mate activity state is: ${mate_activity}, not yet in sync"
- sleep ${pause}
+ online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ -q "" \
+ -v "/rpc-reply/rpc/show/redundancy/virtual-routers/${role}/status/activity[text()]")
+ local_activity=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
+ run_time=$((${count} * ${pause}))
+ case "${local_activity}" in
+ "Local Active")
+ echo "$(date) INFO: ${APP}-Node activity status is Local Active, after ${run_time} seconds"
+ # We should only be here on new cluster create, if not this is an indication of unexpected HA procedures
+ if [[ ! -e ${INITIAL_STARTUP_FILE} ]]; then
+ # Need to issue assert master to get back into sync only one time when the PubSub+ Event Broker starts the first time
+ echo "$(date) INFO: ${APP}-Broker initial startup detected. This node will assert config-sync configuration over its mate"
+ resync_step_required="true"
+ else
+ echo "$(date) WARN: ${APP}-Unexpected state: this is not an initial startup of the broker and this node reports Local Active. Possibly a redeploy?"
+ fi
+ break
+ ;;
+ "Mate Active")
+ echo "$(date) INFO: ${APP}-Node activity status is Mate Active, after ${run_time} seconds"
+ break
+ ;;
+ esac
+ ((count++))
+ echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, node activity state is: ${local_activity}"
+ sleep ${pause}
done
if [ ${count} -eq ${loop_guard} ]; then
- echo "$(date) ERROR: ${APP}-Mate not in sync, never reached Standby" >&2
- exit 1
+ echo "$(date) ERROR: ${APP}-Node activity state never become Local Active or Mate Active" >&2
+ exit 1
fi
- fi # if assert-master
- # Ensure Config-sync connection state is Connected before proceeding
- count=0
- echo "$(date) INFO: ${APP}-Waiting for config-sync connected"
- while [ ${count} -lt ${loop_guard} ]; do
- online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
- -q "" \
- -v "/rpc-reply/rpc/show/config-sync/status/client/connection-state")
- connection_state=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
- run_time=$((${count} * ${pause}))
- case "${connection_state}" in
- "Connected")
- echo "$(date) INFO: ${APP}-Config-sync connection state is Connected, after ${run_time} seconds"
- break
- ;;
- esac
- ((count++))
- echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, Config-sync connection state is: ${connection_state}, not yet in Connected"
- sleep ${pause}
- done
- if [ ${count} -eq ${loop_guard} ]; then
- echo "$(date) ERROR: ${APP}-Config-sync connection state never reached Connected" >&2
- exit 1
- fi
- # Now can issue assert-master command
- if [ "${resync_step_required}" = "true" ]; then
- echo "$(date) INFO: ${APP}-Initiating assert-master"
- /mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
- -q ""
- /mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
- -q "*"
- fi
- # Wait for config-sync results
- count=0
- echo "$(date) INFO: ${APP}-Waiting for config-sync results"
- while [ ${count} -lt ${loop_guard} ]; do
- online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ # If we need to assert leader, then first wait for mate to report Standby state
+ if [ "${resync_step_required}" = "true" ]; then
+ # This branch is AD-active only
+ count=0
+ echo "$(date) INFO: ${APP}-Waiting for mate activity state to be 'Standby'"
+ while [ ${count} -lt ${loop_guard} ]; do
+ online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ -q "" \
+ -v "/rpc-reply/rpc/show/redundancy/virtual-routers/${role}/status/detail/priority-reported-by-mate/summary[text()]")
+ mate_activity=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
+ run_time=$((${count} * ${pause}))
+ case "${mate_activity}" in
+ "Standby")
+ echo "$(date) INFO: ${APP}-Activity state reported by mate is Standby, after ${run_time} seconds"
+ break
+ ;;
+ esac
+ ((count++))
+ echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, Mate activity state is: ${mate_activity}, not yet in sync"
+ sleep ${pause}
+ done
+ if [ ${count} -eq ${loop_guard} ]; then
+ echo "$(date) ERROR: ${APP}-Mate not in sync, never reached Standby" >&2
+ exit 1
+ fi
+ fi # if assert-leader
+ # Ensure Config-sync connection state is Connected for both primary and backup before proceeding
+ count=0
+ echo "$(date) INFO: ${APP}-Waiting for config-sync connected"
+ while [ ${count} -lt ${loop_guard} ]; do
+ online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "" \
- -v "/rpc-reply/rpc/show/config-sync/status/oper-status")
- confsyncstatus_results=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
- run_time=$((${count} * ${pause}))
- case "${confsyncstatus_results}" in
- "Up")
- echo "$(date) INFO: ${APP}-Config-sync is Up, after ${run_time} seconds"
+ -v "/rpc-reply/rpc/show/config-sync/status/client/connection-state")
+ connection_state=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
+ run_time=$((${count} * ${pause}))
+ case "${connection_state}" in
+ "Connected")
+ echo "$(date) INFO: ${APP}-Config-sync connection state is Connected, after ${run_time} seconds"
+ break
+ ;;
+ esac
+ ((count++))
+ echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, Config-sync connection state is: ${connection_state}, not yet in Connected"
+ sleep ${pause}
+ done
+ if [ ${count} -eq ${loop_guard} ]; then
+ echo "$(date) ERROR: ${APP}-Config-sync connection state never reached Connected" >&2
+ exit 1
+ fi
+ # Now can issue assert-leader command
+ if [ "${resync_step_required}" = "true" ]; then
+ # This branch is AD-active only
+ echo "$(date) INFO: ${APP}-Initiating assert-leader"
+ /mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ -q ""
+ /mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ -q "*"
+ fi
+ # Wait for config-sync results
+ count=0
+ echo "$(date) INFO: ${APP}-Waiting for config-sync results"
+ while [ ${count} -lt ${loop_guard} ]; do
+ online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ -q "" \
+ -v "/rpc-reply/rpc/show/config-sync/status/oper-status")
+ confsyncstatus_results=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
+ run_time=$((${count} * ${pause}))
+ case "${confsyncstatus_results}" in
+ "Up")
+ echo "$(date) INFO: ${APP}-Config-sync is Up, after ${run_time} seconds"
+ break
+ ;;
+ esac
+ ((count++))
+ echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, Config-sync is: ${confsyncstatus_results}, not yet Up"
+ # Additional checks to confirm config-sync (even if reported globally as not Up, it may still be up between local primary and backup in a DR setup)
+ echo "$(date) INFO: ${APP}-Checking Config-sync Setup. Starting additional checks to confirm config-sync locally..."
+ messagevpn_result=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ -q "" \
+ -v "count(/rpc-reply/rpc/show/config-sync/database/local/tables/table)")
+ messagevpn_total=$(echo ${messagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
+
+ # Count message_vpns in-sync and compare with total
+ localmessagevpn_result=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ -q "" \
+ -v "count(//table[sync-state='In-Sync'])")
+ local_messagevpn_total_insync=$(echo ${localmessagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
+ if [ "$messagevpn_total" -ne "$local_messagevpn_total_insync" ]; then
+ echo "$(date) INFO: ${APP}-Config-sync state is not in-sync locally."
+ sleep ${pause}
+ continue
+ fi
+
+ echo "$(date) INFO: ${APP}-Checking Config-sync Setup. Remote config-sync state check starting..."
+ vpnremotehamate_result=$(get_router_remote_config_state "name")
+
+ remote_messagevpn_result=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ -q "" \
+ -v "count(//table/source-router[name='$vpnremotehamate_result'])")
+ remote_messagevpn_total=$(echo ${remote_messagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
+
+ # Count message_vpns that are in-sync and not stale, and compare with total
+ remotemessagevpn_result=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
+ -q "" \
+ -v "count(//table/source-router[name='$vpnremotehamate_result' and sync-state='In-Sync' and stale='No'])")
+ remote_messagevpn_total_insync=$(echo ${remotemessagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)
+ if [ "$remote_messagevpn_total" -ne "$remote_messagevpn_total_insync" ]; then
+ echo "$(date) INFO: ${APP}-Config-sync state is not in-sync for remote."
+ sleep ${pause}
+ continue
+ fi
break
- ;;
- esac
- ((count++))
- echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, Config-sync is: ${confsyncstatus_results}, not yet Up"
- sleep ${pause}
- done
- if [ ${count} -eq ${loop_guard} ]; then
- echo "$(date) ERROR: ${APP}-Config-sync never reached state \"Up\"" >&2
- exit 1
- fi
+ done
+ if [ ${count} -eq ${loop_guard} ]; then
+ echo "$(date) ERROR: ${APP}-Config-sync never reached state \"Up\"" >&2
+ exit 1
+ fi
fi # if not monitor
fi
echo "$(date) INFO: ${APP}-PubSub+ Event Broker bringup is complete for this node."
@@ -185,4 +233,4 @@ echo "$(date) INFO: ${APP}-PubSub+ Event Broker bringup is complete for this nod
if [[ ! -e ${INITIAL_STARTUP_FILE} ]]; then
echo "PubSub+ Event Broker initial startup completed on $(date)" > ${INITIAL_STARTUP_FILE}
fi
-exit 0
+exit 0
\ No newline at end of file
diff --git a/deploy/deploy.yaml b/deploy/deploy.yaml
index 35cb87a..806c2e4 100644
--- a/deploy/deploy.yaml
+++ b/deploy/deploy.yaml
@@ -1881,7 +1881,7 @@ spec:
env:
- name: WATCH_NAMESPACE
value: ""
- image: docker.io/solace/pubsubplus-eventbroker-operator:1.0.0
+ image: docker.io/solace/pubsubplus-eventbroker-operator:1.0.1
imagePullPolicy: Always
livenessProbe:
httpGet:
diff --git a/version.go b/version.go
index 41a8082..fec03ba 100644
--- a/version.go
+++ b/version.go
@@ -15,4 +15,4 @@
// limitations under the License.
package main
-const version = "1.0.0"
+const version = "1.0.1"