Update kubevirtci, 1.30 lanes (#3343)
* Bump kvci to latest

Signed-off-by: Alex Kalenyuk <[email protected]>

* Switch testing lanes to 1.30/1.29

Signed-off-by: Alex Kalenyuk <[email protected]>

* Revert "Add W/A for NFS OOMKills"

kubevirt/kubevirt#10822 (comment)

This reverts commit aba7803.

Signed-off-by: Alex Kalenyuk <[email protected]>

---------

Signed-off-by: Alex Kalenyuk <[email protected]>
akalenyu authored Jul 26, 2024
1 parent 8dab107 commit 368ad80
Showing 28 changed files with 407 additions and 33 deletions.
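Each automation lane below pins a kubevirtci provider through TARGET and then delegates to automation/test.sh, so a bumped lane can typically be reproduced outside CI. A hedged sketch (not part of the diff), assuming automation/test.sh maps TARGET onto the kubevirtci provider the way the lane scripts do:

# Hedged sketch: run the hpp lane against the new 1.30 provider from a CDI checkout.
export TARGET=k8s-1.30
export KUBEVIRT_STORAGE=hpp
export CDI_E2E_SKIP=Destructive
automation/test.sh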
2 changes: 1 addition & 1 deletion automation/ceph-wffc.sh
@@ -18,7 +18,7 @@
#

set -ex
-export TARGET=k8s-1.28
+export TARGET=k8s-1.29
#ensure no hard coded cdi cr in tests.
export RANDOM_CR=true
export KUBEVIRT_STORAGE=rook-ceph-default
2 changes: 1 addition & 1 deletion automation/ceph.sh
@@ -18,7 +18,7 @@
#

set -ex
-export TARGET=k8s-1.28
+export TARGET=k8s-1.29
#ensure no hard coded cdi cr in tests.
export RANDOM_CR=true
export KUBEVIRT_STORAGE=rook-ceph-default
2 changes: 1 addition & 1 deletion automation/destructive.sh
@@ -18,7 +18,7 @@
#

set -ex
-export TARGET=k8s-1.29
+export TARGET=k8s-1.30
export KUBEVIRT_STORAGE=hpp
export KUBEVIRT_DEPLOY_PROMETHEUS=true
export CDI_E2E_FOCUS=Destructive
2 changes: 1 addition & 1 deletion automation/istio.sh
@@ -18,7 +18,7 @@
#

set -ex
-export TARGET=k8s-1.29
+export TARGET=k8s-1.30
export KUBEVIRT_STORAGE=hpp
export CDI_E2E_SKIP=Destructive
export KUBEVIRT_DEPLOY_ISTIO=true
2 changes: 1 addition & 1 deletion automation/latest-hpp.sh
@@ -18,7 +18,7 @@
#

set -ex
-export TARGET=k8s-1.29
+export TARGET=k8s-1.30
export KUBEVIRT_STORAGE=hpp
export CDI_E2E_SKIP=Destructive
automation/test.sh
2 changes: 1 addition & 1 deletion automation/nfs.sh
@@ -18,7 +18,7 @@
#

set -ex
-export TARGET=k8s-1.29
+export TARGET=k8s-1.30
export KUBEVIRT_DEPLOY_NFS_CSI=true
export KUBEVIRT_STORAGE=nfs
export CDI_E2E_SKIP=Destructive
2 changes: 1 addition & 1 deletion automation/non-csi-hpp.sh
@@ -18,7 +18,7 @@
#

set -ex
-export TARGET=k8s-1.29
+export TARGET=k8s-1.30
export KUBEVIRT_STORAGE=hpp
export HPP_CLASSIC=true
export CDI_E2E_SKIP=Destructive
2 changes: 1 addition & 1 deletion automation/previous-hpp.sh
@@ -18,7 +18,7 @@
#

set -ex
-export TARGET=k8s-1.28
+export TARGET=k8s-1.29
export KUBEVIRT_STORAGE=hpp
export KUBEVIRT_DEPLOY_PROMETHEUS=true
export CDI_E2E_SKIP=Destructive
2 changes: 1 addition & 1 deletion automation/upgrade.sh
@@ -18,7 +18,7 @@
#

set -ex
-export TARGET=k8s-1.29
+export TARGET=k8s-1.30
export KUBEVIRT_STORAGE=hpp
export MULTI_UPGRADE=true
export CDI_E2E_SKIP=Destructive
6 changes: 0 additions & 6 deletions cluster-sync/sync.sh
@@ -275,12 +275,6 @@ if [ "${KUBEVIRT_PROVIDER}" != "external" ]; then
configure_uploadproxy_override
# Tell prometheus to watch our namespace
configure_prometheus
-if [ "$KUBEVIRT_STORAGE" == "nfs" ]; then
-# nfs writing started to consistently breach the dirty_ratio, causing OOMKills
-# we think the problem sits somewhere around the fsync calls to writeout to nfs being slow/failing
-# https://github.com/kubevirt/containerized-data-importer/pull/3023#issuecomment-1913529241
-_kubectl patch cdi ${CR_NAME} --type merge -p '{"spec":{"config":{"podResourceRequirements": {"limits": {"cpu": "750m", "memory": "1Gi"}, "requests": {"cpu": "100m", "memory": "60M"}}}}}'
-fi
fi

# Grab all the CDI crds so we can check if they are structural schemas
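The revert above drops the CDI pod resource bump that worked around NFS OOMKills (see kubevirt/kubevirt#10822). If the OOMKills resurface on a development cluster, the same patch can still be applied by hand; a hedged sketch, assuming the cluster-sync CR is named cdi:

# Hedged sketch, not part of the diff: reapply the removed resource requirements manually.
kubectl patch cdi cdi --type merge -p \
'{"spec":{"config":{"podResourceRequirements": {"limits": {"cpu": "750m", "memory": "1Gi"}, "requests": {"cpu": "100m", "memory": "60M"}}}}}'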
2 changes: 1 addition & 1 deletion cluster-up/check.sh
@@ -38,7 +38,7 @@ function is_enabled() {
if [ "$1" == "1" ]; then
return 0
fi
-if [ "$1" == "Y" ] || [ "$1" == "y"]; then
+if [ "$1" == "Y" ] || [ "$1" == "y" ]; then
return 0
fi
return 1
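The one-character fix above matters because the [ builtin requires the closing bracket as its own argument; a quick illustration:

# Without the space, bash passes "y]" as a single word and the test always errors out:
[ "y" == "y"]    # bash: [: missing `]'
[ "y" == "y" ]   # exit status 0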
6 changes: 5 additions & 1 deletion cluster-up/cluster/ephemeral-provider-common.sh
@@ -87,7 +87,7 @@ function _registry_volume() {

function _add_common_params() {
# shellcheck disable=SC2155
-local params="--nodes ${KUBEVIRT_NUM_NODES} --memory ${KUBEVIRT_MEMORY_SIZE} --cpu 6 --secondary-nics ${KUBEVIRT_NUM_SECONDARY_NICS} --random-ports --background --prefix $provider_prefix ${KUBEVIRT_PROVIDER} ${KUBEVIRT_PROVIDER_EXTRA_ARGS}"
+local params="--nodes ${KUBEVIRT_NUM_NODES} --memory ${KUBEVIRT_MEMORY_SIZE} --numa ${KUBEVIRT_NUM_NUMA_NODES} --cpu ${KUBEVIRT_NUM_VCPU} --secondary-nics ${KUBEVIRT_NUM_SECONDARY_NICS} --random-ports --background --prefix $provider_prefix ${KUBEVIRT_PROVIDER} ${KUBEVIRT_PROVIDER_EXTRA_ARGS}"

params=" --dns-port $KUBEVIRT_DNS_HOST_PORT $params"

@@ -169,6 +169,10 @@ function _add_common_params() {
params=" --hugepages-2m $KUBEVIRT_HUGEPAGES_2M $params"
fi

+if [ -n "$KUBEVIRT_HUGEPAGES_1G" ]; then
+params=" --hugepages-1g $KUBEVIRT_HUGEPAGES_1G $params"
+fi
+
if [ -n "$KUBEVIRT_REALTIME_SCHEDULER" ]; then
params=" --enable-realtime-scheduler $params"
fi
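A hedged usage sketch for the knobs touched above (variable names are taken from the diff; the make cluster-up entry point is assumed):

# Hedged sketch: shape the ephemeral cluster via the new/changed parameters, then bring it up.
export KUBEVIRT_PROVIDER=k8s-1.30
export KUBEVIRT_NUM_VCPU=8          # replaces the previously hard-coded --cpu 6
export KUBEVIRT_NUM_NUMA_NODES=2    # forwarded as --numa
export KUBEVIRT_HUGEPAGES_1G=4      # forwarded as --hugepages-1g
make cluster-up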
File renamed without changes.
64 changes: 61 additions & 3 deletions cluster-up/cluster/k8s-provider-common.sh
@@ -65,6 +65,15 @@ function deploy_cnao() {
fi
}

+function deploy_kwok() {
+if [[ ${KUBEVIRT_DEPLOY_KWOK} == "true" ]]; then
+$kubectl create -f /opt/kwok/kwok.yaml
+$kubectl create -f /opt/kwok/stage-fast.yaml
+
+$kubectl apply -k /opt/kwok/kubevirt
+fi
+}
+
function create_network_addons_config() {
local nac="/opt/cnao/network-addons-config-example.cr.yaml"
if [ "$KUBEVIRT_WITH_MULTUS_V3" == "true" ]; then
@@ -144,8 +153,10 @@ function deploy_cdi() {
$ssh node01 -- 'sudo sed --regexp-extended -i s/v[0-9]+\.[0-9]+\.[0-9]+\(.*\)?$/'"$KUBEVIRT_CUSTOM_CDI_VERSION"'/g /opt/cdi-*-operator.yaml'
fi

-$kubectl create -f /opt/cdi-*-operator.yaml
-$kubectl create -f /opt/cdi-*-cr.yaml
+LATEST_CDI_OPERATOR=$($ssh node01 -- 'ls -rt /opt/cdi-*-operator.yaml | tail -n 1')
+LATEST_CDI_CR=$($ssh node01 -- 'ls -rt /opt/cdi-*-cr.yaml | tail -n 1')
+$kubectl create -f $LATEST_CDI_OPERATOR
+$kubectl create -f $LATEST_CDI_CR
fi
}

@@ -166,6 +177,50 @@ function configure_prometheus() {
fi
}

+function deploy_aaq() {
+if [ "$KUBEVIRT_DEPLOY_AAQ" == "true" ]; then
+if [ -n "${KUBEVIRT_CUSTOM_AAQ_VERSION}" ]; then
+$ssh node01 -- 'sudo sed --regexp-extended -i s/v[0-9]+\.[0-9]+\.[0-9]+\(.*\)?$/'"$KUBEVIRT_CUSTOM_AAQ_VERSION"'/g /opt/aaq/aaq-*-operator.yaml'
+fi
+
+$kubectl create -f /opt/aaq/aaq-*-operator.yaml
+$kubectl create -f /opt/aaq/aaq-*-cr.yaml
+fi
+}
+
+function wait_for_aaq_ready() {
+if [ "$KUBEVIRT_DEPLOY_AAQ" == "true" ]; then
+while [ "$($kubectl get pods --namespace aaq | grep -c 'aaq-')" -lt 4 ]; do
+$kubectl get pods --namespace aaq
+sleep 10
+done
+$kubectl wait --for=condition=Ready pod --timeout=180s --all --namespace aaq
+fi
+}
+
+function wait_for_kwok_ready() {
+if [ "$KUBEVIRT_DEPLOY_KWOK" == "true" ]; then
+$kubectl wait deployment -n kube-system kwok-controller --for condition=Available --timeout=200s
+fi
+}
+
+function configure_cpu_manager() {
+if [ ${KUBEVIRT_CPU_MANAGER_POLICY} == "static" ]; then
+for node in $($kubectl get nodes -l "node-role.kubernetes.io/worker" --no-headers -o custom-columns=":metadata.name" | tr -d '\r'); do
+# FIXME Replace with kubelet config drop ins once all providers are using k8s >= 1.28
+# https://kubernetes.io/docs/tasks/administer-cluster/kubelet-config-file/#kubelet-conf-d
+$kubectl drain ${node}
+$ssh ${node} -- sudo systemctl stop kubelet
+# FIXME ${ssh} is broken when using HereDocs, fix and replace this mess if possible.
+# https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/#configuration
+$ssh ${node} -- "sudo rm -f /var/lib/kubelet/cpu_manager_state && sudo echo -e 'cpuManagerPolicy: static\nkubeReserved:\n cpu: \"1\"\n memory: \"1Gi\"\ncpuManagerPolicyOptions:\n full-pcpus-only: \"true\"' | sudo tee -a /var/lib/kubelet/config.yaml && sudo sed -i 's/cpuManagerReconcilePeriod\:\ 0s/cpuManagerReconcilePeriod\:\ 5s/g' /var/lib/kubelet/config.yaml"
+$ssh ${node} -- sudo systemctl start kubelet
+$kubectl label --overwrite node/${node} cpumanager=true
+$kubectl uncordon ${node}
+done
+fi
+}
+
function up() {
params=$(_add_common_params)
if echo "$params" | grep -q ERROR; then
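For reference, the ssh one-liner in configure_cpu_manager above rewrites the kubelet configuration on each worker; a hedged sketch of the equivalent steps, spelled out (run on the node while kubelet is stopped):

# Hedged sketch of what the one-liner does, with the same YAML content made readable.
sudo rm -f /var/lib/kubelet/cpu_manager_state
sudo tee -a /var/lib/kubelet/config.yaml <<'EOF'
cpuManagerPolicy: static
kubeReserved:
 cpu: "1"
 memory: "1Gi"
cpuManagerPolicyOptions:
 full-pcpus-only: "true"
EOF
sudo sed -i 's/cpuManagerReconcilePeriod: 0s/cpuManagerReconcilePeriod: 5s/g' /var/lib/kubelet/config.yaml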
@@ -207,13 +262,16 @@

configure_prometheus
configure_memory_overcommitment_behavior
+configure_cpu_manager

deploy_cnao
deploy_multus
deploy_istio
deploy_cdi
+deploy_aaq
+deploy_kwok

-until wait_for_cnao_ready && wait_for_istio_ready && wait_for_cdi_ready && wait_for_multus_ready; do
+until wait_for_cnao_ready && wait_for_istio_ready && wait_for_cdi_ready && wait_for_multus_ready && wait_for_aaq_ready && wait_for_kwok_ready; do
echo "Waiting for cluster components..."
sleep 5
done
3 changes: 0 additions & 3 deletions cluster-up/cluster/kind-1.27-vgpu/provider.sh
@@ -34,9 +34,6 @@ function configure_registry_proxy() {
}

function up() {
-# load the vfio_mdev module
-/usr/sbin/modprobe vfio_mdev
-
# print hardware info for easier debugging based on logs
echo 'Available cards'
${CRI_BIN} run --rm --cap-add=SYS_RAWIO quay.io/phoracek/lspci@sha256:0f3cacf7098202ef284308c64e3fc0ba441871a846022bb87d65ff130c79adb1 sh -c "lspci -k | grep -EA2 'VGA|3D'"
19 changes: 19 additions & 0 deletions cluster-up/cluster/kind-1.30-vgpu/config_vgpu_cluster.sh
@@ -0,0 +1,19 @@
#!/bin/bash

[ $(id -u) -ne 0 ] && echo "FATAL: this script requires sudo privileges" >&2 && exit 1

set -xe

SCRIPT_PATH=$(dirname "$(realpath "$0")")

source ${SCRIPT_PATH}/vgpu-node/node.sh
echo "_kubectl: " ${_kubectl}
echo "KUBEVIRTCI_PATH: " ${KUBEVIRTCI_PATH}
source ${KUBEVIRTCI_PATH}/cluster/kind/common.sh
echo "_kubectl: " ${_kubectl}

nodes=($(_kubectl get nodes -o custom-columns=:.metadata.name --no-headers))
node::remount_sysfs "${nodes[*]}"
node::discover_host_gpus

_kubectl get nodes
47 changes: 47 additions & 0 deletions cluster-up/cluster/kind-1.30-vgpu/conformance.json
@@ -0,0 +1,47 @@
{
"Description": "DEFAULT",
"UUID": "",
"Version": "v0.56.9",
"ResultsDir": "/tmp/sonobuoy/results",
"Resources": null,
"Filters": {
"Namespaces": ".*",
"LabelSelector": ""
},
"Limits": {
"PodLogs": {
"Namespaces": "kube-system",
"SonobuoyNamespace": true,
"FieldSelectors": [],
"LabelSelector": "",
"Previous": false,
"SinceSeconds": null,
"SinceTime": null,
"Timestamps": false,
"TailLines": null,
"LimitBytes": null
}
},
"QPS": 30,
"Burst": 50,
"Server": {
"bindaddress": "0.0.0.0",
"bindport": 8080,
"advertiseaddress": "",
"timeoutseconds": 21600
},
"Plugins": null,
"PluginSearchPath": [
"./plugins.d",
"/etc/sonobuoy/plugins.d",
"~/sonobuoy/plugins.d"
],
"Namespace": "sonobuoy",
"WorkerImage": "sonobuoy/sonobuoy:v0.56.9",
"ImagePullPolicy": "IfNotPresent",
"ImagePullSecrets": "",
"AggregatorPermissions": "clusterAdmin",
"ServiceAccountName": "sonobuoy-serviceaccount",
"ProgressUpdatesPort": "8099",
"SecurityContextMode": "nonroot"
}
1 change: 1 addition & 0 deletions cluster-up/cluster/kind-1.30-vgpu/image
@@ -0,0 +1 @@
kindest/node:v1.30.0@sha256:047357ac0cfea04663786a612ba1eaba9702bef25227a794b52890dd8bcd692e
58 changes: 58 additions & 0 deletions cluster-up/cluster/kind-1.30-vgpu/provider.sh
@@ -0,0 +1,58 @@
#!/usr/bin/env bash

set -e

DEFAULT_CLUSTER_NAME="vgpu"
DEFAULT_HOST_PORT=5000
ALTERNATE_HOST_PORT=5001
export CLUSTER_NAME=${CLUSTER_NAME:-$DEFAULT_CLUSTER_NAME}

if [ $CLUSTER_NAME == $DEFAULT_CLUSTER_NAME ]; then
export HOST_PORT=$DEFAULT_HOST_PORT
else
export HOST_PORT=$ALTERNATE_HOST_PORT
fi

function set_kind_params() {
version=$(cat cluster-up/cluster/$KUBEVIRT_PROVIDER/version)
export KIND_VERSION="${KIND_VERSION:-$version}"

image=$(cat cluster-up/cluster/$KUBEVIRT_PROVIDER/image)
export KIND_NODE_IMAGE="${KIND_NODE_IMAGE:-$image}"
}

function configure_registry_proxy() {
[ "$CI" != "true" ] && return

echo "Configuring cluster nodes to work with CI mirror-proxy..."

local -r ci_proxy_hostname="docker-mirror-proxy.kubevirt-prow.svc"
local -r kind_binary_path="${KUBEVIRTCI_CONFIG_PATH}/$KUBEVIRT_PROVIDER/.kind"
local -r configure_registry_proxy_script="${KUBEVIRTCI_PATH}/cluster/kind/configure-registry-proxy.sh"

KIND_BIN="$kind_binary_path" PROXY_HOSTNAME="$ci_proxy_hostname" $configure_registry_proxy_script
}

function up() {
# print hardware info for easier debugging based on logs
echo 'Available cards'
${CRI_BIN} run --rm --cap-add=SYS_RAWIO quay.io/phoracek/lspci@sha256:0f3cacf7098202ef284308c64e3fc0ba441871a846022bb87d65ff130c79adb1 sh -c "lspci -k | grep -EA2 'VGA|3D'"
echo ""

cp $KIND_MANIFESTS_DIR/kind.yaml ${KUBEVIRTCI_CONFIG_PATH}/$KUBEVIRT_PROVIDER/kind.yaml
_add_extra_mounts
kind_up

configure_registry_proxy

# remove the rancher.io kind default storageClass
_kubectl delete sc standard

${KUBEVIRTCI_PATH}/cluster/$KUBEVIRT_PROVIDER/config_vgpu_cluster.sh

echo "$KUBEVIRT_PROVIDER cluster '$CLUSTER_NAME' is ready"
}

set_kind_params

source ${KUBEVIRTCI_PATH}/cluster/kind/common.sh
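A hedged sketch of bringing up the new provider (the make cluster-up entry point is assumed; a host with mediated-device capable GPUs is required, since up() runs config_vgpu_cluster.sh as shown above):

export KUBEVIRT_PROVIDER=kind-1.30-vgpu
make cluster-up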
1 change: 1 addition & 0 deletions cluster-up/cluster/kind-1.30-vgpu/version
@@ -0,0 +1 @@
0.23.0
32 changes: 32 additions & 0 deletions cluster-up/cluster/kind-1.30-vgpu/vgpu-node/node.sh
@@ -0,0 +1,32 @@
#!/bin/bash

function node::discover_host_gpus() {
local -r gpu_types=( $(find /sys/class/mdev_bus/*/mdev_supported_types) )
[ "${#gpu_types[@]}" -eq 0 ] && echo "FATAL: Could not find available GPUs on host" >&2 && return 1

local gpu_addr
local gpu_addresses=()
for path in "${gpu_types[@]}"; do
gpu_addr="${path#/sys/class/mdev_bus/}"
gpu_addr=${gpu_addr%/*}

gpu_addresses+=( $gpu_addr )
done

echo "${gpu_addresses[@]}"
}

function node::remount_sysfs() {
local -r nodes_array=($1)
local node_exec

for node in "${nodes_array[@]}"; do

# KIND mounts sysfs as read-only by default, remount as R/W
node_exec="${CRI_BIN} exec $node"
$node_exec mount -o remount,rw /sys
$node_exec chmod 666 /dev/vfio/vfio

done
}
