Skip to content

Commit

Permalink
Merge branch 'master' into fix/mr-dashboard-rbac
Browse files Browse the repository at this point in the history
  • Loading branch information
lugi0 authored Nov 14, 2024
2 parents 66da484 + a3240e7 commit dd4521d
Show file tree
Hide file tree
Showing 34 changed files with 575 additions and 346 deletions.
7 changes: 7 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,13 @@ function wait_until_driver_image_is_built() {
}

function create_acceleratorprofile() {
echo "Creating AMD Accelerator Profile"
rhoai_ns=$(oc get namespace redhat-ods-applications --ignore-not-found -oname)
if [ -z $rhoai_ns ];
then
echo "redhat-ods-applications namespace not found. Is RHOAI Installed? NVIDIA Accelerator Profile creation SKIPPED."
return 0
fi
echo "Creating an Accelerator Profile for Dashboard"
oc apply -f - <<EOF
apiVersion: dashboard.opendatahub.io/v1
Expand Down
8 changes: 7 additions & 1 deletion ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,13 @@ function rerun_accelerator_migration() {
# 1. Delete the migration configmap
# 2. Rollout restart dashboard deployment, so the configmap is created again and the migration run again
# Context: https://github.com/opendatahub-io/odh-dashboard/issues/1938

echo "Creating NVIDIA Accelerator Profile via RHOAI Dashboard deployment rollout"
configmap=$(oc get configmap migration-gpu-status --ignore-not-found -n redhat-ods-applications -oname)
if [ -z $configmap ];
then
echo "migration-gpu-status not found. Is RHOAI Installed? NVIDIA Accelerator Profile creation SKIPPED."
return 0
fi
echo "Deleting configmap migration-gpu-status"
if ! oc delete configmap migration-gpu-status -n redhat-ods-applications;
then
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

# Runtime: Pytorch with ROCm and Python 3.9 (UBI 9)
common_base_image = (
"quay.io/modh/runtime-images@sha256:76d2a5d9ca0bb35c4d9962a7ee6798672afd3a5c7d226d87ef65f952e7cb93cf"
"quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10"
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ deploymentSpec:
\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
\ availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:76d2a5d9ca0bb35c4d9962a7ee6798672afd3a5c7d226d87ef65f952e7cb93cf
image: quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10
exec-verify-gpu-availability-2:
container:
args:
Expand Down Expand Up @@ -91,7 +91,7 @@ deploymentSpec:
\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
\ availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:76d2a5d9ca0bb35c4d9962a7ee6798672afd3a5c7d226d87ef65f952e7cb93cf
image: quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10
resources:
accelerator:
count: '1'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

# Runtime: Pytorch with CUDA and Python 3.9 (UBI 9)
common_base_image = (
"quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa"
"quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a"
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ deploymentSpec:
\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
\ availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa
image: quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a
exec-verify-gpu-availability-2:
container:
args:
Expand Down Expand Up @@ -91,7 +91,7 @@ deploymentSpec:
\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
\ availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa
image: quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a
resources:
accelerator:
count: '1'
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# KServe ServingRuntime that runs an NVIDIA Triton Inference Server container.
# NOTE(review): the name says "keras" but no "keras" entry appears in supportedModelFormats below;
# presumably Keras models are served through the tensorflow format - confirm.
apiVersion: serving.kserve.io/v1alpha1
kind: ServingRuntime
metadata:
name: triton-keras-rest
spec:
# Prometheus scrape hints: metrics are expected at /metrics on port 8002.
annotations:
prometheus.kserve.io/path: /metrics
prometheus.kserve.io/port: "8002"
containers:
# tritonserver reads models from /mnt/models and is started with both HTTP (8080) and gRPC (9000)
# enabled, with verbose logging at level 2.
- args:
- tritonserver
- --model-store=/mnt/models
- --grpc-port=9000
- --http-port=8080
- --allow-grpc=true
- --allow-http=true
- --log-verbose=2
image: nvcr.io/nvidia/tritonserver:23.05-py3
name: kserve-container
# Requests and limits are set to the same values (1 CPU, 2Gi memory).
resources:
limits:
cpu: "1"
memory: 2Gi
requests:
cpu: "1"
memory: 2Gi
# NOTE(review): only the HTTP port 8080 is declared here although gRPC is enabled on 9000 in the
# args and grpc-v2 is listed under protocolVersions - confirm this is intended for a REST runtime.
ports:
- containerPort: 8080
protocol: TCP
protocolVersions:
- v2
- grpc-v2
# Model formats this runtime accepts. Every entry except pytorch sets autoSelect: true and priority: 1.
supportedModelFormats:
- autoSelect: true
name: tensorrt
priority: 1
version: "8"
- autoSelect: true
name: tensorflow
priority: 1
version: "1"
- autoSelect: true
name: tensorflow
priority: 1
version: "2"
- autoSelect: true
name: onnx
priority: 1
version: "1"
# pytorch is listed without autoSelect/priority, unlike the other formats.
- name: pytorch
version: "1"
- autoSelect: true
name: triton
priority: 1
version: "2"
118 changes: 118 additions & 0 deletions ods_ci/tests/Resources/Page/Components/Components.resource
Original file line number Diff line number Diff line change
@@ -1,2 +1,120 @@
*** Settings ***
Resource Menu.robot
Resource ../../OCP.resource
Resource ../../ODS.robot


*** Keywords ***
Set DSC Component Removed State And Wait For Completion
[Documentation] Set component management state to 'Removed', and wait for deployment and pod to be removed.
[Arguments] ${component} ${deployment_name} ${label_selector}
# ${component} is passed to Get DSC Component State / Set Component State;
# ${deployment_name} and ${label_selector} are forwarded to Wait For Resources To Be Removed.
# ${DSC_NAME} and ${OPERATOR_NS} are not defined in this file; they must come from the
# imported resources or from the calling suite.

# Read the current state first so the component is only modified when it is not already 'Removed'.
${management_state}= Get DSC Component State ${DSC_NAME} ${component} ${OPERATOR_NS}
IF "${management_state}" != "Removed"
Set Component State ${component} Removed
END

# The wait runs unconditionally, including when no state change was needed.
Wait For Resources To Be Removed ${deployment_name} ${label_selector}

Set DSC Component Managed State And Wait For Completion
[Documentation] Set component management state to 'Managed', and wait for deployment and pod to be available.
[Arguments] ${component} ${deployment_name} ${label_selector}
# ${component} is passed to Get DSC Component State / Set Component State;
# ${deployment_name} and ${label_selector} are forwarded to Wait For Resources To Be Available.
# ${DSC_NAME} and ${OPERATOR_NS} are not defined in this file; they must come from the
# imported resources or from the calling suite.

# Read the current state first so the component is only modified when it is not already 'Managed'.
${management_state}= Get DSC Component State ${DSC_NAME} ${component} ${OPERATOR_NS}
IF "${management_state}" != "Managed"
Set Component State ${component} Managed
END

# The wait runs unconditionally, including when no state change was needed.
Wait For Resources To Be Available ${deployment_name} ${label_selector}

# Once the deployment is available, verify where its image(s) are pulled from.
Check Image Pull Path Is Redhatio ${deployment_name}

Wait For Resources To Be Available
[Documentation] Wait until Deployment and Pod(s) are Available
[Arguments] ${deployment_name} ${label_selector}
# Three checks run in sequence; each is retried until it passes or its timeout expires,
# with a retry interval of 0 sec (no pause between attempts).

# 1. The Deployment exists in ${APPLICATIONS_NS} (up to 5 min).
Wait Until Keyword Succeeds 5 min 0 sec
... Is Resource Present Deployment ${deployment_name} ${APPLICATIONS_NS} ${IS_PRESENT}

# 2. A pod matching the label selector exists (up to 5 min).
# NOTE(review): the meaning of the trailing ${FALSE} depends on Check If Pod Exists,
# which is defined outside this file - confirm.
Wait Until Keyword Succeeds 5 min 0 sec
... Check If Pod Exists ${APPLICATIONS_NS} ${label_selector} ${FALSE}

# 3. No pod matching the label selector reports Ready=False (up to 8 min).
Wait Until Keyword Succeeds 8 min 0 sec
... Is Pod Ready ${label_selector}

Wait For Resources To Be Removed
[Documentation] Wait until Deployment and Pod(s) are Removed
[Arguments] ${deployment_name} ${label_selector}

# Each check is retried for up to 5 min with a retry interval of 0 sec.
# 1. The Deployment is no longer present in ${APPLICATIONS_NS}.
Wait Until Keyword Succeeds 5 min 0 sec
... Is Resource Present Deployment ${deployment_name} ${APPLICATIONS_NS} ${IS_NOT_PRESENT}

# 2. No pod matching the label selector remains.
# NOTE(review): arguments here are (label selector, namespace), the reverse of the order used
# for Check If Pod Exists in Wait For Resources To Be Available - presumably this matches each
# keyword's own signature; verify against their definitions.
Wait Until Keyword Succeeds 5 min 0 sec
... Check If Pod Does Not Exist ${label_selector} ${APPLICATIONS_NS}

Restore DSC Component State
[Documentation] Set component management state to original state, wait for component resources to be available.
[Arguments] ${component} ${deployment_name} ${LABEL_SELECTOR} ${saved_state}
# ${saved_state} is the state to restore; only "Managed" and "Removed" are handled.

# Nothing is done (no state change and no waiting) when the component is already in the saved state.
${current_state}= Get DSC Component State ${DSC_NAME} ${component} ${OPERATOR_NS}
IF "${current_state}" != "${saved_state}"
IF "${saved_state}" == "Managed"
Set DSC Component Managed State And Wait For Completion ${component} ${deployment_name} ${LABEL_SELECTOR}
ELSE IF "${saved_state}" == "Removed"
Set DSC Component Removed State And Wait For Completion ${component} ${deployment_name} ${LABEL_SELECTOR}
ELSE
# Any other saved state fails the test explicitly rather than being silently ignored.
FAIL Component ${component} state "${saved_state}" not supported at this time
END
END

Is Pod Ready
[Documentation] Check If Pod Is In Ready State.
... Note: Will check that all pods with given label-selector are in Ready state.
[Arguments] ${label_selector}
# -A queries every namespace, so pods with this label outside the applications namespace are
# included too. The jsonpath prints the status of the "Ready" condition of each matched pod.
${rc} ${output}= Run And Return Rc And Output
... oc get pod -A -l ${label_selector} -o jsonpath='{..status.conditions[?(@.type=="Ready")].status}'
# Log To Console "Pod Ready Status: ${output}"
Should Be Equal As Integers ${rc} 0
# Passes when no matched pod reports Ready=False.
# NOTE(review): an empty output (no pod matched the selector) or a status other than "False"
# (e.g. "Unknown") appears to pass this check as well - confirm that callers guarantee the
# pod exists before relying on this keyword.
Should Not Contain ${output} False

Get DataScienceCluster Spec
[Documentation] Return the DSC Spec
[Arguments] ${DSC_NAME}
# The ${DSC_NAME} argument names the DataScienceCluster to read; inside this keyword it takes
# precedence over any suite/global variable with the same name.
# The adjacent quoted pieces in the -o value are concatenated by the shell into jsonpath={.spec}.
${rc} ${output}= Run And Return Rc And Output
... oc get DataScienceCluster/${DSC_NAME} -n ${OPERATOR_NS} -o "jsonpath={".spec"}"
# Fail when the oc command did not succeed; otherwise return its raw output.
Should Be Equal As Integers ${rc} 0
RETURN ${output}

Check Image Pull Path Is Redhatio
[Documentation] Check that the Deployment Image Pull Path is registry.redhat.io
[Arguments] ${deployment_name}
# ${REDHATIO_PATH_CHECK_EXCLUSTION_LIST} and ${APPLICATIONS_NAMESPACE} are not defined in this
# file; they must come from the imported resources or from the calling suite.

# Skip pull path check if Deployment is in exclusion list
IF $deployment_name in @{REDHATIO_PATH_CHECK_EXCLUSTION_LIST}
Log To Console Skip image pull path check for Deployment ${deployment_name}
RETURN
END

# {..image} collects every "image" field found in the Deployment object.
# NOTE(review): this keyword reads ${APPLICATIONS_NAMESPACE}, while the wait keywords in this
# file use ${APPLICATIONS_NS} - presumably both hold the same namespace; confirm.
${rc} ${image}= Run And Return Rc And Output
... oc get deployment/${deployment_name} -n ${APPLICATIONS_NAMESPACE} -o jsonpath="{..image}"
Should Be Equal As Integers ${rc} 0 msg=${image}

Log To Console Check deployment ${deployment_name} pull path for image ${image}
# Substring test on the whole jsonpath output.
# NOTE(review): if the output lists several images, a single registry.redhat.io match is
# enough to pass - confirm this is the intended strictness.
IF "registry.redhat.io" in $image
Log To Console Deployment ${deployment_name} image contains pull path registry.redhat.io
ELSE
Fail Deployment image ${deployment_name} does not contain pull path registry.redhat.io
END

Check Model Registry Namespace
[Documentation] Check that DSC modelregistry.registriesNamespace is correct for ODH/RHOAI
... Validate that namespace exists.
# ${DSC_NAME}, ${OPERATOR_NS} and ${MODEL_REGISTRY_NAMESPACE} are not defined in this file;
# they must come from the imported resources or from the calling suite.

# Step 1: read spec.components.modelregistry.registriesNamespace from the DataScienceCluster.
${rc} ${namespace}= Run And Return Rc And Output
... oc get DataScienceCluster/${DSC_NAME} -n ${OPERATOR_NS} -o "jsonpath={".spec.components.modelregistry.registriesNamespace"}"
Should Be Equal As Integers ${rc} 0 msg=${namespace}

# Step 2: the configured value must equal the expected namespace.
Should Be Equal ${namespace} ${MODEL_REGISTRY_NAMESPACE} msg=Model Registry Namespace: Actual "${namespace}" Expected: "${MODEL_REGISTRY_NAMESPACE}"

# Step 3: the namespace must exist on the cluster (oc exits non-zero otherwise).
# NOTE(review): -A looks redundant here since namespaces are cluster-scoped - confirm.
${rc} ${output}= Run And Return Rc And Output
... oc get namespace -A ${MODEL_REGISTRY_NAMESPACE}
Should Be Equal As Integers ${rc} 0 msg=${output}
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@ Library Process


*** Variables ***
${VIRTUAL_ENV_NAME} venv3.11
${CODEFLARE-SDK-RELEASE-TAG} v0.22.0
${CODEFLARE-SDK_DIR} codeflare-sdk
${CODEFLARE-SDK_REPO_URL} %{CODEFLARE-SDK_REPO_URL=https://github.com/project-codeflare/codeflare-sdk.git}
${DISTRIBUTED_WORKLOADS_RELEASE_ASSETS} https://github.com/opendatahub-io/distributed-workloads/releases/latest/download
${RAY_IMAGE} quay.io/modh/ray@sha256:db667df1bc437a7b0965e8031e905d3ab04b86390d764d120e05ea5a5c18d1b4
${RAY_IMAGE_3.11} quay.io/modh/ray@sha256:db667df1bc437a7b0965e8031e905d3ab04b86390d764d120e05ea5a5c18d1b4
${RAY_IMAGE_3.9} quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06
${FMS_HF_TUNING_IMAGE} quay.io/modh/fms-hf-tuning@sha256:73bcd66500b8637a9db1339f64c3217212ef74700a22a790f78e9a1f26b8b71a
${NOTEBOOK_IMAGE} quay.io/modh/odh-generic-data-science-notebook@sha256:16fc91984a9baf7765a6362493906a2c726b9906031711e2e55d686d296e6b3a
${NOTEBOOK_USER_NAME} ${TEST_USER_3.USERNAME}
Expand Down Expand Up @@ -41,21 +41,13 @@ Clone Git Repository
Prepare Codeflare-SDK Test Setup
[Documentation] Prepare codeflare-sdk tests by cloning codeflare-sdk repo and python virtual environment

Clone Git Repository ${CODEFLARE-SDK_REPO_URL} ${CODEFLARE-SDK-RELEASE-TAG} ${CODEFLARE-SDK_DIR}

${result} = Run Process virtualenv -p python3.11 ${VIRTUAL_ENV_NAME}
... shell=true stderr=STDOUT
Log To Console ${result.stdout}
IF ${result.rc} != 0
FAIL Unable to setup Python virtual environment
END

Run Codeflare-SDK Test
[Documentation] Run codeflare-sdk Test
[Arguments] ${TEST_TYPE} ${TEST_NAME}
[Arguments] ${TEST_TYPE} ${TEST_NAME} ${PYTHON_VERSION} ${RAY_IMAGE}
Log To Console "Running codeflare-sdk test: ${TEST_NAME}"
${result} = Run Process source ${VIRTUAL_ENV_NAME}/bin/activate && cd ${CODEFLARE-SDK_DIR} && poetry env use 3.11 && poetry install --with test,docs && poetry run pytest -v -s ./tests/${TEST_TYPE}/${TEST_NAME} --timeout\=300 && deactivate
${result} = Run Process cd ${CODEFLARE-SDK_DIR} && poetry env use ${PYTHON_VERSION} && poetry install --with test,docs && poetry run pytest -v -s ./tests/${TEST_TYPE}/${TEST_NAME} --timeout\=420
... env:RAY_IMAGE=${RAY_IMAGE}
... env:AWS_DEFAULT_ENDPOINT=${AWS_DEFAULT_ENDPOINT}
... env:AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
Expand All @@ -82,8 +74,6 @@ Codeflare Upgrade Tests Teardown

Cleanup Codeflare-SDK Setup
[Documentation] cleanup codeflare repository cloned and python setup
Log To Console "Removing Python virtual environment directory ${VIRTUAL_ENV_NAME}"
Remove Directory ${VIRTUAL_ENV_NAME} recursive=True
Log To Console "Removing directory ${CODEFLARE-SDK_DIR}"
Remove Directory ${CODEFLARE-SDK_DIR} recursive=True
Expand Down
12 changes: 7 additions & 5 deletions ods_ci/tests/Resources/Page/ModelRegistry/ModelRegistry.resource
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ Prepare Model Registry Test Setup
Get Cluster Domain And Token
Run Update Notebook Script
Generate ModelRegistry Certificates
Apply Db Config Samples namespace=${NAMESPACE_MODEL_REGISTRY}
Apply Db Config Samples namespace=${NAMESPACE_MODEL_REGISTRY} samples=${MODEL_REGISTRY_DB_SAMPLES}
Create Model Registry Secrets
Fetch CA Certificate If RHODS Is Self-Managed

Expand Down Expand Up @@ -151,10 +151,11 @@ Create Generic Secret

Apply Db Config Samples
[Documentation] Applying the db config samples from https://github.com/opendatahub-io/model-registry-operator
[Arguments] ${namespace}
[Arguments] ${namespace} ${samples}
${rc} ${out}= Run And Return Rc And Output
... oc apply -k ${MODEL_REGISTRY_DB_SAMPLES} -n ${namespace}
... oc apply -k ${samples} -n ${namespace}
Should Be Equal As Integers ${rc} 0 msg=${out}
Wait For Model Registry Containers To Be Ready

Jupyter Notebook Can Query Model Registry
[Documentation] Runs the test workbench and check if there was no error during execution
Expand All @@ -171,12 +172,13 @@ Jupyter Notebook Can Query Model Registry

Wait For Model Registry Containers To Be Ready
[Documentation] Wait for model-registry-deployment to be ready
${NAMESPACE_MODEL_REGISTRY}= Get Model Registry Namespace From DSC
${result}= Run Process
... oc wait --for\=condition\=Available --timeout\=5m -n ${PRJ_TITLE} deployment/model-registry-db
... oc wait --for\=condition\=Available --timeout\=5m -n ${NAMESPACE_MODEL_REGISTRY} deployment/model-registry-db # robocop: disable:line-too-long
... shell=true stderr=STDOUT
Log To Console ${result.stdout}
${result}= Run Process
... oc wait --for\=condition\=Available --timeout\=5m -n ${PRJ_TITLE} deployment/model-registry-deployment
... oc wait --for\=condition\=Available --timeout\=5m -n ${NAMESPACE_MODEL_REGISTRY} deployment/modelregistry-sample # robocop: disable:line-too-long
... shell=true stderr=STDOUT
Log To Console ${result.stdout}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -363,18 +363,15 @@ Add And Run JupyterLab Code Cell 5 In Active Notebook
# This keyword was copied and amended from JupyterLibrary resources - Notebook.Add And Run JupyterLab Code Cell.

${add icon} = Get JupyterLab Icon XPath Custom add

${nb} = Get WebElement xpath://div${JLAB XP NB FRAG}\[${n}]
${nbid} = Get Element Attribute ${nb} id

${active-nb-tab} = Get WebElement xpath:${JL_TABBAR_SELECTED_XPATH}
${tab-id} = Get Element Attribute ${active-nb-tab} id

Click Element xpath://div[@aria-labelledby="${tab-id}"]/div[1]//${add icon}
Click Element xpath://div[@aria-labelledby="${tab-id}"]//*[@data-jp-item-name="insert"]
Sleep 0.1s
Click Element xpath://div[@aria-labelledby="${tab-id}"]//div[contains(concat(' ',normalize-space(@class),' '),' jp-mod-selected ')]
Set CodeMirror Value \#${nbid}${JLAB CSS ACTIVE INPUT} @{code}
Run Current JupyterLab Code Cell 5 ${tab-id}
Press Keys None @{code}
Click Element xpath://div[@aria-labelledby="${tab-id}"]//*[@data-jp-item-name="run"]
Click Element xpath://div[@aria-labelledby="${tab-id}"]//div[contains(concat(' ',normalize-space(@class),' '),' jp-mod-selected ')]

Add And Run JupyterLab Code Cell 6 In Active Notebook
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ Fill In Run Creation Form # robocop: disable
[Arguments] ${name} ${pipeline_name}=${NONE} ${from_actions}=${TRUE} ${run_type}=Immediate
... ${trigger_type}=Periodic ${start_date}=${NONE} ${start_time}=${NONE}
... ${end_date}=${NONE} ${end_time}=${NONE} ${cron_expr}=${NONE}&{model_param}
Wait Until Page Contains Element ${PIPELINE_RUN_CREATE_BTN_XP}
Element Should Be Disabled ${PIPELINE_RUN_CREATE_BTN_XP}
Input Text ${PIPELINE_RUN_NAME_INPUT_XP} ${name}
Input Text ${PIPELINE_RUN_DESC_INPUT_XP} ${name}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,6 @@ Create Pipeline Run
... ${end_time}=${NONE} ${cron_expr}=${NONE} ${press_cancel}=${FALSE} &{model_param}
Projects.Move To Tab Pipelines
Pipelines.Click Action From Pipeline Actions Menu pipeline_name=${pipeline_name} action=Create run
Wait For RHODS Dashboard To Load expected_page=Create run
... wait_for_cards=${FALSE}
Fill In Run Creation Form name=${name} pipeline_name=${pipeline_name}
... run_type=${run_type} trigger_type=Periodic start_date=${start_date}
... start_time=${start_time} end_date=${end_date} end_time=${end_time}
Expand Down
Loading

0 comments on commit dd4521d

Please sign in to comment.