Merge branch 'master' into fix_gpupod_check

bdattoma authored Oct 2, 2024
2 parents e4c68ba + e8f3883 commit 563a939
Showing 13 changed files with 386 additions and 31 deletions.
6 changes: 6 additions & 0 deletions ods_ci/libs/DataSciencePipelinesKfp.py
@@ -282,6 +282,12 @@ def wait_for_run_completion(self, run_id, timeout=160, sleep_duration=5):
response = self.client.wait_for_run_completion(run_id=run_id, timeout=timeout, sleep_duration=sleep_duration)
return response.state

@keyword
def get_run_status(self, run_id):
"""###Gets run status"""
response = self.client.get_run(run_id)
return response.state
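
A minimal Python sketch of how this keyword maps onto the kfp SDK, assuming an already-authenticated Client (the host and run id below are illustrative):

# Hypothetical direct use of kfp.Client.get_run, mirroring get_run_status above.
from kfp import Client

client = Client(host="https://ds-pipeline-dspa.example.com")  # illustrative endpoint
run = client.get_run(run_id="4c8f25a1-1234-5678-9abc-def012345678")  # illustrative id
print(run.state)  # e.g. "RUNNING", "SUCCEEDED", "FAILED"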

@keyword
def check_run_status(self, run_id, timeout=160):
"""Waits for a run to complete"""
ods_ci/tests/Resources/CLI/DataSciencePipelines/DataSciencePipelinesBackend.resource
@@ -1,10 +1,11 @@
*** Settings ***
Documentation Collection of keywords to interact with Data Science Pipelines via CLI
Library OperatingSystem
Library String
Library ../../../../libs/DataSciencePipelinesAPI.py
Library ../../../../libs/DataSciencePipelinesKfp.py
Resource ../../../Resources/OCP.resource
Resource ../../../Resources/Common.robot


*** Variables ***
${DSPA_PATH}= tests/Resources/Files/pipeline-samples/v2/dspa
@@ -42,10 +43,16 @@ Create Pipeline Server
... -p OBJECT_STORAGE_REGION=${object_storage_region}
... -p OBJECT_STORAGE_BUCKET=${object_storage_bucket_name}

Run oc process -f ${DSPA_PATH}/dspa-template.yaml ${template_parameters} | oc apply -n ${namespace} -f -
Run And Verify Command oc process -f ${DSPA_PATH}/dspa-template.yaml ${template_parameters} | oc apply -n ${namespace} -f - # robocop: off=line-too-long

IF ${configure_pip_index} Create Pipelines ConfigMap With Custom Pip Index Url And Trusted Host ${namespace}

Get DSP Version
[Documentation] Returns dspVersion of the DSPA deployed in ${namespace}
[Arguments] ${namespace}
${dsp_version}= Run And Verify Command oc get datasciencepipelinesapplications -n ${namespace} -o json | jq -r '.items[0].spec.dspVersion' # robocop: off=line-too-long
RETURN ${dsp_version}
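
The jq pipeline above can be reproduced without jq; a minimal Python sketch, assuming oc is already logged in to the cluster (the function name is illustrative):

# Hypothetical equivalent of the Get DSP Version keyword, parsing oc JSON output directly.
import json
import subprocess

def get_dsp_version(namespace: str) -> str:
    out = subprocess.run(
        ["oc", "get", "datasciencepipelinesapplications", "-n", namespace, "-o", "json"],
        check=True, capture_output=True, text=True,
    ).stdout
    return json.loads(out)["items"][0]["spec"]["dspVersion"]  # e.g. "v2"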

# robocop: disable:line-too-long
Create PipelineServer Using Custom DSPA
[Documentation]    Installs and verifies that the DataSciencePipelinesApplication CRD is installed and working
@@ -55,21 +62,25 @@ Create PipelineServer Using Custom DSPA
[Arguments] ${namespace} ${dspa_file}=data-science-pipelines-sample.yaml
... ${assert_install}=${TRUE} ${configure_pip_index}=${TRUE}

Run oc apply -f "${DSPA_PATH}/${dspa_file}" -n ${namespace}
Run And Verify Command oc apply -f "${DSPA_PATH}/${dspa_file}" -n ${namespace}
IF ${assert_install}==True
${generation_value} Run oc get datasciencepipelinesapplications -n ${namespace} -o json | jq '.items[0].metadata.generation'
${generation_value}= Run And Verify Command oc get datasciencepipelinesapplications -n ${namespace} -o json | jq '.items[0].metadata.generation' # robocop: off=line-too-long
Should Be True ${generation_value} == 2 DataSciencePipelinesApplication created
END

IF ${configure_pip_index} Create Pipelines ConfigMap With Custom Pip Index Url And Trusted Host ${namespace}

Verify Pipeline Server Deployments # robocop: disable
[Documentation] Verifies the correct deployment of DS Pipelines in the rhods namespace
[Documentation]    Verifies the correct deployment of a DSPv2 DataSciencePipelinesApplication
[Arguments] ${namespace}

@{all_pods}= Oc Get kind=Pod namespace=${namespace}
... label_selector=component=data-science-pipelines
Run Keyword And Continue On Failure Length Should Be ${all_pods} 7

${pods_count}= Get Length ${all_pods}
IF ${pods_count} < 7
Fail DSPA requires at least 7 pods running in the namespace
END

@{pipeline_api_server}= Oc Get kind=Pod namespace=${namespace}
... label_selector=app=ds-pipeline-dspa
@@ -106,12 +117,52 @@ Verify Pipeline Server Deployments    # robocop: disable
${containerNames}= Create List mariadb
Verify Deployment ${mariadb} 1 1 ${containerNames}
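
A rough Python equivalent of the pod-count check above, assuming the kubernetes client and a local kubeconfig (the function name is illustrative):

# Sketch: count DSP pods by label selector, as the keyword above does with Oc Get.
from kubernetes import client, config

def dsp_pod_count(namespace: str) -> int:
    config.load_kube_config()  # or config.load_incluster_config() when running in a pod
    pods = client.CoreV1Api().list_namespaced_pod(
        namespace, label_selector="component=data-science-pipelines"
    )
    return len(pods.items)

# The DSPv2 check then reduces to:
# assert dsp_pod_count("my-project") >= 7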

Verify DSPv1 Pipeline Server Deployments
[Documentation]    Verifies the correct deployment of a DSPv1 DataSciencePipelinesApplication
[Arguments] ${namespace}

@{all_pods}= Oc Get kind=Pod namespace=${namespace}
... label_selector=component=data-science-pipelines

${pods_count}= Get Length ${all_pods}
IF ${pods_count} < 4
Fail DSPA requires at least 4 pods running in the namespace
END

@{pipeline_api_server}= Oc Get kind=Pod namespace=${namespace}
... label_selector=app=ds-pipeline-dspa
${containerNames}= Create List oauth-proxy ds-pipeline-api-server
Verify Deployment ${pipeline_api_server} 1 2 ${containerNames}

@{pipeline_persistenceagent}= Oc Get kind=Pod namespace=${namespace}
... label_selector=app=ds-pipeline-persistenceagent-dspa
${containerNames}= Create List ds-pipeline-persistenceagent
Verify Deployment ${pipeline_persistenceagent} 1 1 ${containerNames}

@{pipeline_scheduledworkflow}= Oc Get kind=Pod namespace=${namespace}
... label_selector=app=ds-pipeline-scheduledworkflow-dspa
${containerNames}= Create List ds-pipeline-scheduledworkflow
Verify Deployment ${pipeline_scheduledworkflow} 1 1 ${containerNames}

@{mariadb}= Oc Get kind=Pod namespace=${namespace}
... label_selector=app=mariadb-dspa
${containerNames}= Create List mariadb
Verify Deployment ${mariadb} 1 1 ${containerNames}

Wait Until Pipeline Server Is Deployed
[Documentation] Waits until all the expected pods of the pipeline server
... are running
[Arguments] ${namespace}
Wait Until Keyword Succeeds 10 times 10s
... Verify Pipeline Server Deployments namespace=${namespace}

${dspVersion}= Get DSP Version ${namespace}
IF "${dspVersion}" == "v2"
Wait Until Keyword Succeeds 10 times 10s
... Verify Pipeline Server Deployments namespace=${namespace}
ELSE
Wait Until Keyword Succeeds 10 times 10s
... Verify DSPv1 Pipeline Server Deployments namespace=${namespace}
END
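
Wait Until Keyword Succeeds    10 times    10s retries the verification up to 10 times with a 10-second pause between attempts; the same polling pattern in plain Python (names are illustrative):

# Illustrative retry loop, equivalent in spirit to Wait Until Keyword Succeeds.
import time

def wait_until(check, retries: int = 10, delay_secs: float = 10.0):
    for attempt in range(retries):
        try:
            return check()
        except AssertionError:
            if attempt == retries - 1:
                raise
            time.sleep(delay_secs)

# e.g. wait_until(lambda: verify_dsp_v2_deployments("my-project"))  # hypothetical check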


Wait Until Pipeline Server Is Deleted
[Documentation] Waits until all pipeline server pods are deleted
@@ -128,13 +179,13 @@ Create Pipelines ConfigMap With Custom Pip Index Url And Trusted Host
[Documentation] Creates a Configmap (ds-pipeline-custom-env-vars) in the project,
... storing the values for pip_index_url and pip_trusted_host
[Arguments] ${namespace}
Run oc create configmap ds-pipeline-custom-env-vars -n ${namespace} --from-literal=pip_index_url=${PIP_INDEX_URL} --from-literal=pip_trusted_host=${PIP_TRUSTED_HOST}
Run And Verify Command oc create configmap ds-pipeline-custom-env-vars -n ${namespace} --from-literal=pip_index_url=${PIP_INDEX_URL} --from-literal=pip_trusted_host=${PIP_TRUSTED_HOST} # robocop: off=line-too-long

Create Secret With Pipelines Object Storage Information
[Documentation] Creates a secret needed to create a pipeline server containing the object storage credentials
[Arguments] ${namespace} ${object_storage_access_key} ${object_storage_secret_key}
Run oc create secret generic dashboard-dspa-secret -n ${namespace} --from-literal=AWS_ACCESS_KEY_ID=${object_storage_access_key} --from-literal=AWS_SECRET_ACCESS_KEY=${object_storage_secret_key}
Run oc label secret dashboard-dspa-secret -n ${namespace} opendatahub.io/dashboard=true
Run And Verify Command oc create secret generic dashboard-dspa-secret -n ${namespace} --from-literal=AWS_ACCESS_KEY_ID=${object_storage_access_key} --from-literal=AWS_SECRET_ACCESS_KEY=${object_storage_secret_key} # robocop: off=line-too-long
Run And Verify Command oc label secret dashboard-dspa-secret -n ${namespace} opendatahub.io/dashboard=true


Import Pipeline And Create Run
@@ -164,6 +215,20 @@ Import Pipeline And Create Run

RETURN ${pipeline_id} ${pipeline_version_id} ${pipeline_run_id} ${experiment_id}

Verify Run Status
[Documentation] Verifies pipeline run status matches ${pipeline_run_expected_status}
[Arguments] ${namespace} ${username} ${password}
...    ${pipeline_run_id}    ${pipeline_run_expected_status}=SUCCEEDED

DataSciencePipelinesKfp.Setup Client user=${username} pwd=${password} project=${namespace}

${pipeline_run_status}= DataSciencePipelinesKfp.Get Run Status run_id=${pipeline_run_id}
IF "${pipeline_run_status}" != "${pipeline_run_expected_status}"
${error_msg}= Catenate Expected pipeline status was ${pipeline_run_expected_status} but pipeline run
... has status=${pipeline_run_status}
Fail ${error_msg}
END

Wait For Run Completion And Verify Status
[Documentation]    Waits for the pipeline run to complete and verifies its final status
[Arguments] ${namespace} ${username} ${password}
@@ -175,11 +240,8 @@ Wait For Run Completion And Verify Status
${pipeline_run_status}= DataSciencePipelinesKfp.Wait For Run Completion run_id=${pipeline_run_id}
... timeout=${pipeline_run_timeout} sleep_duration=${5}

IF "${pipeline_run_status}" != "${pipeline_run_expected_status}"
${error_msg}= Catenate Expected pipeline status was ${pipeline_run_expected_status} but pipeline run
... finished with status=${pipeline_run_status}
Fail ${error_msg}
END
Verify Run Status namespace=${namespace} username=${username} password=${password}
... pipeline_run_id=${pipeline_run_id} pipeline_run_expected_status=${pipeline_run_expected_status}

RETURN ${pipeline_run_status}

@@ -207,5 +269,3 @@ Delete Pipeline And Related Resources
END

DataSciencePipelinesKfp.Delete Pipeline ${pipeline_id}


@@ -0,0 +1,64 @@
*** Settings ***
Documentation Upgrade Testing Keywords
Resource DataSciencePipelinesBackend.resource
Resource ../../../Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Projects.resource


*** Variables ***
${PROJECT}= dsp-upgrade-testing
${PIPELINE_LONGRUNNING_FILEPATH}= tests/Resources/Files/pipeline-samples/v2/pip_index_url/take_nap_pipeline_root_compiled.yaml # robocop: disable:line-too-long


*** Keywords ***
Setup Environment For Upgrade Testing
[Documentation]    Creates project ${PROJECT} and sets up the resources to test during upgrade:
... - Creates a pipeline server
... - Starts a pipeline that will run for 1h
Create Project And Configure Pipeline Server ${PROJECT}
Start Long Running Pipeline ${PROJECT}

Verify Resources After Upgrade
[Documentation]    Verifies the status of the resources created in ${PROJECT} after the upgrade.
...    Deletes ${PROJECT} if all verifications pass (leaving it in place for debugging purposes if not)
DataSciencePipelinesBackend.Wait Until Pipeline Server Is Deployed namespace=${PROJECT}

Verify Run Status
... namespace=${PROJECT} username=${TEST_USER.USERNAME} password=${TEST_USER.PASSWORD}
... pipeline_run_id=${DSP_LONGRUNNING_PIPELINE_RUN_ID} pipeline_run_expected_status=RUNNING

Projects.Delete Project Via CLI By Display Name ${PROJECT}

Create Project And Configure Pipeline Server
[Documentation]    Creates a data science pipelines project ${namespace} (deleting the existing one if needed),
... configures a pipeline server using the default configuration and waits until the server is running
[Arguments] ${namespace}
Projects.Delete Project Via CLI By Display Name ${namespace}
Projects.Create Data Science Project From CLI ${namespace}
DataSciencePipelinesBackend.Create Pipeline Server namespace=${namespace}
... object_storage_access_key=${S3.AWS_ACCESS_KEY_ID}
... object_storage_secret_key=${S3.AWS_SECRET_ACCESS_KEY}
... object_storage_endpoint=${S3.BUCKET_2.ENDPOINT}
... object_storage_region=${S3.BUCKET_2.REGION}
... object_storage_bucket_name=${S3.BUCKET_2.NAME}
... dsp_version=v2
DataSciencePipelinesBackend.Wait Until Pipeline Server Is Deployed namespace=${namespace}

Start Long Running Pipeline
[Documentation] Imports and creates a run of a long running pipeline
[Arguments] ${namespace}

${pipeline_run_params}= Create Dictionary naptime_secs=${3600}

# robocop:off=unused-variable
${pipeline_id} ${pipeline_version_id} ${pipeline_run_id} ${experiment_id}=
... DataSciencePipelinesBackend.Import Pipeline And Create Run
... namespace=${namespace} username=${TEST_USER.USERNAME} password=${TEST_USER.PASSWORD}
... pipeline_name=take-nap
... pipeline_description=A pipeline that runs for 1h and prints a message
... pipeline_package_path=${PIPELINE_LONGRUNNING_FILEPATH}
... pipeline_run_name=take-nap-run
... pipeline_run_params=${pipeline_run_params}

Set Global Variable ${DSP_LONGRUNNING_PIPELINE_RUN_ID} ${pipeline_run_id}
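
For reference, a hedged Python sketch of what Import Pipeline And Create Run ultimately does through the kfp SDK (the endpoint is illustrative; the Robot keyword additionally registers the pipeline, a version, and an experiment):

# Sketch: start the long-running pipeline from its compiled package via kfp.
from kfp import Client

client = Client(host="https://ds-pipeline-dspa.example.com")  # illustrative endpoint
result = client.create_run_from_pipeline_package(
    "tests/Resources/Files/pipeline-samples/v2/pip_index_url/take_nap_pipeline_root_compiled.yaml",
    arguments={"naptime_secs": 3600},
    run_name="take-nap-run",
)
print(result.run_id)  # what the keyword stores in ${DSP_LONGRUNNING_PIPELINE_RUN_ID}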
@@ -1,4 +1,4 @@
AUTH_PROVIDER=opendatahub-auth-provider
AUTH_PROVIDER=
DOMAIN=
ISTIO_INGRESS=ingressgateway
REST_CREDENTIAL_NAME=modelregistry-sample-rest-credential
8 changes: 4 additions & 4 deletions ods_ci/tests/Resources/Common.robot
@@ -437,13 +437,13 @@ Extract URLs From Text
RETURN ${urls}

Run And Verify Command
[Documentation] Run and verify shell command
[Arguments] ${command} ${print_to_log}=${TRUE}
[Documentation] Run and verify shell command
[Arguments] ${command} ${print_to_log}=${TRUE} ${expected_rc}=${0}
${result}= Run Process ${command} shell=yes stderr=STDOUT
IF ${print_to_log} Log ${result.stdout} console=True
Should Be True ${result.rc} == 0
Should Be True ${result.rc} == ${expected_rc}
RETURN ${result.stdout}
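
A minimal Python analogue of the updated keyword, showing the configurable return-code assertion:

# Sketch: run a shell command and assert on its return code, like Run And Verify Command.
import subprocess

def run_and_verify(command: str, expected_rc: int = 0, print_to_log: bool = True) -> str:
    result = subprocess.run(command, shell=True, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT, text=True)
    if print_to_log:
        print(result.stdout)
    assert result.returncode == expected_rc, f"rc={result.returncode}, expected {expected_rc}"
    return result.stdout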

Run And Watch Command
[Documentation] Run any shell command (including args) with optional:
... Timeout: 10 minutes by default.
@@ -22,7 +22,12 @@ def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_li
kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule")


@dsl.component(base_image=common_base_image, packages_to_install=["torch"], pip_index_urls=["$PIP_INDEX_URL"])
@dsl.component(
base_image=common_base_image,
packages_to_install=["torch"],
pip_index_urls=["$PIP_INDEX_URL"],
pip_trusted_hosts=["$PIP_TRUSTED_HOST"],
)
def verify_gpu_availability(gpu_toleration_added: bool):
import torch # noqa: PLC0415

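The body of verify_gpu_availability is collapsed in this view. As a hedged illustration only (not necessarily the file's exact code), a typical torch-based availability check looks like:

# Illustrative GPU check; the component's real body is truncated above.
import torch

def check_gpu(gpu_toleration_added: bool):
    cuda_available = torch.cuda.is_available()
    device_count = torch.cuda.device_count()
    print(f"CUDA available: {cuda_available}, devices: {device_count}")
    if gpu_toleration_added:
        assert cuda_available and device_count > 0
    else:
        assert not cuda_available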
@@ -29,10 +29,10 @@ deploymentSpec:
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location --index-url $PIP_INDEX_URL\
\ --trusted-host $PIP_INDEX_URL 'kfp==2.9.0' '--no-deps' 'typing-extensions>=3.7.4,<5;\
\ --trusted-host $PIP_TRUSTED_HOST 'kfp==2.9.0' '--no-deps' 'typing-extensions>=3.7.4,<5;\
\ python_version<\"3.9\"' && python3 -m pip install --quiet --no-warn-script-location\
\ --index-url $PIP_INDEX_URL --trusted-host $PIP_INDEX_URL 'torch' && \"\
$0\" \"$@\"\n"
\ --index-url $PIP_INDEX_URL --trusted-host $PIP_TRUSTED_HOST 'torch' &&\
\ \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
@@ -72,10 +72,10 @@ deploymentSpec:
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location --index-url $PIP_INDEX_URL\
\ --trusted-host $PIP_INDEX_URL 'kfp==2.9.0' '--no-deps' 'typing-extensions>=3.7.4,<5;\
\ --trusted-host $PIP_TRUSTED_HOST 'kfp==2.9.0' '--no-deps' 'typing-extensions>=3.7.4,<5;\
\ python_version<\"3.9\"' && python3 -m pip install --quiet --no-warn-script-location\
\ --index-url $PIP_INDEX_URL --trusted-host $PIP_INDEX_URL 'torch' && \"\
$0\" \"$@\"\n"
\ --index-url $PIP_INDEX_URL --trusted-host $PIP_TRUSTED_HOST 'torch' &&\
\ \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
@@ -0,0 +1,42 @@
from kfp import compiler, dsl, kubernetes
from kfp.dsl import PipelineTask

common_base_image = (
"registry.redhat.io/ubi8/python-39@sha256:3523b184212e1f2243e76d8094ab52b01ea3015471471290d011625e1763af61"
)


def add_pip_index_configuration(task: PipelineTask):
kubernetes.use_config_map_as_env(
task,
config_map_name="ds-pipeline-custom-env-vars",
config_map_key_to_env={"pip_index_url": "PIP_INDEX_URL", "pip_trusted_host": "PIP_TRUSTED_HOST"},
)


@dsl.component(base_image=common_base_image)
def take_nap(naptime_secs: int) -> str:
"""Sleeps for secs"""
from time import sleep # noqa: PLC0415

print(f"Sleeping for {naptime_secs} seconds: Zzzzzz ...")
sleep(naptime_secs)
return "I'm awake now. Did I snore?"


@dsl.component(base_image=common_base_image)
def wake_up(message: str):
"""Wakes up from nap printing a message"""
print(message)


@dsl.pipeline(name="take-nap-pipeline", description="Pipeline that sleeps for 15 mins (900 secs)")
def take_nap_pipeline(naptime_secs: int = 900):
take_nap_task = take_nap(naptime_secs=naptime_secs).set_caching_options(False)
add_pip_index_configuration(take_nap_task)
wake_up_task = wake_up(message=take_nap_task.output).set_caching_options(False)
add_pip_index_configuration(wake_up_task)


if __name__ == "__main__":
compiler.Compiler().compile(take_nap_pipeline, package_path=__file__.replace(".py", "_compiled.yaml"))
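
A note on the design: both tasks call set_caching_options(False) deliberately. With caching enabled, KFP v2 could reuse a cached result from a previous execution, so the run would finish immediately instead of sleeping, defeating the long-running upgrade scenario.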