# 7_get_data_train_upload.yaml

# PIPELINE DEFINITION
# Name: 7-get-data-train-upload
components:
  # Interface of the data-download component: it takes no inputs and exposes
  # two generic artifacts, one per file written by exec-get-data.
  comp-get-data:
    executorLabel: 'exec-get-data'
    outputDefinitions:
      artifacts:
        # Receives the downloaded training CSV.
        train_data_output_path:
          artifactType:
            schemaTitle: 'system.Artifact'
            schemaVersion: '0.0.1'
        # Receives the downloaded validation CSV.
        validate_data_output_path:
          artifactType:
            schemaTitle: 'system.Artifact'
            schemaVersion: '0.0.1'
  # Interface of the training component: consumes the training and validation
  # artifacts and produces a single model artifact.
  comp-train-model:
    executorLabel: 'exec-train-model'
    inputDefinitions:
      artifacts:
        # CSV the model is fitted on.
        train_data_input_path:
          artifactType:
            schemaTitle: 'system.Artifact'
            schemaVersion: '0.0.1'
        # CSV passed to Keras as validation data during fitting.
        validate_data_input_path:
          artifactType:
            schemaTitle: 'system.Artifact'
            schemaVersion: '0.0.1'
    outputDefinitions:
      artifacts:
        # Location the ONNX-converted model is saved to.
        model_output_path:
          artifactType:
            schemaTitle: 'system.Artifact'
            schemaVersion: '0.0.1'
  # Interface of the upload component: a single model artifact in, no outputs.
  comp-upload-model:
    executorLabel: 'exec-upload-model'
    inputDefinitions:
      artifacts:
        # Model file that exec-upload-model sends to the S3 bucket.
        input_model_path:
          artifactType:
            schemaTitle: 'system.Artifact'
            schemaVersion: '0.0.1'
deploymentSpec:
  executors:
    # Executor for the get-data step: bootstraps the KFP runtime with pip and
    # then runs the embedded `get_data` Python function.
    exec-get-data:
      container:
        # Arguments handed to the command chain below. '{{$}}' is a placeholder;
        # presumably the KFP backend substitutes the executor input at run time.
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - get_data
        command:
        # Stage 1: make sure pip is available (ensurepip, with apt-get as the
        # last fallback), pip-install kfp==2.8.0 (and typing-extensions on
        # Python < 3.9) with --no-deps, then run the remaining command
        # elements through "$0" "$@".
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.8.0'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
          $0\" \"$@\"\n"
        # Stage 2: write the Python source that follows ($0) to
        # ephemeral_component.py inside a fresh temp directory and run it via
        # kfp.dsl.executor_main, forwarding the remaining arguments.
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"

          '
        # Component source: get_data() downloads train.csv and validate.csv
        # from the cfchase/fraud-detection GitHub repository with
        # urllib.request.urlretrieve, one file per output path.
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef get_data(train_data_output_path: OutputPath(), validate_data_output_path:\
          \ OutputPath()):\n    import urllib.request\n    print(\"starting download...\"\
          )\n    print(\"downloading training data\")\n    url = \"https://raw.githubusercontent.com/cfchase/fraud-detection/main/data/train.csv\"\
          \n    urllib.request.urlretrieve(url, train_data_output_path)\n    print(\"\
          train data downloaded\")\n    print(\"downloading validation data\")\n \
          \   url = \"https://raw.githubusercontent.com/cfchase/fraud-detection/main/data/validate.csv\"\
          \n    urllib.request.urlretrieve(url, validate_data_output_path)\n    print(\"\
          validation data downloaded\")\n\n"
        # The same runtime image is used by all three executors in this file.
        image: quay.io/modh/runtime-images:runtime-cuda-tensorflow-ubi9-python-3.9-2024a-20240523
    # Executor for the train-model step: bootstraps the KFP runtime and the
    # ONNX tooling with pip, then runs the embedded `train_model` function.
    exec-train-model:
      container:
        # Arguments handed to the command chain below. '{{$}}' is a placeholder;
        # presumably the KFP backend substitutes the executor input at run time.
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - train_model
        command:
        # Stage 1: make sure pip is available, pip-install kfp==2.8.0 with
        # --no-deps, then pip-install onnx, onnxruntime and tf2onnx (with
        # dependencies), and finally run the remaining command elements
        # through "$0" "$@".
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.8.0'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
          \  python3 -m pip install --quiet --no-warn-script-location 'onnx' 'onnxruntime'\
          \ 'tf2onnx' && \"$0\" \"$@\"\n"
        # Stage 2: write the Python source that follows ($0) to
        # ephemeral_component.py inside a fresh temp directory and run it via
        # kfp.dsl.executor_main, forwarding the remaining arguments.
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"

          '
        # Component source: train_model() reads both CSVs with pandas, keeps
        # columns 1, 2, 4, 5, 6 as features and column 7 as the label, fits a
        # StandardScaler on the training features only, and pickles the scaler
        # to the relative path artifact/scaler.pkl. It then computes balanced
        # class weights, trains a Keras Sequential network (three Dense(32)
        # hidden layers, sigmoid output) for 2 epochs, converts it with tf2onnx
        # and saves the result to model_output_path.
        # NOTE(review): artifact/scaler.pkl is not one of the component's
        # declared outputs, so it appears to stay inside the container.
        # NOTE(review): the embedded comment says scaled data "will be between
        # -1 and 1"; StandardScaler standardizes to zero mean / unit variance
        # and does not bound values - confirm and fix in the component source.
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef train_model(train_data_input_path: InputPath(), validate_data_input_path:\
          \ InputPath(), model_output_path: OutputPath()):\n    import numpy as np\n\
          \    import pandas as pd\n    from keras.models import Sequential\n    from\
          \ keras.layers import Dense, Dropout, BatchNormalization, Activation\n \
          \   from sklearn.model_selection import train_test_split\n    from sklearn.preprocessing\
          \ import StandardScaler\n    from sklearn.utils import class_weight\n  \
          \  import tf2onnx\n    import onnx\n    import pickle\n    from pathlib\
          \ import Path\n\n    # Load the CSV data which we will use to train the\
          \ model.\n    # It contains the following fields:\n    #   distancefromhome\
          \ - The distance from home where the transaction happened.\n    #   distancefromlast_transaction\
          \ - The distance from last transaction happened.\n    #   ratiotomedianpurchaseprice\
          \ - Ratio of purchased price compared to median purchase price.\n    # \
          \  repeat_retailer - If it's from a retailer that already has been purchased\
          \ from before.\n    #   used_chip - If the (credit card) chip was used.\n\
          \    #   usedpinnumber - If the PIN number was used.\n    #   online_order\
          \ - If it was an online order.\n    #   fraud - If the transaction is fraudulent.\n\
          \n\n    feature_indexes = [\n        1,  # distance_from_last_transaction\n\
          \        2,  # ratio_to_median_purchase_price\n        4,  # used_chip\n\
          \        5,  # used_pin_number\n        6,  # online_order\n    ]\n\n  \
          \  label_indexes = [\n        7  # fraud\n    ]\n\n    X_train = pd.read_csv(train_data_input_path)\n\
          \    y_train = X_train.iloc[:, label_indexes]\n    X_train = X_train.iloc[:,\
          \ feature_indexes]\n\n    X_val = pd.read_csv(validate_data_input_path)\n\
          \    y_val = X_val.iloc[:, label_indexes]\n    X_val = X_val.iloc[:, feature_indexes]\n\
          \n    # Scale the data to remove mean and have unit variance. The data will\
          \ be between -1 and 1, which makes it a lot easier for the model to learn\
          \ than random (and potentially large) values.\n    # It is important to\
          \ only fit the scaler to the training data, otherwise you are leaking information\
          \ about the global distribution of variables (which is influenced by the\
          \ test set) into the training set.\n\n    scaler = StandardScaler()\n\n\
          \    X_train = scaler.fit_transform(X_train.values)\n\n    Path(\"artifact\"\
          ).mkdir(parents=True, exist_ok=True)\n    with open(\"artifact/scaler.pkl\"\
          , \"wb\") as handle:\n        pickle.dump(scaler, handle)\n\n    # Since\
          \ the dataset is unbalanced (it has many more non-fraud transactions than\
          \ fraudulent ones), set a class weight to weight the few fraudulent transactions\
          \ higher than the many non-fraud transactions.\n    class_weights = class_weight.compute_class_weight('balanced',\
          \ classes=np.unique(y_train), y=y_train.values.ravel())\n    class_weights\
          \ = {i: class_weights[i] for i in range(len(class_weights))}\n\n    # Build\
          \ the model, the model we build here is a simple fully connected deep neural\
          \ network, containing 3 hidden layers and one output layer.\n\n    model\
          \ = Sequential()\n    model.add(Dense(32, activation='relu', input_dim=len(feature_indexes)))\n\
          \    model.add(Dropout(0.2))\n    model.add(Dense(32))\n    model.add(BatchNormalization())\n\
          \    model.add(Activation('relu'))\n    model.add(Dropout(0.2))\n    model.add(Dense(32))\n\
          \    model.add(BatchNormalization())\n    model.add(Activation('relu'))\n\
          \    model.add(Dropout(0.2))\n    model.add(Dense(1, activation='sigmoid'))\n\
          \    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n\
          \    model.summary()\n\n    # Train the model and get performance\n\n  \
          \  epochs = 2\n    history = model.fit(X_train, y_train, epochs=epochs,\n\
          \                        validation_data=(scaler.transform(X_val.values),\
          \ y_val),\n                        verbose=True, class_weight=class_weights)\n\
          \n    # Save the model as ONNX for easy use of ModelMesh\n    model_proto,\
          \ _ = tf2onnx.convert.from_keras(model)\n    print(model_output_path)\n\
          \    onnx.save(model_proto, model_output_path)\n\n"
        # The same runtime image is used by all three executors in this file.
        image: quay.io/modh/runtime-images:runtime-cuda-tensorflow-ubi9-python-3.9-2024a-20240523
    # Executor for the upload-model step: bootstraps the KFP runtime and the
    # AWS SDK with pip, then runs the embedded `upload_model` function.
    exec-upload-model:
      container:
        # Arguments handed to the command chain below. '{{$}}' is a placeholder;
        # presumably the KFP backend substitutes the executor input at run time.
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - upload_model
        command:
        # Stage 1: make sure pip is available, pip-install kfp==2.8.0 with
        # --no-deps, then pip-install boto3 and botocore, and finally run the
        # remaining command elements through "$0" "$@".
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.8.0'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
          \  python3 -m pip install --quiet --no-warn-script-location 'boto3' 'botocore'\
          \ && \"$0\" \"$@\"\n"
        # Stage 2: write the Python source that follows ($0) to
        # ephemeral_component.py inside a fresh temp directory and run it via
        # kfp.dsl.executor_main, forwarding the remaining arguments.
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"

          '
        # Component source: upload_model() reads the AWS_* connection settings
        # and S3_KEY from the environment, opens a boto3 session and an S3
        # resource (signature version s3v4) against AWS_S3_ENDPOINT, and
        # uploads input_model_path to AWS_S3_BUCKET under the key S3_KEY.
        # The values are read with os.environ.get, so a missing variable
        # arrives as None rather than raising at lookup time.
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef upload_model(input_model_path: InputPath()):\n    import os\n\
          \    import boto3\n    import botocore\n\n    aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')\n\
          \    aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')\n \
          \   endpoint_url = os.environ.get('AWS_S3_ENDPOINT')\n    region_name =\
          \ os.environ.get('AWS_DEFAULT_REGION')\n    bucket_name = os.environ.get('AWS_S3_BUCKET')\n\
          \n    s3_key = os.environ.get(\"S3_KEY\")\n\n    session = boto3.session.Session(aws_access_key_id=aws_access_key_id,\n\
          \                                    aws_secret_access_key=aws_secret_access_key)\n\
          \n    s3_resource = session.resource(\n        's3',\n        config=botocore.client.Config(signature_version='s3v4'),\n\
          \        endpoint_url=endpoint_url,\n        region_name=region_name)\n\n\
          \    bucket = s3_resource.Bucket(bucket_name)\n\n    print(f\"Uploading\
          \ {s3_key}\")\n    bucket.upload_file(input_model_path, s3_key)\n\n"
        # Object key the model is uploaded under. The AWS_* variables are not
        # set here; they are mapped from a secret in the `platforms` document
        # of this file.
        env:
        - name: S3_KEY
          value: models/fraud/1/model.onnx
        # The same runtime image is used by all three executors in this file.
        image: quay.io/modh/runtime-images:runtime-cuda-tensorflow-ubi9-python-3.9-2024a-20240523
# Pipeline identity; the same name appears in the header comment of this file.
pipelineInfo:
  name: '7-get-data-train-upload'
root:
  dag:
    tasks:
      # First task of the DAG: runs comp-get-data and has no upstream tasks.
      get-data:
        taskInfo:
          name: get-data
        componentRef:
          name: comp-get-data
        cachingOptions:
          enableCache: true
      # Second task: runs comp-train-model on the two artifacts produced by
      # get-data.
      train-model:
        taskInfo:
          name: train-model
        componentRef:
          name: comp-train-model
        dependentTasks:
        - get-data
        inputs:
          artifacts:
            # Training CSV <- get-data.train_data_output_path
            train_data_input_path:
              taskOutputArtifact:
                producerTask: get-data
                outputArtifactKey: train_data_output_path
            # Validation CSV <- get-data.validate_data_output_path
            validate_data_input_path:
              taskOutputArtifact:
                producerTask: get-data
                outputArtifactKey: validate_data_output_path
        cachingOptions:
          enableCache: true
      # Final task: runs comp-upload-model on the model produced by
      # train-model.
      upload-model:
        taskInfo:
          name: upload-model
        componentRef:
          name: comp-upload-model
        dependentTasks:
        - train-model
        inputs:
          artifacts:
            # Model file <- train-model.model_output_path
            input_model_path:
              taskOutputArtifact:
                producerTask: train-model
                outputArtifactKey: model_output_path
        cachingOptions:
          enableCache: true
# Pipeline-spec schema version and the KFP SDK version string recorded with
# this spec (presumably the SDK that compiled it). Quoted so they are always
# read as strings.
schemaVersion: '2.1.0'
sdkVersion: 'kfp-2.8.0'
---
platforms:
  kubernetes:
    deploymentSpec:
      executors:
        # Kubernetes-specific settings for the upload executor: every key of
        # the secret below is exposed as an environment variable of the same
        # name, which is what upload_model() reads via os.environ.
        exec-upload-model:
          secretAsEnv:
          - secretName: aws-connection-my-storage
            keyToEnv:
            # Credentials
            - envVar: AWS_ACCESS_KEY_ID
              secretKey: AWS_ACCESS_KEY_ID
            - envVar: AWS_SECRET_ACCESS_KEY
              secretKey: AWS_SECRET_ACCESS_KEY
            # Target location
            - envVar: AWS_DEFAULT_REGION
              secretKey: AWS_DEFAULT_REGION
            - envVar: AWS_S3_BUCKET
              secretKey: AWS_S3_BUCKET
            - envVar: AWS_S3_ENDPOINT
              secretKey: AWS_S3_ENDPOINT