Skip to content
This repository has been archived by the owner on Jan 31, 2024. It is now read-only.

Add TrustyAI operator to ODH #864

Merged
merged 4 commits into from
Jul 19, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions kfdef/odh-core.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ spec:
name: manifests
path: data-science-pipelines-operator/
name: data-science-pipelines-operator
- kustomizeConfig:
repoRef:
name: manifests
path: trustyai-service-operator
name: trustyai-service-operator
repos:
- name: manifests
uri: https://github.com/opendatahub-io/odh-manifests/tarball/master
Expand Down
20 changes: 0 additions & 20 deletions kfdef/trustyai.yaml

This file was deleted.

166 changes: 87 additions & 79 deletions tests/basictests/trustyai.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,144 +9,152 @@ RESOURCEDIR="${MY_DIR}/../resources"

TEST_USER=${OPENSHIFT_TESTUSER_NAME:-"admin"} #Username used to login to the ODH Dashboard
TEST_PASS=${OPENSHIFT_TESTUSER_PASS:-"admin"} #Password used to login to the ODH Dashboard
# Base URL of the OAuth server, built from the host of the oauth-openshift route
OPENSHIFT_OAUTH_ENDPOINT="https://$(oc get route -n openshift-authentication oauth-openshift -o json | jq -r '.spec.host')"
# Namespace the test model is deployed into, derived from the ODH project name
MM_NAMESPACE="${ODHPROJECT}-model"

MM_NAMESPACE="${ODHPROJECT}-model"

os::test::junit::declare_suite_start "$MY_SCRIPT"
# trackers of test successes
# REQUESTS_CREATED is set to true by schedule_and_check_request; teardown only
# deletes metric requests when it is true
REQUESTS_CREATED=false
# FAILURE is flipped to true by FAILURE_HANDLING; later stages are skipped once it is true
FAILURE=false
# Snippet intended to be run with `eval` after a failed command: it records the
# failure and prints "ERROR" in red (ANSI colour 31)
FAILURE_HANDLING='FAILURE=true && echo -e "\033[0;31mERROR\033[0m"'

# Grants TEST_USER the "view" role in the ODH project and stores an OAuth
# bearer token for that user in TESTUSER_BEARER_TOKEN.
# Reads: ODHPROJECT, TEST_USER, TEST_PASS, OPENSHIFT_OAUTH_ENDPOINT.
# All expansions are quoted so that a user name or password containing
# spaces or glob characters is passed to oc/curl as a single argument.
function get_authentication(){
  header "Getting authentication credentials to cluster"
  oc adm policy add-role-to-user view -n "${ODHPROJECT}" --rolebinding-name "view-$TEST_USER" "$TEST_USER"
  # The token is returned in the redirect issued by the challenging client;
  # grep extracts the value that follows "access_token=" up to the next "&".
  TESTUSER_BEARER_TOKEN="$(curl -kiL -u "$TEST_USER:$TEST_PASS" -H 'X-CSRF-Token: xxx' "${OPENSHIFT_OAUTH_ENDPOINT}/oauth/authorize?response_type=token&client_id=openshift-challenging-client" | grep -oP 'access_token=\K[^&]*')"
}

function check_trustyai_resources() {
header "Checking that TrustyAI resources have spun up"
oc project $ODHPROJECT
os::cmd::try_until_text "oc get deployment modelmesh-controller" "modelmesh-controller" $odhdefaulttimeout $odhdefaultinterval
os::cmd::try_until_text "oc get deployment trustyai-service" "trustyai-service" $odhdefaulttimeout $odhdefaultinterval
os::cmd::try_until_text "oc get route trustyai-service-route" "trustyai-service-route" $odhdefaulttimeout $odhdefaultinterval
os::test::junit::declare_suite_start "$MY_SCRIPT"

oc wait --for=condition=Ready $(oc get pod -o name | grep trustyai) --timeout=${odhdefaulttimeout}ms
# Applies the modelmesh/enable-uwm.yaml resource to turn on User Workload
# Monitoring; on failure the shared FAILURE_HANDLING snippet is evaluated.
function setup_monitoring() {
  header "Enabling User Workload Monitoring on the cluster"
  if ! oc apply -f ${RESOURCEDIR}/modelmesh/enable-uwm.yaml; then
    eval "$FAILURE_HANDLING"
  fi
}

# Deploys the test model stack into $MM_NAMESPACE: creates and selects the
# project, applies the ModelMesh service account, labels the namespace
# modelmesh-enabled=true, then applies trustyai/secret.yaml,
# odh-mlserver-0.x.yaml, sample-minio.yaml, openvino-inference-service.yaml
# and trustyai_crd.yaml.
# NOTE(review): several commands appear twice, once bare and once followed by
# `|| eval "$FAILURE_HANDLING"`; this looks like both sides of a diff - confirm
# which set is current.
function deploy_model() {
header "Deploying model into ModelMesh"
oc new-project $MM_NAMESPACE
os::cmd::expect_success "oc project $MM_NAMESPACE"
os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/modelmesh/service_account.yaml -n ${MM_NAMESPACE}"
oc new-project $MM_NAMESPACE || true

os::cmd::expect_success "oc project $MM_NAMESPACE" || eval "$FAILURE_HANDLING"
os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/modelmesh/service_account.yaml -n ${MM_NAMESPACE}" || eval "$FAILURE_HANDLING"
oc label namespace $MM_NAMESPACE "modelmesh-enabled=true" --overwrite=true || echo "Failed to apply modelmesh-enabled label."
os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/trustyai/secret.yaml -n ${MM_NAMESPACE}"
os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/trustyai/odh-mlserver-0.x.yaml -n ${MM_NAMESPACE}"
# os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/trustyai/model.yaml -n ${MM_NAMESPACE}"
os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/trustyai/secret.yaml -n ${MM_NAMESPACE}" || eval "$FAILURE_HANDLING"
os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/trustyai/odh-mlserver-0.x.yaml -n ${MM_NAMESPACE}" || eval "$FAILURE_HANDLING"

# Generate a random 32-byte hex key and substitute it for the <secretkey>
# placeholder; sed -i rewrites sample-minio.yaml in place in the checkout.
SECRETKEY=$(openssl rand -hex 32)
sed -i "s/<secretkey>/$SECRETKEY/g" ${RESOURCEDIR}/trustyai/sample-minio.yaml
os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/trustyai/sample-minio.yaml -n ${MM_NAMESPACE}"
#os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/trustyai/openvino-serving-runtime.yaml -n ${MM_NAMESPACE}"
os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/trustyai/openvino-inference-service.yaml -n ${MM_NAMESPACE}"
sleep 30
sed -i "s/<secretkey>/$SECRETKEY/g" ${RESOURCEDIR}/trustyai/sample-minio.yaml || eval "$FAILURE_HANDLING"
os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/trustyai/sample-minio.yaml -n ${MM_NAMESPACE}" || eval "$FAILURE_HANDLING"
os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/trustyai/openvino-inference-service.yaml -n ${MM_NAMESPACE}" || eval "$FAILURE_HANDLING"
os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/trustyai/trustyai_crd.yaml -n ${MM_NAMESPACE}" || eval "$FAILURE_HANDLING"
}

# Switches to $MM_NAMESPACE and polls until the trustyai-service deployment
# and the trustyai-service-route route are listed and a trustyai-service pod
# line contains "1/1". Any failing step evaluates FAILURE_HANDLING.
function check_trustyai_resources() {
header "Checking that TrustyAI resources have spun up"
oc project $MM_NAMESPACE || eval "$FAILURE_HANDLING"

os::cmd::try_until_text "oc get deployment trustyai-service" "trustyai-service" $odhdefaulttimeout $odhdefaultinterval || eval "$FAILURE_HANDLING"
os::cmd::try_until_text "oc get route trustyai-service-route" "trustyai-service-route" $odhdefaulttimeout $odhdefaultinterval || eval "$FAILURE_HANDLING"
os::cmd::try_until_text "oc get pod | grep trustyai-service" "1/1" $odhdefaulttimeout $odhdefaultinterval || eval "$FAILURE_HANDLING"

}

# Waits for the ModelMesh serving pod line to contain "5/5" and for the
# example-sklearn-isvc route to exist, then sets two globals used by later
# stages: INFER_ROUTE (route host + path) and token (created for the user-one
# service account). Finally polls the /infer endpoint until the response
# contains "model_name".
# NOTE(review): several commands appear twice, once bare and once followed by
# `|| eval "$FAILURE_HANDLING"`; this looks like both sides of a diff - confirm
# which set is current.
function check_mm_resources() {
header "Checking that ModelMesh resources have spun up"
oc project $MM_NAMESPACE
oc project $MM_NAMESPACE || eval "$FAILURE_HANDLING"

os::cmd::try_until_text "oc get pod | grep modelmesh-serving" "5/5" $odhdefaulttimeout $odhdefaultinterval || eval "$FAILURE_HANDLING"
os::cmd::try_until_text "oc get route example-sklearn-isvc" "example-sklearn-isvc" $odhdefaulttimeout $odhdefaultinterval
INFER_ROUTE=$(oc get route example-sklearn-isvc --template={{.spec.host}}{{.spec.path}})
token=$(oc create token user-one -n ${MM_NAMESPACE})
os::cmd::try_until_text "oc get pod | grep modelmesh-serving" "5/5" $odhdefaulttimeout $odhdefaultinterval
os::cmd::try_until_text "curl -k https://$INFER_ROUTE/infer -d @${RESOURCEDIR}/trustyai/data.json -H 'Authorization: Bearer $token' -i" "model_name"
INFER_ROUTE=$(oc get route example-sklearn-isvc --template={{.spec.host}}{{.spec.path}}) || eval "$FAILURE_HANDLING"
token=$(oc create token user-one -n ${MM_NAMESPACE}) || eval "$FAILURE_HANDLING"
os::cmd::try_until_text "curl -k https://$INFER_ROUTE/infer -d @${RESOURCEDIR}/trustyai/data.json -H 'Authorization: Bearer $token' -i" "model_name" || eval "$FAILURE_HANDLING"
}

# Sends one inference request to the model route and then polls the
# trustyai-service pod logs for "Received partial input payload".
# Uses INFER_ROUTE and token, which check_mm_resources sets.
# The $(oc get pods ...) substitution sits inside a double-quoted string, so
# the pod name is resolved once, before os::cmd::try_until_text starts polling.
# NOTE(review): the bare and `|| eval "$FAILURE_HANDLING"` variants of the same
# commands both appear (one pair switches to ${ODHPROJECT} first); this looks
# like both sides of a diff - confirm which set is current.
function check_communication(){
header "Check communication between TrustyAI and ModelMesh"
oc project $MM_NAMESPACE
oc project $MM_NAMESPACE || eval "$FAILURE_HANDLING"

# send some data to modelmesh
os::cmd::expect_success_and_text "curl -k https://$INFER_ROUTE/infer -d @${RESOURCEDIR}/trustyai/data.json -H 'Authorization: Bearer $token' -i" "model_name"
oc project ${ODHPROJECT}
os::cmd::try_until_text "oc logs $(oc get pods -o name | grep trustyai-service)" "Received partial input payload" $odhdefaulttimeout $odhdefaultinterval
os::cmd::expect_success_and_text "curl -k https://$INFER_ROUTE/infer -d @${RESOURCEDIR}/trustyai/data.json -H 'Authorization: Bearer $token' -i" "model_name" || eval "$FAILURE_HANDLING"
os::cmd::try_until_text "oc logs $(oc get pods -o name | grep trustyai-service)" "Received partial input payload" $odhdefaulttimeout $odhdefaultinterval || eval "$FAILURE_HANDLING"
}

# Sends 500 randomised inference requests to the model route so that TrustyAI
# has data to compute metrics on.
# Reads: MM_NAMESPACE, RESOURCEDIR, and INFER_ROUTE / token (both set by
# check_mm_resources). Requests run in the background and their output is
# discarded, so individual request failures are not reported.
function generate_data(){
  header "Generate some data for TrustyAI (this will take a sec)"
  oc project $MM_NAMESPACE || eval "$FAILURE_HANDLING"

  # send a bunch of random data to the model
  DIVISOR=128.498 # NOTE(review): not referenced below; the substitution uses integer division by 128 instead
  for i in {1..500};
  do
    # Replace the sample row in data.json with random values: the first field
    # is 0 or 1, the others are $RANDOM / 128 (integer division, so 0..255).
    DATA=$(sed "s/\[40.83, 3.5, 0.5, 0\]/\[$(($RANDOM % 2)),$(($RANDOM / 128)),$(($RANDOM / 128)), $(($RANDOM / 128)) \]/" ${RESOURCEDIR}/trustyai/data.json) || eval "$FAILURE_HANDLING"
    # The header must be double-quoted: in single quotes the literal text
    # "$token" was sent instead of the bearer token.
    curl -k https://$INFER_ROUTE/infer -d "$DATA" -H "Authorization: Bearer $token" -i > /dev/null 2>&1 &
    sleep .01
  done
}

# Posts an "spd" metric request for model example-sklearn-isvc to the TrustyAI
# route, expects "requestId" in the response, then polls /q/metrics until
# "trustyai_spd" appears. Sets REQUESTS_CREATED=true so that teardown knows
# there are metric requests to delete.
# NOTE(review): two variants of the route lookup and of the request payload are
# interleaved here (http with typed attribute objects, and https -k with scalar
# attributes); this looks like both sides of a diff - confirm which is current.
function schedule_and_check_request(){
header "Create a metric request and confirm calculation"
oc project $ODHPROJECT
TRUSTY_ROUTE=$(oc get route/trustyai --template={{.spec.host}})
oc project $MM_NAMESPACE

TRUSTY_ROUTE=https://$(oc get route/trustyai-service --template={{.spec.host}}) || eval "$FAILURE_HANDLING"

os::cmd::expect_success_and_text "curl --location http://$TRUSTY_ROUTE/metrics/spd/request \
os::cmd::expect_success_and_text "curl -k --location $TRUSTY_ROUTE/metrics/spd/request \
--header 'Content-Type: application/json' \
--data '{
\"modelId\": \"example-sklearn-isvc\",
\"protectedAttribute\": \"input-0\",
\"favorableOutcome\": {
\"type\": \"INT64\",
\"value\": 0.0
},
\"outcomeName\": \"output-0\",
\"privilegedAttribute\": {
\"type\": \"DOUBLE\",
\"value\": 0.0
},
\"unprivilegedAttribute\": {
\"type\": \"DOUBLE\",
\"value\": 1.0
}
}'" "requestId"
os::cmd::try_until_text "curl http://$TRUSTY_ROUTE/q/metrics" "trustyai_spd"
\"protectedAttribute\": \"predict-0\",
\"favorableOutcome\": 0,
\"outcomeName\": \"predict\",
\"privilegedAttribute\": 0.0,
\"unprivilegedAttribute\": 1.0
}'" "requestId" || eval "$FAILURE_HANDLING"
os::cmd::try_until_text "curl -k $TRUSTY_ROUTE/q/metrics" "trustyai_spd" || eval "$FAILURE_HANDLING"
REQUESTS_CREATED=true;
}


# Verifies that the trustyai_spd metric can be queried from monitoring.
# Two variants are present: one queries the odh-model-monitoring route with
# TESTUSER_BEARER_TOKEN; the other reads a prometheus-user-workload token from
# a secret and queries the thanos-querier route, expecting the first result's
# "protected" label to contain "predict-0".
# NOTE(review): this looks like both sides of a diff - confirm which is current.
# NOTE(review): the query hard-codes namespace 'opendatahub-model' rather than
# using $MM_NAMESPACE; it only matches when ODHPROJECT is "opendatahub".
function test_prometheus_scraping(){
header "Ensure metrics are in Prometheus"
MODEL_MONITORING_ROUTE=$(oc get route -n ${ODHPROJECT} odh-model-monitoring --template={{.spec.host}})
os::cmd::try_until_text "curl -k --location -g --request GET 'https://'$MODEL_MONITORING_ROUTE'//api/v1/query?query=trustyai_spd' -H 'Authorization: Bearer $TESTUSER_BEARER_TOKEN' -i" "value" $odhdefaulttimeout $odhdefaultinterval

SECRET=`oc get secret -n openshift-user-workload-monitoring | grep prometheus-user-workload-token | head -n 1 | awk '{print $1 }'` || eval "$FAILURE_HANDLING"
TOKEN=`echo $(oc get secret $SECRET -n openshift-user-workload-monitoring -o json | jq -r '.data.token') | base64 -d` || eval "$FAILURE_HANDLING"
THANOS_QUERIER_HOST=`oc get route thanos-querier -n openshift-monitoring -o json | jq -r '.spec.host'` || eval "$FAILURE_HANDLING"
os::cmd::try_until_text "curl -X GET -kG \"https://$THANOS_QUERIER_HOST/api/v1/query?\" --data-urlencode \"query=trustyai_spd{namespace='opendatahub-model'}\" -H 'Authorization: Bearer $TOKEN' | jq '.data.result[0].metric.protected'" "predict-0" $odhdefaulttimeout $odhdefaultinterval || eval "$FAILURE_HANDLING"
}

# Cleans up after the test: deletes metric requests through the TrustyAI route
# (in one variant only the first "spd" request; in the other every "spd" and
# "dir" request, and only when REQUESTS_CREATED is true), then deletes the
# applied resources and the $MM_NAMESPACE project.
# NOTE(review): two variants of the cleanup are interleaved; this looks like
# both sides of a diff - confirm which is current.
# NOTE(review): TRUSTY_ROUTE is rebuilt here with http://, while
# schedule_and_check_request builds it with https:// - confirm the intended scheme.
function teardown_trustyai_test() {
header "Cleaning up the TrustyAI test"
oc project $ODHPROJECT

REQUEST_ID="$(curl http://$TRUSTY_ROUTE/metrics/spd/requests | jq '.requests [0].id')"

os::cmd::expect_success_and_text "curl -X DELETE --location http://$TRUSTY_ROUTE/metrics/spd/request \
-H 'Content-Type: application/json' \
-d '{
\"requestId\": \"'"$REQUEST_ID"'\"
}'" "Removed"
oc project $MM_NAMESPACE || eval "$FAILURE_HANDLING"
TRUSTY_ROUTE=http://$(oc get route/trustyai-service --template={{.spec.host}}) || eval "$FAILURE_HANDLING"

oc project $MM_NAMESPACE
os::cmd::expect_success "oc delete -f ${RESOURCEDIR}/trustyai/secret.yaml"
os::cmd::expect_success "oc delete -f ${RESOURCEDIR}/trustyai/odh-mlserver-0.x.yaml"
os::cmd::expect_success "oc delete -f ${RESOURCEDIR}/trustyai/model.yaml"
os::cmd::expect_success "oc delete project $MM_NAMESPACE"
if [ $REQUESTS_CREATED = true ]; then
for METRIC_NAME in "spd" "dir"
do
curl -sk $TRUSTY_ROUTE/metrics/$METRIC_NAME/requests
for REQUEST in $(curl -sk $TRUSTY_ROUTE/metrics/$METRIC_NAME/requests | jq -r '.requests [].id')
do
echo -n $REQUEST": "
curl -k -X DELETE --location $TRUSTY_ROUTE/metrics/$METRIC_NAME/request \
-H 'Content-Type: application/json' \
-d "{
\"requestId\": \"$REQUEST\"
}"
echo
done
done
fi

os::cmd::expect_success "oc delete -f ${RESOURCEDIR}/trustyai/secret.yaml" || eval "$FAILURE_HANDLING"
os::cmd::expect_success "oc delete -f ${RESOURCEDIR}/trustyai/odh-mlserver-0.x.yaml" || eval "$FAILURE_HANDLING"
os::cmd::expect_success "oc delete -f ${RESOURCEDIR}/trustyai/trustyai_crd.yaml" || eval "$FAILURE_HANDLING"
os::cmd::expect_success "oc delete project $MM_NAMESPACE" || eval "$FAILURE_HANDLING"
}

# Test driver. Two variants of the stage list are present: an unconditional
# sequence, and a guarded sequence in which each stage runs only while FAILURE
# is still false and otherwise prints a red "Skipping ..." message.
# NOTE(review): this looks like both sides of a diff - confirm which is current.
# NOTE(review): with `cond && stage || echo ...` the skip message is also
# printed if the stage itself returns non-zero.
get_authentication
deploy_model
check_mm_resources
check_communication
generate_data
schedule_and_check_request
test_prometheus_scraping
setup_monitoring
[ $FAILURE = false ] && deploy_model || echo -e "\033[0;31mSkipping model deployment due to previous failure\033[0m"
[ $FAILURE = false ] && check_trustyai_resources || echo -e "\033[0;31mSkipping TrustyAI resource check due to previous failure\033[0m"
[ $FAILURE = false ] && check_mm_resources || echo -e "\033[0;31mSkipping ModelMesh resource check due to previous failure\033[0m"
[ $FAILURE = false ] && check_communication || echo -e "\033[0;31mSkipping ModelMesh-TrustyAI communication check due to previous failure\033[0m"
[ $FAILURE = false ] && generate_data || echo -e "\033[0;31mSkipping data generation due to previous failure\033[0m"
[ $FAILURE = false ] && schedule_and_check_request || echo -e "\033[0;31mSkipping metric scheduling due to previous failure\033[0m"
[ $FAILURE = false ] && test_prometheus_scraping || echo -e "\033[0;31mSkipping Prometheus data check due to previous failure\033[0m"
# Teardown is not guarded by FAILURE, so cleanup runs even after a failed stage
teardown_trustyai_test

# If any stage recorded a failure, run a command that exits 1 under
# expect_success so the suite gets a failing assertion
[ $FAILURE = true ] && os::cmd::expect_success "echo 'A previous assertion failed, marking suite as failed' && exit 1"

os::test::junit::declare_suite_end
os::test::junit::declare_suite_end
13 changes: 0 additions & 13 deletions tests/resources/trustyai/model.yaml

This file was deleted.

17 changes: 0 additions & 17 deletions tests/resources/trustyai/service_account.yaml

This file was deleted.

18 changes: 18 additions & 0 deletions tests/resources/trustyai/trustyai_crd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# TrustyAIService custom resource applied by the TrustyAI basic test.
apiVersion: trustyai.opendatahub.io.trustyai.opendatahub.io/v1alpha1
kind: TrustyAIService
metadata:
  name: trustyai-service
spec:
  # Optional values for replicas, image and tag. Below are the default values.
  # NOTE(review): trustyai_operator_configmap.yaml in this change set names the
  # image quay.io/trustyai/trustyai-service - confirm which default is current.
  # replicas: 1
  # image: quay.io/trustyaiservice/trustyai-service
  # tag: latest
  storage:
    format: "PVC"
    folder: "/inputs"
    size: "1Gi"
  data:
    filename: "data.csv"
    format: "CSV"
  metrics:
    schedule: "5s"
7 changes: 7 additions & 0 deletions tests/resources/trustyai/trustyai_operator_configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# ConfigMap naming the TrustyAI service image and tag for the operator.
apiVersion: v1
kind: ConfigMap
metadata:
  name: trustyai-service-operator-config
data:
  trustyaiServiceImageName: "quay.io/trustyai/trustyai-service"
  trustyaiServiceImageTag: "latest"
15 changes: 15 additions & 0 deletions tests/resources/trustyai/trustyai_operator_kfdef.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# KfDef that installs the trustyai-service-operator manifests from the
# trustyai-explainability/trustyai-service-operator repository (main branch).
apiVersion: kfdef.apps.kubeflow.org/v1
kind: KfDef
metadata:
  name: trustyai-service-operator
spec:
  applications:
    - kustomizeConfig:
        repoRef:
          name: manifests
          path: trustyai-service-operator
      name: trustyai-service-operator
  repos:
    - name: manifests
      uri: https://github.com/trustyai-explainability/trustyai-service-operator/tarball/main
  version: v1.0.0
4 changes: 2 additions & 2 deletions tests/setup/odh-core.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ spec:
- kustomizeConfig:
repoRef:
name: manifests
path: trustyai-service
name: trustyai
path: trustyai-service-operator
name: trustyai-service-operator
- kustomizeConfig:
repoRef:
name: manifests
Expand Down
Loading