Skip to content
This repository has been archived by the owner on Jan 31, 2024. It is now read-only.

Add TrustyAI operator to ODH #864

Merged
merged 4 commits into from
Jul 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions kfdef/odh-core.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ spec:
name: manifests
path: data-science-pipelines-operator/
name: data-science-pipelines-operator
- kustomizeConfig:
repoRef:
name: manifests
path: trustyai-service-operator
name: trustyai-service-operator
repos:
- name: manifests
uri: https://github.com/opendatahub-io/odh-manifests/tarball/master
Expand Down
20 changes: 0 additions & 20 deletions kfdef/trustyai.yaml

This file was deleted.

166 changes: 87 additions & 79 deletions tests/basictests/trustyai.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,144 +9,152 @@ RESOURCEDIR="${MY_DIR}/../resources"

TEST_USER=${OPENSHIFT_TESTUSER_NAME:-"admin"} #Username used to login to the ODH Dashboard
TEST_PASS=${OPENSHIFT_TESTUSER_PASS:-"admin"} #Password used to login to the ODH Dashboard
OPENSHIFT_OAUTH_ENDPOINT="https://$(oc get route -n openshift-authentication oauth-openshift -o json | jq -r '.spec.host')"
MM_NAMESPACE="${ODHPROJECT}-model"

MM_NAMESPACE="${ODHPROJECT}-model"

os::test::junit::declare_suite_start "$MY_SCRIPT"
# trackers of test successes
REQUESTS_CREATED=false
FAILURE=false
FAILURE_HANDLING='FAILURE=true && echo -e "\033[0;31mERROR\033[0m"'

# Obtains cluster credentials for TEST_USER.
# Grants TEST_USER the "view" role in ${ODHPROJECT}, then requests an OAuth token through the
# openshift-challenging-client flow and stores it in the global TESTUSER_BEARER_TOKEN.
# Reads: ODHPROJECT, TEST_USER, TEST_PASS, OPENSHIFT_OAUTH_ENDPOINT.
function get_authentication(){
header "Getting authentication credentials to cluster"
# The rolebinding is named per user ("view-<user>").
oc adm policy add-role-to-user view -n ${ODHPROJECT} --rolebinding-name "view-$TEST_USER" $TEST_USER
# curl -i prints response headers and -L follows redirects; grep -oP with \K keeps only the
# text after "access_token=" up to the next "&" (requires a grep with PCRE support).
TESTUSER_BEARER_TOKEN="$(curl -kiL -u $TEST_USER:$TEST_PASS -H 'X-CSRF-Token: xxx' $OPENSHIFT_OAUTH_ENDPOINT'/oauth/authorize?response_type=token&client_id=openshift-challenging-client' | grep -oP 'access_token=\K[^&]*')"
}

function check_trustyai_resources() {
header "Checking that TrustyAI resources have spun up"
oc project $ODHPROJECT
os::cmd::try_until_text "oc get deployment modelmesh-controller" "modelmesh-controller" $odhdefaulttimeout $odhdefaultinterval
os::cmd::try_until_text "oc get deployment trustyai-service" "trustyai-service" $odhdefaulttimeout $odhdefaultinterval
os::cmd::try_until_text "oc get route trustyai-service-route" "trustyai-service-route" $odhdefaulttimeout $odhdefaultinterval
os::test::junit::declare_suite_start "$MY_SCRIPT"

oc wait --for=condition=Ready $(oc get pod -o name | grep trustyai) --timeout=${odhdefaulttimeout}ms
# Applies ${RESOURCEDIR}/modelmesh/enable-uwm.yaml to the cluster (per the header text, this
# enables User Workload Monitoring). If the apply fails, FAILURE_HANDLING is evaluated, which
# sets FAILURE=true and prints a red "ERROR" marker.
function setup_monitoring() {
header "Enabling User Workload Monitoring on the cluster"
oc apply -f ${RESOURCEDIR}/modelmesh/enable-uwm.yaml || eval "$FAILURE_HANDLING"
}

# Creates the ModelMesh test namespace and deploys everything the test needs into it:
# service account, serving runtime, MinIO sample storage, the example inference service and
# the TrustyAIService custom resource.
# Reads: MM_NAMESPACE, RESOURCEDIR. Any failed step evaluates FAILURE_HANDLING (FAILURE=true).
function deploy_model() {
    header "Deploying model into ModelMesh"
    # The project may survive from an earlier run; an "already exists" error is not a failure.
    oc new-project $MM_NAMESPACE || true

    os::cmd::expect_success "oc project $MM_NAMESPACE" || eval "$FAILURE_HANDLING"
    os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/modelmesh/service_account.yaml -n ${MM_NAMESPACE}" || eval "$FAILURE_HANDLING"
    oc label namespace $MM_NAMESPACE "modelmesh-enabled=true" --overwrite=true || echo "Failed to apply modelmesh-enabled label."
    os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/trustyai/secret.yaml -n ${MM_NAMESPACE}" || eval "$FAILURE_HANDLING"
    os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/trustyai/odh-mlserver-0.x.yaml -n ${MM_NAMESPACE}" || eval "$FAILURE_HANDLING"

    # Generate a random MinIO secret key and substitute it for the <secretkey> placeholder.
    # NOTE: sed -i rewrites the manifest in place, so the placeholder is gone after the first run.
    SECRETKEY=$(openssl rand -hex 32)
    sed -i "s/<secretkey>/$SECRETKEY/g" ${RESOURCEDIR}/trustyai/sample-minio.yaml || eval "$FAILURE_HANDLING"
    os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/trustyai/sample-minio.yaml -n ${MM_NAMESPACE}" || eval "$FAILURE_HANDLING"
    os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/trustyai/openvino-inference-service.yaml -n ${MM_NAMESPACE}" || eval "$FAILURE_HANDLING"
    os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/trustyai/trustyai_crd.yaml -n ${MM_NAMESPACE}" || eval "$FAILURE_HANDLING"
}

# Waits for the TrustyAI service objects to show up in ${MM_NAMESPACE}.
# Each check polls with the harness defaults ($odhdefaulttimeout / $odhdefaultinterval) and
# evaluates FAILURE_HANDLING (FAILURE=true) if it does not succeed.
function check_trustyai_resources() {
header "Checking that TrustyAI resources have spun up"
oc project $MM_NAMESPACE || eval "$FAILURE_HANDLING"

# Deployment and route must be listed; the pod line must then contain "1/1".
os::cmd::try_until_text "oc get deployment trustyai-service" "trustyai-service" $odhdefaulttimeout $odhdefaultinterval || eval "$FAILURE_HANDLING"
os::cmd::try_until_text "oc get route trustyai-service-route" "trustyai-service-route" $odhdefaulttimeout $odhdefaultinterval || eval "$FAILURE_HANDLING"
os::cmd::try_until_text "oc get pod | grep trustyai-service" "1/1" $odhdefaulttimeout $odhdefaultinterval || eval "$FAILURE_HANDLING"

}

# Waits for the ModelMesh serving pod and the example model route, then confirms the model
# answers an inference request.
# Sets globals used by later steps: INFER_ROUTE (route host + path) and token (from
# `oc create token user-one`).
# Any failed step evaluates FAILURE_HANDLING (FAILURE=true).
function check_mm_resources() {
    header "Checking that ModelMesh resources have spun up"
    oc project $MM_NAMESPACE || eval "$FAILURE_HANDLING"

    # The pod line must contain "5/5" before the route is queried.
    os::cmd::try_until_text "oc get pod | grep modelmesh-serving" "5/5" $odhdefaulttimeout $odhdefaultinterval || eval "$FAILURE_HANDLING"
    # A missing route is recorded as a failure, like every other step in this function.
    os::cmd::try_until_text "oc get route example-sklearn-isvc" "example-sklearn-isvc" $odhdefaulttimeout $odhdefaultinterval || eval "$FAILURE_HANDLING"
    INFER_ROUTE=$(oc get route example-sklearn-isvc --template={{.spec.host}}{{.spec.path}}) || eval "$FAILURE_HANDLING"
    token=$(oc create token user-one -n ${MM_NAMESPACE}) || eval "$FAILURE_HANDLING"
    # $token is expanded here, while the command string is built; the single quotes are part
    # of the command that the harness runs later.
    os::cmd::try_until_text "curl -k https://$INFER_ROUTE/infer -d @${RESOURCEDIR}/trustyai/data.json -H 'Authorization: Bearer $token' -i" "model_name" || eval "$FAILURE_HANDLING"
}

# Sends one inference request to the model and checks that the TrustyAI service logged the
# payload, i.e. that ModelMesh forwards data to TrustyAI.
# Reads: MM_NAMESPACE, INFER_ROUTE, token, RESOURCEDIR.
# Any failed step evaluates FAILURE_HANDLING (FAILURE=true).
function check_communication(){
    header "Check communication between TrustyAI and ModelMesh"
    # Stay in the model namespace: the TrustyAIService is applied with -n ${MM_NAMESPACE}.
    oc project $MM_NAMESPACE || eval "$FAILURE_HANDLING"

    # send some data to modelmesh
    os::cmd::expect_success_and_text "curl -k https://$INFER_ROUTE/infer -d @${RESOURCEDIR}/trustyai/data.json -H 'Authorization: Bearer $token' -i" "model_name" || eval "$FAILURE_HANDLING"
    # NOTE: $(oc get pods ...) is expanded once, when this string is built, so the pod name is
    # fixed for all retries.
    os::cmd::try_until_text "oc logs $(oc get pods -o name | grep trustyai-service)" "Received partial input payload" $odhdefaulttimeout $odhdefaultinterval || eval "$FAILURE_HANDLING"
}

# Fires 500 inference requests with randomised feature values at the model so TrustyAI has
# data to compute metrics on. Requests run in the background and their output is discarded.
# Reads: MM_NAMESPACE, INFER_ROUTE, token, RESOURCEDIR.
function generate_data(){
    header "Generate some data for TrustyAI (this will take a sec)"
    oc project $MM_NAMESPACE || eval "$FAILURE_HANDLING"

    # send a bunch of random data to the model
    # NOTE: DIVISOR is not referenced below; the sed expression uses integer division by 128,
    # which maps $RANDOM (0..32767) onto 0..255.
    DIVISOR=128.498 # divide bash's $RANDOM by this to get a float range of [0.,255.], for MNIST
    for i in {1..500};
    do
        # Replace the sample row in data.json with one random 0/1 value and three 0..255 values.
        DATA=$(sed "s/\[40.83, 3.5, 0.5, 0\]/\[$(($RANDOM % 2)),$(($RANDOM / 128)),$(($RANDOM / 128)), $(($RANDOM / 128)) \]/" ${RESOURCEDIR}/trustyai/data.json) || eval "$FAILURE_HANDLING"
        # The header must be double-quoted: this curl runs directly, so single quotes would
        # send the literal text "$token" instead of the bearer token.
        curl -k https://$INFER_ROUTE/infer -d "$DATA" -H "Authorization: Bearer $token" -i > /dev/null 2>&1 &
        sleep .01
    done
}

# Schedules a recurring SPD metric request on the TrustyAI service and waits until the
# metric shows up on the service's /q/metrics endpoint.
# Sets globals: TRUSTY_ROUTE (https URL of the trustyai-service route) and REQUESTS_CREATED
# (true once a request was accepted, so teardown knows there is something to delete).
# Any failed step evaluates FAILURE_HANDLING (FAILURE=true).
function schedule_and_check_request(){
    header "Create a metric request and confirm calculation"
    oc project $MM_NAMESPACE || eval "$FAILURE_HANDLING"

    TRUSTY_ROUTE=https://$(oc get route/trustyai-service --template={{.spec.host}}) || eval "$FAILURE_HANDLING"

    # Only record the request as created when the service actually returned a requestId.
    if os::cmd::expect_success_and_text "curl -k --location $TRUSTY_ROUTE/metrics/spd/request \
        --header 'Content-Type: application/json' \
        --data '{
            \"modelId\": \"example-sklearn-isvc\",
            \"protectedAttribute\": \"predict-0\",
            \"favorableOutcome\": 0,
            \"outcomeName\": \"predict\",
            \"privilegedAttribute\": 0.0,
            \"unprivilegedAttribute\": 1.0
        }'" "requestId"; then
        REQUESTS_CREATED=true
    else
        eval "$FAILURE_HANDLING"
    fi
    os::cmd::try_until_text "curl -k $TRUSTY_ROUTE/q/metrics" "trustyai_spd" || eval "$FAILURE_HANDLING"
}


# Verifies that the trustyai_spd metric for the test namespace can be queried through the
# cluster's Thanos Querier, using the user-workload Prometheus token for authentication.
# Reads: MM_NAMESPACE. Any failed step evaluates FAILURE_HANDLING (FAILURE=true).
function test_prometheus_scraping(){
    header "Ensure metrics are in Prometheus"

    # The pipeline's exit status is awk's, so an empty result has to be checked explicitly.
    SECRET=$(oc get secret -n openshift-user-workload-monitoring | grep prometheus-user-workload-token | head -n 1 | awk '{print $1 }')
    [ -n "$SECRET" ] || eval "$FAILURE_HANDLING"
    TOKEN=$(oc get secret $SECRET -n openshift-user-workload-monitoring -o json | jq -r '.data.token' | base64 -d) || eval "$FAILURE_HANDLING"
    THANOS_QUERIER_HOST=$(oc get route thanos-querier -n openshift-monitoring -o json | jq -r '.spec.host') || eval "$FAILURE_HANDLING"
    # Query by ${MM_NAMESPACE} rather than a fixed name so the check follows ODHPROJECT.
    os::cmd::try_until_text "curl -X GET -kG \"https://$THANOS_QUERIER_HOST/api/v1/query?\" --data-urlencode \"query=trustyai_spd{namespace='${MM_NAMESPACE}'}\" -H 'Authorization: Bearer $TOKEN' | jq '.data.result[0].metric.protected'" "predict-0" $odhdefaulttimeout $odhdefaultinterval || eval "$FAILURE_HANDLING"
}

# Removes everything the test created: scheduled metric requests (only if any were created),
# the test manifests, and finally the model namespace itself.
# Reads: MM_NAMESPACE, RESOURCEDIR, REQUESTS_CREATED. Sets global TRUSTY_ROUTE.
# Any failed step evaluates FAILURE_HANDLING (FAILURE=true).
function teardown_trustyai_test() {
    header "Cleaning up the TrustyAI test"
    oc project $MM_NAMESPACE || eval "$FAILURE_HANDLING"
    # Use https, as schedule_and_check_request does; the list call below does not follow redirects.
    TRUSTY_ROUTE=https://$(oc get route/trustyai-service --template={{.spec.host}}) || eval "$FAILURE_HANDLING"

    if [ "$REQUESTS_CREATED" = true ]; then
        local requests_json
        for METRIC_NAME in "spd" "dir"
        do
            # Fetch the list once: print it for the log, then delete each request by id.
            requests_json=$(curl -sk "$TRUSTY_ROUTE/metrics/$METRIC_NAME/requests")
            echo "$requests_json"
            for REQUEST in $(echo "$requests_json" | jq -r '.requests [].id')
            do
                echo -n $REQUEST": "
                curl -k -X DELETE --location "$TRUSTY_ROUTE/metrics/$METRIC_NAME/request" \
                    -H 'Content-Type: application/json' \
                    -d "{
                        \"requestId\": \"$REQUEST\"
                    }"
                echo
            done
        done
    fi

    os::cmd::expect_success "oc delete -f ${RESOURCEDIR}/trustyai/secret.yaml" || eval "$FAILURE_HANDLING"
    os::cmd::expect_success "oc delete -f ${RESOURCEDIR}/trustyai/odh-mlserver-0.x.yaml" || eval "$FAILURE_HANDLING"
    os::cmd::expect_success "oc delete -f ${RESOURCEDIR}/trustyai/trustyai_crd.yaml" || eval "$FAILURE_HANDLING"
    os::cmd::expect_success "oc delete project $MM_NAMESPACE" || eval "$FAILURE_HANDLING"
}

# Runs one test step unless an earlier step already failed.
#   $1   - short description used in the "Skipping ..." message
#   $2.. - function to run, with any arguments
# An explicit if/else is used so the skip message is printed only when the step was really
# skipped, not when the step ran and returned non-zero.
function run_unless_failed() {
    local description="$1"
    shift
    if [ "$FAILURE" = false ]; then
        "$@"
    else
        echo -e "\033[0;31mSkipping ${description} due to previous failure\033[0m"
    fi
}

setup_monitoring
run_unless_failed "model deployment" deploy_model
run_unless_failed "TrustyAI resource check" check_trustyai_resources
run_unless_failed "ModelMesh resource check" check_mm_resources
run_unless_failed "ModelMesh-TrustyAI communication check" check_communication
run_unless_failed "data generation" generate_data
run_unless_failed "metric scheduling" schedule_and_check_request
run_unless_failed "Prometheus data check" test_prometheus_scraping
# Teardown always runs so the cluster is cleaned up even after a failure.
teardown_trustyai_test

# Surface any recorded failure through the test harness so the suite is marked as failed.
if [ "$FAILURE" = true ]; then
    os::cmd::expect_success "echo 'A previous assertion failed, marking suite as failed' && exit 1"
fi

os::test::junit::declare_suite_end
13 changes: 0 additions & 13 deletions tests/resources/trustyai/model.yaml

This file was deleted.

17 changes: 0 additions & 17 deletions tests/resources/trustyai/service_account.yaml

This file was deleted.

14 changes: 14 additions & 0 deletions tests/resources/trustyai/trustyai_crd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
---
# TrustyAIService custom resource used by the TrustyAI basic test.
# NOTE(review): the API group repeats "trustyai.opendatahub.io" — confirm it matches the
# group of the CRD installed by the operator version under test.
apiVersion: trustyai.opendatahub.io.trustyai.opendatahub.io/v1alpha1
kind: TrustyAIService
metadata:
  name: trustyai-service
spec:
  # Where the service keeps the inference data it receives.
  storage:
    format: "PVC"
    folder: "/inputs"
    size: "1Gi"
  # File name and format of the stored data.
  data:
    filename: "data.csv"
    format: "CSV"
  # Interval at which scheduled metrics are computed.
  metrics:
    schedule: "5s"
7 changes: 7 additions & 0 deletions tests/resources/trustyai/trustyai_operator_configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# Operator configuration: the container image (name and tag) of the TrustyAI service.
apiVersion: v1
kind: ConfigMap
metadata:
  name: trustyai-service-operator-config
data:
  trustyaiServiceImageName: "quay.io/trustyai/trustyai-service"
  # NOTE(review): "latest" is a floating tag; pin a version if reproducible runs are needed.
  trustyaiServiceImageTag: "latest"
15 changes: 15 additions & 0 deletions tests/resources/trustyai/trustyai_operator_kfdef.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
# KfDef that installs only the TrustyAI service operator, taken from the operator's own
# repository (main branch tarball).
apiVersion: kfdef.apps.kubeflow.org/v1
kind: KfDef
metadata:
  name: trustyai-service-operator
spec:
  applications:
    - kustomizeConfig:
        repoRef:
          name: manifests
          path: trustyai-service-operator
      name: trustyai-service-operator
  repos:
    - name: manifests
      uri: https://github.com/trustyai-explainability/trustyai-service-operator/tarball/main
  version: v1.0.0
4 changes: 2 additions & 2 deletions tests/setup/odh-core.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ spec:
- kustomizeConfig:
repoRef:
name: manifests
path: trustyai-service
name: trustyai
path: trustyai-service-operator
name: trustyai-service-operator
- kustomizeConfig:
repoRef:
name: manifests
Expand Down
Loading