From 3f018571f0dd79bb9905dde6855642c73478c048 Mon Sep 17 00:00:00 2001 From: Christian Kadner Date: Tue, 21 Mar 2023 12:39:26 -0700 Subject: [PATCH] Bug fixes and E2E tests for PVC storage (#340) Motivation Address PVC follow-up work items outlined in #337 for PVC storage introduced in #267 Modifications Code changes: - Sort PVC mounts on serving runtime specs to avoid unstable repeated runtime rollouts as Kubernetes treat two otherwise identical deployment specs as different if the same set of volume mounts are in different order - Don't add non-existent PVCs from predictor/ISVC when allowAnyPVC is enabled as this would cause all serving pods for that runtime to stay in - Pending state with unbound (pending) volumes - Tolerate missing storage-config secret when allowAnyPVC is enabled - Lint: fix "io/ioutil" deprecations FVT changes: - Add Storage test suite - Add helper methods to add PVC to storage-config during FVT - Allow for additional time in WaitForReadyDeployStatus but allow early abort on success - Check if pod still running before gRPC/REST requests, reconnect if necessary - Only choose "Ready" runtime pod for port-forwards - Include ISVC tests in Predictor test suite to ensure "serial" execution of TLS tests Resolves #337 Signed-off-by: Christian Kadner --- .github/workflows/run-fvt.yml | 7 +- Makefile | 2 +- config/dependencies/fvt.yaml | 117 ++++++++ config/dependencies/minio-storage-secret.yaml | 14 +- config/rbac/cluster-scope/role.yaml | 1 + config/rbac/namespace-scope/role.yaml | 1 + controllers/config/templating.go | 5 +- controllers/modelmesh/etcd.go | 2 +- controllers/modelmesh/runtime.go | 1 + controllers/service_controller.go | 13 - controllers/servingruntime_controller.go | 117 +++++--- controllers/suite_test.go | 5 +- docs/install/README.md | 2 +- docs/install/install-script.md | 5 +- docs/predictors/setup-storage.md | 248 ++++++++++++++- docs/quickstart.md | 3 +- docs/release-process.md | 11 +- fvt/README.md | 2 +- fvt/fvtclient.go | 190 +++++++++--- fvt/globals.go | 25 +- fvt/helpers.go | 197 ++++++++---- fvt/inference.go | 7 +- fvt/predictor/isvc_test.go | 63 ---- fvt/predictor/predictor_suite_test.go | 49 +-- fvt/predictor/predictor_test.go | 282 ++++++++++-------- fvt/scaleToZero/scaleToZero_suite_test.go | 50 +--- fvt/scaleToZero/scale_to_zero_test.go | 6 +- fvt/storage/storage_suite_test.go | 105 +++++++ fvt/storage/storage_test.go | 189 ++++++++++++ fvt/testdata/isvcs/isvc-pvc-2.yaml | 12 + fvt/testdata/isvcs/isvc-pvc-3.yaml | 12 + fvt/testdata/isvcs/isvc-pvc-4.yaml | 12 + fvt/testdata/isvcs/isvc-pvc-path.yaml | 16 + fvt/testdata/isvcs/isvc-pvc-uri.yaml | 12 + fvt/utils.go | 35 ++- main.go | 3 +- pkg/config/config.go | 2 +- pkg/mmesh/etcdrangewatcher_test.go | 12 +- scripts/install.sh | 16 +- 39 files changed, 1405 insertions(+), 446 deletions(-) delete mode 100644 fvt/predictor/isvc_test.go create mode 100644 fvt/storage/storage_suite_test.go create mode 100644 fvt/storage/storage_test.go create mode 100644 fvt/testdata/isvcs/isvc-pvc-2.yaml create mode 100644 fvt/testdata/isvcs/isvc-pvc-3.yaml create mode 100644 fvt/testdata/isvcs/isvc-pvc-4.yaml create mode 100644 fvt/testdata/isvcs/isvc-pvc-path.yaml create mode 100644 fvt/testdata/isvcs/isvc-pvc-uri.yaml diff --git a/.github/workflows/run-fvt.yml b/.github/workflows/run-fvt.yml index 65f754bb..0b1987d8 100644 --- a/.github/workflows/run-fvt.yml +++ b/.github/workflows/run-fvt.yml @@ -1,4 +1,4 @@ -name: FVTs +name: FVT on: pull_request: @@ -71,7 +71,8 @@ jobs: run: | docker pull nvcr.io/nvidia/tritonserver:21.06.1-py3 docker pull seldonio/mlserver:0.5.2 - docker pull openvino/model_server:2022.1 + docker pull openvino/model_server:2022.2 + # docker pull pytorch/torchserve:0.6.0-cpu docker pull kserve/modelmesh-runtime-adapter docker pull kserve/rest-proxy docker pull kserve/modelmesh @@ -79,7 +80,7 @@ jobs: run: | docker images kubectl get pods - kubectl get servingruntimes + kubectl get clusterservingruntimes - name: Run FVTs run: | go install github.com/onsi/ginkgo/v2/ginkgo diff --git a/Makefile b/Makefile index 29027257..eabe70d5 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ test: # Run fvt tests. This requires an etcd, kubernetes connection, and model serving installation. Ginkgo CLI is used to run them in parallel fvt: - ginkgo -v -p -progress --fail-fast fvt/predictor fvt/scaleToZero --timeout=40m + ginkgo -v -procs=2 --progress --fail-fast fvt/predictor fvt/scaleToZero fvt/storage --timeout=50m # Command to regenerate the grpc go files from the proto files fvt-protoc: diff --git a/config/dependencies/fvt.yaml b/config/dependencies/fvt.yaml index 4ae03642..db6942d5 100644 --- a/config/dependencies/fvt.yaml +++ b/config/dependencies/fvt.yaml @@ -123,3 +123,120 @@ stringData: "default_bucket": "modelmesh-example-models", "region": "us-south" } +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: "models-pvc-1" +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 1Gi +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: "models-pvc-2" +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 1Gi +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: "models-pvc-3" +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 1Gi +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: "pvc-init" +spec: + template: + metadata: + name: "pvc-init-pod" + spec: + restartPolicy: OnFailure + containers: + - name: "copy-pod" + image: kserve/modelmesh-minio-examples:latest + securityContext: + runAsUser: 0 + allowPrivilegeEscalation: false + command: ["/bin/sh", "-ex", "-c"] + args: + - echo copy model files ...; + whoami; + ls -al "${SRC_FOLDER}"; + cp -r "${SRC_FOLDER}"/* "${DST_FOLDER_1}" && + cp -r "${SRC_FOLDER}"/* "${DST_FOLDER_2}" && + cp -r "${SRC_FOLDER}"/* "${DST_FOLDER_3}" && + ls -al "${DST_FOLDER_1}" && + ls -al "${DST_FOLDER_2}" && + ls -al "${DST_FOLDER_3}" && + echo done && + exit 0; + env: + - name: SRC_FOLDER + value: "/data1/modelmesh-example-models" + - name: DST_FOLDER_1 + value: "/mnt/pvc1" + - name: DST_FOLDER_2 + value: "/mnt/pvc2" + - name: DST_FOLDER_3 + value: "/mnt/pvc3" + volumeMounts: + - name: "pvc1" + mountPath: "/mnt/pvc1" + - name: "pvc2" + mountPath: "/mnt/pvc2" + - name: "pvc3" + mountPath: "/mnt/pvc3" + volumes: + - name: "pvc1" + persistentVolumeClaim: + claimName: "models-pvc-1" + - name: "pvc2" + persistentVolumeClaim: + claimName: "models-pvc-2" + - name: "pvc3" + persistentVolumeClaim: + claimName: "models-pvc-3" + backoffLimit: 4 +--- +apiVersion: v1 +kind: Pod +metadata: + name: "pvc-reader" +spec: + containers: + - name: main + image: ubuntu + command: ["/bin/sh", "-ec", "sleep 10000"] + volumeMounts: + - name: "pvc1" + mountPath: "/mnt/pvc1" + - name: "pvc2" + mountPath: "/mnt/pvc2" + - name: "pvc3" + mountPath: "/mnt/pvc3" + volumes: + - name: "pvc1" + persistentVolumeClaim: + claimName: "models-pvc-1" + - name: "pvc2" + persistentVolumeClaim: + claimName: "models-pvc-2" + - name: "pvc3" + persistentVolumeClaim: + claimName: "models-pvc-3" diff --git a/config/dependencies/minio-storage-secret.yaml b/config/dependencies/minio-storage-secret.yaml index 5140a811..1c456d35 100644 --- a/config/dependencies/minio-storage-secret.yaml +++ b/config/dependencies/minio-storage-secret.yaml @@ -1,5 +1,17 @@ +# Copyright 2021 IBM Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. apiVersion: v1 - kind: Secret metadata: name: storage-config diff --git a/config/rbac/cluster-scope/role.yaml b/config/rbac/cluster-scope/role.yaml index 25a7bd75..e0110393 100644 --- a/config/rbac/cluster-scope/role.yaml +++ b/config/rbac/cluster-scope/role.yaml @@ -33,6 +33,7 @@ rules: - "" resources: - endpoints + - persistentvolumeclaims verbs: - get - list diff --git a/config/rbac/namespace-scope/role.yaml b/config/rbac/namespace-scope/role.yaml index 946692d3..8624c152 100644 --- a/config/rbac/namespace-scope/role.yaml +++ b/config/rbac/namespace-scope/role.yaml @@ -33,6 +33,7 @@ rules: - "" resources: - endpoints + - persistentvolumeclaims verbs: - get - list diff --git a/controllers/config/templating.go b/controllers/config/templating.go index 694f7e44..31f8f5b9 100644 --- a/controllers/config/templating.go +++ b/controllers/config/templating.go @@ -16,7 +16,6 @@ package config import ( "bytes" "io" - "io/ioutil" "os" "text/template" @@ -43,9 +42,9 @@ func PathTemplateSource(path string, context interface{}) mf.Source { return templateSource(f, context) } -//A templating manifest source +// A templating manifest source func templateSource(r io.Reader, context interface{}) mf.Source { - b, err := ioutil.ReadAll(r) + b, err := io.ReadAll(r) if err != nil { panic(err) } diff --git a/controllers/modelmesh/etcd.go b/controllers/modelmesh/etcd.go index eb8f46a9..eaf4d226 100644 --- a/controllers/modelmesh/etcd.go +++ b/controllers/modelmesh/etcd.go @@ -23,7 +23,7 @@ const ( kvStoreEnvVar = "KV_STORE" ) -//mimics base/patches/etcd.yaml +// mimics base/patches/etcd.yaml func (m *Deployment) configureMMDeploymentForEtcdSecret(deployment *appsv1.Deployment) error { EtcdSecretName := m.EtcdSecretName diff --git a/controllers/modelmesh/runtime.go b/controllers/modelmesh/runtime.go index f6534433..fa19901a 100644 --- a/controllers/modelmesh/runtime.go +++ b/controllers/modelmesh/runtime.go @@ -69,6 +69,7 @@ func (m *Deployment) addVolumesToDeployment(deployment *appsv1.Deployment) error } // need to mount storage config for built-in adapters and the scenarios where StorageHelper is not disabled + // TODO: what about allowAnyPVC only use case without MinIO if rts.BuiltInAdapter != nil || useStorageHelper(rts) { storageVolume := corev1.Volume{ Name: ConfigStorageMount, diff --git a/controllers/service_controller.go b/controllers/service_controller.go index 22ee2799..ea756b03 100644 --- a/controllers/service_controller.go +++ b/controllers/service_controller.go @@ -11,19 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ package controllers diff --git a/controllers/servingruntime_controller.go b/controllers/servingruntime_controller.go index 521a2c27..7fdefac8 100644 --- a/controllers/servingruntime_controller.go +++ b/controllers/servingruntime_controller.go @@ -11,19 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ package controllers @@ -31,6 +18,7 @@ import ( "context" "encoding/json" "fmt" + "sort" "strings" "sync" "time" @@ -300,40 +288,52 @@ func (r *ServingRuntimeReconciler) Reconcile(ctx context.Context, req ctrl.Reque } func (r *ServingRuntimeReconciler) getPVCs(ctx context.Context, req ctrl.Request, rt *kserveapi.ServingRuntimeSpec, cfg *config.Config) ([]string, error) { + // get the PVCs from the storage-config Secret + storageConfigPVCsMap := make(map[string]struct{}) s := &corev1.Secret{} if err := r.Client.Get(ctx, types.NamespacedName{ Name: modelmesh.StorageSecretName, Namespace: req.Namespace, }, s); err != nil { - return nil, fmt.Errorf("Could not get the storage secret: %w", err) - } - - pvcsMap := make(map[string]struct{}) - var storageConfig map[string]string - for _, storageData := range s.Data { - if err := json.Unmarshal(storageData, &storageConfig); err != nil { - return nil, fmt.Errorf("Could not parse storage configuration json: %w", err) - } - if storageConfig["type"] == StoragePVCType { - if name := storageConfig["name"]; name != "" { - pvcsMap[name] = struct{}{} - } else { - r.Log.V(1).Info("Missing PVC name in storage configuration") + // do not fail with error if storage-config secret does not exist + // TODO: make sure storage-config secret does not get mounted to runtime pod + // MountVolume.SetUp failed for volume "storage-config" : secret "storage-config" not found + r.Log.V(1).Info("Could not get the storage-config secret", + "name", modelmesh.StorageSecretName, + "namespace", req.Namespace) + } else { + var storageConfig map[string]string + for _, storageData := range s.Data { + if err := json.Unmarshal(storageData, &storageConfig); err != nil { + return nil, fmt.Errorf("Could not parse storage configuration json: %w", err) + } + if storageConfig["type"] == StoragePVCType { + if name := storageConfig["name"]; name != "" { + storageConfigPVCsMap[name] = struct{}{} + } else { + r.Log.V(1).Info("Missing PVC name in storage configuration") + } } } } - // add pvcs for predictors when the global config is enabled + // collect PVCs from Predictors when the global flag 'allowAnyPVC' is enabled + predictorPVCsMap := make(map[string]struct{}) if cfg.AllowAnyPVC { restProxyEnabled := cfg.RESTProxy.Enabled + + // use a predicate function to extract the PVCs from Predictors in the registry f := func(p *api.Predictor) bool { if runtimeSupportsPredictor(rt, p, restProxyEnabled, req.Name) && - p.Spec.Storage != nil && p.Spec.Storage.Parameters != nil { + p.Spec.Storage != nil && + p.Spec.Storage.Parameters != nil { + params := *p.Spec.Storage.Parameters - if stype := params["type"]; stype == "pvc" { - if name := params["name"]; name != "" { - pvcsMap[name] = struct{}{} - } + storageType := params["type"] + name := params["name"] + + if storageType == "pvc" && name != "" { + predictorPVCsMap[name] = struct{}{} } } return false @@ -345,9 +345,50 @@ func (r *ServingRuntimeReconciler) getPVCs(ctx context.Context, req ctrl.Request } } } - pvcs := make([]string, 0, len(pvcsMap)) - for pvc := range pvcsMap { - pvcs = append(pvcs, pvc) + + pvcs := make([]string, 0, len(storageConfigPVCsMap)+len(predictorPVCsMap)) + + // append all PVCs from the storage-config secret; + for claimName := range storageConfigPVCsMap { + r.Log.V(2).Info("Add PVC from storage-config to runtime", + "claimName", claimName, + "runtime", req.Name) + pvcs = append(pvcs, claimName) + } + + // for any PVCs not in the storage-config secret, we need to make sure the PVCs + // exists; if we did mount a non-existent PVC to a runtime pod, it would keep it + // in pending state, effectively shutting down inferencing on any and all other + // Predictors for that runtime + for claimName := range predictorPVCsMap { + if _, alreadyAdded := storageConfigPVCsMap[claimName]; alreadyAdded { + // don't add PVCs that we found in the storage-config secret already + continue + } else { + pvc := &corev1.PersistentVolumeClaim{} + if err := r.Client.Get(ctx, types.NamespacedName{ + Name: claimName, + Namespace: req.Namespace, + }, pvc); err != nil { + r.Log.Error(err, "Could not find PVC in namespace", + "claimName", claimName, "namespace", req.Namespace) + } else { + r.Log.V(2).Info("Add any PVC from predictors to runtime", + "claimName", claimName, + "runtime", req.Name) + pvcs = append(pvcs, claimName) + } + } + } + + // we must sort the PVCs to avoid that otherwise identical runtime deployment + // specs are treated as different by Kubernetes causing unwanted cycles of + // runtimes getting terminated again and again just because the reconciler + // ordered the same set of PVCs in a different way + if len(pvcs) > 0 { + sort.Strings(pvcs) + r.Log.V(1).Info("Adding PVCs to runtime", + "pvcs", pvcs, "runtime", req.Name) } return pvcs, nil @@ -388,7 +429,7 @@ func (r *ServingRuntimeReconciler) determineReplicasAndRequeueDuration(ctx conte defer r.runtimeInfoMapMutex.Unlock() // initialize runtime information map if it is nil - // eg. if this is the first reconcile for any runtime + // e.g. if this is the first reconcile for any runtime if r.runtimeInfoMap == nil { r.runtimeInfoMap = make(map[types.NamespacedName]*runtimeInfo) } @@ -413,7 +454,7 @@ func (r *ServingRuntimeReconciler) determineReplicasAndRequeueDuration(ctx conte return scaledUp, time.Duration(0), nil } - // if this is the first time we see no predictors update the runtime info with + // if this is the first time we see no predictors, update the runtime info with // this transition if targetRuntimeInfo.TimeTransitionedToNoPredictors == nil { log.Info("Runtime no longer has any predictors, will scale to zero after grace period", diff --git a/controllers/suite_test.go b/controllers/suite_test.go index f519fa8b..e090467b 100644 --- a/controllers/suite_test.go +++ b/controllers/suite_test.go @@ -32,7 +32,6 @@ package controllers import ( "context" "io" - "io/ioutil" "os" "path" "path/filepath" @@ -211,7 +210,7 @@ func getDefaultConfig() (*config2.Config, error) { if defaultTestConfigFileContents == nil { var err error var testConfigFile = "./testdata/test-config-defaults.yaml" - if defaultTestConfigFileContents, err = ioutil.ReadFile(testConfigFile); err != nil { + if defaultTestConfigFileContents, err = os.ReadFile(testConfigFile); err != nil { return nil, err } } @@ -228,7 +227,7 @@ func getPayloadProcessingConfig(loadErrorFile bool) (*config2.Config, error) { } else { testConfigFile = "./testdata/test-config-payload-processor.yaml" } - if payloadProcessingTestConfigFileContents, err = ioutil.ReadFile(testConfigFile); err != nil { + if payloadProcessingTestConfigFileContents, err = os.ReadFile(testConfigFile); err != nil { return nil, err } } diff --git a/docs/install/README.md b/docs/install/README.md index 6000b3d1..88c486e1 100644 --- a/docs/install/README.md +++ b/docs/install/README.md @@ -8,7 +8,7 @@ - **etcd** - ModelMesh Serving requires an [etcd](https://etcd.io/) server in order to coordinate internal state which can be either dedicated or shared. More on this later. -- **S3-compatible object storage** - Before models can be deployed, a remote S3-compatible datastore is needed from which to pull the model data. This could be for example an [IBM Cloud Object Storage](https://www.ibm.com/cloud/object-storage) instance, or a locally running [MinIO](https://github.com/minio/minio) deployment. Note that this is not required to be in place prior to the initial installation. +- **Model storage** - The model files have to be stored in a compatible form of remote storage or on a Kubernetes Persistent Volume. For more information about supported storage options take a look at our [storage setup](/docs/predictors/setup-storage.md) page. We provide an install script `--quickstart` option to quickly run ModelMesh Serving with a provisioned etcd server. This may be useful for experimentation or development but should not be used in production. diff --git a/docs/install/install-script.md b/docs/install/install-script.md index 42ed6532..fc33249d 100644 --- a/docs/install/install-script.md +++ b/docs/install/install-script.md @@ -2,13 +2,14 @@ ## Prerequisites -- **Kubernetes cluster** - A Kubernetes cluster is required. You will need `cluster-admin` authority in order to complete all of the prescribed steps. +- **Kubernetes cluster** - A Kubernetes cluster is required. You will need + `cluster-admin` authority in order to complete all of the prescribed steps. - **Kubectl and Kustomize** - The installation will occur via the terminal using [kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl) and [kustomize](https://kubectl.docs.kubernetes.io/installation/kustomize/). - **etcd** - ModelMesh Serving requires an [etcd](https://etcd.io/) server in order to coordinate internal state which can be either dedicated or shared. More on this later. -- **S3-compatible object storage** - Before models can be deployed, a remote S3-compatible datastore is needed from which to pull the model data. This could be for example an [IBM Cloud Object Storage](https://www.ibm.com/cloud/object-storage) instance, or a locally running [MinIO](https://github.com/minio/minio) deployment. Note that this is not required to be in place prior to the initial installation. +- **Model storage** - The model files have to be stored in a compatible form of remote storage or on a Kubernetes Persistent Volume. For more information about supported storage options take a look at our [storage setup](/docs/predictors/setup-storage.md) page. We provide an install script to quickly run ModelMesh Serving with a provisioned etcd server. This may be useful for experimentation or development but should not be used in production. diff --git a/docs/predictors/setup-storage.md b/docs/predictors/setup-storage.md index ad08152f..55630d00 100644 --- a/docs/predictors/setup-storage.md +++ b/docs/predictors/setup-storage.md @@ -1,14 +1,16 @@ # Set up Storage for Loading Models -You will need access to an S3-compatible object storage, for example [MinIO](https://github.com/minio/minio). To provide access to the object storage, use the `storage-config` secret. +You will need access to an S3-compatible object storage, for example [MinIO](https://github.com/minio/minio). To configure access to the object storage, use the `storage-config` secret. -## Deploy a model from your own object storage +Alternatively, models can be stored on a Kubernetes Persistent Volume. Persistent Volume Claims can either be pre-configured in the `storage-config` secret, or, the `allowAnyPVC` configuration flag can be enabled, so that any PVC can be mounted dynamically at the time a predictor or inference service is deployed. -1. Download sample model or use an existing model +## Deploy a model from your own S3 compatible object storage + +### 1. Download sample model or use an existing model Here we show an example using an [ONNX model for MNIST](https://github.com/onnx/models/raw/ad5c181f1646225f034fba1862233ecb4c262e04/vision/classification/mnist/model/mnist-8.onnx). -2. Add your ONNX saved model to S3-based object storage +### 2. Add your saved model to S3-based object storage A bucket in MinIO needs to be created to copy the model into, which either requires [MinIO Client](https://docs.min.io/docs/minio-client-quickstart-guide.html) or port-forwarding the minio service and logging in using the web interface. @@ -56,7 +58,7 @@ $ mc ls myminio/models/onnx [2021-06-11 11:55:48 EDT] 26KiB mnist-8.onnx ``` -3. Add a storage entry to the `storage-config` secret +### 3. Add a storage entry to the `storage-config` secret Ensure there is a key defined in the common `storage-config` secret corresponding to the S3-based storage instance holding your model. The value of this secret key should be JSON like the following, `default_bucket` is optional. @@ -95,3 +97,239 @@ azureKey: | ``` Remember that after updating the storage config secret, there may be a delay of up to 2 minutes until the change is picked up. You should take this into account when creating/updating `InferenceService`s that use storage keys which have just been added or updated - they may fail to load otherwise. + +## Deploy a model stored on a Persistent Volume Claim + +Models can be stored on [Kubernetes Persistent Volumes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/). + +There are two ways to enable PVC support in ModelMesh: + +1. The Persistent Volume Claims can be added in the `storage-config` secret. This way all PVCs will be mounted to all serving runtime pods. +2. The `allowAnyPVC` configuration flag can be set to `true`. This way the Modelmesh controller will dynamically mount the PVC to a runtime pod at the time a predictor or inference service requiring it is being deployed. + +Follow the example instructions below to create a PVC, store a model on it, and configure ModelMesh to mount the PVC to the runtime serving pods so that the model can be loaded for inferencing. + +### 1. Create a Persistent Volume Claim + +Persistent Volumes are namespace-scoped, so we have to create it in the same namespace as the ModelMesh serving deployment. We are using namespace `modelmesh-serving` here. + +```shell +kubectl config set-context --current --namespace=modelmesh-serving +``` + +Now we create the Persistent Volume Claim `my-models-pvc`. Along with it, we deploy a `pvc-access` pod in order to copy our model to the Persistent Volume later. + +```shell +kubectl apply -f - < 8008 +``` + +With `curl` we can perform an inference request to the SKLearn MNIST model. Make sure the `MODEL_NAME` +variable is set to the name of your `InferenceService`. + +```shell +MODEL_NAME="sklearn-pvc-example" + +curl -X POST -k "http://localhost:8008/v2/models/${MODEL_NAME}/infer" -d '{"inputs": [{ "name": "predict", "shape": [1, 64], "datatype": "FP32", "data": [0.0, 0.0, 1.0, 11.0, 14.0, 15.0, 3.0, 0.0, 0.0, 1.0, 13.0, 16.0, 12.0, 16.0, 8.0, 0.0, 0.0, 8.0, 16.0, 4.0, 6.0, 16.0, 5.0, 0.0, 0.0, 5.0, 15.0, 11.0, 13.0, 14.0, 0.0, 0.0, 0.0, 0.0, 2.0, 12.0, 16.0, 13.0, 0.0, 0.0, 0.0, 0.0, 0.0, 13.0, 16.0, 16.0, 6.0, 0.0, 0.0, 0.0, 0.0, 16.0, 16.0, 16.0, 7.0, 0.0, 0.0, 0.0, 0.0, 11.0, 13.0, 12.0, 1.0, 0.0]}]}' +``` + +The response should look like the following: + +```json +{ + "model_name": "sklearn-pvc-example__isvc-3d2daa3370", + "outputs": [ + {"name": "predict", "datatype": "INT64", "shape": [1], "data": [8]} + ] +} +``` + +You can find more detailed information about running inference requests [here](run-inference.md). + +To delete the resources created in this example, run the following commands: + +```shell +kubectl delete isvc "sklearn-pvc-example" +kubectl delete pod "pvc-access" +kubectl delete pvc "my-models-pvc" +``` diff --git a/docs/quickstart.md b/docs/quickstart.md index a75f70e9..7eb52436 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -270,7 +270,8 @@ To see more detailed instructions and information, click [here](./predictors/run ## 4. (Optional) Deleting your ModelMesh Serving installation -To delete all ModelMesh Serving resources that were installed, run the following from the root of the project: +To delete all ModelMesh Serving resources that were installed, run the following +command from the root of the project: ```shell ./scripts/delete.sh --namespace modelmesh-serving diff --git a/docs/release-process.md b/docs/release-process.md index 46abe050..97af46c0 100644 --- a/docs/release-process.md +++ b/docs/release-process.md @@ -243,7 +243,14 @@ release blog. For reference, here are a few examples of previous release blogs featuring ModelMesh: -- [v0.7.0](https://kserve.github.io/website/blog/articles/2021-10-11-KServe-0.7-release) -- [v0.8.0](https://kserve.github.io/website/0.8/blog/articles/2022-02-18-KServe-0.8-release/#modelmesh-updates) +- [v0.10.0](https://kserve.github.io/website/0.10/blog/articles/2023-02-05-KServe-0.10-release/#modelmesh-updates) - [v0.9.0](https://kserve.github.io/website/0.9/blog/articles/2022-07-21-KServe-0.9-release/#inferenceservice-api-for-modelmesh) +- [v0.8.0](https://kserve.github.io/website/0.8/blog/articles/2022-02-18-KServe-0.8-release/#modelmesh-updates) +- [v0.7.0](https://kserve.github.io/website/0.7/blog/articles/2021-10-11-KServe-0.7-release) + +And the corresponding PRs to illustrate the process and the participants: + - [v0.10.0 draft](https://github.com/kserve/website/pull/227#discussion_r1098917277) +- [v0.9.0 draft](https://github.com/kserve/website/pull/166) +- [v0.8.0 draft](https://github.com/kserve/website/pull/105) +- [v0.7.0 draft](https://github.com/kserve/website/pull/49) diff --git a/fvt/README.md b/fvt/README.md index c3c312f1..ef557243 100644 --- a/fvt/README.md +++ b/fvt/README.md @@ -33,7 +33,7 @@ If starting with a fresh namespace, install ModelMesh Serving configured for the ./scripts/install.sh --namespace modelmesh-serving --fvt --dev-mode-logging ``` -To re-configure an existing quick-start instance for FVTs, run: +To re-configure an existing "quickstart" deployment for FVTs, run: ```Shell kubectl apply -f config/dependencies/fvt.yaml diff --git a/fvt/fvtclient.go b/fvt/fvtclient.go index e6d1b5b7..97d50154 100644 --- a/fvt/fvtclient.go +++ b/fvt/fvtclient.go @@ -4,7 +4,7 @@ // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, @@ -20,7 +20,7 @@ import ( "encoding/json" "errors" "fmt" - "io/ioutil" + "io" "math/rand" "net/http" "os/exec" @@ -57,8 +57,8 @@ import ( torchserveapi "github.com/kserve/modelmesh-serving/fvt/generated/torchserve/apis" ) -const predictorTimeout = time.Second * 120 -const timeForStatusToStabilize = time.Second * 5 +const PredictorTimeout = time.Second * 120 // absolute time to wait for predictor to become ready +const TimeForStatusToStabilize = time.Second * 5 // time to wait between watcher events before assuming a stable state type ModelServingConnectionType int @@ -75,6 +75,19 @@ var applyPatchOptions = metav1.PatchOptions{ Force: func() *bool { t := true; return &t }(), } +// list option for serving runtime deployments +var deploymentListOptions = metav1.ListOptions{ + LabelSelector: "modelmesh-service", + TimeoutSeconds: &DefaultTimeout, +} + +// list option for serving runtime pods +var runtimePodListOptions = metav1.ListOptions{ + LabelSelector: "modelmesh-service=modelmesh-serving", + FieldSelector: "status.phase=Running", + TimeoutSeconds: &DefaultTimeout, +} + type FVTClient struct { dynamic.Interface namespace string @@ -95,6 +108,7 @@ type ModelMeshPortForward struct { cmdArgs []string done chan struct{} log logr.Logger + podName string } func (pf *ModelMeshPortForward) EnsureStarted() error { @@ -142,12 +156,15 @@ func (pf *ModelMeshPortForward) EnsureStopped() { <-pf.done } +// NewModelMeshPortForward switched to port-forwarding to a pod instead of the +// service, since, when port-forwarding to a Service, it just picks any pod to +// port-forward to without any guardrails against selecting a Terminating pod. +// It doesn't use readiness checks for pod selection, it seems to actually select +// the oldest pod which ends up being most likely to be terminated soon func NewModelMeshPortForward(namespace string, podName string, localPort int, targetPort int, log logr.Logger) *ModelMeshPortForward { portMap := fmt.Sprintf("%d:%d", localPort, targetPort) - cmdArgs := []string{"port-forward", "--namespace", namespace, "--address", "0.0.0.0", - "pod/" + podName, portMap} - - return &ModelMeshPortForward{nil, cmdArgs, nil, log} + cmdArgs := []string{"port-forward", "-n", namespace, "--address", "0.0.0.0", "pod/" + podName, portMap} + return &ModelMeshPortForward{nil, cmdArgs, nil, log, podName} } func GetFVTClient(log logr.Logger, namespace, serviceName, controllerNamespace string) (*FVTClient, error) { @@ -225,10 +242,10 @@ var ( Version: v1beta1.SchemeGroupVersion.Version, Resource: "inferenceservices", // this must be the plural form } - gvrEndpoints = schema.GroupVersionResource{ + gvrPods = schema.GroupVersionResource{ Group: "", Version: "v1", - Resource: "endpoints", // this must be the plural form + Resource: "pods", // this must be the plural form } ) @@ -280,6 +297,7 @@ func (fvt *FVTClient) ListServingRuntimes(options metav1.ListOptions) (*unstruct func (fvt *FVTClient) ListClusterServingRuntimes(options metav1.ListOptions) (*unstructured.UnstructuredList, error) { return fvt.Resource(gvrCRuntime).List(context.TODO(), options) } + func (fvt *FVTClient) GetPredictor(name string) *unstructured.Unstructured { obj, err := fvt.Resource(gvrPredictor).Namespace(fvt.namespace).Get(context.TODO(), name, metav1.GetOptions{}) Expect(err).ToNot(HaveOccurred()) @@ -346,18 +364,15 @@ func (fvt *FVTClient) WatchPredictorsAsync(c chan *unstructured.Unstructured, op } -func (fvt *FVTClient) GetRandomReadyRuntimePodNameFromEndpoints() string { - obj, err := fvt.Resource(gvrEndpoints).Namespace(fvt.namespace).Get(context.TODO(), fvt.serviceName, metav1.GetOptions{}) - Expect(err).ToNot(HaveOccurred()) - - var endpoints corev1.Endpoints - err = runtime.DefaultUnstructuredConverter.FromUnstructured(obj.Object, &endpoints) - Expect(err).ToNot(HaveOccurred()) +func (fvt *FVTClient) GetRandomReadyRuntimePod() string { + runtimePods := fvt.ListReadyRuntimePods() + numPods := len(runtimePods.Items) + Expect(numPods).ToNot(BeZero(), "There are no 'Ready' runtime pods") - addresses := endpoints.Subsets[0].Addresses - randomAddress := addresses[rand.Intn(len(addresses))] + i := rand.Intn(numPods) + name := runtimePods.Items[i].Name - return randomAddress.TargetRef.Name + return name } func (fvt *FVTClient) PrintPredictors() { @@ -374,6 +389,13 @@ func (fvt *FVTClient) PrintIsvcs() { } } +func (fvt *FVTClient) PrintDescribeIsvc(name string) { + err := fvt.RunKubectl("describe", "isvc", name) + if err != nil { + fvt.log.Error(err, fmt.Sprintf("Error running describe isvc '%s' command", name)) + } +} + func (fvt *FVTClient) PrintPods() { err := fvt.RunKubectl("get", "pods") if err != nil { @@ -412,12 +434,12 @@ func (fvt *FVTClient) TailPodLogs(sinceTime string) { func (fvt *FVTClient) RunKubectl(args ...string) error { args = append(args, "-n", fvt.namespace) - getPredictorCommand := exec.Command("kubectl", args...) - getPredictorCommand.Stdout = ginkgo.GinkgoWriter - getPredictorCommand.Stderr = ginkgo.GinkgoWriter - fvt.log.Info("Running command", "args", strings.Join(getPredictorCommand.Args, " ")) + kubectlCmd := exec.Command("kubectl", args...) + kubectlCmd.Stdout = ginkgo.GinkgoWriter + kubectlCmd.Stderr = ginkgo.GinkgoWriter + fvt.log.Info("Running command", "args", strings.Join(kubectlCmd.Args, " ")) fmt.Fprintf(ginkgo.GinkgoWriter, "=====================================================================================================================================\n") - err := getPredictorCommand.Run() + err := kubectlCmd.Run() fmt.Fprintf(ginkgo.GinkgoWriter, "=====================================================================================================================================\n") return err } @@ -465,7 +487,7 @@ func (fvt *FVTClient) RunKfsRestInference(modelName string, body []byte, tls boo return "", fmt.Errorf("Request failed with code %d", response.StatusCode) } - resp, err := ioutil.ReadAll(response.Body) + resp, err := io.ReadAll(response.Body) return string(resp), err } @@ -494,21 +516,44 @@ func (fvt *FVTClient) RunTorchserveInference(req *torchserveapi.PredictionsReque } func (fvt *FVTClient) ConnectToModelServing(connectionType ModelServingConnectionType) error { + // check if the gRPC and REST connection runtime pods are still around + if fvt.grpcPortForward != nil { + podName := fvt.grpcPortForward.podName + _, err := fvt.Resource(gvrPods).Namespace(fvt.namespace).Get(context.TODO(), podName, metav1.GetOptions{}) + if err != nil { + fvt.log.Info("Lost gRPC connection to pod " + podName + ". Reconnecting ...") + fvt.disconnectGrpcConnection() + } else { + fvt.log.V(2).Info("Still gRPC connected to pod " + podName) + } + } + if fvt.restPortForward != nil { + podName := fvt.restPortForward.podName + _, err := fvt.Resource(gvrPods).Namespace(fvt.namespace).Get(context.TODO(), podName, metav1.GetOptions{}) + if err != nil { + fvt.log.Info("Lost REST connection to pod " + podName + ". Reconnecting ...") + fvt.disconnectRestConnection() + } else { + fvt.log.V(2).Info("Still REST connected to pod " + podName) + } + } + + // (re-)create the gRPC and REST port-forwards if necessary if fvt.grpcPortForward == nil { - podName := fvt.GetRandomReadyRuntimePodNameFromEndpoints() + podName := fvt.GetRandomReadyRuntimePod() fvt.grpcPortForward = NewModelMeshPortForward(fvt.namespace, podName, fvt.grpcPort, 8033, fvt.log) } if fvt.restPortForward == nil { - podName := fvt.GetRandomReadyRuntimePodNameFromEndpoints() + podName := fvt.GetRandomReadyRuntimePod() fvt.restPortForward = NewModelMeshPortForward(fvt.namespace, podName, fvt.restPort, 8008, fvt.log) } + // start the port-forwards if err := fvt.grpcPortForward.EnsureStarted(); err != nil { - return fmt.Errorf("Error with grpc port-forward, could not connect to model serving") + return fmt.Errorf("Error with gRPC port-forward, could not connect to model serving") } - if err := fvt.restPortForward.EnsureStarted(); err != nil { - return fmt.Errorf("Error with rest port-forward, could not connect to model serving") + return fmt.Errorf("Error with REST port-forward, could not connect to model serving") } ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) @@ -576,6 +621,11 @@ func (fvt *FVTClient) DisconnectFromModelServing() { if fvt == nil { return } + fvt.disconnectGrpcConnection() + fvt.disconnectRestConnection() +} + +func (fvt *FVTClient) disconnectGrpcConnection() { if fvt.grpcConn != nil { fvt.grpcConn.Close() fvt.grpcConn = nil @@ -584,7 +634,9 @@ func (fvt *FVTClient) DisconnectFromModelServing() { fvt.grpcPortForward.EnsureStopped() fvt.grpcPortForward = nil } +} +func (fvt *FVTClient) disconnectRestConnection() { if fvt.restConn != nil { fvt.restConn.CloseIdleConnections() fvt.restConn = nil @@ -608,7 +660,7 @@ func (fvt *FVTClient) ApplyUserConfigMap(config map[string]interface{}) { cmu := &unstructured.Unstructured{ Object: map[string]interface{}{ "apiVersion": "v1", - "kind": "ConfigMap", + "kind": ConfigMapKind, "metadata": map[string]interface{}{ "name": UserConfigMapName, }, @@ -626,6 +678,33 @@ func (fvt *FVTClient) ApplyUserConfigMap(config map[string]interface{}) { Expect(err).ToNot(HaveOccurred()) } +func (fvt *FVTClient) CreateStorageConfigSecret(storageConfigData map[string]interface{}) { + var stringConfigData = map[string]string{} + + for k, v := range storageConfigData { + jsonValue, err := json.Marshal(v) + Expect(err).ToNot(HaveOccurred()) + stringConfigData[k] = string(jsonValue) + } + + var StorageSecret = corev1.Secret{ + Type: corev1.SecretTypeOpaque, + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: SecretKind, + }, + ObjectMeta: metav1.ObjectMeta{ + Name: StorageConfigSecretName, + }, + StringData: stringConfigData, + } + + CreateSecret(&StorageSecret, fvt.controllerNamespace, fvt) + if fvt.namespace != fvt.controllerNamespace { + CreateSecret(&StorageSecret, fvt.namespace, fvt) + } +} + func (fvt *FVTClient) CreateTLSSecrets() { err := fvt.certGenerator.generate() Expect(err).ToNot(HaveOccurred()) @@ -669,11 +748,7 @@ func (fvt *FVTClient) UpdateConfigMapTLS(tlsConfig map[string]interface{}) { } func (fvt *FVTClient) StartWatchingDeploys() watch.Interface { - listOptions := metav1.ListOptions{ - LabelSelector: "modelmesh-service", - TimeoutSeconds: &DefaultTimeout, - } - deployWatcher, err := fvt.Resource(gvrDeployment).Namespace(fvt.namespace).Watch(context.TODO(), listOptions) + deployWatcher, err := fvt.Resource(gvrDeployment).Namespace(fvt.namespace).Watch(context.TODO(), deploymentListOptions) Expect(err).ToNot(HaveOccurred()) return deployWatcher } @@ -682,8 +757,7 @@ func (fvt *FVTClient) ListDeploys() appsv1.DeploymentList { var err error // query for UnstructuredList using the dynamic client - listOptions := metav1.ListOptions{LabelSelector: "modelmesh-service", TimeoutSeconds: &DefaultTimeout} - u, err := fvt.Resource(gvrDeployment).Namespace(fvt.namespace).List(context.TODO(), listOptions) + u, err := fvt.Resource(gvrDeployment).Namespace(fvt.namespace).List(context.TODO(), deploymentListOptions) Expect(err).ToNot(HaveOccurred()) // convert elements from Unstructured to Deployment @@ -698,6 +772,32 @@ func (fvt *FVTClient) ListDeploys() appsv1.DeploymentList { return deployments } +func (fvt *FVTClient) ListReadyRuntimePods() corev1.PodList { + var err error + + // query for UnstructuredList using the dynamic client + u, err := fvt.Resource(gvrPods).Namespace(fvt.namespace).List(context.TODO(), runtimePodListOptions) + Expect(err).ToNot(HaveOccurred()) + + // convert elements from Unstructured to Pod + var pods corev1.PodList + for _, up := range u.Items { + var p corev1.Pod + err = runtime.DefaultUnstructuredConverter.FromUnstructured(up.Object, &p) + Expect(err).ToNot(HaveOccurred()) + + // make sure to only return pods that are 'Ready' + // https://github.com/knative-sandbox/eventing-kafka-broker/pull/2523/files#diff-a9f3c3b + for _, c := range p.Status.Conditions { + if c.Type == corev1.PodReady && c.Status == corev1.ConditionTrue { + pods.Items = append(pods.Items, p) + } + } + } + + return pods +} + func (fvt *FVTClient) RestartDeploys() { // trigger a restart by patching an annotation with a timestamp // generate the JSON patch that adds/modifies the annotation @@ -741,6 +841,18 @@ func (fvt *FVTClient) DeleteConfigMap(resourceName string) error { return nil } +func (fvt *FVTClient) DeleteStorageConfigSecret() { + fvt.log.Info("Delete storage-config secret ...") + if err := fvt.DeleteSecret(StorageConfigSecretName, fvt.controllerNamespace); err != nil { + fvt.log.Error(err, "Unable to delete storage-config secret") + } + if fvt.namespace != fvt.controllerNamespace { + if err := fvt.DeleteSecret(StorageConfigSecretName, fvt.namespace); err != nil { + fvt.log.Error(err, "Unable to delete user namespaced storage-config secret") + } + } +} + func (fvt FVTClient) DeleteTLSSecrets() { if err := fvt.DeleteSecret(TLSSecretName, fvt.controllerNamespace); err != nil { fvt.log.Error(err, "Unable to delete TLS secret") diff --git a/fvt/globals.go b/fvt/globals.go index 2f89b606..e181993e 100644 --- a/fvt/globals.go +++ b/fvt/globals.go @@ -20,7 +20,7 @@ import ( var Log logr.Logger var FVTClientInstance *FVTClient -var DefaultTimeout = int64(120) +var DefaultTimeout = int64(120) // absolute timeout for watcher event channels var NameSpaceScopeMode = false var DefaultConfig = map[string]interface{}{ @@ -39,6 +39,28 @@ var DefaultConfig = map[string]interface{}{ }, } +var StorageConfigDataMinio = map[string]interface{}{ + "localMinIO": map[string]string{ + "type": "s3", + "access_key_id": "AKIAIOSFODNN7EXAMPLE", + "secret_access_key": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", + "endpoint_url": "http://minio:9000", + "default_bucket": "modelmesh-example-models", + "region": "us-south", + }, +} + +var StorageConfigDataPVC = map[string]interface{}{ + "pvc1": map[string]string{ + "type": "pvc", + "name": "models-pvc-1", + }, + "pvc2": map[string]string{ + "type": "pvc", + "name": "models-pvc-2", + }, +} + var BasicTLSConfig = map[string]interface{}{ "tls": map[string]interface{}{ "secretName": TLSSecretName, @@ -71,4 +93,5 @@ const ( IsvcSamplesPath = "isvcs/" RuntimeSamplesPath = "runtimes/" TLSSecretName = "fvt-tls-secret" + StorageConfigSecretName = "storage-config" ) diff --git a/fvt/helpers.go b/fvt/helpers.go index e586bc46..84b4e6de 100644 --- a/fvt/helpers.go +++ b/fvt/helpers.go @@ -4,13 +4,14 @@ // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + package fvt import ( @@ -20,6 +21,8 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + api "github.com/kserve/modelmesh-serving/apis/serving/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" utilrand "k8s.io/apimachinery/pkg/util/rand" @@ -59,22 +62,22 @@ func CreatePredictorAndWaitAndExpectLoaded(predictorManifest *unstructured.Unstr createdPredictor := FVTClientInstance.CreatePredictorExpectSuccess(predictorManifest) ExpectPredictorState(createdPredictor, false, "Pending", "", "UpToDate") - By("Waiting for predictor" + predictorName + " to be 'Loaded'") + By("Waiting for predictor " + predictorName + " to be 'Loaded'") // TODO: "Standby" (or) "FailedToLoad" states are currently encountered after the "Loading" state but they shouldn't be (see issue#994) resultingPredictor := WaitForLastStateInExpectedList("activeModelState", []string{"Pending", "Loading", "Standby", "FailedToLoad", "Loading", "Loaded"}, watcher) ExpectPredictorState(resultingPredictor, true, "Loaded", "", "UpToDate") return resultingPredictor } -func CreateIsvcAndWaitAndExpectReady(isvcManifest *unstructured.Unstructured) *unstructured.Unstructured { +func CreateIsvcAndWaitAndExpectReady(isvcManifest *unstructured.Unstructured, timeout time.Duration) *unstructured.Unstructured { isvcName := isvcManifest.GetName() By("Creating inference service " + isvcName) - watcher := FVTClientInstance.StartWatchingIsvcs(metav1.ListOptions{FieldSelector: "metadata.name=" + isvcName}, DefaultTimeout) + watcher := FVTClientInstance.StartWatchingIsvcs(metav1.ListOptions{FieldSelector: "metadata.name=" + isvcName}, int64(timeout.Seconds())) defer watcher.Stop() FVTClientInstance.CreateIsvcExpectSuccess(isvcManifest) - By("Waiting for inference service" + isvcName + " to be 'Ready'") + By("Waiting for inference service " + isvcName + " to be 'Ready' and model is 'Loaded'") // ISVC does not have the status field set initially. - resultingIsvc := WaitForIsvcReady(watcher) + resultingIsvc := WaitForIsvcState(watcher, []api.ModelState{api.Standby, api.Loaded}, isvcName, timeout) return resultingIsvc } @@ -87,13 +90,25 @@ func CreatePredictorAndWaitAndExpectFailed(predictorManifest *unstructured.Unstr createdPredictor := FVTClientInstance.CreatePredictorExpectSuccess(predictorManifest) ExpectPredictorState(createdPredictor, false, "Pending", "", "UpToDate") - By("Waiting for predictor" + predictorName + " to be 'FailedToLoaded'") - // "Standby" state is encountered after the "Loading" state but it shouldn't be + By("Waiting for predictor " + predictorName + " to have 'FailedToLoad'") + // "Standby" state is encountered after the "Loading" state, but it shouldn't be resultingPredictor := WaitForLastStateInExpectedList("activeModelState", []string{"Pending", "Loading", "Standby", "Loading", "FailedToLoad"}, watcher) ExpectPredictorState(resultingPredictor, false, "FailedToLoad", "", "UpToDate") return resultingPredictor } +func CreateIsvcAndWaitAndExpectFailed(isvcManifest *unstructured.Unstructured) *unstructured.Unstructured { + isvcName := isvcManifest.GetName() + By("Creating inference service " + isvcName) + watcher := FVTClientInstance.StartWatchingIsvcs(metav1.ListOptions{FieldSelector: "metadata.name=" + isvcName}, DefaultTimeout) + defer watcher.Stop() + FVTClientInstance.CreateIsvcExpectSuccess(isvcManifest) + By("Waiting for inference service " + isvcName + " to fail") + // ISVC does not have the status field set initially. + resultingIsvc := WaitForIsvcState(watcher, []api.ModelState{api.FailedToLoad}, isvcName, PredictorTimeout) + return resultingIsvc +} + func CreatePredictorAndWaitAndExpectInvalidSpec(predictorManifest *unstructured.Unstructured) *unstructured.Unstructured { predictorName := predictorManifest.GetName() @@ -103,7 +118,7 @@ func CreatePredictorAndWaitAndExpectInvalidSpec(predictorManifest *unstructured. createdPredictor := FVTClientInstance.CreatePredictorExpectSuccess(predictorManifest) ExpectPredictorState(createdPredictor, false, "Pending", "", "UpToDate") - By("Waiting for predictor" + predictorName + " to have transitionStatus 'InvalidSpec'") + By("Waiting for predictor " + predictorName + " to have transitionStatus 'InvalidSpec'") return WaitForLastStateInExpectedList("transitionStatus", []string{"UpToDate", "InvalidSpec"}, watcher) } @@ -156,8 +171,10 @@ func ExpectPredictorState(obj *unstructured.Unstructured, available bool, active actualTransitionStatus := GetString(obj, "status", "transitionStatus") Expect(actualTransitionStatus).To(Equal(transitionStatus)) - if transitionStatus != "BlockedByFailedLoad" && transitionStatus != "InvalidSpec" && - activeModelState != "FailedToLoad" && targetModelState != "FailedToLoad" { + if transitionStatus != string(api.BlockedByFailedLoad) && + transitionStatus != string(api.InvalidSpec) && + activeModelState != string(api.FailedToLoad) && + targetModelState != string(api.FailedToLoad) { actualFailureInfo := GetMap(obj, "status", "lastFailureInfo") Expect(actualFailureInfo).To(BeNil()) } @@ -174,7 +191,7 @@ func ExpectPredictorFailureInfo(obj *unstructured.Unstructured, reason string, h Expect(actualFailureInfo["location"]).To(BeNil()) } if message != "" { - Expect(actualFailureInfo["message"]).To(Equal(message)) + Expect(actualFailureInfo["message"]).To(ContainSubstring(message)) } else { Expect(actualFailureInfo["message"]).ToNot(BeEmpty()) } @@ -188,47 +205,95 @@ func ExpectPredictorFailureInfo(obj *unstructured.Unstructured, reason string, h } } -func WaitForIsvcReady(watcher watch.Interface) *unstructured.Unstructured { +func ExpectIsvcState(obj *unstructured.Unstructured, activeModelState, targetModelState, transitionStatus string) { + actualActiveModelState := GetString(obj, "status", "modelStatus", "states", "activeModelState") + Expect(actualActiveModelState).To(Equal(activeModelState)) + + actualTargetModel := GetString(obj, "status", "modelStatus", "states", "targetModelState") + Expect(actualTargetModel).To(Equal(targetModelState)) + + actualTransitionStatus := GetString(obj, "status", "modelStatus", "transitionStatus") + Expect(actualTransitionStatus).To(Equal(transitionStatus)) + + if transitionStatus != "BlockedByFailedLoad" && + transitionStatus != "InvalidSpec" && + activeModelState != string(api.FailedToLoad) && + targetModelState != string(api.FailedToLoad) { + actualFailureInfo := GetMap(obj, "status", "modelStatus", "lastFailureInfo") + Expect(actualFailureInfo).To(BeNil()) + } +} + +func ExpectIsvcFailureInfo(obj *unstructured.Unstructured, reason string, hasLocation bool, hasTime bool, message string) { + actualFailureInfo := GetMap(obj, "status", "modelStatus", "lastFailureInfo") + Expect(actualFailureInfo).ToNot(BeNil()) + + Expect(actualFailureInfo["reason"]).To(Equal(reason)) + if hasLocation { + Expect(actualFailureInfo["location"]).ToNot(BeEmpty()) + } else { + Expect(actualFailureInfo["location"]).To(BeNil()) + } + if message != "" { + Expect(actualFailureInfo["message"]).To(ContainSubstring(message)) + } else { + Expect(actualFailureInfo["message"]).ToNot(BeEmpty()) + } + if !hasTime { + Expect(actualFailureInfo["time"]).To(BeNil()) + } else { + Expect(actualFailureInfo["time"]).ToNot(BeNil()) + actualTime, err := time.Parse(time.RFC3339, actualFailureInfo["time"].(string)) + Expect(err).To(BeNil()) + Expect(time.Since(actualTime) < time.Minute).To(BeTrue()) + } +} + +func WaitForIsvcState(watcher watch.Interface, anyOfDesiredStates []api.ModelState, name string, isvcTimeout time.Duration) *unstructured.Unstructured { ch := watcher.ResultChan() - isReady := false + reachedDesiredState := false var obj *unstructured.Unstructured - var isvcName string + var isvcName = name - timeout := time.After(predictorTimeout) + timeout := time.After(isvcTimeout) done := false for !done { select { // Exit the loop if InferenceService is not ready before given timeout. case <-timeout: done = true + FVTClientInstance.PrintDescribeIsvc(isvcName) case event, ok := <-ch: if !ok { // the channel was closed (watcher timeout reached) done = true + FVTClientInstance.PrintDescribeIsvc(isvcName) break } obj, ok = event.Object.(*unstructured.Unstructured) Expect(ok).To(BeTrue()) isvcName = GetString(obj, "metadata", "name") - conditions, exists := GetSlice(obj, "status", "conditions") + // ISVC does not have the status field set initially + // modelStatus will not exist until status.conditions exist + _, exists := GetSlice(obj, "status", "conditions") if !exists { time.Sleep(time.Second) continue } - for _, condition := range conditions { - conditionMap := condition.(map[string]interface{}) - if conditionMap["type"] == "Ready" { - if conditionMap["status"] == "True" { - isReady = true - done = true - break - } + // Note: first status.conditions[{"Type": "Ready", "Status": "True"}] can + // occur before status.conditions[{"Type": "Ready", "Status": "False"}] + // so we check for "activeModelState" instead + activeModelState := GetString(obj, "status", "modelStatus", "states", "activeModelState") + for _, desiredState := range anyOfDesiredStates { + if activeModelState == string(desiredState) { + reachedDesiredState = true + done = true + break } } - } } - Expect(isReady).To(BeTrue(), "Timeout before InferenceService '%s' ready", isvcName) + Expect(reachedDesiredState).To(BeTrue(), "Timeout before InferenceService '%s' reached any of the activeModelStates %s", isvcName, anyOfDesiredStates) return obj } @@ -244,7 +309,7 @@ func WaitForLastStateInExpectedList(statusAttribute string, expectedStates []str lastState := "UNSEEN" var predictorName string - timeout := time.After(predictorTimeout) + timeout := time.After(PredictorTimeout) lastStateIndex := 0 done := false for !done { @@ -292,26 +357,43 @@ func WaitForLastStateInExpectedList(statusAttribute string, expectedStates []str return obj } -func WaitForStableActiveDeployState() { +func WaitForStableActiveDeployState(timeToStabilize time.Duration) { watcher := FVTClientInstance.StartWatchingDeploys() defer watcher.Stop() - WaitForDeployStatus(watcher, timeForStatusToStabilize) + WaitForRuntimeDeploymentsToBeStable(timeToStabilize, watcher) } -func WaitForDeployStatus(watcher watch.Interface, timeToStabilize time.Duration) { +func WaitForRuntimeDeploymentsToBeStable(timeToStabilize time.Duration, watcher watch.Interface) { ch := watcher.ResultChan() - targetStateReached := false var obj *unstructured.Unstructured var replicas, updatedReplicas, availableReplicas int64 var deployName string - deployStatusesReady := make(map[string]bool) + deploymentReady := make(map[string]bool) + + // Get a list of ServingRuntime deployments + runtimeDeploys := FVTClientInstance.ListDeploys() + for _, deploy := range runtimeDeploys.Items { + // initialize all deployment statuses as not ready + deploymentReady[deploy.Name] = false + } + timeout := timeToStabilize + allReady := false done := false for !done { select { + // The select statement is only used with channels to let a goroutine wait on multiple communication operations. + // The select blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready. + case <-time.After(timeout): + // if no watcher events came in for the given length of timeForStatusToStabilize + // then we assume the deployment status has stabilized and exit the loop + Log.Info(fmt.Sprintf("Timed out after %v without events", timeout)) + done = true + break case event, ok := <-ch: if !ok { - // the channel was closed (watcher timeout reached) + // the channel was closed (watcher timeout reached, see DefaultTimeout) + Log.Info(fmt.Sprintf("Watcher timed out after %v seconds", DefaultTimeout)) done = true break } @@ -325,35 +407,36 @@ func WaitForDeployStatus(watcher watch.Interface, timeToStabilize time.Duration) Log.Info("Watcher got event with object", "name", deployName, - "status.replicas", replicas, - "status.availableReplicas", availableReplicas, - "status.updatedReplicas", updatedReplicas) + "replicas", replicas, + "available", availableReplicas, + "updated", updatedReplicas) if (updatedReplicas == replicas) && (availableReplicas == updatedReplicas) { - deployStatusesReady[deployName] = true - Log.Info(fmt.Sprintf("deployStatusesReady: %v", deployStatusesReady)) - } else { - deployStatusesReady[deployName] = false - } - - // this case executes if no events are received during the timeToStabilize duration - case <-time.After(timeToStabilize): - // check if all deployments are ready - stable := true - for _, status := range deployStatusesReady { - if !status { - stable = false - break + deploymentReady[deployName] = true + Log.Info(fmt.Sprintf("deployStatusesReady: %v", deploymentReady)) + // check if all deployments are ready + allReady = true + for _, thisReady := range deploymentReady { + allReady = allReady && thisReady } - } - if stable { - targetStateReached = true - done = true + // do not exit the loop yet (done=true), deployment my become unstable again, + // wait for timeForStatusToStabilize (see above) + if allReady { + // once we are ready, shorten time to wait for next event + // if we are truly ready no more event will come in + // if we are not yet ready, new events will come in quickly + timeout = TimeForStatusToStabilize + Log.Info(fmt.Sprintf("All deployments are ready: %v", deploymentReady)) + } + } else { + deploymentReady[deployName] = false + // restore the full time to wait in between watcher events + timeout = timeToStabilize } } } - Expect(targetStateReached).To(BeTrue(), "Timeout before deploy '%s' ready(last state was replicas: '%v' updatedReplicas: '%v' availableReplicas: '%v')", - deployName, replicas, updatedReplicas, availableReplicas) + + Expect(allReady).To(BeTrue(), fmt.Sprintf("Timed out before deployments were ready: %v", deploymentReady)) } func logPredictorStatus(obj *unstructured.Unstructured) []interface{} { diff --git a/fvt/inference.go b/fvt/inference.go index 2207b40c..740cafd2 100644 --- a/fvt/inference.go +++ b/fvt/inference.go @@ -4,7 +4,7 @@ // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, @@ -17,7 +17,6 @@ import ( "bytes" "encoding/binary" "encoding/json" - "io/ioutil" "math" "os" @@ -91,7 +90,7 @@ func ExpectSuccessfulInference_openvinoMnistTFSPredict(predictorName string) { Expect(err).ToNot(HaveOccurred()) Expect(inferResponse).ToNot(BeNil()) // NOTE: ModelSpec is not included in the response, so we can't assert on the name - // validate the the activation for the digit 7 is the maximum + // validate the activation for the digit 7 is the maximum activations, err := convertRawOutputContentsTo10Floats(inferResponse.Outputs["Plus214_Output_0"].TensorContent) max := activations[0] maxI := 0 @@ -106,7 +105,7 @@ func ExpectSuccessfulInference_openvinoMnistTFSPredict(predictorName string) { } func ExpectSuccessfulInference_torchserveMARPredict(predictorName string) { - imageBytes, err := ioutil.ReadFile(TestDataPath("0.png")) + imageBytes, err := os.ReadFile(TestDataPath("0.png")) Expect(err).ToNot(HaveOccurred()) inferRequest := &torchserveapi.PredictionsRequest{ diff --git a/fvt/predictor/isvc_test.go b/fvt/predictor/isvc_test.go deleted file mode 100644 index d02254ff..00000000 --- a/fvt/predictor/isvc_test.go +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2021 IBM Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package predictor - -import ( - . "github.com/kserve/modelmesh-serving/fvt" - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" -) - -type FVTInferenceService struct { - name string - inferenceServiceFileName string -} - -var inferenceArray = []FVTInferenceService{ - { - name: "New Format", - inferenceServiceFileName: "new-format-mm.yaml", - }, - { - name: "Old Format", - inferenceServiceFileName: "old-format-mm.yaml", - }, -} - -var _ = Describe("Inference service", Ordered, func() { - for _, i := range inferenceArray { - var _ = Describe("test "+i.name+" isvc", Ordered, func() { - var isvcName string - - It("should successfully load a model", func() { - isvcObject := NewIsvcForFVT(i.inferenceServiceFileName) - isvcName = isvcObject.GetName() - CreateIsvcAndWaitAndExpectReady(isvcObject) - - err := FVTClientInstance.ConnectToModelServing(Insecure) - Expect(err).ToNot(HaveOccurred()) - }) - - It("should successfully run inference", func() { - ExpectSuccessfulInference_sklearnMnistSvm(isvcName) - }) - - AfterAll(func() { - FVTClientInstance.DeleteIsvc(isvcName) - }) - - }) - } -}) diff --git a/fvt/predictor/predictor_suite_test.go b/fvt/predictor/predictor_suite_test.go index 776be4e4..96c98f49 100644 --- a/fvt/predictor/predictor_suite_test.go +++ b/fvt/predictor/predictor_suite_test.go @@ -14,16 +14,14 @@ package predictor import ( - "os" "testing" "time" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/log/zap" - . "github.com/kserve/modelmesh-serving/fvt" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" ) @@ -32,38 +30,11 @@ func TestPredictorSuite(t *testing.T) { RunSpecs(t, "Predictor suite") } -func createFVTClient() { - Log = zap.New(zap.UseDevMode(true), zap.WriteTo(GinkgoWriter)) - Log.Info("Initializing test suite") - - namespace := os.Getenv("NAMESPACE") - if namespace == "" { - namespace = DefaultTestNamespace - } - serviceName := os.Getenv("SERVICENAME") - if serviceName == "" { - serviceName = DefaultTestServiceName - } - controllerNamespace := os.Getenv("CONTROLLERNAMESPACE") - if controllerNamespace == "" { - controllerNamespace = DefaultControllerNamespace - } - NameSpaceScopeMode = os.Getenv("NAMESPACESCOPEMODE") == "true" - Log.Info("Using environment variables", "NAMESPACE", namespace, "SERVICENAME", serviceName, - "CONTROLLERNAMESPACE", controllerNamespace, "NAMESPACESCOPEMODE", NameSpaceScopeMode) - - var err error - FVTClientInstance, err = GetFVTClient(Log, namespace, serviceName, controllerNamespace) - Expect(err).ToNot(HaveOccurred()) - Expect(FVTClientInstance).ToNot(BeNil()) - Log.Info("FVTClientInstance created", "client", FVTClientInstance) -} - var _ = SynchronizedBeforeSuite(func() []byte { // runs *only* on process #1 - createFVTClient() + InitializeFVTClient() - // confirm 3 cluster serving runtimes or serving runtimes exist + // confirm 4 cluster serving runtimes or serving runtimes exist var err error var list *unstructured.UnstructuredList if NameSpaceScopeMode { @@ -75,27 +46,31 @@ var _ = SynchronizedBeforeSuite(func() []byte { Expect(list.Items).To(HaveLen(4)) FVTClientInstance.SetDefaultUserConfigMap() + // ensure that there are no predictors to start FVTClientInstance.DeleteAllPredictors() FVTClientInstance.DeleteAllIsvcs() - // ensure a stable deploy state - WaitForStableActiveDeployState() + // create TLS secrets before start of tests FVTClientInstance.CreateTLSSecrets() + // ensure a stable deploy state + WaitForStableActiveDeployState(time.Second * 30) + return nil }, func(_ []byte) { // runs on *all* processes // create the fvtClient Instance on every other process except the first, since it got created in the above function. if FVTClientInstance == nil { - createFVTClient() + InitializeFVTClient() } + Log.Info("Setup completed") }) var _ = SynchronizedAfterSuite(func() { // runs on *all* processes - // ensure we cleanup any port-forward + // ensure we clean up any port-forward FVTClientInstance.DisconnectFromModelServing() }, func() { // runs *only* on process #1 diff --git a/fvt/predictor/predictor_test.go b/fvt/predictor/predictor_test.go index bfc8e9ee..15d5ea5f 100644 --- a/fvt/predictor/predictor_test.go +++ b/fvt/predictor/predictor_test.go @@ -4,7 +4,7 @@ // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, @@ -123,6 +123,22 @@ var predictorsArray = []FVTPredictor{ // }, } +type FVTInferenceService struct { + name string + inferenceServiceFileName string +} + +var inferenceArray = []FVTInferenceService{ + { + name: "New Format", + inferenceServiceFileName: "new-format-mm.yaml", + }, + { + name: "Old Format", + inferenceServiceFileName: "old-format-mm.yaml", + }, +} + var _ = Describe("Predictor", func() { // Many tests in this block assume a stable state of scaled up deployments // which may not be the case if other Describe blocks run first. So we want to @@ -871,121 +887,6 @@ var _ = Describe("Predictor", func() { Expect(err.Error()).To(ContainSubstring("Unexpected : cannot reshape array")) }) }) - - var _ = Describe("TLS XGBoost inference", Ordered, Serial, func() { - var xgboostPredictorObject *unstructured.Unstructured - var xgboostPredictorName string - - AfterAll(func() { - FVTClientInstance.SetDefaultUserConfigMap() - }) - - It("should successfully run an inference with basic TLS", func() { - FVTClientInstance.UpdateConfigMapTLS(BasicTLSConfig) - - By("Waiting for the deployments replicas to be ready") - WaitForStableActiveDeployState() - - // load the test predictor object - xgboostPredictorObject = NewPredictorForFVT("xgboost-predictor.yaml") - xgboostPredictorName = xgboostPredictorObject.GetName() - CreatePredictorAndWaitAndExpectLoaded(xgboostPredictorObject) - - By("Creating a new connection to ModelServing") - // ensure we are establishing a new connection after the TLS change - FVTClientInstance.DisconnectFromModelServing() - - var timeAsleep int - var mmeshErr error - for timeAsleep <= 30 { - mmeshErr = FVTClientInstance.ConnectToModelServing(TLS) - - if mmeshErr == nil { - Log.Info("Successfully connected to model mesh tls") - break - } - - Log.Info(fmt.Sprintf("Error found, retrying connecting to model-mesh: %s", mmeshErr.Error())) - FVTClientInstance.DisconnectFromModelServing() - timeAsleep += 10 - time.Sleep(time.Second * time.Duration(timeAsleep)) - } - - Expect(mmeshErr).ToNot(HaveOccurred()) - - By("Expect inference to succeed") - ExpectSuccessfulInference_xgboostMushroom(xgboostPredictorName) - - By("Expect inference to succeed via REST proxy") - ExpectSuccessfulRESTInference_xgboostMushroom(xgboostPredictorName, true) - - // cleanup the predictor - FVTClientInstance.DeletePredictor(xgboostPredictorName) - - // disconnect because TLS config will change - FVTClientInstance.DisconnectFromModelServing() - }) - - It("should successfully run an inference with mutual TLS", func() { - FVTClientInstance.UpdateConfigMapTLS(MutualTLSConfig) - - By("Waiting for the deployments replicas to be ready") - WaitForStableActiveDeployState() - - // load the test predictor object - xgboostPredictorObject = NewPredictorForFVT("xgboost-predictor.yaml") - xgboostPredictorName = xgboostPredictorObject.GetName() - CreatePredictorAndWaitAndExpectLoaded(xgboostPredictorObject) - - By("Creating a new connection to ModelServing") - // ensure we are establishing a new connection after the TLS change - FVTClientInstance.DisconnectFromModelServing() - - var timeAsleep int - var mmeshErr error - for timeAsleep <= 30 { - mmeshErr = FVTClientInstance.ConnectToModelServing(MutualTLS) - - if mmeshErr == nil { - Log.Info("Successfully connected to model mesh tls") - break - } - - Log.Info(fmt.Sprintf("Error found, retrying connecting to model-mesh: %s", mmeshErr.Error())) - FVTClientInstance.DisconnectFromModelServing() - timeAsleep += 10 - time.Sleep(time.Second * time.Duration(timeAsleep)) - } - Expect(mmeshErr).ToNot(HaveOccurred()) - - By("Expect inference to succeed") - ExpectSuccessfulInference_xgboostMushroom(xgboostPredictorName) - - // cleanup the predictor - FVTClientInstance.DeletePredictor(xgboostPredictorName) - - // disconnect because TLS config will change - FVTClientInstance.DisconnectFromModelServing() - }) - - It("should fail to run inference when the server has mutual TLS but the client does not present a certificate", func() { - FVTClientInstance.UpdateConfigMapTLS(MutualTLSConfig) - - By("Waiting for the deployments replicas to be ready") - WaitForStableActiveDeployState() - - By("Expect a new connection to fail") - // since the connection switches to TLS, ensure we are establishing a new connection - FVTClientInstance.DisconnectFromModelServing() - // this test is expected to fail to connect due to the TLS cert, so we - // don't retry if it fails - mmeshErr := FVTClientInstance.ConnectToModelServing(TLS) - Expect(mmeshErr).To(HaveOccurred()) - }) - }) - // The TLS tests `Describe` block should be the last one in the list to - // improve efficiency of the tests. Any test after the TLS tests would need - // to wait for the configuration changes to roll out to all Deployments. }) // These tests verify that an invalid Predictor fails to load. These are in a @@ -1083,7 +984,7 @@ var _ = Describe("Non-ModelMesh ServingRuntime", func() { Expect(err).ToNot(HaveOccurred()) By("Waiting for the deployments replicas to be ready") - WaitForStableActiveDeployState() + WaitForStableActiveDeployState(TimeForStatusToStabilize) By("Checking that new ServingRuntime resource exists") FVTClientInstance.GetServingRuntime(runtimeName) @@ -1120,3 +1021,150 @@ var _ = Describe("Non-ModelMesh ServingRuntime", func() { FVTClientInstance.DeletePredictor(obj.GetName()) }) }) + +var _ = Describe("Inference service", Ordered, func() { + for _, i := range inferenceArray { + var _ = Describe("test "+i.name+" isvc", Ordered, func() { + var isvcName string + + It("should successfully load a model", func() { + isvcObject := NewIsvcForFVT(i.inferenceServiceFileName) + isvcName = isvcObject.GetName() + CreateIsvcAndWaitAndExpectReady(isvcObject, PredictorTimeout) + + err := FVTClientInstance.ConnectToModelServing(Insecure) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should successfully run inference", func() { + ExpectSuccessfulInference_sklearnMnistSvm(isvcName) + }) + + AfterAll(func() { + FVTClientInstance.DeleteIsvc(isvcName) + }) + + }) + } +}) + +// The TLS tests `Describe` block should be the last one in the list to +// improve efficiency of the tests. Any test after the TLS tests would need +// to wait for the configuration changes to roll out to all Deployments. +// The TLS tests must run "Serial" (not in parallel with any other tests) since +// the configmap changes trigger deployment rollouts causing runtime pods to +// get terminated and any concurrently running inference requests would fail as +// the gRPC connection to terminating pods breaks. +var _ = Describe("TLS XGBoost inference", Ordered, Serial, func() { + var xgboostPredictorObject *unstructured.Unstructured + var xgboostPredictorName string + + AfterAll(func() { + FVTClientInstance.SetDefaultUserConfigMap() + }) + + It("should successfully run an inference with basic TLS", func() { + By("Updating the user ConfigMap to for basic TLS") + FVTClientInstance.UpdateConfigMapTLS(BasicTLSConfig) + + By("Waiting for stable deploy state after UpdateConfigMapTLS") + WaitForStableActiveDeployState(time.Second * 20) + + // load the test predictor object + xgboostPredictorObject = NewPredictorForFVT("xgboost-predictor.yaml") + xgboostPredictorName = xgboostPredictorObject.GetName() + CreatePredictorAndWaitAndExpectLoaded(xgboostPredictorObject) + + By("Creating a new connection to ModelServing") + // ensure we are establishing a new connection after the TLS change + FVTClientInstance.DisconnectFromModelServing() + + var timeAsleep int + var mmeshErr error + for timeAsleep <= 30 { + mmeshErr = FVTClientInstance.ConnectToModelServing(TLS) + + if mmeshErr == nil { + Log.Info("Successfully connected to model mesh tls") + break + } + + Log.Info(fmt.Sprintf("Error found, retrying connecting to model-mesh: %s", mmeshErr.Error())) + FVTClientInstance.DisconnectFromModelServing() + timeAsleep += 10 + time.Sleep(time.Second * time.Duration(timeAsleep)) + } + + Expect(mmeshErr).ToNot(HaveOccurred()) + + By("Expect inference to succeed") + ExpectSuccessfulInference_xgboostMushroom(xgboostPredictorName) + + By("Expect inference to succeed via REST proxy") + ExpectSuccessfulRESTInference_xgboostMushroom(xgboostPredictorName, true) + + // cleanup the predictor + FVTClientInstance.DeletePredictor(xgboostPredictorName) + + // disconnect because TLS config will change + FVTClientInstance.DisconnectFromModelServing() + }) + + It("should successfully run an inference with mutual TLS", func() { + By("Updating user config for Mutual TLS") + FVTClientInstance.UpdateConfigMapTLS(MutualTLSConfig) + + By("Waiting for stable deploy state after MutualTLSConfig") + WaitForStableActiveDeployState(time.Second * 20) + + // load the test predictor object + xgboostPredictorObject = NewPredictorForFVT("xgboost-predictor.yaml") + xgboostPredictorName = xgboostPredictorObject.GetName() + CreatePredictorAndWaitAndExpectLoaded(xgboostPredictorObject) + + By("Creating a new connection to ModelServing") + // ensure we are establishing a new connection after the TLS change + FVTClientInstance.DisconnectFromModelServing() + + var timeAsleep int + var mmeshErr error + for timeAsleep <= 30 { + mmeshErr = FVTClientInstance.ConnectToModelServing(MutualTLS) + + if mmeshErr == nil { + Log.Info("Successfully connected to model mesh tls") + break + } + + Log.Info(fmt.Sprintf("Error found, retrying connecting to model-mesh: %s", mmeshErr.Error())) + FVTClientInstance.DisconnectFromModelServing() + timeAsleep += 10 + time.Sleep(time.Second * time.Duration(timeAsleep)) + } + Expect(mmeshErr).ToNot(HaveOccurred()) + + By("Expect inference to succeed") + ExpectSuccessfulInference_xgboostMushroom(xgboostPredictorName) + + // cleanup the predictor + FVTClientInstance.DeletePredictor(xgboostPredictorName) + + // disconnect because TLS config will change + FVTClientInstance.DisconnectFromModelServing() + }) + + It("should fail to run inference when the server has mutual TLS but the client does not present a certificate", func() { + FVTClientInstance.UpdateConfigMapTLS(MutualTLSConfig) + + By("Waiting for the deployments replicas to be ready") + WaitForStableActiveDeployState(TimeForStatusToStabilize) + + By("Expect a new connection to fail") + // since the connection switches to TLS, ensure we are establishing a new connection + FVTClientInstance.DisconnectFromModelServing() + // this test is expected to fail to connect due to the TLS cert, so we + // don't retry if it fails + mmeshErr := FVTClientInstance.ConnectToModelServing(TLS) + Expect(mmeshErr).To(HaveOccurred()) + }) +}) \ No newline at end of file diff --git a/fvt/scaleToZero/scaleToZero_suite_test.go b/fvt/scaleToZero/scaleToZero_suite_test.go index 56b78278..5e6f55bf 100644 --- a/fvt/scaleToZero/scaleToZero_suite_test.go +++ b/fvt/scaleToZero/scaleToZero_suite_test.go @@ -14,18 +14,14 @@ package scaleToZero import ( - "os" "testing" "time" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/log/zap" - . "github.com/kserve/modelmesh-serving/fvt" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" ) func TestScaleToZeroSuite(t *testing.T) { @@ -35,35 +31,10 @@ func TestScaleToZeroSuite(t *testing.T) { var _ = SynchronizedBeforeSuite(func() []byte { // runs *only* on process #1 - return nil -}, func(_ []byte) { - // runs on *all* processes - Log = zap.New(zap.UseDevMode(true), zap.WriteTo(GinkgoWriter)) - Log.Info("Initializing test suite") - - namespace := os.Getenv("NAMESPACE") - if namespace == "" { - namespace = DefaultTestNamespace - } - serviceName := os.Getenv("SERVICENAME") - if serviceName == "" { - serviceName = DefaultTestServiceName - } - controllerNamespace := os.Getenv("CONTROLLERNAMESPACE") - if controllerNamespace == "" { - controllerNamespace = DefaultControllerNamespace - } - NameSpaceScopeMode = os.Getenv("NAMESPACESCOPEMODE") == "true" - Log.Info("Using environment variables", "NAMESPACE", namespace, "SERVICENAME", serviceName, - "CONTROLLERNAMESPACE", controllerNamespace, "NAMESPACESCOPEMODE", NameSpaceScopeMode) + InitializeFVTClient() + // confirm 4 cluster serving runtimes or serving runtimes exist var err error - FVTClientInstance, err = GetFVTClient(Log, namespace, serviceName, controllerNamespace) - Expect(err).ToNot(HaveOccurred()) - Expect(FVTClientInstance).ToNot(BeNil()) - Log.Info("FVTClientInstance created", "client", FVTClientInstance) - - // confirm 3 cluster serving runtimes or serving runtimes var list *unstructured.UnstructuredList if NameSpaceScopeMode { list, err = FVTClientInstance.ListServingRuntimes(metav1.ListOptions{}) @@ -86,12 +57,23 @@ var _ = SynchronizedBeforeSuite(func() []byte { FVTClientInstance.DeleteAllPredictors() FVTClientInstance.DeleteAllIsvcs() + // ensure a stable deploy state + WaitForStableActiveDeployState(TimeForStatusToStabilize) + + return nil +}, func(_ []byte) { + // runs on *all* processes + // create the fvtClient Instance on every other process except the first, since it got created in the above function. + if FVTClientInstance == nil { + InitializeFVTClient() + } + Log.Info("Setup completed") }) var _ = SynchronizedAfterSuite(func() { // runs on *all* processes - // ensure we cleanup any port-forward + // ensure we clean up all port-forwards FVTClientInstance.DisconnectFromModelServing() }, func() { // runs *only* on process #1 diff --git a/fvt/scaleToZero/scale_to_zero_test.go b/fvt/scaleToZero/scale_to_zero_test.go index eb13c32e..c2703e85 100644 --- a/fvt/scaleToZero/scale_to_zero_test.go +++ b/fvt/scaleToZero/scale_to_zero_test.go @@ -67,7 +67,7 @@ var _ = Describe("Scaling of runtime deployments to zero", Ordered, func() { It("should scale all runtimes down", func() { By("Waiting for the deployments to stabilize") - WaitForStableActiveDeployState() + WaitForStableActiveDeployState(TimeForStatusToStabilize) // check that all runtimes are scaled to zero expectScaledToZero() @@ -82,7 +82,7 @@ var _ = Describe("Scaling of runtime deployments to zero", Ordered, func() { FVTClientInstance.ApplyPredictorExpectSuccess(testPredictorObject) By("Waiting for the deployments to stabilize") - WaitForStableActiveDeployState() + WaitForStableActiveDeployState(TimeForStatusToStabilize) // check that all runtimes except the one are scaled to zero expectScaledUp() @@ -101,7 +101,7 @@ var _ = Describe("Scaling of runtime deployments to zero", Ordered, func() { FVTClientInstance.ApplyPredictorExpectSuccess(testPredictorObject) By("Waiting for the deployments to stabilize") - WaitForStableActiveDeployState() + WaitForStableActiveDeployState(TimeForStatusToStabilize) // ensure the runtime is ready and scaled up and others are scaled down expectScaledUp() diff --git a/fvt/storage/storage_suite_test.go b/fvt/storage/storage_suite_test.go new file mode 100644 index 00000000..07bd2e77 --- /dev/null +++ b/fvt/storage/storage_suite_test.go @@ -0,0 +1,105 @@ +// Copyright 2023 IBM Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package storage + +import ( + "testing" + "time" + + . "github.com/kserve/modelmesh-serving/fvt" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" +) + +func TestStorage(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Storage Suite") +} + +var _ = SynchronizedBeforeSuite(func() []byte { + // runs *only* on process #1 + InitializeFVTClient() + + // confirm 4 cluster serving runtimes or serving runtimes exist + var err error + var list *unstructured.UnstructuredList + if NameSpaceScopeMode { + list, err = FVTClientInstance.ListServingRuntimes(metav1.ListOptions{}) + } else { + list, err = FVTClientInstance.ListClusterServingRuntimes(metav1.ListOptions{}) + } + Expect(err).ToNot(HaveOccurred()) + Expect(list.Items).To(HaveLen(4)) + + FVTClientInstance.SetDefaultUserConfigMap() + + // replace default storage-config secret for local Minio with PVC config + FVTClientInstance.DeleteStorageConfigSecret() + FVTClientInstance.CreateStorageConfigSecret(StorageConfigDataPVC) + + // ensure that there are no predictors to start + FVTClientInstance.DeleteAllPredictors() + FVTClientInstance.DeleteAllIsvcs() + + // ensure a stable deploy state, on each process since we updated the storage config + Log.Info("Waiting for stable deploy state") + WaitForStableActiveDeployState(time.Second * 60) + + return nil +}, func(_ []byte) { + // runs on *all* processes + // create the fvtClient Instance on every other process except the first, since it got created in the above function. + if FVTClientInstance == nil { + InitializeFVTClient() + } + // connect to model serving service once for all each process + err := FVTClientInstance.ConnectToModelServing(Insecure) + Expect(err).ToNot(HaveOccurred()) + + Log.Info("Setup completed") +}) + +var _ = SynchronizedAfterSuite(func() { + // runs on *all* processes + // ensure we clean up any port-forward for each process + FVTClientInstance.DisconnectFromModelServing() +}, func() { + // runs *only* on process #1 + // restore config defaults + FVTClientInstance.DeleteTLSSecrets() + FVTClientInstance.SetDefaultUserConfigMap() + // reset default storage-config secret + FVTClientInstance.DeleteStorageConfigSecret() + FVTClientInstance.CreateStorageConfigSecret(StorageConfigDataMinio) + // restart pods to reset Bootstrap failure checks + FVTClientInstance.RestartDeploys() +}) + +// register handlers for a failed test case to print info to the console +var startTime string +var _ = JustBeforeEach(func() { + startTime = time.Now().Format("2006-01-02T15:04:05Z") +}) +var _ = JustAfterEach(func() { + if CurrentSpecReport().Failed() { + FVTClientInstance.PrintIsvcs() + FVTClientInstance.PrintPods() + FVTClientInstance.PrintDescribeNodes() + FVTClientInstance.PrintEvents() + FVTClientInstance.TailPodLogs(startTime) + } +}) diff --git a/fvt/storage/storage_test.go b/fvt/storage/storage_test.go new file mode 100644 index 00000000..5c08628c --- /dev/null +++ b/fvt/storage/storage_test.go @@ -0,0 +1,189 @@ +// Copyright 2023 IBM Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.package storage + +package storage + +import ( + "time" + + . "github.com/kserve/modelmesh-serving/fvt" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" +) + +var isvcFiles = map[string]string{ + "isvc-pvc-storage-uri": "isvc-pvc-uri.yaml", + "isvc-pvc-storage-path": "isvc-pvc-path.yaml", + "isvc-pvc2": "isvc-pvc-2.yaml", + "isvc-pvc3": "isvc-pvc-3.yaml", + "isvc-pvc4": "isvc-pvc-4.yaml", +} + +// ISVCs using PVCs from the FVT `storage-config` Secret (config/dependencies/fvt.yaml) +var isvcWithPvcInStorageConfig = []string{"isvc-pvc-storage-uri", "isvc-pvc-storage-path", "isvc-pvc2"} + +// ISVC using PVC not in the FVT `storage-config` Secret (config/dependencies/fvt.yaml) +// this should work only after setting allowAnyPVC = true +var isvcWithPvcNotInStorageConfig = "isvc-pvc3" + +// ISVC using a PVC that does not exist at all, this ISVC should fail to load +var isvcWithNonExistentPvc = "isvc-pvc4" + +var _ = Describe("ISVCs", func() { + + Describe("with PVC in storage-config", Ordered, func() { + + for _, name := range isvcWithPvcInStorageConfig { + + Describe("\""+name+"\"", Ordered, func() { + var isvcName = name + var fileName = isvcFiles[name] + + AfterAll(func() { + FVTClientInstance.DeleteIsvc(isvcName) + }) + + It("should successfully load a model", func() { + isvcObject := NewIsvcForFVT(fileName) + isvcName = isvcObject.GetName() + CreateIsvcAndWaitAndExpectReady(isvcObject, PredictorTimeout) + }) + + It("should successfully run inference", func() { + ExpectSuccessfulInference_sklearnMnistSvm(isvcName) + }) + }) + } + }) + + Describe("with PVC not in storage-config", Ordered, Serial, func() { + var isvcObject *unstructured.Unstructured + + It("should fail with PVC not mounted", func() { + isvcObject = NewIsvcForFVT(isvcFiles[isvcWithPvcNotInStorageConfig]) + + obj := CreateIsvcAndWaitAndExpectFailed(isvcObject) + + By("Asserting on the ISVC state") + ExpectIsvcFailureInfo(obj, "ModelLoadFailed", true, true, "") + + FVTClientInstance.DeleteIsvc(isvcObject.GetName()) + }) + + It("should load a model when allowAnyPVC", func() { + // This ISVC needs a new PVC which is not in the storage-config secret. + // The controller will update the deployment with the pvc_mount, but + // if the old runtime pods are still around, the ISVC will get deployed + // on an old runtime pod without the PVC mounted and keep failing + // until a new pod with the PVC is ready and the controller decides + // to move the ISVC onto the new pod that has the PVC mounted. + // If this process takes a long time so, we may want to take some extra + // measures to increase our chances for quick success: + // - enable scale to zero? to prevent the new ISVC to land on an old + // runtime pod that does not have the "any" PVC mounted yet + // - use more than 1 pod per runtime? assumption: controller keeps the + // old runtime pod (without the PVC mounted) around (pre-stop hook, + // terminationGracePeriodSeconds: 90 # to allow for model propagation) + // since it still has the ISVC on it but no new pod yet to place it + // on. With 2 runtime pods, one pod can get updated with the PVC mount + // and the controller can "move" the ISVC without service interruption + // from the old to the new pod + + // make a shallow copy of default configmap (don't modify the DefaultConfig reference) + // keeping 1 pod per runtime and don't scale to 0 + config := make(map[string]interface{}) + for k, v := range DefaultConfig { + config[k] = v + } + // update the model-serving-config to allow any PVC + config["allowAnyPVC"] = true + + By("Updating the user config to allow any PVC") + FVTClientInstance.ApplyUserConfigMap(config) + + By("Deleting any PVC entries from the storage-config secret") + FVTClientInstance.DeleteStorageConfigSecret() + + // TODO: create a separate test case for not having storage secret at all. + // Currently that is not working. Runtime pod events without it: + // --- + // TYPE REASON MESSAGE + // Normal Scheduled Successfully assigned modelmesh-serving/modelmesh-serving-mlserver-0.x-54685b95d5-6xmck to 10.87.76.74 + // Warning FailedMount Unable to attach or mount volumes: unmounted volumes=[storage-config], unattached volumes=[models-dir models-pvc-3 storage-config tc-config etcd-config kube-api-access-pqz7t]: timed out waiting for the condition + // Warning FailedMount MountVolume.SetUp failed for volume "storage-config" : secret "storage-config" not found + // --- + // recreate the storage-config secret without the PVCs + FVTClientInstance.CreateStorageConfigSecret(StorageConfigDataMinio) + + // after changing the storage-config, the runtime pod(s) restart with + // updated PVC mounts, wait for stability + By("Waiting for stable deploy state") + WaitForStableActiveDeployState(time.Second * 30) + + isvcObject = NewIsvcForFVT(isvcFiles[isvcWithPvcNotInStorageConfig]) + + // print pods before deploying the predictor for debugging purposes + FVTClientInstance.PrintPods() + + // after mounting the new PVC the runtime pod(s) restart again, but + // the ISVC could have landed on the previous runtime pod without the + // PVC mounted yet and hence will fail to load the first time. + // So we extend the standard predictor timeout to wait a bit longer + extendedTimeout := PredictorTimeout * 2 + CreateIsvcAndWaitAndExpectReady(isvcObject, extendedTimeout) + + // print pods after the predictor is ready for debugging purposes + FVTClientInstance.PrintPods() + + // after adding predictor with allowAnyPVC, new pods get created with + // the new pvc mounts, so we want to wait for deployments to stabilize + // in order to avoid connecting to terminating pod which causes the + // port-forward got killed and needs to be re-established (rarely happens) + By("Waiting for stable deploy state before connecting gRPC connection") + WaitForStableActiveDeployState(time.Second * 10) + By("Connecting to model serving service") + err := FVTClientInstance.ConnectToModelServing(Insecure) + Expect(err).ToNot(HaveOccurred()) + + isvcName := isvcObject.GetName() + By("Running an inference request") + ExpectSuccessfulInference_sklearnMnistSvm(isvcName) + + FVTClientInstance.DeleteIsvc(isvcObject.GetName()) + }) + + It("should fail with non-existent PVC", func() { + // make a shallow copy of default configmap (don't modify the DefaultConfig reference) + // keeping 1 pod per runtime and don't scale to 0 + config := make(map[string]interface{}) + for k, v := range DefaultConfig { + config[k] = v + } + // update the model-serving-config to allow any PVC + config["allowAnyPVC"] = true + FVTClientInstance.ApplyUserConfigMap(config) + + By("Waiting for stable deploy state") + WaitForStableActiveDeployState(time.Second * 30) + + isvcObject = NewIsvcForFVT(isvcFiles[isvcWithNonExistentPvc]) + + obj := CreateIsvcAndWaitAndExpectFailed(isvcObject) + ExpectIsvcFailureInfo(obj, "ModelLoadFailed", true, true, "") + + FVTClientInstance.DeleteIsvc(isvcObject.GetName()) + }) + }) +}) diff --git a/fvt/testdata/isvcs/isvc-pvc-2.yaml b/fvt/testdata/isvcs/isvc-pvc-2.yaml new file mode 100644 index 00000000..a0655632 --- /dev/null +++ b/fvt/testdata/isvcs/isvc-pvc-2.yaml @@ -0,0 +1,12 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: isvc-pvc2 + annotations: + serving.kserve.io/deploymentMode: ModelMesh +spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: pvc://models-pvc-2/sklearn/mnist-svm.joblib diff --git a/fvt/testdata/isvcs/isvc-pvc-3.yaml b/fvt/testdata/isvcs/isvc-pvc-3.yaml new file mode 100644 index 00000000..e660226c --- /dev/null +++ b/fvt/testdata/isvcs/isvc-pvc-3.yaml @@ -0,0 +1,12 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: isvc-pvc3 + annotations: + serving.kserve.io/deploymentMode: ModelMesh +spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: pvc://models-pvc-3/sklearn/mnist-svm.joblib diff --git a/fvt/testdata/isvcs/isvc-pvc-4.yaml b/fvt/testdata/isvcs/isvc-pvc-4.yaml new file mode 100644 index 00000000..6ca2a04d --- /dev/null +++ b/fvt/testdata/isvcs/isvc-pvc-4.yaml @@ -0,0 +1,12 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: isvc-pvc4 + annotations: + serving.kserve.io/deploymentMode: ModelMesh +spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: pvc://models-pvc-4/sklearn/mnist-svm.joblib diff --git a/fvt/testdata/isvcs/isvc-pvc-path.yaml b/fvt/testdata/isvcs/isvc-pvc-path.yaml new file mode 100644 index 00000000..71db761c --- /dev/null +++ b/fvt/testdata/isvcs/isvc-pvc-path.yaml @@ -0,0 +1,16 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: isvc-pvc-storage-path + annotations: + serving.kserve.io/deploymentMode: ModelMesh +spec: + predictor: + model: + modelFormat: + name: sklearn + storage: + parameters: + type: pvc + name: models-pvc-1 + path: sklearn/mnist-svm.joblib diff --git a/fvt/testdata/isvcs/isvc-pvc-uri.yaml b/fvt/testdata/isvcs/isvc-pvc-uri.yaml new file mode 100644 index 00000000..5334b703 --- /dev/null +++ b/fvt/testdata/isvcs/isvc-pvc-uri.yaml @@ -0,0 +1,12 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: isvc-pvc-storage-uri + annotations: + serving.kserve.io/deploymentMode: ModelMesh +spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: pvc://models-pvc-1/sklearn/mnist-svm.joblib diff --git a/fvt/utils.go b/fvt/utils.go index 20452485..efaedceb 100644 --- a/fvt/utils.go +++ b/fvt/utils.go @@ -17,19 +17,48 @@ import ( "context" "encoding/json" "fmt" - "io/ioutil" "os" "path/filepath" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" yamlserializer "k8s.io/apimachinery/pkg/runtime/serializer/yaml" "k8s.io/apimachinery/pkg/types" + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) -//Utility function to return the testdata directory +func InitializeFVTClient() { + Log = zap.New(zap.UseDevMode(true), zap.WriteTo(GinkgoWriter)) + Log.Info("Initializing test suite") + + namespace := os.Getenv("NAMESPACE") + if namespace == "" { + namespace = DefaultTestNamespace + } + serviceName := os.Getenv("SERVICENAME") + if serviceName == "" { + serviceName = DefaultTestServiceName + } + controllerNamespace := os.Getenv("CONTROLLERNAMESPACE") + if controllerNamespace == "" { + controllerNamespace = DefaultControllerNamespace + } + NameSpaceScopeMode = os.Getenv("NAMESPACESCOPEMODE") == "true" + Log.Info("Using environment variables", "NAMESPACE", namespace, "SERVICENAME", serviceName, + "CONTROLLERNAMESPACE", controllerNamespace, "NAMESPACESCOPEMODE", NameSpaceScopeMode) + + var err error + FVTClientInstance, err = GetFVTClient(Log, namespace, serviceName, controllerNamespace) + Expect(err).ToNot(HaveOccurred()) + Expect(FVTClientInstance).ToNot(BeNil()) + Log.Info("FVTClientInstance created", "client", FVTClientInstance) +} + +// Utility function to return the testdata directory func TestDataPath(resourcePathWithinTestData string) string { wd, err := os.Getwd() Expect(err).ToNot(HaveOccurred()) @@ -38,7 +67,7 @@ func TestDataPath(resourcePathWithinTestData string) string { } func DecodeResourceFromFile(resourcePath string) *unstructured.Unstructured { - content, err := ioutil.ReadFile(resourcePath) + content, err := os.ReadFile(resourcePath) Expect(err).ToNot(HaveOccurred()) obj := &unstructured.Unstructured{} diff --git a/main.go b/main.go index 3ce4518c..84eafcea 100644 --- a/main.go +++ b/main.go @@ -18,7 +18,6 @@ import ( "context" "flag" "fmt" - "io/ioutil" "log" "os" "regexp" @@ -106,7 +105,7 @@ func main() { // ----- mmesh related envar setup ----- controllerNamespace := os.Getenv(ControllerNamespaceEnvVar) if controllerNamespace == "" { - bytes, err := ioutil.ReadFile(KubeNamespaceFile) + bytes, err := os.ReadFile(KubeNamespaceFile) if err != nil { //TODO check kube context and retrieve namespace from there setupLog.Info("Error reading Kube-mounted namespace file, reverting to default namespace", diff --git a/pkg/config/config.go b/pkg/config/config.go index 6cf11d9e..f8ba28d7 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -4,7 +4,7 @@ // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, diff --git a/pkg/mmesh/etcdrangewatcher_test.go b/pkg/mmesh/etcdrangewatcher_test.go index ad225daf..38c8b030 100644 --- a/pkg/mmesh/etcdrangewatcher_test.go +++ b/pkg/mmesh/etcdrangewatcher_test.go @@ -15,7 +15,7 @@ package mmesh import ( "fmt" - "io/ioutil" + "os" "testing" "github.com/stretchr/testify/assert" @@ -79,7 +79,7 @@ func Test_GetEtcdClientConfig_SuccessWithCertOverwrite(t *testing.T) { Certificate: "myCertificate", CertificateFile: certificateFile} - certData, _ := ioutil.ReadFile(certificateFile) + certData, _ := os.ReadFile(certificateFile) etcdClientConfig, err := getEtcdClientConfig(etcdConfig, map[string][]byte{ certificateFile: certData, }, logger) @@ -103,8 +103,8 @@ func Test_GetEtcdClientConfig_SuccessWithKeyAndCert(t *testing.T) { ClientCertificate: "myClientCertificate", ClientCertificateFile: certificateFile} - certData1, _ := ioutil.ReadFile(keyFile) - certData2, _ := ioutil.ReadFile(certificateFile) + certData1, _ := os.ReadFile(keyFile) + certData2, _ := os.ReadFile(certificateFile) etcdClientConfig, err := getEtcdClientConfig(etcdConfig, map[string][]byte{ keyFile: certData1, certificateFile: certData2, @@ -158,8 +158,8 @@ func Test_CreateEtcdClient_Success(t *testing.T) { ClientKeyFile: keyFile, ClientCertificateFile: certificateFile} - certData1, _ := ioutil.ReadFile(keyFile) - certData2, _ := ioutil.ReadFile(certificateFile) + certData1, _ := os.ReadFile(keyFile) + certData2, _ := os.ReadFile(certificateFile) etcdClient, err := CreateEtcdClient(etcdConfig, map[string][]byte{ keyFile: certData1, certificateFile: certData2, diff --git a/scripts/install.sh b/scripts/install.sh index a73ac028..f4662677 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -132,7 +132,7 @@ wait_for_pods_ready() { fi wait_counter=$((wait_counter + 1)) - echo " Waiting 10 secs..." + echo " Waiting 10 secs ..." sleep 10 done } @@ -254,7 +254,7 @@ if [[ $quickstart == "true" ]]; then info "Deploying quickstart resources for etcd and minio" kubectl apply -f quickstart.yaml - info "Waiting for dependent pods to be up..." + info "Waiting for dependent pods to be up ..." wait_for_pods_ready "-l app=etcd" wait_for_pods_ready "-l app=minio" fi @@ -264,7 +264,7 @@ if [[ $fvt == "true" ]]; then info "Deploying fvt resources for etcd and minio" kubectl apply -f fvt.yaml - info "Waiting for dependent pods to be up..." + info "Waiting for dependent pods to be up ..." wait_for_pods_ready "-l app=etcd" wait_for_pods_ready "-l app=minio" fi @@ -305,7 +305,7 @@ if [[ $namespace_scope_mode == "true" ]]; then rm crd/kustomization.yaml.bak fi -info "Waiting for ModelMesh Serving controller pod to be up..." +info "Waiting for ModelMesh Serving controller pod to be up ..." wait_for_pods_ready "-l control-plane=modelmesh-controller" # Older versions of kustomize have different load restrictor flag formats. @@ -344,4 +344,12 @@ if [[ $namespace_scope_mode != "true" ]] && [[ ! -z $user_ns_array ]]; then fi rm quickstart.yaml quickstart.yaml.bak fvt.yaml fvt.yaml.bak +# wait for FVT storage resources that take long to initialize +# we don't want to wait earlier to not hold up any setup steps +# that happen after the initial FVT install block +if [[ $fvt == "true" ]]; then + info "Waiting for FVT PVC storage to be initialized ..." + kubectl wait --for=condition=complete --timeout=180s job/pvc-init +fi + success "Successfully installed ModelMesh Serving!"