diff --git a/.github/workflows/run-fvt.yml b/.github/workflows/run-fvt.yml index 65f754bb..0b1987d8 100644 --- a/.github/workflows/run-fvt.yml +++ b/.github/workflows/run-fvt.yml @@ -1,4 +1,4 @@ -name: FVTs +name: FVT on: pull_request: @@ -71,7 +71,8 @@ jobs: run: | docker pull nvcr.io/nvidia/tritonserver:21.06.1-py3 docker pull seldonio/mlserver:0.5.2 - docker pull openvino/model_server:2022.1 + docker pull openvino/model_server:2022.2 + # docker pull pytorch/torchserve:0.6.0-cpu docker pull kserve/modelmesh-runtime-adapter docker pull kserve/rest-proxy docker pull kserve/modelmesh @@ -79,7 +80,7 @@ jobs: run: | docker images kubectl get pods - kubectl get servingruntimes + kubectl get clusterservingruntimes - name: Run FVTs run: | go install github.com/onsi/ginkgo/v2/ginkgo diff --git a/Makefile b/Makefile index 29027257..eabe70d5 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ test: # Run fvt tests. This requires an etcd, kubernetes connection, and model serving installation. Ginkgo CLI is used to run them in parallel fvt: - ginkgo -v -p -progress --fail-fast fvt/predictor fvt/scaleToZero --timeout=40m + ginkgo -v -procs=2 --progress --fail-fast fvt/predictor fvt/scaleToZero fvt/storage --timeout=50m # Command to regenerate the grpc go files from the proto files fvt-protoc: diff --git a/config/dependencies/fvt.yaml b/config/dependencies/fvt.yaml index 4ae03642..db6942d5 100644 --- a/config/dependencies/fvt.yaml +++ b/config/dependencies/fvt.yaml @@ -123,3 +123,120 @@ stringData: "default_bucket": "modelmesh-example-models", "region": "us-south" } +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: "models-pvc-1" +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 1Gi +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: "models-pvc-2" +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 1Gi +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: "models-pvc-3" +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 1Gi +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: "pvc-init" +spec: + template: + metadata: + name: "pvc-init-pod" + spec: + restartPolicy: OnFailure + containers: + - name: "copy-pod" + image: kserve/modelmesh-minio-examples:latest + securityContext: + runAsUser: 0 + allowPrivilegeEscalation: false + command: ["/bin/sh", "-ex", "-c"] + args: + - echo copy model files ...; + whoami; + ls -al "${SRC_FOLDER}"; + cp -r "${SRC_FOLDER}"/* "${DST_FOLDER_1}" && + cp -r "${SRC_FOLDER}"/* "${DST_FOLDER_2}" && + cp -r "${SRC_FOLDER}"/* "${DST_FOLDER_3}" && + ls -al "${DST_FOLDER_1}" && + ls -al "${DST_FOLDER_2}" && + ls -al "${DST_FOLDER_3}" && + echo done && + exit 0; + env: + - name: SRC_FOLDER + value: "/data1/modelmesh-example-models" + - name: DST_FOLDER_1 + value: "/mnt/pvc1" + - name: DST_FOLDER_2 + value: "/mnt/pvc2" + - name: DST_FOLDER_3 + value: "/mnt/pvc3" + volumeMounts: + - name: "pvc1" + mountPath: "/mnt/pvc1" + - name: "pvc2" + mountPath: "/mnt/pvc2" + - name: "pvc3" + mountPath: "/mnt/pvc3" + volumes: + - name: "pvc1" + persistentVolumeClaim: + claimName: "models-pvc-1" + - name: "pvc2" + persistentVolumeClaim: + claimName: "models-pvc-2" + - name: "pvc3" + persistentVolumeClaim: + claimName: "models-pvc-3" + backoffLimit: 4 +--- +apiVersion: v1 +kind: Pod +metadata: + name: "pvc-reader" +spec: + containers: + - name: main + image: ubuntu + command: ["/bin/sh", "-ec", "sleep 10000"] + volumeMounts: + - name: "pvc1" + mountPath: "/mnt/pvc1" + - name: "pvc2" + mountPath: "/mnt/pvc2" + - name: "pvc3" + mountPath: "/mnt/pvc3" + volumes: + - name: "pvc1" + persistentVolumeClaim: + claimName: "models-pvc-1" + - name: "pvc2" + persistentVolumeClaim: + claimName: "models-pvc-2" + - name: "pvc3" + persistentVolumeClaim: + claimName: "models-pvc-3" diff --git a/config/dependencies/minio-storage-secret.yaml b/config/dependencies/minio-storage-secret.yaml index 5140a811..1c456d35 100644 --- a/config/dependencies/minio-storage-secret.yaml +++ b/config/dependencies/minio-storage-secret.yaml @@ -1,5 +1,17 @@ +# Copyright 2021 IBM Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. apiVersion: v1 - kind: Secret metadata: name: storage-config diff --git a/config/rbac/cluster-scope/role.yaml b/config/rbac/cluster-scope/role.yaml index 25a7bd75..e0110393 100644 --- a/config/rbac/cluster-scope/role.yaml +++ b/config/rbac/cluster-scope/role.yaml @@ -33,6 +33,7 @@ rules: - "" resources: - endpoints + - persistentvolumeclaims verbs: - get - list diff --git a/config/rbac/namespace-scope/role.yaml b/config/rbac/namespace-scope/role.yaml index 946692d3..8624c152 100644 --- a/config/rbac/namespace-scope/role.yaml +++ b/config/rbac/namespace-scope/role.yaml @@ -33,6 +33,7 @@ rules: - "" resources: - endpoints + - persistentvolumeclaims verbs: - get - list diff --git a/controllers/config/templating.go b/controllers/config/templating.go index 694f7e44..31f8f5b9 100644 --- a/controllers/config/templating.go +++ b/controllers/config/templating.go @@ -16,7 +16,6 @@ package config import ( "bytes" "io" - "io/ioutil" "os" "text/template" @@ -43,9 +42,9 @@ func PathTemplateSource(path string, context interface{}) mf.Source { return templateSource(f, context) } -//A templating manifest source +// A templating manifest source func templateSource(r io.Reader, context interface{}) mf.Source { - b, err := ioutil.ReadAll(r) + b, err := io.ReadAll(r) if err != nil { panic(err) } diff --git a/controllers/modelmesh/etcd.go b/controllers/modelmesh/etcd.go index eb8f46a9..eaf4d226 100644 --- a/controllers/modelmesh/etcd.go +++ b/controllers/modelmesh/etcd.go @@ -23,7 +23,7 @@ const ( kvStoreEnvVar = "KV_STORE" ) -//mimics base/patches/etcd.yaml +// mimics base/patches/etcd.yaml func (m *Deployment) configureMMDeploymentForEtcdSecret(deployment *appsv1.Deployment) error { EtcdSecretName := m.EtcdSecretName diff --git a/controllers/modelmesh/runtime.go b/controllers/modelmesh/runtime.go index f6534433..fa19901a 100644 --- a/controllers/modelmesh/runtime.go +++ b/controllers/modelmesh/runtime.go @@ -69,6 +69,7 @@ func (m *Deployment) addVolumesToDeployment(deployment *appsv1.Deployment) error } // need to mount storage config for built-in adapters and the scenarios where StorageHelper is not disabled + // TODO: what about allowAnyPVC only use case without MinIO if rts.BuiltInAdapter != nil || useStorageHelper(rts) { storageVolume := corev1.Volume{ Name: ConfigStorageMount, diff --git a/controllers/service_controller.go b/controllers/service_controller.go index 22ee2799..ea756b03 100644 --- a/controllers/service_controller.go +++ b/controllers/service_controller.go @@ -11,19 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ package controllers diff --git a/controllers/servingruntime_controller.go b/controllers/servingruntime_controller.go index 521a2c27..7fdefac8 100644 --- a/controllers/servingruntime_controller.go +++ b/controllers/servingruntime_controller.go @@ -11,19 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ package controllers @@ -31,6 +18,7 @@ import ( "context" "encoding/json" "fmt" + "sort" "strings" "sync" "time" @@ -300,40 +288,52 @@ func (r *ServingRuntimeReconciler) Reconcile(ctx context.Context, req ctrl.Reque } func (r *ServingRuntimeReconciler) getPVCs(ctx context.Context, req ctrl.Request, rt *kserveapi.ServingRuntimeSpec, cfg *config.Config) ([]string, error) { + // get the PVCs from the storage-config Secret + storageConfigPVCsMap := make(map[string]struct{}) s := &corev1.Secret{} if err := r.Client.Get(ctx, types.NamespacedName{ Name: modelmesh.StorageSecretName, Namespace: req.Namespace, }, s); err != nil { - return nil, fmt.Errorf("Could not get the storage secret: %w", err) - } - - pvcsMap := make(map[string]struct{}) - var storageConfig map[string]string - for _, storageData := range s.Data { - if err := json.Unmarshal(storageData, &storageConfig); err != nil { - return nil, fmt.Errorf("Could not parse storage configuration json: %w", err) - } - if storageConfig["type"] == StoragePVCType { - if name := storageConfig["name"]; name != "" { - pvcsMap[name] = struct{}{} - } else { - r.Log.V(1).Info("Missing PVC name in storage configuration") + // do not fail with error if storage-config secret does not exist + // TODO: make sure storage-config secret does not get mounted to runtime pod + // MountVolume.SetUp failed for volume "storage-config" : secret "storage-config" not found + r.Log.V(1).Info("Could not get the storage-config secret", + "name", modelmesh.StorageSecretName, + "namespace", req.Namespace) + } else { + var storageConfig map[string]string + for _, storageData := range s.Data { + if err := json.Unmarshal(storageData, &storageConfig); err != nil { + return nil, fmt.Errorf("Could not parse storage configuration json: %w", err) + } + if storageConfig["type"] == StoragePVCType { + if name := storageConfig["name"]; name != "" { + storageConfigPVCsMap[name] = struct{}{} + } else { + r.Log.V(1).Info("Missing PVC name in storage configuration") + } } } } - // add pvcs for predictors when the global config is enabled + // collect PVCs from Predictors when the global flag 'allowAnyPVC' is enabled + predictorPVCsMap := make(map[string]struct{}) if cfg.AllowAnyPVC { restProxyEnabled := cfg.RESTProxy.Enabled + + // use a predicate function to extract the PVCs from Predictors in the registry f := func(p *api.Predictor) bool { if runtimeSupportsPredictor(rt, p, restProxyEnabled, req.Name) && - p.Spec.Storage != nil && p.Spec.Storage.Parameters != nil { + p.Spec.Storage != nil && + p.Spec.Storage.Parameters != nil { + params := *p.Spec.Storage.Parameters - if stype := params["type"]; stype == "pvc" { - if name := params["name"]; name != "" { - pvcsMap[name] = struct{}{} - } + storageType := params["type"] + name := params["name"] + + if storageType == "pvc" && name != "" { + predictorPVCsMap[name] = struct{}{} } } return false @@ -345,9 +345,50 @@ func (r *ServingRuntimeReconciler) getPVCs(ctx context.Context, req ctrl.Request } } } - pvcs := make([]string, 0, len(pvcsMap)) - for pvc := range pvcsMap { - pvcs = append(pvcs, pvc) + + pvcs := make([]string, 0, len(storageConfigPVCsMap)+len(predictorPVCsMap)) + + // append all PVCs from the storage-config secret; + for claimName := range storageConfigPVCsMap { + r.Log.V(2).Info("Add PVC from storage-config to runtime", + "claimName", claimName, + "runtime", req.Name) + pvcs = append(pvcs, claimName) + } + + // for any PVCs not in the storage-config secret, we need to make sure the PVCs + // exists; if we did mount a non-existent PVC to a runtime pod, it would keep it + // in pending state, effectively shutting down inferencing on any and all other + // Predictors for that runtime + for claimName := range predictorPVCsMap { + if _, alreadyAdded := storageConfigPVCsMap[claimName]; alreadyAdded { + // don't add PVCs that we found in the storage-config secret already + continue + } else { + pvc := &corev1.PersistentVolumeClaim{} + if err := r.Client.Get(ctx, types.NamespacedName{ + Name: claimName, + Namespace: req.Namespace, + }, pvc); err != nil { + r.Log.Error(err, "Could not find PVC in namespace", + "claimName", claimName, "namespace", req.Namespace) + } else { + r.Log.V(2).Info("Add any PVC from predictors to runtime", + "claimName", claimName, + "runtime", req.Name) + pvcs = append(pvcs, claimName) + } + } + } + + // we must sort the PVCs to avoid that otherwise identical runtime deployment + // specs are treated as different by Kubernetes causing unwanted cycles of + // runtimes getting terminated again and again just because the reconciler + // ordered the same set of PVCs in a different way + if len(pvcs) > 0 { + sort.Strings(pvcs) + r.Log.V(1).Info("Adding PVCs to runtime", + "pvcs", pvcs, "runtime", req.Name) } return pvcs, nil @@ -388,7 +429,7 @@ func (r *ServingRuntimeReconciler) determineReplicasAndRequeueDuration(ctx conte defer r.runtimeInfoMapMutex.Unlock() // initialize runtime information map if it is nil - // eg. if this is the first reconcile for any runtime + // e.g. if this is the first reconcile for any runtime if r.runtimeInfoMap == nil { r.runtimeInfoMap = make(map[types.NamespacedName]*runtimeInfo) } @@ -413,7 +454,7 @@ func (r *ServingRuntimeReconciler) determineReplicasAndRequeueDuration(ctx conte return scaledUp, time.Duration(0), nil } - // if this is the first time we see no predictors update the runtime info with + // if this is the first time we see no predictors, update the runtime info with // this transition if targetRuntimeInfo.TimeTransitionedToNoPredictors == nil { log.Info("Runtime no longer has any predictors, will scale to zero after grace period", diff --git a/controllers/suite_test.go b/controllers/suite_test.go index f519fa8b..e090467b 100644 --- a/controllers/suite_test.go +++ b/controllers/suite_test.go @@ -32,7 +32,6 @@ package controllers import ( "context" "io" - "io/ioutil" "os" "path" "path/filepath" @@ -211,7 +210,7 @@ func getDefaultConfig() (*config2.Config, error) { if defaultTestConfigFileContents == nil { var err error var testConfigFile = "./testdata/test-config-defaults.yaml" - if defaultTestConfigFileContents, err = ioutil.ReadFile(testConfigFile); err != nil { + if defaultTestConfigFileContents, err = os.ReadFile(testConfigFile); err != nil { return nil, err } } @@ -228,7 +227,7 @@ func getPayloadProcessingConfig(loadErrorFile bool) (*config2.Config, error) { } else { testConfigFile = "./testdata/test-config-payload-processor.yaml" } - if payloadProcessingTestConfigFileContents, err = ioutil.ReadFile(testConfigFile); err != nil { + if payloadProcessingTestConfigFileContents, err = os.ReadFile(testConfigFile); err != nil { return nil, err } } diff --git a/docs/install/README.md b/docs/install/README.md index 6000b3d1..88c486e1 100644 --- a/docs/install/README.md +++ b/docs/install/README.md @@ -8,7 +8,7 @@ - **etcd** - ModelMesh Serving requires an [etcd](https://etcd.io/) server in order to coordinate internal state which can be either dedicated or shared. More on this later. -- **S3-compatible object storage** - Before models can be deployed, a remote S3-compatible datastore is needed from which to pull the model data. This could be for example an [IBM Cloud Object Storage](https://www.ibm.com/cloud/object-storage) instance, or a locally running [MinIO](https://github.com/minio/minio) deployment. Note that this is not required to be in place prior to the initial installation. +- **Model storage** - The model files have to be stored in a compatible form of remote storage or on a Kubernetes Persistent Volume. For more information about supported storage options take a look at our [storage setup](/docs/predictors/setup-storage.md) page. We provide an install script `--quickstart` option to quickly run ModelMesh Serving with a provisioned etcd server. This may be useful for experimentation or development but should not be used in production. diff --git a/docs/install/install-script.md b/docs/install/install-script.md index 42ed6532..fc33249d 100644 --- a/docs/install/install-script.md +++ b/docs/install/install-script.md @@ -2,13 +2,14 @@ ## Prerequisites -- **Kubernetes cluster** - A Kubernetes cluster is required. You will need `cluster-admin` authority in order to complete all of the prescribed steps. +- **Kubernetes cluster** - A Kubernetes cluster is required. You will need + `cluster-admin` authority in order to complete all of the prescribed steps. - **Kubectl and Kustomize** - The installation will occur via the terminal using [kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl) and [kustomize](https://kubectl.docs.kubernetes.io/installation/kustomize/). - **etcd** - ModelMesh Serving requires an [etcd](https://etcd.io/) server in order to coordinate internal state which can be either dedicated or shared. More on this later. -- **S3-compatible object storage** - Before models can be deployed, a remote S3-compatible datastore is needed from which to pull the model data. This could be for example an [IBM Cloud Object Storage](https://www.ibm.com/cloud/object-storage) instance, or a locally running [MinIO](https://github.com/minio/minio) deployment. Note that this is not required to be in place prior to the initial installation. +- **Model storage** - The model files have to be stored in a compatible form of remote storage or on a Kubernetes Persistent Volume. For more information about supported storage options take a look at our [storage setup](/docs/predictors/setup-storage.md) page. We provide an install script to quickly run ModelMesh Serving with a provisioned etcd server. This may be useful for experimentation or development but should not be used in production. diff --git a/docs/predictors/setup-storage.md b/docs/predictors/setup-storage.md index ad08152f..55630d00 100644 --- a/docs/predictors/setup-storage.md +++ b/docs/predictors/setup-storage.md @@ -1,14 +1,16 @@ # Set up Storage for Loading Models -You will need access to an S3-compatible object storage, for example [MinIO](https://github.com/minio/minio). To provide access to the object storage, use the `storage-config` secret. +You will need access to an S3-compatible object storage, for example [MinIO](https://github.com/minio/minio). To configure access to the object storage, use the `storage-config` secret. -## Deploy a model from your own object storage +Alternatively, models can be stored on a Kubernetes Persistent Volume. Persistent Volume Claims can either be pre-configured in the `storage-config` secret, or, the `allowAnyPVC` configuration flag can be enabled, so that any PVC can be mounted dynamically at the time a predictor or inference service is deployed. -1. Download sample model or use an existing model +## Deploy a model from your own S3 compatible object storage + +### 1. Download sample model or use an existing model Here we show an example using an [ONNX model for MNIST](https://github.com/onnx/models/raw/ad5c181f1646225f034fba1862233ecb4c262e04/vision/classification/mnist/model/mnist-8.onnx). -2. Add your ONNX saved model to S3-based object storage +### 2. Add your saved model to S3-based object storage A bucket in MinIO needs to be created to copy the model into, which either requires [MinIO Client](https://docs.min.io/docs/minio-client-quickstart-guide.html) or port-forwarding the minio service and logging in using the web interface. @@ -56,7 +58,7 @@ $ mc ls myminio/models/onnx [2021-06-11 11:55:48 EDT] 26KiB mnist-8.onnx ``` -3. Add a storage entry to the `storage-config` secret +### 3. Add a storage entry to the `storage-config` secret Ensure there is a key defined in the common `storage-config` secret corresponding to the S3-based storage instance holding your model. The value of this secret key should be JSON like the following, `default_bucket` is optional. @@ -95,3 +97,239 @@ azureKey: | ``` Remember that after updating the storage config secret, there may be a delay of up to 2 minutes until the change is picked up. You should take this into account when creating/updating `InferenceService`s that use storage keys which have just been added or updated - they may fail to load otherwise. + +## Deploy a model stored on a Persistent Volume Claim + +Models can be stored on [Kubernetes Persistent Volumes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/). + +There are two ways to enable PVC support in ModelMesh: + +1. The Persistent Volume Claims can be added in the `storage-config` secret. This way all PVCs will be mounted to all serving runtime pods. +2. The `allowAnyPVC` configuration flag can be set to `true`. This way the Modelmesh controller will dynamically mount the PVC to a runtime pod at the time a predictor or inference service requiring it is being deployed. + +Follow the example instructions below to create a PVC, store a model on it, and configure ModelMesh to mount the PVC to the runtime serving pods so that the model can be loaded for inferencing. + +### 1. Create a Persistent Volume Claim + +Persistent Volumes are namespace-scoped, so we have to create it in the same namespace as the ModelMesh serving deployment. We are using namespace `modelmesh-serving` here. + +```shell +kubectl config set-context --current --namespace=modelmesh-serving +``` + +Now we create the Persistent Volume Claim `my-models-pvc`. Along with it, we deploy a `pvc-access` pod in order to copy our model to the Persistent Volume later. + +```shell +kubectl apply -f - < 8008 +``` + +With `curl` we can perform an inference request to the SKLearn MNIST model. Make sure the `MODEL_NAME` +variable is set to the name of your `InferenceService`. + +```shell +MODEL_NAME="sklearn-pvc-example" + +curl -X POST -k "http://localhost:8008/v2/models/${MODEL_NAME}/infer" -d '{"inputs": [{ "name": "predict", "shape": [1, 64], "datatype": "FP32", "data": [0.0, 0.0, 1.0, 11.0, 14.0, 15.0, 3.0, 0.0, 0.0, 1.0, 13.0, 16.0, 12.0, 16.0, 8.0, 0.0, 0.0, 8.0, 16.0, 4.0, 6.0, 16.0, 5.0, 0.0, 0.0, 5.0, 15.0, 11.0, 13.0, 14.0, 0.0, 0.0, 0.0, 0.0, 2.0, 12.0, 16.0, 13.0, 0.0, 0.0, 0.0, 0.0, 0.0, 13.0, 16.0, 16.0, 6.0, 0.0, 0.0, 0.0, 0.0, 16.0, 16.0, 16.0, 7.0, 0.0, 0.0, 0.0, 0.0, 11.0, 13.0, 12.0, 1.0, 0.0]}]}' +``` + +The response should look like the following: + +```json +{ + "model_name": "sklearn-pvc-example__isvc-3d2daa3370", + "outputs": [ + {"name": "predict", "datatype": "INT64", "shape": [1], "data": [8]} + ] +} +``` + +You can find more detailed information about running inference requests [here](run-inference.md). + +To delete the resources created in this example, run the following commands: + +```shell +kubectl delete isvc "sklearn-pvc-example" +kubectl delete pod "pvc-access" +kubectl delete pvc "my-models-pvc" +``` diff --git a/docs/quickstart.md b/docs/quickstart.md index a75f70e9..7eb52436 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -270,7 +270,8 @@ To see more detailed instructions and information, click [here](./predictors/run ## 4. (Optional) Deleting your ModelMesh Serving installation -To delete all ModelMesh Serving resources that were installed, run the following from the root of the project: +To delete all ModelMesh Serving resources that were installed, run the following +command from the root of the project: ```shell ./scripts/delete.sh --namespace modelmesh-serving diff --git a/docs/release-process.md b/docs/release-process.md index 46abe050..97af46c0 100644 --- a/docs/release-process.md +++ b/docs/release-process.md @@ -243,7 +243,14 @@ release blog. For reference, here are a few examples of previous release blogs featuring ModelMesh: -- [v0.7.0](https://kserve.github.io/website/blog/articles/2021-10-11-KServe-0.7-release) -- [v0.8.0](https://kserve.github.io/website/0.8/blog/articles/2022-02-18-KServe-0.8-release/#modelmesh-updates) +- [v0.10.0](https://kserve.github.io/website/0.10/blog/articles/2023-02-05-KServe-0.10-release/#modelmesh-updates) - [v0.9.0](https://kserve.github.io/website/0.9/blog/articles/2022-07-21-KServe-0.9-release/#inferenceservice-api-for-modelmesh) +- [v0.8.0](https://kserve.github.io/website/0.8/blog/articles/2022-02-18-KServe-0.8-release/#modelmesh-updates) +- [v0.7.0](https://kserve.github.io/website/0.7/blog/articles/2021-10-11-KServe-0.7-release) + +And the corresponding PRs to illustrate the process and the participants: + - [v0.10.0 draft](https://github.com/kserve/website/pull/227#discussion_r1098917277) +- [v0.9.0 draft](https://github.com/kserve/website/pull/166) +- [v0.8.0 draft](https://github.com/kserve/website/pull/105) +- [v0.7.0 draft](https://github.com/kserve/website/pull/49) diff --git a/fvt/README.md b/fvt/README.md index c3c312f1..ef557243 100644 --- a/fvt/README.md +++ b/fvt/README.md @@ -33,7 +33,7 @@ If starting with a fresh namespace, install ModelMesh Serving configured for the ./scripts/install.sh --namespace modelmesh-serving --fvt --dev-mode-logging ``` -To re-configure an existing quick-start instance for FVTs, run: +To re-configure an existing "quickstart" deployment for FVTs, run: ```Shell kubectl apply -f config/dependencies/fvt.yaml diff --git a/fvt/fvtclient.go b/fvt/fvtclient.go index e6d1b5b7..97d50154 100644 --- a/fvt/fvtclient.go +++ b/fvt/fvtclient.go @@ -4,7 +4,7 @@ // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, @@ -20,7 +20,7 @@ import ( "encoding/json" "errors" "fmt" - "io/ioutil" + "io" "math/rand" "net/http" "os/exec" @@ -57,8 +57,8 @@ import ( torchserveapi "github.com/kserve/modelmesh-serving/fvt/generated/torchserve/apis" ) -const predictorTimeout = time.Second * 120 -const timeForStatusToStabilize = time.Second * 5 +const PredictorTimeout = time.Second * 120 // absolute time to wait for predictor to become ready +const TimeForStatusToStabilize = time.Second * 5 // time to wait between watcher events before assuming a stable state type ModelServingConnectionType int @@ -75,6 +75,19 @@ var applyPatchOptions = metav1.PatchOptions{ Force: func() *bool { t := true; return &t }(), } +// list option for serving runtime deployments +var deploymentListOptions = metav1.ListOptions{ + LabelSelector: "modelmesh-service", + TimeoutSeconds: &DefaultTimeout, +} + +// list option for serving runtime pods +var runtimePodListOptions = metav1.ListOptions{ + LabelSelector: "modelmesh-service=modelmesh-serving", + FieldSelector: "status.phase=Running", + TimeoutSeconds: &DefaultTimeout, +} + type FVTClient struct { dynamic.Interface namespace string @@ -95,6 +108,7 @@ type ModelMeshPortForward struct { cmdArgs []string done chan struct{} log logr.Logger + podName string } func (pf *ModelMeshPortForward) EnsureStarted() error { @@ -142,12 +156,15 @@ func (pf *ModelMeshPortForward) EnsureStopped() { <-pf.done } +// NewModelMeshPortForward switched to port-forwarding to a pod instead of the +// service, since, when port-forwarding to a Service, it just picks any pod to +// port-forward to without any guardrails against selecting a Terminating pod. +// It doesn't use readiness checks for pod selection, it seems to actually select +// the oldest pod which ends up being most likely to be terminated soon func NewModelMeshPortForward(namespace string, podName string, localPort int, targetPort int, log logr.Logger) *ModelMeshPortForward { portMap := fmt.Sprintf("%d:%d", localPort, targetPort) - cmdArgs := []string{"port-forward", "--namespace", namespace, "--address", "0.0.0.0", - "pod/" + podName, portMap} - - return &ModelMeshPortForward{nil, cmdArgs, nil, log} + cmdArgs := []string{"port-forward", "-n", namespace, "--address", "0.0.0.0", "pod/" + podName, portMap} + return &ModelMeshPortForward{nil, cmdArgs, nil, log, podName} } func GetFVTClient(log logr.Logger, namespace, serviceName, controllerNamespace string) (*FVTClient, error) { @@ -225,10 +242,10 @@ var ( Version: v1beta1.SchemeGroupVersion.Version, Resource: "inferenceservices", // this must be the plural form } - gvrEndpoints = schema.GroupVersionResource{ + gvrPods = schema.GroupVersionResource{ Group: "", Version: "v1", - Resource: "endpoints", // this must be the plural form + Resource: "pods", // this must be the plural form } ) @@ -280,6 +297,7 @@ func (fvt *FVTClient) ListServingRuntimes(options metav1.ListOptions) (*unstruct func (fvt *FVTClient) ListClusterServingRuntimes(options metav1.ListOptions) (*unstructured.UnstructuredList, error) { return fvt.Resource(gvrCRuntime).List(context.TODO(), options) } + func (fvt *FVTClient) GetPredictor(name string) *unstructured.Unstructured { obj, err := fvt.Resource(gvrPredictor).Namespace(fvt.namespace).Get(context.TODO(), name, metav1.GetOptions{}) Expect(err).ToNot(HaveOccurred()) @@ -346,18 +364,15 @@ func (fvt *FVTClient) WatchPredictorsAsync(c chan *unstructured.Unstructured, op } -func (fvt *FVTClient) GetRandomReadyRuntimePodNameFromEndpoints() string { - obj, err := fvt.Resource(gvrEndpoints).Namespace(fvt.namespace).Get(context.TODO(), fvt.serviceName, metav1.GetOptions{}) - Expect(err).ToNot(HaveOccurred()) - - var endpoints corev1.Endpoints - err = runtime.DefaultUnstructuredConverter.FromUnstructured(obj.Object, &endpoints) - Expect(err).ToNot(HaveOccurred()) +func (fvt *FVTClient) GetRandomReadyRuntimePod() string { + runtimePods := fvt.ListReadyRuntimePods() + numPods := len(runtimePods.Items) + Expect(numPods).ToNot(BeZero(), "There are no 'Ready' runtime pods") - addresses := endpoints.Subsets[0].Addresses - randomAddress := addresses[rand.Intn(len(addresses))] + i := rand.Intn(numPods) + name := runtimePods.Items[i].Name - return randomAddress.TargetRef.Name + return name } func (fvt *FVTClient) PrintPredictors() { @@ -374,6 +389,13 @@ func (fvt *FVTClient) PrintIsvcs() { } } +func (fvt *FVTClient) PrintDescribeIsvc(name string) { + err := fvt.RunKubectl("describe", "isvc", name) + if err != nil { + fvt.log.Error(err, fmt.Sprintf("Error running describe isvc '%s' command", name)) + } +} + func (fvt *FVTClient) PrintPods() { err := fvt.RunKubectl("get", "pods") if err != nil { @@ -412,12 +434,12 @@ func (fvt *FVTClient) TailPodLogs(sinceTime string) { func (fvt *FVTClient) RunKubectl(args ...string) error { args = append(args, "-n", fvt.namespace) - getPredictorCommand := exec.Command("kubectl", args...) - getPredictorCommand.Stdout = ginkgo.GinkgoWriter - getPredictorCommand.Stderr = ginkgo.GinkgoWriter - fvt.log.Info("Running command", "args", strings.Join(getPredictorCommand.Args, " ")) + kubectlCmd := exec.Command("kubectl", args...) + kubectlCmd.Stdout = ginkgo.GinkgoWriter + kubectlCmd.Stderr = ginkgo.GinkgoWriter + fvt.log.Info("Running command", "args", strings.Join(kubectlCmd.Args, " ")) fmt.Fprintf(ginkgo.GinkgoWriter, "=====================================================================================================================================\n") - err := getPredictorCommand.Run() + err := kubectlCmd.Run() fmt.Fprintf(ginkgo.GinkgoWriter, "=====================================================================================================================================\n") return err } @@ -465,7 +487,7 @@ func (fvt *FVTClient) RunKfsRestInference(modelName string, body []byte, tls boo return "", fmt.Errorf("Request failed with code %d", response.StatusCode) } - resp, err := ioutil.ReadAll(response.Body) + resp, err := io.ReadAll(response.Body) return string(resp), err } @@ -494,21 +516,44 @@ func (fvt *FVTClient) RunTorchserveInference(req *torchserveapi.PredictionsReque } func (fvt *FVTClient) ConnectToModelServing(connectionType ModelServingConnectionType) error { + // check if the gRPC and REST connection runtime pods are still around + if fvt.grpcPortForward != nil { + podName := fvt.grpcPortForward.podName + _, err := fvt.Resource(gvrPods).Namespace(fvt.namespace).Get(context.TODO(), podName, metav1.GetOptions{}) + if err != nil { + fvt.log.Info("Lost gRPC connection to pod " + podName + ". Reconnecting ...") + fvt.disconnectGrpcConnection() + } else { + fvt.log.V(2).Info("Still gRPC connected to pod " + podName) + } + } + if fvt.restPortForward != nil { + podName := fvt.restPortForward.podName + _, err := fvt.Resource(gvrPods).Namespace(fvt.namespace).Get(context.TODO(), podName, metav1.GetOptions{}) + if err != nil { + fvt.log.Info("Lost REST connection to pod " + podName + ". Reconnecting ...") + fvt.disconnectRestConnection() + } else { + fvt.log.V(2).Info("Still REST connected to pod " + podName) + } + } + + // (re-)create the gRPC and REST port-forwards if necessary if fvt.grpcPortForward == nil { - podName := fvt.GetRandomReadyRuntimePodNameFromEndpoints() + podName := fvt.GetRandomReadyRuntimePod() fvt.grpcPortForward = NewModelMeshPortForward(fvt.namespace, podName, fvt.grpcPort, 8033, fvt.log) } if fvt.restPortForward == nil { - podName := fvt.GetRandomReadyRuntimePodNameFromEndpoints() + podName := fvt.GetRandomReadyRuntimePod() fvt.restPortForward = NewModelMeshPortForward(fvt.namespace, podName, fvt.restPort, 8008, fvt.log) } + // start the port-forwards if err := fvt.grpcPortForward.EnsureStarted(); err != nil { - return fmt.Errorf("Error with grpc port-forward, could not connect to model serving") + return fmt.Errorf("Error with gRPC port-forward, could not connect to model serving") } - if err := fvt.restPortForward.EnsureStarted(); err != nil { - return fmt.Errorf("Error with rest port-forward, could not connect to model serving") + return fmt.Errorf("Error with REST port-forward, could not connect to model serving") } ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) @@ -576,6 +621,11 @@ func (fvt *FVTClient) DisconnectFromModelServing() { if fvt == nil { return } + fvt.disconnectGrpcConnection() + fvt.disconnectRestConnection() +} + +func (fvt *FVTClient) disconnectGrpcConnection() { if fvt.grpcConn != nil { fvt.grpcConn.Close() fvt.grpcConn = nil @@ -584,7 +634,9 @@ func (fvt *FVTClient) DisconnectFromModelServing() { fvt.grpcPortForward.EnsureStopped() fvt.grpcPortForward = nil } +} +func (fvt *FVTClient) disconnectRestConnection() { if fvt.restConn != nil { fvt.restConn.CloseIdleConnections() fvt.restConn = nil @@ -608,7 +660,7 @@ func (fvt *FVTClient) ApplyUserConfigMap(config map[string]interface{}) { cmu := &unstructured.Unstructured{ Object: map[string]interface{}{ "apiVersion": "v1", - "kind": "ConfigMap", + "kind": ConfigMapKind, "metadata": map[string]interface{}{ "name": UserConfigMapName, }, @@ -626,6 +678,33 @@ func (fvt *FVTClient) ApplyUserConfigMap(config map[string]interface{}) { Expect(err).ToNot(HaveOccurred()) } +func (fvt *FVTClient) CreateStorageConfigSecret(storageConfigData map[string]interface{}) { + var stringConfigData = map[string]string{} + + for k, v := range storageConfigData { + jsonValue, err := json.Marshal(v) + Expect(err).ToNot(HaveOccurred()) + stringConfigData[k] = string(jsonValue) + } + + var StorageSecret = corev1.Secret{ + Type: corev1.SecretTypeOpaque, + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: SecretKind, + }, + ObjectMeta: metav1.ObjectMeta{ + Name: StorageConfigSecretName, + }, + StringData: stringConfigData, + } + + CreateSecret(&StorageSecret, fvt.controllerNamespace, fvt) + if fvt.namespace != fvt.controllerNamespace { + CreateSecret(&StorageSecret, fvt.namespace, fvt) + } +} + func (fvt *FVTClient) CreateTLSSecrets() { err := fvt.certGenerator.generate() Expect(err).ToNot(HaveOccurred()) @@ -669,11 +748,7 @@ func (fvt *FVTClient) UpdateConfigMapTLS(tlsConfig map[string]interface{}) { } func (fvt *FVTClient) StartWatchingDeploys() watch.Interface { - listOptions := metav1.ListOptions{ - LabelSelector: "modelmesh-service", - TimeoutSeconds: &DefaultTimeout, - } - deployWatcher, err := fvt.Resource(gvrDeployment).Namespace(fvt.namespace).Watch(context.TODO(), listOptions) + deployWatcher, err := fvt.Resource(gvrDeployment).Namespace(fvt.namespace).Watch(context.TODO(), deploymentListOptions) Expect(err).ToNot(HaveOccurred()) return deployWatcher } @@ -682,8 +757,7 @@ func (fvt *FVTClient) ListDeploys() appsv1.DeploymentList { var err error // query for UnstructuredList using the dynamic client - listOptions := metav1.ListOptions{LabelSelector: "modelmesh-service", TimeoutSeconds: &DefaultTimeout} - u, err := fvt.Resource(gvrDeployment).Namespace(fvt.namespace).List(context.TODO(), listOptions) + u, err := fvt.Resource(gvrDeployment).Namespace(fvt.namespace).List(context.TODO(), deploymentListOptions) Expect(err).ToNot(HaveOccurred()) // convert elements from Unstructured to Deployment @@ -698,6 +772,32 @@ func (fvt *FVTClient) ListDeploys() appsv1.DeploymentList { return deployments } +func (fvt *FVTClient) ListReadyRuntimePods() corev1.PodList { + var err error + + // query for UnstructuredList using the dynamic client + u, err := fvt.Resource(gvrPods).Namespace(fvt.namespace).List(context.TODO(), runtimePodListOptions) + Expect(err).ToNot(HaveOccurred()) + + // convert elements from Unstructured to Pod + var pods corev1.PodList + for _, up := range u.Items { + var p corev1.Pod + err = runtime.DefaultUnstructuredConverter.FromUnstructured(up.Object, &p) + Expect(err).ToNot(HaveOccurred()) + + // make sure to only return pods that are 'Ready' + // https://github.com/knative-sandbox/eventing-kafka-broker/pull/2523/files#diff-a9f3c3b + for _, c := range p.Status.Conditions { + if c.Type == corev1.PodReady && c.Status == corev1.ConditionTrue { + pods.Items = append(pods.Items, p) + } + } + } + + return pods +} + func (fvt *FVTClient) RestartDeploys() { // trigger a restart by patching an annotation with a timestamp // generate the JSON patch that adds/modifies the annotation @@ -741,6 +841,18 @@ func (fvt *FVTClient) DeleteConfigMap(resourceName string) error { return nil } +func (fvt *FVTClient) DeleteStorageConfigSecret() { + fvt.log.Info("Delete storage-config secret ...") + if err := fvt.DeleteSecret(StorageConfigSecretName, fvt.controllerNamespace); err != nil { + fvt.log.Error(err, "Unable to delete storage-config secret") + } + if fvt.namespace != fvt.controllerNamespace { + if err := fvt.DeleteSecret(StorageConfigSecretName, fvt.namespace); err != nil { + fvt.log.Error(err, "Unable to delete user namespaced storage-config secret") + } + } +} + func (fvt FVTClient) DeleteTLSSecrets() { if err := fvt.DeleteSecret(TLSSecretName, fvt.controllerNamespace); err != nil { fvt.log.Error(err, "Unable to delete TLS secret") diff --git a/fvt/globals.go b/fvt/globals.go index 2f89b606..e181993e 100644 --- a/fvt/globals.go +++ b/fvt/globals.go @@ -20,7 +20,7 @@ import ( var Log logr.Logger var FVTClientInstance *FVTClient -var DefaultTimeout = int64(120) +var DefaultTimeout = int64(120) // absolute timeout for watcher event channels var NameSpaceScopeMode = false var DefaultConfig = map[string]interface{}{ @@ -39,6 +39,28 @@ var DefaultConfig = map[string]interface{}{ }, } +var StorageConfigDataMinio = map[string]interface{}{ + "localMinIO": map[string]string{ + "type": "s3", + "access_key_id": "AKIAIOSFODNN7EXAMPLE", + "secret_access_key": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", + "endpoint_url": "http://minio:9000", + "default_bucket": "modelmesh-example-models", + "region": "us-south", + }, +} + +var StorageConfigDataPVC = map[string]interface{}{ + "pvc1": map[string]string{ + "type": "pvc", + "name": "models-pvc-1", + }, + "pvc2": map[string]string{ + "type": "pvc", + "name": "models-pvc-2", + }, +} + var BasicTLSConfig = map[string]interface{}{ "tls": map[string]interface{}{ "secretName": TLSSecretName, @@ -71,4 +93,5 @@ const ( IsvcSamplesPath = "isvcs/" RuntimeSamplesPath = "runtimes/" TLSSecretName = "fvt-tls-secret" + StorageConfigSecretName = "storage-config" ) diff --git a/fvt/helpers.go b/fvt/helpers.go index e586bc46..84b4e6de 100644 --- a/fvt/helpers.go +++ b/fvt/helpers.go @@ -4,13 +4,14 @@ // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + package fvt import ( @@ -20,6 +21,8 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + api "github.com/kserve/modelmesh-serving/apis/serving/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" utilrand "k8s.io/apimachinery/pkg/util/rand" @@ -59,22 +62,22 @@ func CreatePredictorAndWaitAndExpectLoaded(predictorManifest *unstructured.Unstr createdPredictor := FVTClientInstance.CreatePredictorExpectSuccess(predictorManifest) ExpectPredictorState(createdPredictor, false, "Pending", "", "UpToDate") - By("Waiting for predictor" + predictorName + " to be 'Loaded'") + By("Waiting for predictor " + predictorName + " to be 'Loaded'") // TODO: "Standby" (or) "FailedToLoad" states are currently encountered after the "Loading" state but they shouldn't be (see issue#994) resultingPredictor := WaitForLastStateInExpectedList("activeModelState", []string{"Pending", "Loading", "Standby", "FailedToLoad", "Loading", "Loaded"}, watcher) ExpectPredictorState(resultingPredictor, true, "Loaded", "", "UpToDate") return resultingPredictor } -func CreateIsvcAndWaitAndExpectReady(isvcManifest *unstructured.Unstructured) *unstructured.Unstructured { +func CreateIsvcAndWaitAndExpectReady(isvcManifest *unstructured.Unstructured, timeout time.Duration) *unstructured.Unstructured { isvcName := isvcManifest.GetName() By("Creating inference service " + isvcName) - watcher := FVTClientInstance.StartWatchingIsvcs(metav1.ListOptions{FieldSelector: "metadata.name=" + isvcName}, DefaultTimeout) + watcher := FVTClientInstance.StartWatchingIsvcs(metav1.ListOptions{FieldSelector: "metadata.name=" + isvcName}, int64(timeout.Seconds())) defer watcher.Stop() FVTClientInstance.CreateIsvcExpectSuccess(isvcManifest) - By("Waiting for inference service" + isvcName + " to be 'Ready'") + By("Waiting for inference service " + isvcName + " to be 'Ready' and model is 'Loaded'") // ISVC does not have the status field set initially. - resultingIsvc := WaitForIsvcReady(watcher) + resultingIsvc := WaitForIsvcState(watcher, []api.ModelState{api.Standby, api.Loaded}, isvcName, timeout) return resultingIsvc } @@ -87,13 +90,25 @@ func CreatePredictorAndWaitAndExpectFailed(predictorManifest *unstructured.Unstr createdPredictor := FVTClientInstance.CreatePredictorExpectSuccess(predictorManifest) ExpectPredictorState(createdPredictor, false, "Pending", "", "UpToDate") - By("Waiting for predictor" + predictorName + " to be 'FailedToLoaded'") - // "Standby" state is encountered after the "Loading" state but it shouldn't be + By("Waiting for predictor " + predictorName + " to have 'FailedToLoad'") + // "Standby" state is encountered after the "Loading" state, but it shouldn't be resultingPredictor := WaitForLastStateInExpectedList("activeModelState", []string{"Pending", "Loading", "Standby", "Loading", "FailedToLoad"}, watcher) ExpectPredictorState(resultingPredictor, false, "FailedToLoad", "", "UpToDate") return resultingPredictor } +func CreateIsvcAndWaitAndExpectFailed(isvcManifest *unstructured.Unstructured) *unstructured.Unstructured { + isvcName := isvcManifest.GetName() + By("Creating inference service " + isvcName) + watcher := FVTClientInstance.StartWatchingIsvcs(metav1.ListOptions{FieldSelector: "metadata.name=" + isvcName}, DefaultTimeout) + defer watcher.Stop() + FVTClientInstance.CreateIsvcExpectSuccess(isvcManifest) + By("Waiting for inference service " + isvcName + " to fail") + // ISVC does not have the status field set initially. + resultingIsvc := WaitForIsvcState(watcher, []api.ModelState{api.FailedToLoad}, isvcName, PredictorTimeout) + return resultingIsvc +} + func CreatePredictorAndWaitAndExpectInvalidSpec(predictorManifest *unstructured.Unstructured) *unstructured.Unstructured { predictorName := predictorManifest.GetName() @@ -103,7 +118,7 @@ func CreatePredictorAndWaitAndExpectInvalidSpec(predictorManifest *unstructured. createdPredictor := FVTClientInstance.CreatePredictorExpectSuccess(predictorManifest) ExpectPredictorState(createdPredictor, false, "Pending", "", "UpToDate") - By("Waiting for predictor" + predictorName + " to have transitionStatus 'InvalidSpec'") + By("Waiting for predictor " + predictorName + " to have transitionStatus 'InvalidSpec'") return WaitForLastStateInExpectedList("transitionStatus", []string{"UpToDate", "InvalidSpec"}, watcher) } @@ -156,8 +171,10 @@ func ExpectPredictorState(obj *unstructured.Unstructured, available bool, active actualTransitionStatus := GetString(obj, "status", "transitionStatus") Expect(actualTransitionStatus).To(Equal(transitionStatus)) - if transitionStatus != "BlockedByFailedLoad" && transitionStatus != "InvalidSpec" && - activeModelState != "FailedToLoad" && targetModelState != "FailedToLoad" { + if transitionStatus != string(api.BlockedByFailedLoad) && + transitionStatus != string(api.InvalidSpec) && + activeModelState != string(api.FailedToLoad) && + targetModelState != string(api.FailedToLoad) { actualFailureInfo := GetMap(obj, "status", "lastFailureInfo") Expect(actualFailureInfo).To(BeNil()) } @@ -174,7 +191,7 @@ func ExpectPredictorFailureInfo(obj *unstructured.Unstructured, reason string, h Expect(actualFailureInfo["location"]).To(BeNil()) } if message != "" { - Expect(actualFailureInfo["message"]).To(Equal(message)) + Expect(actualFailureInfo["message"]).To(ContainSubstring(message)) } else { Expect(actualFailureInfo["message"]).ToNot(BeEmpty()) } @@ -188,47 +205,95 @@ func ExpectPredictorFailureInfo(obj *unstructured.Unstructured, reason string, h } } -func WaitForIsvcReady(watcher watch.Interface) *unstructured.Unstructured { +func ExpectIsvcState(obj *unstructured.Unstructured, activeModelState, targetModelState, transitionStatus string) { + actualActiveModelState := GetString(obj, "status", "modelStatus", "states", "activeModelState") + Expect(actualActiveModelState).To(Equal(activeModelState)) + + actualTargetModel := GetString(obj, "status", "modelStatus", "states", "targetModelState") + Expect(actualTargetModel).To(Equal(targetModelState)) + + actualTransitionStatus := GetString(obj, "status", "modelStatus", "transitionStatus") + Expect(actualTransitionStatus).To(Equal(transitionStatus)) + + if transitionStatus != "BlockedByFailedLoad" && + transitionStatus != "InvalidSpec" && + activeModelState != string(api.FailedToLoad) && + targetModelState != string(api.FailedToLoad) { + actualFailureInfo := GetMap(obj, "status", "modelStatus", "lastFailureInfo") + Expect(actualFailureInfo).To(BeNil()) + } +} + +func ExpectIsvcFailureInfo(obj *unstructured.Unstructured, reason string, hasLocation bool, hasTime bool, message string) { + actualFailureInfo := GetMap(obj, "status", "modelStatus", "lastFailureInfo") + Expect(actualFailureInfo).ToNot(BeNil()) + + Expect(actualFailureInfo["reason"]).To(Equal(reason)) + if hasLocation { + Expect(actualFailureInfo["location"]).ToNot(BeEmpty()) + } else { + Expect(actualFailureInfo["location"]).To(BeNil()) + } + if message != "" { + Expect(actualFailureInfo["message"]).To(ContainSubstring(message)) + } else { + Expect(actualFailureInfo["message"]).ToNot(BeEmpty()) + } + if !hasTime { + Expect(actualFailureInfo["time"]).To(BeNil()) + } else { + Expect(actualFailureInfo["time"]).ToNot(BeNil()) + actualTime, err := time.Parse(time.RFC3339, actualFailureInfo["time"].(string)) + Expect(err).To(BeNil()) + Expect(time.Since(actualTime) < time.Minute).To(BeTrue()) + } +} + +func WaitForIsvcState(watcher watch.Interface, anyOfDesiredStates []api.ModelState, name string, isvcTimeout time.Duration) *unstructured.Unstructured { ch := watcher.ResultChan() - isReady := false + reachedDesiredState := false var obj *unstructured.Unstructured - var isvcName string + var isvcName = name - timeout := time.After(predictorTimeout) + timeout := time.After(isvcTimeout) done := false for !done { select { // Exit the loop if InferenceService is not ready before given timeout. case <-timeout: done = true + FVTClientInstance.PrintDescribeIsvc(isvcName) case event, ok := <-ch: if !ok { // the channel was closed (watcher timeout reached) done = true + FVTClientInstance.PrintDescribeIsvc(isvcName) break } obj, ok = event.Object.(*unstructured.Unstructured) Expect(ok).To(BeTrue()) isvcName = GetString(obj, "metadata", "name") - conditions, exists := GetSlice(obj, "status", "conditions") + // ISVC does not have the status field set initially + // modelStatus will not exist until status.conditions exist + _, exists := GetSlice(obj, "status", "conditions") if !exists { time.Sleep(time.Second) continue } - for _, condition := range conditions { - conditionMap := condition.(map[string]interface{}) - if conditionMap["type"] == "Ready" { - if conditionMap["status"] == "True" { - isReady = true - done = true - break - } + // Note: first status.conditions[{"Type": "Ready", "Status": "True"}] can + // occur before status.conditions[{"Type": "Ready", "Status": "False"}] + // so we check for "activeModelState" instead + activeModelState := GetString(obj, "status", "modelStatus", "states", "activeModelState") + for _, desiredState := range anyOfDesiredStates { + if activeModelState == string(desiredState) { + reachedDesiredState = true + done = true + break } } - } } - Expect(isReady).To(BeTrue(), "Timeout before InferenceService '%s' ready", isvcName) + Expect(reachedDesiredState).To(BeTrue(), "Timeout before InferenceService '%s' reached any of the activeModelStates %s", isvcName, anyOfDesiredStates) return obj } @@ -244,7 +309,7 @@ func WaitForLastStateInExpectedList(statusAttribute string, expectedStates []str lastState := "UNSEEN" var predictorName string - timeout := time.After(predictorTimeout) + timeout := time.After(PredictorTimeout) lastStateIndex := 0 done := false for !done { @@ -292,26 +357,43 @@ func WaitForLastStateInExpectedList(statusAttribute string, expectedStates []str return obj } -func WaitForStableActiveDeployState() { +func WaitForStableActiveDeployState(timeToStabilize time.Duration) { watcher := FVTClientInstance.StartWatchingDeploys() defer watcher.Stop() - WaitForDeployStatus(watcher, timeForStatusToStabilize) + WaitForRuntimeDeploymentsToBeStable(timeToStabilize, watcher) } -func WaitForDeployStatus(watcher watch.Interface, timeToStabilize time.Duration) { +func WaitForRuntimeDeploymentsToBeStable(timeToStabilize time.Duration, watcher watch.Interface) { ch := watcher.ResultChan() - targetStateReached := false var obj *unstructured.Unstructured var replicas, updatedReplicas, availableReplicas int64 var deployName string - deployStatusesReady := make(map[string]bool) + deploymentReady := make(map[string]bool) + + // Get a list of ServingRuntime deployments + runtimeDeploys := FVTClientInstance.ListDeploys() + for _, deploy := range runtimeDeploys.Items { + // initialize all deployment statuses as not ready + deploymentReady[deploy.Name] = false + } + timeout := timeToStabilize + allReady := false done := false for !done { select { + // The select statement is only used with channels to let a goroutine wait on multiple communication operations. + // The select blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready. + case <-time.After(timeout): + // if no watcher events came in for the given length of timeForStatusToStabilize + // then we assume the deployment status has stabilized and exit the loop + Log.Info(fmt.Sprintf("Timed out after %v without events", timeout)) + done = true + break case event, ok := <-ch: if !ok { - // the channel was closed (watcher timeout reached) + // the channel was closed (watcher timeout reached, see DefaultTimeout) + Log.Info(fmt.Sprintf("Watcher timed out after %v seconds", DefaultTimeout)) done = true break } @@ -325,35 +407,36 @@ func WaitForDeployStatus(watcher watch.Interface, timeToStabilize time.Duration) Log.Info("Watcher got event with object", "name", deployName, - "status.replicas", replicas, - "status.availableReplicas", availableReplicas, - "status.updatedReplicas", updatedReplicas) + "replicas", replicas, + "available", availableReplicas, + "updated", updatedReplicas) if (updatedReplicas == replicas) && (availableReplicas == updatedReplicas) { - deployStatusesReady[deployName] = true - Log.Info(fmt.Sprintf("deployStatusesReady: %v", deployStatusesReady)) - } else { - deployStatusesReady[deployName] = false - } - - // this case executes if no events are received during the timeToStabilize duration - case <-time.After(timeToStabilize): - // check if all deployments are ready - stable := true - for _, status := range deployStatusesReady { - if !status { - stable = false - break + deploymentReady[deployName] = true + Log.Info(fmt.Sprintf("deployStatusesReady: %v", deploymentReady)) + // check if all deployments are ready + allReady = true + for _, thisReady := range deploymentReady { + allReady = allReady && thisReady } - } - if stable { - targetStateReached = true - done = true + // do not exit the loop yet (done=true), deployment my become unstable again, + // wait for timeForStatusToStabilize (see above) + if allReady { + // once we are ready, shorten time to wait for next event + // if we are truly ready no more event will come in + // if we are not yet ready, new events will come in quickly + timeout = TimeForStatusToStabilize + Log.Info(fmt.Sprintf("All deployments are ready: %v", deploymentReady)) + } + } else { + deploymentReady[deployName] = false + // restore the full time to wait in between watcher events + timeout = timeToStabilize } } } - Expect(targetStateReached).To(BeTrue(), "Timeout before deploy '%s' ready(last state was replicas: '%v' updatedReplicas: '%v' availableReplicas: '%v')", - deployName, replicas, updatedReplicas, availableReplicas) + + Expect(allReady).To(BeTrue(), fmt.Sprintf("Timed out before deployments were ready: %v", deploymentReady)) } func logPredictorStatus(obj *unstructured.Unstructured) []interface{} { diff --git a/fvt/inference.go b/fvt/inference.go index 2207b40c..740cafd2 100644 --- a/fvt/inference.go +++ b/fvt/inference.go @@ -4,7 +4,7 @@ // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, @@ -17,7 +17,6 @@ import ( "bytes" "encoding/binary" "encoding/json" - "io/ioutil" "math" "os" @@ -91,7 +90,7 @@ func ExpectSuccessfulInference_openvinoMnistTFSPredict(predictorName string) { Expect(err).ToNot(HaveOccurred()) Expect(inferResponse).ToNot(BeNil()) // NOTE: ModelSpec is not included in the response, so we can't assert on the name - // validate the the activation for the digit 7 is the maximum + // validate the activation for the digit 7 is the maximum activations, err := convertRawOutputContentsTo10Floats(inferResponse.Outputs["Plus214_Output_0"].TensorContent) max := activations[0] maxI := 0 @@ -106,7 +105,7 @@ func ExpectSuccessfulInference_openvinoMnistTFSPredict(predictorName string) { } func ExpectSuccessfulInference_torchserveMARPredict(predictorName string) { - imageBytes, err := ioutil.ReadFile(TestDataPath("0.png")) + imageBytes, err := os.ReadFile(TestDataPath("0.png")) Expect(err).ToNot(HaveOccurred()) inferRequest := &torchserveapi.PredictionsRequest{ diff --git a/fvt/predictor/isvc_test.go b/fvt/predictor/isvc_test.go deleted file mode 100644 index d02254ff..00000000 --- a/fvt/predictor/isvc_test.go +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2021 IBM Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package predictor - -import ( - . "github.com/kserve/modelmesh-serving/fvt" - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" -) - -type FVTInferenceService struct { - name string - inferenceServiceFileName string -} - -var inferenceArray = []FVTInferenceService{ - { - name: "New Format", - inferenceServiceFileName: "new-format-mm.yaml", - }, - { - name: "Old Format", - inferenceServiceFileName: "old-format-mm.yaml", - }, -} - -var _ = Describe("Inference service", Ordered, func() { - for _, i := range inferenceArray { - var _ = Describe("test "+i.name+" isvc", Ordered, func() { - var isvcName string - - It("should successfully load a model", func() { - isvcObject := NewIsvcForFVT(i.inferenceServiceFileName) - isvcName = isvcObject.GetName() - CreateIsvcAndWaitAndExpectReady(isvcObject) - - err := FVTClientInstance.ConnectToModelServing(Insecure) - Expect(err).ToNot(HaveOccurred()) - }) - - It("should successfully run inference", func() { - ExpectSuccessfulInference_sklearnMnistSvm(isvcName) - }) - - AfterAll(func() { - FVTClientInstance.DeleteIsvc(isvcName) - }) - - }) - } -}) diff --git a/fvt/predictor/predictor_suite_test.go b/fvt/predictor/predictor_suite_test.go index 776be4e4..96c98f49 100644 --- a/fvt/predictor/predictor_suite_test.go +++ b/fvt/predictor/predictor_suite_test.go @@ -14,16 +14,14 @@ package predictor import ( - "os" "testing" "time" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/log/zap" - . "github.com/kserve/modelmesh-serving/fvt" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" ) @@ -32,38 +30,11 @@ func TestPredictorSuite(t *testing.T) { RunSpecs(t, "Predictor suite") } -func createFVTClient() { - Log = zap.New(zap.UseDevMode(true), zap.WriteTo(GinkgoWriter)) - Log.Info("Initializing test suite") - - namespace := os.Getenv("NAMESPACE") - if namespace == "" { - namespace = DefaultTestNamespace - } - serviceName := os.Getenv("SERVICENAME") - if serviceName == "" { - serviceName = DefaultTestServiceName - } - controllerNamespace := os.Getenv("CONTROLLERNAMESPACE") - if controllerNamespace == "" { - controllerNamespace = DefaultControllerNamespace - } - NameSpaceScopeMode = os.Getenv("NAMESPACESCOPEMODE") == "true" - Log.Info("Using environment variables", "NAMESPACE", namespace, "SERVICENAME", serviceName, - "CONTROLLERNAMESPACE", controllerNamespace, "NAMESPACESCOPEMODE", NameSpaceScopeMode) - - var err error - FVTClientInstance, err = GetFVTClient(Log, namespace, serviceName, controllerNamespace) - Expect(err).ToNot(HaveOccurred()) - Expect(FVTClientInstance).ToNot(BeNil()) - Log.Info("FVTClientInstance created", "client", FVTClientInstance) -} - var _ = SynchronizedBeforeSuite(func() []byte { // runs *only* on process #1 - createFVTClient() + InitializeFVTClient() - // confirm 3 cluster serving runtimes or serving runtimes exist + // confirm 4 cluster serving runtimes or serving runtimes exist var err error var list *unstructured.UnstructuredList if NameSpaceScopeMode { @@ -75,27 +46,31 @@ var _ = SynchronizedBeforeSuite(func() []byte { Expect(list.Items).To(HaveLen(4)) FVTClientInstance.SetDefaultUserConfigMap() + // ensure that there are no predictors to start FVTClientInstance.DeleteAllPredictors() FVTClientInstance.DeleteAllIsvcs() - // ensure a stable deploy state - WaitForStableActiveDeployState() + // create TLS secrets before start of tests FVTClientInstance.CreateTLSSecrets() + // ensure a stable deploy state + WaitForStableActiveDeployState(time.Second * 30) + return nil }, func(_ []byte) { // runs on *all* processes // create the fvtClient Instance on every other process except the first, since it got created in the above function. if FVTClientInstance == nil { - createFVTClient() + InitializeFVTClient() } + Log.Info("Setup completed") }) var _ = SynchronizedAfterSuite(func() { // runs on *all* processes - // ensure we cleanup any port-forward + // ensure we clean up any port-forward FVTClientInstance.DisconnectFromModelServing() }, func() { // runs *only* on process #1 diff --git a/fvt/predictor/predictor_test.go b/fvt/predictor/predictor_test.go index bfc8e9ee..15d5ea5f 100644 --- a/fvt/predictor/predictor_test.go +++ b/fvt/predictor/predictor_test.go @@ -4,7 +4,7 @@ // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, @@ -123,6 +123,22 @@ var predictorsArray = []FVTPredictor{ // }, } +type FVTInferenceService struct { + name string + inferenceServiceFileName string +} + +var inferenceArray = []FVTInferenceService{ + { + name: "New Format", + inferenceServiceFileName: "new-format-mm.yaml", + }, + { + name: "Old Format", + inferenceServiceFileName: "old-format-mm.yaml", + }, +} + var _ = Describe("Predictor", func() { // Many tests in this block assume a stable state of scaled up deployments // which may not be the case if other Describe blocks run first. So we want to @@ -871,121 +887,6 @@ var _ = Describe("Predictor", func() { Expect(err.Error()).To(ContainSubstring("Unexpected : cannot reshape array")) }) }) - - var _ = Describe("TLS XGBoost inference", Ordered, Serial, func() { - var xgboostPredictorObject *unstructured.Unstructured - var xgboostPredictorName string - - AfterAll(func() { - FVTClientInstance.SetDefaultUserConfigMap() - }) - - It("should successfully run an inference with basic TLS", func() { - FVTClientInstance.UpdateConfigMapTLS(BasicTLSConfig) - - By("Waiting for the deployments replicas to be ready") - WaitForStableActiveDeployState() - - // load the test predictor object - xgboostPredictorObject = NewPredictorForFVT("xgboost-predictor.yaml") - xgboostPredictorName = xgboostPredictorObject.GetName() - CreatePredictorAndWaitAndExpectLoaded(xgboostPredictorObject) - - By("Creating a new connection to ModelServing") - // ensure we are establishing a new connection after the TLS change - FVTClientInstance.DisconnectFromModelServing() - - var timeAsleep int - var mmeshErr error - for timeAsleep <= 30 { - mmeshErr = FVTClientInstance.ConnectToModelServing(TLS) - - if mmeshErr == nil { - Log.Info("Successfully connected to model mesh tls") - break - } - - Log.Info(fmt.Sprintf("Error found, retrying connecting to model-mesh: %s", mmeshErr.Error())) - FVTClientInstance.DisconnectFromModelServing() - timeAsleep += 10 - time.Sleep(time.Second * time.Duration(timeAsleep)) - } - - Expect(mmeshErr).ToNot(HaveOccurred()) - - By("Expect inference to succeed") - ExpectSuccessfulInference_xgboostMushroom(xgboostPredictorName) - - By("Expect inference to succeed via REST proxy") - ExpectSuccessfulRESTInference_xgboostMushroom(xgboostPredictorName, true) - - // cleanup the predictor - FVTClientInstance.DeletePredictor(xgboostPredictorName) - - // disconnect because TLS config will change - FVTClientInstance.DisconnectFromModelServing() - }) - - It("should successfully run an inference with mutual TLS", func() { - FVTClientInstance.UpdateConfigMapTLS(MutualTLSConfig) - - By("Waiting for the deployments replicas to be ready") - WaitForStableActiveDeployState() - - // load the test predictor object - xgboostPredictorObject = NewPredictorForFVT("xgboost-predictor.yaml") - xgboostPredictorName = xgboostPredictorObject.GetName() - CreatePredictorAndWaitAndExpectLoaded(xgboostPredictorObject) - - By("Creating a new connection to ModelServing") - // ensure we are establishing a new connection after the TLS change - FVTClientInstance.DisconnectFromModelServing() - - var timeAsleep int - var mmeshErr error - for timeAsleep <= 30 { - mmeshErr = FVTClientInstance.ConnectToModelServing(MutualTLS) - - if mmeshErr == nil { - Log.Info("Successfully connected to model mesh tls") - break - } - - Log.Info(fmt.Sprintf("Error found, retrying connecting to model-mesh: %s", mmeshErr.Error())) - FVTClientInstance.DisconnectFromModelServing() - timeAsleep += 10 - time.Sleep(time.Second * time.Duration(timeAsleep)) - } - Expect(mmeshErr).ToNot(HaveOccurred()) - - By("Expect inference to succeed") - ExpectSuccessfulInference_xgboostMushroom(xgboostPredictorName) - - // cleanup the predictor - FVTClientInstance.DeletePredictor(xgboostPredictorName) - - // disconnect because TLS config will change - FVTClientInstance.DisconnectFromModelServing() - }) - - It("should fail to run inference when the server has mutual TLS but the client does not present a certificate", func() { - FVTClientInstance.UpdateConfigMapTLS(MutualTLSConfig) - - By("Waiting for the deployments replicas to be ready") - WaitForStableActiveDeployState() - - By("Expect a new connection to fail") - // since the connection switches to TLS, ensure we are establishing a new connection - FVTClientInstance.DisconnectFromModelServing() - // this test is expected to fail to connect due to the TLS cert, so we - // don't retry if it fails - mmeshErr := FVTClientInstance.ConnectToModelServing(TLS) - Expect(mmeshErr).To(HaveOccurred()) - }) - }) - // The TLS tests `Describe` block should be the last one in the list to - // improve efficiency of the tests. Any test after the TLS tests would need - // to wait for the configuration changes to roll out to all Deployments. }) // These tests verify that an invalid Predictor fails to load. These are in a @@ -1083,7 +984,7 @@ var _ = Describe("Non-ModelMesh ServingRuntime", func() { Expect(err).ToNot(HaveOccurred()) By("Waiting for the deployments replicas to be ready") - WaitForStableActiveDeployState() + WaitForStableActiveDeployState(TimeForStatusToStabilize) By("Checking that new ServingRuntime resource exists") FVTClientInstance.GetServingRuntime(runtimeName) @@ -1120,3 +1021,150 @@ var _ = Describe("Non-ModelMesh ServingRuntime", func() { FVTClientInstance.DeletePredictor(obj.GetName()) }) }) + +var _ = Describe("Inference service", Ordered, func() { + for _, i := range inferenceArray { + var _ = Describe("test "+i.name+" isvc", Ordered, func() { + var isvcName string + + It("should successfully load a model", func() { + isvcObject := NewIsvcForFVT(i.inferenceServiceFileName) + isvcName = isvcObject.GetName() + CreateIsvcAndWaitAndExpectReady(isvcObject, PredictorTimeout) + + err := FVTClientInstance.ConnectToModelServing(Insecure) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should successfully run inference", func() { + ExpectSuccessfulInference_sklearnMnistSvm(isvcName) + }) + + AfterAll(func() { + FVTClientInstance.DeleteIsvc(isvcName) + }) + + }) + } +}) + +// The TLS tests `Describe` block should be the last one in the list to +// improve efficiency of the tests. Any test after the TLS tests would need +// to wait for the configuration changes to roll out to all Deployments. +// The TLS tests must run "Serial" (not in parallel with any other tests) since +// the configmap changes trigger deployment rollouts causing runtime pods to +// get terminated and any concurrently running inference requests would fail as +// the gRPC connection to terminating pods breaks. +var _ = Describe("TLS XGBoost inference", Ordered, Serial, func() { + var xgboostPredictorObject *unstructured.Unstructured + var xgboostPredictorName string + + AfterAll(func() { + FVTClientInstance.SetDefaultUserConfigMap() + }) + + It("should successfully run an inference with basic TLS", func() { + By("Updating the user ConfigMap to for basic TLS") + FVTClientInstance.UpdateConfigMapTLS(BasicTLSConfig) + + By("Waiting for stable deploy state after UpdateConfigMapTLS") + WaitForStableActiveDeployState(time.Second * 20) + + // load the test predictor object + xgboostPredictorObject = NewPredictorForFVT("xgboost-predictor.yaml") + xgboostPredictorName = xgboostPredictorObject.GetName() + CreatePredictorAndWaitAndExpectLoaded(xgboostPredictorObject) + + By("Creating a new connection to ModelServing") + // ensure we are establishing a new connection after the TLS change + FVTClientInstance.DisconnectFromModelServing() + + var timeAsleep int + var mmeshErr error + for timeAsleep <= 30 { + mmeshErr = FVTClientInstance.ConnectToModelServing(TLS) + + if mmeshErr == nil { + Log.Info("Successfully connected to model mesh tls") + break + } + + Log.Info(fmt.Sprintf("Error found, retrying connecting to model-mesh: %s", mmeshErr.Error())) + FVTClientInstance.DisconnectFromModelServing() + timeAsleep += 10 + time.Sleep(time.Second * time.Duration(timeAsleep)) + } + + Expect(mmeshErr).ToNot(HaveOccurred()) + + By("Expect inference to succeed") + ExpectSuccessfulInference_xgboostMushroom(xgboostPredictorName) + + By("Expect inference to succeed via REST proxy") + ExpectSuccessfulRESTInference_xgboostMushroom(xgboostPredictorName, true) + + // cleanup the predictor + FVTClientInstance.DeletePredictor(xgboostPredictorName) + + // disconnect because TLS config will change + FVTClientInstance.DisconnectFromModelServing() + }) + + It("should successfully run an inference with mutual TLS", func() { + By("Updating user config for Mutual TLS") + FVTClientInstance.UpdateConfigMapTLS(MutualTLSConfig) + + By("Waiting for stable deploy state after MutualTLSConfig") + WaitForStableActiveDeployState(time.Second * 20) + + // load the test predictor object + xgboostPredictorObject = NewPredictorForFVT("xgboost-predictor.yaml") + xgboostPredictorName = xgboostPredictorObject.GetName() + CreatePredictorAndWaitAndExpectLoaded(xgboostPredictorObject) + + By("Creating a new connection to ModelServing") + // ensure we are establishing a new connection after the TLS change + FVTClientInstance.DisconnectFromModelServing() + + var timeAsleep int + var mmeshErr error + for timeAsleep <= 30 { + mmeshErr = FVTClientInstance.ConnectToModelServing(MutualTLS) + + if mmeshErr == nil { + Log.Info("Successfully connected to model mesh tls") + break + } + + Log.Info(fmt.Sprintf("Error found, retrying connecting to model-mesh: %s", mmeshErr.Error())) + FVTClientInstance.DisconnectFromModelServing() + timeAsleep += 10 + time.Sleep(time.Second * time.Duration(timeAsleep)) + } + Expect(mmeshErr).ToNot(HaveOccurred()) + + By("Expect inference to succeed") + ExpectSuccessfulInference_xgboostMushroom(xgboostPredictorName) + + // cleanup the predictor + FVTClientInstance.DeletePredictor(xgboostPredictorName) + + // disconnect because TLS config will change + FVTClientInstance.DisconnectFromModelServing() + }) + + It("should fail to run inference when the server has mutual TLS but the client does not present a certificate", func() { + FVTClientInstance.UpdateConfigMapTLS(MutualTLSConfig) + + By("Waiting for the deployments replicas to be ready") + WaitForStableActiveDeployState(TimeForStatusToStabilize) + + By("Expect a new connection to fail") + // since the connection switches to TLS, ensure we are establishing a new connection + FVTClientInstance.DisconnectFromModelServing() + // this test is expected to fail to connect due to the TLS cert, so we + // don't retry if it fails + mmeshErr := FVTClientInstance.ConnectToModelServing(TLS) + Expect(mmeshErr).To(HaveOccurred()) + }) +}) \ No newline at end of file diff --git a/fvt/scaleToZero/scaleToZero_suite_test.go b/fvt/scaleToZero/scaleToZero_suite_test.go index 56b78278..5e6f55bf 100644 --- a/fvt/scaleToZero/scaleToZero_suite_test.go +++ b/fvt/scaleToZero/scaleToZero_suite_test.go @@ -14,18 +14,14 @@ package scaleToZero import ( - "os" "testing" "time" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/log/zap" - . "github.com/kserve/modelmesh-serving/fvt" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" ) func TestScaleToZeroSuite(t *testing.T) { @@ -35,35 +31,10 @@ func TestScaleToZeroSuite(t *testing.T) { var _ = SynchronizedBeforeSuite(func() []byte { // runs *only* on process #1 - return nil -}, func(_ []byte) { - // runs on *all* processes - Log = zap.New(zap.UseDevMode(true), zap.WriteTo(GinkgoWriter)) - Log.Info("Initializing test suite") - - namespace := os.Getenv("NAMESPACE") - if namespace == "" { - namespace = DefaultTestNamespace - } - serviceName := os.Getenv("SERVICENAME") - if serviceName == "" { - serviceName = DefaultTestServiceName - } - controllerNamespace := os.Getenv("CONTROLLERNAMESPACE") - if controllerNamespace == "" { - controllerNamespace = DefaultControllerNamespace - } - NameSpaceScopeMode = os.Getenv("NAMESPACESCOPEMODE") == "true" - Log.Info("Using environment variables", "NAMESPACE", namespace, "SERVICENAME", serviceName, - "CONTROLLERNAMESPACE", controllerNamespace, "NAMESPACESCOPEMODE", NameSpaceScopeMode) + InitializeFVTClient() + // confirm 4 cluster serving runtimes or serving runtimes exist var err error - FVTClientInstance, err = GetFVTClient(Log, namespace, serviceName, controllerNamespace) - Expect(err).ToNot(HaveOccurred()) - Expect(FVTClientInstance).ToNot(BeNil()) - Log.Info("FVTClientInstance created", "client", FVTClientInstance) - - // confirm 3 cluster serving runtimes or serving runtimes var list *unstructured.UnstructuredList if NameSpaceScopeMode { list, err = FVTClientInstance.ListServingRuntimes(metav1.ListOptions{}) @@ -86,12 +57,23 @@ var _ = SynchronizedBeforeSuite(func() []byte { FVTClientInstance.DeleteAllPredictors() FVTClientInstance.DeleteAllIsvcs() + // ensure a stable deploy state + WaitForStableActiveDeployState(TimeForStatusToStabilize) + + return nil +}, func(_ []byte) { + // runs on *all* processes + // create the fvtClient Instance on every other process except the first, since it got created in the above function. + if FVTClientInstance == nil { + InitializeFVTClient() + } + Log.Info("Setup completed") }) var _ = SynchronizedAfterSuite(func() { // runs on *all* processes - // ensure we cleanup any port-forward + // ensure we clean up all port-forwards FVTClientInstance.DisconnectFromModelServing() }, func() { // runs *only* on process #1 diff --git a/fvt/scaleToZero/scale_to_zero_test.go b/fvt/scaleToZero/scale_to_zero_test.go index eb13c32e..c2703e85 100644 --- a/fvt/scaleToZero/scale_to_zero_test.go +++ b/fvt/scaleToZero/scale_to_zero_test.go @@ -67,7 +67,7 @@ var _ = Describe("Scaling of runtime deployments to zero", Ordered, func() { It("should scale all runtimes down", func() { By("Waiting for the deployments to stabilize") - WaitForStableActiveDeployState() + WaitForStableActiveDeployState(TimeForStatusToStabilize) // check that all runtimes are scaled to zero expectScaledToZero() @@ -82,7 +82,7 @@ var _ = Describe("Scaling of runtime deployments to zero", Ordered, func() { FVTClientInstance.ApplyPredictorExpectSuccess(testPredictorObject) By("Waiting for the deployments to stabilize") - WaitForStableActiveDeployState() + WaitForStableActiveDeployState(TimeForStatusToStabilize) // check that all runtimes except the one are scaled to zero expectScaledUp() @@ -101,7 +101,7 @@ var _ = Describe("Scaling of runtime deployments to zero", Ordered, func() { FVTClientInstance.ApplyPredictorExpectSuccess(testPredictorObject) By("Waiting for the deployments to stabilize") - WaitForStableActiveDeployState() + WaitForStableActiveDeployState(TimeForStatusToStabilize) // ensure the runtime is ready and scaled up and others are scaled down expectScaledUp() diff --git a/fvt/storage/storage_suite_test.go b/fvt/storage/storage_suite_test.go new file mode 100644 index 00000000..07bd2e77 --- /dev/null +++ b/fvt/storage/storage_suite_test.go @@ -0,0 +1,105 @@ +// Copyright 2023 IBM Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package storage + +import ( + "testing" + "time" + + . "github.com/kserve/modelmesh-serving/fvt" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" +) + +func TestStorage(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Storage Suite") +} + +var _ = SynchronizedBeforeSuite(func() []byte { + // runs *only* on process #1 + InitializeFVTClient() + + // confirm 4 cluster serving runtimes or serving runtimes exist + var err error + var list *unstructured.UnstructuredList + if NameSpaceScopeMode { + list, err = FVTClientInstance.ListServingRuntimes(metav1.ListOptions{}) + } else { + list, err = FVTClientInstance.ListClusterServingRuntimes(metav1.ListOptions{}) + } + Expect(err).ToNot(HaveOccurred()) + Expect(list.Items).To(HaveLen(4)) + + FVTClientInstance.SetDefaultUserConfigMap() + + // replace default storage-config secret for local Minio with PVC config + FVTClientInstance.DeleteStorageConfigSecret() + FVTClientInstance.CreateStorageConfigSecret(StorageConfigDataPVC) + + // ensure that there are no predictors to start + FVTClientInstance.DeleteAllPredictors() + FVTClientInstance.DeleteAllIsvcs() + + // ensure a stable deploy state, on each process since we updated the storage config + Log.Info("Waiting for stable deploy state") + WaitForStableActiveDeployState(time.Second * 60) + + return nil +}, func(_ []byte) { + // runs on *all* processes + // create the fvtClient Instance on every other process except the first, since it got created in the above function. + if FVTClientInstance == nil { + InitializeFVTClient() + } + // connect to model serving service once for all each process + err := FVTClientInstance.ConnectToModelServing(Insecure) + Expect(err).ToNot(HaveOccurred()) + + Log.Info("Setup completed") +}) + +var _ = SynchronizedAfterSuite(func() { + // runs on *all* processes + // ensure we clean up any port-forward for each process + FVTClientInstance.DisconnectFromModelServing() +}, func() { + // runs *only* on process #1 + // restore config defaults + FVTClientInstance.DeleteTLSSecrets() + FVTClientInstance.SetDefaultUserConfigMap() + // reset default storage-config secret + FVTClientInstance.DeleteStorageConfigSecret() + FVTClientInstance.CreateStorageConfigSecret(StorageConfigDataMinio) + // restart pods to reset Bootstrap failure checks + FVTClientInstance.RestartDeploys() +}) + +// register handlers for a failed test case to print info to the console +var startTime string +var _ = JustBeforeEach(func() { + startTime = time.Now().Format("2006-01-02T15:04:05Z") +}) +var _ = JustAfterEach(func() { + if CurrentSpecReport().Failed() { + FVTClientInstance.PrintIsvcs() + FVTClientInstance.PrintPods() + FVTClientInstance.PrintDescribeNodes() + FVTClientInstance.PrintEvents() + FVTClientInstance.TailPodLogs(startTime) + } +}) diff --git a/fvt/storage/storage_test.go b/fvt/storage/storage_test.go new file mode 100644 index 00000000..5c08628c --- /dev/null +++ b/fvt/storage/storage_test.go @@ -0,0 +1,189 @@ +// Copyright 2023 IBM Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.package storage + +package storage + +import ( + "time" + + . "github.com/kserve/modelmesh-serving/fvt" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" +) + +var isvcFiles = map[string]string{ + "isvc-pvc-storage-uri": "isvc-pvc-uri.yaml", + "isvc-pvc-storage-path": "isvc-pvc-path.yaml", + "isvc-pvc2": "isvc-pvc-2.yaml", + "isvc-pvc3": "isvc-pvc-3.yaml", + "isvc-pvc4": "isvc-pvc-4.yaml", +} + +// ISVCs using PVCs from the FVT `storage-config` Secret (config/dependencies/fvt.yaml) +var isvcWithPvcInStorageConfig = []string{"isvc-pvc-storage-uri", "isvc-pvc-storage-path", "isvc-pvc2"} + +// ISVC using PVC not in the FVT `storage-config` Secret (config/dependencies/fvt.yaml) +// this should work only after setting allowAnyPVC = true +var isvcWithPvcNotInStorageConfig = "isvc-pvc3" + +// ISVC using a PVC that does not exist at all, this ISVC should fail to load +var isvcWithNonExistentPvc = "isvc-pvc4" + +var _ = Describe("ISVCs", func() { + + Describe("with PVC in storage-config", Ordered, func() { + + for _, name := range isvcWithPvcInStorageConfig { + + Describe("\""+name+"\"", Ordered, func() { + var isvcName = name + var fileName = isvcFiles[name] + + AfterAll(func() { + FVTClientInstance.DeleteIsvc(isvcName) + }) + + It("should successfully load a model", func() { + isvcObject := NewIsvcForFVT(fileName) + isvcName = isvcObject.GetName() + CreateIsvcAndWaitAndExpectReady(isvcObject, PredictorTimeout) + }) + + It("should successfully run inference", func() { + ExpectSuccessfulInference_sklearnMnistSvm(isvcName) + }) + }) + } + }) + + Describe("with PVC not in storage-config", Ordered, Serial, func() { + var isvcObject *unstructured.Unstructured + + It("should fail with PVC not mounted", func() { + isvcObject = NewIsvcForFVT(isvcFiles[isvcWithPvcNotInStorageConfig]) + + obj := CreateIsvcAndWaitAndExpectFailed(isvcObject) + + By("Asserting on the ISVC state") + ExpectIsvcFailureInfo(obj, "ModelLoadFailed", true, true, "") + + FVTClientInstance.DeleteIsvc(isvcObject.GetName()) + }) + + It("should load a model when allowAnyPVC", func() { + // This ISVC needs a new PVC which is not in the storage-config secret. + // The controller will update the deployment with the pvc_mount, but + // if the old runtime pods are still around, the ISVC will get deployed + // on an old runtime pod without the PVC mounted and keep failing + // until a new pod with the PVC is ready and the controller decides + // to move the ISVC onto the new pod that has the PVC mounted. + // If this process takes a long time so, we may want to take some extra + // measures to increase our chances for quick success: + // - enable scale to zero? to prevent the new ISVC to land on an old + // runtime pod that does not have the "any" PVC mounted yet + // - use more than 1 pod per runtime? assumption: controller keeps the + // old runtime pod (without the PVC mounted) around (pre-stop hook, + // terminationGracePeriodSeconds: 90 # to allow for model propagation) + // since it still has the ISVC on it but no new pod yet to place it + // on. With 2 runtime pods, one pod can get updated with the PVC mount + // and the controller can "move" the ISVC without service interruption + // from the old to the new pod + + // make a shallow copy of default configmap (don't modify the DefaultConfig reference) + // keeping 1 pod per runtime and don't scale to 0 + config := make(map[string]interface{}) + for k, v := range DefaultConfig { + config[k] = v + } + // update the model-serving-config to allow any PVC + config["allowAnyPVC"] = true + + By("Updating the user config to allow any PVC") + FVTClientInstance.ApplyUserConfigMap(config) + + By("Deleting any PVC entries from the storage-config secret") + FVTClientInstance.DeleteStorageConfigSecret() + + // TODO: create a separate test case for not having storage secret at all. + // Currently that is not working. Runtime pod events without it: + // --- + // TYPE REASON MESSAGE + // Normal Scheduled Successfully assigned modelmesh-serving/modelmesh-serving-mlserver-0.x-54685b95d5-6xmck to 10.87.76.74 + // Warning FailedMount Unable to attach or mount volumes: unmounted volumes=[storage-config], unattached volumes=[models-dir models-pvc-3 storage-config tc-config etcd-config kube-api-access-pqz7t]: timed out waiting for the condition + // Warning FailedMount MountVolume.SetUp failed for volume "storage-config" : secret "storage-config" not found + // --- + // recreate the storage-config secret without the PVCs + FVTClientInstance.CreateStorageConfigSecret(StorageConfigDataMinio) + + // after changing the storage-config, the runtime pod(s) restart with + // updated PVC mounts, wait for stability + By("Waiting for stable deploy state") + WaitForStableActiveDeployState(time.Second * 30) + + isvcObject = NewIsvcForFVT(isvcFiles[isvcWithPvcNotInStorageConfig]) + + // print pods before deploying the predictor for debugging purposes + FVTClientInstance.PrintPods() + + // after mounting the new PVC the runtime pod(s) restart again, but + // the ISVC could have landed on the previous runtime pod without the + // PVC mounted yet and hence will fail to load the first time. + // So we extend the standard predictor timeout to wait a bit longer + extendedTimeout := PredictorTimeout * 2 + CreateIsvcAndWaitAndExpectReady(isvcObject, extendedTimeout) + + // print pods after the predictor is ready for debugging purposes + FVTClientInstance.PrintPods() + + // after adding predictor with allowAnyPVC, new pods get created with + // the new pvc mounts, so we want to wait for deployments to stabilize + // in order to avoid connecting to terminating pod which causes the + // port-forward got killed and needs to be re-established (rarely happens) + By("Waiting for stable deploy state before connecting gRPC connection") + WaitForStableActiveDeployState(time.Second * 10) + By("Connecting to model serving service") + err := FVTClientInstance.ConnectToModelServing(Insecure) + Expect(err).ToNot(HaveOccurred()) + + isvcName := isvcObject.GetName() + By("Running an inference request") + ExpectSuccessfulInference_sklearnMnistSvm(isvcName) + + FVTClientInstance.DeleteIsvc(isvcObject.GetName()) + }) + + It("should fail with non-existent PVC", func() { + // make a shallow copy of default configmap (don't modify the DefaultConfig reference) + // keeping 1 pod per runtime and don't scale to 0 + config := make(map[string]interface{}) + for k, v := range DefaultConfig { + config[k] = v + } + // update the model-serving-config to allow any PVC + config["allowAnyPVC"] = true + FVTClientInstance.ApplyUserConfigMap(config) + + By("Waiting for stable deploy state") + WaitForStableActiveDeployState(time.Second * 30) + + isvcObject = NewIsvcForFVT(isvcFiles[isvcWithNonExistentPvc]) + + obj := CreateIsvcAndWaitAndExpectFailed(isvcObject) + ExpectIsvcFailureInfo(obj, "ModelLoadFailed", true, true, "") + + FVTClientInstance.DeleteIsvc(isvcObject.GetName()) + }) + }) +}) diff --git a/fvt/testdata/isvcs/isvc-pvc-2.yaml b/fvt/testdata/isvcs/isvc-pvc-2.yaml new file mode 100644 index 00000000..a0655632 --- /dev/null +++ b/fvt/testdata/isvcs/isvc-pvc-2.yaml @@ -0,0 +1,12 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: isvc-pvc2 + annotations: + serving.kserve.io/deploymentMode: ModelMesh +spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: pvc://models-pvc-2/sklearn/mnist-svm.joblib diff --git a/fvt/testdata/isvcs/isvc-pvc-3.yaml b/fvt/testdata/isvcs/isvc-pvc-3.yaml new file mode 100644 index 00000000..e660226c --- /dev/null +++ b/fvt/testdata/isvcs/isvc-pvc-3.yaml @@ -0,0 +1,12 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: isvc-pvc3 + annotations: + serving.kserve.io/deploymentMode: ModelMesh +spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: pvc://models-pvc-3/sklearn/mnist-svm.joblib diff --git a/fvt/testdata/isvcs/isvc-pvc-4.yaml b/fvt/testdata/isvcs/isvc-pvc-4.yaml new file mode 100644 index 00000000..6ca2a04d --- /dev/null +++ b/fvt/testdata/isvcs/isvc-pvc-4.yaml @@ -0,0 +1,12 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: isvc-pvc4 + annotations: + serving.kserve.io/deploymentMode: ModelMesh +spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: pvc://models-pvc-4/sklearn/mnist-svm.joblib diff --git a/fvt/testdata/isvcs/isvc-pvc-path.yaml b/fvt/testdata/isvcs/isvc-pvc-path.yaml new file mode 100644 index 00000000..71db761c --- /dev/null +++ b/fvt/testdata/isvcs/isvc-pvc-path.yaml @@ -0,0 +1,16 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: isvc-pvc-storage-path + annotations: + serving.kserve.io/deploymentMode: ModelMesh +spec: + predictor: + model: + modelFormat: + name: sklearn + storage: + parameters: + type: pvc + name: models-pvc-1 + path: sklearn/mnist-svm.joblib diff --git a/fvt/testdata/isvcs/isvc-pvc-uri.yaml b/fvt/testdata/isvcs/isvc-pvc-uri.yaml new file mode 100644 index 00000000..5334b703 --- /dev/null +++ b/fvt/testdata/isvcs/isvc-pvc-uri.yaml @@ -0,0 +1,12 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: isvc-pvc-storage-uri + annotations: + serving.kserve.io/deploymentMode: ModelMesh +spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: pvc://models-pvc-1/sklearn/mnist-svm.joblib diff --git a/fvt/utils.go b/fvt/utils.go index 20452485..efaedceb 100644 --- a/fvt/utils.go +++ b/fvt/utils.go @@ -17,19 +17,48 @@ import ( "context" "encoding/json" "fmt" - "io/ioutil" "os" "path/filepath" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" yamlserializer "k8s.io/apimachinery/pkg/runtime/serializer/yaml" "k8s.io/apimachinery/pkg/types" + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) -//Utility function to return the testdata directory +func InitializeFVTClient() { + Log = zap.New(zap.UseDevMode(true), zap.WriteTo(GinkgoWriter)) + Log.Info("Initializing test suite") + + namespace := os.Getenv("NAMESPACE") + if namespace == "" { + namespace = DefaultTestNamespace + } + serviceName := os.Getenv("SERVICENAME") + if serviceName == "" { + serviceName = DefaultTestServiceName + } + controllerNamespace := os.Getenv("CONTROLLERNAMESPACE") + if controllerNamespace == "" { + controllerNamespace = DefaultControllerNamespace + } + NameSpaceScopeMode = os.Getenv("NAMESPACESCOPEMODE") == "true" + Log.Info("Using environment variables", "NAMESPACE", namespace, "SERVICENAME", serviceName, + "CONTROLLERNAMESPACE", controllerNamespace, "NAMESPACESCOPEMODE", NameSpaceScopeMode) + + var err error + FVTClientInstance, err = GetFVTClient(Log, namespace, serviceName, controllerNamespace) + Expect(err).ToNot(HaveOccurred()) + Expect(FVTClientInstance).ToNot(BeNil()) + Log.Info("FVTClientInstance created", "client", FVTClientInstance) +} + +// Utility function to return the testdata directory func TestDataPath(resourcePathWithinTestData string) string { wd, err := os.Getwd() Expect(err).ToNot(HaveOccurred()) @@ -38,7 +67,7 @@ func TestDataPath(resourcePathWithinTestData string) string { } func DecodeResourceFromFile(resourcePath string) *unstructured.Unstructured { - content, err := ioutil.ReadFile(resourcePath) + content, err := os.ReadFile(resourcePath) Expect(err).ToNot(HaveOccurred()) obj := &unstructured.Unstructured{} diff --git a/main.go b/main.go index 3ce4518c..84eafcea 100644 --- a/main.go +++ b/main.go @@ -18,7 +18,6 @@ import ( "context" "flag" "fmt" - "io/ioutil" "log" "os" "regexp" @@ -106,7 +105,7 @@ func main() { // ----- mmesh related envar setup ----- controllerNamespace := os.Getenv(ControllerNamespaceEnvVar) if controllerNamespace == "" { - bytes, err := ioutil.ReadFile(KubeNamespaceFile) + bytes, err := os.ReadFile(KubeNamespaceFile) if err != nil { //TODO check kube context and retrieve namespace from there setupLog.Info("Error reading Kube-mounted namespace file, reverting to default namespace", diff --git a/pkg/config/config.go b/pkg/config/config.go index 6cf11d9e..f8ba28d7 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -4,7 +4,7 @@ // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, diff --git a/pkg/mmesh/etcdrangewatcher_test.go b/pkg/mmesh/etcdrangewatcher_test.go index ad225daf..38c8b030 100644 --- a/pkg/mmesh/etcdrangewatcher_test.go +++ b/pkg/mmesh/etcdrangewatcher_test.go @@ -15,7 +15,7 @@ package mmesh import ( "fmt" - "io/ioutil" + "os" "testing" "github.com/stretchr/testify/assert" @@ -79,7 +79,7 @@ func Test_GetEtcdClientConfig_SuccessWithCertOverwrite(t *testing.T) { Certificate: "myCertificate", CertificateFile: certificateFile} - certData, _ := ioutil.ReadFile(certificateFile) + certData, _ := os.ReadFile(certificateFile) etcdClientConfig, err := getEtcdClientConfig(etcdConfig, map[string][]byte{ certificateFile: certData, }, logger) @@ -103,8 +103,8 @@ func Test_GetEtcdClientConfig_SuccessWithKeyAndCert(t *testing.T) { ClientCertificate: "myClientCertificate", ClientCertificateFile: certificateFile} - certData1, _ := ioutil.ReadFile(keyFile) - certData2, _ := ioutil.ReadFile(certificateFile) + certData1, _ := os.ReadFile(keyFile) + certData2, _ := os.ReadFile(certificateFile) etcdClientConfig, err := getEtcdClientConfig(etcdConfig, map[string][]byte{ keyFile: certData1, certificateFile: certData2, @@ -158,8 +158,8 @@ func Test_CreateEtcdClient_Success(t *testing.T) { ClientKeyFile: keyFile, ClientCertificateFile: certificateFile} - certData1, _ := ioutil.ReadFile(keyFile) - certData2, _ := ioutil.ReadFile(certificateFile) + certData1, _ := os.ReadFile(keyFile) + certData2, _ := os.ReadFile(certificateFile) etcdClient, err := CreateEtcdClient(etcdConfig, map[string][]byte{ keyFile: certData1, certificateFile: certData2, diff --git a/scripts/install.sh b/scripts/install.sh index a73ac028..f4662677 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -132,7 +132,7 @@ wait_for_pods_ready() { fi wait_counter=$((wait_counter + 1)) - echo " Waiting 10 secs..." + echo " Waiting 10 secs ..." sleep 10 done } @@ -254,7 +254,7 @@ if [[ $quickstart == "true" ]]; then info "Deploying quickstart resources for etcd and minio" kubectl apply -f quickstart.yaml - info "Waiting for dependent pods to be up..." + info "Waiting for dependent pods to be up ..." wait_for_pods_ready "-l app=etcd" wait_for_pods_ready "-l app=minio" fi @@ -264,7 +264,7 @@ if [[ $fvt == "true" ]]; then info "Deploying fvt resources for etcd and minio" kubectl apply -f fvt.yaml - info "Waiting for dependent pods to be up..." + info "Waiting for dependent pods to be up ..." wait_for_pods_ready "-l app=etcd" wait_for_pods_ready "-l app=minio" fi @@ -305,7 +305,7 @@ if [[ $namespace_scope_mode == "true" ]]; then rm crd/kustomization.yaml.bak fi -info "Waiting for ModelMesh Serving controller pod to be up..." +info "Waiting for ModelMesh Serving controller pod to be up ..." wait_for_pods_ready "-l control-plane=modelmesh-controller" # Older versions of kustomize have different load restrictor flag formats. @@ -344,4 +344,12 @@ if [[ $namespace_scope_mode != "true" ]] && [[ ! -z $user_ns_array ]]; then fi rm quickstart.yaml quickstart.yaml.bak fvt.yaml fvt.yaml.bak +# wait for FVT storage resources that take long to initialize +# we don't want to wait earlier to not hold up any setup steps +# that happen after the initial FVT install block +if [[ $fvt == "true" ]]; then + info "Waiting for FVT PVC storage to be initialized ..." + kubectl wait --for=condition=complete --timeout=180s job/pvc-init +fi + success "Successfully installed ModelMesh Serving!"