diff --git a/cmd/kubevirt-csi-driver/kubevirt-csi-driver.go b/cmd/kubevirt-csi-driver/kubevirt-csi-driver.go
index a0337039..94325402 100644
--- a/cmd/kubevirt-csi-driver/kubevirt-csi-driver.go
+++ b/cmd/kubevirt-csi-driver/kubevirt-csi-driver.go
@@ -22,11 +22,11 @@ import (
 )
 
 var (
-	endpoint               = flag.String("endpoint", "unix:/csi/csi.sock", "CSI endpoint")
-	nodeName               = flag.String("node-name", "", "The node name - the node this pods runs on")
-	infraClusterNamespace  = flag.String("infra-cluster-namespace", "", "The infra-cluster namespace")
-	infraClusterKubeconfig = flag.String("infra-cluster-kubeconfig", "", "the infra-cluster kubeconfig file. If not set, defaults to in cluster config.")
-	infraClusterLabels     = flag.String("infra-cluster-labels", "", "The infra-cluster labels to use when creating resources in infra cluster. 'name=value' fields separated by a comma")
+	endpoint                     = flag.String("endpoint", "unix:/csi/csi.sock", "CSI endpoint")
+	nodeName                     = flag.String("node-name", "", "The node name - the node this pods runs on")
+	infraClusterNamespace        = flag.String("infra-cluster-namespace", "", "The infra-cluster namespace")
+	infraClusterKubeconfig       = flag.String("infra-cluster-kubeconfig", "", "the infra-cluster kubeconfig file. If not set, defaults to in cluster config.")
+	infraClusterLabels           = flag.String("infra-cluster-labels", "", "The infra-cluster labels to use when creating resources in infra cluster. 'name=value' fields separated by a comma")
 	// infraStorageClassEnforcement = flag.String("infra-storage-class-enforcement", "", "A string encoded yaml that represents the policy of enforcing which infra storage classes are allowed in persistentVolume of type kubevirt")
 	infraStorageClassEnforcement = os.Getenv("INFRA_STORAGE_CLASS_ENFORCEMENT")
 
diff --git a/deploy/controller-infra/base/deploy.yaml b/deploy/controller-infra/base/deploy.yaml
index b1884dc7..d630c25d 100644
--- a/deploy/controller-infra/base/deploy.yaml
+++ b/deploy/controller-infra/base/deploy.yaml
@@ -36,7 +36,7 @@ spec:
             - "--tenant-cluster-kubeconfig=/var/run/secrets/tenantcluster/value"
            - "--run-node-service=false"
            - "--run-controller-service=true"
-            - --v=5
+            - "--v=5"
          ports:
            - name: healthz
              containerPort: 10301
@@ -76,10 +76,12 @@ spec:
         - name: csi-provisioner
          image: quay.io/openshift/origin-csi-external-provisioner:latest
          args:
-            - --csi-address=$(ADDRESS)
-            - --default-fstype=ext4
-            - --kubeconfig=/var/run/secrets/tenantcluster/value
-            - --v=5
+            - "--csi-address=$(ADDRESS)"
+            - "--default-fstype=ext4"
+            - "--kubeconfig=/var/run/secrets/tenantcluster/value"
+            - "--v=5"
+            - "--timeout=3m"
+            - "--retry-interval-max=1m"
          env:
            - name: ADDRESS
              value: /var/lib/csi/sockets/pluginproxy/csi.sock
@@ -91,9 +93,11 @@ spec:
         - name: csi-attacher
          image: quay.io/openshift/origin-csi-external-attacher:latest
          args:
-            - --csi-address=$(ADDRESS)
-            - --kubeconfig=/var/run/secrets/tenantcluster/value
-            - --v=5
+            - "--csi-address=$(ADDRESS)"
+            - "--kubeconfig=/var/run/secrets/tenantcluster/value"
+            - "--v=5"
+            - "--timeout=3m"
+            - "--retry-interval-max=1m"
          env:
            - name: ADDRESS
              value: /var/lib/csi/sockets/pluginproxy/csi.sock
@@ -109,9 +113,9 @@ spec:
         - name: csi-liveness-probe
          image: quay.io/openshift/origin-csi-livenessprobe:latest
          args:
-            - --csi-address=/csi/csi.sock
-            - --probe-timeout=3s
-            - --health-port=10301
+            - "--csi-address=/csi/csi.sock"
+            - "--probe-timeout=3s"
+            - "--health-port=10301"
          volumeMounts:
            - name: socket-dir
              mountPath: /csi
diff --git a/deploy/tenant/base/deploy.yaml b/deploy/tenant/base/deploy.yaml
index a84a0e73..14f612b2 100644
--- a/deploy/tenant/base/deploy.yaml
+++ b/deploy/tenant/base/deploy.yaml
@@ -169,6 +169,7 @@ spec:
             - "--node-name=$(KUBE_NODE_NAME)"
            - "--run-node-service=true"
            - "--run-controller-service=false"
+            - "--v=5"
          env:
            - name: KUBE_NODE_NAME
              valueFrom:
@@ -203,9 +204,9 @@ spec:
         - name: csi-node-driver-registrar
          image: quay.io/openshift/origin-csi-node-driver-registrar:latest
          args:
-            - --csi-address=$(ADDRESS)
-            - --kubelet-registration-path=$(DRIVER_REG_SOCK_PATH)
-            - --v=5
+            - "--csi-address=$(ADDRESS)"
+            - "--kubelet-registration-path=$(DRIVER_REG_SOCK_PATH)"
+            - "--v=5"
          lifecycle:
            preStop:
              exec:
@@ -227,9 +228,9 @@ spec:
         - name: csi-liveness-probe
          image: quay.io/openshift/origin-csi-livenessprobe:latest
          args:
-            - --csi-address=/csi/csi.sock
-            - --probe-timeout=3s
-            - --health-port=10300
+            - "--csi-address=/csi/csi.sock"
+            - "--probe-timeout=3s"
+            - "--health-port=10300"
          volumeMounts:
            - name: plugin-dir
              mountPath: /csi
diff --git a/pkg/kubevirt/client.go b/pkg/kubevirt/client.go
index ba5d3bba..a112bc04 100644
--- a/pkg/kubevirt/client.go
+++ b/pkg/kubevirt/client.go
@@ -68,7 +68,7 @@ func (c *client) RemoveVolumeFromVM(namespace string, vmName string, hotPlugRequ
 // EnsureVolumeAvailable checks to make sure the volume is available in the node before returning, checks for 2 minutes
 func (c *client) EnsureVolumeAvailable(namespace, vmName, volumeName string, timeout time.Duration) error {
 	return wait.PollImmediate(time.Second, timeout, func() (done bool, err error) {
-		vmi, err := c.virtClient.VirtualMachineInstance(namespace).Get(context.TODO(), vmName, &metav1.GetOptions{})
+		vmi, err := c.GetVirtualMachine(namespace, vmName)
 		if err != nil {
 			return false, err
 		}
@@ -85,7 +85,7 @@ func (c *client) EnsureVolumeAvailable(namespace, vmName, volumeName string, tim
 // EnsureVolumeAvailable checks to make sure the volume is available in the node before returning, checks for 2 minutes
 func (c *client) EnsureVolumeRemoved(namespace, vmName, volumeName string, timeout time.Duration) error {
 	return wait.PollImmediate(time.Second, timeout, func() (done bool, err error) {
-		vmi, err := c.virtClient.VirtualMachineInstance(namespace).Get(context.TODO(), vmName, &metav1.GetOptions{})
+		vmi, err := c.GetVirtualMachine(namespace, vmName)
 		if err != nil {
 			return false, err
 		}
diff --git a/pkg/service/controller.go b/pkg/service/controller.go
index 57b9f968..627d2db7 100644
--- a/pkg/service/controller.go
+++ b/pkg/service/controller.go
@@ -7,6 +7,7 @@ import (
 
 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
+	"k8s.io/apimachinery/pkg/util/wait"
 	kubevirtv1 "kubevirt.io/api/core/v1"
 
 	"github.com/container-storage-interface/spec/lib/go/csi"
@@ -195,7 +196,7 @@ func (c *ControllerService) DeleteVolume(ctx context.Context, req *csi.DeleteVol
 		return nil, err
 	}
 	dvName := req.VolumeId
-	klog.Infof("Removing data volume with %s", dvName)
+	klog.V(3).Infof("Removing data volume with %s", dvName)
 
 	err := c.virtClient.DeleteDataVolume(c.infraClusterNamespace, dvName)
 	if err != nil {
@@ -231,7 +232,7 @@ func (c *ControllerService) ControllerPublishVolume(
 	}
 
 	dvName := req.GetVolumeId()
-	klog.Infof("Attaching DataVolume %s to Node ID %s", dvName, req.NodeId)
+	klog.V(3).Infof("Attaching DataVolume %s to Node ID %s", dvName, req.NodeId)
 
 	// Get VM name
 	vmName, err := c.getVMNameByCSINodeID(req.NodeId)
@@ -247,7 +248,7 @@
 	bus := req.VolumeContext[busParameter]
 
 	// hotplug DataVolume to VM
-	klog.Infof("Start attaching DataVolume %s to VM %s. Volume name: %s. Serial: %s. Bus: %s", dvName, vmName, dvName, serial, bus)
+	klog.V(3).Infof("Start attaching DataVolume %s to VM %s. Volume name: %s. Serial: %s. Bus: %s", dvName, vmName, dvName, serial, bus)
 
 	addVolumeOptions := &kubevirtv1.AddVolumeOptions{
 		Name: dvName,
@@ -266,32 +267,58 @@
 		},
 	}
 
-	volumeFound := false
-	vm, err := c.virtClient.GetVirtualMachine(c.infraClusterNamespace, vmName)
+	if err := wait.ExponentialBackoff(wait.Backoff{
+		Duration: time.Second,
+		Steps:    5,
+		Factor:   2,
+		Cap:      time.Second * 30,
+	}, func() (bool, error) {
+		if err := c.addVolumeToVm(dvName, vmName, addVolumeOptions); err != nil {
+			klog.Infof("failed adding volume %s to VM %s, retrying, err: %v", dvName, vmName, err)
+			return false, nil
+		}
+		return true, nil
+	}); err != nil {
+		return nil, err
+	}
+
+	// Ensure that the csi-attacher and csi-provisioner --timeout values are > the timeout specified here so we don't get
+	// odd failures with detaching volumes.
+	err = c.virtClient.EnsureVolumeAvailable(c.infraClusterNamespace, vmName, dvName, time.Minute*2)
 	if err != nil {
+		klog.Errorf("volume %s failed to be ready in time (2m) in VM %s, %v", dvName, vmName, err)
 		return nil, err
 	}
+
+	klog.V(3).Infof("Successfully attached volume %s to VM %s", dvName, vmName)
+	return &csi.ControllerPublishVolumeResponse{}, nil
+}
+
+func (c *ControllerService) isVolumeAttached(dvName, vmName string) (bool, error) {
+	vm, err := c.virtClient.GetVirtualMachine(c.infraClusterNamespace, vmName)
+	if err != nil {
+		return false, err
+	}
 	for _, volumeStatus := range vm.Status.VolumeStatus {
 		if volumeStatus.Name == dvName {
-			volumeFound = true
-			break
+			return true, nil
 		}
 	}
+	return false, nil
+}
+
+func (c *ControllerService) addVolumeToVm(dvName, vmName string, addVolumeOptions *kubevirtv1.AddVolumeOptions) error {
+	volumeFound, err := c.isVolumeAttached(dvName, vmName)
+	if err != nil {
+		return err
+	}
 	if !volumeFound {
 		err = c.virtClient.AddVolumeToVM(c.infraClusterNamespace, vmName, addVolumeOptions)
 		if err != nil {
-			klog.Errorf("failed adding volume %s to VM %s, %v", dvName, vmName, err)
-			return nil, err
+			return err
 		}
 	}
-
-	err = c.virtClient.EnsureVolumeAvailable(c.infraClusterNamespace, vmName, dvName, time.Minute*2)
-	if err != nil {
-		klog.Errorf("volume %s failed to be ready in time in VM %s, %v", dvName, vmName, err)
-		return nil, err
-	}
-
-	return &csi.ControllerPublishVolumeResponse{}, nil
+	return nil
 }
 
 func (c *ControllerService) validateControllerUnpublishVolumeRequest(req *csi.ControllerUnpublishVolumeRequest) error {
@@ -314,7 +341,7 @@ func (c *ControllerService) ControllerUnpublishVolume(ctx context.Context, req *
 		return nil, err
 	}
 	dvName := req.VolumeId
-	klog.Infof("Detaching DataVolume %s from Node ID %s", dvName, req.NodeId)
+	klog.V(3).Infof("Detaching DataVolume %s from Node ID %s", dvName, req.NodeId)
 
 	// Get VM name
 	vmName, err := c.getVMNameByCSINodeID(req.NodeId)
@@ -322,11 +349,36 @@ func (c *ControllerService) ControllerUnpublishVolume(ctx context.Context, req *
 		return nil, err
 	}
 
-	vm, err := c.virtClient.GetVirtualMachine(c.infraClusterNamespace, vmName)
+	if err := wait.ExponentialBackoff(wait.Backoff{
+		Duration: time.Second,
+		Steps:    5,
+		Factor:   2,
+		Cap:      time.Second * 30,
+	}, func() (bool, error) {
+		if err := c.removeVolumeFromVm(dvName, vmName); err != nil {
+			klog.Infof("failed removing volume %s from VM %s, err: %v", dvName, vmName, err)
+			return false, nil
+		}
+		return true, nil
+	}); err != nil {
+		return nil, err
+	}
+
+	err = c.virtClient.EnsureVolumeRemoved(c.infraClusterNamespace, vmName, dvName, time.Minute*2)
 	if err != nil {
-		klog.Error("failed getting virtual machine " + vmName)
+		klog.Errorf("volume %s failed to be removed in time (2m) from VM %s, %v", dvName, vmName, err)
 		return nil, err
 	}
+
+	klog.V(3).Infof("Successfully unpublished volume %s from VM %s", dvName, vmName)
+	return &csi.ControllerUnpublishVolumeResponse{}, nil
+}
+
+func (c *ControllerService) removeVolumeFromVm(dvName, vmName string) error {
+	vm, err := c.virtClient.GetVirtualMachine(c.infraClusterNamespace, vmName)
+	if err != nil {
+		return err
+	}
 	removePossible := false
 	for _, volumeStatus := range vm.Status.VolumeStatus {
 		if volumeStatus.HotplugVolume != nil && volumeStatus.Name == dvName {
@@ -337,17 +389,10 @@ func (c *ControllerService) ControllerUnpublishVolume(ctx context.Context, req *
 		// Detach DataVolume from VM
 		err = c.virtClient.RemoveVolumeFromVM(c.infraClusterNamespace, vmName, &kubevirtv1.RemoveVolumeOptions{Name: dvName})
 		if err != nil {
-			klog.Error("failed removing volume " + dvName + " from VM " + vmName)
-			return nil, err
+			return err
 		}
 	}
-	err = c.virtClient.EnsureVolumeRemoved(c.infraClusterNamespace, vmName, dvName, time.Minute*2)
-	if err != nil {
-		klog.Errorf("volume %s failed to be removed in time from VM %s, %v", dvName, vmName, err)
-		return nil, err
-	}
-
-	return &csi.ControllerUnpublishVolumeResponse{}, nil
+	return nil
 }
 
 // ValidateVolumeCapabilities unimplemented
@@ -359,19 +404,27 @@ func (c *ControllerService) ValidateVolumeCapabilities(ctx context.Context, req
 	if len(req.VolumeCapabilities) == 0 {
 		return nil, status.Errorf(codes.InvalidArgument, "volumeCapabilities not provided for %s", req.VolumeId)
 	}
-
+	klog.V(3).Info("Calling volume capabilities")
 	for _, cap := range req.GetVolumeCapabilities() {
 		if cap.GetMount() == nil {
 			return nil, status.Error(codes.InvalidArgument, "mount type is undefined")
 		}
 	}
 
 	dvName := req.GetVolumeId()
+	klog.V(3).Infof("DataVolume name %s", dvName)
 	if _, err := c.virtClient.GetDataVolume(c.infraClusterNamespace, dvName); errors.IsNotFound(err) {
 		return nil, status.Errorf(codes.NotFound, "volume %s not found", req.GetVolumeId())
 	} else if err != nil {
 		return nil, err
 	}
+	klog.V(5).Infof("Returning capabilities %v", &csi.ValidateVolumeCapabilitiesResponse{
+		Confirmed: &csi.ValidateVolumeCapabilitiesResponse_Confirmed{
+			VolumeContext:      req.GetVolumeContext(),
+			VolumeCapabilities: req.GetVolumeCapabilities(),
+			Parameters:         req.GetParameters(),
+		},
+	})
 	return &csi.ValidateVolumeCapabilitiesResponse{
 		Confirmed: &csi.ValidateVolumeCapabilitiesResponse_Confirmed{
 			VolumeContext:      req.GetVolumeContext(),
diff --git a/pkg/service/node.go b/pkg/service/node.go
index f4b284c9..21b38690 100644
--- a/pkg/service/node.go
+++ b/pkg/service/node.go
@@ -111,7 +111,7 @@ func (n *NodeService) NodeStageVolume(_ context.Context, req *csi.NodeStageVolum
 	if err := n.validateNodeStageVolumeRequest(req); err != nil {
 		return nil, err
 	}
-	klog.Infof("Staging volume %s", req.VolumeId)
+	klog.V(3).Infof("Staging volume %s", req.VolumeId)
 
 	if req.VolumeCapability.GetMount() != nil {
 		// Filesystem volume mode, create FS if needed
@@ -125,13 +125,13 @@ func (n *NodeService) NodeStageVolume(_ context.Context, req *csi.NodeStageVolum
 
 		// is there a filesystem on this device?
 		if device.Fstype != "" {
-			klog.Infof("Detected fs %s", device.Fstype)
+			klog.V(3).Infof("Detected fs %s", device.Fstype)
 			return &csi.NodeStageVolumeResponse{}, nil
 		}
 
 		fsType := req.VolumeCapability.GetMount().FsType
 		// no filesystem - create it
-		klog.Infof("Creating FS %s on device %s", fsType, device)
+		klog.V(3).Infof("Creating FS %s on device %s", fsType, device)
 		err = n.fsMaker.Make(device.Path, fsType)
 		if err != nil {
 			klog.Errorf("Could not create filesystem %s on %s", fsType, device)
@@ -158,8 +158,10 @@ func (n *NodeService) validateNodeUnstageVolumeRequest(req *csi.NodeUnstageVolum
 // NodeUnstageVolume unstages a volume from the node
 func (n *NodeService) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstageVolumeRequest) (*csi.NodeUnstageVolumeResponse, error) {
 	if err := n.validateNodeUnstageVolumeRequest(req); err != nil {
+		klog.Errorf("Validate Node unstage failed %v", err)
 		return nil, err
 	}
+	klog.V(3).Info("Validate Node unstage completed")
 	// nothing to do here, we don't erase the filesystem of a device.
 	return &csi.NodeUnstageVolumeResponse{}, nil
 }
@@ -197,7 +199,7 @@ func (n *NodeService) validateNodePublishRequest(req *csi.NodePublishVolumeReque
 	return nil
 }
 
-//NodePublishVolume mounts the volume to the target path (req.GetTargetPath)
+// NodePublishVolume mounts the volume to the target path (req.GetTargetPath)
 func (n *NodeService) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error) {
 	if req != nil {
 		klog.V(3).Infof("Node Publish Request: %+v", *req)
@@ -226,7 +228,7 @@ func (n *NodeService) NodePublishVolume(ctx context.Context, req *csi.NodePublis
 		return nil, status.Error(codes.Internal, err.Error())
 	}
 	if !notMnt {
-		klog.Infof("Volume %s already mounted on %s", req.GetVolumeId(), targetPath)
+		klog.V(3).Infof("Volume %s already mounted on %s", req.GetVolumeId(), targetPath)
 		return &csi.NodePublishVolumeResponse{}, nil
 	}
 
@@ -242,9 +244,9 @@ func (n *NodeService) NodePublishVolume(ctx context.Context, req *csi.NodePublis
 		if err != nil {
 			return nil, err
 		}
-		klog.Infof("GetMount() %v", req.VolumeCapability.GetMount())
+		klog.V(3).Infof("GetMount() %v", req.VolumeCapability.GetMount())
 		fsType = req.VolumeCapability.GetMount().FsType
-		klog.Infof("Mounting devicePath %s, on targetPath: %s with FS type: %s",
+		klog.V(3).Infof("Mounting devicePath %s, on targetPath: %s with FS type: %s",
 			device, targetPath, fsType)
 	}
 
@@ -285,26 +287,29 @@ func (n *NodeService) validateNodeUnpublishRequest(req *csi.NodeUnpublishVolumeR
 	return nil
 }
 
-//NodeUnpublishVolume unmount the volume from the worker node
+// NodeUnpublishVolume unmount the volume from the worker node
 func (n *NodeService) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpublishVolumeRequest) (*csi.NodeUnpublishVolumeResponse, error) {
 	if req != nil {
 		klog.V(3).Infof("Node Unpublish Request: %+v", *req)
 	}
 	if err := n.validateNodeUnpublishRequest(req); err != nil {
+		klog.Errorf("Validate node unpublish failed %v", err)
 		return nil, err
 	}
 
 	targetPath := req.GetTargetPath()
-	klog.Infof("Unmounting %s", targetPath)
+	klog.V(5).Infof("Unmounting %s", targetPath)
 	err := n.mounter.Unmount(targetPath)
 	if err != nil {
-		klog.Infof("failed to unmount")
+		klog.Errorf("failed to unmount %v", err)
 		return nil, err
 	}
 
 	if err = os.RemoveAll(targetPath); err != nil {
+		klog.Errorf("failed to remove %s, %v", targetPath, err)
 		return nil, fmt.Errorf("remove target path: %w", err)
 	}
+	klog.V(3).Info("Validate Node unpublish completed")
 	return &csi.NodeUnpublishVolumeResponse{}, nil
 }
 
@@ -325,7 +330,7 @@ func (n *NodeService) NodeGetInfo(context.Context, *csi.NodeGetInfoRequest) (*cs
 	return &csi.NodeGetInfoResponse{NodeId: n.nodeID}, nil
 }
 
-//NodeGetCapabilities returns the supported capabilities of the node service
+// NodeGetCapabilities returns the supported capabilities of the node service
 func (n *NodeService) NodeGetCapabilities(context.Context, *csi.NodeGetCapabilitiesRequest) (*csi.NodeGetCapabilitiesResponse, error) {
 	caps := make([]*csi.NodeServiceCapability, 0, len(nodeCaps))
 	for _, c := range nodeCaps {