Skip to content

Commit

Permalink
Added sriov interface type support
Browse files Browse the repository at this point in the history
The type extracts PCI IDs from the SRIOV-VF-PCI-ADDR environment variable
set by the SR-IOV CNI plugin, and configures libvirt hostdev devices that
correspond to the extracted IDs.

The change adds host mounts for /sys and /dev to the launcher pod to allow
containerized libvirt / qemu to plug PCI devices using the vfio kernel
interface.

Note that at this moment, SR-IOV enabled VMIs run their virt-launcher
pods privileged to allow qemu to open /dev/vfio/NN devices. We don't know
in advance the name of the device until we create and start the pod,
at which point SR-IOV DP allocates a PCI ID to the pod that can be
mapped to its IOMMU group number and hence /dev/vfio/NN device.

In the future, SR-IOV DP will register the /dev/vfio/NN device with
device cgroup, at which point we will be able to drop the privileged
mode. (Additional capabilities like SYS_RESOURCE and SYS_RAWIO are
still needed.) This work is tracked in:
k8snetworkplumbingwg/sriov-network-device-plugin#26
  • Loading branch information
booxter authored and yossisegev committed Jan 3, 2019
1 parent 33858a3 commit 127ffd8
Show file tree
Hide file tree
Showing 16 changed files with 501 additions and 66 deletions.
4 changes: 4 additions & 0 deletions api/openapi-spec/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -4540,10 +4540,14 @@
},
"slirp": {
"$ref": "#/definitions/v1.InterfaceSlirp"
},
"sriov": {
"$ref": "#/definitions/v1.InterfaceSRIOV"
}
}
},
"v1.InterfaceBridge": {},
"v1.InterfaceSRIOV": {},
"v1.InterfaceSlirp": {},
"v1.KVMTimer": {
"properties": {
Expand Down
50 changes: 50 additions & 0 deletions cluster/examples/vmi-sriov.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
apiVersion: kubevirt.io/v1alpha2
kind: VirtualMachineInstance
metadata:
creationTimestamp: null
labels:
special: vmi-sriov
name: vmi-sriov
spec:
domain:
devices:
disks:
- disk:
bus: virtio
name: registrydisk
volumeName: registryvolume
- disk:
bus: virtio
name: cloudinitdisk
volumeName: cloudinitvolume
interfaces:
- bridge: {}
name: default
- name: sriov-net
sriov: {}
machine:
type: ""
resources:
limits:
intel.com/sriov: "1"
requests:
intel.com/sriov: "1"
memory: 1024M
networks:
- name: default
pod: {}
- multus:
networkName: sriov-net
name: sriov-net
terminationGracePeriodSeconds: 0
volumes:
- name: registryvolume
registryDisk:
image: registry:5000/kubevirt/fedora-cloud-registry-disk-demo:devel
- cloudInitNoCloud:
userData: |
#!/bin/bash
echo "fedora" |passwd fedora --stdin
dhclient eth1
name: cloudinitvolume
status: {}
25 changes: 25 additions & 0 deletions pkg/api/v1/deepcopy_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 24 additions & 2 deletions pkg/api/v1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions pkg/api/v1/schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,7 @@ type Interface struct {
type InterfaceBindingMethod struct {
// Bridge holds the bridge binding settings; it is omitted from JSON when nil.
Bridge *InterfaceBridge `json:"bridge,omitempty"`
// Slirp holds the slirp binding settings; it is omitted from JSON when nil.
Slirp *InterfaceSlirp `json:"slirp,omitempty"`
// SRIOV holds the SR-IOV binding settings; it is omitted from JSON when nil.
SRIOV *InterfaceSRIOV `json:"sriov,omitempty"`
}

// ---
Expand All @@ -813,6 +814,10 @@ type InterfaceBridge struct{}
// +k8s:openapi-gen=true
type InterfaceSlirp struct{}

// InterfaceSRIOV is the SR-IOV interface binding method. It currently
// declares no configurable fields.
// ---
// +k8s:openapi-gen=true
type InterfaceSRIOV struct{}

// Port represents a port to expose from the virtual machine.
// Default protocol TCP.
// The port field is mandatory
Expand Down
4 changes: 4 additions & 0 deletions pkg/api/v1/schema_swagger_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

67 changes: 65 additions & 2 deletions pkg/virt-controller/services/template.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,15 @@ func GetImagePullPolicy(store cache.Store) (policy k8sv1.PullPolicy, err error)
return
}

// isSRIOVVmi reports whether at least one interface declared in the VMI's
// domain devices uses the SR-IOV binding method (a non-nil SRIOV field).
func isSRIOVVmi(vmi *v1.VirtualMachineInstance) bool {
	interfaces := vmi.Spec.Domain.Devices.Interfaces
	for idx := range interfaces {
		if interfaces[idx].SRIOV == nil {
			// Not an SR-IOV interface; keep looking.
			continue
		}
		return true
	}
	return false
}

func (t *templateService) RenderLaunchManifest(vmi *v1.VirtualMachineInstance) (*k8sv1.Pod, error) {
precond.MustNotBeNil(vmi)
domain := precond.MustNotBeEmpty(vmi.GetObjectMeta().GetName())
Expand All @@ -125,6 +134,7 @@ func (t *templateService) RenderLaunchManifest(vmi *v1.VirtualMachineInstance) (
var volumes []k8sv1.Volume
var volumeDevices []k8sv1.VolumeDevice
var userId int64 = 0
// Privileged mode is disabled by default.
var privileged bool = false
var volumeMounts []k8sv1.VolumeMount
var imagePullSecrets []k8sv1.LocalObjectReference
Expand All @@ -149,6 +159,60 @@ func (t *templateService) RenderLaunchManifest(vmi *v1.VirtualMachineInstance) (
MountPath: "/var/run/libvirt",
})

if isSRIOVVmi(vmi) {
// libvirt needs this volume to unbind the device from kernel
// driver, and register it with vfio userspace driver
volumeMounts = append(volumeMounts, k8sv1.VolumeMount{
Name: "pci-bus",
MountPath: "/sys/bus/pci/",
})
volumes = append(volumes, k8sv1.Volume{
Name: "pci-bus",
VolumeSource: k8sv1.VolumeSource{
HostPath: &k8sv1.HostPathVolumeSource{
Path: "/sys/bus/pci/",
},
},
})

// libvirt needs this volume to determine iommu group assigned
// to the device
volumeMounts = append(volumeMounts, k8sv1.VolumeMount{
Name: "pci-devices",
MountPath: "/sys/devices/",
})
volumes = append(volumes, k8sv1.Volume{
Name: "pci-devices",
VolumeSource: k8sv1.VolumeSource{
HostPath: &k8sv1.HostPathVolumeSource{
Path: "/sys/devices/",
},
},
})

// libvirt uses vfio-pci to pass host devices through
volumeMounts = append(volumeMounts, k8sv1.VolumeMount{
Name: "dev-vfio",
MountPath: "/dev/vfio/",
})
volumes = append(volumes, k8sv1.Volume{
Name: "dev-vfio",
VolumeSource: k8sv1.VolumeSource{
HostPath: &k8sv1.HostPathVolumeSource{
Path: "/dev/vfio/",
},
},
})

// todo: revisit when SR-IOV DP registers /dev/vfio/NN with pod
// device group:
// https://github.com/intel/sriov-network-device-plugin/pull/26
//
// Run virt-launcher compute container privileged to allow qemu
// to open /dev/vfio/NN for PCI passthrough
privileged = true
}

serviceAccountName := ""

for _, volume := range vmi.Spec.Volumes {
Expand Down Expand Up @@ -443,8 +507,7 @@ func (t *templateService) RenderLaunchManifest(vmi *v1.VirtualMachineInstance) (
Image: t.launcherImage,
ImagePullPolicy: imagePullPolicy,
SecurityContext: &k8sv1.SecurityContext{
RunAsUser: &userId,
// Privileged mode is disabled.
RunAsUser: &userId,
Privileged: &privileged,
Capabilities: &k8sv1.Capabilities{
Add: capabilities,
Expand Down
42 changes: 42 additions & 0 deletions pkg/virt-controller/services/template_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -741,6 +741,48 @@ var _ = Describe("Template", func() {
})
})

Context("with sriov interface", func() {
	// newSRIOVVmi builds a minimal VMI whose single interface uses the
	// SR-IOV binding method; both specs below render a pod from it.
	newSRIOVVmi := func() *v1.VirtualMachineInstance {
		spec := v1.DomainSpec{}
		spec.Devices.Interfaces = []v1.Interface{{
			Name:                   "testnet",
			InterfaceBindingMethod: v1.InterfaceBindingMethod{SRIOV: &v1.InterfaceSRIOV{}},
		}}
		return &v1.VirtualMachineInstance{
			ObjectMeta: metav1.ObjectMeta{
				Name: "testvmi", Namespace: "default", UID: "1234",
			},
			Spec: v1.VirtualMachineInstanceSpec{Domain: spec},
		}
	}

	It("should run privileged", func() {
		pod, err := svc.RenderLaunchManifest(newSRIOVVmi())
		Expect(err).ToNot(HaveOccurred())

		containers := pod.Spec.Containers
		Expect(len(containers)).To(Equal(1))
		Expect(*containers[0].SecurityContext.Privileged).To(Equal(true))
	})
	It("should mount pci related host directories", func() {
		pod, err := svc.RenderLaunchManifest(newSRIOVVmi())
		Expect(err).ToNot(HaveOccurred())

		containers := pod.Spec.Containers
		Expect(len(containers)).To(Equal(1))

		// The first three mounts are generic for all launcher pods, so the
		// SR-IOV specific ones are expected at indices 3 through 5.
		mounts := containers[0].VolumeMounts
		Expect(mounts[3].MountPath).To(Equal("/sys/bus/pci/"))
		Expect(mounts[4].MountPath).To(Equal("/sys/devices/"))
		Expect(mounts[5].MountPath).To(Equal("/dev/vfio/"))

		// The matching host path volumes are expected first in the pod's
		// volume list, in the same order.
		podVolumes := pod.Spec.Volumes
		Expect(podVolumes[0].HostPath.Path).To(Equal("/sys/bus/pci/"))
		Expect(podVolumes[1].HostPath.Path).To(Equal("/sys/devices/"))
		Expect(podVolumes[2].HostPath.Path).To(Equal("/dev/vfio/"))
	})
})
Context("with slirp interface", func() {
It("Should have empty port list in the pod manifest", func() {
slirpInterface := v1.InterfaceSlirp{}
Expand Down
Loading

0 comments on commit 127ffd8

Please sign in to comment.