forked from torvalds/linux
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
xsk: add user memory registration support sockopt
In this commit the base structure of the AF_XDP address family is set up. Further, we introduce the ability to register a window of user memory to the kernel via the XDP_UMEM_REG setsockopt syscall. The memory window is viewed by an AF_XDP socket as a set of equally large frames. After a user memory registration all frames are "owned" by the user application, and not the kernel. v2: More robust checks on umem creation and unaccount on error. Call set_page_dirty_lock on cleanup. Simplified xdp_umem_reg. Co-authored-by: Magnus Karlsson <magnus.karlsson@intel.com> Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com> Signed-off-by: Björn Töpel <bjorn.topel@intel.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
- Loading branch information
1 parent
68e8b84
commit c0c77d8
Showing
8 changed files
with
596 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
/* SPDX-License-Identifier: GPL-2.0 | ||
* AF_XDP internal functions | ||
* Copyright(c) 2018 Intel Corporation. | ||
* | ||
* This program is free software; you can redistribute it and/or modify it | ||
* under the terms and conditions of the GNU General Public License, | ||
* version 2, as published by the Free Software Foundation. | ||
* | ||
* This program is distributed in the hope it will be useful, but WITHOUT | ||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
* more details. | ||
*/ | ||
|
||
#ifndef _LINUX_XDP_SOCK_H | ||
#define _LINUX_XDP_SOCK_H | ||
|
||
#include <linux/mutex.h> | ||
#include <net/sock.h> | ||
|
||
struct xdp_umem; | ||
|
||
struct xdp_sock { | ||
/* struct sock must be the first member of struct xdp_sock */ | ||
struct sock sk; | ||
struct xdp_umem *umem; | ||
/* Protects multiple processes in the control path */ | ||
struct mutex mutex; | ||
}; | ||
|
||
#endif /* _LINUX_XDP_SOCK_H */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note | ||
* | ||
* if_xdp: XDP socket user-space interface | ||
* Copyright(c) 2018 Intel Corporation. | ||
* | ||
* This program is free software; you can redistribute it and/or modify it | ||
* under the terms and conditions of the GNU General Public License, | ||
* version 2, as published by the Free Software Foundation. | ||
* | ||
* This program is distributed in the hope it will be useful, but WITHOUT | ||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
* more details. | ||
* | ||
* Author(s): Björn Töpel <bjorn.topel@intel.com> | ||
* Magnus Karlsson <magnus.karlsson@intel.com> | ||
*/ | ||
|
||
#ifndef _LINUX_IF_XDP_H | ||
#define _LINUX_IF_XDP_H | ||
|
||
#include <linux/types.h> | ||
|
||
/* XDP socket options */ | ||
#define XDP_UMEM_REG 3 | ||
|
||
struct xdp_umem_reg { | ||
__u64 addr; /* Start of packet data area */ | ||
__u64 len; /* Length of packet data area */ | ||
__u32 frame_size; /* Frame size */ | ||
__u32 frame_headroom; /* Frame head room */ | ||
}; | ||
|
||
#endif /* _LINUX_IF_XDP_H */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,245 @@ | ||
// SPDX-License-Identifier: GPL-2.0 | ||
/* XDP user-space packet buffer | ||
* Copyright(c) 2018 Intel Corporation. | ||
* | ||
* This program is free software; you can redistribute it and/or modify it | ||
* under the terms and conditions of the GNU General Public License, | ||
* version 2, as published by the Free Software Foundation. | ||
* | ||
* This program is distributed in the hope it will be useful, but WITHOUT | ||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
* more details. | ||
*/ | ||
|
||
#include <linux/init.h> | ||
#include <linux/sched/mm.h> | ||
#include <linux/sched/signal.h> | ||
#include <linux/sched/task.h> | ||
#include <linux/uaccess.h> | ||
#include <linux/slab.h> | ||
#include <linux/bpf.h> | ||
#include <linux/mm.h> | ||
|
||
#include "xdp_umem.h" | ||
|
||
#define XDP_UMEM_MIN_FRAME_SIZE 2048 | ||
|
||
int xdp_umem_create(struct xdp_umem **umem) | ||
{ | ||
*umem = kzalloc(sizeof(**umem), GFP_KERNEL); | ||
|
||
if (!(*umem)) | ||
return -ENOMEM; | ||
|
||
return 0; | ||
} | ||
|
||
static void xdp_umem_unpin_pages(struct xdp_umem *umem) | ||
{ | ||
unsigned int i; | ||
|
||
if (umem->pgs) { | ||
for (i = 0; i < umem->npgs; i++) { | ||
struct page *page = umem->pgs[i]; | ||
|
||
set_page_dirty_lock(page); | ||
put_page(page); | ||
} | ||
|
||
kfree(umem->pgs); | ||
umem->pgs = NULL; | ||
} | ||
} | ||
|
||
static void xdp_umem_unaccount_pages(struct xdp_umem *umem) | ||
{ | ||
if (umem->user) { | ||
atomic_long_sub(umem->npgs, &umem->user->locked_vm); | ||
free_uid(umem->user); | ||
} | ||
} | ||
|
||
static void xdp_umem_release(struct xdp_umem *umem) | ||
{ | ||
struct task_struct *task; | ||
struct mm_struct *mm; | ||
|
||
if (umem->pgs) { | ||
xdp_umem_unpin_pages(umem); | ||
|
||
task = get_pid_task(umem->pid, PIDTYPE_PID); | ||
put_pid(umem->pid); | ||
if (!task) | ||
goto out; | ||
mm = get_task_mm(task); | ||
put_task_struct(task); | ||
if (!mm) | ||
goto out; | ||
|
||
mmput(mm); | ||
umem->pgs = NULL; | ||
} | ||
|
||
xdp_umem_unaccount_pages(umem); | ||
out: | ||
kfree(umem); | ||
} | ||
|
||
static void xdp_umem_release_deferred(struct work_struct *work) | ||
{ | ||
struct xdp_umem *umem = container_of(work, struct xdp_umem, work); | ||
|
||
xdp_umem_release(umem); | ||
} | ||
|
||
void xdp_get_umem(struct xdp_umem *umem) | ||
{ | ||
atomic_inc(&umem->users); | ||
} | ||
|
||
void xdp_put_umem(struct xdp_umem *umem) | ||
{ | ||
if (!umem) | ||
return; | ||
|
||
if (atomic_dec_and_test(&umem->users)) { | ||
INIT_WORK(&umem->work, xdp_umem_release_deferred); | ||
schedule_work(&umem->work); | ||
} | ||
} | ||
|
||
static int xdp_umem_pin_pages(struct xdp_umem *umem) | ||
{ | ||
unsigned int gup_flags = FOLL_WRITE; | ||
long npgs; | ||
int err; | ||
|
||
umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL); | ||
if (!umem->pgs) | ||
return -ENOMEM; | ||
|
||
down_write(¤t->mm->mmap_sem); | ||
npgs = get_user_pages(umem->address, umem->npgs, | ||
gup_flags, &umem->pgs[0], NULL); | ||
up_write(¤t->mm->mmap_sem); | ||
|
||
if (npgs != umem->npgs) { | ||
if (npgs >= 0) { | ||
umem->npgs = npgs; | ||
err = -ENOMEM; | ||
goto out_pin; | ||
} | ||
err = npgs; | ||
goto out_pgs; | ||
} | ||
return 0; | ||
|
||
out_pin: | ||
xdp_umem_unpin_pages(umem); | ||
out_pgs: | ||
kfree(umem->pgs); | ||
umem->pgs = NULL; | ||
return err; | ||
} | ||
|
||
static int xdp_umem_account_pages(struct xdp_umem *umem) | ||
{ | ||
unsigned long lock_limit, new_npgs, old_npgs; | ||
|
||
if (capable(CAP_IPC_LOCK)) | ||
return 0; | ||
|
||
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; | ||
umem->user = get_uid(current_user()); | ||
|
||
do { | ||
old_npgs = atomic_long_read(&umem->user->locked_vm); | ||
new_npgs = old_npgs + umem->npgs; | ||
if (new_npgs > lock_limit) { | ||
free_uid(umem->user); | ||
umem->user = NULL; | ||
return -ENOBUFS; | ||
} | ||
} while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs, | ||
new_npgs) != old_npgs); | ||
return 0; | ||
} | ||
|
||
int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) | ||
{ | ||
u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; | ||
u64 addr = mr->addr, size = mr->len; | ||
unsigned int nframes, nfpp; | ||
int size_chk, err; | ||
|
||
if (!umem) | ||
return -EINVAL; | ||
|
||
if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { | ||
/* Strictly speaking we could support this, if: | ||
* - huge pages, or* | ||
* - using an IOMMU, or | ||
* - making sure the memory area is consecutive | ||
* but for now, we simply say "computer says no". | ||
*/ | ||
return -EINVAL; | ||
} | ||
|
||
if (!is_power_of_2(frame_size)) | ||
return -EINVAL; | ||
|
||
if (!PAGE_ALIGNED(addr)) { | ||
/* Memory area has to be page size aligned. For | ||
* simplicity, this might change. | ||
*/ | ||
return -EINVAL; | ||
} | ||
|
||
if ((addr + size) < addr) | ||
return -EINVAL; | ||
|
||
nframes = size / frame_size; | ||
if (nframes == 0 || nframes > UINT_MAX) | ||
return -EINVAL; | ||
|
||
nfpp = PAGE_SIZE / frame_size; | ||
if (nframes < nfpp || nframes % nfpp) | ||
return -EINVAL; | ||
|
||
frame_headroom = ALIGN(frame_headroom, 64); | ||
|
||
size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM; | ||
if (size_chk < 0) | ||
return -EINVAL; | ||
|
||
umem->pid = get_task_pid(current, PIDTYPE_PID); | ||
umem->size = (size_t)size; | ||
umem->address = (unsigned long)addr; | ||
umem->props.frame_size = frame_size; | ||
umem->props.nframes = nframes; | ||
umem->frame_headroom = frame_headroom; | ||
umem->npgs = size / PAGE_SIZE; | ||
umem->pgs = NULL; | ||
umem->user = NULL; | ||
|
||
umem->frame_size_log2 = ilog2(frame_size); | ||
umem->nfpp_mask = nfpp - 1; | ||
umem->nfpplog2 = ilog2(nfpp); | ||
atomic_set(&umem->users, 1); | ||
|
||
err = xdp_umem_account_pages(umem); | ||
if (err) | ||
goto out; | ||
|
||
err = xdp_umem_pin_pages(umem); | ||
if (err) | ||
goto out_account; | ||
return 0; | ||
|
||
out_account: | ||
xdp_umem_unaccount_pages(umem); | ||
out: | ||
put_pid(umem->pid); | ||
return err; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
/* SPDX-License-Identifier: GPL-2.0 | ||
* XDP user-space packet buffer | ||
* Copyright(c) 2018 Intel Corporation. | ||
* | ||
* This program is free software; you can redistribute it and/or modify it | ||
* under the terms and conditions of the GNU General Public License, | ||
* version 2, as published by the Free Software Foundation. | ||
* | ||
* This program is distributed in the hope it will be useful, but WITHOUT | ||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
* more details. | ||
*/ | ||
|
||
#ifndef XDP_UMEM_H_ | ||
#define XDP_UMEM_H_ | ||
|
||
#include <linux/mm.h> | ||
#include <linux/if_xdp.h> | ||
#include <linux/workqueue.h> | ||
|
||
#include "xdp_umem_props.h" | ||
|
||
struct xdp_umem { | ||
struct page **pgs; | ||
struct xdp_umem_props props; | ||
u32 npgs; | ||
u32 frame_headroom; | ||
u32 nfpp_mask; | ||
u32 nfpplog2; | ||
u32 frame_size_log2; | ||
struct user_struct *user; | ||
struct pid *pid; | ||
unsigned long address; | ||
size_t size; | ||
atomic_t users; | ||
struct work_struct work; | ||
}; | ||
|
||
int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); | ||
void xdp_get_umem(struct xdp_umem *umem); | ||
void xdp_put_umem(struct xdp_umem *umem); | ||
int xdp_umem_create(struct xdp_umem **umem); | ||
|
||
#endif /* XDP_UMEM_H_ */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
/* SPDX-License-Identifier: GPL-2.0 | ||
* XDP user-space packet buffer | ||
* Copyright(c) 2018 Intel Corporation. | ||
* | ||
* This program is free software; you can redistribute it and/or modify it | ||
* under the terms and conditions of the GNU General Public License, | ||
* version 2, as published by the Free Software Foundation. | ||
* | ||
* This program is distributed in the hope it will be useful, but WITHOUT | ||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
* more details. | ||
*/ | ||
|
||
#ifndef XDP_UMEM_PROPS_H_ | ||
#define XDP_UMEM_PROPS_H_ | ||
|
||
struct xdp_umem_props { | ||
u32 frame_size; | ||
u32 nframes; | ||
}; | ||
|
||
#endif /* XDP_UMEM_PROPS_H_ */ |
Oops, something went wrong.