mm: introduce fault_env
The idea is borrowed from Peter's patch from the patchset on speculative page
faults [1]:

Instead of passing around the endless list of function arguments,
replace the lot with a single structure so we can change context without
endless function signature changes.

The changes are mostly mechanical, with the exception of the faultaround code:
filemap_map_pages() got reworked a bit.

This patch is preparation for the next one.

[1] http://lkml.kernel.org/r/20141020222841.302891540@infradead.org

Link: http://lkml.kernel.org/r/1466021202-61880-9-git-send-email-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
kiryl authored and torvalds committed Jul 26, 2016
1 parent dcddffd commit bae473a
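
The pattern the commit message describes -- collapsing a long per-call argument
list into one context structure -- keeps later changes local: adding a field
touches the struct, not every function signature on the fault path. A minimal
user-space C sketch of that pattern (illustration only; the names below are
made up for the example and are not taken from the patch):

#include <stdio.h>

/* Toy analogue of struct fault_env: everything the fault path needs. */
struct fault_ctx {
	unsigned long address;	/* faulting virtual address */
	unsigned int flags;	/* FAULT_FLAG_xxx-style flags */
};

/* Before: every helper repeats the full argument list. */
static int handle_old(unsigned long address, unsigned int flags)
{
	return printf("old: addr=%#lx flags=%#x\n", address, flags);
}

/* After: helpers take the context; adding a field touches no signatures. */
static int handle_new(const struct fault_ctx *ctx)
{
	return printf("new: addr=%#lx flags=%#x\n", ctx->address, ctx->flags);
}

int main(void)
{
	struct fault_ctx ctx = { .address = 0x1000, .flags = 0x01 };

	handle_old(ctx.address, ctx.flags);
	handle_new(&ctx);
	return 0;
}

In the patch itself the same idea appears as struct fault_env (added to
include/linux/mm.h below), which carries the vma, address, flags, pmd, pte and
ptl through the fault path.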
Showing 10 changed files with 475 additions and 516 deletions.
10 changes: 5 additions & 5 deletions Documentation/filesystems/Locking
@@ -548,13 +548,13 @@ subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
locked. The VM will unlock the page.

->map_pages() is called when VM asks to map easy accessible pages.
Filesystem should find and map pages associated with offsets from "pgoff"
till "max_pgoff". ->map_pages() is called with page table locked and must
Filesystem should find and map pages associated with offsets from "start_pgoff"
till "end_pgoff". ->map_pages() is called with page table locked and must
not block. If it's not possible to reach a page without blocking,
filesystem should skip it. Filesystem should use do_set_pte() to setup
page table entry. Pointer to entry associated with offset "pgoff" is
passed in "pte" field in vm_fault structure. Pointers to entries for other
offsets should be calculated relative to "pte".
page table entry. Pointer to entry associated with the page is passed in
"pte" field in fault_env structure. Pointers to entries for other offsets
should be calculated relative to "pte".

->page_mkwrite() is called when a previously read-only pte is
about to become writeable. The filesystem again must ensure that there are
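
To make the updated ->map_pages() documentation above concrete: the entry for
"start_pgoff" is what fe->pte points at, entries for later offsets are reached
by pointer arithmetic relative to it, and offsets that cannot be handled
without blocking are simply skipped. A small user-space model of that contract
(a sketch with stand-in types; this is not kernel code and not part of the
patch):

#include <stdio.h>

typedef unsigned long pte_t;	/* stand-in for the kernel's pte_t */
typedef unsigned long pgoff_t;

struct fault_env {		/* reduced stand-in, not the kernel struct */
	unsigned long address;	/* address corresponding to start_pgoff */
	pte_t *pte;		/* entry corresponding to start_pgoff */
};

static void model_map_pages(struct fault_env *fe,
			    pgoff_t start_pgoff, pgoff_t end_pgoff)
{
	pgoff_t pgoff;

	for (pgoff = start_pgoff; pgoff <= end_pgoff; pgoff++) {
		/* Entry for "pgoff", calculated relative to fe->pte. */
		pte_t *entry = fe->pte + (pgoff - start_pgoff);

		if (*entry)		/* already populated: skip it */
			continue;
		*entry = 0x100 + pgoff;	/* pretend we installed a mapping */
	}
}

int main(void)
{
	pte_t table[4] = { 0, 0, 42, 0 };
	struct fault_env fe = { .address = 0x1000, .pte = table };

	model_map_pages(&fe, 0, 3);
	for (int i = 0; i < 4; i++)
		printf("entry[%d] = %#lx\n", i, table[i]);
	return 0;
}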
22 changes: 11 additions & 11 deletions fs/userfaultfd.c
@@ -257,10 +257,9 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
* fatal_signal_pending()s, and the mmap_sem must be released before
* returning it.
*/
int handle_userfault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags, unsigned long reason)
int handle_userfault(struct fault_env *fe, unsigned long reason)
{
struct mm_struct *mm = vma->vm_mm;
struct mm_struct *mm = fe->vma->vm_mm;
struct userfaultfd_ctx *ctx;
struct userfaultfd_wait_queue uwq;
int ret;
@@ -269,7 +268,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
BUG_ON(!rwsem_is_locked(&mm->mmap_sem));

ret = VM_FAULT_SIGBUS;
ctx = vma->vm_userfaultfd_ctx.ctx;
ctx = fe->vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
goto out;

@@ -302,17 +301,17 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
* without first stopping userland access to the memory. For
* VM_UFFD_MISSING userfaults this is enough for now.
*/
if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) {
if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) {
/*
* Validate the invariant that nowait must allow retry
* to be sure not to return SIGBUS erroneously on
* nowait invocations.
*/
BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT);
BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
if (printk_ratelimit()) {
printk(KERN_WARNING
"FAULT_FLAG_ALLOW_RETRY missing %x\n", flags);
"FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags);
dump_stack();
}
#endif
@@ -324,18 +323,19 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
* and wait.
*/
ret = VM_FAULT_RETRY;
if (flags & FAULT_FLAG_RETRY_NOWAIT)
if (fe->flags & FAULT_FLAG_RETRY_NOWAIT)
goto out;

/* take the reference before dropping the mmap_sem */
userfaultfd_ctx_get(ctx);

init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
uwq.msg = userfault_msg(address, flags, reason);
uwq.msg = userfault_msg(fe->address, fe->flags, reason);
uwq.ctx = ctx;

return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
return_to_userland =
(fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
(FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);

spin_lock(&ctx->fault_pending_wqh.lock);
@@ -353,7 +353,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
TASK_KILLABLE);
spin_unlock(&ctx->fault_pending_wqh.lock);

must_wait = userfaultfd_must_wait(ctx, address, flags, reason);
must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason);
up_read(&mm->mmap_sem);

if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
20 changes: 5 additions & 15 deletions include/linux/huge_mm.h
@@ -1,20 +1,12 @@
#ifndef _LINUX_HUGE_MM_H
#define _LINUX_HUGE_MM_H

extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
unsigned int flags);
extern int do_huge_pmd_anonymous_page(struct fault_env *fe);
extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma);
extern void huge_pmd_set_accessed(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pmd_t orig_pmd, int dirty);
extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pmd_t orig_pmd);
extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd);
extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd);
extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
@@ -134,8 +126,7 @@ static inline int hpage_nr_pages(struct page *page)
return 1;
}

extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp);
extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd);

extern struct page *huge_zero_page;

@@ -196,8 +187,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
return NULL;
}

static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp)
static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd)
{
return 0;
}
34 changes: 26 additions & 8 deletions include/linux/mm.h
@@ -309,10 +309,27 @@ struct vm_fault {
* VM_FAULT_DAX_LOCKED and fill in
* entry here.
*/
/* for ->map_pages() only */
pgoff_t max_pgoff; /* map pages for offset from pgoff till
* max_pgoff inclusive */
pte_t *pte; /* pte entry associated with ->pgoff */
};

/*
* Page fault context: passes though page fault handler instead of endless list
* of function arguments.
*/
struct fault_env {
struct vm_area_struct *vma; /* Target VMA */
unsigned long address; /* Faulting virtual address */
unsigned int flags; /* FAULT_FLAG_xxx flags */
pmd_t *pmd; /* Pointer to pmd entry matching
* the 'address'
*/
pte_t *pte; /* Pointer to pte entry matching
* the 'address'. NULL if the page
* table hasn't been allocated.
*/
spinlock_t *ptl; /* Page table lock.
* Protects pte page table if 'pte'
* is not NULL, otherwise pmd.
*/
};

/*
@@ -327,7 +344,8 @@ struct vm_operations_struct {
int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
pmd_t *, unsigned int flags);
void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
void (*map_pages)(struct fault_env *fe,
pgoff_t start_pgoff, pgoff_t end_pgoff);

/* notification that a previously read-only page is about to become
* writable, if an error is returned it will cause a SIGBUS */
@@ -600,8 +618,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
return pte;
}

void do_set_pte(struct vm_area_struct *vma, unsigned long address,
struct page *page, pte_t *pte, bool write, bool anon);
void do_set_pte(struct fault_env *fe, struct page *page);
#endif

/*
@@ -2062,7 +2079,8 @@ extern void truncate_inode_pages_final(struct address_space *);

/* generic vm_area_ops exported for stackable file systems */
extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
extern void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf);
extern void filemap_map_pages(struct fault_env *fe,
pgoff_t start_pgoff, pgoff_t end_pgoff);
extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);

/* mm/page-writeback.c */
8 changes: 2 additions & 6 deletions include/linux/userfaultfd_k.h
@@ -27,8 +27,7 @@
#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)

extern int handle_userfault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags, unsigned long reason);
extern int handle_userfault(struct fault_env *fe, unsigned long reason);

extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len);
@@ -56,10 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
#else /* CONFIG_USERFAULTFD */

/* mm helpers */
static inline int handle_userfault(struct vm_area_struct *vma,
unsigned long address,
unsigned int flags,
unsigned long reason)
static inline int handle_userfault(struct fault_env *fe, unsigned long reason)
{
return VM_FAULT_SIGBUS;
}
28 changes: 14 additions & 14 deletions mm/filemap.c
@@ -2128,22 +2128,27 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);

void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
void filemap_map_pages(struct fault_env *fe,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
struct radix_tree_iter iter;
void **slot;
struct file *file = vma->vm_file;
struct file *file = fe->vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
loff_t size;
struct page *page;
unsigned long address = (unsigned long) vmf->virtual_address;
unsigned long addr;
pte_t *pte;

rcu_read_lock();
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) {
if (iter.index > vmf->max_pgoff)
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
start_pgoff) {
if (iter.index > end_pgoff)
break;
fe->pte += iter.index - last_pgoff;
fe->address += (iter.index - last_pgoff) << PAGE_SHIFT;
last_pgoff = iter.index;
if (!pte_none(*fe->pte))
goto next;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -2179,22 +2184,17 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
if (page->index >= size >> PAGE_SHIFT)
goto unlock;

pte = vmf->pte + page->index - vmf->pgoff;
if (!pte_none(*pte))
goto unlock;

if (file->f_ra.mmap_miss > 0)
file->f_ra.mmap_miss--;
addr = address + (page->index - vmf->pgoff) * PAGE_SIZE;
do_set_pte(vma, addr, page, pte, false, false);
do_set_pte(fe, page);
unlock_page(page);
goto next;
unlock:
unlock_page(page);
skip:
put_page(page);
next:
if (iter.index == vmf->max_pgoff)
if (iter.index == end_pgoff)
break;
}
rcu_read_unlock();