From d672123ec4a34e23d0b5b25c2436d30774dffa42 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Thu, 18 Mar 2021 17:01:51 +1100 Subject: [PATCH] BACKPORT: FROMGIT: userfaultfd: support minor fault handling for shmem Patch series "userfaultfd: support minor fault handling for shmem", v2. Overview ======== See my original series [1] for a detailed overview of minor fault handling in general. The feature in this series works exactly like the hugetblfs version (from userspace's perspective). I'm sending this as a separate series because: - The original minor fault handling series has a full set of R-Bs, and seems close to being merged. So, it seems reasonable to start looking at this next step, which extends the basic functionality. - shmem is different enough that this series may require some additional work before it's ready, and I don't want to delay the original series unnecessarily by bundling them together. Use Case ======== In some cases it is useful to have VM memory backed by tmpfs instead of hugetlbfs. So, this feature will be used to support the same VM live migration use case described in my original series. Additionally, Android folks (Lokesh Gidra ) hope to optimize the Android Runtime garbage collector using this feature: "The plan is to use userfaultfd for concurrently compacting the heap. With this feature, the heap can be shared-mapped at another location where the GC-thread(s) could continue the compaction operation without the need to invoke userfault ioctl(UFFDIO_COPY) each time. OTOH, if and when Java threads get faults on the heap, UFFDIO_CONTINUE can be used to resume execution. Furthermore, this feature enables updating references in the 'non-moving' portion of the heap efficiently. Without this feature, uneccessary page copying (ioctl(UFFDIO_COPY)) would be required." [1] https://lore.kernel.org/linux-fsdevel/20210301222728.176417-1-axelrasmussen@google.com/T/#t This patch (of 5): Modify the userfaultfd register API to allow registering shmem VMAs in minor mode. Modify the shmem mcopy implementation to support UFFDIO_CONTINUE in order to resolve such faults. Combine the shmem mcopy handler functions into a single shmem_mcopy_atomic_pte, which takes a mode parameter. This matches how the hugetlbfs implementation is structured, and lets us remove a good chunk of boilerplate. Link: https://lkml.kernel.org/r/20210302000133.272579-1-axelrasmussen@google.com Link: https://lkml.kernel.org/r/20210302000133.272579-2-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Cc: Alexander Viro Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jerome Glisse Cc: Joe Perches Cc: Lokesh Gidra Cc: Mike Rapoport Cc: Peter Xu Cc: Shaohua Li Cc: Shuah Khan Cc: Wang Qing Cc: Brian Geffon Cc: Cannon Matthews Cc: "Dr . David Alan Gilbert" Cc: David Rientjes Cc: Michel Lespinasse Cc: Mina Almasry Cc: Oliver Upton Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell (cherry picked from commit 4cc6e15679966aa49afc5b114c3c83ba0ac39b05 https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git akpm) Link: https://lore.kernel.org/patchwork/patch/1388146/ Conflicts: mm/shmem.c (1. Manual rebase 2. Enclosed shmem_copy_atomic_pte() with CONFIG_USERFAULTFD to avoid compile erros when USERFAULTFD is not enabled.) Signed-off-by: Lokesh Gidra Bug: 160737021 Bug: 169683130 Change-Id: Idcd822b2a124a089121b9ad8c65061f6979126ec --- fs/userfaultfd.c | 6 +- include/linux/shmem_fs.h | 26 ++++----- include/uapi/linux/userfaultfd.h | 4 +- mm/memory.c | 8 ++- mm/shmem.c | 94 +++++++++++++++----------------- mm/userfaultfd.c | 27 ++++----- 6 files changed, 81 insertions(+), 84 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 58ec8ece8ae6..f35f0aba2faa 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1273,8 +1273,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, } if (vm_flags & VM_UFFD_MINOR) { - /* FIXME: Add minor fault interception for shmem. */ - if (!is_vm_hugetlb_page(vma)) + if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma))) return false; } @@ -1953,7 +1952,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR - uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS; + uffdio_api.features &= + ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a5a5d1d4d7b1..a9717c1a52a9 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -9,6 +9,7 @@ #include #include #include +#include /* inode in-kernel data */ @@ -118,21 +119,16 @@ static inline bool shmem_file(struct file *file) extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); +#ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM -extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - struct page **pagep); -extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr); -#else -#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ - src_addr, pagep) ({ BUG(); 0; }) -#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \ - dst_addr) ({ BUG(); 0; }) -#endif +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep); +#else /* !CONFIG_SHMEM */ +#define shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \ + src_addr, mode, pagep) ({ BUG(); 0; }) +#endif /* CONFIG_SHMEM */ +#endif /* CONFIG_USERFAULTFD */ #endif diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index bafbeb1a2624..47d9790d863d 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -31,7 +31,8 @@ UFFD_FEATURE_MISSING_SHMEM | \ UFFD_FEATURE_SIGBUS | \ UFFD_FEATURE_THREAD_ID | \ - UFFD_FEATURE_MINOR_HUGETLBFS) + UFFD_FEATURE_MINOR_HUGETLBFS | \ + UFFD_FEATURE_MINOR_SHMEM) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -196,6 +197,7 @@ struct uffdio_api { #define UFFD_FEATURE_SIGBUS (1<<7) #define UFFD_FEATURE_THREAD_ID (1<<8) #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) +#define UFFD_FEATURE_MINOR_SHMEM (1<<10) __u64 features; __u64 ioctls; diff --git a/mm/memory.c b/mm/memory.c index 01eb3633d828..95a5902c00eb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4162,9 +4162,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - ret = do_fault_around(vmf); - if (ret) - return ret; + if (likely(!userfaultfd_minor(vmf->vma))) { + ret = do_fault_around(vmf); + if (ret) + return ret; + } } ret = __do_fault(vmf); diff --git a/mm/shmem.c b/mm/shmem.c index 51b26ec9d3df..8f95179b5e16 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -77,7 +77,6 @@ static struct vfsmount *shm_mnt; #include #include #include -#include #include #include @@ -1787,8 +1786,8 @@ unlock: * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * - * vmf and fault_type are only supplied by shmem_fault: - * otherwise they are NULL. + * vma, vmf, and fault_type are only supplied by shmem_fault: otherwise they + * are NULL. */ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, @@ -1830,6 +1829,12 @@ repeat: return error; } + if (page && vma && userfaultfd_minor(vma)) { + unlock_page(page); + *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); + return 0; + } + if (page) hindex = page->index; if (page && sgp == SGP_WRITE) @@ -2357,14 +2362,13 @@ bool shmem_mapping(struct address_space *mapping) return mapping->a_ops == &shmem_aops; } -static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - bool zeropage, - struct page **pagep) +#ifdef CONFIG_USERFAULTFD +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep) { + bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; @@ -2381,12 +2385,17 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!shmem_inode_acct_block(inode, 1)) goto out; - if (!*pagep) { + if (is_continue) { + ret = -EFAULT; + page = find_lock_page(mapping, pgoff); + if (!page) + goto out_unacct_blocks; + } else if (!*pagep) { page = shmem_alloc_page(gfp, info, pgoff); if (!page) goto out_unacct_blocks; - if (!zeropage) { /* mcopy_atomic */ + if (mode == MCOPY_ATOMIC_NORMAL) { /* mcopy_atomic */ page_kaddr = kmap_atomic(page); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, @@ -2400,7 +2409,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, /* don't free the page */ return -ENOENT; } - } else { /* mfill_zeropage_atomic */ + } else { /* zeropage */ clear_highpage(page); } } else { @@ -2408,10 +2417,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, *pagep = NULL; } - VM_BUG_ON(PageLocked(page) || PageSwapBacked(page)); - __SetPageLocked(page); - __SetPageSwapBacked(page); - __SetPageUptodate(page); + if (!is_continue) { + VM_BUG_ON(PageSwapBacked(page)); + VM_BUG_ON(PageLocked(page)); + __SetPageLocked(page); + __SetPageSwapBacked(page); + __SetPageUptodate(page); + } ret = -EFAULT; offset = linear_page_index(dst_vma, dst_addr); @@ -2419,10 +2431,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (unlikely(offset >= max_off)) goto out_release; - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, - gfp & GFP_RECLAIM_MASK, dst_mm); - if (ret) - goto out_release; + /* If page wasn't already in the page cache, add it. */ + if (!is_continue) { + ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, + gfp & GFP_RECLAIM_MASK, dst_mm); + if (ret) + goto out_release; + } _dst_pte = mk_pte(page, dst_vma->vm_page_prot); if (dst_vma->vm_flags & VM_WRITE) @@ -2449,13 +2464,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!pte_none(*dst_pte)) goto out_release_unlock; - lru_cache_add(page); + if (!is_continue) { + lru_cache_add(page); - spin_lock_irq(&info->lock); - info->alloced++; - inode->i_blocks += BLOCKS_PER_PAGE; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); + spin_lock_irq(&info->lock); + info->alloced++; + inode->i_blocks += BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + } inc_mm_counter(dst_mm, mm_counter_file(page)); page_add_file_rmap(page, false); @@ -2479,28 +2496,7 @@ out_unacct_blocks: shmem_inode_unacct_blocks(inode, 1); goto out; } - -int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - struct page **pagep) -{ - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, false, pagep); -} - -int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr) -{ - struct page *page = NULL; - - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, 0, true, &page); -} +#endif /* CONFIG_USERFAULTFD */ #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e14b3820c6a8..cbb7c8d79a4d 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -415,7 +415,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, unsigned long dst_addr, unsigned long src_addr, struct page **page, - bool zeropage, + enum mcopy_atomic_mode mode, bool wp_copy) { ssize_t err; @@ -431,22 +431,24 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, * and not in the radix tree. */ if (!(dst_vma->vm_flags & VM_SHARED)) { - if (!zeropage) + switch (mode) { + case MCOPY_ATOMIC_NORMAL: err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, src_addr, page, wp_copy); - else + break; + case MCOPY_ATOMIC_ZEROPAGE: err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, dst_addr); + break; + case MCOPY_ATOMIC_CONTINUE: + err = -EINVAL; + break; + } } else { VM_WARN_ON_ONCE(wp_copy); - if (!zeropage) - err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, - dst_vma, dst_addr, - src_addr, page); - else - err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd, - dst_vma, dst_addr); + err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, + src_addr, mode, page); } return err; @@ -467,7 +469,6 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, long copied; struct page *page; bool wp_copy; - bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE); /* * Sanitize the command parameters: @@ -530,7 +531,7 @@ retry: if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; - if (mcopy_mode == MCOPY_ATOMIC_CONTINUE) + if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE) goto out_unlock; /* @@ -578,7 +579,7 @@ retry: BUG_ON(pmd_trans_huge(*dst_pmd)); err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - src_addr, &page, zeropage, wp_copy); + src_addr, &page, mcopy_mode, wp_copy); cond_resched(); if (unlikely(err == -ENOENT)) {