Patch series "userfaultfd tmpfs/hugetlbfs/non-cooperative", v2. These userfaultfd features are finished and are ready for larger exposure in -mm and upstream merging: 1) tmpfs non-present userfault; 2) hugetlbfs non-present userfault; 3) non-cooperative userfault for fork/madvise/mremap. QEMU development code is already exercising 2) and container postcopy live migration needs 3). 1) is not currently used, but there's a self test and we know some QEMU users, for various reasons, use tmpfs as backing for KVM, so they'll need it too in order to use postcopy live migration with tmpfs memory. All review feedback from the previous submission has been handled and the fixes are included. There's no outstanding issue AFAIK. Upstream code just did a s/fe/vmf/ conversion in the page faults and this has been converted as well incrementally. In addition to the previous submissions, this also wakes up stuck userfaults during UFFDIO_UNREGISTER. The non-cooperative testcase actually reproduced this problem by getting stuck instead of quitting cleanly in some rare cases, as it could call UFFDIO_UNREGISTER while some userfault could still be in flight. The other option would have been to keep leaving it up to userland to serialize itself and to patch the testcase instead, but I think the wakeup during unregister is preferable. David also asked for the UFFD_FEATURE_MISSING_HUGETLBFS and UFFD_FEATURE_MISSING_SHMEM feature flags to be added so QEMU can avoid probing whether the hugetlbfs/shmem missing support is available by calling UFFDIO_REGISTER. QEMU already checks HUGETLBFS_MAGIC with fstatfs, so if UFFD_FEATURE_MISSING_HUGETLBFS is also set, it knows UFFDIO_REGISTER will succeed (or if it fails, it's for some other more concerning reason). There's no reason to worry about adding too many feature flags. There are 64 available and, worst case, we have to bump the API if someday we're really going to run out of them.
The round-trip network latency of hugetlbfs userfaults during postcopy live migration is still on the order of a dozen milliseconds on 10GBit at 2MB hugepage granularity, so it's working perfectly and it should provide for higher bandwidth or lower CPU usage (which makes it interesting to add an option in the future to support THP granularity too for anonymous memory; UFFDIO_COPY would then have to create THP if alignment/len allows for it). 1GB hugetlbfs granularity will require big changes in hugetlbfs to work, so it's deferred for later. This patch (of 42): This adds proper documentation (inline) to avoid the risk of further misunderstandings about the semantics of _IOW/_IOR, and it also reminds whoever bumps the UFFDIO_API in the future to change the two ioctls to _IOW. This was found while implementing strace support for those ioctls; otherwise we could never have found it by just reviewing kernel code and testing it. _IOC_READ or _IOC_WRITE alters nothing but the ioctl number itself, so it's only worth fixing if the UFFDIO_API is bumped someday. Link: http://lkml.kernel.org/r/20161216144821.5183-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Reported-by: "Dmitry V. Levin" <ldv@altlinux.org> Cc: Michael Rapoport <RAPOPORT@il.ibm.com> Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Pavel Emelyanov <xemul@parallels.com> Cc: Hillf Danton <hillf.zj@alibaba-inc.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
174 lines
4.5 KiB
C
174 lines
4.5 KiB
C
/*
 *  include/linux/userfaultfd.h
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 */
|
|
|
|
#ifndef _LINUX_USERFAULTFD_H
|
|
#define _LINUX_USERFAULTFD_H
|
|
|
|
#include <linux/types.h>
|
|
|
|
/*
 * UFFD_API is the API number userland requests in uffdio_api.api.
 *
 * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and
 * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. When
 * they were defined it was assumed that _IOC_READ meant "the kernel
 * is reading", whereas _IOC_READ means the userland is reading.
 */
#define UFFD_API ((__u64)0xAA)
/*
 * After implementing the respective features it will become:
 * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
 *			      UFFD_FEATURE_EVENT_FORK)
 */
#define UFFD_API_FEATURES (0)
/*
 * Bitmasks with one bit per ioctl command number (bit N set means the
 * ioctl whose command number is N is part of the set).
 * NOTE(review): presumably UFFD_API_IOCTLS is what is reported in
 * uffdio_api.ioctls and UFFD_API_RANGE_IOCTLS in
 * uffdio_register.ioctls -- confirm against the kernel implementation.
 */
#define UFFD_API_IOCTLS				\
	((__u64)1 << _UFFDIO_REGISTER |		\
	 (__u64)1 << _UFFDIO_UNREGISTER |	\
	 (__u64)1 << _UFFDIO_API)
#define UFFD_API_RANGE_IOCTLS			\
	((__u64)1 << _UFFDIO_WAKE |		\
	 (__u64)1 << _UFFDIO_COPY |		\
	 (__u64)1 << _UFFDIO_ZEROPAGE)
|
|
|
|
/*
 * Valid ioctl command number range with this API is from 0x00 to
 * 0x3F.  UFFDIO_API is the fixed number, everything else can be
 * changed by implementing a different UFFD_API. If sticking to the
 * same UFFD_API more ioctl can be added and userland will be aware of
 * which ioctl the running kernel implements through the ioctl command
 * bitmask written by the UFFDIO_API.
 */
#define _UFFDIO_REGISTER		(0x00)
#define _UFFDIO_UNREGISTER		(0x01)
#define _UFFDIO_WAKE			(0x02)
#define _UFFDIO_COPY			(0x03)
#define _UFFDIO_ZEROPAGE		(0x04)
#define _UFFDIO_API			(0x3F)
|
|
|
|
/*
 * userfaultfd ioctl ids.
 *
 * UFFDIO is the ioctl type shared by all the commands below; the
 * command number and the argument structure are different for each.
 *
 * UFFDIO_UNREGISTER and UFFDIO_WAKE are defined with _IOR although
 * _IOW would be the correct direction (_IOC_READ means the userland
 * is reading).  The direction is part of the ioctl number, so it can
 * only be changed if the UFFDIO_API is bumped.
 */
#define UFFDIO 0xAA
#define UFFDIO_API		_IOWR(UFFDIO, _UFFDIO_API,	\
				      struct uffdio_api)
#define UFFDIO_REGISTER		_IOWR(UFFDIO, _UFFDIO_REGISTER, \
				      struct uffdio_register)
#define UFFDIO_UNREGISTER	_IOR(UFFDIO, _UFFDIO_UNREGISTER,	\
				     struct uffdio_range)
#define UFFDIO_WAKE		_IOR(UFFDIO, _UFFDIO_WAKE,	\
				     struct uffdio_range)
#define UFFDIO_COPY		_IOWR(UFFDIO, _UFFDIO_COPY,	\
				      struct uffdio_copy)
#define UFFDIO_ZEROPAGE		_IOWR(UFFDIO, _UFFDIO_ZEROPAGE,	\
				      struct uffdio_zeropage)
|
|
|
|
/*
 * read() structure.
 *
 * "event" selects which member of "arg" is meaningful.
 * NOTE(review): presumably "event" holds one of the UFFD_EVENT_*
 * values and "pagefault.flags" a mask of UFFD_PAGEFAULT_FLAG_* --
 * confirm against the kernel implementation.
 *
 * The packed attribute is spelled out instead of using the __packed
 * shorthand: __packed is a kernel-internal macro, and where it is not
 * defined "} __packed;" would silently declare a variable named
 * __packed (and leave the structure unpacked) instead of packing the
 * structure.
 */
struct uffd_msg {
	__u8	event;

	__u8	reserved1;
	__u16	reserved2;
	__u32	reserved3;

	union {
		struct {
			__u64	flags;
			__u64	address;
		} pagefault;

		struct {
			/* unused reserved fields */
			__u64	reserved1;
			__u64	reserved2;
			__u64	reserved3;
		} reserved;
	} arg;
} __attribute__((__packed__));
|
|
|
|
/*
 * Event codes.
 *
 * Start at 0x12 and not at 0 to be more strict against bugs.
 */
#define UFFD_EVENT_PAGEFAULT	0x12
#if 0 /* not available yet */
#define UFFD_EVENT_FORK		0x13
#endif

/* flags for UFFD_EVENT_PAGEFAULT */
#define UFFD_PAGEFAULT_FLAG_WRITE	(1<<0)	/* If this was a write fault */
#define UFFD_PAGEFAULT_FLAG_WP		(1<<1)	/* If reason is VM_UFFD_WP */
|
|
|
|
/*
 * Argument of the UFFDIO_API ioctl: userland requests an API number
 * and the kernel answers with what it implements.
 */
struct uffdio_api {
	/* userland asks for an API number and the features to enable */
	__u64 api;
	/*
	 * Kernel answers below with all the available features for
	 * the API, this notifies userland of which events and/or
	 * which flags for each event are enabled in the current
	 * kernel.
	 *
	 * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
	 * are to be considered implicitly always enabled in all kernels as
	 * long as the uffdio_api.api requested matches UFFD_API.
	 */
#if 0 /* not available yet */
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
#define UFFD_FEATURE_EVENT_FORK			(1<<1)
#endif
	__u64 features;

	/*
	 * ioctl command bitmask written by the UFFDIO_API: tells
	 * userland which ioctls the running kernel implements.
	 */
	__u64 ioctls;
};
|
|
|
|
/*
 * Range argument: used directly by the UFFDIO_UNREGISTER and
 * UFFDIO_WAKE ioctls and embedded in struct uffdio_register and
 * struct uffdio_zeropage.
 * NOTE(review): presumably "start" is the first address of the range
 * and "len" its length in bytes -- confirm against the kernel
 * implementation.
 */
struct uffdio_range {
	__u64 start;
	__u64 len;
};
|
|
|
|
struct uffdio_register {
|
|
struct uffdio_range range;
|
|
#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0)
|
|
#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1)
|
|
__u64 mode;
|
|
|
|
/*
|
|
* kernel answers which ioctl commands are available for the
|
|
* range, keep at the end as the last 8 bytes aren't read.
|
|
*/
|
|
__u64 ioctls;
|
|
};
|
|
|
|
/*
 * Argument of the UFFDIO_COPY ioctl.
 * NOTE(review): presumably "dst" and "src" are the destination and
 * source addresses and "len" the number of bytes to copy -- confirm
 * against the kernel implementation.
 */
struct uffdio_copy {
	__u64 dst;
	__u64 src;
	__u64 len;
	/*
	 * There will be a wrprotection flag later that allows to map
	 * pages wrprotected on the fly. And such a flag will be
	 * available if the wrprotection ioctl are implemented for the
	 * range according to the uffdio_register.ioctls.
	 */
#define UFFDIO_COPY_MODE_DONTWAKE		((__u64)1<<0)
	__u64 mode;

	/*
	 * "copy" is written by the ioctl and must be at the end: the
	 * copy_from_user will not read the last 8 bytes.
	 */
	__s64 copy;
};
|
|
|
|
struct uffdio_zeropage {
|
|
struct uffdio_range range;
|
|
#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0)
|
|
__u64 mode;
|
|
|
|
/*
|
|
* "zeropage" is written by the ioctl and must be at the end:
|
|
* the copy_from_user will not read the last 8 bytes.
|
|
*/
|
|
__s64 zeropage;
|
|
};
|
|
|
|
#endif /* _LINUX_USERFAULTFD_H */
|