Diffstat (limited to 'virt')
-rw-r--r--  virt/kvm/guest_memfd.c |  28
-rw-r--r--  virt/kvm/kvm_main.c    | 576
-rw-r--r--  virt/kvm/kvm_mm.h      |  36
-rw-r--r--  virt/kvm/pfncache.c    |  20
4 files changed, 280 insertions, 380 deletions
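
At a high level, this patch replaces the gfn_to_pfn()/kvm_release_pfn_*() flow with struct kvm_follow_pfn and __kvm_faultin_pfn(), which hand the caller a refcounted struct page alongside the pfn. Below is a minimal, hypothetical caller sketch; the example_* helpers are invented for illustration, while the __kvm_faultin_pfn() and kvm_release_page_*() signatures are taken from the diff that follows.

/*
 * Illustrative sketch, not part of the patch: a hypothetical arch fault path
 * using the reworked API.  example_fault_in_gfn() and example_release() are
 * made-up helpers; __kvm_faultin_pfn(), kvm_release_page_dirty() and
 * kvm_release_page_clean() are the interfaces shown in the diff below.
 */
static kvm_pfn_t example_fault_in_gfn(struct kvm_memory_slot *slot, gfn_t gfn,
				      bool write, bool *writable,
				      struct page **page)
{
	/*
	 * FOLL_WRITE requests a writable mapping; 0 allows read-only.  The
	 * caller is expected to check is_error_noslot_pfn() on the result.
	 */
	return __kvm_faultin_pfn(slot, gfn, write ? FOLL_WRITE : 0,
				 writable, page);
}

static void example_release(struct page *page, bool dirty)
{
	/* Both release helpers now tolerate a NULL page, e.g. for PFNMAP pfns. */
	if (dirty)
		kvm_release_page_dirty(page);
	else
		kvm_release_page_clean(page);
}

Note that the caller now receives the backing page directly instead of reverse-engineering it from the pfn via the removed kvm_pfn_to_refcounted_page().
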
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 8f079a61a56d..47a9f68f7b24 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -302,6 +302,11 @@ static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot) return get_file_active(&slot->gmem.file); } +static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) +{ + return gfn - slot->base_gfn + slot->gmem.pgoff; +} + static struct file_operations kvm_gmem_fops = { .open = generic_file_open, .release = kvm_gmem_release, @@ -551,12 +556,11 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) } /* Returns a locked folio on success. */ -static struct folio * -__kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, bool *is_prepared, - int *max_order) +static struct folio *__kvm_gmem_get_pfn(struct file *file, + struct kvm_memory_slot *slot, + pgoff_t index, kvm_pfn_t *pfn, + bool *is_prepared, int *max_order) { - pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; struct kvm_gmem *gmem = file->private_data; struct folio *folio; @@ -590,8 +594,10 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, } int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, int *max_order) + gfn_t gfn, kvm_pfn_t *pfn, struct page **page, + int *max_order) { + pgoff_t index = kvm_gmem_get_index(slot, gfn); struct file *file = kvm_gmem_get_file(slot); struct folio *folio; bool is_prepared = false; @@ -600,7 +606,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, if (!file) return -EFAULT; - folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, &is_prepared, max_order); + folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order); if (IS_ERR(folio)) { r = PTR_ERR(folio); goto out; @@ -610,7 +616,10 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); folio_unlock(folio); - if (r < 0) + + if (!r) + *page = folio_file_page(folio, index); + else folio_put(folio); out: @@ -648,6 +657,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long for (i = 0; i < npages; i += (1 << max_order)) { struct folio *folio; gfn_t gfn = start_gfn + i; + pgoff_t index = kvm_gmem_get_index(slot, gfn); bool is_prepared = false; kvm_pfn_t pfn; @@ -656,7 +666,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long break; } - folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &is_prepared, &max_order); + folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order); if (IS_ERR(folio)) { ret = PTR_ERR(folio); break; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6ca7a1045bbb..b684c332782c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -95,6 +95,13 @@ module_param(halt_poll_ns_shrink, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); /* + * Allow direct access (from KVM or the CPU) without MMU notifier protection + * to unpinned pages. + */ +static bool allow_unsafe_mappings; +module_param(allow_unsafe_mappings, bool, 0444); + +/* * Ordering of locks: * * kvm->lock --> kvm->slots_lock --> kvm->irq_lock @@ -153,52 +160,6 @@ __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { } -bool kvm_is_zone_device_page(struct page *page) -{ - /* - * The metadata used by is_zone_device_page() to determine whether or - * not a page is ZONE_DEVICE is guaranteed to be valid if and only if - * the device has been pinned, e.g. 
by get_user_pages(). WARN if the - * page_count() is zero to help detect bad usage of this helper. - */ - if (WARN_ON_ONCE(!page_count(page))) - return false; - - return is_zone_device_page(page); -} - -/* - * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted - * page, NULL otherwise. Note, the list of refcounted PG_reserved page types - * is likely incomplete, it has been compiled purely through people wanting to - * back guest with a certain type of memory and encountering issues. - */ -struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn) -{ - struct page *page; - - if (!pfn_valid(pfn)) - return NULL; - - page = pfn_to_page(pfn); - if (!PageReserved(page)) - return page; - - /* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */ - if (is_zero_pfn(pfn)) - return page; - - /* - * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting - * perspective they are "normal" pages, albeit with slightly different - * usage rules. - */ - if (kvm_is_zone_device_page(page)) - return page; - - return NULL; -} - /* * Switches to specified vcpu, until a matching vcpu_put() */ @@ -2746,37 +2707,93 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w return gfn_to_hva_memslot_prot(slot, gfn, writable); } -static inline int check_user_page_hwpoison(unsigned long addr) +static bool kvm_is_ad_tracked_page(struct page *page) +{ + /* + * Per page-flags.h, pages tagged PG_reserved "should in general not be + * touched (e.g. set dirty) except by its owner". + */ + return !PageReserved(page); +} + +static void kvm_set_page_dirty(struct page *page) +{ + if (kvm_is_ad_tracked_page(page)) + SetPageDirty(page); +} + +static void kvm_set_page_accessed(struct page *page) +{ + if (kvm_is_ad_tracked_page(page)) + mark_page_accessed(page); +} + +void kvm_release_page_clean(struct page *page) +{ + if (!page) + return; + + kvm_set_page_accessed(page); + put_page(page); +} +EXPORT_SYMBOL_GPL(kvm_release_page_clean); + +void kvm_release_page_dirty(struct page *page) +{ + if (!page) + return; + + kvm_set_page_dirty(page); + kvm_release_page_clean(page); +} +EXPORT_SYMBOL_GPL(kvm_release_page_dirty); + +static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, + struct follow_pfnmap_args *map, bool writable) { - int rc, flags = FOLL_HWPOISON | FOLL_WRITE; + kvm_pfn_t pfn; + + WARN_ON_ONCE(!!page == !!map); - rc = get_user_pages(addr, 1, flags, NULL); - return rc == -EHWPOISON; + if (kfp->map_writable) + *kfp->map_writable = writable; + + if (map) + pfn = map->pfn; + else + pfn = page_to_pfn(page); + + *kfp->refcounted_page = page; + + return pfn; } /* * The fast path to get the writable pfn which will be stored in @pfn, - * true indicates success, otherwise false is returned. It's also the - * only part that runs if we can in atomic context. + * true indicates success, otherwise false is returned. */ -static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, - bool *writable, kvm_pfn_t *pfn) +static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { - struct page *page[1]; + struct page *page; + bool r; /* - * Fast pin a writable pfn only if it is a write fault request - * or the caller allows to map a writable pfn for a read fault - * request. + * Try the fast-only path when the caller wants to pin/get the page for + * writing. If the caller only wants to read the page, KVM must go + * down the full, slow path in order to avoid racing an operation that + * breaks Copy-on-Write (CoW), e.g. 
so that KVM doesn't end up pointing + * at the old, read-only page while mm/ points at a new, writable page. */ - if (!(write_fault || writable)) + if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable)) return false; - if (get_user_page_fast_only(addr, FOLL_WRITE, page)) { - *pfn = page_to_pfn(page[0]); + if (kfp->pin) + r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1; + else + r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page); - if (writable) - *writable = true; + if (r) { + *pfn = kvm_resolve_pfn(kfp, page, NULL, true); return true; } @@ -2787,8 +2804,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, * The slow path to get the pfn of the specified host virtual address, * 1 indicates success, -errno is returned if error is detected. */ -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, - bool interruptible, bool *writable, kvm_pfn_t *pfn) +static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { /* * When a VCPU accesses a page that is not mapped into the secondary @@ -2801,37 +2817,35 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, * Note that get_user_page_fast_only() and FOLL_WRITE for now * implicitly honor NUMA hinting faults and don't need this flag. */ - unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT; - struct page *page; + unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT | kfp->flags; + struct page *page, *wpage; int npages; - might_sleep(); - - if (writable) - *writable = write_fault; - - if (write_fault) - flags |= FOLL_WRITE; - if (async) - flags |= FOLL_NOWAIT; - if (interruptible) - flags |= FOLL_INTERRUPTIBLE; - - npages = get_user_pages_unlocked(addr, 1, &page, flags); + if (kfp->pin) + npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags); + else + npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags); if (npages != 1) return npages; - /* map read fault as writable if possible */ - if (unlikely(!write_fault) && writable) { - struct page *wpage; + /* + * Pinning is mutually exclusive with opportunistically mapping a read + * fault as writable, as KVM should never pin pages when mapping memory + * into the guest (pinning is only for direct accesses from KVM). + */ + if (WARN_ON_ONCE(kfp->map_writable && kfp->pin)) + goto out; - if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) { - *writable = true; - put_page(page); - page = wpage; - } + /* map read fault as writable if possible */ + if (!(flags & FOLL_WRITE) && kfp->map_writable && + get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) { + put_page(page); + page = wpage; + flags |= FOLL_WRITE; } - *pfn = page_to_pfn(page); + +out: + *pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE); return npages; } @@ -2846,24 +2860,21 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) return true; } -static int kvm_try_get_pfn(kvm_pfn_t pfn) -{ - struct page *page = kvm_pfn_to_refcounted_page(pfn); - - if (!page) - return 1; - - return get_page_unless_zero(page); -} - static int hva_to_pfn_remapped(struct vm_area_struct *vma, - unsigned long addr, bool write_fault, - bool *writable, kvm_pfn_t *p_pfn) + struct kvm_follow_pfn *kfp, kvm_pfn_t *p_pfn) { - struct follow_pfnmap_args args = { .vma = vma, .address = addr }; - kvm_pfn_t pfn; + struct follow_pfnmap_args args = { .vma = vma, .address = kfp->hva }; + bool write_fault = kfp->flags & FOLL_WRITE; int r; + /* + * Remapped memory cannot be pinned in any meaningful sense. 
Bail if + * the caller wants to pin the page, i.e. access the page outside of + * MMU notifier protection, and unsafe umappings are disallowed. + */ + if (kfp->pin && !allow_unsafe_mappings) + return -EINVAL; + r = follow_pfnmap_start(&args); if (r) { /* @@ -2871,7 +2882,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, * not call the fault handler, so do it here. */ bool unlocked = false; - r = fixup_user_fault(current->mm, addr, + r = fixup_user_fault(current->mm, kfp->hva, (write_fault ? FAULT_FLAG_WRITE : 0), &unlocked); if (unlocked) @@ -2885,164 +2896,104 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, } if (write_fault && !args.writable) { - pfn = KVM_PFN_ERR_RO_FAULT; + *p_pfn = KVM_PFN_ERR_RO_FAULT; goto out; } - if (writable) - *writable = args.writable; - pfn = args.pfn; - - /* - * Get a reference here because callers of *hva_to_pfn* and - * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the - * returned pfn. This is only needed if the VMA has VM_MIXEDMAP - * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will - * simply do nothing for reserved pfns. - * - * Whoever called remap_pfn_range is also going to call e.g. - * unmap_mapping_range before the underlying pages are freed, - * causing a call to our MMU notifier. - * - * Certain IO or PFNMAP mappings can be backed with valid - * struct pages, but be allocated without refcounting e.g., - * tail pages of non-compound higher order allocations, which - * would then underflow the refcount when the caller does the - * required put_page. Don't allow those pages here. - */ - if (!kvm_try_get_pfn(pfn)) - r = -EFAULT; + *p_pfn = kvm_resolve_pfn(kfp, NULL, &args, args.writable); out: follow_pfnmap_end(&args); - *p_pfn = pfn; - return r; } -/* - * Pin guest page in memory and return its pfn. - * @addr: host virtual address which maps memory to the guest - * @atomic: whether this function is forbidden from sleeping - * @interruptible: whether the process can be interrupted by non-fatal signals - * @async: whether this function need to wait IO complete if the - * host page is not in the memory - * @write_fault: whether we should get a writable host page - * @writable: whether it allows to map a writable host page for !@write_fault - * - * The function will map a writable host page for these two cases: - * 1): @write_fault = true - * 2): @write_fault = false && @writable, @writable will tell the caller - * whether the mapping is writable. 
- */ -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible, - bool *async, bool write_fault, bool *writable) +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp) { struct vm_area_struct *vma; kvm_pfn_t pfn; int npages, r; - /* we can do it either atomically or asynchronously, not both */ - BUG_ON(atomic && async); - - if (hva_to_pfn_fast(addr, write_fault, writable, &pfn)) - return pfn; + might_sleep(); - if (atomic) + if (WARN_ON_ONCE(!kfp->refcounted_page)) return KVM_PFN_ERR_FAULT; - npages = hva_to_pfn_slow(addr, async, write_fault, interruptible, - writable, &pfn); + if (hva_to_pfn_fast(kfp, &pfn)) + return pfn; + + npages = hva_to_pfn_slow(kfp, &pfn); if (npages == 1) return pfn; - if (npages == -EINTR) + if (npages == -EINTR || npages == -EAGAIN) return KVM_PFN_ERR_SIGPENDING; + if (npages == -EHWPOISON) + return KVM_PFN_ERR_HWPOISON; mmap_read_lock(current->mm); - if (npages == -EHWPOISON || - (!async && check_user_page_hwpoison(addr))) { - pfn = KVM_PFN_ERR_HWPOISON; - goto exit; - } - retry: - vma = vma_lookup(current->mm, addr); + vma = vma_lookup(current->mm, kfp->hva); if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { - r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn); + r = hva_to_pfn_remapped(vma, kfp, &pfn); if (r == -EAGAIN) goto retry; if (r < 0) pfn = KVM_PFN_ERR_FAULT; } else { - if (async && vma_is_valid(vma, write_fault)) - *async = true; - pfn = KVM_PFN_ERR_FAULT; + if ((kfp->flags & FOLL_NOWAIT) && + vma_is_valid(vma, kfp->flags & FOLL_WRITE)) + pfn = KVM_PFN_ERR_NEEDS_IO; + else + pfn = KVM_PFN_ERR_FAULT; } -exit: mmap_read_unlock(current->mm); return pfn; } -kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool atomic, bool interruptible, bool *async, - bool write_fault, bool *writable, hva_t *hva) +static kvm_pfn_t kvm_follow_pfn(struct kvm_follow_pfn *kfp) { - unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); + kfp->hva = __gfn_to_hva_many(kfp->slot, kfp->gfn, NULL, + kfp->flags & FOLL_WRITE); - if (hva) - *hva = addr; + if (kfp->hva == KVM_HVA_ERR_RO_BAD) + return KVM_PFN_ERR_RO_FAULT; - if (kvm_is_error_hva(addr)) { - if (writable) - *writable = false; + if (kvm_is_error_hva(kfp->hva)) + return KVM_PFN_NOSLOT; - return addr == KVM_HVA_ERR_RO_BAD ? KVM_PFN_ERR_RO_FAULT : - KVM_PFN_NOSLOT; + if (memslot_is_readonly(kfp->slot) && kfp->map_writable) { + *kfp->map_writable = false; + kfp->map_writable = NULL; } - /* Do not map writable pfn in the readonly memslot. 
*/ - if (writable && memslot_is_readonly(slot)) { - *writable = false; - writable = NULL; - } - - return hva_to_pfn(addr, atomic, interruptible, async, write_fault, - writable); + return hva_to_pfn(kfp); } -EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); -kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, - bool *writable) +kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, + unsigned int foll, bool *writable, + struct page **refcounted_page) { - return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false, - NULL, write_fault, writable, NULL); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); + struct kvm_follow_pfn kfp = { + .slot = slot, + .gfn = gfn, + .flags = foll, + .map_writable = writable, + .refcounted_page = refcounted_page, + }; -kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) -{ - return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true, - NULL, NULL); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); + if (WARN_ON_ONCE(!writable || !refcounted_page)) + return KVM_PFN_ERR_FAULT; -kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn) -{ - return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true, - NULL, NULL); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); + *writable = false; + *refcounted_page = NULL; -kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) -{ - return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); + return kvm_follow_pfn(&kfp); } -EXPORT_SYMBOL_GPL(gfn_to_pfn); +EXPORT_SYMBOL_GPL(__kvm_faultin_pfn); -int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, - struct page **pages, int nr_pages) +int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, + struct page **pages, int nr_pages) { unsigned long addr; gfn_t entry = 0; @@ -3056,193 +3007,92 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); } -EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); +EXPORT_SYMBOL_GPL(kvm_prefetch_pages); /* - * Do not use this helper unless you are absolutely certain the gfn _must_ be - * backed by 'struct page'. A valid example is if the backing memslot is - * controlled by KVM. Note, if the returned page is valid, it's refcount has - * been elevated by gfn_to_pfn(). + * Don't use this API unless you are absolutely, positively certain that KVM + * needs to get a struct page, e.g. to pin the page for firmware DMA. + * + * FIXME: Users of this API likely need to FOLL_PIN the page, not just elevate + * its refcount. */ -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) -{ - struct page *page; - kvm_pfn_t pfn; - - pfn = gfn_to_pfn(kvm, gfn); - - if (is_error_noslot_pfn(pfn)) - return KVM_ERR_PTR_BAD_PAGE; - - page = kvm_pfn_to_refcounted_page(pfn); - if (!page) - return KVM_ERR_PTR_BAD_PAGE; - - return page; -} -EXPORT_SYMBOL_GPL(gfn_to_page); +struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write) +{ + struct page *refcounted_page = NULL; + struct kvm_follow_pfn kfp = { + .slot = gfn_to_memslot(kvm, gfn), + .gfn = gfn, + .flags = write ? 
FOLL_WRITE : 0, + .refcounted_page = &refcounted_page, + }; -void kvm_release_pfn(kvm_pfn_t pfn, bool dirty) -{ - if (dirty) - kvm_release_pfn_dirty(pfn); - else - kvm_release_pfn_clean(pfn); + (void)kvm_follow_pfn(&kfp); + return refcounted_page; } +EXPORT_SYMBOL_GPL(__gfn_to_page); -int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) +int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, + bool writable) { - kvm_pfn_t pfn; - void *hva = NULL; - struct page *page = KVM_UNMAPPED_PAGE; + struct kvm_follow_pfn kfp = { + .slot = gfn_to_memslot(vcpu->kvm, gfn), + .gfn = gfn, + .flags = writable ? FOLL_WRITE : 0, + .refcounted_page = &map->pinned_page, + .pin = true, + }; - if (!map) - return -EINVAL; + map->pinned_page = NULL; + map->page = NULL; + map->hva = NULL; + map->gfn = gfn; + map->writable = writable; - pfn = gfn_to_pfn(vcpu->kvm, gfn); - if (is_error_noslot_pfn(pfn)) + map->pfn = kvm_follow_pfn(&kfp); + if (is_error_noslot_pfn(map->pfn)) return -EINVAL; - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - hva = kmap(page); + if (pfn_valid(map->pfn)) { + map->page = pfn_to_page(map->pfn); + map->hva = kmap(map->page); #ifdef CONFIG_HAS_IOMEM } else { - hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); + map->hva = memremap(pfn_to_hpa(map->pfn), PAGE_SIZE, MEMREMAP_WB); #endif } - if (!hva) - return -EFAULT; - - map->page = page; - map->hva = hva; - map->pfn = pfn; - map->gfn = gfn; - - return 0; + return map->hva ? 0 : -EFAULT; } -EXPORT_SYMBOL_GPL(kvm_vcpu_map); +EXPORT_SYMBOL_GPL(__kvm_vcpu_map); -void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) +void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map) { - if (!map) - return; - if (!map->hva) return; - if (map->page != KVM_UNMAPPED_PAGE) + if (map->page) kunmap(map->page); #ifdef CONFIG_HAS_IOMEM else memunmap(map->hva); #endif - if (dirty) + if (map->writable) kvm_vcpu_mark_page_dirty(vcpu, map->gfn); - kvm_release_pfn(map->pfn, dirty); + if (map->pinned_page) { + if (map->writable) + kvm_set_page_dirty(map->pinned_page); + kvm_set_page_accessed(map->pinned_page); + unpin_user_page(map->pinned_page); + } map->hva = NULL; map->page = NULL; + map->pinned_page = NULL; } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); -static bool kvm_is_ad_tracked_page(struct page *page) -{ - /* - * Per page-flags.h, pages tagged PG_reserved "should in general not be - * touched (e.g. set dirty) except by its owner". 
- */ - return !PageReserved(page); -} - -static void kvm_set_page_dirty(struct page *page) -{ - if (kvm_is_ad_tracked_page(page)) - SetPageDirty(page); -} - -static void kvm_set_page_accessed(struct page *page) -{ - if (kvm_is_ad_tracked_page(page)) - mark_page_accessed(page); -} - -void kvm_release_page_clean(struct page *page) -{ - WARN_ON(is_error_page(page)); - - kvm_set_page_accessed(page); - put_page(page); -} -EXPORT_SYMBOL_GPL(kvm_release_page_clean); - -void kvm_release_pfn_clean(kvm_pfn_t pfn) -{ - struct page *page; - - if (is_error_noslot_pfn(pfn)) - return; - - page = kvm_pfn_to_refcounted_page(pfn); - if (!page) - return; - - kvm_release_page_clean(page); -} -EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); - -void kvm_release_page_dirty(struct page *page) -{ - WARN_ON(is_error_page(page)); - - kvm_set_page_dirty(page); - kvm_release_page_clean(page); -} -EXPORT_SYMBOL_GPL(kvm_release_page_dirty); - -void kvm_release_pfn_dirty(kvm_pfn_t pfn) -{ - struct page *page; - - if (is_error_noslot_pfn(pfn)) - return; - - page = kvm_pfn_to_refcounted_page(pfn); - if (!page) - return; - - kvm_release_page_dirty(page); -} -EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); - -/* - * Note, checking for an error/noslot pfn is the caller's responsibility when - * directly marking a page dirty/accessed. Unlike the "release" helpers, the - * "set" helpers are not to be used when the pfn might point at garbage. - */ -void kvm_set_pfn_dirty(kvm_pfn_t pfn) -{ - if (WARN_ON(is_error_noslot_pfn(pfn))) - return; - - if (pfn_valid(pfn)) - kvm_set_page_dirty(pfn_to_page(pfn)); -} -EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); - -void kvm_set_pfn_accessed(kvm_pfn_t pfn) -{ - if (WARN_ON(is_error_noslot_pfn(pfn))) - return; - - if (pfn_valid(pfn)) - kvm_set_page_accessed(pfn_to_page(pfn)); -} -EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); - static int next_segment(unsigned long len, int offset) { if (len > PAGE_SIZE - offset) diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index 715f19669d01..acef3f5c582a 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -20,8 +20,40 @@ #define KVM_MMU_UNLOCK(kvm) spin_unlock(&(kvm)->mmu_lock) #endif /* KVM_HAVE_MMU_RWLOCK */ -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible, - bool *async, bool write_fault, bool *writable); + +struct kvm_follow_pfn { + const struct kvm_memory_slot *slot; + const gfn_t gfn; + + unsigned long hva; + + /* FOLL_* flags modifying lookup behavior, e.g. FOLL_WRITE. */ + unsigned int flags; + + /* + * Pin the page (effectively FOLL_PIN, which is an mm/ internal flag). + * The page *must* be pinned if KVM will write to the page via a kernel + * mapping, e.g. via kmap(), mremap(), etc. + */ + bool pin; + + /* + * If non-NULL, try to get a writable mapping even for a read fault. + * Set to true if a writable mapping was obtained. + */ + bool *map_writable; + + /* + * Optional output. Set to a valid "struct page" if the returned pfn + * is for a refcounted or pinned struct page, NULL if the returned pfn + * has no struct page or if the struct page is not being refcounted + * (e.g. tail pages of non-compound higher order allocations from + * IO/PFNMAP mappings). 
+ */ + struct page **refcounted_page; +}; + +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp); #ifdef CONFIG_HAVE_KVM_PFNCACHE void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index f0039efb9e1e..728d2c1b488a 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -159,6 +159,15 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT; void *new_khva = NULL; unsigned long mmu_seq; + struct page *page; + + struct kvm_follow_pfn kfp = { + .slot = gpc->memslot, + .gfn = gpa_to_gfn(gpc->gpa), + .flags = FOLL_WRITE, + .hva = gpc->uhva, + .refcounted_page = &page, + }; lockdep_assert_held(&gpc->refresh_lock); @@ -192,13 +201,12 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) if (new_khva != old_khva) gpc_unmap(new_pfn, new_khva); - kvm_release_pfn_clean(new_pfn); + kvm_release_page_unused(page); cond_resched(); } - /* We always request a writeable mapping */ - new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL); + new_pfn = hva_to_pfn(&kfp); if (is_error_noslot_pfn(new_pfn)) goto out_error; @@ -213,7 +221,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) new_khva = gpc_map(new_pfn); if (!new_khva) { - kvm_release_pfn_clean(new_pfn); + kvm_release_page_unused(page); goto out_error; } @@ -231,11 +239,11 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) gpc->khva = new_khva + offset_in_page(gpc->uhva); /* - * Put the reference to the _new_ pfn. The pfn is now tracked by the + * Put the reference to the _new_ page. The page is now tracked by the * cache and can be safely migrated, swapped, etc... as the cache will * invalidate any mappings in response to relevant mmu_notifier events. */ - kvm_release_pfn_clean(new_pfn); + kvm_release_page_clean(page); return 0; |
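
For the reworked map API, __kvm_vcpu_map() now pins the backing page (map->pinned_page) and kvm_vcpu_unmap() drops its former dirty argument, keying off map->writable instead. The following is a short, hypothetical usage sketch, assuming a caller that copies a buffer into a guest page; the function name, parameters and memcpy() payload are illustrative, while the __kvm_vcpu_map()/kvm_vcpu_unmap() signatures and the kvm_host_map fields match the diff above.

/*
 * Illustrative sketch, not part of the patch: copy a buffer into guest
 * memory through the reworked map/unmap helpers.  The function and its
 * parameters are assumptions for the example.
 */
static int example_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
				    unsigned int offset, const void *data,
				    unsigned int len)
{
	struct kvm_host_map map;
	int r;

	/* Pins the page (or memremaps a non-struct-page pfn) and kmaps it. */
	r = __kvm_vcpu_map(vcpu, gfn, &map, /*writable=*/true);
	if (r)
		return r;

	memcpy((char *)map.hva + offset, data, len);

	/* Marks the gfn dirty and dirties/unpins map.pinned_page. */
	kvm_vcpu_unmap(vcpu, &map);
	return 0;
}
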