summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/mmu/mmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/mmu/mmu.c')
-rw-r--r--arch/x86/kvm/mmu/mmu.c444
1 files changed, 151 insertions, 293 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8e853a5fc867..22e7ad235123 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -179,7 +179,6 @@ struct kvm_shadow_walk_iterator {
static struct kmem_cache *pte_list_desc_cache;
struct kmem_cache *mmu_page_header_cache;
-static struct percpu_counter kvm_total_used_mmu_pages;
static void mmu_spte_set(u64 *sptep, u64 spte);
@@ -485,11 +484,12 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
__set_spte(sptep, new_spte);
}
-/*
- * Update the SPTE (excluding the PFN), but do not track changes in its
- * accessed/dirty status.
+/* Rules for using mmu_spte_update:
+ * Update the state bits, it means the mapped pfn is not changed.
+ *
+ * Returns true if the TLB needs to be flushed
*/
-static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
u64 old_spte = *sptep;
@@ -498,7 +498,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
if (!is_shadow_present_pte(old_spte)) {
mmu_spte_set(sptep, new_spte);
- return old_spte;
+ return false;
}
if (!spte_has_volatile_bits(old_spte))
@@ -506,53 +506,10 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
- WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
+ WARN_ON_ONCE(!is_shadow_present_pte(old_spte) ||
+ spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
- return old_spte;
-}
-
-/* Rules for using mmu_spte_update:
- * Update the state bits, it means the mapped pfn is not changed.
- *
- * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
- * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
- * spte, even though the writable spte might be cached on a CPU's TLB.
- *
- * Returns true if the TLB needs to be flushed
- */
-static bool mmu_spte_update(u64 *sptep, u64 new_spte)
-{
- bool flush = false;
- u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
-
- if (!is_shadow_present_pte(old_spte))
- return false;
-
- /*
- * For the spte updated out of mmu-lock is safe, since
- * we always atomically update it, see the comments in
- * spte_has_volatile_bits().
- */
- if (is_mmu_writable_spte(old_spte) &&
- !is_writable_pte(new_spte))
- flush = true;
-
- /*
- * Flush TLB when accessed/dirty states are changed in the page tables,
- * to guarantee consistency between TLB and page tables.
- */
-
- if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
- flush = true;
- kvm_set_pfn_accessed(spte_to_pfn(old_spte));
- }
-
- if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
- flush = true;
- kvm_set_pfn_dirty(spte_to_pfn(old_spte));
- }
-
- return flush;
+ return leaf_spte_change_needs_tlb_flush(old_spte, new_spte);
}
/*
@@ -563,10 +520,8 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
*/
static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
{
- kvm_pfn_t pfn;
u64 old_spte = *sptep;
int level = sptep_to_sp(sptep)->role.level;
- struct page *page;
if (!is_shadow_present_pte(old_spte) ||
!spte_has_volatile_bits(old_spte))
@@ -578,24 +533,6 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
return old_spte;
kvm_update_page_stats(kvm, level, -1);
-
- pfn = spte_to_pfn(old_spte);
-
- /*
- * KVM doesn't hold a reference to any pages mapped into the guest, and
- * instead uses the mmu_notifier to ensure that KVM unmaps any pages
- * before they are reclaimed. Sanity check that, if the pfn is backed
- * by a refcounted page, the refcount is elevated.
- */
- page = kvm_pfn_to_refcounted_page(pfn);
- WARN_ON_ONCE(page && !page_count(page));
-
- if (is_accessed_spte(old_spte))
- kvm_set_pfn_accessed(pfn);
-
- if (is_dirty_spte(old_spte))
- kvm_set_pfn_dirty(pfn);
-
return old_spte;
}
@@ -1250,16 +1187,6 @@ static bool spte_clear_dirty(u64 *sptep)
return mmu_spte_update(sptep, spte);
}
-static bool spte_wrprot_for_clear_dirty(u64 *sptep)
-{
- bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
- (unsigned long *)sptep);
- if (was_writable && !spte_ad_enabled(*sptep))
- kvm_set_pfn_dirty(spte_to_pfn(*sptep));
-
- return was_writable;
-}
-
/*
* Gets the GFN ready for another round of dirty logging by clearing the
* - D bit on ad-enabled SPTEs, and
@@ -1275,7 +1202,8 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
for_each_rmap_spte(rmap_head, &iter, sptep)
if (spte_ad_need_write_protect(*sptep))
- flush |= spte_wrprot_for_clear_dirty(sptep);
+ flush |= test_and_clear_bit(PT_WRITABLE_SHIFT,
+ (unsigned long *)sptep);
else
flush |= spte_clear_dirty(sptep);
@@ -1640,15 +1568,12 @@ static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
(unsigned long *)sptep);
} else {
/*
- * Capture the dirty status of the page, so that
- * it doesn't get lost when the SPTE is marked
- * for access tracking.
+ * WARN if mmu_spte_update() signals the need
+ * for a TLB flush, as Access tracking a SPTE
+ * should never trigger an _immediate_ flush.
*/
- if (is_writable_pte(spte))
- kvm_set_pfn_dirty(spte_to_pfn(spte));
-
spte = mark_spte_for_access_track(spte);
- mmu_spte_update_no_track(sptep, spte);
+ WARN_ON_ONCE(mmu_spte_update(sptep, spte));
}
young = true;
}
@@ -1696,27 +1621,15 @@ static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
#endif
}
-/*
- * This value is the sum of all of the kvm instances's
- * kvm->arch.n_used_mmu_pages values. We need a global,
- * aggregate version in order to make the slab shrinker
- * faster
- */
-static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
-{
- kvm->arch.n_used_mmu_pages += nr;
- percpu_counter_add(&kvm_total_used_mmu_pages, nr);
-}
-
static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- kvm_mod_used_mmu_pages(kvm, +1);
+ kvm->arch.n_used_mmu_pages++;
kvm_account_pgtable_pages((void *)sp->spt, +1);
}
static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- kvm_mod_used_mmu_pages(kvm, -1);
+ kvm->arch.n_used_mmu_pages--;
kvm_account_pgtable_pages((void *)sp->spt, -1);
}
@@ -2802,7 +2715,7 @@ static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
* be write-protected.
*/
int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
- gfn_t gfn, bool can_unsync, bool prefetch)
+ gfn_t gfn, bool synchronizing, bool prefetch)
{
struct kvm_mmu_page *sp;
bool locked = false;
@@ -2817,12 +2730,12 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
/*
* The page is not write-tracked, mark existing shadow pages unsync
- * unless KVM is synchronizing an unsync SP (can_unsync = false). In
- * that case, KVM must complete emulation of the guest TLB flush before
- * allowing shadow pages to become unsync (writable by the guest).
+ * unless KVM is synchronizing an unsync SP. In that case, KVM must
+ * complete emulation of the guest TLB flush before allowing shadow
+ * pages to become unsync (writable by the guest).
*/
for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
- if (!can_unsync)
+ if (synchronizing)
return -EPERM;
if (sp->unsync)
@@ -2926,6 +2839,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
}
if (is_shadow_present_pte(*sptep)) {
+ if (prefetch)
+ return RET_PF_SPURIOUS;
+
/*
* If we overwrite a PTE page pointer with a 2MB PMD, unlink
* the parent of the now unreachable PTE.
@@ -2945,7 +2861,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
}
wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
- true, host_writable, &spte);
+ false, host_writable, &spte);
if (*sptep == spte) {
ret = RET_PF_SPURIOUS;
@@ -2971,32 +2887,51 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
return ret;
}
-static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp,
- u64 *start, u64 *end)
+static bool kvm_mmu_prefetch_sptes(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *sptep,
+ int nr_pages, unsigned int access)
{
struct page *pages[PTE_PREFETCH_NUM];
struct kvm_memory_slot *slot;
- unsigned int access = sp->role.access;
- int i, ret;
- gfn_t gfn;
+ int i;
+
+ if (WARN_ON_ONCE(nr_pages > PTE_PREFETCH_NUM))
+ return false;
- gfn = kvm_mmu_page_get_gfn(sp, spte_index(start));
slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
if (!slot)
- return -1;
+ return false;
- ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
- if (ret <= 0)
- return -1;
+ nr_pages = kvm_prefetch_pages(slot, gfn, pages, nr_pages);
+ if (nr_pages <= 0)
+ return false;
- for (i = 0; i < ret; i++, gfn++, start++) {
- mmu_set_spte(vcpu, slot, start, access, gfn,
+ for (i = 0; i < nr_pages; i++, gfn++, sptep++) {
+ mmu_set_spte(vcpu, slot, sptep, access, gfn,
page_to_pfn(pages[i]), NULL);
- put_page(pages[i]);
+
+ /*
+ * KVM always prefetches writable pages from the primary MMU,
+ * and KVM can make its SPTE writable in the fast page handler,
+ * without notifying the primary MMU. Mark pages/folios dirty
+ * now to ensure file data is written back if it ends up being
+ * written by the guest. Because KVM's prefetching GUPs
+ * writable PTEs, the probability of unnecessary writeback is
+ * extremely low.
+ */
+ kvm_release_page_dirty(pages[i]);
}
- return 0;
+ return true;
+}
+
+static bool direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp,
+ u64 *start, u64 *end)
+{
+ gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(start));
+ unsigned int access = sp->role.access;
+
+ return kvm_mmu_prefetch_sptes(vcpu, gfn, start, end - start, access);
}
static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
@@ -3014,8 +2949,9 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
if (is_shadow_present_pte(*spte) || spte == sptep) {
if (!start)
continue;
- if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
+ if (!direct_pte_prefetch_many(vcpu, sp, start, spte))
return;
+
start = NULL;
} else if (!start)
start = spte;
@@ -3165,13 +3101,12 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
}
int kvm_mmu_max_mapping_level(struct kvm *kvm,
- const struct kvm_memory_slot *slot, gfn_t gfn,
- int max_level)
+ const struct kvm_memory_slot *slot, gfn_t gfn)
{
bool is_private = kvm_slot_can_be_private(slot) &&
kvm_mem_is_private(kvm, gfn);
- return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private);
+ return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private);
}
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@ -3322,7 +3257,6 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
fault->slot = NULL;
fault->pfn = KVM_PFN_NOSLOT;
fault->map_writable = false;
- fault->hva = KVM_HVA_ERR_BAD;
/*
* If MMIO caching is disabled, emulate immediately without
@@ -3392,7 +3326,7 @@ static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault
* by setting the Writable bit, which can be done out of mmu_lock.
*/
if (!fault->present)
- return !kvm_ad_enabled();
+ return !kvm_ad_enabled;
/*
* Note, instruction fetches and writes are mutually exclusive, ignore
@@ -3419,7 +3353,7 @@ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
* harm. This also avoids the TLB flush needed after setting dirty bit
* so non-PML cases won't be impacted.
*
- * Compare with set_spte where instead shadow_dirty_mask is set.
+ * Compare with make_spte() where instead shadow_dirty_mask is set.
*/
if (!try_cmpxchg64(sptep, &old_spte, new_spte))
return false;
@@ -3527,8 +3461,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* uses A/D bits for non-nested MMUs. Thus, if A/D bits are
* enabled, the SPTE can't be an access-tracked SPTE.
*/
- if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
- new_spte = restore_acc_track_spte(new_spte);
+ if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte))
+ new_spte = restore_acc_track_spte(new_spte) |
+ shadow_accessed_mask;
/*
* To keep things simple, only SPTEs that are MMU-writable can
@@ -4376,8 +4311,15 @@ static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
return max_level;
}
-static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
- struct kvm_page_fault *fault)
+static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault, int r)
+{
+ kvm_release_faultin_page(vcpu->kvm, fault->refcounted_page,
+ r == RET_PF_RETRY, fault->map_writable);
+}
+
+static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
{
int max_order, r;
@@ -4387,7 +4329,7 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
}
r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
- &max_order);
+ &fault->refcounted_page, &max_order);
if (r) {
kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
return r;
@@ -4400,19 +4342,26 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
return RET_PF_CONTINUE;
}
-static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
{
- bool async;
+ unsigned int foll = fault->write ? FOLL_WRITE : 0;
if (fault->is_private)
- return kvm_faultin_pfn_private(vcpu, fault);
+ return kvm_mmu_faultin_pfn_private(vcpu, fault);
- async = false;
- fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false,
- &async, fault->write,
- &fault->map_writable, &fault->hva);
- if (!async)
- return RET_PF_CONTINUE; /* *pfn has correct page already */
+ foll |= FOLL_NOWAIT;
+ fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll,
+ &fault->map_writable, &fault->refcounted_page);
+
+ /*
+ * If resolving the page failed because I/O is needed to fault-in the
+ * page, then either set up an asynchronous #PF to do the I/O, or if
+ * doing an async #PF isn't possible, retry with I/O allowed. All
+ * other failures are terminal, i.e. retrying won't help.
+ */
+ if (fault->pfn != KVM_PFN_ERR_NEEDS_IO)
+ return RET_PF_CONTINUE;
if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
trace_kvm_try_async_get_page(fault->addr, fault->gfn);
@@ -4430,14 +4379,16 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* to wait for IO. Note, gup always bails if it is unable to quickly
* get a page and a fatal signal, i.e. SIGKILL, is pending.
*/
- fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true,
- NULL, fault->write,
- &fault->map_writable, &fault->hva);
+ foll |= FOLL_INTERRUPTIBLE;
+ foll &= ~FOLL_NOWAIT;
+ fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll,
+ &fault->map_writable, &fault->refcounted_page);
+
return RET_PF_CONTINUE;
}
-static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
- unsigned int access)
+static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault, unsigned int access)
{
struct kvm_memory_slot *slot = fault->slot;
int ret;
@@ -4520,7 +4471,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
return RET_PF_RETRY;
- ret = __kvm_faultin_pfn(vcpu, fault);
+ ret = __kvm_mmu_faultin_pfn(vcpu, fault);
if (ret != RET_PF_CONTINUE)
return ret;
@@ -4538,7 +4489,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
* mmu_lock is acquired.
*/
if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) {
- kvm_release_pfn_clean(fault->pfn);
+ kvm_mmu_finish_page_fault(vcpu, fault, RET_PF_RETRY);
return RET_PF_RETRY;
}
@@ -4597,7 +4548,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (r)
return r;
- r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
+ r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL);
if (r != RET_PF_CONTINUE)
return r;
@@ -4614,8 +4565,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
r = direct_map(vcpu, fault);
out_unlock:
+ kvm_mmu_finish_page_fault(vcpu, fault, r);
write_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(fault->pfn);
return r;
}
@@ -4688,7 +4639,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
if (r)
return r;
- r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
+ r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL);
if (r != RET_PF_CONTINUE)
return r;
@@ -4701,8 +4652,8 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
r = kvm_tdp_mmu_map(vcpu, fault);
out_unlock:
+ kvm_mmu_finish_page_fault(vcpu, fault, r);
read_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(fault->pfn);
return r;
}
#endif
@@ -5488,7 +5439,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
role.efer_nx = true;
role.smm = cpu_role.base.smm;
role.guest_mode = cpu_role.base.guest_mode;
- role.ad_disabled = !kvm_ad_enabled();
+ role.ad_disabled = !kvm_ad_enabled;
role.level = kvm_mmu_get_tdp_level(vcpu);
role.direct = true;
role.has_4_byte_gpte = false;
@@ -6228,7 +6179,7 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
/* It's actually a GPA for vcpu->arch.guest_mmu. */
if (mmu != &vcpu->arch.guest_mmu) {
/* INVLPG on a non-canonical address is a NOP according to the SDM. */
- if (is_noncanonical_address(addr, vcpu))
+ if (is_noncanonical_invlpg_address(addr, vcpu))
return;
kvm_x86_call(flush_tlb_gva)(vcpu, addr);
@@ -6416,8 +6367,11 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
int nr_zapped, batch = 0;
+ LIST_HEAD(invalid_list);
bool unstable;
+ lockdep_assert_held(&kvm->slots_lock);
+
restart:
list_for_each_entry_safe_reverse(sp, node,
&kvm->arch.active_mmu_pages, link) {
@@ -6449,7 +6403,7 @@ restart:
}
unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
- &kvm->arch.zapped_obsolete_pages, &nr_zapped);
+ &invalid_list, &nr_zapped);
batch += nr_zapped;
if (unstable)
@@ -6465,7 +6419,7 @@ restart:
* kvm_mmu_load()), and the reload in the caller ensure no vCPUs are
* running with an obsolete MMU.
*/
- kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
}
/*
@@ -6528,16 +6482,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm_tdp_mmu_zap_invalidated_roots(kvm);
}
-static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
-{
- return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
-}
-
void kvm_mmu_init_vm(struct kvm *kvm)
{
kvm->arch.shadow_mmio_value = shadow_mmio_value;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
- INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
@@ -6771,7 +6719,7 @@ static void shadow_mmu_split_huge_page(struct kvm *kvm,
continue;
}
- spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
+ spte = make_small_spte(kvm, huge_spte, sp->role, index);
mmu_spte_set(sptep, spte);
__rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
}
@@ -6954,8 +6902,7 @@ restart:
* mapping if the indirect sp has level = 1.
*/
if (sp->role.direct &&
- sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
- PG_LEVEL_NUM)) {
+ sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) {
kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
if (kvm_available_flush_remote_tlbs_range())
@@ -6983,8 +6930,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
kvm_flush_remote_tlbs_memslot(kvm, slot);
}
-void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *slot)
+void kvm_mmu_recover_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
{
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
@@ -6994,7 +6941,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
if (tdp_mmu_enabled) {
read_lock(&kvm->mmu_lock);
- kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
+ kvm_tdp_mmu_recover_huge_pages(kvm, slot);
read_unlock(&kvm->mmu_lock);
}
}
@@ -7149,72 +7096,6 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
}
}
-static unsigned long mmu_shrink_scan(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct kvm *kvm;
- int nr_to_scan = sc->nr_to_scan;
- unsigned long freed = 0;
-
- mutex_lock(&kvm_lock);
-
- list_for_each_entry(kvm, &vm_list, vm_list) {
- int idx;
-
- /*
- * Never scan more than sc->nr_to_scan VM instances.
- * Will not hit this condition practically since we do not try
- * to shrink more than one VM and it is very unlikely to see
- * !n_used_mmu_pages so many times.
- */
- if (!nr_to_scan--)
- break;
- /*
- * n_used_mmu_pages is accessed without holding kvm->mmu_lock
- * here. We may skip a VM instance errorneosly, but we do not
- * want to shrink a VM that only started to populate its MMU
- * anyway.
- */
- if (!kvm->arch.n_used_mmu_pages &&
- !kvm_has_zapped_obsolete_pages(kvm))
- continue;
-
- idx = srcu_read_lock(&kvm->srcu);
- write_lock(&kvm->mmu_lock);
-
- if (kvm_has_zapped_obsolete_pages(kvm)) {
- kvm_mmu_commit_zap_page(kvm,
- &kvm->arch.zapped_obsolete_pages);
- goto unlock;
- }
-
- freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
-
-unlock:
- write_unlock(&kvm->mmu_lock);
- srcu_read_unlock(&kvm->srcu, idx);
-
- /*
- * unfair on small ones
- * per-vm shrinkers cry out
- * sadness comes quickly
- */
- list_move_tail(&kvm->vm_list, &vm_list);
- break;
- }
-
- mutex_unlock(&kvm_lock);
- return freed;
-}
-
-static unsigned long mmu_shrink_count(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
-}
-
-static struct shrinker *mmu_shrinker;
-
static void mmu_destroy_caches(void)
{
kmem_cache_destroy(pte_list_desc_cache);
@@ -7281,7 +7162,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
kvm_mmu_zap_all_fast(kvm);
mutex_unlock(&kvm->slots_lock);
- wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+ vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
}
mutex_unlock(&kvm_lock);
}
@@ -7341,23 +7222,8 @@ int kvm_mmu_vendor_module_init(void)
if (!mmu_page_header_cache)
goto out;
- if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
- goto out;
-
- mmu_shrinker = shrinker_alloc(0, "x86-mmu");
- if (!mmu_shrinker)
- goto out_shrinker;
-
- mmu_shrinker->count_objects = mmu_shrink_count;
- mmu_shrinker->scan_objects = mmu_shrink_scan;
- mmu_shrinker->seeks = DEFAULT_SEEKS * 10;
-
- shrinker_register(mmu_shrinker);
-
return 0;
-out_shrinker:
- percpu_counter_destroy(&kvm_total_used_mmu_pages);
out:
mmu_destroy_caches();
return ret;
@@ -7374,8 +7240,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
void kvm_mmu_vendor_module_exit(void)
{
mmu_destroy_caches();
- percpu_counter_destroy(&kvm_total_used_mmu_pages);
- shrinker_free(mmu_shrinker);
}
/*
@@ -7427,7 +7291,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
- wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+ vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
mutex_unlock(&kvm_lock);
}
@@ -7530,62 +7394,56 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, rcu_idx);
}
-static long get_nx_huge_page_recovery_timeout(u64 start_time)
+static void kvm_nx_huge_page_recovery_worker_kill(void *data)
{
- bool enabled;
- uint period;
-
- enabled = calc_nx_huge_pages_recovery_period(&period);
-
- return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
- : MAX_SCHEDULE_TIMEOUT;
}
-static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
+static bool kvm_nx_huge_page_recovery_worker(void *data)
{
- u64 start_time;
+ struct kvm *kvm = data;
+ bool enabled;
+ uint period;
long remaining_time;
- while (true) {
- start_time = get_jiffies_64();
- remaining_time = get_nx_huge_page_recovery_timeout(start_time);
-
- set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop() && remaining_time > 0) {
- schedule_timeout(remaining_time);
- remaining_time = get_nx_huge_page_recovery_timeout(start_time);
- set_current_state(TASK_INTERRUPTIBLE);
- }
-
- set_current_state(TASK_RUNNING);
-
- if (kthread_should_stop())
- return 0;
+ enabled = calc_nx_huge_pages_recovery_period(&period);
+ if (!enabled)
+ return false;
- kvm_recover_nx_huge_pages(kvm);
+ remaining_time = kvm->arch.nx_huge_page_last + msecs_to_jiffies(period)
+ - get_jiffies_64();
+ if (remaining_time > 0) {
+ schedule_timeout(remaining_time);
+ /* check for signals and come back */
+ return true;
}
+
+ __set_current_state(TASK_RUNNING);
+ kvm_recover_nx_huge_pages(kvm);
+ kvm->arch.nx_huge_page_last = get_jiffies_64();
+ return true;
}
int kvm_mmu_post_init_vm(struct kvm *kvm)
{
- int err;
-
if (nx_hugepage_mitigation_hard_disabled)
return 0;
- err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
- "kvm-nx-lpage-recovery",
- &kvm->arch.nx_huge_page_recovery_thread);
- if (!err)
- kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
+ kvm->arch.nx_huge_page_last = get_jiffies_64();
+ kvm->arch.nx_huge_page_recovery_thread = vhost_task_create(
+ kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill,
+ kvm, "kvm-nx-lpage-recovery");
- return err;
+ if (!kvm->arch.nx_huge_page_recovery_thread)
+ return -ENOMEM;
+
+ vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
+ return 0;
}
void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
{
if (kvm->arch.nx_huge_page_recovery_thread)
- kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
+ vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
}
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES