summary refs log tree commit diff
path: root/mm
diff options
context:
space:
mode:
author Michael Ellerman <mpe@ellerman.id.au> 2017-07-31 20:20:29 +1000
committer Michael Ellerman <mpe@ellerman.id.au> 2017-07-31 20:20:29 +1000
commit bb272221e9db79f13d454e1f3fb6b05013be985e (patch)
tree 36f4acc50e3fabac71fadd34c720c0a6011db470 /mm
parent 253fd51e2f533552ae35a0c661705da6c4842c1b (diff)
parent 5771a8c08880cdca3bfb4a3fc6d309d6bba20877 (diff)
download linux-bb272221e9db79f13d454e1f3fb6b05013be985e.tar.gz
linux-bb272221e9db79f13d454e1f3fb6b05013be985e.tar.bz2
linux-bb272221e9db79f13d454e1f3fb6b05013be985e.zip
Merge tag 'v4.13-rc1' into fixes
The fixes branch is based off a random pre-rc1 commit, because we had some fixes that needed to go in before rc1 was released. However we now need to fix some code that went in after that point, but before rc1, so merge rc1 to get that code into fixes so we can fix it!
Diffstat (limited to 'mm')
-rw-r--r-- mm/Kconfig 1
-rw-r--r-- mm/balloon_compaction.c 2
-rw-r--r-- mm/cma.c 20
-rw-r--r-- mm/filemap.c 8
-rw-r--r-- mm/hugetlb.c 294
-rw-r--r-- mm/internal.h 2
-rw-r--r-- mm/kasan/kasan.c 152
-rw-r--r-- mm/kasan/kasan_init.c 12
-rw-r--r-- mm/kasan/report.c 2
-rw-r--r-- mm/khugepaged.c 3
-rw-r--r-- mm/list_lru.c 14
-rw-r--r-- mm/madvise.c 46
-rw-r--r-- mm/memcontrol.c 52
-rw-r--r-- mm/memory-failure.c 332
-rw-r--r-- mm/memory.c 6
-rw-r--r-- mm/memory_hotplug.c 138
-rw-r--r-- mm/mempolicy.c 3
-rw-r--r-- mm/migrate.c 17
-rw-r--r-- mm/mmap.c 21
-rw-r--r-- mm/oom_kill.c 7
-rw-r--r-- mm/page-writeback.c 10
-rw-r--r-- mm/page_alloc.c 80
-rw-r--r-- mm/page_io.c 23
-rw-r--r-- mm/page_isolation.c 18
-rw-r--r-- mm/page_owner.c 6
-rw-r--r-- mm/shmem.c 8
-rw-r--r-- mm/sparse-vmemmap.c 4
-rw-r--r-- mm/swap.c 11
-rw-r--r-- mm/swap_slots.c 5
-rw-r--r-- mm/swap_state.c 10
-rw-r--r-- mm/swapfile.c 2
-rw-r--r-- mm/truncate.c 10
-rw-r--r-- mm/util.c 38
-rw-r--r-- mm/vmalloc.c 12
-rw-r--r-- mm/vmpressure.c 122
-rw-r--r-- mm/vmscan.c 21
-rw-r--r-- mm/vmstat.c 24
-rw-r--r-- mm/zsmalloc.c 54
38 files changed, 784 insertions, 806 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 46ef77d5c332..48b1af447fa7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -161,7 +161,6 @@ config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG
- depends on COMPILE_TEST || !KASAN
config MEMORY_HOTPLUG_SPARSE
def_bool y
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index da91df50ba31..9075aa54e955 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -24,7 +24,7 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
{
unsigned long flags;
struct page *page = alloc_page(balloon_mapping_gfp_mask() |
- __GFP_NOMEMALLOC | __GFP_NORETRY);
+ __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_ZERO);
if (!page)
return NULL;
diff --git a/mm/cma.c b/mm/cma.c
index 978b4a1441ef..c0da318c020e 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -59,7 +59,7 @@ const char *cma_get_name(const struct cma *cma)
}
static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
- int align_order)
+ unsigned int align_order)
{
if (align_order <= cma->order_per_bit)
return 0;
@@ -67,17 +67,14 @@ static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
}
/*
- * Find a PFN aligned to the specified order and return an offset represented in
- * order_per_bits.
+ * Find the offset of the base PFN from the specified align_order.
+ * The value returned is represented in order_per_bits.
*/
static unsigned long cma_bitmap_aligned_offset(const struct cma *cma,
- int align_order)
+ unsigned int align_order)
{
- if (align_order <= cma->order_per_bit)
- return 0;
-
- return (ALIGN(cma->base_pfn, (1UL << align_order))
- - cma->base_pfn) >> cma->order_per_bit;
+ return (cma->base_pfn & ((1UL << align_order) - 1))
+ >> cma->order_per_bit;
}
static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
@@ -127,7 +124,7 @@ static int __init cma_activate_area(struct cma *cma)
* to be in the same zone.
*/
if (page_zone(pfn_to_page(pfn)) != zone)
- goto err;
+ goto not_in_zone;
}
init_cma_reserved_pageblock(pfn_to_page(base_pfn));
} while (--i);
@@ -141,7 +138,8 @@ static int __init cma_activate_area(struct cma *cma)
return 0;
-err:
+not_in_zone:
+ pr_err("CMA area %s could not be activated\n", cma->name);
kfree(cma->bitmap);
cma->count = 0;
return -EINVAL;
diff --git a/mm/filemap.c b/mm/filemap.c
index 3247b4208034..a49702445ce0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -239,14 +239,16 @@ void __delete_from_page_cache(struct page *page, void *shadow)
/* Leave page->index set: truncation lookup relies upon it */
/* hugetlb pages do not participate in page cache accounting. */
- if (!PageHuge(page))
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
+ if (PageHuge(page))
+ return;
+
+ __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
if (PageSwapBacked(page)) {
__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
if (PageTransHuge(page))
__dec_node_page_state(page, NR_SHMEM_THPS);
} else {
- VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page);
+ VM_BUG_ON_PAGE(PageTransHuge(page), page);
}
/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1a88006ec634..bc48ee783dd9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -20,9 +20,9 @@
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
+#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
-#include <linux/page-isolation.h>
#include <linux/jhash.h>
#include <asm/page.h>
@@ -872,7 +872,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
struct page *page;
list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
- if (!is_migrate_isolate_page(page))
+ if (!PageHWPoison(page))
break;
/*
* if 'non-isolated free hugepage' not found on the list,
@@ -887,19 +887,39 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
return page;
}
-static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
+ nodemask_t *nmask)
{
- struct page *page;
- int node;
+ unsigned int cpuset_mems_cookie;
+ struct zonelist *zonelist;
+ struct zone *zone;
+ struct zoneref *z;
+ int node = -1;
- if (nid != NUMA_NO_NODE)
- return dequeue_huge_page_node_exact(h, nid);
+ zonelist = node_zonelist(nid, gfp_mask);
+
+retry_cpuset:
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
+ struct page *page;
+
+ if (!cpuset_zone_allowed(zone, gfp_mask))
+ continue;
+ /*
+ * no need to ask again on the same node. Pool is node rather than
+ * zone aware
+ */
+ if (zone_to_nid(zone) == node)
+ continue;
+ node = zone_to_nid(zone);
- for_each_online_node(node) {
page = dequeue_huge_page_node_exact(h, node);
if (page)
return page;
}
+ if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+
return NULL;
}
@@ -917,15 +937,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
unsigned long address, int avoid_reserve,
long chg)
{
- struct page *page = NULL;
+ struct page *page;
struct mempolicy *mpol;
- nodemask_t *nodemask;
gfp_t gfp_mask;
+ nodemask_t *nodemask;
int nid;
- struct zonelist *zonelist;
- struct zone *zone;
- struct zoneref *z;
- unsigned int cpuset_mems_cookie;
/*
* A child process with MAP_PRIVATE mappings created by their parent
@@ -940,32 +956,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
goto err;
-retry_cpuset:
- cpuset_mems_cookie = read_mems_allowed_begin();
gfp_mask = htlb_alloc_mask(h);
nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- zonelist = node_zonelist(nid, gfp_mask);
-
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- MAX_NR_ZONES - 1, nodemask) {
- if (cpuset_zone_allowed(zone, gfp_mask)) {
- page = dequeue_huge_page_node(h, zone_to_nid(zone));
- if (page) {
- if (avoid_reserve)
- break;
- if (!vma_has_reserves(vma, chg))
- break;
-
- SetPagePrivate(page);
- h->resv_huge_pages--;
- break;
- }
- }
+ page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+ if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
+ SetPagePrivate(page);
+ h->resv_huge_pages--;
}
mpol_cond_put(mpol);
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
- goto retry_cpuset;
return page;
err:
@@ -1385,7 +1384,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
page = __alloc_pages_node(nid,
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
- __GFP_REPEAT|__GFP_NOWARN,
+ __GFP_RETRY_MAYFAIL|__GFP_NOWARN,
huge_page_order(h));
if (page) {
prep_new_huge_page(h, page, nid);
@@ -1460,7 +1459,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
* number of free hugepages would be reduced below the number of reserved
* hugepages.
*/
-static int dissolve_free_huge_page(struct page *page)
+int dissolve_free_huge_page(struct page *page)
{
int rc = 0;
@@ -1473,6 +1472,14 @@ static int dissolve_free_huge_page(struct page *page)
rc = -EBUSY;
goto out;
}
+ /*
+ * Move PageHWPoison flag from head page to the raw error page,
+ * which makes any subpages other than the error page reusable.
+ */
+ if (PageHWPoison(head) && page != head) {
+ SetPageHWPoison(page);
+ ClearPageHWPoison(head);
+ }
list_del(&head->lru);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
@@ -1513,82 +1520,19 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
return rc;
}
-/*
- * There are 3 ways this can get called:
- * 1. With vma+addr: we use the VMA's memory policy
- * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge
- * page from any node, and let the buddy allocator itself figure
- * it out.
- * 3. With !vma, but nid!=NUMA_NO_NODE. We allocate a huge page
- * strictly from 'nid'
- */
static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
- struct vm_area_struct *vma, unsigned long addr, int nid)
+ gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
int order = huge_page_order(h);
- gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
- unsigned int cpuset_mems_cookie;
-
- /*
- * We need a VMA to get a memory policy. If we do not
- * have one, we use the 'nid' argument.
- *
- * The mempolicy stuff below has some non-inlined bits
- * and calls ->vm_ops. That makes it hard to optimize at
- * compile-time, even when NUMA is off and it does
- * nothing. This helps the compiler optimize it out.
- */
- if (!IS_ENABLED(CONFIG_NUMA) || !vma) {
- /*
- * If a specific node is requested, make sure to
- * get memory from there, but only when a node
- * is explicitly specified.
- */
- if (nid != NUMA_NO_NODE)
- gfp |= __GFP_THISNODE;
- /*
- * Make sure to call something that can handle
- * nid=NUMA_NO_NODE
- */
- return alloc_pages_node(nid, gfp, order);
- }
-
- /*
- * OK, so we have a VMA. Fetch the mempolicy and try to
- * allocate a huge page with it. We will only reach this
- * when CONFIG_NUMA=y.
- */
- do {
- struct page *page;
- struct mempolicy *mpol;
- int nid;
- nodemask_t *nodemask;
-
- cpuset_mems_cookie = read_mems_allowed_begin();
- nid = huge_node(vma, addr, gfp, &mpol, &nodemask);
- mpol_cond_put(mpol);
- page = __alloc_pages_nodemask(gfp, order, nid, nodemask);
- if (page)
- return page;
- } while (read_mems_allowed_retry(cpuset_mems_cookie));
- return NULL;
+ gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
+ return __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
}
-/*
- * There are two ways to allocate a huge page:
- * 1. When you have a VMA and an address (like a fault)
- * 2. When you have no VMA (like when setting /proc/.../nr_hugepages)
- *
- * 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in
- * this case which signifies that the allocation should be done with
- * respect for the VMA's memory policy.
- *
- * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This
- * implies that memory policies will not be taken in to account.
- */
-static struct page *__alloc_buddy_huge_page(struct hstate *h,
- struct vm_area_struct *vma, unsigned long addr, int nid)
+static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask)
{
struct page *page;
unsigned int r_nid;
@@ -1597,15 +1541,6 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
return NULL;
/*
- * Make sure that anyone specifying 'nid' is not also specifying a VMA.
- * This makes sure the caller is picking _one_ of the modes with which
- * we can call this function, not both.
- */
- if (vma || (addr != -1)) {
- VM_WARN_ON_ONCE(addr == -1);
- VM_WARN_ON_ONCE(nid != NUMA_NO_NODE);
- }
- /*
* Assume we will successfully allocate the surplus page to
* prevent racing processes from causing the surplus to exceed
* overcommit
@@ -1638,7 +1573,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
}
spin_unlock(&hugetlb_lock);
- page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
+ page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
spin_lock(&hugetlb_lock);
if (page) {
@@ -1663,26 +1598,23 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
}
/*
- * Allocate a huge page from 'nid'. Note, 'nid' may be
- * NUMA_NO_NODE, which means that it may be allocated
- * anywhere.
- */
-static
-struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
-{
- unsigned long addr = -1;
-
- return __alloc_buddy_huge_page(h, NULL, addr, nid);
-}
-
-/*
* Use the VMA's mpolicy to allocate a huge page from the buddy.
*/
static
struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
+ struct page *page;
+ struct mempolicy *mpol;
+ gfp_t gfp_mask = htlb_alloc_mask(h);
+ int nid;
+ nodemask_t *nodemask;
+
+ nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
+ page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask);
+ mpol_cond_put(mpol);
+
+ return page;
}
/*
@@ -1692,19 +1624,46 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
*/
struct page *alloc_huge_page_node(struct hstate *h, int nid)
{
+ gfp_t gfp_mask = htlb_alloc_mask(h);
struct page *page = NULL;
+ if (nid != NUMA_NO_NODE)
+ gfp_mask |= __GFP_THISNODE;
+
spin_lock(&hugetlb_lock);
if (h->free_huge_pages - h->resv_huge_pages > 0)
- page = dequeue_huge_page_node(h, nid);
+ page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
spin_unlock(&hugetlb_lock);
if (!page)
- page = __alloc_buddy_huge_page_no_mpol(h, nid);
+ page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL);
return page;
}
+
+struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
+ nodemask_t *nmask)
+{
+ gfp_t gfp_mask = htlb_alloc_mask(h);
+
+ spin_lock(&hugetlb_lock);
+ if (h->free_huge_pages - h->resv_huge_pages > 0) {
+ struct page *page;
+
+ page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
+ if (page) {
+ spin_unlock(&hugetlb_lock);
+ return page;
+ }
+ }
+ spin_unlock(&hugetlb_lock);
+
+ /* No reservations, try to overcommit */
+
+ return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
+}
+
/*
* Increase the hugetlb pool such that it can accommodate a reservation
* of size 'delta'.
@@ -1730,12 +1689,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
+ page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h),
+ NUMA_NO_NODE, NULL);
if (!page) {
alloc_ok = false;
break;
}
list_add(&page->lru, &surplus_list);
+ cond_resched();
}
allocated += i;
@@ -2204,8 +2165,16 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
} else if (!alloc_fresh_huge_page(h,
&node_states[N_MEMORY]))
break;
+ cond_resched();
+ }
+ if (i < h->max_huge_pages) {
+ char buf[32];
+
+ string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
+ pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
+ h->max_huge_pages, buf, i);
+ h->max_huge_pages = i;
}
- h->max_huge_pages = i;
}
static void __init hugetlb_init_hstates(void)
@@ -2223,26 +2192,16 @@ static void __init hugetlb_init_hstates(void)
VM_BUG_ON(minimum_order == UINT_MAX);
}
-static char * __init memfmt(char *buf, unsigned long n)
-{
- if (n >= (1UL << 30))
- sprintf(buf, "%lu GB", n >> 30);
- else if (n >= (1UL << 20))
- sprintf(buf, "%lu MB", n >> 20);
- else
- sprintf(buf, "%lu KB", n >> 10);
- return buf;
-}
-
static void __init report_hugepages(void)
{
struct hstate *h;
for_each_hstate(h) {
char buf[32];
+
+ string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
- memfmt(buf, huge_page_size(h)),
- h->free_huge_pages);
+ buf, h->free_huge_pages);
}
}
@@ -2801,6 +2760,11 @@ static int __init hugetlb_init(void)
return 0;
if (!size_to_hstate(default_hstate_size)) {
+ if (default_hstate_size != 0) {
+ pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n",
+ default_hstate_size, HPAGE_SIZE);
+ }
+
default_hstate_size = HPAGE_SIZE;
if (!size_to_hstate(default_hstate_size))
hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
@@ -4739,40 +4703,6 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
}
-#ifdef CONFIG_MEMORY_FAILURE
-
-/*
- * This function is called from memory failure code.
- */
-int dequeue_hwpoisoned_huge_page(struct page *hpage)
-{
- struct hstate *h = page_hstate(hpage);
- int nid = page_to_nid(hpage);
- int ret = -EBUSY;
-
- spin_lock(&hugetlb_lock);
- /*
- * Just checking !page_huge_active is not enough, because that could be
- * an isolated/hwpoisoned hugepage (which have >0 refcount).
- */
- if (!page_huge_active(hpage) && !page_count(hpage)) {
- /*
- * Hwpoisoned hugepage isn't linked to activelist or freelist,
- * but dangling hpage->lru can trigger list-debug warnings
- * (this happens when we call unpoison_memory() on it),
- * so let it point to itself with list_del_init().
- */
- list_del_init(&hpage->lru);
- set_page_refcounted(hpage);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
- ret = 0;
- }
- spin_unlock(&hugetlb_lock);
- return ret;
-}
-#endif
-
bool isolate_huge_page(struct page *page, struct list_head *list)
{
bool ret = true;
diff --git a/mm/internal.h b/mm/internal.h
index 0e4f558412fb..24d88f084705 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -23,7 +23,7 @@
* hints such as HIGHMEM usage.
*/
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
- __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
+ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
__GFP_ATOMIC)
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index c81549d5c833..ca11bc4ce205 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -134,97 +134,33 @@ static __always_inline bool memory_is_poisoned_1(unsigned long addr)
return false;
}
-static __always_inline bool memory_is_poisoned_2(unsigned long addr)
+static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
+ unsigned long size)
{
- u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
-
- if (unlikely(*shadow_addr)) {
- if (memory_is_poisoned_1(addr + 1))
- return true;
-
- /*
- * If single shadow byte covers 2-byte access, we don't
- * need to do anything more. Otherwise, test the first
- * shadow byte.
- */
- if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0))
- return false;
-
- return unlikely(*(u8 *)shadow_addr);
- }
-
- return false;
-}
-
-static __always_inline bool memory_is_poisoned_4(unsigned long addr)
-{
- u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+ u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr);
- if (unlikely(*shadow_addr)) {
- if (memory_is_poisoned_1(addr + 3))
- return true;
-
- /*
- * If single shadow byte covers 4-byte access, we don't
- * need to do anything more. Otherwise, test the first
- * shadow byte.
- */
- if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3))
- return false;
-
- return unlikely(*(u8 *)shadow_addr);
- }
-
- return false;
-}
-
-static __always_inline bool memory_is_poisoned_8(unsigned long addr)
-{
- u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
-
- if (unlikely(*shadow_addr)) {
- if (memory_is_poisoned_1(addr + 7))
- return true;
-
- /*
- * If single shadow byte covers 8-byte access, we don't
- * need to do anything more. Otherwise, test the first
- * shadow byte.
- */
- if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
- return false;
-
- return unlikely(*(u8 *)shadow_addr);
- }
+ /*
+ * Access crosses 8(shadow size)-byte boundary. Such access maps
+ * into 2 shadow bytes, so we need to check them both.
+ */
+ if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1))
+ return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
- return false;
+ return memory_is_poisoned_1(addr + size - 1);
}
static __always_inline bool memory_is_poisoned_16(unsigned long addr)
{
- u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr);
-
- if (unlikely(*shadow_addr)) {
- u16 shadow_first_bytes = *(u16 *)shadow_addr;
-
- if (unlikely(shadow_first_bytes))
- return true;
-
- /*
- * If two shadow bytes covers 16-byte access, we don't
- * need to do anything more. Otherwise, test the last
- * shadow byte.
- */
- if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
- return false;
+ u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
- return memory_is_poisoned_1(addr + 15);
- }
+ /* Unaligned 16-byte access maps into 3 shadow bytes. */
+ if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
+ return *shadow_addr || memory_is_poisoned_1(addr + 15);
- return false;
+ return *shadow_addr;
}
-static __always_inline unsigned long bytes_is_zero(const u8 *start,
+static __always_inline unsigned long bytes_is_nonzero(const u8 *start,
size_t size)
{
while (size) {
@@ -237,7 +173,7 @@ static __always_inline unsigned long bytes_is_zero(const u8 *start,
return 0;
}
-static __always_inline unsigned long memory_is_zero(const void *start,
+static __always_inline unsigned long memory_is_nonzero(const void *start,
const void *end)
{
unsigned int words;
@@ -245,11 +181,11 @@ static __always_inline unsigned long memory_is_zero(const void *start,
unsigned int prefix = (unsigned long)start % 8;
if (end - start <= 16)
- return bytes_is_zero(start, end - start);
+ return bytes_is_nonzero(start, end - start);
if (prefix) {
prefix = 8 - prefix;
- ret = bytes_is_zero(start, prefix);
+ ret = bytes_is_nonzero(start, prefix);
if (unlikely(ret))
return ret;
start += prefix;
@@ -258,12 +194,12 @@ static __always_inline unsigned long memory_is_zero(const void *start,
words = (end - start) / 8;
while (words) {
if (unlikely(*(u64 *)start))
- return bytes_is_zero(start, 8);
+ return bytes_is_nonzero(start, 8);
start += 8;
words--;
}
- return bytes_is_zero(start, (end - start) % 8);
+ return bytes_is_nonzero(start, (end - start) % 8);
}
static __always_inline bool memory_is_poisoned_n(unsigned long addr,
@@ -271,7 +207,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr,
{
unsigned long ret;
- ret = memory_is_zero(kasan_mem_to_shadow((void *)addr),
+ ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr),
kasan_mem_to_shadow((void *)addr + size - 1) + 1);
if (unlikely(ret)) {
@@ -292,11 +228,9 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
case 1:
return memory_is_poisoned_1(addr);
case 2:
- return memory_is_poisoned_2(addr);
case 4:
- return memory_is_poisoned_4(addr);
case 8:
- return memory_is_poisoned_8(addr);
+ return memory_is_poisoned_2_4_8(addr, size);
case 16:
return memory_is_poisoned_16(addr);
default:
@@ -803,17 +737,47 @@ void __asan_unpoison_stack_memory(const void *addr, size_t size)
EXPORT_SYMBOL(__asan_unpoison_stack_memory);
#ifdef CONFIG_MEMORY_HOTPLUG
-static int kasan_mem_notifier(struct notifier_block *nb,
+static int __meminit kasan_mem_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
- return (action == MEM_GOING_ONLINE) ? NOTIFY_BAD : NOTIFY_OK;
+ struct memory_notify *mem_data = data;
+ unsigned long nr_shadow_pages, start_kaddr, shadow_start;
+ unsigned long shadow_end, shadow_size;
+
+ nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT;
+ start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn);
+ shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr);
+ shadow_size = nr_shadow_pages << PAGE_SHIFT;
+ shadow_end = shadow_start + shadow_size;
+
+ if (WARN_ON(mem_data->nr_pages % KASAN_SHADOW_SCALE_SIZE) ||
+ WARN_ON(start_kaddr % (KASAN_SHADOW_SCALE_SIZE << PAGE_SHIFT)))
+ return NOTIFY_BAD;
+
+ switch (action) {
+ case MEM_GOING_ONLINE: {
+ void *ret;
+
+ ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start,
+ shadow_end, GFP_KERNEL,
+ PAGE_KERNEL, VM_NO_GUARD,
+ pfn_to_nid(mem_data->start_pfn),
+ __builtin_return_address(0));
+ if (!ret)
+ return NOTIFY_BAD;
+
+ kmemleak_ignore(ret);
+ return NOTIFY_OK;
+ }
+ case MEM_OFFLINE:
+ vfree((void *)shadow_start);
+ }
+
+ return NOTIFY_OK;
}
static int __init kasan_memhotplug_init(void)
{
- pr_info("WARNING: KASAN doesn't support memory hot-add\n");
- pr_info("Memory hot-add will be disabled\n");
-
hotplug_memory_notifier(kasan_mem_notifier, 0);
return 0;
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c
index b96a5f773d88..554e4c0f23a2 100644
--- a/mm/kasan/kasan_init.c
+++ b/mm/kasan/kasan_init.c
@@ -118,6 +118,18 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr,
do {
next = p4d_addr_end(addr, end);
+ if (IS_ALIGNED(addr, P4D_SIZE) && end - addr >= P4D_SIZE) {
+ pud_t *pud;
+ pmd_t *pmd;
+
+ p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud));
+ pud = pud_offset(p4d, addr);
+ pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd));
+ pmd = pmd_offset(pud, addr);
+ pmd_populate_kernel(&init_mm, pmd,
+ lm_alias(kasan_zero_pte));
+ continue;
+ }
if (p4d_none(*p4d)) {
p4d_populate(&init_mm, p4d,
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index beee0e980e2d..04bb1d3eb9ec 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -107,7 +107,7 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info)
return bug_type;
}
-const char *get_wild_bug_type(struct kasan_access_info *info)
+static const char *get_wild_bug_type(struct kasan_access_info *info)
{
const char *bug_type = "unknown-crash";
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index df4ebdb2b10a..c01f177a1120 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -816,7 +816,8 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
static bool hugepage_vma_check(struct vm_area_struct *vma)
{
if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
- (vma->vm_flags & VM_NOHUGEPAGE))
+ (vma->vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
return false;
if (shmem_file(vma->vm_file)) {
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 234676e31edd..7a40fa2be858 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -117,6 +117,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
l = list_lru_from_kmem(nlru, item);
list_add_tail(item, &l->list);
l->nr_items++;
+ nlru->nr_items++;
spin_unlock(&nlru->lock);
return true;
}
@@ -136,6 +137,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
l = list_lru_from_kmem(nlru, item);
list_del_init(item);
l->nr_items--;
+ nlru->nr_items--;
spin_unlock(&nlru->lock);
return true;
}
@@ -183,15 +185,10 @@ EXPORT_SYMBOL_GPL(list_lru_count_one);
unsigned long list_lru_count_node(struct list_lru *lru, int nid)
{
- long count = 0;
- int memcg_idx;
+ struct list_lru_node *nlru;
- count += __list_lru_count_one(lru, nid, -1);
- if (list_lru_memcg_aware(lru)) {
- for_each_memcg_cache_index(memcg_idx)
- count += __list_lru_count_one(lru, nid, memcg_idx);
- }
- return count;
+ nlru = &lru->node[nid];
+ return nlru->nr_items;
}
EXPORT_SYMBOL_GPL(list_lru_count_node);
@@ -226,6 +223,7 @@ restart:
assert_spin_locked(&nlru->lock);
case LRU_REMOVED:
isolated++;
+ nlru->nr_items--;
/*
* If the lru lock has been dropped, our list
* traversal is now invalid and so we have to
diff --git a/mm/madvise.c b/mm/madvise.c
index 25b78ee4fc2c..9976852f1e1c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -205,7 +205,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
continue;
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
- vma, index);
+ vma, index, false);
if (page)
put_page(page);
}
@@ -246,7 +246,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
}
swap = radix_to_swp_entry(page);
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
- NULL, 0);
+ NULL, 0, false);
if (page)
put_page(page);
}
@@ -451,9 +451,6 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
- if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
- return -EINVAL;
-
/* MADV_FREE works for only anon vma at the moment */
if (!vma_is_anonymous(vma))
return -EINVAL;
@@ -477,14 +474,6 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
return 0;
}
-static long madvise_free(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
-{
- *prev = vma;
- return madvise_free_single_vma(vma, start, end);
-}
-
/*
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
@@ -504,9 +493,17 @@ static long madvise_free(struct vm_area_struct *vma,
* An interface that causes the system to free clean pages and flush
* dirty pages is already available as msync(MS_INVALIDATE).
*/
-static long madvise_dontneed(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ zap_page_range(vma, start, end - start);
+ return 0;
+}
+
+static long madvise_dontneed_free(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ int behavior)
{
*prev = vma;
if (!can_madv_dontneed_vma(vma))
@@ -526,7 +523,8 @@ static long madvise_dontneed(struct vm_area_struct *vma,
* is also < vma->vm_end. If start <
* vma->vm_start it means an hole materialized
* in the user address space within the
- * virtual range passed to MADV_DONTNEED.
+ * virtual range passed to MADV_DONTNEED
+ * or MADV_FREE.
*/
return -ENOMEM;
}
@@ -537,7 +535,7 @@ static long madvise_dontneed(struct vm_area_struct *vma,
* Don't fail if end > vma->vm_end. If the old
* vma was splitted while the mmap_sem was
* released the effect of the concurrent
- * operation may not cause MADV_DONTNEED to
+ * operation may not cause madvise() to
* have an undefined result. There may be an
* adjacent next vma that we'll walk
* next. userfaultfd_remove() will generate an
@@ -549,8 +547,13 @@ static long madvise_dontneed(struct vm_area_struct *vma,
}
VM_WARN_ON(start >= end);
}
- zap_page_range(vma, start, end - start);
- return 0;
+
+ if (behavior == MADV_DONTNEED)
+ return madvise_dontneed_single_vma(vma, start, end);
+ else if (behavior == MADV_FREE)
+ return madvise_free_single_vma(vma, start, end);
+ else
+ return -EINVAL;
}
/*
@@ -656,9 +659,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
case MADV_FREE:
- return madvise_free(vma, prev, start, end);
case MADV_DONTNEED:
- return madvise_dontneed(vma, prev, start, end);
+ return madvise_dontneed_free(vma, prev, start, end, behavior);
default:
return madvise_behavior(vma, prev, start, end, behavior);
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 425aa0caa712..3df3c04d73ab 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -631,7 +631,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
val = __this_cpu_read(memcg->stat->nr_page_events);
next = __this_cpu_read(memcg->stat->targets[target]);
/* from time_after() in jiffies.h */
- if ((long)next - (long)val < 0) {
+ if ((long)(next - val) < 0) {
switch (target) {
case MEM_CGROUP_TARGET_THRESH:
next = val + THRESHOLDS_EVENTS_TARGET;
@@ -5317,38 +5317,52 @@ struct cgroup_subsys memory_cgrp_subsys = {
/**
* mem_cgroup_low - check if memory consumption is below the normal range
- * @root: the highest ancestor to consider
+ * @root: the top ancestor of the sub-tree being checked
* @memcg: the memory cgroup to check
*
* Returns %true if memory consumption of @memcg, and that of all
- * configurable ancestors up to @root, is below the normal range.
+ * ancestors up to (but not including) @root, is below the normal range.
+ *
+ * @root is exclusive; it is never low when looked at directly and isn't
+ * checked when traversing the hierarchy.
+ *
+ * Excluding @root enables using memory.low to prioritize memory usage
+ * between cgroups within a subtree of the hierarchy that is limited by
+ * memory.high or memory.max.
+ *
+ * For example, given cgroup A with children B and C:
+ *
+ * A
+ * / \
+ * B C
+ *
+ * and
+ *
+ * 1. A/memory.current > A/memory.high
+ * 2. A/B/memory.current < A/B/memory.low
+ * 3. A/C/memory.current >= A/C/memory.low
+ *
+ * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we
+ * should reclaim from 'C' until 'A' is no longer high or until we can
+ * no longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by
+ * mem_cgroup_low when reclaiming from 'A', then 'B' won't be considered
+ * low and we will reclaim indiscriminately from both 'B' and 'C'.
*/
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
{
if (mem_cgroup_disabled())
return false;
- /*
- * The toplevel group doesn't have a configurable range, so
- * it's never low when looked at directly, and it is not
- * considered an ancestor when assessing the hierarchy.
- */
-
- if (memcg == root_mem_cgroup)
- return false;
-
- if (page_counter_read(&memcg->memory) >= memcg->low)
+ if (!root)
+ root = root_mem_cgroup;
+ if (memcg == root)
return false;
- while (memcg != root) {
- memcg = parent_mem_cgroup(memcg);
-
- if (memcg == root_mem_cgroup)
- break;
-
+ for (; memcg != root; memcg = parent_mem_cgroup(memcg)) {
if (page_counter_read(&memcg->memory) >= memcg->low)
return false;
}
+
return true;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index dbe3e50c9aa5..1cd3b3569af8 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -49,7 +49,6 @@
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
-#include <linux/page-isolation.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
@@ -555,6 +554,39 @@ static int delete_from_lru_cache(struct page *p)
return -EIO;
}
+static int truncate_error_page(struct page *p, unsigned long pfn,
+ struct address_space *mapping)
+{
+ int ret = MF_FAILED;
+
+ if (mapping->a_ops->error_remove_page) {
+ int err = mapping->a_ops->error_remove_page(mapping, p);
+
+ if (err != 0) {
+ pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
+ pfn, err);
+ } else if (page_has_private(p) &&
+ !try_to_release_page(p, GFP_NOIO)) {
+ pr_info("Memory failure: %#lx: failed to release buffers\n",
+ pfn);
+ } else {
+ ret = MF_RECOVERED;
+ }
+ } else {
+ /*
+ * If the file system doesn't support it, just invalidate.
+ * This fails on dirty or anything with private pages.
+ */
+ if (invalidate_inode_page(p))
+ ret = MF_RECOVERED;
+ else
+ pr_info("Memory failure: %#lx: Failed to invalidate\n",
+ pfn);
+ }
+
+ return ret;
+}
+
/*
* Error hit kernel page.
* Do nothing, try to be lucky and not touch this instead. For a few cases we
@@ -579,8 +611,6 @@ static int me_unknown(struct page *p, unsigned long pfn)
*/
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
- int err;
- int ret = MF_FAILED;
struct address_space *mapping;
delete_from_lru_cache(p);
@@ -612,30 +642,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
*
* Open: to take i_mutex or not for this? Right now we don't.
*/
- if (mapping->a_ops->error_remove_page) {
- err = mapping->a_ops->error_remove_page(mapping, p);
- if (err != 0) {
- pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
- pfn, err);
- } else if (page_has_private(p) &&
- !try_to_release_page(p, GFP_NOIO)) {
- pr_info("Memory failure: %#lx: failed to release buffers\n",
- pfn);
- } else {
- ret = MF_RECOVERED;
- }
- } else {
- /*
- * If the file system doesn't support it just invalidate
- * This fails on dirty or anything with private pages
- */
- if (invalidate_inode_page(p))
- ret = MF_RECOVERED;
- else
- pr_info("Memory failure: %#lx: Failed to invalidate\n",
- pfn);
- }
- return ret;
+ return truncate_error_page(p, pfn, mapping);
}
/*
@@ -741,24 +748,29 @@ static int me_huge_page(struct page *p, unsigned long pfn)
{
int res = 0;
struct page *hpage = compound_head(p);
+ struct address_space *mapping;
if (!PageHuge(hpage))
return MF_DELAYED;
- /*
- * We can safely recover from error on free or reserved (i.e.
- * not in-use) hugepage by dequeuing it from freelist.
- * To check whether a hugepage is in-use or not, we can't use
- * page->lru because it can be used in other hugepage operations,
- * such as __unmap_hugepage_range() and gather_surplus_pages().
- * So instead we use page_mapping() and PageAnon().
- */
- if (!(page_mapping(hpage) || PageAnon(hpage))) {
- res = dequeue_hwpoisoned_huge_page(hpage);
- if (!res)
- return MF_RECOVERED;
+ mapping = page_mapping(hpage);
+ if (mapping) {
+ res = truncate_error_page(hpage, pfn, mapping);
+ } else {
+ unlock_page(hpage);
+ /*
+ * migration entry prevents later access on error anonymous
+ * hugepage, so we can free and dissolve it into buddy to
+ * save healthy subpages.
+ */
+ if (PageAnon(hpage))
+ put_page(hpage);
+ dissolve_free_huge_page(p);
+ res = MF_RECOVERED;
+ lock_page(hpage);
}
- return MF_DELAYED;
+
+ return res;
}
/*
@@ -857,7 +869,7 @@ static int page_action(struct page_state *ps, struct page *p,
count = page_count(p) - 1;
if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
count--;
- if (count != 0) {
+ if (count > 0) {
pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
pfn, action_page_types[ps->type], count);
result = MF_FAILED;
@@ -1010,20 +1022,84 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
return unmap_success;
}
-static void set_page_hwpoison_huge_page(struct page *hpage)
+static int identify_page_state(unsigned long pfn, struct page *p,
+ unsigned long page_flags)
{
- int i;
- int nr_pages = 1 << compound_order(hpage);
- for (i = 0; i < nr_pages; i++)
- SetPageHWPoison(hpage + i);
+ struct page_state *ps;
+
+ /*
+ * The first check uses the current page flags which may not have any
+ * relevant information. The second check with the saved page flags is
+ * carried out only if the first check can't determine the page status.
+ */
+ for (ps = error_states;; ps++)
+ if ((p->flags & ps->mask) == ps->res)
+ break;
+
+ page_flags |= (p->flags & (1UL << PG_dirty));
+
+ if (!ps->mask)
+ for (ps = error_states;; ps++)
+ if ((page_flags & ps->mask) == ps->res)
+ break;
+ return page_action(ps, p, pfn);
}
-static void clear_page_hwpoison_huge_page(struct page *hpage)
+static int memory_failure_hugetlb(unsigned long pfn, int trapno, int flags)
{
- int i;
- int nr_pages = 1 << compound_order(hpage);
- for (i = 0; i < nr_pages; i++)
- ClearPageHWPoison(hpage + i);
+ struct page *p = pfn_to_page(pfn);
+ struct page *head = compound_head(p);
+ int res;
+ unsigned long page_flags;
+
+ if (TestSetPageHWPoison(head)) {
+ pr_err("Memory failure: %#lx: already hardware poisoned\n",
+ pfn);
+ return 0;
+ }
+
+ num_poisoned_pages_inc();
+
+ if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
+ /*
+ * Check "filter hit" and "race with other subpage."
+ */
+ lock_page(head);
+ if (PageHWPoison(head)) {
+ if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != head && TestSetPageHWPoison(head))) {
+ num_poisoned_pages_dec();
+ unlock_page(head);
+ return 0;
+ }
+ }
+ unlock_page(head);
+ dissolve_free_huge_page(p);
+ action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED);
+ return 0;
+ }
+
+ lock_page(head);
+ page_flags = head->flags;
+
+ if (!PageHWPoison(head)) {
+ pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
+ num_poisoned_pages_dec();
+ unlock_page(head);
+ put_hwpoison_page(head);
+ return 0;
+ }
+
+ if (!hwpoison_user_mappings(p, pfn, trapno, flags, &head)) {
+ action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
+ res = -EBUSY;
+ goto out;
+ }
+
+ res = identify_page_state(pfn, p, page_flags);
+out:
+ unlock_page(head);
+ return res;
}
/**
@@ -1046,12 +1122,10 @@ static void clear_page_hwpoison_huge_page(struct page *hpage)
*/
int memory_failure(unsigned long pfn, int trapno, int flags)
{
- struct page_state *ps;
struct page *p;
struct page *hpage;
struct page *orig_head;
int res;
- unsigned int nr_pages;
unsigned long page_flags;
if (!sysctl_memory_failure_recovery)
@@ -1064,34 +1138,22 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
p = pfn_to_page(pfn);
- orig_head = hpage = compound_head(p);
+ if (PageHuge(p))
+ return memory_failure_hugetlb(pfn, trapno, flags);
if (TestSetPageHWPoison(p)) {
pr_err("Memory failure: %#lx: already hardware poisoned\n",
pfn);
return 0;
}
- /*
- * Currently errors on hugetlbfs pages are measured in hugepage units,
- * so nr_pages should be 1 << compound_order. OTOH when errors are on
- * transparent hugepages, they are supposed to be split and error
- * measurement is done in normal page units. So nr_pages should be one
- * in this case.
- */
- if (PageHuge(p))
- nr_pages = 1 << compound_order(hpage);
- else /* normal page or thp */
- nr_pages = 1;
- num_poisoned_pages_add(nr_pages);
+ orig_head = hpage = compound_head(p);
+ num_poisoned_pages_inc();
/*
* We need/can do nothing about count=0 pages.
* 1) it's a free page, and therefore in safe hand:
* prep_new_page() will be the gate keeper.
- * 2) it's a free hugepage, which is also safe:
- * an affected hugepage will be dequeued from hugepage freelist,
- * so there's no concern about reusing it ever after.
- * 3) it's part of a non-compound high order page.
+ * 2) it's part of a non-compound high order page.
* Implies some kernel user: cannot stop them from
* R/W the page; let's pray that the page has been
* used and will be freed some time later.
@@ -1102,32 +1164,13 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
if (is_free_buddy_page(p)) {
action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
return 0;
- } else if (PageHuge(hpage)) {
- /*
- * Check "filter hit" and "race with other subpage."
- */
- lock_page(hpage);
- if (PageHWPoison(hpage)) {
- if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
- || (p != hpage && TestSetPageHWPoison(hpage))) {
- num_poisoned_pages_sub(nr_pages);
- unlock_page(hpage);
- return 0;
- }
- }
- set_page_hwpoison_huge_page(hpage);
- res = dequeue_hwpoisoned_huge_page(hpage);
- action_result(pfn, MF_MSG_FREE_HUGE,
- res ? MF_IGNORED : MF_DELAYED);
- unlock_page(hpage);
- return res;
} else {
action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
return -EBUSY;
}
}
- if (!PageHuge(p) && PageTransHuge(hpage)) {
+ if (PageTransHuge(hpage)) {
lock_page(p);
if (!PageAnon(p) || unlikely(split_huge_page(p))) {
unlock_page(p);
@@ -1138,7 +1181,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
pr_err("Memory failure: %#lx: thp split failed\n",
pfn);
if (TestClearPageHWPoison(p))
- num_poisoned_pages_sub(nr_pages);
+ num_poisoned_pages_dec();
put_hwpoison_page(p);
return -EBUSY;
}
@@ -1165,7 +1208,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
return 0;
}
- lock_page(hpage);
+ lock_page(p);
/*
* The page could have changed compound pages during the locking.
@@ -1194,42 +1237,23 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
*/
if (!PageHWPoison(p)) {
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
- num_poisoned_pages_sub(nr_pages);
- unlock_page(hpage);
- put_hwpoison_page(hpage);
+ num_poisoned_pages_dec();
+ unlock_page(p);
+ put_hwpoison_page(p);
return 0;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
- num_poisoned_pages_sub(nr_pages);
- unlock_page(hpage);
- put_hwpoison_page(hpage);
+ num_poisoned_pages_dec();
+ unlock_page(p);
+ put_hwpoison_page(p);
return 0;
}
- if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
+ if (!PageTransTail(p) && !PageLRU(p))
goto identify_page_state;
/*
- * For error on the tail page, we should set PG_hwpoison
- * on the head page to show that the hugepage is hwpoisoned
- */
- if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
- action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
- unlock_page(hpage);
- put_hwpoison_page(hpage);
- return 0;
- }
- /*
- * Set PG_hwpoison on all pages in an error hugepage,
- * because containment is done in hugepage unit for now.
- * Since we have done TestSetPageHWPoison() for the head page with
- * page lock held, we can safely set PG_hwpoison bits on tail pages.
- */
- if (PageHuge(p))
- set_page_hwpoison_huge_page(hpage);
-
- /*
* It's very difficult to mess with pages currently under IO
* and in many cases impossible, so we just avoid it here.
*/
@@ -1258,25 +1282,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
identify_page_state:
- res = -EBUSY;
- /*
- * The first check uses the current page flags which may not have any
- * relevant information. The second check with the saved page flagss is
- * carried out only if the first check can't determine the page status.
- */
- for (ps = error_states;; ps++)
- if ((p->flags & ps->mask) == ps->res)
- break;
-
- page_flags |= (p->flags & (1UL << PG_dirty));
-
- if (!ps->mask)
- for (ps = error_states;; ps++)
- if ((page_flags & ps->mask) == ps->res)
- break;
- res = page_action(ps, p, pfn);
+ res = identify_page_state(pfn, p, page_flags);
out:
- unlock_page(hpage);
+ unlock_page(p);
return res;
}
EXPORT_SYMBOL_GPL(memory_failure);
@@ -1398,7 +1406,6 @@ int unpoison_memory(unsigned long pfn)
struct page *page;
struct page *p;
int freeit = 0;
- unsigned int nr_pages;
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -1443,20 +1450,7 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
- nr_pages = 1 << compound_order(page);
-
if (!get_hwpoison_page(p)) {
- /*
- * Since HWPoisoned hugepage should have non-zero refcount,
- * race between memory failure and unpoison seems to happen.
- * In such case unpoison fails and memory failure runs
- * to the end.
- */
- if (PageHuge(page)) {
- unpoison_pr_info("Unpoison: Memory failure is now running on free hugepage %#lx\n",
- pfn, &unpoison_rs);
- return 0;
- }
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
@@ -1474,10 +1468,8 @@ int unpoison_memory(unsigned long pfn)
if (TestClearPageHWPoison(page)) {
unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
pfn, &unpoison_rs);
- num_poisoned_pages_sub(nr_pages);
+ num_poisoned_pages_dec();
freeit = 1;
- if (PageHuge(page))
- clear_page_hwpoison_huge_page(page);
}
unlock_page(page);
@@ -1492,16 +1484,8 @@ EXPORT_SYMBOL(unpoison_memory);
static struct page *new_page(struct page *p, unsigned long private, int **x)
{
int nid = page_to_nid(p);
- if (PageHuge(p)) {
- struct hstate *hstate = page_hstate(compound_head(p));
-
- if (hstate_is_gigantic(hstate))
- return alloc_huge_page_node(hstate, NUMA_NO_NODE);
- return alloc_huge_page_node(hstate, nid);
- } else {
- return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
- }
+ return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
}
/*
@@ -1608,15 +1592,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
if (ret > 0)
ret = -EIO;
} else {
- /* overcommit hugetlb page will be freed to buddy */
- if (PageHuge(page)) {
- set_page_hwpoison_huge_page(hpage);
- dequeue_hwpoisoned_huge_page(hpage);
- num_poisoned_pages_add(1 << compound_order(hpage));
- } else {
- SetPageHWPoison(page);
- num_poisoned_pages_inc();
- }
+ if (PageHuge(page))
+ dissolve_free_huge_page(page);
}
return ret;
}
@@ -1732,15 +1709,12 @@ static int soft_offline_in_use_page(struct page *page, int flags)
static void soft_offline_free_page(struct page *page)
{
- if (PageHuge(page)) {
- struct page *hpage = compound_head(page);
+ struct page *head = compound_head(page);
- set_page_hwpoison_huge_page(hpage);
- if (!dequeue_hwpoisoned_huge_page(hpage))
- num_poisoned_pages_add(1 << compound_order(hpage));
- } else {
- if (!TestSetPageHWPoison(page))
- num_poisoned_pages_inc();
+ if (!TestSetPageHWPoison(head)) {
+ num_poisoned_pages_inc();
+ if (PageHuge(head))
+ dissolve_free_huge_page(page);
}
}
diff --git a/mm/memory.c b/mm/memory.c
index e31dd97e6114..0e517be91a89 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3262,14 +3262,14 @@ static int fault_around_bytes_set(void *data, u64 val)
fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops,
+DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
static int __init fault_around_debugfs(void)
{
void *ret;
- ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL,
+ ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
&fault_around_bytes_fops);
if (!ret)
pr_warn("Failed to create fault_around_bytes in debugfs");
@@ -3591,7 +3591,7 @@ out:
return 0;
}
-static int create_huge_pmd(struct vm_fault *vmf)
+static inline int create_huge_pmd(struct vm_fault *vmf)
{
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_anonymous_page(vmf);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f79aac7a12b5..8dccc317aac2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -52,32 +52,17 @@ static void generic_online_page(struct page *page);
static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);
-/* The same as the cpu_hotplug lock, but for memory hotplug. */
-static struct {
- struct task_struct *active_writer;
- struct mutex lock; /* Synchronizes accesses to refcount, */
- /*
- * Also blocks the new readers during
- * an ongoing mem hotplug operation.
- */
- int refcount;
+DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
-} mem_hotplug = {
- .active_writer = NULL,
- .lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
- .refcount = 0,
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- .dep_map = {.name = "mem_hotplug.lock" },
-#endif
-};
+void get_online_mems(void)
+{
+ percpu_down_read(&mem_hotplug_lock);
+}
-/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
-#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
-#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
-#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
+void put_online_mems(void)
+{
+ percpu_up_read(&mem_hotplug_lock);
+}
bool movable_node_enabled = false;
@@ -99,60 +84,16 @@ static int __init setup_memhp_default_state(char *str)
}
__setup("memhp_default_state=", setup_memhp_default_state);
-void get_online_mems(void)
-{
- might_sleep();
- if (mem_hotplug.active_writer == current)
- return;
- memhp_lock_acquire_read();
- mutex_lock(&mem_hotplug.lock);
- mem_hotplug.refcount++;
- mutex_unlock(&mem_hotplug.lock);
-
-}
-
-void put_online_mems(void)
-{
- if (mem_hotplug.active_writer == current)
- return;
- mutex_lock(&mem_hotplug.lock);
-
- if (WARN_ON(!mem_hotplug.refcount))
- mem_hotplug.refcount++; /* try to fix things up */
-
- if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
- wake_up_process(mem_hotplug.active_writer);
- mutex_unlock(&mem_hotplug.lock);
- memhp_lock_release();
-
-}
-
-/* Serializes write accesses to mem_hotplug.active_writer. */
-static DEFINE_MUTEX(memory_add_remove_lock);
-
void mem_hotplug_begin(void)
{
- mutex_lock(&memory_add_remove_lock);
-
- mem_hotplug.active_writer = current;
-
- memhp_lock_acquire();
- for (;;) {
- mutex_lock(&mem_hotplug.lock);
- if (likely(!mem_hotplug.refcount))
- break;
- __set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&mem_hotplug.lock);
- schedule();
- }
+ cpus_read_lock();
+ percpu_down_write(&mem_hotplug_lock);
}
void mem_hotplug_done(void)
{
- mem_hotplug.active_writer = NULL;
- mutex_unlock(&mem_hotplug.lock);
- memhp_lock_release();
- mutex_unlock(&memory_add_remove_lock);
+ percpu_up_write(&mem_hotplug_lock);
+ cpus_read_unlock();
}
/* add this memory to iomem resource */
@@ -580,11 +521,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nr_pages = PAGES_PER_SECTION;
- int zone_type;
unsigned long flags;
- zone_type = zone - pgdat->node_zones;
-
pgdat_resize_lock(zone->zone_pgdat, &flags);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
@@ -934,6 +872,19 @@ struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
return &pgdat->node_zones[ZONE_NORMAL];
}
+static inline bool movable_pfn_range(int nid, struct zone *default_zone,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ if (!allow_online_pfn_range(nid, start_pfn, nr_pages,
+ MMOP_ONLINE_KERNEL))
+ return true;
+
+ if (!movable_node_is_enabled())
+ return false;
+
+ return !zone_intersects(default_zone, start_pfn, nr_pages);
+}
+
/*
* Associates the given pfn range with the given node and the zone appropriate
* for the given online type.
@@ -949,10 +900,10 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid,
/*
* MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use
* movable zone if that is not possible (e.g. we are within
- * or past the existing movable zone)
+ * or past the existing movable zone). movable_node overrides
+ * this default and defaults to movable zone
*/
- if (!allow_online_pfn_range(nid, start_pfn, nr_pages,
- MMOP_ONLINE_KERNEL))
+ if (movable_pfn_range(nid, zone, start_pfn, nr_pages))
zone = movable_zone;
} else if (online_type == MMOP_ONLINE_MOVABLE) {
zone = &pgdat->node_zones[ZONE_MOVABLE];
@@ -1268,7 +1219,7 @@ register_fail:
error:
/* rollback pgdat allocation and others */
- if (new_pgdat)
+ if (new_pgdat && pgdat)
rollback_node_hotadd(nid, pgdat);
memblock_remove(start, size);
@@ -1420,32 +1371,19 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
static struct page *new_node_page(struct page *page, unsigned long private,
int **result)
{
- gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
int nid = page_to_nid(page);
nodemask_t nmask = node_states[N_MEMORY];
- struct page *new_page = NULL;
/*
- * TODO: allocate a destination hugepage from a nearest neighbor node,
- * accordance with memory policy of the user process if possible. For
- * now as a simple work-around, we use the next node for destination.
+ * try to allocate from a different node but reuse this node if there
+ * are no other online nodes to be used (e.g. we are offlining a part
+ * of the only existing node)
*/
- if (PageHuge(page))
- return alloc_huge_page_node(page_hstate(compound_head(page)),
- next_node_in(nid, nmask));
-
node_clear(nid, nmask);
+ if (nodes_empty(nmask))
+ node_set(nid, nmask);
- if (PageHighMem(page)
- || (zone_idx(page_zone(page)) == ZONE_MOVABLE))
- gfp_mask |= __GFP_HIGHMEM;
-
- if (!nodes_empty(nmask))
- new_page = __alloc_pages_nodemask(gfp_mask, 0, nid, &nmask);
- if (!new_page)
- new_page = __alloc_pages(gfp_mask, 0, nid);
-
- return new_page;
+ return new_page_nodemask(page, nid, &nmask);
}
#define NR_OFFLINE_AT_ONCE_PAGES (256)
@@ -1728,7 +1666,7 @@ repeat:
goto failed_removal;
ret = 0;
if (drain) {
- lru_add_drain_all();
+ lru_add_drain_all_cpuslocked();
cond_resched();
drain_all_pages(zone);
}
@@ -1749,7 +1687,7 @@ repeat:
}
}
/* drain all zone's lru pagevec, this is asynchronous... */
- lru_add_drain_all();
+ lru_add_drain_all_cpuslocked();
yield();
/* drain pcp pages, this is synchronous. */
drain_all_pages(zone);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7d8e56214ac0..d911fa5cb2a7 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1078,7 +1078,8 @@ static struct page *new_page(struct page *page, unsigned long start, int **x)
/*
* if !vma, alloc_page_vma() will use task or system default policy
*/
- return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
+ vma, address);
}
#else
diff --git a/mm/migrate.c b/mm/migrate.c
index 051cc1555d36..627671551873 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1252,6 +1252,8 @@ put_anon:
out:
if (rc != -EAGAIN)
putback_active_hugepage(hpage);
+ if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage))
+ num_poisoned_pages_inc();
/*
* If migration was not successful and there's a freeing callback, use
@@ -1914,7 +1916,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
int page_lru = page_is_file_cache(page);
unsigned long mmun_start = address & HPAGE_PMD_MASK;
unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
- pmd_t orig_entry;
/*
* Rate-limit the amount of data that is being migrated to a node.
@@ -1957,8 +1958,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
/* Recheck the target PMD */
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
-fail_putback:
+ if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1980,7 +1980,6 @@ fail_putback:
goto out_unlock;
}
- orig_entry = *pmd;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1997,15 +1996,7 @@ fail_putback:
set_pmd_at(mm, mmun_start, pmd, entry);
update_mmu_cache_pmd(vma, address, &entry);
- if (page_count(page) != 2) {
- set_pmd_at(mm, mmun_start, pmd, orig_entry);
- flush_pmd_tlb_range(vma, mmun_start, mmun_end);
- mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
- update_mmu_cache_pmd(vma, address, &entry);
- page_remove_rmap(new_page, true);
- goto fail_putback;
- }
-
+ page_ref_unfreeze(page, 2);
mlock_migrate_page(new_page, page);
page_remove_rmap(page, true);
set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
diff --git a/mm/mmap.c b/mm/mmap.c
index 7f8cfe9d9b4d..f19efcf75418 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2177,7 +2177,6 @@ static int acct_stack_growth(struct vm_area_struct *vma,
unsigned long size, unsigned long grow)
{
struct mm_struct *mm = vma->vm_mm;
- struct rlimit *rlim = current->signal->rlim;
unsigned long new_start;
/* address space limit tests */
@@ -2185,7 +2184,7 @@ static int acct_stack_growth(struct vm_area_struct *vma,
return -ENOMEM;
/* Stack limit test */
- if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
+ if (size > rlimit(RLIMIT_STACK))
return -ENOMEM;
/* mlock limit tests */
@@ -2193,7 +2192,7 @@ static int acct_stack_growth(struct vm_area_struct *vma,
unsigned long locked;
unsigned long limit;
locked = mm->locked_vm + grow;
- limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
+ limit = rlimit(RLIMIT_MEMLOCK);
limit >>= PAGE_SHIFT;
if (locked > limit && !capable(CAP_IPC_LOCK))
return -ENOMEM;
@@ -2232,7 +2231,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/* Guard against exceeding limits of the address space. */
address &= PAGE_MASK;
- if (address >= TASK_SIZE)
+ if (address >= (TASK_SIZE & PAGE_MASK))
return -ENOMEM;
address += PAGE_SIZE;
@@ -2244,7 +2243,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
gap_addr = TASK_SIZE;
next = vma->vm_next;
- if (next && next->vm_start < gap_addr) {
+ if (next && next->vm_start < gap_addr &&
+ (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
if (!(next->vm_flags & VM_GROWSUP))
return -ENOMEM;
/* Check that both stack segments have the same anon_vma? */
@@ -2315,7 +2315,6 @@ int expand_downwards(struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *prev;
- unsigned long gap_addr;
int error;
address &= PAGE_MASK;
@@ -2324,14 +2323,12 @@ int expand_downwards(struct vm_area_struct *vma,
return error;
/* Enforce stack_guard_gap */
- gap_addr = address - stack_guard_gap;
- if (gap_addr > address)
- return -ENOMEM;
prev = vma->vm_prev;
- if (prev && prev->vm_end > gap_addr) {
- if (!(prev->vm_flags & VM_GROWSDOWN))
+ /* Check that both stack segments have the same anon_vma? */
+ if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
+ (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
+ if (address - prev->vm_end < stack_guard_gap)
return -ENOMEM;
- /* Check that both stack segments have the same anon_vma? */
}
/* We must make sure the anon_vma is allocated. */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0e2c925e7826..9e8b4f030c1c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -490,6 +490,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
if (!down_read_trylock(&mm->mmap_sem)) {
ret = false;
+ trace_skip_task_reaping(tsk->pid);
goto unlock_oom;
}
@@ -500,9 +501,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
*/
if (!mmget_not_zero(mm)) {
up_read(&mm->mmap_sem);
+ trace_skip_task_reaping(tsk->pid);
goto unlock_oom;
}
+ trace_start_task_reaping(tsk->pid);
+
/*
* Tell all users of get_user/copy_from_user etc... that the content
* is no longer stable. No barriers really needed because unmapping
@@ -544,6 +548,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* put the oom_reaper out of the way.
*/
mmput_async(mm);
+ trace_finish_task_reaping(tsk->pid);
unlock_oom:
mutex_unlock(&oom_lock);
return ret;
@@ -615,6 +620,7 @@ static void wake_oom_reaper(struct task_struct *tsk)
tsk->oom_reaper_list = oom_reaper_list;
oom_reaper_list = tsk;
spin_unlock(&oom_reaper_lock);
+ trace_wake_reaper(tsk->pid);
wake_up(&oom_reaper_wait);
}
@@ -666,6 +672,7 @@ static void mark_oom_victim(struct task_struct *tsk)
*/
__thaw_task(tsk);
atomic_inc(&oom_victims);
+ trace_mark_victim(tsk->pid);
}
/**
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b60cc7ddac2..96e93b214d31 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -601,7 +601,7 @@ static inline void __wb_writeout_inc(struct bdi_writeback *wb)
{
struct wb_domain *cgdom;
- __inc_wb_stat(wb, WB_WRITTEN);
+ inc_wb_stat(wb, WB_WRITTEN);
wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
wb->bdi->max_prop_frac);
@@ -2435,8 +2435,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
__inc_lruvec_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
__inc_node_page_state(page, NR_DIRTIED);
- __inc_wb_stat(wb, WB_RECLAIMABLE);
- __inc_wb_stat(wb, WB_DIRTIED);
+ inc_wb_stat(wb, WB_RECLAIMABLE);
+ inc_wb_stat(wb, WB_DIRTIED);
task_io_account_write(PAGE_SIZE);
current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits);
@@ -2741,7 +2741,7 @@ int test_clear_page_writeback(struct page *page)
if (bdi_cap_account_writeback(bdi)) {
struct bdi_writeback *wb = inode_to_wb(inode);
- __dec_wb_stat(wb, WB_WRITEBACK);
+ dec_wb_stat(wb, WB_WRITEBACK);
__wb_writeout_inc(wb);
}
}
@@ -2786,7 +2786,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi))
- __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
+ inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
/*
* We can come through here when swapping anonymous
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bd65b60939b6..6d30e914afb6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2206,19 +2206,26 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* list of requested migratetype, possibly along with other pages from the same
* block, depending on fragmentation avoidance heuristics. Returns true if
* fallback was found so that __rmqueue_smallest() can grab it.
+ *
+ * The use of signed ints for order and current_order is a deliberate
+ * deviation from the rest of this file, to make the for loop
+ * condition simpler.
*/
static inline bool
-__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
struct free_area *area;
- unsigned int current_order;
+ int current_order;
struct page *page;
int fallback_mt;
bool can_steal;
- /* Find the largest possible block of pages in the other list */
- for (current_order = MAX_ORDER-1;
- current_order >= order && current_order <= MAX_ORDER-1;
+ /*
+ * Find the largest available free page in the other list. This roughly
+ * approximates finding the pageblock with the most free pages, which
+ * would be too costly to do exactly.
+ */
+ for (current_order = MAX_ORDER - 1; current_order >= order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -2226,19 +2233,50 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
if (fallback_mt == -1)
continue;
- page = list_first_entry(&area->free_list[fallback_mt],
- struct page, lru);
+ /*
+ * We cannot steal all free pages from the pageblock and the
+ * requested migratetype is movable. In that case it's better to
+ * steal and split the smallest available page instead of the
+ * largest available page, because even if the next movable
+ * allocation falls back into a different pageblock than this
+ * one, it won't cause permanent fragmentation.
+ */
+ if (!can_steal && start_migratetype == MIGRATE_MOVABLE
+ && current_order > order)
+ goto find_smallest;
- steal_suitable_fallback(zone, page, start_migratetype,
- can_steal);
+ goto do_steal;
+ }
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, fallback_mt);
+ return false;
- return true;
+find_smallest:
+ for (current_order = order; current_order < MAX_ORDER;
+ current_order++) {
+ area = &(zone->free_area[current_order]);
+ fallback_mt = find_suitable_fallback(area, current_order,
+ start_migratetype, false, &can_steal);
+ if (fallback_mt != -1)
+ break;
}
- return false;
+ /*
+ * This should not happen - we already found a suitable fallback
+ * when looking for the largest page.
+ */
+ VM_BUG_ON(current_order == MAX_ORDER);
+
+do_steal:
+ page = list_first_entry(&area->free_list[fallback_mt],
+ struct page, lru);
+
+ steal_suitable_fallback(zone, page, start_migratetype, can_steal);
+
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+
+ return true;
+
}
/*
@@ -3246,6 +3284,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/* The OOM killer will not help higher order allocs */
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto out;
+ /*
+ * We have already exhausted all our reclaim opportunities without any
+ * success so it is time to admit defeat. We will skip the OOM killer
+ * because it is very likely that the caller has a more reasonable
+ * fallback than shooting a random task.
+ */
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
+ goto out;
/* The OOM killer does not needlessly kill tasks for lowmem */
if (ac->high_zoneidx < ZONE_NORMAL)
goto out;
@@ -3375,7 +3421,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
}
/*
- * !costly requests are much more important than __GFP_REPEAT
+ * !costly requests are much more important than __GFP_RETRY_MAYFAIL
* costly ones because they are de facto nofail and invoke OOM
* killer to move on while costly can fail and users are ready
* to cope with that. 1/4 retries is rather arbitrary but we
@@ -3882,9 +3928,9 @@ retry:
/*
* Do not retry costly high order allocations unless they are
- * __GFP_REPEAT
+ * __GFP_RETRY_MAYFAIL
*/
- if (costly_order && !(gfp_mask & __GFP_REPEAT))
+ if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
goto nopage;
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
@@ -5240,7 +5286,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
#endif
/* we have to stop all cpus to guarantee there is no user
of zonelist */
- stop_machine(__build_all_zonelists, pgdat, NULL);
+ stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
diff --git a/mm/page_io.c b/mm/page_io.c
index 2da71e627812..b6c4ac388209 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -117,6 +117,7 @@ static void swap_slot_free_notify(struct page *page)
static void end_swap_bio_read(struct bio *bio)
{
struct page *page = bio->bi_io_vec[0].bv_page;
+ struct task_struct *waiter = bio->bi_private;
if (bio->bi_status) {
SetPageError(page);
@@ -132,7 +133,9 @@ static void end_swap_bio_read(struct bio *bio)
swap_slot_free_notify(page);
out:
unlock_page(page);
+ WRITE_ONCE(bio->bi_private, NULL);
bio_put(bio);
+ wake_up_process(waiter);
}
int generic_swapfile_activate(struct swap_info_struct *sis,
@@ -329,11 +332,13 @@ out:
return ret;
}
-int swap_readpage(struct page *page)
+int swap_readpage(struct page *page, bool do_poll)
{
struct bio *bio;
int ret = 0;
struct swap_info_struct *sis = page_swap_info(page);
+ blk_qc_t qc;
+ struct block_device *bdev;
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -372,9 +377,23 @@ int swap_readpage(struct page *page)
ret = -ENOMEM;
goto out;
}
+ bdev = bio->bi_bdev;
+ bio->bi_private = current;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
count_vm_event(PSWPIN);
- submit_bio(bio);
+ bio_get(bio);
+ qc = submit_bio(bio);
+ while (do_poll) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!READ_ONCE(bio->bi_private))
+ break;
+
+ if (!blk_mq_poll(bdev_get_queue(bdev), qc))
+ break;
+ }
+ __set_current_state(TASK_RUNNING);
+ bio_put(bio);
+
out:
return ret;
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 3606104893e0..757410d9f758 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -8,6 +8,7 @@
#include <linux/memory.h>
#include <linux/hugetlb.h>
#include <linux/page_owner.h>
+#include <linux/migrate.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -294,20 +295,5 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
struct page *alloc_migrate_target(struct page *page, unsigned long private,
int **resultp)
{
- gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
-
- /*
- * TODO: allocate a destination hugepage from a nearest neighbor node,
- * accordance with memory policy of the user process if possible. For
- * now as a simple work-around, we use the next node for destination.
- */
- if (PageHuge(page))
- return alloc_huge_page_node(page_hstate(compound_head(page)),
- next_node_in(page_to_nid(page),
- node_online_map));
-
- if (PageHighMem(page))
- gfp_mask |= __GFP_HIGHMEM;
-
- return alloc_page(gfp_mask);
+ return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]);
}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 60634dc53a88..0fd9dcf2c5dc 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -281,7 +281,11 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
continue;
if (PageBuddy(page)) {
- pfn += (1UL << page_order(page)) - 1;
+ unsigned long freepage_order;
+
+ freepage_order = page_order_unsafe(page);
+ if (freepage_order < MAX_ORDER)
+ pfn += (1UL << freepage_order) - 1;
continue;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 9418f5a9bc46..b0aa6075d164 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1977,10 +1977,12 @@ static int shmem_fault(struct vm_fault *vmf)
}
sgp = SGP_CACHE;
- if (vma->vm_flags & VM_HUGEPAGE)
- sgp = SGP_HUGE;
- else if (vma->vm_flags & VM_NOHUGEPAGE)
+
+ if ((vma->vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
sgp = SGP_NOHUGE;
+ else if (vma->vm_flags & VM_HUGEPAGE)
+ sgp = SGP_HUGE;
error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
gfp, vma, vmf, &ret);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a56c3989f773..c50b1a14d55e 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -56,11 +56,11 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
if (node_state(node, N_HIGH_MEMORY))
page = alloc_pages_node(
- node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
+ node, GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
get_order(size));
else
page = alloc_pages(
- GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
+ GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
get_order(size));
if (page)
return page_address(page);
diff --git a/mm/swap.c b/mm/swap.c
index 4f44dbd7f780..60b1d2a75852 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -688,7 +688,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
-void lru_add_drain_all(void)
+void lru_add_drain_all_cpuslocked(void)
{
static DEFINE_MUTEX(lock);
static struct cpumask has_work;
@@ -702,7 +702,6 @@ void lru_add_drain_all(void)
return;
mutex_lock(&lock);
- get_online_cpus();
cpumask_clear(&has_work);
for_each_online_cpu(cpu) {
@@ -722,10 +721,16 @@ void lru_add_drain_all(void)
for_each_cpu(cpu, &has_work)
flush_work(&per_cpu(lru_add_drain_work, cpu));
- put_online_cpus();
mutex_unlock(&lock);
}
+void lru_add_drain_all(void)
+{
+ get_online_cpus();
+ lru_add_drain_all_cpuslocked();
+ put_online_cpus();
+}
+
/**
* release_pages - batched put_page()
* @pages: array of pages to release
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 90c1032a8ac3..13a174006b91 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -273,11 +273,11 @@ int free_swap_slot(swp_entry_t entry)
{
struct swap_slots_cache *cache;
- cache = &get_cpu_var(swp_slots);
+ cache = raw_cpu_ptr(&swp_slots);
if (use_swap_slot_cache && cache->slots_ret) {
spin_lock_irq(&cache->free_lock);
/* Swap slots cache may be deactivated before acquiring lock */
- if (!use_swap_slot_cache) {
+ if (!use_swap_slot_cache || !cache->slots_ret) {
spin_unlock_irq(&cache->free_lock);
goto direct_free;
}
@@ -297,7 +297,6 @@ int free_swap_slot(swp_entry_t entry)
direct_free:
swapcache_free_entries(&entry, 1);
}
- put_cpu_var(swp_slots);
return 0;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 9c71b6b2562f..b68c93014f50 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -412,14 +412,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* the swap entry is no longer in use.
*/
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_area_struct *vma, unsigned long addr)
+ struct vm_area_struct *vma, unsigned long addr, bool do_poll)
{
bool page_was_allocated;
struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
vma, addr, &page_was_allocated);
if (page_was_allocated)
- swap_readpage(retpage);
+ swap_readpage(retpage, do_poll);
return retpage;
}
@@ -496,11 +496,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
unsigned long start_offset, end_offset;
unsigned long mask;
struct blk_plug plug;
+ bool do_poll = true;
mask = swapin_nr_pages(offset) - 1;
if (!mask)
goto skip;
+ do_poll = false;
/* Read a page_cluster sized and aligned cluster around offset. */
start_offset = offset & ~mask;
end_offset = offset | mask;
@@ -511,7 +513,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
for (offset = start_offset; offset <= end_offset ; offset++) {
/* Ok, do the async read-ahead now */
page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
- gfp_mask, vma, addr);
+ gfp_mask, vma, addr, false);
if (!page)
continue;
if (offset != entry_offset && likely(!PageTransCompound(page)))
@@ -522,7 +524,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
lru_add_drain(); /* Push any new pages onto the LRU now */
skip:
- return read_swap_cache_async(entry, gfp_mask, vma, addr);
+ return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
}
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 811d90e1c929..6ba4aab2db0b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1868,7 +1868,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
swap_map = &si->swap_map[i];
entry = swp_entry(type, i);
page = read_swap_cache_async(entry,
- GFP_HIGHUSER_MOVABLE, NULL, 0);
+ GFP_HIGHUSER_MOVABLE, NULL, 0, false);
if (!page) {
/*
* Either swap_duplicate() failed because entry
diff --git a/mm/truncate.c b/mm/truncate.c
index 6479ed2afc53..2330223841fb 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -530,9 +530,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
} else if (PageTransHuge(page)) {
index += HPAGE_PMD_NR - 1;
i += HPAGE_PMD_NR - 1;
- /* 'end' is in the middle of THP */
- if (index == round_down(end, HPAGE_PMD_NR))
+ /*
+ * 'end' is in the middle of THP. Don't
+ * invalidate the page as the part outside of
+ * 'end' could be still useful.
+ */
+ if (index > end) {
+ unlock_page(page);
continue;
+ }
}
ret = invalidate_inode_page(page);
diff --git a/mm/util.c b/mm/util.c
index 26be6407abd7..7b07ec852e01 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -83,6 +83,8 @@ EXPORT_SYMBOL(kstrdup_const);
* @s: the string to duplicate
* @max: read at most @max chars from @s
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Note: Use kmemdup_nul() instead if the size is known exactly.
*/
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
@@ -121,6 +123,28 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
EXPORT_SYMBOL(kmemdup);
/**
+ * kmemdup_nul - Create a NUL-terminated string from unterminated data
+ * @s: The data to stringify
+ * @len: The size of the data
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ */
+char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
+{
+ char *buf;
+
+ if (!s)
+ return NULL;
+
+ buf = kmalloc_track_caller(len + 1, gfp);
+ if (buf) {
+ memcpy(buf, s, len);
+ buf[len] = '\0';
+ }
+ return buf;
+}
+EXPORT_SYMBOL(kmemdup_nul);
+
+/**
* memdup_user - duplicate memory region from user space
*
* @src: source address in user space
@@ -339,9 +363,9 @@ EXPORT_SYMBOL(vm_mmap);
* Uses kmalloc to get the memory but if the allocation fails then falls back
* to the vmalloc allocator. Use kvfree for freeing the memory.
*
- * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. __GFP_REPEAT
- * is supported only for large (>32kB) allocations, and it should be used only if
- * kmalloc is preferable to the vmalloc fallback, due to visible performance drawbacks.
+ * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
+ * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
+ * preferable to the vmalloc fallback, due to visible performance drawbacks.
*
* Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people.
*/
@@ -366,13 +390,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
if (size > PAGE_SIZE) {
kmalloc_flags |= __GFP_NOWARN;
- /*
- * We have to override __GFP_REPEAT by __GFP_NORETRY for !costly
- * requests because there is no other way to tell the allocator
- * that we want to fail rather than retry endlessly.
- */
- if (!(kmalloc_flags & __GFP_REPEAT) ||
- (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
+ if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
kmalloc_flags |= __GFP_NORETRY;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6211a807cb31..8698c1c86c4d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -325,6 +325,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
/*** Global kva allocator ***/
+#define VM_LAZY_FREE 0x02
#define VM_VM_AREA 0x04
static DEFINE_SPINLOCK(vmap_area_lock);
@@ -1497,6 +1498,7 @@ struct vm_struct *remove_vm_area(const void *addr)
spin_lock(&vmap_area_lock);
va->vm = NULL;
va->flags &= ~VM_VM_AREA;
+ va->flags |= VM_LAZY_FREE;
spin_unlock(&vmap_area_lock);
vmap_debug_free_range(va->va_start, va->va_end);
@@ -1793,7 +1795,7 @@ fail:
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
*
- * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_REPEAT
+ * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
* and __GFP_NOFAIL are not supported
*
* Any use of gfp flags outside of GFP_KERNEL should be consulted
@@ -2704,8 +2706,14 @@ static int s_show(struct seq_file *m, void *p)
* s_show can encounter race with remove_vm_area, !VM_VM_AREA on
* behalf of vmap area is being tear down or vm_map_ram allocation.
*/
- if (!(va->flags & VM_VM_AREA))
+ if (!(va->flags & VM_VM_AREA)) {
+ seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va->va_end - va->va_start,
+ va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram");
+
return 0;
+ }
v = va->vm;
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index ce0618bfa8d0..85350ce2d25d 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -93,12 +93,25 @@ enum vmpressure_levels {
VMPRESSURE_NUM_LEVELS,
};
+enum vmpressure_modes {
+ VMPRESSURE_NO_PASSTHROUGH = 0,
+ VMPRESSURE_HIERARCHY,
+ VMPRESSURE_LOCAL,
+ VMPRESSURE_NUM_MODES,
+};
+
static const char * const vmpressure_str_levels[] = {
[VMPRESSURE_LOW] = "low",
[VMPRESSURE_MEDIUM] = "medium",
[VMPRESSURE_CRITICAL] = "critical",
};
+static const char * const vmpressure_str_modes[] = {
+ [VMPRESSURE_NO_PASSTHROUGH] = "default",
+ [VMPRESSURE_HIERARCHY] = "hierarchy",
+ [VMPRESSURE_LOCAL] = "local",
+};
+
static enum vmpressure_levels vmpressure_level(unsigned long pressure)
{
if (pressure >= vmpressure_level_critical)
@@ -141,27 +154,31 @@ out:
struct vmpressure_event {
struct eventfd_ctx *efd;
enum vmpressure_levels level;
+ enum vmpressure_modes mode;
struct list_head node;
};
static bool vmpressure_event(struct vmpressure *vmpr,
- enum vmpressure_levels level)
+ const enum vmpressure_levels level,
+ bool ancestor, bool signalled)
{
struct vmpressure_event *ev;
- bool signalled = false;
+ bool ret = false;
mutex_lock(&vmpr->events_lock);
-
list_for_each_entry(ev, &vmpr->events, node) {
- if (level >= ev->level) {
- eventfd_signal(ev->efd, 1);
- signalled = true;
- }
+ if (ancestor && ev->mode == VMPRESSURE_LOCAL)
+ continue;
+ if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH)
+ continue;
+ if (level < ev->level)
+ continue;
+ eventfd_signal(ev->efd, 1);
+ ret = true;
}
-
mutex_unlock(&vmpr->events_lock);
- return signalled;
+ return ret;
}
static void vmpressure_work_fn(struct work_struct *work)
@@ -170,6 +187,8 @@ static void vmpressure_work_fn(struct work_struct *work)
unsigned long scanned;
unsigned long reclaimed;
enum vmpressure_levels level;
+ bool ancestor = false;
+ bool signalled = false;
spin_lock(&vmpr->sr_lock);
/*
@@ -194,12 +213,9 @@ static void vmpressure_work_fn(struct work_struct *work)
level = vmpressure_calc_level(scanned, reclaimed);
do {
- if (vmpressure_event(vmpr, level))
- break;
- /*
- * If not handled, propagate the event upward into the
- * hierarchy.
- */
+ if (vmpressure_event(vmpr, level, ancestor, signalled))
+ signalled = true;
+ ancestor = true;
} while ((vmpr = vmpressure_parent(vmpr)));
}
@@ -326,17 +342,40 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
vmpressure(gfp, memcg, true, vmpressure_win, 0);
}
+static enum vmpressure_levels str_to_level(const char *arg)
+{
+ enum vmpressure_levels level;
+
+ for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++)
+ if (!strcmp(vmpressure_str_levels[level], arg))
+ return level;
+ return -1;
+}
+
+static enum vmpressure_modes str_to_mode(const char *arg)
+{
+ enum vmpressure_modes mode;
+
+ for (mode = 0; mode < VMPRESSURE_NUM_MODES; mode++)
+ if (!strcmp(vmpressure_str_modes[mode], arg))
+ return mode;
+ return -1;
+}
+
+#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2)
+
/**
* vmpressure_register_event() - Bind vmpressure notifications to an eventfd
* @memcg: memcg that is interested in vmpressure notifications
* @eventfd: eventfd context to link notifications with
- * @args: event arguments (used to set up a pressure level threshold)
+ * @args: event arguments (pressure level threshold, optional mode)
*
* This function associates eventfd context with the vmpressure
* infrastructure, so that the notifications will be delivered to the
- * @eventfd. The @args parameter is a string that denotes pressure level
- * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
- * "critical").
+ * @eventfd. The @args parameter is a comma-delimited string that denotes a
+ * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium",
+ * or "critical") and an optional mode (one of vmpressure_str_modes, i.e.
+ * "hierarchy" or "local").
*
* To be used as memcg event method.
*/
@@ -345,28 +384,53 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
{
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
struct vmpressure_event *ev;
- int level;
+ enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
+ enum vmpressure_levels level = -1;
+ char *spec, *spec_orig;
+ char *token;
+ int ret = 0;
+
+ spec_orig = spec = kzalloc(MAX_VMPRESSURE_ARGS_LEN + 1, GFP_KERNEL);
+ if (!spec) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ strncpy(spec, args, MAX_VMPRESSURE_ARGS_LEN);
- for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
- if (!strcmp(vmpressure_str_levels[level], args))
- break;
+ /* Find required level */
+ token = strsep(&spec, ",");
+ level = str_to_level(token);
+ if (level == -1) {
+ ret = -EINVAL;
+ goto out;
}
- if (level >= VMPRESSURE_NUM_LEVELS)
- return -EINVAL;
+ /* Find optional mode */
+ token = strsep(&spec, ",");
+ if (token) {
+ mode = str_to_mode(token);
+ if (mode == -1) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
ev = kzalloc(sizeof(*ev), GFP_KERNEL);
- if (!ev)
- return -ENOMEM;
+ if (!ev) {
+ ret = -ENOMEM;
+ goto out;
+ }
ev->efd = eventfd;
ev->level = level;
+ ev->mode = mode;
mutex_lock(&vmpr->events_lock);
list_add(&ev->node, &vmpr->events);
mutex_unlock(&vmpr->events_lock);
-
- return 0;
+out:
+ kfree(spec_orig);
+ return ret;
}
/**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9e95fafc026b..a1af041930a6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2228,8 +2228,17 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
}
if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
- scan_balance = SCAN_ANON;
- goto out;
+ /*
+ * Force SCAN_ANON if there are enough inactive
+ * anonymous pages on the LRU in eligible zones.
+ * Otherwise, the small LRU gets thrashed.
+ */
+ if (!inactive_list_is_low(lruvec, false, memcg, sc, false) &&
+ lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
+ >> sc->priority) {
+ scan_balance = SCAN_ANON;
+ goto out;
+ }
}
}
@@ -2497,18 +2506,18 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
return false;
/* Consider stopping depending on scan and reclaim activity */
- if (sc->gfp_mask & __GFP_REPEAT) {
+ if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) {
/*
- * For __GFP_REPEAT allocations, stop reclaiming if the
+ * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the
* full LRU list has been scanned and we are still failing
* to reclaim pages. This full LRU scan is potentially
- * expensive but a __GFP_REPEAT caller really wants to succeed
+ * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed
*/
if (!nr_reclaimed && !nr_scanned)
return false;
} else {
/*
- * For non-__GFP_REPEAT allocations which can presumably
+ * For non-__GFP_RETRY_MAYFAIL allocations which can presumably
* fail without consequence, stop if we failed to reclaim
* any pages from the last SWAP_CLUSTER_MAX number of
* pages that were scanned. This will return to the
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 744ceaeb42a0..9a4441bbeef2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1130,7 +1130,7 @@ static void frag_stop(struct seq_file *m, void *arg)
* If @assert_populated is true, only use callback for zones that are populated.
*/
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
- bool assert_populated,
+ bool assert_populated, bool nolock,
void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
struct zone *zone;
@@ -1141,9 +1141,11 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
if (assert_populated && !populated_zone(zone))
continue;
- spin_lock_irqsave(&zone->lock, flags);
+ if (!nolock)
+ spin_lock_irqsave(&zone->lock, flags);
print(m, pgdat, zone);
- spin_unlock_irqrestore(&zone->lock, flags);
+ if (!nolock)
+ spin_unlock_irqrestore(&zone->lock, flags);
}
}
#endif
@@ -1166,7 +1168,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
static int frag_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, true, frag_show_print);
+ walk_zones_in_node(m, pgdat, true, false, frag_show_print);
return 0;
}
@@ -1207,7 +1209,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
seq_printf(m, "%6d ", order);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, true, pagetypeinfo_showfree_print);
+ walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
return 0;
}
@@ -1258,7 +1260,8 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
seq_printf(m, "%12s ", migratetype_names[mtype]);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, true, pagetypeinfo_showblockcount_print);
+ walk_zones_in_node(m, pgdat, true, false,
+ pagetypeinfo_showblockcount_print);
return 0;
}
@@ -1284,7 +1287,8 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
seq_printf(m, "%12s ", migratetype_names[mtype]);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print);
+ walk_zones_in_node(m, pgdat, true, true,
+ pagetypeinfo_showmixedcount_print);
#endif /* CONFIG_PAGE_OWNER */
}
@@ -1446,7 +1450,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
static int zoneinfo_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, false, zoneinfo_show_print);
+ walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
return 0;
}
@@ -1852,7 +1856,7 @@ static int unusable_show(struct seq_file *m, void *arg)
if (!node_state(pgdat->node_id, N_MEMORY))
return 0;
- walk_zones_in_node(m, pgdat, true, unusable_show_print);
+ walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
return 0;
}
@@ -1904,7 +1908,7 @@ static int extfrag_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, true, extfrag_show_print);
+ walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
return 0;
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index d41edd28298b..013eea76685e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -116,6 +116,11 @@
#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
+#define FULLNESS_BITS 2
+#define CLASS_BITS 8
+#define ISOLATED_BITS 3
+#define MAGIC_VAL_BITS 8
+
#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
@@ -137,6 +142,8 @@
* (reason above)
*/
#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS)
+#define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \
+ ZS_SIZE_CLASS_DELTA) + 1)
enum fullness_group {
ZS_EMPTY,
@@ -169,11 +176,6 @@ static struct vfsmount *zsmalloc_mnt;
#endif
/*
- * number of size_classes
- */
-static int zs_size_classes;
-
-/*
* We assign a page to ZS_ALMOST_EMPTY fullness group when:
* n <= N / f, where
* n = number of allocated objects
@@ -244,7 +246,7 @@ struct link_free {
struct zs_pool {
const char *name;
- struct size_class **size_class;
+ struct size_class *size_class[ZS_SIZE_CLASSES];
struct kmem_cache *handle_cachep;
struct kmem_cache *zspage_cachep;
@@ -268,11 +270,6 @@ struct zs_pool {
#endif
};
-#define FULLNESS_BITS 2
-#define CLASS_BITS 8
-#define ISOLATED_BITS 3
-#define MAGIC_VAL_BITS 8
-
struct zspage {
struct {
unsigned int fullness:FULLNESS_BITS;
@@ -469,7 +466,7 @@ static bool is_zspage_isolated(struct zspage *zspage)
return zspage->isolated;
}
-static int is_first_page(struct page *page)
+static __maybe_unused int is_first_page(struct page *page)
{
return PagePrivate(page);
}
@@ -551,7 +548,7 @@ static int get_size_class_index(int size)
idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
ZS_SIZE_CLASS_DELTA);
- return min(zs_size_classes - 1, idx);
+ return min_t(int, ZS_SIZE_CLASSES - 1, idx);
}
static inline void zs_stat_inc(struct size_class *class,
@@ -610,7 +607,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
"obj_allocated", "obj_used", "pages_used",
"pages_per_zspage", "freeable");
- for (i = 0; i < zs_size_classes; i++) {
+ for (i = 0; i < ZS_SIZE_CLASSES; i++) {
class = pool->size_class[i];
if (class->index != i)
@@ -1294,17 +1291,6 @@ static int zs_cpu_dead(unsigned int cpu)
return 0;
}
-static void __init init_zs_size_classes(void)
-{
- int nr;
-
- nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
- if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
- nr += 1;
-
- zs_size_classes = nr;
-}
-
static bool can_merge(struct size_class *prev, int pages_per_zspage,
int objs_per_zspage)
{
@@ -2145,7 +2131,7 @@ static void async_free_zspage(struct work_struct *work)
struct zs_pool *pool = container_of(work, struct zs_pool,
free_work);
- for (i = 0; i < zs_size_classes; i++) {
+ for (i = 0; i < ZS_SIZE_CLASSES; i++) {
class = pool->size_class[i];
if (class->index != i)
continue;
@@ -2263,7 +2249,7 @@ unsigned long zs_compact(struct zs_pool *pool)
int i;
struct size_class *class;
- for (i = zs_size_classes - 1; i >= 0; i--) {
+ for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
class = pool->size_class[i];
if (!class)
continue;
@@ -2309,7 +2295,7 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker,
struct zs_pool *pool = container_of(shrinker, struct zs_pool,
shrinker);
- for (i = zs_size_classes - 1; i >= 0; i--) {
+ for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
class = pool->size_class[i];
if (!class)
continue;
@@ -2361,12 +2347,6 @@ struct zs_pool *zs_create_pool(const char *name)
return NULL;
init_deferred_free(pool);
- pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
- GFP_KERNEL);
- if (!pool->size_class) {
- kfree(pool);
- return NULL;
- }
pool->name = kstrdup(name, GFP_KERNEL);
if (!pool->name)
@@ -2379,7 +2359,7 @@ struct zs_pool *zs_create_pool(const char *name)
* Iterate reversely, because, size of size_class that we want to use
* for merging should be larger or equal to current size.
*/
- for (i = zs_size_classes - 1; i >= 0; i--) {
+ for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
int size;
int pages_per_zspage;
int objs_per_zspage;
@@ -2453,7 +2433,7 @@ void zs_destroy_pool(struct zs_pool *pool)
zs_unregister_migration(pool);
zs_pool_stat_destroy(pool);
- for (i = 0; i < zs_size_classes; i++) {
+ for (i = 0; i < ZS_SIZE_CLASSES; i++) {
int fg;
struct size_class *class = pool->size_class[i];
@@ -2492,8 +2472,6 @@ static int __init zs_init(void)
if (ret)
goto hp_setup_fail;
- init_zs_size_classes();
-
#ifdef CONFIG_ZPOOL
zpool_register_driver(&zs_zpool_driver);
#endif