diff options
author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2007-10-12 21:27:47 -0400 |
---|---|---|
committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2007-10-12 21:27:47 -0400 |
commit | b981d8b3f5e008ff10d993be633ad00564fc22cd (patch) | |
tree | e292dc07b22308912cf6a58354a608b9e5e8e1fd /mm | |
parent | b11d2127c4893a7315d1e16273bc8560049fa3ca (diff) | |
parent | 2b9e0aae1d50e880c58d46788e5e3ebd89d75d62 (diff) | |
download | linux-b981d8b3f5e008ff10d993be633ad00564fc22cd.tar.gz linux-b981d8b3f5e008ff10d993be633ad00564fc22cd.tar.bz2 linux-b981d8b3f5e008ff10d993be633ad00564fc22cd.zip |
Merge master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts:
drivers/macintosh/adbhid.c
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 5 | ||||
-rw-r--r-- | mm/bounce.c | 29 | ||||
-rw-r--r-- | mm/filemap.c | 350 | ||||
-rw-r--r-- | mm/filemap_xip.c | 41 | ||||
-rw-r--r-- | mm/fremap.c | 179 | ||||
-rw-r--r-- | mm/hugetlb.c | 51 | ||||
-rw-r--r-- | mm/memory.c | 335 | ||||
-rw-r--r-- | mm/mempolicy.c | 90 | ||||
-rw-r--r-- | mm/migrate.c | 31 | ||||
-rw-r--r-- | mm/mmap.c | 109 | ||||
-rw-r--r-- | mm/mprotect.c | 2 | ||||
-rw-r--r-- | mm/mremap.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 52 | ||||
-rw-r--r-- | mm/oom_kill.c | 3 | ||||
-rw-r--r-- | mm/page-writeback.c | 26 | ||||
-rw-r--r-- | mm/page_alloc.c | 40 | ||||
-rw-r--r-- | mm/page_io.c | 12 | ||||
-rw-r--r-- | mm/readahead.c | 517 | ||||
-rw-r--r-- | mm/rmap.c | 6 | ||||
-rw-r--r-- | mm/shmem.c | 86 | ||||
-rw-r--r-- | mm/slab.c | 39 | ||||
-rw-r--r-- | mm/slob.c | 24 | ||||
-rw-r--r-- | mm/slub.c | 138 | ||||
-rw-r--r-- | mm/sparse.c | 16 | ||||
-rw-r--r-- | mm/swapfile.c | 6 | ||||
-rw-r--r-- | mm/truncate.c | 15 | ||||
-rw-r--r-- | mm/util.c | 26 | ||||
-rw-r--r-- | mm/vmalloc.c | 59 | ||||
-rw-r--r-- | mm/vmscan.c | 69 | ||||
-rw-r--r-- | mm/vmstat.c | 1 |
30 files changed, 1209 insertions, 1150 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 86187221e78f..a7609cbcb00d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -116,11 +116,11 @@ config SPARSEMEM_EXTREME config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA - depends on HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG + depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG depends on (IA64 || X86 || PPC64 || SUPERH) comment "Memory hotplug is currently incompatible with Software Suspend" - depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND + depends on SPARSEMEM && HOTPLUG && HIBERNATION config MEMORY_HOTPLUG_SPARSE def_bool y @@ -137,6 +137,7 @@ config SPLIT_PTLOCK_CPUS int default "4096" if ARM && !CPU_CACHE_VIPT default "4096" if PARISC && !PA20 + default "4096" if XEN default "4" # diff --git a/mm/bounce.c b/mm/bounce.c index ad401fc57440..3b549bf31f7d 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -140,26 +140,19 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) mempool_free(bvec->bv_page, pool); } - bio_endio(bio_orig, bio_orig->bi_size, err); + bio_endio(bio_orig, err); bio_put(bio); } -static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err) +static void bounce_end_io_write(struct bio *bio, int err) { - if (bio->bi_size) - return 1; - bounce_end_io(bio, page_pool, err); - return 0; } -static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err) +static void bounce_end_io_write_isa(struct bio *bio, int err) { - if (bio->bi_size) - return 1; bounce_end_io(bio, isa_page_pool, err); - return 0; } static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) @@ -172,25 +165,17 @@ static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) bounce_end_io(bio, pool, err); } -static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err) +static void bounce_end_io_read(struct bio *bio, int err) { - if (bio->bi_size) - return 1; - __bounce_end_io_read(bio, page_pool, err); - return 0; } -static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err) +static void bounce_end_io_read_isa(struct bio *bio, int err) { - if (bio->bi_size) - return 1; - __bounce_end_io_read(bio, isa_page_pool, err); - return 0; } -static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, +static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, mempool_t *pool) { struct page *page; @@ -275,7 +260,7 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, *bio_orig = bio; } -void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) +void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) { mempool_t *pool; diff --git a/mm/filemap.c b/mm/filemap.c index 5d5449f3d41c..15c8413ee929 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -891,15 +891,20 @@ void do_generic_mapping_read(struct address_space *mapping, unsigned long nr, ret; cond_resched(); - if (index == next_index) - next_index = page_cache_readahead(mapping, &ra, filp, - index, last_index - index); - find_page: page = find_get_page(mapping, index); - if (unlikely(page == NULL)) { - handle_ra_miss(mapping, &ra, index); - goto no_cached_page; + if (!page) { + page_cache_sync_readahead(mapping, + &ra, filp, + index, last_index - index); + page = find_get_page(mapping, index); + if (unlikely(page == NULL)) + goto no_cached_page; + } + if (PageReadahead(page)) { + page_cache_async_readahead(mapping, + &ra, filp, page, + index, last_index - index); } if (!PageUptodate(page)) goto page_not_up_to_date; @@ -1051,6 +1056,7 @@ no_cached_page: out: *_ra = ra; + _ra->prev_index = prev_index; *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; if (cached_page) @@ -1212,26 +1218,6 @@ out: } EXPORT_SYMBOL(generic_file_aio_read); -int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) -{ - ssize_t written; - unsigned long count = desc->count; - struct file *file = desc->arg.data; - - if (size > count) - size = count; - - written = file->f_op->sendpage(file, page, offset, - size, &file->f_pos, size<count); - if (written < 0) { - desc->error = written; - written = 0; - } - desc->count = count - written; - desc->written += written; - return written; -} - static ssize_t do_readahead(struct address_space *mapping, struct file *filp, unsigned long index, unsigned long nr) @@ -1301,62 +1287,62 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset) #define MMAP_LOTSAMISS (100) /** - * filemap_nopage - read in file data for page fault handling - * @area: the applicable vm_area - * @address: target address to read in - * @type: returned with VM_FAULT_{MINOR,MAJOR} if not %NULL + * filemap_fault - read in file data for page fault handling + * @vma: vma in which the fault was taken + * @vmf: struct vm_fault containing details of the fault * - * filemap_nopage() is invoked via the vma operations vector for a + * filemap_fault() is invoked via the vma operations vector for a * mapped memory region to read in file data during a page fault. * * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. */ -struct page *filemap_nopage(struct vm_area_struct *area, - unsigned long address, int *type) +int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int error; - struct file *file = area->vm_file; + struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; struct page *page; - unsigned long size, pgoff; - int did_readaround = 0, majmin = VM_FAULT_MINOR; - - pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + unsigned long size; + int did_readaround = 0; + int ret = 0; -retry_all: size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (pgoff >= size) + if (vmf->pgoff >= size) goto outside_data_content; /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(area)) + if (VM_RandomReadHint(vma)) goto no_cached_page; /* - * The readahead code wants to be told about each and every page - * so it can build and shrink its windows appropriately - * - * For sequential accesses, we use the generic readahead logic. - */ - if (VM_SequentialReadHint(area)) - page_cache_readahead(mapping, ra, file, pgoff, 1); - - /* * Do we have something in the page cache already? */ retry_find: - page = find_get_page(mapping, pgoff); + page = find_lock_page(mapping, vmf->pgoff); + /* + * For sequential accesses, we use the generic readahead logic. + */ + if (VM_SequentialReadHint(vma)) { + if (!page) { + page_cache_sync_readahead(mapping, ra, file, + vmf->pgoff, 1); + page = find_lock_page(mapping, vmf->pgoff); + if (!page) + goto no_cached_page; + } + if (PageReadahead(page)) { + page_cache_async_readahead(mapping, ra, file, page, + vmf->pgoff, 1); + } + } + if (!page) { unsigned long ra_pages; - if (VM_SequentialReadHint(area)) { - handle_ra_miss(mapping, ra, pgoff); - goto no_cached_page; - } ra->mmap_miss++; /* @@ -1371,7 +1357,7 @@ retry_find: * check did_readaround, as this is an inner loop. */ if (!did_readaround) { - majmin = VM_FAULT_MAJOR; + ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); } did_readaround = 1; @@ -1379,11 +1365,11 @@ retry_find: if (ra_pages) { pgoff_t start = 0; - if (pgoff > ra_pages / 2) - start = pgoff - ra_pages / 2; + if (vmf->pgoff > ra_pages / 2) + start = vmf->pgoff - ra_pages / 2; do_page_cache_readahead(mapping, file, start, ra_pages); } - page = find_get_page(mapping, pgoff); + page = find_lock_page(mapping, vmf->pgoff); if (!page) goto no_cached_page; } @@ -1392,35 +1378,43 @@ retry_find: ra->mmap_hit++; /* - * Ok, found a page in the page cache, now we need to check - * that it's up-to-date. + * We have a locked page in the page cache, now we need to check + * that it's up-to-date. If not, it is going to be due to an error. */ - if (!PageUptodate(page)) + if (unlikely(!PageUptodate(page))) goto page_not_uptodate; -success: + /* Must recheck i_size under page lock */ + size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + unlock_page(page); + page_cache_release(page); + goto outside_data_content; + } + /* * Found the page and have a reference on it. */ mark_page_accessed(page); - if (type) - *type = majmin; - return page; + ra->prev_index = page->index; + vmf->page = page; + return ret | VM_FAULT_LOCKED; outside_data_content: /* * An external ptracer can access pages that normally aren't * accessible.. */ - if (area->vm_mm == current->mm) - return NOPAGE_SIGBUS; + if (vma->vm_mm == current->mm) + return VM_FAULT_SIGBUS; + /* Fall through to the non-read-ahead case */ no_cached_page: /* * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, pgoff); + error = page_cache_read(file, vmf->pgoff); /* * The page we want has now been added to the page cache. @@ -1436,12 +1430,13 @@ no_cached_page: * to schedule I/O. */ if (error == -ENOMEM) - return NOPAGE_OOM; - return NOPAGE_SIGBUS; + return VM_FAULT_OOM; + return VM_FAULT_SIGBUS; page_not_uptodate: + /* IO error path */ if (!did_readaround) { - majmin = VM_FAULT_MAJOR; + ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); } @@ -1451,217 +1446,21 @@ page_not_uptodate: * because there really aren't any performance issues here * and we need to check for errors. */ - lock_page(page); - - /* Somebody truncated the page on us? */ - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - goto retry_all; - } - - /* Somebody else successfully read it in? */ - if (PageUptodate(page)) { - unlock_page(page); - goto success; - } ClearPageError(page); error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (PageUptodate(page)) - goto success; - } else if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry_find; - } - - /* - * Things didn't work out. Return zero to tell the - * mm layer so, possibly freeing the page cache page first. - */ - shrink_readahead_size_eio(file, ra); page_cache_release(page); - return NOPAGE_SIGBUS; -} -EXPORT_SYMBOL(filemap_nopage); -static struct page * filemap_getpage(struct file *file, unsigned long pgoff, - int nonblock) -{ - struct address_space *mapping = file->f_mapping; - struct page *page; - int error; - - /* - * Do we have something in the page cache already? - */ -retry_find: - page = find_get_page(mapping, pgoff); - if (!page) { - if (nonblock) - return NULL; - goto no_cached_page; - } - - /* - * Ok, found a page in the page cache, now we need to check - * that it's up-to-date. - */ - if (!PageUptodate(page)) { - if (nonblock) { - page_cache_release(page); - return NULL; - } - goto page_not_uptodate; - } - -success: - /* - * Found the page and have a reference on it. - */ - mark_page_accessed(page); - return page; - -no_cached_page: - error = page_cache_read(file, pgoff); - - /* - * The page we want has now been added to the page cache. - * In the unlikely event that someone removed it in the - * meantime, we'll just come back here and read it again. - */ - if (error >= 0) - goto retry_find; - - /* - * An error return from page_cache_read can result if the - * system is low on memory, or a problem occurs while trying - * to schedule I/O. - */ - return NULL; - -page_not_uptodate: - lock_page(page); - - /* Did it get truncated while we waited for it? */ - if (!page->mapping) { - unlock_page(page); - goto err; - } - - /* Did somebody else get it up-to-date? */ - if (PageUptodate(page)) { - unlock_page(page); - goto success; - } - - error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (PageUptodate(page)) - goto success; - } else if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry_find; - } - - /* - * Umm, take care of errors if the page isn't up-to-date. - * Try to re-read it _once_. We do this synchronously, - * because there really aren't any performance issues here - * and we need to check for errors. - */ - lock_page(page); - - /* Somebody truncated the page on us? */ - if (!page->mapping) { - unlock_page(page); - goto err; - } - /* Somebody else successfully read it in? */ - if (PageUptodate(page)) { - unlock_page(page); - goto success; - } - - ClearPageError(page); - error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (PageUptodate(page)) - goto success; - } else if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); + if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; - } - /* - * Things didn't work out. Return zero to tell the - * mm layer so, possibly freeing the page cache page first. - */ -err: - page_cache_release(page); - - return NULL; -} - -int filemap_populate(struct vm_area_struct *vma, unsigned long addr, - unsigned long len, pgprot_t prot, unsigned long pgoff, - int nonblock) -{ - struct file *file = vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - unsigned long size; - struct mm_struct *mm = vma->vm_mm; - struct page *page; - int err; - - if (!nonblock) - force_page_cache_readahead(mapping, vma->vm_file, - pgoff, len >> PAGE_CACHE_SHIFT); - -repeat: - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (pgoff + (len >> PAGE_CACHE_SHIFT) > size) - return -EINVAL; - - page = filemap_getpage(file, pgoff, nonblock); - - /* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as - * done in shmem_populate calling shmem_getpage */ - if (!page && !nonblock) - return -ENOMEM; - - if (page) { - err = install_page(mm, vma, addr, page, prot); - if (err) { - page_cache_release(page); - return err; - } - } else if (vma->vm_flags & VM_NONLINEAR) { - /* No page was found just because we can't read it in now (being - * here implies nonblock != 0), but the page may exist, so set - * the PTE to fault it in later. */ - err = install_file_pte(mm, vma, addr, pgoff, prot); - if (err) - return err; - } - - len -= PAGE_SIZE; - addr += PAGE_SIZE; - pgoff++; - if (len) - goto repeat; - - return 0; + /* Things didn't work out. Return zero to tell the mm layer so. */ + shrink_readahead_size_eio(file, ra); + return VM_FAULT_SIGBUS; } -EXPORT_SYMBOL(filemap_populate); +EXPORT_SYMBOL(filemap_fault); struct vm_operations_struct generic_file_vm_ops = { - .nopage = filemap_nopage, - .populate = filemap_populate, + .fault = filemap_fault, }; /* This is used for a general mmap of a disk file */ @@ -1674,6 +1473,7 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 65ffc321f0c0..53ee6a299635 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -205,62 +205,58 @@ __xip_unmap (struct address_space * mapping, } /* - * xip_nopage() is invoked via the vma operations vector for a + * xip_fault() is invoked via the vma operations vector for a * mapped memory region to read in file data during a page fault. * - * This function is derived from filemap_nopage, but used for execute in place + * This function is derived from filemap_fault, but used for execute in place */ -static struct page * -xip_file_nopage(struct vm_area_struct * area, - unsigned long address, - int *type) +static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf) { struct file *file = area->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct page *page; - unsigned long size, pgoff, endoff; + pgoff_t size; - pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) - + area->vm_pgoff; - endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) - + area->vm_pgoff; + /* XXX: are VM_FAULT_ codes OK? */ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (pgoff >= size) - return NOPAGE_SIGBUS; + if (vmf->pgoff >= size) + return VM_FAULT_SIGBUS; - page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0); + page = mapping->a_ops->get_xip_page(mapping, + vmf->pgoff*(PAGE_SIZE/512), 0); if (!IS_ERR(page)) goto out; if (PTR_ERR(page) != -ENODATA) - return NOPAGE_SIGBUS; + return VM_FAULT_OOM; /* sparse block */ if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) && (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) && (!(mapping->host->i_sb->s_flags & MS_RDONLY))) { /* maybe shared writable, allocate new block */ - page = mapping->a_ops->get_xip_page (mapping, - pgoff*(PAGE_SIZE/512), 1); + page = mapping->a_ops->get_xip_page(mapping, + vmf->pgoff*(PAGE_SIZE/512), 1); if (IS_ERR(page)) - return NOPAGE_SIGBUS; + return VM_FAULT_SIGBUS; /* unmap page at pgoff from all other vmas */ - __xip_unmap(mapping, pgoff); + __xip_unmap(mapping, vmf->pgoff); } else { /* not shared and writable, use xip_sparse_page() */ page = xip_sparse_page(); if (!page) - return NOPAGE_OOM; + return VM_FAULT_OOM; } out: page_cache_get(page); - return page; + vmf->page = page; + return 0; } static struct vm_operations_struct xip_file_vm_ops = { - .nopage = xip_file_nopage, + .fault = xip_file_fault, }; int xip_file_mmap(struct file * file, struct vm_area_struct * vma) @@ -269,6 +265,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma) file_accessed(file); vma->vm_ops = &xip_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } EXPORT_SYMBOL_GPL(xip_file_mmap); diff --git a/mm/fremap.c b/mm/fremap.c index 4e3f53dd5fd4..95bcb5641c72 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -20,13 +20,14 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> -static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, +static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; - struct page *page = NULL; if (pte_present(pte)) { + struct page *page; + flush_cache_page(vma, addr, pte_pfn(pte)); pte = ptep_clear_flush(vma, addr, ptep); page = vm_normal_page(vma, addr, pte); @@ -35,68 +36,21 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, set_page_dirty(page); page_remove_rmap(page, vma); page_cache_release(page); + update_hiwater_rss(mm); + dec_mm_counter(mm, file_rss); } } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear_not_present_full(mm, addr, ptep, 0); } - return !!page; } /* - * Install a file page to a given virtual memory address, release any - * previously existing mapping. - */ -int install_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, struct page *page, pgprot_t prot) -{ - struct inode *inode; - pgoff_t size; - int err = -ENOMEM; - pte_t *pte; - pte_t pte_val; - spinlock_t *ptl; - - pte = get_locked_pte(mm, addr, &ptl); - if (!pte) - goto out; - - /* - * This page may have been truncated. Tell the - * caller about it. - */ - err = -EINVAL; - inode = vma->vm_file->f_mapping->host; - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (!page->mapping || page->index >= size) - goto unlock; - err = -ENOMEM; - if (page_mapcount(page) > INT_MAX/2) - goto unlock; - - if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) - inc_mm_counter(mm, file_rss); - - flush_icache_page(vma, page); - pte_val = mk_pte(page, prot); - set_pte_at(mm, addr, pte, pte_val); - page_add_file_rmap(page); - update_mmu_cache(vma, addr, pte_val); - lazy_mmu_prot_update(pte_val); - err = 0; -unlock: - pte_unmap_unlock(pte, ptl); -out: - return err; -} -EXPORT_SYMBOL(install_page); - -/* * Install a file pte to a given virtual memory address, release any * previously existing mapping. */ -int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, +static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot) { int err = -ENOMEM; @@ -107,10 +61,8 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte) goto out; - if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { - update_hiwater_rss(mm); - dec_mm_counter(mm, file_rss); - } + if (!pte_none(*pte)) + zap_pte(mm, vma, addr, pte); set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); /* @@ -126,6 +78,25 @@ out: return err; } +static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long size, pgoff_t pgoff) +{ + int err; + + do { + err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot); + if (err) + return err; + + size -= PAGE_SIZE; + addr += PAGE_SIZE; + pgoff++; + } while (size); + + return 0; + +} + /*** * sys_remap_file_pages - remap arbitrary pages of a shared backing store * file within an existing vma. @@ -183,41 +154,77 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, * the single existing vma. vm_private_data is used as a * swapout cursor in a VM_NONLINEAR vma. */ - if (vma && (vma->vm_flags & VM_SHARED) && - (!vma->vm_private_data || (vma->vm_flags & VM_NONLINEAR)) && - vma->vm_ops && vma->vm_ops->populate && - end > start && start >= vma->vm_start && - end <= vma->vm_end) { - - /* Must set VM_NONLINEAR before any pages are populated. */ - if (pgoff != linear_page_index(vma, start) && - !(vma->vm_flags & VM_NONLINEAR)) { - if (!has_write_lock) { - up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - has_write_lock = 1; - goto retry; - } - mapping = vma->vm_file->f_mapping; - spin_lock(&mapping->i_mmap_lock); - flush_dcache_mmap_lock(mapping); - vma->vm_flags |= VM_NONLINEAR; - vma_prio_tree_remove(vma, &mapping->i_mmap); - vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); - flush_dcache_mmap_unlock(mapping); - spin_unlock(&mapping->i_mmap_lock); - } + if (!vma || !(vma->vm_flags & VM_SHARED)) + goto out; - err = vma->vm_ops->populate(vma, start, size, - vma->vm_page_prot, - pgoff, flags & MAP_NONBLOCK); + if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) + goto out; + + if (!(vma->vm_flags & VM_CAN_NONLINEAR)) + goto out; + if (end <= start || start < vma->vm_start || end > vma->vm_end) + goto out; + + /* Must set VM_NONLINEAR before any pages are populated. */ + if (!(vma->vm_flags & VM_NONLINEAR)) { + /* Don't need a nonlinear mapping, exit success */ + if (pgoff == linear_page_index(vma, start)) { + err = 0; + goto out; + } + + if (!has_write_lock) { + up_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); + has_write_lock = 1; + goto retry; + } + mapping = vma->vm_file->f_mapping; /* - * We can't clear VM_NONLINEAR because we'd have to do - * it after ->populate completes, and that would prevent - * downgrading the lock. (Locks can't be upgraded). + * page_mkclean doesn't work on nonlinear vmas, so if + * dirty pages need to be accounted, emulate with linear + * vmas. */ + if (mapping_cap_account_dirty(mapping)) { + unsigned long addr; + + flags &= MAP_NONBLOCK; + addr = mmap_region(vma->vm_file, start, size, + flags, vma->vm_flags, pgoff, 1); + if (IS_ERR_VALUE(addr)) { + err = addr; + } else { + BUG_ON(addr != start); + err = 0; + } + goto out; + } + spin_lock(&mapping->i_mmap_lock); + flush_dcache_mmap_lock(mapping); + vma->vm_flags |= VM_NONLINEAR; + vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); + flush_dcache_mmap_unlock(mapping); + spin_unlock(&mapping->i_mmap_lock); + } + + err = populate_range(mm, vma, start, size, pgoff); + if (!err && !(flags & MAP_NONBLOCK)) { + if (unlikely(has_write_lock)) { + downgrade_write(&mm->mmap_sem); + has_write_lock = 0; + } + make_pages_present(start, start+size); } + + /* + * We can't clear VM_NONLINEAR because we'd have to do + * it after ->populate completes, and that would prevent + * downgrading the lock. (Locks can't be upgraded). + */ + +out: if (likely(!has_write_lock)) up_read(&mm->mmap_sem); else diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6912bbf33faa..eab8c428cc93 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -42,7 +42,7 @@ static void clear_huge_page(struct page *page, unsigned long addr) might_sleep(); for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { cond_resched(); - clear_user_highpage(page + i, addr); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); } } @@ -71,24 +71,24 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, { int nid; struct page *page = NULL; + struct mempolicy *mpol; struct zonelist *zonelist = huge_zonelist(vma, address, - htlb_alloc_mask); + htlb_alloc_mask, &mpol); struct zone **z; for (z = zonelist->zones; *z; z++) { nid = zone_to_nid(*z); if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) && - !list_empty(&hugepage_freelists[nid])) + !list_empty(&hugepage_freelists[nid])) { + page = list_entry(hugepage_freelists[nid].next, + struct page, lru); + list_del(&page->lru); + free_huge_pages--; + free_huge_pages_node[nid]--; break; + } } - - if (*z) { - page = list_entry(hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - free_huge_pages--; - free_huge_pages_node[nid]--; - } + mpol_free(mpol); /* unref if mpol !NULL */ return page; } @@ -107,15 +107,19 @@ static int alloc_fresh_huge_page(void) { static int prev_nid; struct page *page; - static DEFINE_SPINLOCK(nid_lock); int nid; - spin_lock(&nid_lock); + /* + * Copy static prev_nid to local nid, work on that, then copy it + * back to prev_nid afterwards: otherwise there's a window in which + * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node. + * But we don't need to use a spin_lock here: it really doesn't + * matter if occasionally a racer chooses the same nid as we do. + */ nid = next_node(prev_nid, node_online_map); if (nid == MAX_NUMNODES) nid = first_node(node_online_map); prev_nid = nid; - spin_unlock(&nid_lock); page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, HUGETLB_PAGE_ORDER); @@ -207,7 +211,7 @@ static void update_and_free_page(struct page *page) 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1<< PG_writeback); } - page[1].lru.next = NULL; + set_compound_page_dtor(page, NULL); set_page_refcounted(page); __free_pages(page, HUGETLB_PAGE_ORDER); } @@ -316,15 +320,14 @@ unsigned long hugetlb_total_pages(void) * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. */ -static struct page *hugetlb_nopage(struct vm_area_struct *vma, - unsigned long address, int *unused) +static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { BUG(); - return NULL; + return 0; } struct vm_operations_struct hugetlb_vm_ops = { - .nopage = hugetlb_nopage, + .fault = hugetlb_vm_op_fault, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, @@ -470,7 +473,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, avoidcopy = (page_count(old_page) == 1); if (avoidcopy) { set_huge_ptep_writable(vma, address, ptep); - return VM_FAULT_MINOR; + return 0; } page_cache_get(old_page); @@ -495,7 +498,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, } page_cache_release(new_page); page_cache_release(old_page); - return VM_FAULT_MINOR; + return 0; } static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, @@ -552,7 +555,7 @@ retry: if (idx >= size) goto backout; - ret = VM_FAULT_MINOR; + ret = 0; if (!pte_none(*ptep)) goto backout; @@ -603,7 +606,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } - ret = VM_FAULT_MINOR; + ret = 0; spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ @@ -642,7 +645,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); ret = hugetlb_fault(mm, vma, vaddr, 0); spin_lock(&mm->page_table_lock); - if (ret == VM_FAULT_MINOR) + if (!(ret & VM_FAULT_ERROR)) continue; remainder = 0; diff --git a/mm/memory.c b/mm/memory.c index 9c6ff7fffdc8..f82b359b2745 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1047,7 +1047,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (pages) foll_flags |= FOLL_GET; if (!write && !(vma->vm_flags & VM_LOCKED) && - (!vma->vm_ops || !vma->vm_ops->nopage)) + (!vma->vm_ops || (!vma->vm_ops->nopage && + !vma->vm_ops->fault))) foll_flags |= FOLL_ANON; do { @@ -1067,31 +1068,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; - ret = __handle_mm_fault(mm, vma, start, + ret = handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return i ? i : -ENOMEM; + else if (ret & VM_FAULT_SIGBUS) + return i ? i : -EFAULT; + BUG(); + } + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + /* - * The VM_FAULT_WRITE bit tells us that do_wp_page has - * broken COW when necessary, even if maybe_mkwrite - * decided not to set pte_write. We can thus safely do - * subsequent page lookups as if they were reads. + * The VM_FAULT_WRITE bit tells us that + * do_wp_page has broken COW when necessary, + * even if maybe_mkwrite decided not to set + * pte_write. We can thus safely do subsequent + * page lookups as if they were reads. */ if (ret & VM_FAULT_WRITE) foll_flags &= ~FOLL_WRITE; - - switch (ret & ~VM_FAULT_WRITE) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - return i ? i : -EFAULT; - case VM_FAULT_OOM: - return i ? i : -ENOMEM; - default: - BUG(); - } + cond_resched(); } if (pages) { @@ -1638,7 +1638,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *old_page, *new_page; pte_t entry; - int reuse = 0, ret = VM_FAULT_MINOR; + int reuse = 0, ret = 0; + int page_mkwrite = 0; struct page *dirty_page = NULL; old_page = vm_normal_page(vma, address, orig_pte); @@ -1687,6 +1688,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_release(old_page); if (!pte_same(*page_table, orig_pte)) goto unlock; + + page_mkwrite = 1; } dirty_page = old_page; get_page(dirty_page); @@ -1765,7 +1768,16 @@ gotten: unlock: pte_unmap_unlock(page_table, ptl); if (dirty_page) { - set_page_dirty_balance(dirty_page); + /* + * Yes, Virginia, this is actually required to prevent a race + * with clear_page_dirty_for_io() from clearing the page dirty + * bit after it clear all dirty ptes, but before a racing + * do_wp_page installs a dirty pte. + * + * do_no_page is protected similarly. + */ + wait_on_page_locked(dirty_page); + set_page_dirty_balance(dirty_page, page_mkwrite); put_page(dirty_page); } return ret; @@ -1831,6 +1843,13 @@ static int unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long restart_addr; int need_break; + /* + * files that support invalidating or truncating portions of the + * file from under mmaped areas must have their ->fault function + * return a locked page (and set VM_FAULT_LOCKED in the return). + * This provides synchronisation against concurrent unmapping here. + */ + again: restart_addr = vma->vm_truncate_count; if (is_restart_addr(restart_addr) && start_addr < restart_addr) { @@ -1959,17 +1978,8 @@ void unmap_mapping_range(struct address_space *mapping, spin_lock(&mapping->i_mmap_lock); - /* serialize i_size write against truncate_count write */ - smp_wmb(); - /* Protect against page faults, and endless unmapping loops */ + /* Protect against endless unmapping loops */ mapping->truncate_count++; - /* - * For archs where spin_lock has inclusive semantics like ia64 - * this smp_mb() will prevent to read pagetable contents - * before the truncate_count increment is visible to - * other cpus. - */ - smp_mb(); if (unlikely(is_restart_addr(mapping->truncate_count))) { if (mapping->truncate_count == 0) reset_vma_truncate_counts(mapping); @@ -2008,8 +2018,18 @@ int vmtruncate(struct inode * inode, loff_t offset) if (IS_SWAPFILE(inode)) goto out_busy; i_size_write(inode, offset); + + /* + * unmap_mapping_range is called twice, first simply for efficiency + * so that truncate_inode_pages does fewer single-page unmaps. However + * after this first call, and before truncate_inode_pages finishes, + * it is possible for private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second unmap_mapping_range + * call must be made for correctness. + */ unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); truncate_inode_pages(mapping, offset); + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); goto out_truncate; do_expand: @@ -2049,6 +2069,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) down_write(&inode->i_alloc_sem); unmap_mapping_range(mapping, offset, (end - offset), 1); truncate_inode_pages_range(mapping, offset, end); + unmap_mapping_range(mapping, offset, (end - offset), 1); inode->i_op->truncate_range(inode, offset, end); up_write(&inode->i_alloc_sem); mutex_unlock(&inode->i_mutex); @@ -2130,7 +2151,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; swp_entry_t entry; pte_t pte; - int ret = VM_FAULT_MINOR; + int ret = 0; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) goto out; @@ -2198,15 +2219,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unlock_page(page); if (write_access) { + /* XXX: We could OR the do_wp_page code with this one? */ if (do_wp_page(mm, vma, address, - page_table, pmd, ptl, pte) == VM_FAULT_OOM) + page_table, pmd, ptl, pte) & VM_FAULT_OOM) ret = VM_FAULT_OOM; goto out; } /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); - lazy_mmu_prot_update(pte); unlock: pte_unmap_unlock(page_table, ptl); out: @@ -2271,7 +2292,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, lazy_mmu_prot_update(entry); unlock: pte_unmap_unlock(page_table, ptl); - return VM_FAULT_MINOR; + return 0; release: page_cache_release(page); goto unlock; @@ -2280,102 +2301,114 @@ oom: } /* - * do_no_page() tries to create a new page mapping. It aggressively + * __do_fault() tries to create a new page mapping. It aggressively * tries to share with existing pages, but makes a separate copy if - * the "write_access" parameter is true in order to avoid the next - * page fault. + * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid + * the next page fault. * * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. * * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. + * but allow concurrent faults), and pte neither mapped nor locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access) +static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { + pte_t *page_table; spinlock_t *ptl; - struct page *new_page; - struct address_space *mapping = NULL; + struct page *page; pte_t entry; - unsigned int sequence = 0; - int ret = VM_FAULT_MINOR; int anon = 0; struct page *dirty_page = NULL; + struct vm_fault vmf; + int ret; + int page_mkwrite = 0; + + vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.pgoff = pgoff; + vmf.flags = flags; + vmf.page = NULL; - pte_unmap(page_table); BUG_ON(vma->vm_flags & VM_PFNMAP); - if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; - sequence = mapping->truncate_count; - smp_rmb(); /* serializes i_size against truncate_count */ + if (likely(vma->vm_ops->fault)) { + ret = vma->vm_ops->fault(vma, &vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + return ret; + } else { + /* Legacy ->nopage path */ + ret = 0; + vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); + /* no page was available -- either SIGBUS or OOM */ + if (unlikely(vmf.page == NOPAGE_SIGBUS)) + return VM_FAULT_SIGBUS; + else if (unlikely(vmf.page == NOPAGE_OOM)) + return VM_FAULT_OOM; } -retry: - new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); + /* - * No smp_rmb is needed here as long as there's a full - * spin_lock/unlock sequence inside the ->nopage callback - * (for the pagecache lookup) that acts as an implicit - * smp_mb() and prevents the i_size read to happen - * after the next truncate_count read. + * For consistency in subsequent calls, make the faulted page always + * locked. */ - - /* no page was available -- either SIGBUS, OOM or REFAULT */ - if (unlikely(new_page == NOPAGE_SIGBUS)) - return VM_FAULT_SIGBUS; - else if (unlikely(new_page == NOPAGE_OOM)) - return VM_FAULT_OOM; - else if (unlikely(new_page == NOPAGE_REFAULT)) - return VM_FAULT_MINOR; + if (unlikely(!(ret & VM_FAULT_LOCKED))) + lock_page(vmf.page); + else + VM_BUG_ON(!PageLocked(vmf.page)); /* * Should we do an early C-O-W break? */ - if (write_access) { + page = vmf.page; + if (flags & FAULT_FLAG_WRITE) { if (!(vma->vm_flags & VM_SHARED)) { - struct page *page; - - if (unlikely(anon_vma_prepare(vma))) - goto oom; + anon = 1; + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto out; + } page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!page) - goto oom; - copy_user_highpage(page, new_page, address, vma); - page_cache_release(new_page); - new_page = page; - anon = 1; - + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + copy_user_highpage(page, vmf.page, address, vma); } else { - /* if the page will be shareable, see if the backing + /* + * If the page will be shareable, see if the backing * address space wants to know that the page is about - * to become writable */ - if (vma->vm_ops->page_mkwrite && - vma->vm_ops->page_mkwrite(vma, new_page) < 0 - ) { - page_cache_release(new_page); - return VM_FAULT_SIGBUS; + * to become writable + */ + if (vma->vm_ops->page_mkwrite) { + unlock_page(page); + if (vma->vm_ops->page_mkwrite(vma, page) < 0) { + ret = VM_FAULT_SIGBUS; + anon = 1; /* no anon but release vmf.page */ + goto out_unlocked; + } + lock_page(page); + /* + * XXX: this is not quite right (racy vs + * invalidate) to unlock and relock the page + * like this, however a better fix requires + * reworking page_mkwrite locking API, which + * is better done later. + */ + if (!page->mapping) { + ret = 0; + anon = 1; /* no anon but release vmf.page */ + goto out; + } + page_mkwrite = 1; } } + } page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - /* - * For a file-backed vma, someone could have truncated or otherwise - * invalidated this page. If unmap_mapping_range got called, - * retry getting the page. - */ - if (mapping && unlikely(sequence != mapping->truncate_count)) { - pte_unmap_unlock(page_table, ptl); - page_cache_release(new_page); - cond_resched(); - sequence = mapping->truncate_count; - smp_rmb(); - goto retry; - } /* * This silly early PAGE_DIRTY setting removes a race @@ -2388,45 +2421,63 @@ retry: * handle that later. */ /* Only go through if we didn't race with anybody else... */ - if (pte_none(*page_table)) { - flush_icache_page(vma, new_page); - entry = mk_pte(new_page, vma->vm_page_prot); - if (write_access) + if (likely(pte_same(*page_table, orig_pte))) { + flush_icache_page(vma, page); + entry = mk_pte(page, vma->vm_page_prot); + if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); set_pte_at(mm, address, page_table, entry); if (anon) { - inc_mm_counter(mm, anon_rss); - lru_cache_add_active(new_page); - page_add_new_anon_rmap(new_page, vma, address); + inc_mm_counter(mm, anon_rss); + lru_cache_add_active(page); + page_add_new_anon_rmap(page, vma, address); } else { inc_mm_counter(mm, file_rss); - page_add_file_rmap(new_page); - if (write_access) { - dirty_page = new_page; + page_add_file_rmap(page); + if (flags & FAULT_FLAG_WRITE) { + dirty_page = page; get_page(dirty_page); } } + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); } else { - /* One of our sibling threads was faster, back out. */ - page_cache_release(new_page); - goto unlock; + if (anon) + page_cache_release(page); + else + anon = 1; /* no anon but release faulted_page */ } - /* no need to invalidate: a not-present page shouldn't be cached */ - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); -unlock: pte_unmap_unlock(page_table, ptl); - if (dirty_page) { - set_page_dirty_balance(dirty_page); + +out: + unlock_page(vmf.page); +out_unlocked: + if (anon) + page_cache_release(vmf.page); + else if (dirty_page) { + set_page_dirty_balance(dirty_page, page_mkwrite); put_page(dirty_page); } + return ret; -oom: - page_cache_release(new_page); - return VM_FAULT_OOM; } +static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access, pte_t orig_pte) +{ + pgoff_t pgoff = (((address & PAGE_MASK) + - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff; + unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); + + pte_unmap(page_table); + return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); +} + + /* * do_no_pfn() tries to create a new page mapping for a page without * a struct_page backing it @@ -2450,7 +2501,6 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; pte_t entry; unsigned long pfn; - int ret = VM_FAULT_MINOR; pte_unmap(page_table); BUG_ON(!(vma->vm_flags & VM_PFNMAP)); @@ -2462,7 +2512,7 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, else if (unlikely(pfn == NOPFN_SIGBUS)) return VM_FAULT_SIGBUS; else if (unlikely(pfn == NOPFN_REFAULT)) - return VM_FAULT_MINOR; + return 0; page_table = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -2474,7 +2524,7 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, address, page_table, entry); } pte_unmap_unlock(page_table, ptl); - return ret; + return 0; } /* @@ -2486,33 +2536,28 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, +static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, int write_access, pte_t orig_pte) { + unsigned int flags = FAULT_FLAG_NONLINEAR | + (write_access ? FAULT_FLAG_WRITE : 0); pgoff_t pgoff; - int err; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) - return VM_FAULT_MINOR; + return 0; - if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { + if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || + !(vma->vm_flags & VM_CAN_NONLINEAR))) { /* * Page table corrupted: show pte and kill process. */ print_bad_pte(vma, orig_pte, address); return VM_FAULT_OOM; } - /* We can then assume vm->vm_ops && vma->vm_ops->populate */ pgoff = pte_to_pgoff(orig_pte); - err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, - vma->vm_page_prot, pgoff, 0); - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err) - return VM_FAULT_SIGBUS; - return VM_FAULT_MAJOR; + return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } /* @@ -2539,10 +2584,9 @@ static inline int handle_pte_fault(struct mm_struct *mm, if (!pte_present(entry)) { if (pte_none(entry)) { if (vma->vm_ops) { - if (vma->vm_ops->nopage) - return do_no_page(mm, vma, address, - pte, pmd, - write_access); + if (vma->vm_ops->fault || vma->vm_ops->nopage) + return do_linear_fault(mm, vma, address, + pte, pmd, write_access, entry); if (unlikely(vma->vm_ops->nopfn)) return do_no_pfn(mm, vma, address, pte, pmd, write_access); @@ -2551,7 +2595,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, pte, pmd, write_access); } if (pte_file(entry)) - return do_file_page(mm, vma, address, + return do_nonlinear_fault(mm, vma, address, pte, pmd, write_access, entry); return do_swap_page(mm, vma, address, pte, pmd, write_access, entry); @@ -2583,13 +2627,13 @@ static inline int handle_pte_fault(struct mm_struct *mm, } unlock: pte_unmap_unlock(pte, ptl); - return VM_FAULT_MINOR; + return 0; } /* * By the time we get here, we already hold the mm semaphore */ -int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access) { pgd_t *pgd; @@ -2618,8 +2662,6 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, return handle_pte_fault(mm, vma, address, pte, pmd, write_access); } -EXPORT_SYMBOL_GPL(__handle_mm_fault); - #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. @@ -2824,3 +2866,4 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in return buf - old_buf; } +EXPORT_SYMBOL_GPL(access_process_vm); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9f4e9b95e8f2..3d6ac9505d07 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -149,7 +149,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) lower zones etc. Avoid empty zones because the memory allocator doesn't like them. If you implement node hot removal you have to fix that. */ - k = policy_zone; + k = MAX_NR_ZONES - 1; while (1) { for_each_node_mask(nd, *nodes) { struct zone *z = &NODE_DATA(nd)->node_zones[k]; @@ -955,6 +955,11 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, goto out; } + if (!nodes_subset(new, node_online_map)) { + err = -EINVAL; + goto out; + } + err = security_task_movememory(task); if (err) goto out; @@ -1072,21 +1077,37 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, #endif -/* Return effective policy for a VMA */ +/* + * get_vma_policy(@task, @vma, @addr) + * @task - task for fallback if vma policy == default + * @vma - virtual memory area whose policy is sought + * @addr - address in @vma for shared policy lookup + * + * Returns effective policy for a VMA at specified address. + * Falls back to @task or system default policy, as necessary. + * Returned policy has extra reference count if shared, vma, + * or some other task's policy [show_numa_maps() can pass + * @task != current]. It is the caller's responsibility to + * free the reference in these cases. + */ static struct mempolicy * get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = task->mempolicy; + int shared_pol = 0; if (vma) { - if (vma->vm_ops && vma->vm_ops->get_policy) + if (vma->vm_ops && vma->vm_ops->get_policy) { pol = vma->vm_ops->get_policy(vma, addr); - else if (vma->vm_policy && + shared_pol = 1; /* if pol non-NULL, add ref below */ + } else if (vma->vm_policy && vma->vm_policy->policy != MPOL_DEFAULT) pol = vma->vm_policy; } if (!pol) pol = &default_policy; + else if (!shared_pol && pol != current->mempolicy) + mpol_get(pol); /* vma or other task's policy */ return pol; } @@ -1202,19 +1223,45 @@ static inline unsigned interleave_nid(struct mempolicy *pol, } #ifdef CONFIG_HUGETLBFS -/* Return a zonelist suitable for a huge page allocation. */ +/* + * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) + * @vma = virtual memory area whose policy is sought + * @addr = address in @vma for shared policy lookup and interleave policy + * @gfp_flags = for requested zone + * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy + * + * Returns a zonelist suitable for a huge page allocation. + * If the effective policy is 'BIND, returns pointer to policy's zonelist. + * If it is also a policy for which get_vma_policy() returns an extra + * reference, we must hold that reference until after allocation. + * In that case, return policy via @mpol so hugetlb allocation can drop + * the reference. For non-'BIND referenced policies, we can/do drop the + * reference here, so the caller doesn't need to know about the special case + * for default and current task policy. + */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, - gfp_t gfp_flags) + gfp_t gfp_flags, struct mempolicy **mpol) { struct mempolicy *pol = get_vma_policy(current, vma, addr); + struct zonelist *zl; + *mpol = NULL; /* probably no unref needed */ if (pol->policy == MPOL_INTERLEAVE) { unsigned nid; nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); + __mpol_free(pol); /* finished with pol */ return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags); } - return zonelist_policy(GFP_HIGHUSER, pol); + + zl = zonelist_policy(GFP_HIGHUSER, pol); + if (unlikely(pol != &default_policy && pol != current->mempolicy)) { + if (pol->policy != MPOL_BIND) + __mpol_free(pol); /* finished with pol */ + else + *mpol = pol; /* unref needed after allocation */ + } + return zl; } #endif @@ -1259,6 +1306,7 @@ struct page * alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); + struct zonelist *zl; cpuset_update_task_memory_state(); @@ -1268,7 +1316,19 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); return alloc_page_interleave(gfp, 0, nid); } - return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); + zl = zonelist_policy(gfp, pol); + if (pol != &default_policy && pol != current->mempolicy) { + /* + * slow path: ref counted policy -- shared or vma + */ + struct page *page = __alloc_pages(gfp, 0, zl); + __mpol_free(pol); + return page; + } + /* + * fast path: default or task policy + */ + return __alloc_pages(gfp, 0, zl); } /** @@ -1605,11 +1665,11 @@ void __init numa_policy_init(void) policy_cache = kmem_cache_create("numa_policy", sizeof(struct mempolicy), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); sn_cache = kmem_cache_create("shared_policy_node", sizeof(struct sp_node), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); /* * Set interleaving policy for system init. Interleaving is only @@ -1867,6 +1927,7 @@ int show_numa_map(struct seq_file *m, void *v) struct numa_maps *md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; + struct mempolicy *pol; int n; char buffer[50]; @@ -1877,8 +1938,13 @@ int show_numa_map(struct seq_file *m, void *v) if (!md) return 0; - mpol_to_str(buffer, sizeof(buffer), - get_vma_policy(priv->task, vma, vma->vm_start)); + pol = get_vma_policy(priv->task, vma, vma->vm_start); + mpol_to_str(buffer, sizeof(buffer), pol); + /* + * unref shared or other task's mempolicy + */ + if (pol != &default_policy && pol != current->mempolicy) + __mpol_free(pol); seq_printf(m, "%08lx %s", vma->vm_start, buffer); diff --git a/mm/migrate.c b/mm/migrate.c index 34d8ada053e4..e2fdbce1874b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -49,9 +49,8 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist) struct zone *zone = page_zone(page); spin_lock_irq(&zone->lru_lock); - if (PageLRU(page)) { + if (PageLRU(page) && get_page_unless_zero(page)) { ret = 0; - get_page(page); ClearPageLRU(page); if (PageActive(page)) del_page_from_active_list(zone, page); @@ -612,6 +611,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, int rc = 0; int *result = NULL; struct page *newpage = get_new_page(page, private, &result); + int rcu_locked = 0; if (!newpage) return -ENOMEM; @@ -632,18 +632,41 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, goto unlock; wait_on_page_writeback(page); } - /* - * Establish migration ptes or remove ptes + * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, + * we cannot notice that anon_vma is freed while we migrates a page. + * This rcu_read_lock() delays freeing anon_vma pointer until the end + * of migration. File cache pages are no problem because of page_lock() + * File Caches may use write_page() or lock_page() in migration, then, + * just care Anon page here. + */ + if (PageAnon(page)) { + rcu_read_lock(); + rcu_locked = 1; + } + /* + * This is a corner case handling. + * When a new swap-cache is read into, it is linked to LRU + * and treated as swapcache but has no rmap yet. + * Calling try_to_unmap() against a page->mapping==NULL page is + * BUG. So handle it here. */ + if (!page->mapping) + goto rcu_unlock; + /* Establish migration ptes or remove ptes */ try_to_unmap(page, 1); + if (!page_mapped(page)) rc = move_to_new_page(newpage, page); if (rc) remove_migration_ptes(page, page); +rcu_unlock: + if (rcu_locked) + rcu_read_unlock(); unlock: + unlock_page(page); if (rc != -EAGAIN) { diff --git a/mm/mmap.c b/mm/mmap.c index 144b4a290f2c..0d40e66c841b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -93,7 +93,7 @@ atomic_t vm_committed_space = ATOMIC_INIT(0); * Note this is a helper function intended to be used by LSMs which * wish to use this logic. */ -int __vm_enough_memory(long pages, int cap_sys_admin) +int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { unsigned long free, allowed; @@ -166,7 +166,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) /* Don't let a single process grow too big: leave 3% of the size of this process for other processes */ - allowed -= current->mm->total_vm / 32; + allowed -= mm->total_vm / 32; /* * cast `allowed' as a signed long because vm_committed_space @@ -1029,6 +1029,40 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, } EXPORT_SYMBOL(do_mmap_pgoff); +/* + * Some shared mappigns will want the pages marked read-only + * to track write events. If so, we'll downgrade vm_page_prot + * to the private version (using protection_map[] without the + * VM_SHARED bit). + */ +int vma_wants_writenotify(struct vm_area_struct *vma) +{ + unsigned int vm_flags = vma->vm_flags; + + /* If it was private or non-writable, the write bit is already clear */ + if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) + return 0; + + /* The backer wishes to know when pages are first written to? */ + if (vma->vm_ops && vma->vm_ops->page_mkwrite) + return 1; + + /* The open routine did something to the protections already? */ + if (pgprot_val(vma->vm_page_prot) != + pgprot_val(protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)])) + return 0; + + /* Specialty mapping? */ + if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) + return 0; + + /* Can the mapping track the dirty pages? */ + return vma->vm_file && vma->vm_file->f_mapping && + mapping_cap_account_dirty(vma->vm_file->f_mapping); +} + + unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, unsigned int vm_flags, unsigned long pgoff, @@ -1165,12 +1199,8 @@ out: mm->locked_vm += len >> PAGE_SHIFT; make_pages_present(addr, addr + len); } - if (flags & MAP_POPULATE) { - up_write(&mm->mmap_sem); - sys_remap_file_pages(addr, len, 0, - pgoff, flags & MAP_NONBLOCK); - down_write(&mm->mmap_sem); - } + if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) + make_pages_present(addr, addr + len); return addr; unmap_and_free_vma: @@ -1575,33 +1605,11 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ -#ifdef CONFIG_STACK_GROWSUP -int expand_stack(struct vm_area_struct *vma, unsigned long address) -{ - return expand_upwards(vma, address); -} - -struct vm_area_struct * -find_extend_vma(struct mm_struct *mm, unsigned long addr) -{ - struct vm_area_struct *vma, *prev; - - addr &= PAGE_MASK; - vma = find_vma_prev(mm, addr, &prev); - if (vma && (vma->vm_start <= addr)) - return vma; - if (!prev || expand_stack(prev, addr)) - return NULL; - if (prev->vm_flags & VM_LOCKED) { - make_pages_present(addr, prev->vm_end); - } - return prev; -} -#else /* * vma is the first one with address < vma->vm_start. Have to extend vma. */ -int expand_stack(struct vm_area_struct *vma, unsigned long address) +static inline int expand_downwards(struct vm_area_struct *vma, + unsigned long address) { int error; @@ -1638,6 +1646,38 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address) return error; } +int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address) +{ + return expand_downwards(vma, address); +} + +#ifdef CONFIG_STACK_GROWSUP +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return expand_upwards(vma, address); +} + +struct vm_area_struct * +find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma, *prev; + + addr &= PAGE_MASK; + vma = find_vma_prev(mm, addr, &prev); + if (vma && (vma->vm_start <= addr)) + return vma; + if (!prev || expand_stack(prev, addr)) + return NULL; + if (prev->vm_flags & VM_LOCKED) + make_pages_present(addr, prev->vm_end); + return prev; +} +#else +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return expand_downwards(vma, address); +} + struct vm_area_struct * find_extend_vma(struct mm_struct * mm, unsigned long addr) { @@ -1655,9 +1695,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; - if (vma->vm_flags & VM_LOCKED) { + if (vma->vm_flags & VM_LOCKED) make_pages_present(addr, start); - } return vma; } #endif @@ -2038,7 +2077,7 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) if (__vma && __vma->vm_start < vma->vm_end) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && - security_vm_enough_memory(vma_pages(vma))) + security_vm_enough_memory_mm(mm, vma_pages(vma))) return -ENOMEM; vma_link(mm, vma, prev, rb_link, rb_parent); return 0; diff --git a/mm/mprotect.c b/mm/mprotect.c index 3b8f3c0c63f3..e8346c30abec 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -128,7 +128,7 @@ static void change_protection(struct vm_area_struct *vma, flush_tlb_range(vma, start, end); } -static int +int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags) { diff --git a/mm/mremap.c b/mm/mremap.c index bc7c52efc71b..8ea5c2412c6e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -120,7 +120,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, #define LATENCY_LIMIT (64 * PAGE_SIZE) -static unsigned long move_page_tables(struct vm_area_struct *vma, +unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len) { diff --git a/mm/nommu.c b/mm/nommu.c index 8bbbf147a794..8ed0cb43118a 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -54,12 +54,6 @@ DECLARE_RWSEM(nommu_vma_sem); struct vm_operations_struct generic_file_vm_ops = { }; -EXPORT_SYMBOL(vfree); -EXPORT_SYMBOL(vmalloc_to_page); -EXPORT_SYMBOL(vmalloc_32); -EXPORT_SYMBOL(vmap); -EXPORT_SYMBOL(vunmap); - /* * Handle all mappings that got truncated by a "truncate()" * system call. @@ -168,7 +162,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, finish_or_fault: return i ? : -EFAULT; } - EXPORT_SYMBOL(get_user_pages); DEFINE_RWLOCK(vmlist_lock); @@ -178,6 +171,7 @@ void vfree(void *addr) { kfree(addr); } +EXPORT_SYMBOL(vfree); void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { @@ -186,17 +180,19 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) */ return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); } +EXPORT_SYMBOL(__vmalloc); struct page * vmalloc_to_page(void *addr) { return virt_to_page(addr); } +EXPORT_SYMBOL(vmalloc_to_page); unsigned long vmalloc_to_pfn(void *addr) { return page_to_pfn(virt_to_page(addr)); } - +EXPORT_SYMBOL(vmalloc_to_pfn); long vread(char *buf, char *addr, unsigned long count) { @@ -237,9 +233,8 @@ void *vmalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vmalloc_node); -/* - * vmalloc_32 - allocate virtually continguos memory (32bit addressable) - * +/** + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size * * Allocate enough 32bit PA addressable pages to cover @size from the @@ -249,17 +244,33 @@ void *vmalloc_32(unsigned long size) { return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); } +EXPORT_SYMBOL(vmalloc_32); + +/** + * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory + * @size: allocation size + * + * The resulting memory area is 32bit addressable and zeroed so it can be + * mapped to userspace without leaking data. + */ +void *vmalloc_32_user(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); +} +EXPORT_SYMBOL(vmalloc_32_user); void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { BUG(); return NULL; } +EXPORT_SYMBOL(vmap); void vunmap(void *addr) { BUG(); } +EXPORT_SYMBOL(vunmap); /* * Implement a stub for vmalloc_sync_all() if the architecture chose not to @@ -269,6 +280,13 @@ void __attribute__((weak)) vmalloc_sync_all(void) { } +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_insert_page); + /* * sys_brk() for the most part doesn't need the global kernel * lock, except when an application is doing something nasty @@ -994,6 +1012,7 @@ unsigned long do_mmap_pgoff(struct file *file, show_free_areas(); return -ENOMEM; } +EXPORT_SYMBOL(do_mmap_pgoff); /* * handle mapping disposal for uClinux @@ -1074,6 +1093,7 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) return 0; } +EXPORT_SYMBOL(do_munmap); asmlinkage long sys_munmap(unsigned long addr, size_t len) { @@ -1164,6 +1184,7 @@ unsigned long do_mremap(unsigned long addr, return vma->vm_start; } +EXPORT_SYMBOL(do_mremap); asmlinkage unsigned long sys_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, @@ -1231,7 +1252,6 @@ unsigned long get_unmapped_area(struct file *file, unsigned long addr, return get_area(file, addr, len, pgoff, flags); } - EXPORT_SYMBOL(get_unmapped_area); /* @@ -1250,7 +1270,7 @@ EXPORT_SYMBOL(get_unmapped_area); * Note this is a helper function intended to be used by LSMs which * wish to use this logic. */ -int __vm_enough_memory(long pages, int cap_sys_admin) +int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { unsigned long free, allowed; @@ -1341,12 +1361,12 @@ int in_gate_area_no_task(unsigned long addr) return 0; } -struct page *filemap_nopage(struct vm_area_struct *area, - unsigned long address, int *type) +int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { BUG(); - return NULL; + return 0; } +EXPORT_SYMBOL(filemap_fault); /* * Access another process' address space. diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a7001410ab15..f9b82ad5047f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -17,6 +17,7 @@ #include <linux/oom.h> #include <linux/mm.h> +#include <linux/err.h> #include <linux/sched.h> #include <linux/swap.h> #include <linux/timex.h> @@ -156,7 +157,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) } #ifdef DEBUG - printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n", + printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n", p->pid, p->comm, points); #endif return points; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 886ea0d5a136..44720363374c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -274,9 +274,9 @@ static void balance_dirty_pages(struct address_space *mapping) pdflush_operation(background_writeout, 0); } -void set_page_dirty_balance(struct page *page) +void set_page_dirty_balance(struct page *page, int page_mkwrite) { - if (set_page_dirty(page)) { + if (set_page_dirty(page) || page_mkwrite) { struct address_space *mapping = page_mapping(page); if (mapping) @@ -918,6 +918,9 @@ int clear_page_dirty_for_io(struct page *page) { struct address_space *mapping = page_mapping(page); + BUG_ON(!PageLocked(page)); + + ClearPageReclaim(page); if (mapping && mapping_cap_account_dirty(mapping)) { /* * Yes, Virginia, this is indeed insane. @@ -943,14 +946,19 @@ int clear_page_dirty_for_io(struct page *page) * We basically use the page "master dirty bit" * as a serialization point for all the different * threads doing their things. - * - * FIXME! We still have a race here: if somebody - * adds the page back to the page tables in - * between the "page_mkclean()" and the "TestClearPageDirty()", - * we might have it mapped without the dirty bit set. */ if (page_mkclean(page)) set_page_dirty(page); + /* + * We carefully synchronise fault handlers against + * installing a dirty pte and marking the page dirty + * at this point. We do this by having them hold the + * page lock at some point after installing their + * pte, but before marking the page dirty. + * Pages are always locked coming in here, so we get + * the desired exclusion. See mm/memory.c:do_wp_page() + * for more comments. + */ if (TestClearPageDirty(page)) { dec_zone_page_state(page, NR_FILE_DIRTY); return 1; @@ -979,6 +987,8 @@ int test_clear_page_writeback(struct page *page) } else { ret = TestClearPageWriteback(page); } + if (ret) + dec_zone_page_state(page, NR_WRITEBACK); return ret; } @@ -1004,6 +1014,8 @@ int test_set_page_writeback(struct page *page) } else { ret = TestSetPageWriteback(page); } + if (!ret) + inc_zone_page_state(page, NR_WRITEBACK); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e2a10b957f23..1a8c59571cb7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -138,7 +138,7 @@ static unsigned long __meminitdata dma_reserve; #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ unsigned long __initdata required_kernelcore; unsigned long __initdata required_movablecore; - unsigned long __initdata zone_movable_pfn[MAX_NUMNODES]; + unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -453,12 +453,6 @@ static inline int free_pages_check(struct page *page) 1 << PG_reserved | 1 << PG_buddy )))) bad_page(page); - /* - * PageReclaim == PageTail. It is only an error - * for PageReclaim to be set if PageCompound is clear. - */ - if (unlikely(!PageCompound(page) && PageReclaim(page))) - bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); /* @@ -602,7 +596,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 1 << PG_locked | 1 << PG_active | 1 << PG_dirty | - 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | @@ -617,7 +610,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) if (PageReserved(page)) return 1; - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); set_page_private(page, 0); @@ -733,7 +726,7 @@ static void __drain_pages(unsigned int cpu) } } -#ifdef CONFIG_PM +#ifdef CONFIG_HIBERNATION void mark_free_pages(struct zone *zone) { @@ -779,7 +772,7 @@ void drain_local_pages(void) __drain_pages(smp_processor_id()); local_irq_restore(flags); } -#endif /* CONFIG_PM */ +#endif /* CONFIG_HIBERNATION */ /* * Free a 0-order page @@ -1164,6 +1157,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ + enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */ zonelist_scan: /* @@ -1173,6 +1167,18 @@ zonelist_scan: z = zonelist->zones; do { + /* + * In NUMA, this could be a policy zonelist which contains + * zones that may not be allowed by the current gfp_mask. + * Check the zone is allowed by the current flags + */ + if (unlikely(alloc_should_filter_zonelist(zonelist))) { + if (highest_zoneidx == -1) + highest_zoneidx = gfp_zone(gfp_mask); + if (zone_idx(*z) > highest_zoneidx) + continue; + } + if (NUMA_BUILD && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; @@ -1357,6 +1363,10 @@ nofail_alloc: if (page) goto got_pg; + /* The OOM killer will not help higher order allocs so fail */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto nopage; + out_of_memory(zonelist, gfp_mask, order); goto restart; } @@ -2335,6 +2345,8 @@ static int __cpuinit process_zones(int cpu) return 0; bad: for_each_zone(dzone) { + if (!populated_zone(dzone)) + continue; if (dzone == zone) break; kfree(zone_pcp(dzone, cpu)); @@ -2782,11 +2794,11 @@ unsigned long __meminit __absent_pages_in_range(int nid, if (i == -1) return 0; + prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn); + /* Account for ranges before physical memory on this node */ if (early_node_map[i].start_pfn > range_start_pfn) - hole_pages = early_node_map[i].start_pfn - range_start_pfn; - - prev_end_pfn = early_node_map[i].start_pfn; + hole_pages = prev_end_pfn - range_start_pfn; /* Find all holes for the zone within the node */ for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { diff --git a/mm/page_io.c b/mm/page_io.c index dbffec0d78c9..3b97f6850273 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -44,14 +44,11 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, return bio; } -static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err) +static void end_swap_bio_write(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_size) - return 1; - if (!uptodate) { SetPageError(page); /* @@ -71,17 +68,13 @@ static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err) } end_page_writeback(page); bio_put(bio); - return 0; } -int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) +void end_swap_bio_read(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_size) - return 1; - if (!uptodate) { SetPageError(page); ClearPageUptodate(page); @@ -94,7 +87,6 @@ int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) } unlock_page(page); bio_put(bio); - return 0; } /* diff --git a/mm/readahead.c b/mm/readahead.c index 9861e883fe57..be20c9d699d3 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -15,14 +15,23 @@ #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> #include <linux/pagevec.h> +#include <linux/pagemap.h> void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { } EXPORT_SYMBOL(default_unplug_io_fn); +/* + * Convienent macros for min/max read-ahead pages. + * Note that MAX_RA_PAGES is rounded down, while MIN_RA_PAGES is rounded up. + * The latter is necessary for systems with large page size(i.e. 64k). + */ +#define MAX_RA_PAGES (VM_MAX_READAHEAD*1024 / PAGE_CACHE_SIZE) +#define MIN_RA_PAGES DIV_ROUND_UP(VM_MIN_READAHEAD*1024, PAGE_CACHE_SIZE) + struct backing_dev_info default_backing_dev_info = { - .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, + .ra_pages = MAX_RA_PAGES, .state = 0, .capabilities = BDI_CAP_MAP_COPY, .unplug_io_fn = default_unplug_io_fn, @@ -41,82 +50,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) } EXPORT_SYMBOL_GPL(file_ra_state_init); -/* - * Return max readahead size for this inode in number-of-pages. - */ -static inline unsigned long get_max_readahead(struct file_ra_state *ra) -{ - return ra->ra_pages; -} - -static inline unsigned long get_min_readahead(struct file_ra_state *ra) -{ - return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; -} - -static inline void reset_ahead_window(struct file_ra_state *ra) -{ - /* - * ... but preserve ahead_start + ahead_size value, - * see 'recheck:' label in page_cache_readahead(). - * Note: We never use ->ahead_size as rvalue without - * checking ->ahead_start != 0 first. - */ - ra->ahead_size += ra->ahead_start; - ra->ahead_start = 0; -} - -static inline void ra_off(struct file_ra_state *ra) -{ - ra->start = 0; - ra->flags = 0; - ra->size = 0; - reset_ahead_window(ra); - return; -} - -/* - * Set the initial window size, round to next power of 2 and square - * for small size, x 4 for medium, and x 2 for large - * for 128k (32 page) max ra - * 1-8 page = 32k initial, > 8 page = 128k initial - */ -static unsigned long get_init_ra_size(unsigned long size, unsigned long max) -{ - unsigned long newsize = roundup_pow_of_two(size); - - if (newsize <= max / 32) - newsize = newsize * 4; - else if (newsize <= max / 4) - newsize = newsize * 2; - else - newsize = max; - return newsize; -} - -/* - * Set the new window size, this is called only when I/O is to be submitted, - * not for each call to readahead. If a cache miss occured, reduce next I/O - * size, else increase depending on how close to max we are. - */ -static inline unsigned long get_next_ra_size(struct file_ra_state *ra) -{ - unsigned long max = get_max_readahead(ra); - unsigned long min = get_min_readahead(ra); - unsigned long cur = ra->size; - unsigned long newsize; - - if (ra->flags & RA_FLAG_MISS) { - ra->flags &= ~RA_FLAG_MISS; - newsize = max((cur - 2), min); - } else if (cur < max / 16) { - newsize = 4 * cur; - } else { - newsize = 2 * cur; - } - return min(newsize, max); -} - #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) /** @@ -193,66 +126,6 @@ out: } /* - * Readahead design. - * - * The fields in struct file_ra_state represent the most-recently-executed - * readahead attempt: - * - * start: Page index at which we started the readahead - * size: Number of pages in that read - * Together, these form the "current window". - * Together, start and size represent the `readahead window'. - * prev_index: The page which the readahead algorithm most-recently inspected. - * It is mainly used to detect sequential file reading. - * If page_cache_readahead sees that it is again being called for - * a page which it just looked at, it can return immediately without - * making any state changes. - * offset: Offset in the prev_index where the last read ended - used for - * detection of sequential file reading. - * ahead_start, - * ahead_size: Together, these form the "ahead window". - * ra_pages: The externally controlled max readahead for this fd. - * - * When readahead is in the off state (size == 0), readahead is disabled. - * In this state, prev_index is used to detect the resumption of sequential I/O. - * - * The readahead code manages two windows - the "current" and the "ahead" - * windows. The intent is that while the application is walking the pages - * in the current window, I/O is underway on the ahead window. When the - * current window is fully traversed, it is replaced by the ahead window - * and the ahead window is invalidated. When this copying happens, the - * new current window's pages are probably still locked. So - * we submit a new batch of I/O immediately, creating a new ahead window. - * - * So: - * - * ----|----------------|----------------|----- - * ^start ^start+size - * ^ahead_start ^ahead_start+ahead_size - * - * ^ When this page is read, we submit I/O for the - * ahead window. - * - * A `readahead hit' occurs when a read request is made against a page which is - * the next sequential page. Ahead window calculations are done only when it - * is time to submit a new IO. The code ramps up the size agressively at first, - * but slow down as it approaches max_readhead. - * - * Any seek/ramdom IO will result in readahead being turned off. It will resume - * at the first sequential access. - * - * There is a special-case: if the first page which the application tries to - * read happens to be the first page of the file, it is assumed that a linear - * read is about to happen and the window is immediately set to the initial size - * based on I/O request size and the max_readahead. - * - * This function is to be called for every read request, rather than when - * it is time to perform readahead. It is called only once for the entire I/O - * regardless of size unless readahead is unable to start enough I/O to satisfy - * the request (I/O request > max_readahead). - */ - -/* * do_page_cache_readahead actually reads a chunk of disk. It allocates all * the pages first, then submits them all for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. @@ -265,7 +138,8 @@ out: */ static int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) + pgoff_t offset, unsigned long nr_to_read, + unsigned long lookahead_size) { struct inode *inode = mapping->host; struct page *page; @@ -278,7 +152,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, if (isize == 0) goto out; - end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); /* * Preallocate as many pages as we will need. @@ -286,7 +160,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, read_lock_irq(&mapping->tree_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { pgoff_t page_offset = offset + page_idx; - + if (page_offset > end_index) break; @@ -301,6 +175,8 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, break; page->index = page_offset; list_add(&page->lru, &page_pool); + if (page_idx == nr_to_read - lookahead_size) + SetPageReadahead(page); ret++; } read_unlock_irq(&mapping->tree_lock); @@ -337,7 +213,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, if (this_chunk > nr_to_read) this_chunk = nr_to_read; err = __do_page_cache_readahead(mapping, filp, - offset, this_chunk); + offset, this_chunk, 0); if (err < 0) { ret = err; break; @@ -350,28 +226,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, } /* - * Check how effective readahead is being. If the amount of started IO is - * less than expected then the file is partly or fully in pagecache and - * readahead isn't helping. - * - */ -static inline int check_ra_success(struct file_ra_state *ra, - unsigned long nr_to_read, unsigned long actual) -{ - if (actual == 0) { - ra->cache_hit += nr_to_read; - if (ra->cache_hit >= VM_MAX_CACHE_HIT) { - ra_off(ra); - ra->flags |= RA_FLAG_INCACHE; - return 0; - } - } else { - ra->cache_hit=0; - } - return 1; -} - -/* * This version skips the IO if the queue is read-congested, and will tell the * block layer to abandon the readahead if request allocation would block. * @@ -384,200 +238,237 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp, if (bdi_read_congested(mapping->backing_dev_info)) return -1; - return __do_page_cache_readahead(mapping, filp, offset, nr_to_read); + return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); } /* - * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block' - * is set wait till the read completes. Otherwise attempt to read without - * blocking. - * Returns 1 meaning 'success' if read is successful without switching off - * readahead mode. Otherwise return failure. + * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a + * sensible upper limit. */ -static int -blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read, - struct file_ra_state *ra, int block) +unsigned long max_sane_readahead(unsigned long nr) +{ + return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE) + + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); +} + +/* + * Submit IO for the read-ahead request in file_ra_state. + */ +static unsigned long ra_submit(struct file_ra_state *ra, + struct address_space *mapping, struct file *filp) { int actual; - if (!block && bdi_read_congested(mapping->backing_dev_info)) - return 0; + actual = __do_page_cache_readahead(mapping, filp, + ra->start, ra->size, ra->async_size); + + return actual; +} - actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read); +/* + * Set the initial window size, round to next power of 2 and square + * for small size, x 4 for medium, and x 2 for large + * for 128k (32 page) max ra + * 1-8 page = 32k initial, > 8 page = 128k initial + */ +static unsigned long get_init_ra_size(unsigned long size, unsigned long max) +{ + unsigned long newsize = roundup_pow_of_two(size); - return check_ra_success(ra, nr_to_read, actual); + if (newsize <= max / 32) + newsize = newsize * 4; + else if (newsize <= max / 4) + newsize = newsize * 2; + else + newsize = max; + + return newsize; } -static int make_ahead_window(struct address_space *mapping, struct file *filp, - struct file_ra_state *ra, int force) +/* + * Get the previous window size, ramp it up, and + * return it as the new window size. + */ +static unsigned long get_next_ra_size(struct file_ra_state *ra, + unsigned long max) { - int block, ret; - - ra->ahead_size = get_next_ra_size(ra); - ra->ahead_start = ra->start + ra->size; - - block = force || (ra->prev_index >= ra->ahead_start); - ret = blockable_page_cache_readahead(mapping, filp, - ra->ahead_start, ra->ahead_size, ra, block); - - if (!ret && !force) { - /* A read failure in blocking mode, implies pages are - * all cached. So we can safely assume we have taken - * care of all the pages requested in this call. - * A read failure in non-blocking mode, implies we are - * reading more pages than requested in this call. So - * we safely assume we have taken care of all the pages - * requested in this call. - * - * Just reset the ahead window in case we failed due to - * congestion. The ahead window will any way be closed - * in case we failed due to excessive page cache hits. - */ - reset_ahead_window(ra); - } + unsigned long cur = ra->size; + unsigned long newsize; - return ret; + if (cur < max / 16) + newsize = 4 * cur; + else + newsize = 2 * cur; + + return min(newsize, max); } -/** - * page_cache_readahead - generic adaptive readahead - * @mapping: address_space which holds the pagecache and I/O vectors - * @ra: file_ra_state which holds the readahead state - * @filp: passed on to ->readpage() and ->readpages() - * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units - * @req_size: hint: total size of the read which the caller is performing in - * PAGE_CACHE_SIZE units +/* + * On-demand readahead design. + * + * The fields in struct file_ra_state represent the most-recently-executed + * readahead attempt: + * + * |<----- async_size ---------| + * |------------------- size -------------------->| + * |==================#===========================| + * ^start ^page marked with PG_readahead * - * page_cache_readahead() is the main function. If performs the adaptive - * readahead window size management and submits the readahead I/O. + * To overlap application thinking time and disk I/O time, we do + * `readahead pipelining': Do not wait until the application consumed all + * readahead pages and stalled on the missing page at readahead_index; + * Instead, submit an asynchronous readahead I/O as soon as there are + * only async_size pages left in the readahead window. Normally async_size + * will be equal to size, for maximum pipelining. * - * Note that @filp is purely used for passing on to the ->readpage[s]() - * handler: it may refer to a different file from @mapping (so we may not use - * @filp->f_mapping or @filp->f_path.dentry->d_inode here). - * Also, @ra may not be equal to &@filp->f_ra. + * In interleaved sequential reads, concurrent streams on the same fd can + * be invalidating each other's readahead state. So we flag the new readahead + * page at (start+size-async_size) with PG_readahead, and use it as readahead + * indicator. The flag won't be set on already cached pages, to avoid the + * readahead-for-nothing fuss, saving pointless page cache lookups. + * + * prev_index tracks the last visited page in the _previous_ read request. + * It should be maintained by the caller, and will be used for detecting + * small random reads. Note that the readahead algorithm checks loosely + * for sequential patterns. Hence interleaved reads might be served as + * sequential ones. + * + * There is a special-case: if the first page which the application tries to + * read happens to be the first page of the file, it is assumed that a linear + * read is about to happen and the window is immediately set to the initial size + * based on I/O request size and the max_readahead. * + * The code ramps up the readahead size aggressively at first, but slow down as + * it approaches max_readhead. + */ + +/* + * A minimal readahead algorithm for trivial sequential/random reads. */ -unsigned long -page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, - struct file *filp, pgoff_t offset, unsigned long req_size) +static unsigned long +ondemand_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + bool hit_readahead_marker, pgoff_t offset, + unsigned long req_size) { - unsigned long max, newsize; + unsigned long max; /* max readahead pages */ int sequential; - /* - * We avoid doing extra work and bogusly perturbing the readahead - * window expansion logic. - */ - if (offset == ra->prev_index && --req_size) - ++offset; - - /* Note that prev_index == -1 if it is a first read */ - sequential = (offset == ra->prev_index + 1); - ra->prev_index = offset; - ra->prev_offset = 0; - - max = get_max_readahead(ra); - newsize = min(req_size, max); - - /* No readahead or sub-page sized read or file already in cache */ - if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE)) - goto out; - - ra->prev_index += newsize - 1; + max = ra->ra_pages; + sequential = (offset - ra->prev_index <= 1UL) || (req_size > max); /* - * Special case - first read at start of file. We'll assume it's - * a whole-file read and grow the window fast. Or detect first - * sequential access + * It's the expected callback offset, assume sequential access. + * Ramp up sizes, and push forward the readahead window. */ - if (sequential && ra->size == 0) { - ra->size = get_init_ra_size(newsize, max); - ra->start = offset; - if (!blockable_page_cache_readahead(mapping, filp, offset, - ra->size, ra, 1)) - goto out; - - /* - * If the request size is larger than our max readahead, we - * at least want to be sure that we get 2 IOs in flight and - * we know that we will definitly need the new I/O. - * once we do this, subsequent calls should be able to overlap - * IOs,* thus preventing stalls. so issue the ahead window - * immediately. - */ - if (req_size >= max) - make_ahead_window(mapping, filp, ra, 1); - - goto out; + if (offset && (offset == (ra->start + ra->size - ra->async_size) || + offset == (ra->start + ra->size))) { + ra->start += ra->size; + ra->size = get_next_ra_size(ra, max); + ra->async_size = ra->size; + goto readit; } /* - * Now handle the random case: - * partial page reads and first access were handled above, - * so this must be the next page otherwise it is random + * Standalone, small read. + * Read as is, and do not pollute the readahead state. */ - if (!sequential) { - ra_off(ra); - blockable_page_cache_readahead(mapping, filp, offset, - newsize, ra, 1); - goto out; + if (!hit_readahead_marker && !sequential) { + return __do_page_cache_readahead(mapping, filp, + offset, req_size, 0); } /* - * If we get here we are doing sequential IO and this was not the first - * occurence (ie we have an existing window) + * It may be one of + * - first read on start of file + * - sequential cache miss + * - oversize random read + * Start readahead for it. */ - if (ra->ahead_start == 0) { /* no ahead window yet */ - if (!make_ahead_window(mapping, filp, ra, 0)) - goto recheck; - } + ra->start = offset; + ra->size = get_init_ra_size(req_size, max); + ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; /* - * Already have an ahead window, check if we crossed into it. - * If so, shift windows and issue a new ahead window. - * Only return the #pages that are in the current window, so that - * we get called back on the first page of the ahead window which - * will allow us to submit more IO. + * Hit on a marked page without valid readahead state. + * E.g. interleaved reads. + * Not knowing its readahead pos/size, bet on the minimal possible one. */ - if (ra->prev_index >= ra->ahead_start) { - ra->start = ra->ahead_start; - ra->size = ra->ahead_size; - make_ahead_window(mapping, filp, ra, 0); -recheck: - /* prev_index shouldn't overrun the ahead window */ - ra->prev_index = min(ra->prev_index, - ra->ahead_start + ra->ahead_size - 1); + if (hit_readahead_marker) { + ra->start++; + ra->size = get_next_ra_size(ra, max); } -out: - return ra->prev_index + 1; +readit: + return ra_submit(ra, mapping, filp); } -EXPORT_SYMBOL_GPL(page_cache_readahead); -/* - * handle_ra_miss() is called when it is known that a page which should have - * been present in the pagecache (we just did some readahead there) was in fact - * not found. This will happen if it was evicted by the VM (readahead - * thrashing) +/** + * page_cache_sync_readahead - generic file readahead + * @mapping: address_space which holds the pagecache and I/O vectors + * @ra: file_ra_state which holds the readahead state + * @filp: passed on to ->readpage() and ->readpages() + * @offset: start offset into @mapping, in pagecache page-sized units + * @req_size: hint: total size of the read which the caller is performing in + * pagecache pages * - * Turn on the cache miss flag in the RA struct, this will cause the RA code - * to reduce the RA size on the next read. + * page_cache_sync_readahead() should be called when a cache miss happened: + * it will submit the read. The readahead logic may decide to piggyback more + * pages onto the read request if access patterns suggest it will improve + * performance. */ -void handle_ra_miss(struct address_space *mapping, - struct file_ra_state *ra, pgoff_t offset) +void page_cache_sync_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + pgoff_t offset, unsigned long req_size) { - ra->flags |= RA_FLAG_MISS; - ra->flags &= ~RA_FLAG_INCACHE; - ra->cache_hit = 0; + /* no read-ahead */ + if (!ra->ra_pages) + return; + + /* do read-ahead */ + ondemand_readahead(mapping, ra, filp, false, offset, req_size); } +EXPORT_SYMBOL_GPL(page_cache_sync_readahead); -/* - * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a - * sensible upper limit. - */ -unsigned long max_sane_readahead(unsigned long nr) +/** + * page_cache_async_readahead - file readahead for marked pages + * @mapping: address_space which holds the pagecache and I/O vectors + * @ra: file_ra_state which holds the readahead state + * @filp: passed on to ->readpage() and ->readpages() + * @page: the page at @offset which has the PG_readahead flag set + * @offset: start offset into @mapping, in pagecache page-sized units + * @req_size: hint: total size of the read which the caller is performing in + * pagecache pages + * + * page_cache_async_ondemand() should be called when a page is used which + * has the PG_readahead flag: this is a marker to suggest that the application + * has used up enough of the readahead window that we should start pulling in + * more pages. */ +void +page_cache_async_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + struct page *page, pgoff_t offset, + unsigned long req_size) { - return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE) - + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); + /* no read-ahead */ + if (!ra->ra_pages) + return; + + /* + * Same bit is used for PG_readahead and PG_reclaim. + */ + if (PageWriteback(page)) + return; + + ClearPageReadahead(page); + + /* + * Defer asynchronous read-ahead on IO congestion. + */ + if (bdi_read_congested(mapping->backing_dev_info)) + return; + + /* do read-ahead */ + ondemand_readahead(mapping, ra, filp, true, offset, req_size); } +EXPORT_SYMBOL_GPL(page_cache_async_readahead); diff --git a/mm/rmap.c b/mm/rmap.c index 61e492597a0b..41ac39749ef4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -149,7 +149,7 @@ static void anon_vma_ctor(void *data, struct kmem_cache *cachep, void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), - 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL); + 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); } /* @@ -621,8 +621,10 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) printk (KERN_EMERG " page->count = %x\n", page_count(page)); printk (KERN_EMERG " page->mapping = %p\n", page->mapping); print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); - if (vma->vm_ops) + if (vma->vm_ops) { print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage); + print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault); + } if (vma->vm_file && vma->vm_file->f_op) print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap); BUG(); diff --git a/mm/shmem.c b/mm/shmem.c index 96fa79fb6ad3..fcd19d323f9f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -83,6 +83,7 @@ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_WRITE, /* may exceed i_size, may allocate page */ + SGP_FAULT, /* same as SGP_CACHE, return with page locked */ }; static int shmem_getpage(struct inode *inode, unsigned long idx, @@ -1100,6 +1101,10 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, if (idx >= SHMEM_MAX_INDEX) return -EFBIG; + + if (type) + *type = 0; + /* * Normally, filepage is NULL on entry, and either found * uptodate immediately, or allocated and zeroed, or read @@ -1133,9 +1138,9 @@ repeat: if (!swappage) { shmem_swp_unmap(entry); /* here we actually do the io */ - if (type && *type == VM_FAULT_MINOR) { + if (type && !(*type & VM_FAULT_MAJOR)) { __count_vm_event(PGMAJFAULT); - *type = VM_FAULT_MAJOR; + *type |= VM_FAULT_MAJOR; } spin_unlock(&info->lock); swappage = shmem_swapin(info, swap, idx); @@ -1289,8 +1294,10 @@ repeat: } done: if (*pagep != filepage) { - unlock_page(filepage); *pagep = filepage; + if (sgp != SGP_FAULT) + unlock_page(filepage); + } return 0; @@ -1302,72 +1309,21 @@ failed: return error; } -static struct page *shmem_nopage(struct vm_area_struct *vma, - unsigned long address, int *type) +static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = vma->vm_file->f_path.dentry->d_inode; - struct page *page = NULL; - unsigned long idx; int error; + int ret; - idx = (address - vma->vm_start) >> PAGE_SHIFT; - idx += vma->vm_pgoff; - idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; - if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode)) - return NOPAGE_SIGBUS; + if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + return VM_FAULT_SIGBUS; - error = shmem_getpage(inode, idx, &page, SGP_CACHE, type); + error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_FAULT, &ret); if (error) - return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS; - - mark_page_accessed(page); - return page; -} - -static int shmem_populate(struct vm_area_struct *vma, - unsigned long addr, unsigned long len, - pgprot_t prot, unsigned long pgoff, int nonblock) -{ - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; - struct mm_struct *mm = vma->vm_mm; - enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; - unsigned long size; + return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size) - return -EINVAL; - - while ((long) len > 0) { - struct page *page = NULL; - int err; - /* - * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE - */ - err = shmem_getpage(inode, pgoff, &page, sgp, NULL); - if (err) - return err; - /* Page may still be null, but only if nonblock was set. */ - if (page) { - mark_page_accessed(page); - err = install_page(mm, vma, addr, page, prot); - if (err) { - page_cache_release(page); - return err; - } - } else if (vma->vm_flags & VM_NONLINEAR) { - /* No page was found just because we can't read it in - * now (being here implies nonblock != 0), but the page - * may exist, so set the PTE to fault it in later. */ - err = install_file_pte(mm, vma, addr, pgoff, prot); - if (err) - return err; - } - - len -= PAGE_SIZE; - addr += PAGE_SIZE; - pgoff++; - } - return 0; + mark_page_accessed(vmf->page); + return ret | VM_FAULT_LOCKED; } #ifdef CONFIG_NUMA @@ -1414,6 +1370,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &shmem_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } @@ -2365,7 +2322,7 @@ static int init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), - 0, 0, init_once, NULL); + 0, 0, init_once); if (shmem_inode_cachep == NULL) return -ENOMEM; return 0; @@ -2459,8 +2416,7 @@ static const struct super_operations shmem_ops = { }; static struct vm_operations_struct shmem_vm_ops = { - .nopage = shmem_nopage, - .populate = shmem_populate, + .fault = shmem_fault, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, diff --git a/mm/slab.c b/mm/slab.c index 96d30ee256ef..6f6abef83a1a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -883,6 +883,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, */ static int use_alien_caches __read_mostly = 1; +static int numa_platform __read_mostly = 1; static int __init noaliencache_setup(char *s) { use_alien_caches = 0; @@ -1163,7 +1164,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); - int memsize = sizeof(struct kmem_list3); + const int memsize = sizeof(struct kmem_list3); switch (action) { case CPU_LOCK_ACQUIRE: @@ -1399,8 +1400,10 @@ void __init kmem_cache_init(void) int order; int node; - if (num_possible_nodes() == 1) + if (num_possible_nodes() == 1) { use_alien_caches = 0; + numa_platform = 0; + } for (i = 0; i < NUM_INIT_LISTS; i++) { kmem_list3_init(&initkmem_list3[i]); @@ -1484,7 +1487,7 @@ void __init kmem_cache_init(void) sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL, NULL); + NULL); if (INDEX_AC != INDEX_L3) { sizes[INDEX_L3].cs_cachep = @@ -1492,7 +1495,7 @@ void __init kmem_cache_init(void) sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL, NULL); + NULL); } slab_early_init = 0; @@ -1510,7 +1513,7 @@ void __init kmem_cache_init(void) sizes->cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL, NULL); + NULL); } #ifdef CONFIG_ZONE_DMA sizes->cs_dmacachep = kmem_cache_create( @@ -1519,7 +1522,7 @@ void __init kmem_cache_init(void) ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC, - NULL, NULL); + NULL); #endif sizes++; names++; @@ -2101,12 +2104,10 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) * @align: The required alignment for the objects. * @flags: SLAB flags * @ctor: A constructor for the objects. - * @dtor: A destructor for the objects (not implemented anymore). * * Returns a ptr to the cache on success, NULL on failure. * Cannot be called within a int, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache - * and the @dtor is run before the pages are handed back. + * The @ctor is run when new pages are allocated by the cache. * * @name must be valid until the cache is destroyed. This implies that * the module calling this has to destroy the cache before getting unloaded. @@ -2126,8 +2127,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(void*, struct kmem_cache *, unsigned long), - void (*dtor)(void*, struct kmem_cache *, unsigned long)) + void (*ctor)(void*, struct kmem_cache *, unsigned long)) { size_t left_over, slab_size, ralign; struct kmem_cache *cachep = NULL, *pc; @@ -2136,7 +2136,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * Sanity checks... these are all serious usage bugs. */ if (!name || in_interrupt() || (size < BYTES_PER_WORD) || - size > KMALLOC_MAX_SIZE || dtor) { + size > KMALLOC_MAX_SIZE) { printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, name); BUG(); @@ -2779,7 +2779,7 @@ static int cache_grow(struct kmem_cache *cachep, * 'nodeid'. */ if (!objp) - objp = kmem_getpages(cachep, flags, nodeid); + objp = kmem_getpages(cachep, local_flags, nodeid); if (!objp) goto failed; @@ -3561,7 +3561,14 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); - if (cache_free_alien(cachep, objp)) + /* + * Skip calling cache_free_alien() when the platform is not numa. + * This will avoid cache misses that happen while accessing slabp (which + * is per page memory reference) to get nodeid. Instead use a global + * variable to skip the call, which is mostly likely to be present in + * the cache. + */ + if (numa_platform && cache_free_alien(cachep, objp)) return; if (likely(ac->avail < ac->limit)) { @@ -3690,8 +3697,8 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, * functions. */ cachep = __find_general_cachep(size, flags); - if (unlikely(cachep == NULL)) - return NULL; + if (unlikely(ZERO_OR_NULL_PTR(cachep))) + return cachep; return __cache_alloc(cachep, flags, caller); } diff --git a/mm/slob.c b/mm/slob.c index c89ef116d7aa..ec33fcdc852e 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -293,6 +293,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) { struct slob_page *sp; + struct list_head *prev; slob_t *b = NULL; unsigned long flags; @@ -307,12 +308,22 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) if (node != -1 && page_to_nid(&sp->page) != node) continue; #endif + /* Enough room on this page? */ + if (sp->units < SLOB_UNITS(size)) + continue; - if (sp->units >= SLOB_UNITS(size)) { - b = slob_page_alloc(sp, size, align); - if (b) - break; - } + /* Attempt to alloc */ + prev = sp->list.prev; + b = slob_page_alloc(sp, size, align); + if (!b) + continue; + + /* Improve fragment distribution and reduce our average + * search time by starting our next search here. (see + * Knuth vol 1, sec 2.5, pg 449) */ + if (free_slob_pages.next != prev->next) + list_move_tail(&free_slob_pages, prev->next); + break; } spin_unlock_irqrestore(&slob_lock, flags); @@ -492,8 +503,7 @@ struct kmem_cache { struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(void*, struct kmem_cache *, unsigned long), - void (*dtor)(void*, struct kmem_cache *, unsigned long)) + void (*ctor)(void*, struct kmem_cache *, unsigned long)) { struct kmem_cache *c; diff --git a/mm/slub.c b/mm/slub.c index 52a4f44be394..addb20a6d67d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -211,7 +211,8 @@ static inline void ClearSlabDebug(struct page *page) #define MAX_OBJECTS_PER_SLAB 65535 /* Internal SLUB flags */ -#define __OBJECT_POISON 0x80000000 /* Poison object */ +#define __OBJECT_POISON 0x80000000 /* Poison object */ +#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ /* Not all arches define cache_line_size */ #ifndef cache_line_size @@ -985,7 +986,9 @@ out: __setup("slub_debug", setup_slub_debug); -static void kmem_cache_open_debug_check(struct kmem_cache *s) +static unsigned long kmem_cache_flags(unsigned long objsize, + unsigned long flags, const char *name, + void (*ctor)(void *, struct kmem_cache *, unsigned long)) { /* * The page->offset field is only 16 bit wide. This is an offset @@ -999,19 +1002,21 @@ static void kmem_cache_open_debug_check(struct kmem_cache *s) * Debugging or ctor may create a need to move the free * pointer. Fail if this happens. */ - if (s->objsize >= 65535 * sizeof(void *)) { - BUG_ON(s->flags & (SLAB_RED_ZONE | SLAB_POISON | + if (objsize >= 65535 * sizeof(void *)) { + BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); - BUG_ON(s->ctor); - } - else + BUG_ON(ctor); + } else { /* * Enable debugging if selected on the kernel commandline. */ if (slub_debug && (!slub_debug_slabs || - strncmp(slub_debug_slabs, s->name, + strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0)) - s->flags |= slub_debug; + flags |= slub_debug; + } + + return flags; } #else static inline void setup_object_debug(struct kmem_cache *s, @@ -1028,7 +1033,12 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page) static inline int check_object(struct kmem_cache *s, struct page *page, void *object, int active) { return 1; } static inline void add_full(struct kmem_cache_node *n, struct page *page) {} -static inline void kmem_cache_open_debug_check(struct kmem_cache *s) {} +static inline unsigned long kmem_cache_flags(unsigned long objsize, + unsigned long flags, const char *name, + void (*ctor)(void *, struct kmem_cache *, unsigned long)) +{ + return flags; +} #define slub_debug 0 #endif /* @@ -1131,6 +1141,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) slab_pad_check(s, page); for_each_object(p, s, page_address(page)) check_object(s, page, p, 0); + ClearSlabDebug(page); } mod_zone_page_state(page_zone(page), @@ -1169,7 +1180,6 @@ static void discard_slab(struct kmem_cache *s, struct page *page) atomic_long_dec(&n->nr_slabs); reset_page_mapcount(page); - ClearSlabDebug(page); __ClearPageSlab(page); free_slab(s, page); } @@ -1656,6 +1666,7 @@ static void __always_inline slab_free(struct kmem_cache *s, unsigned long flags; local_irq_save(flags); + debug_check_no_locks_freed(object, s->objsize); if (likely(page == s->cpu_slab[smp_processor_id()] && !SlabDebug(page))) { object[page->offset] = page->lockless_freelist; @@ -1875,9 +1886,16 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); - page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node); + page = new_slab(kmalloc_caches, gfpflags, node); BUG_ON(!page); + if (page_to_nid(page) != node) { + printk(KERN_ERR "SLUB: Unable to allocate memory from " + "node %d\n", node); + printk(KERN_ERR "SLUB: Allocating a useless per node structure " + "in order to be able to continue\n"); + } + n = page->freelist; BUG_ON(!n); page->freelist = get_freepointer(kmalloc_caches, n); @@ -2079,9 +2097,8 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, s->name = name; s->ctor = ctor; s->objsize = size; - s->flags = flags; s->align = align; - kmem_cache_open_debug_check(s); + s->flags = kmem_cache_flags(size, flags, name, ctor); if (!calculate_sizes(s)) goto error; @@ -2276,10 +2293,26 @@ panic: } #ifdef CONFIG_ZONE_DMA + +static void sysfs_add_func(struct work_struct *w) +{ + struct kmem_cache *s; + + down_write(&slub_lock); + list_for_each_entry(s, &slab_caches, list) { + if (s->flags & __SYSFS_ADD_DEFERRED) { + s->flags &= ~__SYSFS_ADD_DEFERRED; + sysfs_slab_add(s); + } + } + up_write(&slub_lock); +} + +static DECLARE_WORK(sysfs_add_work, sysfs_add_func); + static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) { struct kmem_cache *s; - struct kmem_cache *x; char *text; size_t realsize; @@ -2288,22 +2321,36 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) return s; /* Dynamically create dma cache */ - x = kmalloc(kmem_size, flags & ~SLUB_DMA); - if (!x) - panic("Unable to allocate memory for dma cache\n"); + if (flags & __GFP_WAIT) + down_write(&slub_lock); + else { + if (!down_write_trylock(&slub_lock)) + goto out; + } + + if (kmalloc_caches_dma[index]) + goto unlock_out; realsize = kmalloc_caches[index].objsize; - text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", - (unsigned int)realsize); - s = create_kmalloc_cache(x, text, realsize, flags); - down_write(&slub_lock); - if (!kmalloc_caches_dma[index]) { - kmalloc_caches_dma[index] = s; - up_write(&slub_lock); - return s; + text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", (unsigned int)realsize), + s = kmalloc(kmem_size, flags & ~SLUB_DMA); + + if (!s || !text || !kmem_cache_open(s, flags, text, + realsize, ARCH_KMALLOC_MINALIGN, + SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { + kfree(s); + kfree(text); + goto unlock_out; } + + list_add(&s->list, &slab_caches); + kmalloc_caches_dma[index] = s; + + schedule_work(&sysfs_add_work); + +unlock_out: up_write(&slub_lock); - kmem_cache_destroy(s); +out: return kmalloc_caches_dma[index]; } #endif @@ -2394,7 +2441,7 @@ size_t ksize(const void *object) struct page *page; struct kmem_cache *s; - if (object == ZERO_SIZE_PTR) + if (ZERO_OR_NULL_PTR(object)) return 0; page = get_object_page(object); @@ -2499,15 +2546,11 @@ int kmem_cache_shrink(struct kmem_cache *s) slab_unlock(page); discard_slab(s, page); } else { - if (n->nr_partial > MAX_PARTIAL) - list_move(&page->lru, - slabs_by_inuse + page->inuse); + list_move(&page->lru, + slabs_by_inuse + page->inuse); } } - if (n->nr_partial <= MAX_PARTIAL) - goto out; - /* * Rebuild the partial list with the slabs filled up most * first and the least used slabs at the end. @@ -2515,7 +2558,6 @@ int kmem_cache_shrink(struct kmem_cache *s) for (i = s->objects - 1; i >= 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); - out: spin_unlock_irqrestore(&n->list_lock, flags); } @@ -2626,7 +2668,7 @@ static int slab_unmergeable(struct kmem_cache *s) } static struct kmem_cache *find_mergeable(size_t size, - size_t align, unsigned long flags, + size_t align, unsigned long flags, const char *name, void (*ctor)(void *, struct kmem_cache *, unsigned long)) { struct kmem_cache *s; @@ -2640,6 +2682,7 @@ static struct kmem_cache *find_mergeable(size_t size, size = ALIGN(size, sizeof(void *)); align = calculate_alignment(flags, align, size); size = ALIGN(size, align); + flags = kmem_cache_flags(size, flags, name, NULL); list_for_each_entry(s, &slab_caches, list) { if (slab_unmergeable(s)) @@ -2648,8 +2691,7 @@ static struct kmem_cache *find_mergeable(size_t size, if (size > s->size) continue; - if (((flags | slub_debug) & SLUB_MERGE_SAME) != - (s->flags & SLUB_MERGE_SAME)) + if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) continue; /* * Check if alignment is compatible. @@ -2668,14 +2710,12 @@ static struct kmem_cache *find_mergeable(size_t size, struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(void *, struct kmem_cache *, unsigned long), - void (*dtor)(void *, struct kmem_cache *, unsigned long)) + void (*ctor)(void *, struct kmem_cache *, unsigned long)) { struct kmem_cache *s; - BUG_ON(dtor); down_write(&slub_lock); - s = find_mergeable(size, align, flags, ctor); + s = find_mergeable(size, align, flags, name, ctor); if (s) { s->refcount++; /* @@ -3087,7 +3127,7 @@ static int list_locations(struct kmem_cache *s, char *buf, unsigned long flags; struct page *page; - if (!atomic_read(&n->nr_slabs)) + if (!atomic_long_read(&n->nr_slabs)) continue; spin_lock_irqsave(&n->list_lock, flags); @@ -3222,7 +3262,7 @@ static unsigned long slab_objects(struct kmem_cache *s, } if (flags & SO_FULL) { - int full_slabs = atomic_read(&n->nr_slabs) + int full_slabs = atomic_long_read(&n->nr_slabs) - per_cpu[node] - n->nr_partial; @@ -3258,7 +3298,7 @@ static int any_slab_objects(struct kmem_cache *s) for_each_node(node) { struct kmem_cache_node *n = get_node(s, node); - if (n->nr_partial || atomic_read(&n->nr_slabs)) + if (n->nr_partial || atomic_long_read(&n->nr_slabs)) return 1; } return 0; @@ -3781,7 +3821,9 @@ static int __init slab_sysfs_init(void) list_for_each_entry(s, &slab_caches, list) { err = sysfs_slab_add(s); - BUG_ON(err); + if (err) + printk(KERN_ERR "SLUB: Unable to add boot slab %s" + " to sysfs\n", s->name); } while (alias_list) { @@ -3789,7 +3831,9 @@ static int __init slab_sysfs_init(void) alias_list = alias_list->next; err = sysfs_slab_alias(al->s, al->name); - BUG_ON(err); + if (err) + printk(KERN_ERR "SLUB: Unable to add boot slab alias" + " %s to sysfs\n", s->name); kfree(al); } diff --git a/mm/sparse.c b/mm/sparse.c index e03b39f3540f..239f5a720d38 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -41,6 +41,15 @@ int page_to_nid(struct page *page) return section_to_node_table[page_to_section(page)]; } EXPORT_SYMBOL(page_to_nid); + +static void set_section_nid(unsigned long section_nr, int nid) +{ + section_to_node_table[section_nr] = nid; +} +#else /* !NODE_NOT_IN_PAGE_FLAGS */ +static inline void set_section_nid(unsigned long section_nr, int nid) +{ +} #endif #ifdef CONFIG_SPARSEMEM_EXTREME @@ -68,10 +77,6 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) struct mem_section *section; int ret = 0; -#ifdef NODE_NOT_IN_PAGE_FLAGS - section_to_node_table[section_nr] = nid; -#endif - if (mem_section[root]) return -EEXIST; @@ -148,6 +153,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) struct mem_section *ms; sparse_index_init(section, nid); + set_section_nid(section, nid); ms = __nr_to_section(section); if (!ms->section_mem_map) @@ -209,7 +215,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms, return 1; } -__attribute__((weak)) +__attribute__((weak)) __init void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) { return NULL; diff --git a/mm/swapfile.c b/mm/swapfile.c index 7ff0a81c7b01..f071648e1360 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -425,7 +425,7 @@ void free_swap_and_cache(swp_entry_t entry) } } -#ifdef CONFIG_SOFTWARE_SUSPEND +#ifdef CONFIG_HIBERNATION /* * Find the swap type that corresponds to given device (if any). * @@ -951,7 +951,7 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) } } -#ifdef CONFIG_SOFTWARE_SUSPEND +#ifdef CONFIG_HIBERNATION /* * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev * corresponding to given index in swap_info (swap type). @@ -966,7 +966,7 @@ sector_t swapdev_block(int swap_type, pgoff_t offset) sis = swap_info + swap_type; return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; } -#endif /* CONFIG_SOFTWARE_SUSPEND */ +#endif /* CONFIG_HIBERNATION */ /* * Free all of a swapdev's extent information diff --git a/mm/truncate.c b/mm/truncate.c index f47e46d1be3b..5cdfbc1a59fd 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -82,7 +82,7 @@ EXPORT_SYMBOL(cancel_dirty_page); /* * If truncate cannot remove the fs-private metadata from the page, the page * becomes anonymous. It will be left on the LRU and may even be mapped into - * user pagetables if we're racing with filemap_nopage(). + * user pagetables if we're racing with filemap_fault(). * * We need to bale out if page->mapping is no longer equal to the original * mapping. This happens a) when the VM reclaimed the page while we waited on @@ -192,6 +192,11 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); continue; } + if (page_mapped(page)) { + unmap_mapping_range(mapping, + (loff_t)page_index<<PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); + } truncate_complete_page(mapping, page); unlock_page(page); } @@ -229,6 +234,11 @@ void truncate_inode_pages_range(struct address_space *mapping, break; lock_page(page); wait_on_page_writeback(page); + if (page_mapped(page)) { + unmap_mapping_range(mapping, + (loff_t)page->index<<PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); + } if (page->index > next) next = page->index; next++; @@ -405,7 +415,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, break; } wait_on_page_writeback(page); - while (page_mapped(page)) { + if (page_mapped(page)) { if (!did_range_unmap) { /* * Zap the rest of the file in one hit. @@ -425,6 +435,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, PAGE_CACHE_SIZE, 0); } } + BUG_ON(page_mapped(page)); ret = do_launder_page(mapping, page); if (ret == 0 && !invalidate_complete_page2(mapping, page)) ret = -EIO; diff --git a/mm/util.c b/mm/util.c index 78f3783bdcc8..bf340d806868 100644 --- a/mm/util.c +++ b/mm/util.c @@ -6,7 +6,6 @@ /** * kstrdup - allocate space for and copy an existing string - * * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory */ @@ -27,6 +26,30 @@ char *kstrdup(const char *s, gfp_t gfp) EXPORT_SYMBOL(kstrdup); /** + * kstrndup - allocate space for and copy an existing string + * @s: the string to duplicate + * @max: read at most @max chars from @s + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + */ +char *kstrndup(const char *s, size_t max, gfp_t gfp) +{ + size_t len; + char *buf; + + if (!s) + return NULL; + + len = strnlen(s, max); + buf = kmalloc_track_caller(len+1, gfp); + if (buf) { + memcpy(buf, s, len); + buf[len] = '\0'; + } + return buf; +} +EXPORT_SYMBOL(kstrndup); + +/** * kmemdup - duplicate region of memory * * @src: memory region to duplicate @@ -80,7 +103,6 @@ EXPORT_SYMBOL(krealloc); /* * strndup_user - duplicate an existing string from user space - * * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8e05a11155c9..3cee76a8c9f0 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -164,6 +164,7 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) flush_cache_vmap((unsigned long) area->addr, end); return err; } +EXPORT_SYMBOL_GPL(map_vm_area); static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, @@ -242,6 +243,7 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, { return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL); } +EXPORT_SYMBOL_GPL(__get_vm_area); /** * get_vm_area - reserve a contingous kernel virtual area @@ -583,9 +585,9 @@ void *vmalloc_exec(unsigned long size) } #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) -#define GFP_VMALLOC32 GFP_DMA32 +#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) -#define GFP_VMALLOC32 GFP_DMA +#define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL #else #define GFP_VMALLOC32 GFP_KERNEL #endif @@ -767,3 +769,56 @@ EXPORT_SYMBOL(remap_vmalloc_range); void __attribute__((weak)) vmalloc_sync_all(void) { } + + +static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) +{ + /* apply_to_page_range() does all the hard work. */ + return 0; +} + +/** + * alloc_vm_area - allocate a range of kernel address space + * @size: size of the area + * @returns: NULL on failure, vm_struct on success + * + * This function reserves a range of kernel address space, and + * allocates pagetables to map that range. No actual mappings + * are created. If the kernel address space is not shared + * between processes, it syncs the pagetable across all + * processes. + */ +struct vm_struct *alloc_vm_area(size_t size) +{ + struct vm_struct *area; + + area = get_vm_area(size, VM_IOREMAP); + if (area == NULL) + return NULL; + + /* + * This ensures that page tables are constructed for this region + * of kernel virtual address space and mapped into init_mm. + */ + if (apply_to_page_range(&init_mm, (unsigned long)area->addr, + area->size, f, NULL)) { + free_vm_area(area); + return NULL; + } + + /* Make sure the pagetables are constructed in process kernel + mappings */ + vmalloc_sync_all(); + + return area; +} +EXPORT_SYMBOL_GPL(alloc_vm_area); + +void free_vm_area(struct vm_struct *area) +{ + struct vm_struct *ret; + ret = remove_vm_area(area->addr); + BUG_ON(ret != area); + kfree(area); +} +EXPORT_SYMBOL_GPL(free_vm_area); diff --git a/mm/vmscan.c b/mm/vmscan.c index d419e10e3daa..a6e65d024995 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -271,6 +271,12 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } +/* Request for sync pageout. */ +enum pageout_io { + PAGEOUT_IO_ASYNC, + PAGEOUT_IO_SYNC, +}; + /* possible outcome of pageout() */ typedef enum { /* failed to write page out, page is locked */ @@ -287,7 +293,8 @@ typedef enum { * pageout is called by shrink_page_list() for each dirty page. * Calls ->writepage(). */ -static pageout_t pageout(struct page *page, struct address_space *mapping) +static pageout_t pageout(struct page *page, struct address_space *mapping, + enum pageout_io sync_writeback) { /* * If the page is dirty, only perform writeback if that write @@ -346,6 +353,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) ClearPageReclaim(page); return PAGE_ACTIVATE; } + + /* + * Wait on writeback if requested to. This happens when + * direct reclaiming a large contiguous area and the + * first attempt to free a range of pages fails. + */ + if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) + wait_on_page_writeback(page); + if (!PageWriteback(page)) { /* synchronous write or broken a_ops? */ ClearPageReclaim(page); @@ -423,7 +439,8 @@ cannot_free: * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct scan_control *sc) + struct scan_control *sc, + enum pageout_io sync_writeback) { LIST_HEAD(ret_pages); struct pagevec freed_pvec; @@ -458,8 +475,23 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (page_mapped(page) || PageSwapCache(page)) sc->nr_scanned++; - if (PageWriteback(page)) - goto keep_locked; + may_enter_fs = (sc->gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + + if (PageWriteback(page)) { + /* + * Synchronous reclaim is performed in two passes, + * first an asynchronous pass over the list to + * start parallel writeback, and a second synchronous + * pass to wait for the IO to complete. Wait here + * for any page for which writeback has already + * started. + */ + if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) + wait_on_page_writeback(page); + else + goto keep_locked; + } referenced = page_referenced(page, 1); /* In active use or really unfreeable? Activate it. */ @@ -478,8 +510,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, #endif /* CONFIG_SWAP */ mapping = page_mapping(page); - may_enter_fs = (sc->gfp_mask & __GFP_FS) || - (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); /* * The page is mapped into the page tables of one or more @@ -505,7 +535,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Page is dirty, try to write it out here */ - switch(pageout(page, mapping)) { + switch (pageout(page, mapping, sync_writeback)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: @@ -777,6 +807,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, (sc->order > PAGE_ALLOC_COSTLY_ORDER)? ISOLATE_BOTH : ISOLATE_INACTIVE); nr_active = clear_active_flags(&page_list); + __count_vm_events(PGDEACTIVATE, nr_active); __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); __mod_zone_page_state(zone, NR_INACTIVE, @@ -785,7 +816,29 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, spin_unlock_irq(&zone->lru_lock); nr_scanned += nr_scan; - nr_freed = shrink_page_list(&page_list, sc); + nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); + + /* + * If we are direct reclaiming for contiguous pages and we do + * not reclaim everything in the list, try again and wait + * for IO to complete. This will stall high-order allocations + * but that should be acceptable to the caller + */ + if (nr_freed < nr_taken && !current_is_kswapd() && + sc->order > PAGE_ALLOC_COSTLY_ORDER) { + congestion_wait(WRITE, HZ/10); + + /* + * The attempt at page out may have made some + * of the pages active, mark them inactive again. + */ + nr_active = clear_active_flags(&page_list); + count_vm_events(PGDEACTIVATE, nr_active); + + nr_freed += shrink_page_list(&page_list, sc, + PAGEOUT_IO_SYNC); + } + nr_reclaimed += nr_freed; local_irq_disable(); if (current_is_kswapd()) { diff --git a/mm/vmstat.c b/mm/vmstat.c index fadf791cd7e6..c64d169537bf 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -10,6 +10,7 @@ */ #include <linux/mm.h> +#include <linux/err.h> #include <linux/module.h> #include <linux/cpu.h> #include <linux/sched.h> |