Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile     |   2
-rw-r--r--  mm/bootmem.c    |  69
-rw-r--r--  mm/filemap.c    | 237
-rw-r--r--  mm/highmem.c    |   7
-rw-r--r--  mm/memory.c     |  36
-rw-r--r--  mm/mlock.c      |  24
-rw-r--r--  mm/mmap.c       |  28
-rw-r--r--  mm/mprotect.c   |  31
-rw-r--r--  mm/mremap.c     |  22
-rw-r--r--  mm/numa.c       |  50
-rw-r--r--  mm/oom_kill.c   | 210
-rw-r--r--  mm/page_alloc.c |  91
-rw-r--r--  mm/swap.c       |   1
-rw-r--r--  mm/swapfile.c   |   8
-rw-r--r--  mm/vmalloc.c    |  14
-rw-r--r--  mm/vmscan.c     |  70
16 files changed, 598 insertions, 302 deletions
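
The headline change in this diff is the new mm/oom_kill.c, which scores every task with a badness() heuristic and kills the highest scorer when kswapd detects an out-of-memory condition; most of the remaining hunks convert node-id based interfaces (init_bootmem_node, alloc_bootmem_node, free_area_init_node, ...) to take a pg_data_t pointer and to walk the pgdat_list chain. Below is a minimal user-space sketch of the scoring logic only, for experimentation: the task_info struct, its field names and the sample values are stand-ins for illustration, not kernel definitions.

/*
 * Sketch of the badness() heuristic added in mm/oom_kill.c, using a
 * simplified stand-in for task_struct so it can be built and played
 * with in user space.  The struct and numbers below are hypothetical.
 */
#include <stdio.h>

struct task_info {
	unsigned long total_vm;   /* memory size in pages (basis for badness) */
	unsigned int  cpu_secs;   /* consumed CPU time, roughly in seconds */
	unsigned int  run_mins;   /* wall-clock run time, roughly in minutes */
	int           nice;       /* > 0 means niced */
	int           is_root;    /* uid/euid 0 or CAP_SYS_ADMIN */
	int           raw_io;     /* CAP_SYS_RAWIO (direct hardware access) */
};

/* Rough integer sqrt, as in the patch: drop one result bit for every
 * two bits of the argument; never return 0 so callers can divide by it. */
static unsigned int int_sqrt(unsigned int x)
{
	unsigned int out = x;
	while (x & ~(unsigned int)1) {
		x >>= 2;
		out >>= 1;
	}
	if (x)
		out -= out >> 2;
	return out ? out : 1;
}

static int badness(const struct task_info *p)
{
	int points = p->total_vm;

	/* Long-running, CPU-heavy tasks lose fewer points. */
	points /= int_sqrt(p->cpu_secs);
	points /= int_sqrt(int_sqrt(p->run_mins));

	if (p->nice > 0)
		points *= 2;    /* niced tasks are considered less important */
	if (p->is_root)
		points /= 4;    /* spare superuser processes */
	if (p->raw_io)
		points /= 4;    /* spare tasks with direct hardware access */

	return points;
}

int main(void)
{
	struct task_info editor  = { 8000, 120, 600, 0, 0, 0 };
	struct task_info runaway = { 200000, 5, 2, 0, 0, 0 };

	/* The short-lived, memory-hungry task scores far higher. */
	printf("editor: %d, runaway: %d\n", badness(&editor), badness(&runaway));
	return 0;
}

Per the comments in the patch, the intent is to prefer large, short-lived, unprivileged tasks so that the minimum amount of work is lost per kill.
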
diff --git a/mm/Makefile b/mm/Makefile index 56e93693b..d74cdec48 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -10,7 +10,7 @@ O_TARGET := mm.o O_OBJS := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ - page_alloc.o swap_state.o swapfile.o numa.o + page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o ifeq ($(CONFIG_HIGHMEM),y) O_OBJS += highmem.o diff --git a/mm/bootmem.c b/mm/bootmem.c index 0a8d37ba2..e9e9ef7bc 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -41,11 +41,15 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages) /* * Called once to set up the allocator itself. */ -static unsigned long __init init_bootmem_core (bootmem_data_t *bdata, +static unsigned long __init init_bootmem_core (pg_data_t *pgdat, unsigned long mapstart, unsigned long start, unsigned long end) { + bootmem_data_t *bdata = pgdat->bdata; unsigned long mapsize = ((end - start)+7)/8; + pgdat->node_next = pgdat_list; + pgdat_list = pgdat; + mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL); bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); bdata->node_boot_start = (start << PAGE_SHIFT); @@ -172,10 +176,6 @@ restart_scan: preferred = 0; goto restart_scan; } - /* - * Whoops, we cannot satisfy the allocation request. - */ - BUG(); found: if (start >= eidx) BUG(); @@ -221,15 +221,15 @@ found: return ret; } -static unsigned long __init free_all_bootmem_core(int nid, bootmem_data_t *bdata) +static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) { - struct page * page; + struct page *page = pgdat->node_mem_map; + bootmem_data_t *bdata = pgdat->bdata; unsigned long i, count, total = 0; unsigned long idx; if (!bdata->node_bootmem_map) BUG(); - page = NODE_MEM_MAP(nid); count = 0; idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); for (i = 0; i < idx; i++, page++) { @@ -260,59 +260,78 @@ static unsigned long __init free_all_bootmem_core(int nid, bootmem_data_t *bdata return total; } -unsigned long __init init_bootmem_node (int nid, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) +unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) { - return(init_bootmem_core(NODE_DATA(nid)->bdata, freepfn, startpfn, endpfn)); + return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn)); } -void __init reserve_bootmem_node (int nid, unsigned long physaddr, unsigned long size) +void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { - reserve_bootmem_core(NODE_DATA(nid)->bdata, physaddr, size); + reserve_bootmem_core(pgdat->bdata, physaddr, size); } -void __init free_bootmem_node (int nid, unsigned long physaddr, unsigned long size) +void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { - return(free_bootmem_core(NODE_DATA(nid)->bdata, physaddr, size)); + return(free_bootmem_core(pgdat->bdata, physaddr, size)); } -unsigned long __init free_all_bootmem_node (int nid) +unsigned long __init free_all_bootmem_node (pg_data_t *pgdat) { - return(free_all_bootmem_core(nid, NODE_DATA(nid)->bdata)); + return(free_all_bootmem_core(pgdat)); } unsigned long __init init_bootmem (unsigned long start, unsigned long pages) { max_low_pfn = pages; min_low_pfn = start; - return(init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages)); + return(init_bootmem_core(&contig_page_data, start, 0, pages)); } void __init reserve_bootmem (unsigned long 
addr, unsigned long size) { - reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size); + reserve_bootmem_core(contig_page_data.bdata, addr, size); } void __init free_bootmem (unsigned long addr, unsigned long size) { - return(free_bootmem_core(NODE_DATA(0)->bdata, addr, size)); + return(free_bootmem_core(contig_page_data.bdata, addr, size)); } unsigned long __init free_all_bootmem (void) { - return(free_all_bootmem_core(0, NODE_DATA(0)->bdata)); + return(free_all_bootmem_core(&contig_page_data)); } void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal) { + pg_data_t *pgdat = pgdat_list; + void *ptr; + + while (pgdat) { + if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, + align, goal))) + return(ptr); + pgdat = pgdat->node_next; + } /* - * In the discontigmem case, all non-node specific allocations come - * from the first node, node 0. + * Whoops, we cannot satisfy the allocation request. */ - return(__alloc_bootmem_core(NODE_DATA(0)->bdata, size, align, goal)); + BUG(); + return NULL; } -void * __init __alloc_bootmem_node (int nid, unsigned long size, unsigned long align, unsigned long goal) +void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - return(__alloc_bootmem_core(NODE_DATA(nid)->bdata, size, align, goal)); + void *ptr; + + ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal); + if (ptr) + return (ptr); + + /* + * Whoops, we cannot satisfy the allocation request. + */ + BUG(); + return NULL; } diff --git a/mm/filemap.c b/mm/filemap.c index 6aca16409..b19f4c5b3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -135,6 +135,12 @@ void invalidate_inode_pages(struct inode * inode) if (TryLockPage(page)) continue; + /* Neither can we invalidate something in use.. */ + if (page_count(page) != 1) { + UnlockPage(page); + continue; + } + __lru_cache_del(page); __remove_inode_page(page); UnlockPage(page); @@ -156,6 +162,7 @@ static inline void truncate_partial_page(struct page *page, unsigned partial) static inline void truncate_complete_page(struct page *page) { + /* Leave it on the LRU if it gets converted into anonymous buffers */ if (!page->buffers || block_flushpage(page, 0)) lru_cache_del(page); @@ -167,6 +174,7 @@ static inline void truncate_complete_page(struct page *page) * all sorts of fun problems ... */ ClearPageDirty(page); + ClearPageUptodate(page); remove_inode_page(page); page_cache_release(page); } @@ -495,20 +503,46 @@ void ___wait_on_page(struct page *page) } /* - * Get an exclusive lock on the page.. + * Get a lock on the page, assuming we need to sleep + * to get it.. */ -void lock_page(struct page *page) +static void __lock_page(struct page *page) { - while (TryLockPage(page)) - ___wait_on_page(page); + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue_exclusive(&page->wait, &wait); + for (;;) { + sync_page(page); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (PageLocked(page)) { + run_task_queue(&tq_disk); + schedule(); + continue; + } + if (!TryLockPage(page)) + break; + } + tsk->state = TASK_RUNNING; + remove_wait_queue(&page->wait, &wait); } + +/* + * Get an exclusive lock on the page, optimistically + * assuming it's not locked.. + */ +void lock_page(struct page *page) +{ + if (TryLockPage(page)) + __lock_page(page); +} /* * a rather lightweight function, finding and getting a reference to a * hashed page atomically, waiting for it if it's locked. 
*/ -struct page * __find_get_page (struct address_space *mapping, +static struct page * __find_get_page(struct address_space *mapping, unsigned long offset, struct page **hash) { struct page *page; @@ -517,41 +551,11 @@ struct page * __find_get_page (struct address_space *mapping, * We scan the hash list read-only. Addition to and removal from * the hash-list needs a held write-lock. */ -repeat: spin_lock(&pagecache_lock); page = __find_page_nolock(mapping, offset, *hash); if (page) page_cache_get(page); spin_unlock(&pagecache_lock); - - /* Found the page, sleep if locked. */ - if (page && PageLocked(page)) { - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - - sync_page(page); - - __set_task_state(tsk, TASK_UNINTERRUPTIBLE); - add_wait_queue(&page->wait, &wait); - - if (PageLocked(page)) - schedule(); - __set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(&page->wait, &wait); - - /* - * The page might have been unhashed meanwhile. It's - * not freed though because we hold a reference to it. - * If this is the case then it will be freed _here_, - * and we recheck the hash anyway. - */ - page_cache_release(page); - goto repeat; - } - /* - * It's not locked so we can return the page and we hold - * a reference to it. - */ return page; } @@ -570,39 +574,23 @@ struct page * __find_lock_page (struct address_space *mapping, repeat: spin_lock(&pagecache_lock); page = __find_page_nolock(mapping, offset, *hash); - if (page) + if (page) { page_cache_get(page); - spin_unlock(&pagecache_lock); - - /* Found the page, sleep if locked. */ - if (page && TryLockPage(page)) { - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - - sync_page(page); + spin_unlock(&pagecache_lock); - __set_task_state(tsk, TASK_UNINTERRUPTIBLE); - add_wait_queue(&page->wait, &wait); + lock_page(page); - if (PageLocked(page)) - schedule(); - __set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(&page->wait, &wait); + /* Is the page still hashed? Ok, good.. */ + if (page->mapping) + return page; - /* - * The page might have been unhashed meanwhile. It's - * not freed though because we hold a reference to it. - * If this is the case then it will be freed _here_, - * and we recheck the hash anyway. - */ + /* Nope: we raced. Release and try again.. */ + UnlockPage(page); page_cache_release(page); goto repeat; } - /* - * It's not locked so we can return the page and we hold - * a reference to it. - */ - return page; + spin_unlock(&pagecache_lock); + return NULL; } #if 0 @@ -993,7 +981,7 @@ page_ok: * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ - if (page->mapping->i_mmap_shared != NULL) + if (mapping->i_mmap_shared != NULL) flush_dcache_page(page); /* @@ -1027,6 +1015,15 @@ page_not_up_to_date: /* Get exclusive access to the page ... */ lock_page(page); + + /* Did it get unhashed before we got the lock? */ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + continue; + } + + /* Did somebody else fill it already? 
*/ if (Page_Uptodate(page)) { UnlockPage(page); goto page_ok; @@ -1323,16 +1320,16 @@ struct page * filemap_nopage(struct vm_area_struct * area, struct inode *inode = file->f_dentry->d_inode; struct address_space *mapping = inode->i_mapping; struct page *page, **hash, *old_page; - unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + unsigned long size, pgoff; - unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; +retry_all: /* - * Semantics for shared and private memory areas are different - * past the end of the file. A shared mapping past the last page - * of the file is an error and results in a SIGBUS, while a - * private mapping just maps in a zero page. + * An external ptracer can access pages that normally aren't + * accessible.. */ + size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if ((pgoff >= size) && (area->vm_mm == current->mm)) return NULL; @@ -1411,6 +1408,15 @@ no_cached_page: page_not_uptodate: lock_page(page); + + /* Did it get unhashed while we waited for it? */ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + goto retry_all; + } + + /* Did somebody else get it up-to-date? */ if (Page_Uptodate(page)) { UnlockPage(page); goto success; @@ -1429,6 +1435,15 @@ page_not_uptodate: * and we need to check for errors. */ lock_page(page); + + /* Somebody truncated the page on us? */ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + goto retry_all; + } + + /* Somebody else successfully read it in? */ if (Page_Uptodate(page)) { UnlockPage(page); goto success; @@ -1448,17 +1463,25 @@ page_not_uptodate: return NULL; } +/* + * If a task terminates while we're swapping the page, the vma and + * and file could be released: try_to_swap_out has done a get_file. + * vma/file is guaranteed to exist in the unmap/sync cases because + * mmap_sem is held. + * + * The "mapping" test takes care of somebody having truncated the + * page and thus made this write-page a no-op.. + */ static int filemap_write_page(struct file *file, struct page * page, int wait) { - /* - * If a task terminates while we're swapping the page, the vma and - * and file could be released: try_to_swap_out has done a get_file. - * vma/file is guaranteed to exist in the unmap/sync cases because - * mmap_sem is held. - */ - return page->mapping->a_ops->writepage(file, page); + struct address_space * mapping = page->mapping; + int error = 0; + + if (mapping) + error = mapping->a_ops->writepage(file, page); + return error; } @@ -1475,39 +1498,47 @@ int filemap_swapout(struct page * page, struct file * file) return retval; } +/* Called with mm->page_table_lock held to protect against other + * threads/the swapper from ripping pte's out from under us. 
+ */ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { unsigned long pgoff; - pte_t pte = *ptep; + pte_t pte; struct page *page; int error; + pte = *ptep; + if (!(flags & MS_INVALIDATE)) { if (!pte_present(pte)) - return 0; - if (!pte_dirty(pte)) - return 0; + goto out; + if (!ptep_test_and_clear_dirty(ptep)) + goto out; flush_page_to_ram(pte_page(pte)); flush_cache_page(vma, address); - set_pte(ptep, pte_mkclean(pte)); flush_tlb_page(vma, address); page = pte_page(pte); page_cache_get(page); } else { if (pte_none(pte)) - return 0; + goto out; flush_cache_page(vma, address); - pte_clear(ptep); + + pte = ptep_get_and_clear(ptep); flush_tlb_page(vma, address); + if (!pte_present(pte)) { + spin_unlock(&vma->vm_mm->page_table_lock); swap_free(pte_to_swp_entry(pte)); - return 0; + spin_lock(&vma->vm_mm->page_table_lock); + goto out; } page = pte_page(pte); if (!pte_dirty(pte) || flags == MS_INVALIDATE) { page_cache_free(page); - return 0; + goto out; } } pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT; @@ -1516,11 +1547,20 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n", pgoff, page->index, address, vma->vm_start, vma->vm_pgoff); } + + spin_unlock(&vma->vm_mm->page_table_lock); lock_page(page); + error = filemap_write_page(vma->vm_file, page, 1); + UnlockPage(page); page_cache_free(page); + + spin_lock(&vma->vm_mm->page_table_lock); return error; + +out: + return 0; } static inline int filemap_sync_pte_range(pmd_t * pmd, @@ -1590,6 +1630,11 @@ int filemap_sync(struct vm_area_struct * vma, unsigned long address, unsigned long end = address + size; int error = 0; + /* Aquire the lock early; it may be possible to avoid dropping + * and reaquiring it repeatedly. 
+ */ + spin_lock(&vma->vm_mm->page_table_lock); + dir = pgd_offset(vma->vm_mm, address); flush_cache_range(vma->vm_mm, end - size, end); if (address >= end) @@ -1600,6 +1645,9 @@ int filemap_sync(struct vm_area_struct * vma, unsigned long address, dir++; } while (address && (address < end)); flush_tlb_range(vma->vm_mm, end - size, end); + + spin_unlock(&vma->vm_mm->page_table_lock); + return error; } @@ -1766,11 +1814,11 @@ static long madvise_fixup_start(struct vm_area_struct * vma, get_file(n->vm_file); if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); - vmlist_modify_lock(vma->vm_mm); + spin_lock(&vma->vm_mm->page_table_lock); vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT; vma->vm_start = end; insert_vm_struct(current->mm, n); - vmlist_modify_unlock(vma->vm_mm); + spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -1790,10 +1838,10 @@ static long madvise_fixup_end(struct vm_area_struct * vma, get_file(n->vm_file); if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); - vmlist_modify_lock(vma->vm_mm); + spin_lock(&vma->vm_mm->page_table_lock); vma->vm_end = start; insert_vm_struct(current->mm, n); - vmlist_modify_unlock(vma->vm_mm); + spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -1823,7 +1871,7 @@ static long madvise_fixup_middle(struct vm_area_struct * vma, vma->vm_ops->open(left); vma->vm_ops->open(right); } - vmlist_modify_lock(vma->vm_mm); + spin_lock(&vma->vm_mm->page_table_lock); vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT; vma->vm_start = start; vma->vm_end = end; @@ -1831,7 +1879,7 @@ static long madvise_fixup_middle(struct vm_area_struct * vma, vma->vm_raend = 0; insert_vm_struct(current->mm, left); insert_vm_struct(current->mm, right); - vmlist_modify_unlock(vma->vm_mm); + spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -2270,13 +2318,20 @@ struct page *read_cache_page(struct address_space *mapping, int (*filler)(void *,struct page*), void *data) { - struct page *page = __read_cache_page(mapping, index, filler, data); + struct page *page; int err; +retry: + page = __read_cache_page(mapping, index, filler, data); if (IS_ERR(page) || Page_Uptodate(page)) goto out; lock_page(page); + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + goto retry; + } if (Page_Uptodate(page)) { UnlockPage(page); goto out; diff --git a/mm/highmem.c b/mm/highmem.c index 3be601c6f..d83d9bb87 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -130,10 +130,9 @@ static void flush_all_zero_pkmaps(void) if (pkmap_count[i] != 1) continue; pkmap_count[i] = 0; - pte = pkmap_page_table[i]; + pte = ptep_get_and_clear(pkmap_page_table+i); if (pte_none(pte)) BUG(); - pte_clear(pkmap_page_table+i); page = pte_page(pte); page->virtual = NULL; } @@ -310,7 +309,7 @@ struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig) repeat_bh: bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER); if (!bh) { - wakeup_bdflush(1); + wakeup_bdflush(1); /* Sets task->state to TASK_RUNNING */ current->policy |= SCHED_YIELD; schedule(); goto repeat_bh; @@ -324,7 +323,7 @@ repeat_bh: repeat_page: page = alloc_page(GFP_BUFFER); if (!page) { - wakeup_bdflush(1); + wakeup_bdflush(1); /* Sets task->state to TASK_RUNNING */ current->policy |= SCHED_YIELD; schedule(); goto repeat_page; diff --git a/mm/memory.c b/mm/memory.c index 6b047821d..11048ddce 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -215,30 +215,30 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; /* copy_one_pte */ if (pte_none(pte)) - goto cont_copy_pte_range; + goto 
cont_copy_pte_range_noset; if (!pte_present(pte)) { swap_duplicate(pte_to_swp_entry(pte)); - set_pte(dst_pte, pte); goto cont_copy_pte_range; } ptepage = pte_page(pte); if ((!VALID_PAGE(ptepage)) || - PageReserved(ptepage)) { - set_pte(dst_pte, pte); + PageReserved(ptepage)) goto cont_copy_pte_range; - } + /* If it's a COW mapping, write protect it both in the parent and the child */ if (cow) { - pte = pte_wrprotect(pte); - set_pte(src_pte, pte); + ptep_clear_wrprotect(src_pte); + pte = *src_pte; } + /* If it's a shared mapping, mark it clean in the child */ if (vma->vm_flags & VM_SHARED) pte = pte_mkclean(pte); - set_pte(dst_pte, pte_mkold(pte)); + pte = pte_mkold(pte); get_page(ptepage); - -cont_copy_pte_range: address += PAGE_SIZE; + +cont_copy_pte_range: set_pte(dst_pte, pte); +cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) goto out; src_pte++; @@ -306,10 +306,9 @@ static inline int zap_pte_range(struct mm_struct *mm, pmd_t * pmd, unsigned long pte_t page; if (!size) break; - page = *pte; + page = ptep_get_and_clear(pte); pte++; size--; - pte_clear(pte-1); if (pte_none(page)) continue; freed += free_pte(page); @@ -642,7 +641,7 @@ static inline void zeromap_pte_range(pte_t * pte, unsigned long address, end = PMD_SIZE; do { pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); - pte_t oldpage = *pte; + pte_t oldpage = ptep_get_and_clear(pte); set_pte(pte, zero_pte); forget_pte(oldpage); address += PAGE_SIZE; @@ -712,8 +711,8 @@ static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned end = PMD_SIZE; do { struct page *page; - pte_t oldpage = *pte; - pte_clear(pte); + pte_t oldpage; + oldpage = ptep_get_and_clear(pte); page = virt_to_page(__va(phys_addr)); if ((!VALID_PAGE(page)) || PageReserved(page)) @@ -746,6 +745,7 @@ static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned l return 0; } +/* Note: this is only safe if the mm semaphore is held when called. */ int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot) { int error = 0; @@ -781,8 +781,8 @@ int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long */ static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry) { - flush_tlb_page(vma, address); set_pte(page_table, entry); + flush_tlb_page(vma, address); update_mmu_cache(vma, address, entry); } @@ -867,7 +867,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, /* * Re-check the pte - we dropped the lock */ - if (pte_val(*page_table) == pte_val(pte)) { + if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; break_cow(vma, old_page, new_page, address, page_table); @@ -1214,7 +1214,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, * didn't change from under us.. 
*/ spin_lock(&mm->page_table_lock); - if (pte_val(entry) == pte_val(*pte)) { + if (pte_same(entry, *pte)) { if (write_access) { if (!pte_write(entry)) return do_wp_page(mm, vma, address, pte, entry); diff --git a/mm/mlock.c b/mm/mlock.c index a3d10ff99..f684a3c60 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -14,9 +14,9 @@ static inline int mlock_fixup_all(struct vm_area_struct * vma, int newflags) { - vmlist_modify_lock(vma->vm_mm); + spin_lock(&vma->vm_mm->page_table_lock); vma->vm_flags = newflags; - vmlist_modify_unlock(vma->vm_mm); + spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -36,11 +36,11 @@ static inline int mlock_fixup_start(struct vm_area_struct * vma, get_file(n->vm_file); if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); - vmlist_modify_lock(vma->vm_mm); + spin_lock(&vma->vm_mm->page_table_lock); vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT; vma->vm_start = end; insert_vm_struct(current->mm, n); - vmlist_modify_unlock(vma->vm_mm); + spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -61,10 +61,10 @@ static inline int mlock_fixup_end(struct vm_area_struct * vma, get_file(n->vm_file); if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); - vmlist_modify_lock(vma->vm_mm); + spin_lock(&vma->vm_mm->page_table_lock); vma->vm_end = start; insert_vm_struct(current->mm, n); - vmlist_modify_unlock(vma->vm_mm); + spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -96,7 +96,7 @@ static inline int mlock_fixup_middle(struct vm_area_struct * vma, vma->vm_ops->open(left); vma->vm_ops->open(right); } - vmlist_modify_lock(vma->vm_mm); + spin_lock(&vma->vm_mm->page_table_lock); vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT; vma->vm_start = start; vma->vm_end = end; @@ -104,7 +104,7 @@ static inline int mlock_fixup_middle(struct vm_area_struct * vma, vma->vm_raend = 0; insert_vm_struct(current->mm, left); insert_vm_struct(current->mm, right); - vmlist_modify_unlock(vma->vm_mm); + spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -183,9 +183,9 @@ static int do_mlock(unsigned long start, size_t len, int on) break; } } - vmlist_modify_lock(current->mm); + spin_lock(¤t->mm->page_table_lock); merge_segments(current->mm, start, end); - vmlist_modify_unlock(current->mm); + spin_unlock(¤t->mm->page_table_lock); return error; } @@ -257,9 +257,9 @@ static int do_mlockall(int flags) if (error) break; } - vmlist_modify_lock(current->mm); + spin_lock(¤t->mm->page_table_lock); merge_segments(current->mm, 0, TASK_SIZE); - vmlist_modify_unlock(current->mm); + spin_unlock(¤t->mm->page_table_lock); return error; } @@ -317,12 +317,12 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned lon */ flags = vma->vm_flags; addr = vma->vm_start; /* can addr have changed?? */ - vmlist_modify_lock(mm); + spin_lock(&mm->page_table_lock); insert_vm_struct(mm, vma); if (correct_wcount) atomic_inc(&file->f_dentry->d_inode->i_writecount); merge_segments(mm, vma->vm_start, vma->vm_end); - vmlist_modify_unlock(mm); + spin_unlock(&mm->page_table_lock); mm->total_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) { @@ -534,11 +534,11 @@ static struct vm_area_struct * unmap_fixup(struct mm_struct *mm, /* Work out to one of the ends. 
*/ if (end == area->vm_end) { area->vm_end = addr; - vmlist_modify_lock(mm); + spin_lock(&mm->page_table_lock); } else if (addr == area->vm_start) { area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT; area->vm_start = end; - vmlist_modify_lock(mm); + spin_lock(&mm->page_table_lock); } else { /* Unmapping a hole: area->vm_start < addr <= end < area->vm_end */ /* Add end mapping -- leave beginning for below */ @@ -560,12 +560,12 @@ static struct vm_area_struct * unmap_fixup(struct mm_struct *mm, if (mpnt->vm_ops && mpnt->vm_ops->open) mpnt->vm_ops->open(mpnt); area->vm_end = addr; /* Truncate area */ - vmlist_modify_lock(mm); + spin_lock(&mm->page_table_lock); insert_vm_struct(mm, mpnt); } insert_vm_struct(mm, area); - vmlist_modify_unlock(mm); + spin_unlock(&mm->page_table_lock); return extra; } @@ -670,7 +670,7 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) npp = (prev ? &prev->vm_next : &mm->mmap); free = NULL; - vmlist_modify_lock(mm); + spin_lock(&mm->page_table_lock); for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) { *npp = mpnt->vm_next; mpnt->vm_next = free; @@ -679,7 +679,7 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) avl_remove(mpnt, &mm->mmap_avl); } mm->mmap_cache = NULL; /* Kill the cache. */ - vmlist_modify_unlock(mm); + spin_unlock(&mm->page_table_lock); /* Ok - we have the memory areas we should free on the 'free' list, * so release them, and unmap the page range.. @@ -811,10 +811,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len) flags = vma->vm_flags; addr = vma->vm_start; - vmlist_modify_lock(mm); + spin_lock(&mm->page_table_lock); insert_vm_struct(mm, vma); merge_segments(mm, vma->vm_start, vma->vm_end); - vmlist_modify_unlock(mm); + spin_unlock(&mm->page_table_lock); mm->total_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) { @@ -840,10 +840,10 @@ void exit_mmap(struct mm_struct * mm) struct vm_area_struct * mpnt; release_segments(mm); + spin_lock(&mm->page_table_lock); mpnt = mm->mmap; - vmlist_modify_lock(mm); mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL; - vmlist_modify_unlock(mm); + spin_unlock(&mm->page_table_lock); mm->rss = 0; mm->total_vm = 0; mm->locked_vm = 0; @@ -985,9 +985,9 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l if (mpnt->vm_ops && mpnt->vm_ops->close) { mpnt->vm_pgoff += (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; mpnt->vm_start = mpnt->vm_end; - vmlist_modify_unlock(mm); + spin_unlock(&mm->page_table_lock); mpnt->vm_ops->close(mpnt); - vmlist_modify_lock(mm); + spin_lock(&mm->page_table_lock); } mm->map_count--; remove_shared_vm_struct(mpnt); diff --git a/mm/mprotect.c b/mm/mprotect.c index 53fc53acb..7b61abb3e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -30,9 +30,16 @@ static inline void change_pte_range(pmd_t * pmd, unsigned long address, if (end > PMD_SIZE) end = PMD_SIZE; do { - pte_t entry = *pte; - if (pte_present(entry)) + if (pte_present(*pte)) { + pte_t entry; + + /* Avoid an SMP race with hardware updated dirty/clean + * bits by wiping the pte and then setting the new pte + * into place. 
+ */ + entry = ptep_get_and_clear(pte); set_pte(pte, pte_modify(entry, newprot)); + } address += PAGE_SIZE; pte++; } while (address && (address < end)); @@ -86,10 +93,10 @@ static void change_protection(unsigned long start, unsigned long end, pgprot_t n static inline int mprotect_fixup_all(struct vm_area_struct * vma, int newflags, pgprot_t prot) { - vmlist_modify_lock(vma->vm_mm); + spin_lock(&vma->vm_mm->page_table_lock); vma->vm_flags = newflags; vma->vm_page_prot = prot; - vmlist_modify_unlock(vma->vm_mm); + spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -111,11 +118,11 @@ static inline int mprotect_fixup_start(struct vm_area_struct * vma, get_file(n->vm_file); if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); - vmlist_modify_lock(vma->vm_mm); + spin_lock(&vma->vm_mm->page_table_lock); vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT; vma->vm_start = end; insert_vm_struct(current->mm, n); - vmlist_modify_unlock(vma->vm_mm); + spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -138,10 +145,10 @@ static inline int mprotect_fixup_end(struct vm_area_struct * vma, get_file(n->vm_file); if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); - vmlist_modify_lock(vma->vm_mm); + spin_lock(&vma->vm_mm->page_table_lock); vma->vm_end = start; insert_vm_struct(current->mm, n); - vmlist_modify_unlock(vma->vm_mm); + spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -172,7 +179,7 @@ static inline int mprotect_fixup_middle(struct vm_area_struct * vma, vma->vm_ops->open(left); vma->vm_ops->open(right); } - vmlist_modify_lock(vma->vm_mm); + spin_lock(&vma->vm_mm->page_table_lock); vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT; vma->vm_start = start; vma->vm_end = end; @@ -181,7 +188,7 @@ static inline int mprotect_fixup_middle(struct vm_area_struct * vma, vma->vm_page_prot = prot; insert_vm_struct(current->mm, left); insert_vm_struct(current->mm, right); - vmlist_modify_unlock(vma->vm_mm); + spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -263,9 +270,9 @@ asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot break; } } - vmlist_modify_lock(current->mm); + spin_lock(¤t->mm->page_table_lock); merge_segments(current->mm, start, end); - vmlist_modify_unlock(current->mm); + spin_unlock(¤t->mm->page_table_lock); out: up(¤t->mm->mmap_sem); return error; diff --git a/mm/mremap.c b/mm/mremap.c index d1f6a7b8b..719ca1ec1 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -63,14 +63,14 @@ static inline int copy_one_pte(struct mm_struct *mm, pte_t * src, pte_t * dst) pte_t pte; spin_lock(&mm->page_table_lock); - pte = *src; - if (!pte_none(pte)) { - error++; - if (dst) { - pte_clear(src); - set_pte(dst, pte); - error--; + if (!pte_none(*src)) { + pte = ptep_get_and_clear(src); + if (!dst) { + /* No dest? We must put it back. 
*/ + dst = src; + error++; } + set_pte(dst, pte); } spin_unlock(&mm->page_table_lock); return error; @@ -141,10 +141,10 @@ static inline unsigned long move_vma(struct vm_area_struct * vma, get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); - vmlist_modify_lock(current->mm); + spin_lock(¤t->mm->page_table_lock); insert_vm_struct(current->mm, new_vma); merge_segments(current->mm, new_vma->vm_start, new_vma->vm_end); - vmlist_modify_unlock(current->mm); + spin_unlock(¤t->mm->page_table_lock); do_munmap(current->mm, addr, old_len); current->mm->total_vm += new_len >> PAGE_SHIFT; if (new_vma->vm_flags & VM_LOCKED) { @@ -258,9 +258,9 @@ unsigned long do_mremap(unsigned long addr, /* can we just expand the current mapping? */ if (max_addr - addr >= new_len) { int pages = (new_len - old_len) >> PAGE_SHIFT; - vmlist_modify_lock(vma->vm_mm); + spin_lock(&vma->vm_mm->page_table_lock); vma->vm_end = addr + new_len; - vmlist_modify_unlock(vma->vm_mm); + spin_unlock(&vma->vm_mm->page_table_lock); current->mm->total_vm += pages; if (vma->vm_flags & VM_LOCKED) { current->mm->locked_vm += pages; @@ -11,11 +11,11 @@ int numnodes = 1; /* Initialized for UMA platforms */ -#ifndef CONFIG_DISCONTIGMEM - static bootmem_data_t contig_bootmem_data; pg_data_t contig_page_data = { bdata: &contig_bootmem_data }; +#ifndef CONFIG_DISCONTIGMEM + /* * This is meant to be invoked by platforms whose physical memory starts * at a considerably higher value than 0. Examples are Super-H, ARM, m68k. @@ -25,7 +25,7 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, unsigned long *zones_size, unsigned long zone_start_paddr, unsigned long *zholes_size) { - free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size, + free_area_init_core(0, &contig_page_data, &mem_map, zones_size, zone_start_paddr, zholes_size, pmap); } @@ -33,7 +33,11 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order) { +#ifdef CONFIG_NUMA return __alloc_pages(NODE_DATA(nid)->node_zonelists + gfp_mask, order); +#else + return alloc_pages(gfp_mask, order); +#endif } #ifdef CONFIG_DISCONTIGMEM @@ -42,13 +46,12 @@ struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order) static spinlock_t node_lock = SPIN_LOCK_UNLOCKED; -void show_free_areas_node(int nid) +void show_free_areas_node(pg_data_t *pgdat) { unsigned long flags; spin_lock_irqsave(&node_lock, flags); - printk("Memory information for node %d:\n", nid); - show_free_areas_core(nid); + show_free_areas_core(pgdat); spin_unlock_irqrestore(&node_lock, flags); } @@ -75,10 +78,16 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, for (i = 0; i < MAX_NR_ZONES; i++) size += zones_size[i]; size = LONG_ALIGN((size + 7) >> 3); - pgdat->valid_addr_bitmap = (unsigned long *)alloc_bootmem_node(nid, size); + pgdat->valid_addr_bitmap = (unsigned long *)alloc_bootmem_node(pgdat, size); memset(pgdat->valid_addr_bitmap, 0, size); } +static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask, + unsigned long order) +{ + return __alloc_pages(pgdat->node_zonelists + gfp_mask, order); +} + /* * This can be refined. Currently, tries to do round robin, instead * should do concentratic circle search, starting from current node. 
@@ -86,33 +95,34 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, struct page * alloc_pages(int gfp_mask, unsigned long order) { struct page *ret = 0; - int startnode, tnode; + pg_data_t *start, *temp; #ifndef CONFIG_NUMA unsigned long flags; - static int nextnid = 0; + static pg_data_t *next = 0; #endif if (order >= MAX_ORDER) return NULL; #ifdef CONFIG_NUMA - tnode = numa_node_id(); + temp = NODE_DATA(numa_node_id()); #else spin_lock_irqsave(&node_lock, flags); - tnode = nextnid; - nextnid++; - if (nextnid == numnodes) - nextnid = 0; + if (!next) next = pgdat_list; + temp = next; + next = next->node_next; spin_unlock_irqrestore(&node_lock, flags); #endif - startnode = tnode; - while (tnode < numnodes) { - if ((ret = alloc_pages_node(tnode++, gfp_mask, order))) + start = temp; + while (temp) { + if ((ret = alloc_pages_pgdat(temp, gfp_mask, order))) return(ret); + temp = temp->node_next; } - tnode = 0; - while (tnode != startnode) { - if ((ret = alloc_pages_node(tnode++, gfp_mask, order))) + temp = pgdat_list; + while (temp != start) { + if ((ret = alloc_pages_pgdat(temp, gfp_mask, order))) return(ret); + temp = temp->node_next; } return(0); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c new file mode 100644 index 000000000..9882fe7cd --- /dev/null +++ b/mm/oom_kill.c @@ -0,0 +1,210 @@ +/* + * linux/mm/oom_kill.c + * + * Copyright (C) 1998,2000 Rik van Riel + * Thanks go out to Claus Fischer for some serious inspiration and + * for goading me into coding this file... + * + * The routines in this file are used to kill a process when + * we're seriously out of memory. This gets called from kswapd() + * in linux/mm/vmscan.c when we really run out of memory. + * + * Since we won't call these routines often (on a well-configured + * machine) this file will double as a 'coding guide' and a signpost + * for newbie kernel hackers. It features several pointers to major + * kernel subsystems and hints as to where to find out what things do. + */ + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/swap.h> +#include <linux/swapctl.h> +#include <linux/timex.h> + +/* #define DEBUG */ + +/** + * int_sqrt - oom_kill.c internal function, rough approximation to sqrt + * @x: integer of which to calculate the sqrt + * + * A very rough approximation to the sqrt() function. + */ +static unsigned int int_sqrt(unsigned int x) +{ + unsigned int out = x; + while (x & ~(unsigned int)1) x >>=2, out >>=1; + if (x) out -= out >> 2; + return (out ? out : 1); +} + +/** + * oom_badness - calculate a numeric value for how bad this task has been + * @p: task struct of which task we should calculate + * + * The formula used is relatively simple and documented inline in the + * function. The main rationale is that we want to select a good task + * to kill when we run out of memory. + * + * Good in this context means that: + * 1) we lose the minimum amount of work done + * 2) we recover a large amount of memory + * 3) we don't kill anything innocent of eating tons of memory + * 4) we want to kill the minimum amount of processes (one) + * 5) we try to kill the process the user expects us to kill, this + * algorithm has been meticulously tuned to meet the priniciple + * of least surprise ... (be careful when you change it) + */ + +static int badness(struct task_struct *p) +{ + int points, cpu_time, run_time; + + if (!p->mm) + return 0; + /* + * The memory size of the process is the basis for the badness. 
+ */ + points = p->mm->total_vm; + + /* + * CPU time is in seconds and run time is in minutes. There is no + * particular reason for this other than that it turned out to work + * very well in practice. This is not safe against jiffie wraps + * but we don't care _that_ much... + */ + cpu_time = (p->times.tms_utime + p->times.tms_stime) >> (SHIFT_HZ + 3); + run_time = (jiffies - p->start_time) >> (SHIFT_HZ + 10); + + points /= int_sqrt(cpu_time); + points /= int_sqrt(int_sqrt(run_time)); + + /* + * Niced processes are most likely less important, so double + * their badness points. + */ + if (p->nice > 0) + points *= 2; + + /* + * Superuser processes are usually more important, so we make it + * less likely that we kill those. + */ + if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) || + p->uid == 0 || p->euid == 0) + points /= 4; + + /* + * We don't want to kill a process with direct hardware access. + * Not only could that mess up the hardware, but usually users + * tend to only have this flag set on applications they think + * of as important. + */ + if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) + points /= 4; +#ifdef DEBUG + printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n", + p->pid, p->comm, points); +#endif + return points; +} + +/* + * Simple selection loop. We chose the process with the highest + * number of 'points'. We need the locks to make sure that the + * list of task structs doesn't change while we look the other way. + * + * (not docbooked, we don't want this one cluttering up the manual) + */ +static struct task_struct * select_bad_process(void) +{ + int points = 0, maxpoints = 0; + struct task_struct *p = NULL; + struct task_struct *chosen = NULL; + + read_lock(&tasklist_lock); + for_each_task(p) + { + if (p->pid) + points = badness(p); + if (points > maxpoints) { + chosen = p; + maxpoints = points; + } + } + read_unlock(&tasklist_lock); + return chosen; +} + +/** + * oom_kill - kill the "best" process when we run out of memory + * + * If we run out of memory, we have the choice between either + * killing a random task (bad), letting the system crash (worse) + * OR try to be smart about which process to kill. Note that we + * don't have to be perfect here, we just have to be good. + * + * We must be careful though to never send SIGKILL a process with + * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that + * we select a process with CAP_SYS_RAW_IO set). + */ +void oom_kill(void) +{ + + struct task_struct *p = select_bad_process(); + + /* Found nothing?!?! Either we hang forever, or we panic. */ + if (p == NULL) + panic("Out of memory and no killable processes...\n"); + + printk(KERN_ERR "Out of Memory: Killed process %d (%s).", p->pid, p->comm); + + /* + * We give our sacrificial lamb high priority and access to + * all the memory it needs. That way it should be able to + * exit() and clear out its resources quickly... + */ + p->counter = 5 * HZ; + p->flags |= PF_MEMALLOC; + + /* This process has hardware access, be more careful. */ + if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) { + force_sig(SIGTERM, p); + } else { + force_sig(SIGKILL, p); + } + + /* + * Make kswapd go out of the way, so "p" has a good chance of + * killing itself before someone else gets the chance to ask + * for more memory. + */ + current->policy |= SCHED_YIELD; + schedule(); + return; +} + +/** + * out_of_memory - is the system out of memory? + * + * Returns 0 if there is still enough memory left, + * 1 when we are out of memory (otherwise). 
+ */ +int out_of_memory(void) +{ + struct sysinfo swp_info; + + /* Enough free memory? Not OOM. */ + if (nr_free_pages() > freepages.min) + return 0; + + if (nr_free_pages() + nr_inactive_clean_pages() > freepages.low) + return 0; + + /* Enough swap space left? Not OOM. */ + si_swapinfo(&swp_info); + if (swp_info.freeswap > 0) + return 0; + + /* Else... */ + return 1; +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0b5990a11..90c077439 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -17,13 +17,6 @@ #include <linux/pagemap.h> #include <linux/bootmem.h> -/* Use NUMNODES instead of numnodes for better code inside kernel APIs */ -#ifndef CONFIG_DISCONTIGMEM -#define NUMNODES 1 -#else -#define NUMNODES numnodes -#endif - int nr_swap_pages; int nr_active_pages; int nr_inactive_dirty_pages; @@ -294,7 +287,7 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order) zone_t **zone; int direct_reclaim = 0; unsigned int gfp_mask = zonelist->gfp_mask; - struct page * page = NULL; + struct page * page; /* * Allocations put pressure on the VM subsystem. @@ -329,7 +322,7 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order) * wake up bdflush. */ else if (free_shortage() && nr_inactive_dirty_pages > free_shortage() - && nr_inactive_dirty_pages > freepages.high) + && nr_inactive_dirty_pages >= freepages.high) wakeup_bdflush(0); try_again: @@ -347,7 +340,7 @@ try_again: if (!z->size) BUG(); - if (z->free_pages > z->pages_low) { + if (z->free_pages >= z->pages_low) { page = rmqueue(z, order); if (page) return page; @@ -517,17 +510,17 @@ try_again: * happen when the OOM killer selects this task for * instant execution... */ - if (direct_reclaim) + if (direct_reclaim) { page = reclaim_page(z); - if (page) - return page; + if (page) + return page; + } /* XXX: is pages_min/4 a good amount to reserve for this? */ if (z->free_pages < z->pages_min / 4 && !(current->flags & PF_MEMALLOC)) continue; - if (!page) - page = rmqueue(z, order); + page = rmqueue(z, order); if (page) return page; } @@ -588,12 +581,14 @@ unsigned int nr_free_pages (void) { unsigned int sum; zone_t *zone; - int i; + pg_data_t *pgdat = pgdat_list; sum = 0; - for (i = 0; i < NUMNODES; i++) - for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++) + while (pgdat) { + for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) sum += zone->free_pages; + pgdat = pgdat->node_next; + } return sum; } @@ -604,12 +599,14 @@ unsigned int nr_inactive_clean_pages (void) { unsigned int sum; zone_t *zone; - int i; + pg_data_t *pgdat = pgdat_list; sum = 0; - for (i = 0; i < NUMNODES; i++) - for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++) + while (pgdat) { + for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) sum += zone->inactive_clean_pages; + pgdat = pgdat->node_next; + } return sum; } @@ -644,11 +641,13 @@ unsigned int nr_free_buffer_pages (void) #if CONFIG_HIGHMEM unsigned int nr_free_highpages (void) { - int i; + pg_data_t *pgdat = pgdat_list; unsigned int pages = 0; - for (i = 0; i < NUMNODES; i++) - pages += NODE_DATA(i)->node_zones[ZONE_HIGHMEM].free_pages; + while (pgdat) { + pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; + pgdat = pgdat->node_next; + } return pages; } #endif @@ -658,7 +657,7 @@ unsigned int nr_free_highpages (void) * We also calculate the percentage fragmentation. 
We do this by counting the * memory on each free list with the exception of the first item on the list. */ -void show_free_areas_core(int nid) +void show_free_areas_core(pg_data_t *pgdat) { unsigned long order; unsigned type; @@ -678,7 +677,7 @@ void show_free_areas_core(int nid) for (type = 0; type < MAX_NR_ZONES; type++) { struct list_head *head, *curr; - zone_t *zone = NODE_DATA(nid)->node_zones + type; + zone_t *zone = pgdat->node_zones + type; unsigned long nr, total, flags; total = 0; @@ -710,7 +709,7 @@ void show_free_areas_core(int nid) void show_free_areas(void) { - show_free_areas_core(0); + show_free_areas_core(pgdat_list); } /* @@ -780,9 +779,6 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, unsigned long totalpages, offset, realtotalpages; unsigned int cumulative = 0; - pgdat->node_next = pgdat_list; - pgdat_list = pgdat; - totalpages = 0; for (i = 0; i < MAX_NR_ZONES; i++) { unsigned long size = zones_size[i]; @@ -795,21 +791,6 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, printk("On node %d totalpages: %lu\n", nid, realtotalpages); - /* - * Select nr of pages we try to keep free for important stuff - * with a minimum of 10 pages and a maximum of 256 pages, so - * that we don't waste too much memory on large systems. - * This is fairly arbitrary, but based on some behaviour - * analysis. - */ - i = realtotalpages >> 7; - if (i < 10) - i = 10; - if (i > 256) - i = 256; - freepages.min += i; - freepages.low += i * 2; - freepages.high += i * 3; memlist_init(&active_list); memlist_init(&inactive_dirty_list); @@ -822,7 +803,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, */ map_size = (totalpages + 1)*sizeof(struct page); if (lmem_map == (struct page *)0) { - lmem_map = (struct page *) alloc_bootmem_node(nid, map_size); + lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size); lmem_map = (struct page *)(PAGE_OFFSET + MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET)); } @@ -875,6 +856,20 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, zone->pages_min = mask; zone->pages_low = mask*2; zone->pages_high = mask*3; + /* + * Add these free targets to the global free target; + * we have to be SURE that freepages.high is higher + * than SUM [zone->pages_min] for all zones, otherwise + * we may have bad bad problems. + * + * This means we cannot make the freepages array writable + * in /proc, but have to add a separate extra_free_target + * for people who require it to catch load spikes in eg. + * gigabit ethernet routing... 
+ */ + freepages.min += mask; + freepages.low += mask*2; + freepages.high += mask*3; zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; zone->zone_start_paddr = zone_start_paddr; @@ -900,7 +895,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, bitmap_size = (bitmap_size + 7) >> 3; bitmap_size = LONG_ALIGN(bitmap_size); zone->free_area[i].map = - (unsigned int *) alloc_bootmem_node(nid, bitmap_size); + (unsigned int *) alloc_bootmem_node(pgdat, bitmap_size); } } build_zonelists(pgdat); @@ -908,7 +903,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, void __init free_area_init(unsigned long *zones_size) { - free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size, 0, 0, 0); + free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0); } static int __init setup_mem_frac(char *str) @@ -174,6 +174,7 @@ void deactivate_page_nolock(struct page * page) */ int maxcount = (page->buffers ? 3 : 2); page->age = 0; + ClearPageReferenced(page); /* * Don't touch it if it's not on the active list. diff --git a/mm/swapfile.c b/mm/swapfile.c index fa4cb133e..688e2fcdd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -223,10 +223,10 @@ static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, if (pte_page(pte) != page) return; /* We will be removing the swap cache in a moment, so... */ - set_pte(dir, pte_mkdirty(pte)); + ptep_mkdirty(dir); return; } - if (pte_val(pte) != entry.val) + if (pte_to_swp_entry(pte).val != entry.val) return; set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot))); swap_free(entry); @@ -315,12 +315,12 @@ static void unuse_process(struct mm_struct * mm, */ if (!mm) return; - vmlist_access_lock(mm); + spin_lock(&mm->page_table_lock); for (vma = mm->mmap; vma; vma = vma->vm_next) { pgd_t * pgd = pgd_offset(mm, vma->vm_start); unuse_vma(vma, pgd, entry, page); } - vmlist_access_unlock(mm); + spin_unlock(&mm->page_table_lock); return; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e8c557e04..15261612e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -34,8 +34,8 @@ static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned lo if (end > PMD_SIZE) end = PMD_SIZE; do { - pte_t page = *pte; - pte_clear(pte); + pte_t page; + page = ptep_get_and_clear(pte); address += PAGE_SIZE; pte++; if (pte_none(page)) @@ -142,15 +142,14 @@ inline int vmalloc_area_pages (unsigned long address, unsigned long size, flush_cache_all(); do { pmd_t *pmd; - pgd_t olddir = *dir; pmd = pmd_alloc_kernel(dir, address); if (!pmd) return -ENOMEM; + if (alloc_area_pmd(pmd, address, end - address, gfp_mask, prot)) return -ENOMEM; - if (pgd_val(olddir) != pgd_val(*dir)) - set_pgdir(address, *dir); + address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); @@ -222,14 +221,11 @@ void * __vmalloc (unsigned long size, int gfp_mask, pgprot_t prot) return NULL; } area = get_vm_area(size, VM_ALLOC); - if (!area) { - BUG(); + if (!area) return NULL; - } addr = area->addr; if (vmalloc_area_pages(VMALLOC_VMADDR(addr), size, gfp_mask, prot)) { vfree(addr); - BUG(); return NULL; } return addr; diff --git a/mm/vmscan.c b/mm/vmscan.c index aacd9a5b0..d7fd0aca8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -55,22 +55,8 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un onlist = PageActive(page); /* Don't look at this pte if it's been accessed recently. 
*/ - if (pte_young(pte)) { - set_pte(page_table, pte_mkold(pte)); - if (onlist) { - /* - * Transfer the "accessed" bit from the page - * tables to the global page map. Page aging - * will be done by refill_inactive_scan(). - */ - SetPageReferenced(page); - } else { - /* - * The page is not on the active list, so - * we have to do the page aging ourselves. - */ - age_page_up(page); - } + if (ptep_test_and_clear_young(page_table)) { + age_page_up(page); goto out_failed; } if (!onlist) @@ -88,6 +74,13 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un if (TryLockPage(page)) goto out_failed; + /* From this point on, the odds are that we're going to + * nuke this pte, so read and clear the pte. This hook + * is needed on CPUs which update the accessed and dirty + * bits in hardware. + */ + pte = ptep_get_and_clear(page_table); + /* * Is the page already in the swap cache? If so, then * we can just drop our reference to it without doing @@ -124,7 +117,6 @@ drop_pte: */ if (!pte_dirty(pte)) { flush_cache_page(vma, address); - pte_clear(page_table); goto drop_pte; } @@ -134,7 +126,7 @@ drop_pte: * locks etc. */ if (!(gfp_mask & __GFP_IO)) - goto out_unlock; + goto out_unlock_restore; /* * Don't do any of the expensive stuff if @@ -143,7 +135,7 @@ drop_pte: if (page->zone->free_pages + page->zone->inactive_clean_pages + page->zone->inactive_dirty_pages > page->zone->pages_high + inactive_target) - goto out_unlock; + goto out_unlock_restore; /* * Ok, it's really dirty. That means that @@ -169,10 +161,10 @@ drop_pte: int error; struct file *file = vma->vm_file; if (file) get_file(file); - pte_clear(page_table); + mm->rss--; flush_tlb_page(vma, address); - vmlist_access_unlock(mm); + spin_unlock(&mm->page_table_lock); error = swapout(page, file); UnlockPage(page); if (file) fput(file); @@ -191,7 +183,7 @@ drop_pte: */ entry = get_swap_page(); if (!entry.val) - goto out_unlock; /* No swap space left */ + goto out_unlock_restore; /* No swap space left */ if (!(page = prepare_highmem_swapout(page))) goto out_swap_free; @@ -205,7 +197,7 @@ drop_pte: mm->rss--; set_pte(page_table, swp_entry_to_pte(entry)); flush_tlb_page(vma, address); - vmlist_access_unlock(mm); + spin_unlock(&mm->page_table_lock); /* OK, do a physical asynchronous write to swap. */ rw_swap_page(WRITE, page, 0); @@ -215,10 +207,12 @@ out_free_success: page_cache_release(page); return 1; out_swap_free: + set_pte(page_table, pte); swap_free(entry); out_failed: return 0; -out_unlock: +out_unlock_restore: + set_pte(page_table, pte); UnlockPage(page); return 0; } @@ -307,7 +301,7 @@ static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsi unsigned long end; /* Don't swap out areas which are locked down */ - if (vma->vm_flags & VM_LOCKED) + if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) return 0; pgdir = pgd_offset(mm, address); @@ -341,7 +335,7 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask) * Find the proper vm-area after freezing the vma chain * and ptes. */ - vmlist_access_lock(mm); + spin_lock(&mm->page_table_lock); vma = find_vma(mm, address); if (vma) { if (address < vma->vm_start) @@ -364,7 +358,7 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask) mm->swap_cnt = 0; out_unlock: - vmlist_access_unlock(mm); + spin_unlock(&mm->page_table_lock); /* We didn't find anything for the process */ return 0; @@ -790,7 +784,8 @@ int refill_inactive_scan(unsigned int priority, int oneshot) * * SUBTLE: we can have buffer pages with count 1. 
*/ - if (page_count(page) <= (page->buffers ? 2 : 1)) { + if (page->age == 0 && page_count(page) <= + (page->buffers ? 2 : 1)) { deactivate_page_nolock(page); page_active = 0; } else { @@ -837,8 +832,9 @@ int free_shortage(void) for(i = 0; i < MAX_NR_ZONES; i++) { zone_t *zone = pgdat->node_zones+ i; if (zone->size && (zone->inactive_clean_pages + - zone->free_pages < zone->pages_min)) { - sum += zone->pages_min; + zone->free_pages < zone->pages_min+1)) { + /* + 1 to have overlap with alloc_pages() !! */ + sum += zone->pages_min + 1; sum -= zone->free_pages; sum -= zone->inactive_clean_pages; } @@ -1095,12 +1091,20 @@ int kswapd(void *unused) * We go to sleep for one second, but if it's needed * we'll be woken up earlier... */ - if (!free_shortage() || !inactive_shortage()) + if (!free_shortage() || !inactive_shortage()) { interruptible_sleep_on_timeout(&kswapd_wait, HZ); /* - * TODO: insert out of memory check & oom killer - * invocation in an else branch here. + * If we couldn't free enough memory, we see if it was + * due to the system just not having enough memory. + * If that is the case, the only solution is to kill + * a process (the alternative is enternal deadlock). + * + * If there still is enough memory around, we just loop + * and try free some more memory... */ + } else if (out_of_memory()) { + oom_kill(); + } } } |
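
A second recurring pattern in the diff is replacing fixed loops over node indices (for (i = 0; i < NUMNODES; i++) ... NODE_DATA(i)) with a walk of the pgdat_list chain that init_bootmem_core() now builds, as in nr_free_pages(), nr_free_highpages() and alloc_pages(). A compilable sketch of that loop shape is below; the node_info struct is a simplified stand-in for pg_data_t and the counts are illustrative only.

/*
 * Sketch of the pgdat_list traversal pattern introduced by this diff.
 * node_info is a hypothetical, trimmed-down analogue of pg_data_t.
 */
#include <stddef.h>
#include <stdio.h>

#define MAX_NR_ZONES 3

struct zone_info {
	unsigned long free_pages;
};

struct node_info {
	struct zone_info zones[MAX_NR_ZONES];
	struct node_info *node_next;   /* chained at init time, like pgdat->node_next */
};

static struct node_info *node_list;    /* analogue of pgdat_list */

/* Sum free pages over every zone of every node by walking the chain,
 * instead of indexing NODE_DATA(0..numnodes-1). */
static unsigned long count_free_pages(void)
{
	unsigned long sum = 0;
	struct node_info *node;

	for (node = node_list; node != NULL; node = node->node_next) {
		int i;
		for (i = 0; i < MAX_NR_ZONES; i++)
			sum += node->zones[i].free_pages;
	}
	return sum;
}

int main(void)
{
	static struct node_info n0, n1;

	n0.zones[0].free_pages = 100;
	n0.node_next = &n1;
	n1.zones[2].free_pages = 50;
	n1.node_next = NULL;
	node_list = &n0;

	printf("total free pages: %lu\n", count_free_pages());  /* 150 */
	return 0;
}
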