From 78c388aed2b7184182c08428db1de6c872d815f5 Mon Sep 17 00:00:00 2001
From: Ralf Baechle
Date: Mon, 4 Jan 1999 16:03:48 +0000
Subject: Merge with Linux 2.1.131 and more MIPS goodies.

(Did I mention that CVS is buggy ...)
---
 mm/filemap.c    | 145 ++++++++++++++++-----------------
 mm/memory.c     | 224 +++++++++++++++++++++++++--------------------------
 mm/mlock.c      |   8 --
 mm/mmap.c       |  74 ++++++++++-------
 mm/mprotect.c   |  10 +--
 mm/mremap.c     |  21 +++--
 mm/page_alloc.c |  36 ++++-----
 mm/page_io.c    |  13 ++-
 mm/slab.c       |  18 ++---
 mm/swap.c       |  12 +--
 mm/swap_state.c |  80 ++++++++++++++-----
 mm/swapfile.c   |  28 +++----
 mm/vmalloc.c    |   7 +-
 mm/vmscan.c     | 244 ++++++++++++++++++++++++--------------------------
 14 files changed, 443 insertions(+), 477 deletions(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index ffda2b7c1..227bcd5a9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -9,26 +9,17 @@
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
-#include
-#include
-#include
-#include
+#include
 #include
-#include
 #include
-#include
-#include
-#include
 #include
 #include
 #include
-#include
 #include
 #include
 #include
 #include
-#include
 #include
 #include
@@ -153,7 +144,7 @@ static inline int shrink_one_page(struct page *page, int gfp_mask)
 } while (tmp != bh);
 /* Refuse to swap out all buffer pages */
- if ((buffermem >> PAGE_SHIFT) * 100 < (buffer_mem.min_percent * num_physpages))
+ if (buffer_under_min())
 goto next;
 }
@@ -167,14 +158,9 @@ static inline int shrink_one_page(struct page *page, int gfp_mask)
 case 1:
 /* is it a swap-cache or page-cache page? */
 if (page->inode) {
- if (test_and_clear_bit(PG_referenced, &page->flags)) {
- touch_page(page);
- break;
- }
- age_page(page);
- if (page->age)
+ if (test_and_clear_bit(PG_referenced, &page->flags))
 break;
- if (page_cache_size * 100 < (page_cache.min_percent * num_physpages))
+ if (pgcache_under_min())
 break;
 if (PageSwapCache(page)) {
 delete_from_swap_cache(page);
@@ -188,6 +174,9 @@ static inline int shrink_one_page(struct page *page, int gfp_mask)
 if (test_and_clear_bit(PG_referenced, &page->flags))
 break;
+ if (buffer_under_min())
+ break;
+
 /* is it a buffer cache page? */
 if (bh && try_to_free_buffer(bh, &bh, 6))
 return 1;
@@ -211,7 +200,7 @@ int shrink_mmap(int priority, int gfp_mask)
 struct page * page;
 int count_max, count_min;
- count_max = (limit<<2) >> (priority>>1);
+ count_max = limit;
 count_min = (limit<<2) >> (priority);
 page = mem_map + clock;
@@ -225,7 +214,15 @@ int shrink_mmap(int priority, int gfp_mask)
 if (shrink_one_page(page, gfp_mask))
 return 1;
 count_max--;
- if (page->inode || page->buffers)
+ /*
+ * If the page we looked at was recyclable but we didn't
+ * reclaim it (presumably due to PG_referenced), don't
+ * count it as scanned. This way, the more referenced
+ * page cache pages we encounter, the more rapidly we
+ * will age them.
+ */ + if (atomic_read(&page->count) != 1 || + (!page->inode && !page->buffers)) count_min--; page++; clock++; @@ -292,7 +289,7 @@ static inline void add_to_page_cache(struct page * page, struct page **hash) { atomic_inc(&page->count); - page->flags &= ~((1 << PG_uptodate) | (1 << PG_error)); + page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced); page->offset = offset; add_page_to_inode_queue(inode, page); __add_page_to_hash_queue(page, hash); @@ -313,7 +310,7 @@ static unsigned long try_to_read_ahead(struct file * file, offset &= PAGE_MASK; switch (page_cache) { case 0: - page_cache = get_user_page(offset); + page_cache = __get_free_page(GFP_USER); if (!page_cache) break; default: @@ -327,7 +324,6 @@ static unsigned long try_to_read_ahead(struct file * file, */ page = mem_map + MAP_NR(page_cache); add_to_page_cache(page, inode, offset, hash); - set_bit(PG_referenced, &page->flags); inode->i_op->readpage(file, page); page_cache = 0; } @@ -736,7 +732,7 @@ no_cached_page: * page.. */ if (!page_cache) { - page_cache = get_user_page(pos & PAGE_MASK); + page_cache = __get_free_page(GFP_USER); /* * That could have slept, so go around to the * very beginning.. @@ -1002,7 +998,7 @@ found_page: * extra page -- better to overlap the allocation with the I/O. */ if (no_share && !new_page) { - new_page = get_user_page(address); + new_page = __get_free_page(GFP_USER); if (!new_page) goto failure; } @@ -1039,7 +1035,7 @@ success: return new_page; no_cached_page: - new_page = get_user_page(address); + new_page = __get_free_page(GFP_USER); if (!new_page) goto no_page; @@ -1067,8 +1063,7 @@ no_cached_page: * Do a very limited read-ahead if appropriate */ if (PageLocked(page)) - new_page = try_to_read_ahead(file, offset + PAGE_SIZE, - get_user_page(address + PAGE_SIZE)); + new_page = try_to_read_ahead(file, offset + PAGE_SIZE, 0); goto found_page; page_locked_wait: @@ -1520,39 +1515,58 @@ generic_file_write(struct file *file, const char *buf, { struct dentry *dentry = file->f_dentry; struct inode *inode = dentry->d_inode; + unsigned long pos = *ppos; + unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; struct page *page, **hash; unsigned long page_cache = 0; - unsigned long pgpos, offset; - unsigned long bytes, written; - unsigned long pos; - long status, sync, didread; + unsigned long written; + long status, sync; if (!inode->i_op || !inode->i_op->updatepage) return -EIO; sync = file->f_flags & O_SYNC; - pos = *ppos; written = 0; - status = 0; if (file->f_flags & O_APPEND) pos = inode->i_size; + /* + * Check whether we've reached the file size limit. + */ + status = -EFBIG; + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + + status = 0; + /* + * Check whether to truncate the write, + * and send the signal if we do. + */ + if (count > limit - pos) { + send_sig(SIGXFSZ, current, 0); + count = limit - pos; + } + while (count) { + unsigned long bytes, pgpos, offset; /* * Try to find the page in the cache. If it isn't there, * allocate a free page. 
*/ offset = (pos & ~PAGE_MASK); pgpos = pos & PAGE_MASK; - - if ((bytes = PAGE_SIZE - offset) > count) + bytes = PAGE_SIZE - offset; + if (bytes > count) bytes = count; hash = page_hash(inode, pgpos); - if (!(page = __find_page(inode, pgpos, *hash))) { + page = __find_page(inode, pgpos, *hash); + if (!page) { if (!page_cache) { - page_cache = get_user_page(pgpos); + page_cache = __get_free_page(GFP_USER); if (page_cache) continue; status = -ENOMEM; @@ -1563,51 +1577,25 @@ generic_file_write(struct file *file, const char *buf, page_cache = 0; } - /* - * Note: setting of the PG_locked bit is handled - * below the i_op->xxx interface. - */ - didread = 0; -page_wait: + /* Get exclusive IO access to the page.. */ wait_on_page(page); - if (PageUptodate(page)) - goto do_update_page; + set_bit(PG_locked, &page->flags); /* - * The page is not up-to-date ... if we're writing less - * than a full page of data, we may have to read it first. - * But if the page is past the current end of file, we must - * clear it before updating. + * Do the real work.. If the writer ends up delaying the write, + * the writer needs to increment the page use counts until he + * is done with the page. */ - if (bytes < PAGE_SIZE) { - if (pgpos < inode->i_size) { - status = -EIO; - if (didread >= 2) - goto done_with_page; - status = inode->i_op->readpage(file, page); - if (status < 0) - goto done_with_page; - didread++; - goto page_wait; - } else { - /* Must clear for partial writes */ - memset((void *) page_address(page), 0, - PAGE_SIZE); - } - } - /* - * N.B. We should defer setting PG_uptodate at least until - * the data is copied. A failure in i_op->updatepage() could - * leave the page with garbage data. - */ - set_bit(PG_uptodate, &page->flags); - -do_update_page: - /* All right, the page is there. Now update it. */ - status = inode->i_op->updatepage(file, page, buf, - offset, bytes, sync); -done_with_page: + bytes -= copy_from_user((u8*)page_address(page) + offset, buf, bytes); + status = -EFAULT; + if (bytes) + status = inode->i_op->updatepage(file, page, offset, bytes, sync); + + /* Mark it unlocked again and drop the page.. */ + clear_bit(PG_locked, &page->flags); + wake_up(&page->wait); __free_page(page); + if (status < 0) break; @@ -1622,6 +1610,7 @@ done_with_page: if (page_cache) free_page(page_cache); +out: return written ? written : status; } diff --git a/mm/memory.c b/mm/memory.c index 388d9ce03..932c35648 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -33,23 +33,13 @@ * Idea by Alex Bligh (alex@cconcepts.co.uk) */ -#include -#include -#include -#include -#include -#include -#include -#include #include +#include #include -#include #include -#include #include #include -#include unsigned long max_mapnr = 0; unsigned long num_physpages = 0; @@ -289,10 +279,6 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; } if (cow) pte = pte_wrprotect(pte); -#if 0 /* No longer needed with the new swap cache code */ - if (delete_from_swap_cache(&mem_map[page_nr])) - pte = pte_mkdirty(pte); -#endif set_pte(dst_pte, pte_mkold(pte)); set_pte(src_pte, pte); atomic_inc(&mem_map[page_nr].count); @@ -635,15 +621,15 @@ unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsig * change only once the write actually happens. This avoids a few races, * and potentially makes it more efficient. 
*/ -static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, - unsigned long address, int write_access, pte_t *page_table) +static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, + unsigned long address, pte_t *page_table) { pte_t pte; unsigned long old_page, new_page; struct page * page_map; pte = *page_table; - new_page = get_user_page(address); + new_page = __get_free_page(GFP_USER); /* Did someone else copy this page for us while we slept? */ if (pte_val(*page_table) != pte_val(pte)) goto end_wp_page; @@ -661,40 +647,42 @@ static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, * Do we need to copy? */ if (is_page_shared(page_map)) { - if (new_page) { - if (PageReserved(mem_map + MAP_NR(old_page))) - ++vma->vm_mm->rss; - copy_cow_page(old_page,new_page); - flush_page_to_ram(old_page); - flush_page_to_ram(new_page); - flush_cache_page(vma, address); - set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); - free_page(old_page); - flush_tlb_page(vma, address); - return; - } + unlock_kernel(); + if (!new_page) + return 0; + + if (PageReserved(mem_map + MAP_NR(old_page))) + ++vma->vm_mm->rss; + copy_cow_page(old_page,new_page); + flush_page_to_ram(old_page); + flush_page_to_ram(new_page); flush_cache_page(vma, address); - set_pte(page_table, BAD_PAGE); - flush_tlb_page(vma, address); + set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); free_page(old_page); - oom(tsk); - return; + flush_tlb_page(vma, address); + return 1; } + if (PageSwapCache(page_map)) delete_from_swap_cache(page_map); + + /* We can release the kernel lock now.. */ + unlock_kernel(); + flush_cache_page(vma, address); set_pte(page_table, pte_mkdirty(pte_mkwrite(pte))); flush_tlb_page(vma, address); +end_wp_page: if (new_page) free_page(new_page); - return; + return 1; + bad_wp_page: printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page); send_sig(SIGKILL, tsk, 1); -end_wp_page: if (new_page) free_page(new_page); - return; + return 0; } /* @@ -783,30 +771,53 @@ void vmtruncate(struct inode * inode, unsigned long offset) } -static inline void do_swap_page(struct task_struct * tsk, +/* + * This is called with the kernel lock held, we need + * to return without it. 
+ */ +static int do_swap_page(struct task_struct * tsk, struct vm_area_struct * vma, unsigned long address, pte_t * page_table, pte_t entry, int write_access) { - pte_t page; - if (!vma->vm_ops || !vma->vm_ops->swapin) { - swap_in(tsk, vma, address, page_table, pte_val(entry), write_access); + swap_in(tsk, vma, page_table, pte_val(entry), write_access); flush_page_to_ram(pte_page(*page_table)); - return; + } else { + pte_t page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry)); + if (pte_val(*page_table) != pte_val(entry)) { + free_page(pte_page(page)); + } else { + if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 && + !(vma->vm_flags & VM_SHARED)) + page = pte_wrprotect(page); + ++vma->vm_mm->rss; + ++tsk->maj_flt; + flush_page_to_ram(pte_page(page)); + set_pte(page_table, page); + } } - page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry)); - if (pte_val(*page_table) != pte_val(entry)) { - free_page(pte_page(page)); - return; + unlock_kernel(); + return 1; +} + +/* + * This only needs the MM semaphore + */ +static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) +{ + pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); + if (write_access) { + unsigned long page = __get_free_page(GFP_USER); + if (!page) + return 0; + clear_page(page); + entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + vma->vm_mm->rss++; + tsk->min_flt++; + flush_page_to_ram(page); } - if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 && - !(vma->vm_flags & VM_SHARED)) - page = pte_wrprotect(page); - ++vma->vm_mm->rss; - ++tsk->maj_flt; - flush_page_to_ram(pte_page(page)); - set_pte(page_table, page); - return; + put_page(page_table, entry); + return 1; } /* @@ -817,26 +828,34 @@ static inline void do_swap_page(struct task_struct * tsk, * * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. + * + * This is called with the MM semaphore and the kernel lock held. + * We need to release the kernel lock as soon as possible.. */ -static void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, - unsigned long address, int write_access, pte_t *page_table, pte_t entry) +static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, + unsigned long address, int write_access, pte_t *page_table) { unsigned long page; + pte_t entry; + + if (!vma->vm_ops || !vma->vm_ops->nopage) { + unlock_kernel(); + return do_anonymous_page(tsk, vma, page_table, write_access, + address); + } - if (!pte_none(entry)) - goto swap_page; - address &= PAGE_MASK; - if (!vma->vm_ops || !vma->vm_ops->nopage) - goto anonymous_page; /* * The third argument is "no_share", which tells the low-level code * to copy, not share the page even if sharing is possible. It's - * essentially an early COW detection + * essentially an early COW detection. */ - page = vma->vm_ops->nopage(vma, address, + page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access); + + unlock_kernel(); if (!page) - goto sigbus; + return 0; + ++tsk->maj_flt; ++vma->vm_mm->rss; /* @@ -849,7 +868,6 @@ static void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, * so we can make it writable and dirty to avoid having to * handle that later. */ -/* do_no_page might already have flushed the page ... 
*/ flush_page_to_ram(page); entry = mk_pte(page, vma->vm_page_prot); if (write_access) { @@ -859,32 +877,7 @@ static void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, entry = pte_wrprotect(entry); put_page(page_table, entry); /* no need to invalidate: a not-present page shouldn't be cached */ - return; - -anonymous_page: - entry = pte_wrprotect(mk_pte(ZERO_PAGE(address), vma->vm_page_prot)); - if (write_access) { - unsigned long page = get_user_page(address); - if (!page) - goto sigbus; - clear_page(page); - entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - vma->vm_mm->rss++; - tsk->min_flt++; - flush_page_to_ram(page); - } - put_page(page_table, entry); - return; - -sigbus: - force_sig(SIGBUS, current); - put_page(page_table, BAD_PAGE); - /* no need to invalidate, wasn't present */ - return; - -swap_page: - do_swap_page(tsk, vma, address, page_table, entry, write_access); - return; + return 1; } /* @@ -896,54 +889,57 @@ swap_page: * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). */ -static inline void handle_pte_fault(struct task_struct *tsk, +static inline int handle_pte_fault(struct task_struct *tsk, struct vm_area_struct * vma, unsigned long address, int write_access, pte_t * pte) { - pte_t entry = *pte; + pte_t entry; + + lock_kernel(); + entry = *pte; if (!pte_present(entry)) { - do_no_page(tsk, vma, address, write_access, pte, entry); - return; + if (pte_none(entry)) + return do_no_page(tsk, vma, address, write_access, pte); + return do_swap_page(tsk, vma, address, pte, entry, write_access); } + entry = pte_mkyoung(entry); set_pte(pte, entry); flush_tlb_page(vma, address); - if (!write_access) - return; - if (pte_write(entry)) { + if (write_access) { + if (!pte_write(entry)) + return do_wp_page(tsk, vma, address, pte); + entry = pte_mkdirty(entry); set_pte(pte, entry); flush_tlb_page(vma, address); - return; } - do_wp_page(tsk, vma, address, write_access, pte); + unlock_kernel(); + return 1; } /* * By the time we get here, we already hold the mm semaphore */ -void handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma, +int handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma, unsigned long address, int write_access) { pgd_t *pgd; pmd_t *pmd; - pte_t *pte; pgd = pgd_offset(vma->vm_mm, address); pmd = pmd_alloc(pgd, address); - if (!pmd) - goto no_memory; - pte = pte_alloc(pmd, address); - if (!pte) - goto no_memory; - lock_kernel(); - handle_pte_fault(tsk, vma, address, write_access, pte); - unlock_kernel(); - update_mmu_cache(vma, address, *pte); - return; -no_memory: - oom(tsk); + if (pmd) { + pte_t * pte = pte_alloc(pmd, address); + if (pte) { + if (handle_pte_fault(tsk, vma, address, write_access, pte)) { + update_mmu_cache(vma, address, *pte); + return 1; + } + } + } + return 0; } /* diff --git a/mm/mlock.c b/mm/mlock.c index 527443946..1c9035095 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -3,20 +3,12 @@ * * (C) Copyright 1995 Linus Torvalds */ -#include -#include -#include -#include #include #include -#include #include -#include -#include #include #include -#include #include static inline int mlock_fixup_all(struct vm_area_struct * vma, int newflags) diff --git a/mm/mmap.c b/mm/mmap.c index 77b0c5d62..4cbdbe3ca 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3,24 +3,17 @@ * * Written by obz. 
*/ -#include -#include -#include -#include #include #include -#include #include -#include #include #include -#include +#include #include #include #include #include -#include #include /* description of effects of mapping type and prot in current implementation. @@ -57,6 +50,12 @@ int vm_enough_memory(long pages) * simple, it hopefully works in most obvious cases.. Easy to * fool it, but this should catch most mistakes. */ + /* 23/11/98 NJC: Somewhat less stupid version of algorithm, + * which tries to do "TheRightThing". Instead of using half of + * (buffers+cache), use the minimum values. Allow an extra 2% + * of num_physpages for safety margin. + */ + long free; /* Sometimes we want to use more memory than we have. */ @@ -65,10 +64,9 @@ int vm_enough_memory(long pages) free = buffermem >> PAGE_SHIFT; free += page_cache_size; - free >>= 1; free += nr_free_pages; free += nr_swap_pages; - free -= num_physpages >> 4; + free -= (page_cache.min_percent + buffer_mem.min_percent + 2)*num_physpages/100; return free > pages; } @@ -93,7 +91,21 @@ asmlinkage unsigned long sys_brk(unsigned long brk) struct mm_struct *mm = current->mm; down(&mm->mmap_sem); + + /* + * This lock-kernel is one of the main contention points for + * certain normal loads. And it really should not be here: almost + * everything in brk()/mmap()/munmap() is protected sufficiently by + * the mmap semaphore that we got above. + * + * We should move this into the few things that really want the + * lock, namely anything that actually touches a file descriptor + * etc. We can do all the normal anonymous mapping cases without + * ever getting the lock at all - the actual memory management + * code is already completely thread-safe. + */ lock_kernel(); + if (brk < mm->end_code) goto out; newbrk = PAGE_ALIGN(brk); @@ -162,7 +174,7 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, { struct mm_struct * mm = current->mm; struct vm_area_struct * vma; - int correct_wcount = 0, error; + int error; if ((len = PAGE_ALIGN(len)) == 0) return addr; @@ -286,30 +298,28 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, !vm_enough_memory(len >> PAGE_SHIFT)) goto free_vma; - error = 0; if (file) { + int correct_wcount = 0; if (vma->vm_flags & VM_DENYWRITE) { - if (file->f_dentry->d_inode->i_writecount > 0) + if (file->f_dentry->d_inode->i_writecount > 0) { error = -ETXTBSY; - else { - /* f_op->mmap might possibly sleep - * (generic_file_mmap doesn't, but other code - * might). In any case, this takes care of any - * race that this might cause. - */ - file->f_dentry->d_inode->i_writecount--; - correct_wcount = 1; + goto free_vma; } + /* f_op->mmap might possibly sleep + * (generic_file_mmap doesn't, but other code + * might). In any case, this takes care of any + * race that this might cause. 
+ */ + file->f_dentry->d_inode->i_writecount--; + correct_wcount = 1; } - if (!error) - error = file->f_op->mmap(file, vma); - + error = file->f_op->mmap(file, vma); + /* Fix up the count if necessary, then check for an error */ + if (correct_wcount) + file->f_dentry->d_inode->i_writecount++; + if (error) + goto unmap_and_free_vma; } - /* Fix up the count if necessary, then check for an error */ - if (correct_wcount) - file->f_dentry->d_inode->i_writecount++; - if (error) - goto free_vma; /* * merge_segments may merge our vma, so we can't refer to it @@ -327,6 +337,11 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, } return addr; +unmap_and_free_vma: + /* Undo any partial mapping done by a device driver. */ + flush_cache_range(mm, vma->vm_start, vma->vm_end); + zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); + flush_tlb_range(mm, vma->vm_start, vma->vm_end); free_vma: kmem_cache_free(vm_area_cachep, vma); return error; @@ -418,6 +433,7 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr, mpnt->vm_ops = area->vm_ops; mpnt->vm_offset = area->vm_offset + (end - area->vm_start); mpnt->vm_file = area->vm_file; + mpnt->vm_pte = area->vm_pte; if (mpnt->vm_file) mpnt->vm_file->f_count++; if (mpnt->vm_ops && mpnt->vm_ops->open) diff --git a/mm/mprotect.c b/mm/mprotect.c index cc78e10ab..b28237c09 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -3,20 +3,12 @@ * * (C) Copyright 1994 Linus Torvalds */ -#include -#include -#include -#include -#include +#include #include #include -#include #include -#include -#include #include -#include #include static inline void change_pte_range(pmd_t * pmd, unsigned long address, diff --git a/mm/mremap.c b/mm/mremap.c index cd7a7eb4a..a10870318 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -4,21 +4,13 @@ * (C) Copyright 1996 Linus Torvalds */ -#include -#include -#include -#include -#include +#include #include #include -#include #include -#include -#include #include #include -#include #include extern int vm_enough_memory(long pages); @@ -142,7 +134,6 @@ static inline unsigned long move_vma(struct vm_area_struct * vma, new_vma->vm_start = new_addr; new_vma->vm_end = new_addr+new_len; new_vma->vm_offset = vma->vm_offset + (addr - vma->vm_start); - new_vma->vm_file = vma->vm_file; if (new_vma->vm_file) new_vma->vm_file->f_count++; if (new_vma->vm_ops && new_vma->vm_ops->open) @@ -151,6 +142,11 @@ static inline unsigned long move_vma(struct vm_area_struct * vma, merge_segments(current->mm, new_vma->vm_start, new_vma->vm_end); do_munmap(addr, old_len); current->mm->total_vm += new_len >> PAGE_SHIFT; + if (new_vma->vm_flags & VM_LOCKED) { + current->mm->locked_vm += new_len >> PAGE_SHIFT; + make_pages_present(new_vma->vm_start, + new_vma->vm_end); + } return new_addr; } kmem_cache_free(vm_area_cachep, new_vma); @@ -224,8 +220,11 @@ asmlinkage unsigned long sys_mremap(unsigned long addr, int pages = (new_len - old_len) >> PAGE_SHIFT; vma->vm_end = addr + new_len; current->mm->total_vm += pages; - if (vma->vm_flags & VM_LOCKED) + if (vma->vm_flags & VM_LOCKED) { current->mm->locked_vm += pages; + make_pages_present(addr + old_len, + addr + new_len); + } ret = addr; goto out; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 70cad74eb..7ceec01b9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7,25 +7,16 @@ #include #include -#include -#include #include -#include -#include -#include #include -#include #include #include #include #include #include -#include /* for cli()/sti() */ #include /* 
for copy_to/from_user */ -#include #include -#include int nr_swap_pages = 0; int nr_free_pages = 0; @@ -163,9 +154,11 @@ void __free_page(struct page *page) free_pages_ok(page->map_nr, 0); return; } +#if 0 if (PageSwapCache(page) && atomic_read(&page->count) == 1) printk(KERN_WARNING "VM: Releasing swap cache page at %p", __builtin_return_address(0)); +#endif } void free_pages(unsigned long addr, unsigned long order) @@ -182,10 +175,12 @@ void free_pages(unsigned long addr, unsigned long order) free_pages_ok(map_nr, order); return; } +#if 0 if (PageSwapCache(map) && atomic_read(&map->count) == 1) printk(KERN_WARNING "VM: Releasing swap cache pages at %p", __builtin_return_address(0)); +#endif } } @@ -227,7 +222,6 @@ do { unsigned long size = 1 << high; \ map += size; \ } \ atomic_set(&map->count, 1); \ - map->age = PAGE_INITIAL_AGE; \ } while (0) unsigned long __get_free_pages(int gfp_mask, unsigned long order) @@ -264,14 +258,15 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order) spin_unlock_irqrestore(&page_alloc_lock, flags); /* - * If we failed to find anything, we'll return NULL, but we'll - * wake up kswapd _now_ ad even wait for it synchronously if - * we can.. This way we'll at least make some forward progress - * over time. + * If we can schedule, do so, and make sure to yield. + * We may be a real-time process, and if kswapd is + * waiting for us we need to allow it to run a bit. */ - wake_up(&kswapd_wait); - if (gfp_mask & __GFP_WAIT) + if (gfp_mask & __GFP_WAIT) { + current->policy |= SCHED_YIELD; schedule(); + } + nopage: return 0; } @@ -372,12 +367,12 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m * was due to a write access. */ void swap_in(struct task_struct * tsk, struct vm_area_struct * vma, - unsigned long address, pte_t * page_table, unsigned long entry, int write_access) + pte_t * page_table, unsigned long entry, int write_access) { unsigned long page; struct page *page_map; - page_map = read_swap_cache(entry, address); + page_map = read_swap_cache(entry); if (pte_val(*page_table) != entry) { if (page_map) @@ -404,8 +399,9 @@ void swap_in(struct task_struct * tsk, struct vm_area_struct * vma, /* The page is unshared, and we want write access. In this case, it is safe to tear down the swap cache and give the page over entirely to this process. */ - - delete_from_swap_cache(page_map); + + if (PageSwapCache(page_map)) + delete_from_swap_cache(page_map); set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)))); return; } diff --git a/mm/page_io.c b/mm/page_io.c index 44f592df8..2dd24facc 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -10,21 +10,13 @@ */ #include -#include -#include #include -#include -#include -#include #include -#include #include #include #include -#include /* for cli()/sti() */ #include /* for copy_to/from_user */ -#include #include static struct wait_queue * lock_queue = NULL; @@ -66,6 +58,11 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) printk("Internal error: bad swap-device\n"); return; } + + /* Don't allow too many pending pages in flight.. 
*/ + if (atomic_read(&nr_async_pages) > SWAP_CLUSTER_MAX) + wait = 1; + p = &swap_info[type]; offset = SWP_OFFSET(entry); if (offset >= p->max) { diff --git a/mm/slab.c b/mm/slab.c index d4be178a2..29680bd68 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -654,9 +654,9 @@ kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp) } slabp->s_magic = SLAB_MAGIC_DESTROYED; - kmem_freepages(cachep, slabp->s_mem-slabp->s_offset); if (slabp->s_index) kmem_cache_free(cachep->c_index_cachep, slabp->s_index); + kmem_freepages(cachep, slabp->s_mem-slabp->s_offset); if (SLAB_OFF_SLAB(cachep->c_flags)) kmem_cache_free(cache_slabp, slabp); } @@ -1194,7 +1194,6 @@ kmem_cache_grow(kmem_cache_t * cachep, int flags) cachep->c_dflags = SLAB_CFLGS_GROWN; cachep->c_growing++; -re_try: spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); /* A series of memory allocations for a new slab. @@ -1261,15 +1260,6 @@ opps1: kmem_freepages(cachep, objp); failed: spin_lock_irq(&cachep->c_spinlock); - if (local_flags != SLAB_ATOMIC && cachep->c_gfporder) { - /* For large order (>0) slabs, we try again. - * Needed because the gfp() functions are not good at giving - * out contiguous pages unless pushed (but do not push too hard). - */ - if (cachep->c_failures++ < 4 && cachep->c_freep == kmem_slab_end(cachep)) - goto re_try; - cachep->c_failures = 1; /* Memory is low, don't try as hard next time. */ - } cachep->c_growing--; spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); return 0; @@ -1448,8 +1438,10 @@ alloc_new_slab: } /* Couldn't grow, but some objs may have been freed. */ spin_lock_irq(&cachep->c_spinlock); - if (cachep->c_freep != kmem_slab_end(cachep)) - goto try_again; + if (cachep->c_freep != kmem_slab_end(cachep)) { + if ((flags & SLAB_ATOMIC) == 0) + goto try_again; + } } else { /* Very serious error - maybe panic() here? 
*/ kmem_report_alloc_err("Bad slab magic (corrupt)", cachep); diff --git a/mm/swap.c b/mm/swap.c index 1788021b9..1e2d8c36b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -14,22 +14,14 @@ */ #include -#include -#include #include -#include -#include -#include #include -#include #include #include #include #include -#include /* for cli()/sti() */ #include /* for copy_to/from_user */ -#include #include /* @@ -70,13 +62,13 @@ swapstat_t swapstats = {0}; buffer_mem_t buffer_mem = { 5, /* minimum percent buffer */ - 25, /* borrow percent buffer */ + 10, /* borrow percent buffer */ 60 /* maximum percent buffer */ }; buffer_mem_t page_cache = { 5, /* minimum percent page cache */ - 30, /* borrow percent page cache */ + 15, /* borrow percent page cache */ 75 /* maximum */ }; diff --git a/mm/swap_state.c b/mm/swap_state.c index 2aaf0c46b..e098974b2 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -8,19 +8,12 @@ */ #include -#include -#include #include -#include -#include -#include #include -#include #include #include #include -#include #include /* @@ -143,6 +136,50 @@ bad_unused: goto out; } +int swap_count(unsigned long entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + int retval = 0; + + if (!entry) + goto bad_entry; + type = SWP_TYPE(entry); + if (type & SHM_SWP_TYPE) + goto out; + if (type >= nr_swapfiles) + goto bad_file; + p = type + swap_info; + offset = SWP_OFFSET(entry); + if (offset >= p->max) + goto bad_offset; + if (!p->swap_map[offset]) + goto bad_unused; + retval = p->swap_map[offset]; +#ifdef DEBUG_SWAP + printk("DebugVM: swap_count(entry %08lx, count %d)\n", + entry, retval); +#endif +out: + return retval; + +bad_entry: + printk(KERN_ERR "swap_count: null entry!\n"); + goto out; +bad_file: + printk(KERN_ERR + "swap_count: entry %08lx, nonexistent swap file!\n", entry); + goto out; +bad_offset: + printk(KERN_ERR + "swap_count: entry %08lx, offset exceeds max!\n", entry); + goto out; +bad_unused: + printk(KERN_ERR + "swap_count at %8p: entry %08lx, unused page!\n", + __builtin_return_address(0), entry); + goto out; +} static inline void remove_from_swap_cache(struct page *page) { @@ -155,6 +192,7 @@ static inline void remove_from_swap_cache(struct page *page) printk ("VM: Removing swap cache page with wrong inode hash " "on page %08lx\n", page_address(page)); } +#if 0 /* * This is a legal case, but warn about it. */ @@ -163,6 +201,7 @@ static inline void remove_from_swap_cache(struct page *page) "VM: Removing page cache on unshared page %08lx\n", page_address(page)); } +#endif #ifdef DEBUG_SWAP printk("DebugVM: remove_from_swap_cache(%08lx count %d)\n", @@ -173,24 +212,25 @@ static inline void remove_from_swap_cache(struct page *page) } +/* + * This must be called only on pages that have + * been verified to be in the swap cache. 
+ */ void delete_from_swap_cache(struct page *page) { + long entry = page->offset; + #ifdef SWAP_CACHE_INFO swap_cache_del_total++; -#endif - if (PageSwapCache (page)) { - long entry = page->offset; -#ifdef SWAP_CACHE_INFO - swap_cache_del_success++; + swap_cache_del_success++; #endif #ifdef DEBUG_SWAP - printk("DebugVM: delete_from_swap_cache(%08lx count %d, " - "entry %08lx)\n", - page_address(page), atomic_read(&page->count), entry); + printk("DebugVM: delete_from_swap_cache(%08lx count %d, " + "entry %08lx)\n", + page_address(page), atomic_read(&page->count), entry); #endif - remove_from_swap_cache (page); - swap_free (entry); - } + remove_from_swap_cache (page); + swap_free (entry); } /* @@ -208,7 +248,7 @@ void free_page_and_swap_cache(unsigned long addr) delete_from_swap_cache(page); } - free_user_page(page, addr); + free_page(addr); } @@ -249,7 +289,7 @@ out_bad: * the swap entry is no longer in use. */ -struct page * read_swap_cache_async(unsigned long entry, unsigned long addr, int wait) +struct page * read_swap_cache_async(unsigned long entry, int wait) { struct page *found_page, *new_page; unsigned long new_page_addr; diff --git a/mm/swapfile.c b/mm/swapfile.c index b7446b3b5..c574fb59a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -5,25 +5,16 @@ * Swap reorganised 29.12.95, Stephen Tweedie */ -#include -#include +#include #include -#include -#include #include -#include -#include -#include #include -#include #include -#include #include /* for blk_size */ #include #include #include -#include #include unsigned int nr_swapfiles = 0; @@ -317,14 +308,14 @@ static int try_to_unuse(unsigned int type) /* Get a page for the entry, using the existing swap cache page if there is one. Otherwise, get a clean page and read the swap into it. */ - page_map = read_swap_cache(entry, 0); + page_map = read_swap_cache(entry); if (!page_map) { /* * Continue searching if the entry became unused. 
*/ if (si->swap_map[i] == 0) continue; - return -ENOMEM; + return -ENOMEM; } page = page_address(page_map); read_lock(&tasklist_lock); @@ -559,8 +550,17 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) if (p->swap_device == swap_info[i].swap_device) goto bad_swap; } - } else if (!S_ISREG(swap_dentry->d_inode->i_mode)) + } else if (S_ISREG(swap_dentry->d_inode->i_mode)) { + error = -EBUSY; + for (i = 0 ; i < nr_swapfiles ; i++) { + if (i == type) + continue; + if (p->swap_file == swap_info[i].swap_file) + goto bad_swap; + } + } else goto bad_swap; + swap_header = (void *) __get_free_page(GFP_USER); if (!swap_header) { printk("Unable to start swapping: out of memory :-)\n"); @@ -627,7 +627,7 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) p->max = swap_header->info.last_page; if (p->max >= 0x7fffffffL/PAGE_SIZE || - (void *) &swap_header->info.badpages[swap_header->info.nr_badpages-1] >= (void *) swap_header->magic.magic) { + (void *) &swap_header->info.badpages[(int) swap_header->info.nr_badpages-1] >= (void *) swap_header->magic.magic) { error = -EINVAL; goto bad_swap; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e7711c23c..e99ad35fb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -5,11 +5,9 @@ */ #include -#include #include #include -#include static struct vm_struct * vmlist = NULL; @@ -38,8 +36,7 @@ static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned lo if (pte_none(page)) continue; if (pte_present(page)) { - free_user_page(mem_map + MAP_NR(pte_page(page)), - pte_page(page)); + free_page(pte_page(page)); continue; } printk("Whee.. Swapped out page in kernel page table\n"); @@ -97,7 +94,7 @@ static inline int alloc_area_pte(pte_t * pte, unsigned long address, unsigned lo unsigned long page; if (!pte_none(*pte)) printk("alloc_area_pte: page already exists\n"); - page = get_user_page(address); + page = __get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; set_pte(pte, mk_pte(page, prot)); diff --git a/mm/vmscan.c b/mm/vmscan.c index 884e67150..c5efa52a2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -10,39 +10,20 @@ * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $ */ -#include -#include -#include +#include #include -#include -#include #include #include #include -#include -#include -#include #include #include -#include #include -/* - * When are we next due for a page scan? - */ -static unsigned long next_swap_jiffies = 0; - -/* - * How often do we do a pageout scan during normal conditions? - * Default is four times a second. - */ -int swapout_interval = HZ / 4; - /* * The wait queue for waking up the pageout daemon: */ -struct wait_queue * kswapd_wait = NULL; +static struct task_struct * kswapd_task = NULL; static void init_swap_timer(void); @@ -123,8 +104,13 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc } if (pte_young(pte)) { + /* + * Transfer the "accessed" bit from the page + * tables to the global page map. + */ set_pte(page_table, pte_mkold(pte)); - touch_page(page_map); + set_bit(PG_referenced, &page_map->flags); + /* * We should test here to see if we want to recover any * swap cache page here. 
We do this if the page seeing @@ -137,10 +123,6 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc return 0; } - age_page(page_map); - if (page_map->age) - return 0; - if (pte_dirty(pte)) { if (vma->vm_ops && vma->vm_ops->swapout) { pid_t pid = tsk->pid; @@ -180,7 +162,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc * copy in memory, so we add it to the swap * cache. */ if (PageSwapCache(page_map)) { - free_page_and_swap_cache(page); + free_page(page); return (atomic_read(&page_map->count) == 0); } add_to_swap_cache(page_map, entry); @@ -198,7 +180,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc * asynchronously. That's no problem, shrink_mmap() can * correctly clean up the occassional unshared page * which gets left behind in the swap cache. */ - free_page_and_swap_cache(page); + free_page(page); return 1; /* we slept: the process may not exist any more */ } @@ -212,7 +194,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc set_pte(page_table, __pte(entry)); flush_tlb_page(vma, address); swap_duplicate(entry); - free_page_and_swap_cache(page); + free_page(page); return (atomic_read(&page_map->count) == 0); } /* @@ -228,7 +210,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc flush_cache_page(vma, address); pte_clear(page_table); flush_tlb_page(vma, address); - entry = page_unuse(page_map); + entry = (atomic_read(&page_map->count) == 1); __free_page(page_map); return entry; } @@ -310,8 +292,9 @@ static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * } static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma, - pgd_t *pgdir, unsigned long start, int gfp_mask) + unsigned long address, int gfp_mask) { + pgd_t *pgdir; unsigned long end; /* Don't swap out areas like shared memory which have their @@ -319,12 +302,14 @@ static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma, if (vma->vm_flags & (VM_SHM | VM_LOCKED)) return 0; + pgdir = pgd_offset(tsk->mm, address); + end = vma->vm_end; - while (start < end) { - int result = swap_out_pgd(tsk, vma, pgdir, start, end, gfp_mask); + while (address < end) { + int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask); if (result) return result; - start = (start + PGDIR_SIZE) & PGDIR_MASK; + address = (address + PGDIR_SIZE) & PGDIR_MASK; pgdir++; } return 0; @@ -344,22 +329,23 @@ static int swap_out_process(struct task_struct * p, int gfp_mask) * Find the proper vm-area */ vma = find_vma(p->mm, address); - if (!vma) { - p->swap_address = 0; - return 0; + if (vma) { + if (address < vma->vm_start) + address = vma->vm_start; + + for (;;) { + int result = swap_out_vma(p, vma, address, gfp_mask); + if (result) + return result; + vma = vma->vm_next; + if (!vma) + break; + address = vma->vm_start; + } } - if (address < vma->vm_start) - address = vma->vm_start; - for (;;) { - int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, gfp_mask); - if (result) - return result; - vma = vma->vm_next; - if (!vma) - break; - address = vma->vm_start; - } + /* We didn't find anything for the process */ + p->swap_cnt = 0; p->swap_address = 0; return 0; } @@ -420,20 +406,12 @@ static int swap_out(unsigned int priority, int gfp_mask) } pbest->swap_cnt--; - switch (swap_out_process(pbest, gfp_mask)) { - case 0: - /* - * Clear swap_cnt so we don't look at this task - * again until we've tried all of the 
others. - * (We didn't block, so the task is still here.) - */ - pbest->swap_cnt = 0; - break; - case 1: - return 1; - default: - break; - }; + /* + * Nonzero means we cleared out something, but only "1" means + * that we actually free'd up a page as a result. + */ + if (swap_out_process(pbest, gfp_mask) == 1) + return 1; } out: return 0; @@ -448,19 +426,12 @@ static int do_try_to_free_page(int gfp_mask) { static int state = 0; int i=6; - int stop; /* Always trim SLAB caches when memory gets low. */ kmem_cache_reap(gfp_mask); - /* We try harder if we are waiting .. */ - stop = 3; - if (gfp_mask & __GFP_WAIT) - stop = 0; - - if (((buffermem >> PAGE_SHIFT) * 100 > buffer_mem.borrow_percent * num_physpages) - || (page_cache_size * 100 > page_cache.borrow_percent * num_physpages)) - shrink_mmap(i, gfp_mask); + if (buffer_over_borrow() || pgcache_over_borrow()) + state = 0; switch (state) { do { @@ -480,7 +451,7 @@ static int do_try_to_free_page(int gfp_mask) shrink_dcache_memory(i, gfp_mask); state = 0; i--; - } while ((i - stop) >= 0); + } while (i >= 0); } return 0; } @@ -510,10 +481,9 @@ void __init kswapd_setup(void) */ int kswapd(void *unused) { - struct wait_queue wait = { current, NULL }; current->session = 1; current->pgrp = 1; - sprintf(current->comm, "kswapd"); + strcpy(current->comm, "kswapd"); sigfillset(¤t->blocked); /* @@ -523,11 +493,12 @@ int kswapd(void *unused) */ lock_kernel(); - /* Give kswapd a realtime priority. */ - current->policy = SCHED_FIFO; - current->rt_priority = 32; /* Fixme --- we need to standardise our - namings for POSIX.4 realtime scheduling - priorities. */ + /* + * Set the base priority to something smaller than a + * regular process. We will scale up the priority + * dynamically depending on how much memory we need. + */ + current->priority = (DEF_PRIORITY * 2) / 3; /* * Tell the memory management that we're a "memory allocator", @@ -544,9 +515,9 @@ int kswapd(void *unused) current->flags |= PF_MEMALLOC; init_swap_timer(); - add_wait_queue(&kswapd_wait, &wait); + kswapd_task = current; while (1) { - int tries; + unsigned long end_time; current->state = TASK_INTERRUPTIBLE; flush_signals(current); @@ -554,39 +525,17 @@ int kswapd(void *unused) schedule(); swapstats.wakeups++; - /* - * Do the background pageout: be - * more aggressive if we're really - * low on free memory. - * - * We try page_daemon.tries_base times, divided by - * an 'urgency factor'. In practice this will mean - * a value of pager_daemon.tries_base / 8 or 4 = 64 - * or 128 pages at a time. - * This gives us 64 (or 128) * 4k * 4 (times/sec) = - * 1 (or 2) MB/s swapping bandwidth in low-priority - * background paging. This number rises to 8 MB/s - * when the priority is highest (but then we'll be - * woken up more often and the rate will be even - * higher). - */ - tries = pager_daemon.tries_base; - tries >>= 4*free_memory_available(); - + /* max one hundreth of a second */ + end_time = jiffies + (HZ-1)/100; do { - do_try_to_free_page(0); - /* - * Syncing large chunks is faster than swapping - * synchronously (less head movement). -- Rik. 
- */ - if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster) - run_task_queue(&tq_disk); - if (free_memory_available() > 1) + if (!do_try_to_free_page(0)) break; - } while (--tries > 0); + if (nr_free_pages > freepages.high + SWAP_CLUSTER_MAX) + break; + } while (time_before_eq(jiffies,end_time)); } /* As if we could ever get here - maybe we want to make this killable */ - remove_wait_queue(&kswapd_wait, &wait); + kswapd_task = NULL; unlock_kernel(); return 0; } @@ -620,42 +569,61 @@ int try_to_free_pages(unsigned int gfp_mask, int count) return retval; } +/* + * Wake up kswapd according to the priority + * 0 - no wakeup + * 1 - wake up as a low-priority process + * 2 - wake up as a normal process + * 3 - wake up as an almost real-time process + * + * This plays mind-games with the "goodness()" + * function in kernel/sched.c. + */ +static inline void kswapd_wakeup(struct task_struct *p, int priority) +{ + if (priority) { + p->counter = p->priority << priority; + wake_up_process(p); + } +} + /* * The swap_tick function gets called on every clock tick. */ void swap_tick(void) { - unsigned long now, want; - int want_wakeup = 0; - - want = next_swap_jiffies; - now = jiffies; + struct task_struct *p = kswapd_task; /* - * Examine the memory queues. Mark memory low - * if there is nothing available in the three - * highest queues. - * - * Schedule for wakeup if there isn't lots - * of free memory. + * Only bother to try to wake kswapd up + * if the task exists and can be woken. */ - switch (free_memory_available()) { - case 0: - want = now; - /* Fall through */ - case 1: - want_wakeup = 1; - default: - } - - if ((long) (now - want) >= 0) { - if (want_wakeup || (num_physpages * buffer_mem.max_percent) < (buffermem >> PAGE_SHIFT) * 100 - || (num_physpages * page_cache.max_percent < page_cache_size * 100)) { - /* Set the next wake-up time */ - next_swap_jiffies = now + swapout_interval; - wake_up(&kswapd_wait); - } + if (p && (p->state & TASK_INTERRUPTIBLE)) { + unsigned int pages; + int want_wakeup; + + /* + * Schedule for wakeup if there isn't lots + * of free memory or if there is too much + * of it used for buffers or pgcache. + * + * "want_wakeup" is our priority: 0 means + * not to wake anything up, while 3 means + * that we'd better give kswapd a realtime + * priority. + */ + want_wakeup = 0; + pages = nr_free_pages; + if (pages < freepages.high) + want_wakeup = 1; + if (pages < freepages.low) + want_wakeup = 2; + if (pages < freepages.min) + want_wakeup = 3; + + kswapd_wakeup(p,want_wakeup); } + timer_active |= (1<