author     Ralf Baechle <ralf@linux-mips.org>   1999-01-04 16:03:48 +0000
committer  Ralf Baechle <ralf@linux-mips.org>   1999-01-04 16:03:48 +0000
commit     78c388aed2b7184182c08428db1de6c872d815f5 (patch)
tree       4b2003b1b4ceb241a17faa995da8dd1004bb8e45 /mm
parent     eb7a5bf93aaa4be1d7c6181100ab7639e74d67f7 (diff)
Merge with Linux 2.1.131 and more MIPS goodies.
(Did I mention that CVS is buggy ...)
Diffstat (limited to 'mm')
 mm/filemap.c    | 145
 mm/memory.c     | 224
 mm/mlock.c      |   8
 mm/mmap.c       |  74
 mm/mprotect.c   |  10
 mm/mremap.c     |  21
 mm/page_alloc.c |  36
 mm/page_io.c    |  13
 mm/slab.c       |  18
 mm/swap.c       |  12
 mm/swap_state.c |  80
 mm/swapfile.c   |  28
 mm/vmalloc.c    |   7
 mm/vmscan.c     | 244
 14 files changed, 443 insertions(+), 477 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index ffda2b7c1..227bcd5a9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -9,26 +9,17 @@
* most "normal" filesystems (but you don't /have/ to use this:
* the NFS filesystem used to do this differently, for example)
*/
-#include <linux/stat.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
+#include <linux/malloc.h>
#include <linux/shm.h>
-#include <linux/errno.h>
#include <linux/mman.h>
-#include <linux/string.h>
-#include <linux/malloc.h>
-#include <linux/fs.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
-#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
-#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
@@ -153,7 +144,7 @@ static inline int shrink_one_page(struct page *page, int gfp_mask)
} while (tmp != bh);
/* Refuse to swap out all buffer pages */
- if ((buffermem >> PAGE_SHIFT) * 100 < (buffer_mem.min_percent * num_physpages))
+ if (buffer_under_min())
goto next;
}
@@ -167,14 +158,9 @@ static inline int shrink_one_page(struct page *page, int gfp_mask)
case 1:
/* is it a swap-cache or page-cache page? */
if (page->inode) {
- if (test_and_clear_bit(PG_referenced, &page->flags)) {
- touch_page(page);
- break;
- }
- age_page(page);
- if (page->age)
+ if (test_and_clear_bit(PG_referenced, &page->flags))
break;
- if (page_cache_size * 100 < (page_cache.min_percent * num_physpages))
+ if (pgcache_under_min())
break;
if (PageSwapCache(page)) {
delete_from_swap_cache(page);
@@ -188,6 +174,9 @@ static inline int shrink_one_page(struct page *page, int gfp_mask)
if (test_and_clear_bit(PG_referenced, &page->flags))
break;
+ if (buffer_under_min())
+ break;
+
/* is it a buffer cache page? */
if (bh && try_to_free_buffer(bh, &bh, 6))
return 1;
@@ -211,7 +200,7 @@ int shrink_mmap(int priority, int gfp_mask)
struct page * page;
int count_max, count_min;
- count_max = (limit<<2) >> (priority>>1);
+ count_max = limit;
count_min = (limit<<2) >> (priority);
page = mem_map + clock;
@@ -225,7 +214,15 @@ int shrink_mmap(int priority, int gfp_mask)
if (shrink_one_page(page, gfp_mask))
return 1;
count_max--;
- if (page->inode || page->buffers)
+ /*
+ * If the page we looked at was recyclable but we didn't
+ * reclaim it (presumably due to PG_referenced), don't
+ * count it as scanned. This way, the more referenced
+ * page cache pages we encounter, the more rapidly we
+ * will age them.
+ */
+ if (atomic_read(&page->count) != 1 ||
+ (!page->inode && !page->buffers))
count_min--;
page++;
clock++;
@@ -292,7 +289,7 @@ static inline void add_to_page_cache(struct page * page,
struct page **hash)
{
atomic_inc(&page->count);
- page->flags &= ~((1 << PG_uptodate) | (1 << PG_error));
+ page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced);
page->offset = offset;
add_page_to_inode_queue(inode, page);
__add_page_to_hash_queue(page, hash);
@@ -313,7 +310,7 @@ static unsigned long try_to_read_ahead(struct file * file,
offset &= PAGE_MASK;
switch (page_cache) {
case 0:
- page_cache = get_user_page(offset);
+ page_cache = __get_free_page(GFP_USER);
if (!page_cache)
break;
default:
@@ -327,7 +324,6 @@ static unsigned long try_to_read_ahead(struct file * file,
*/
page = mem_map + MAP_NR(page_cache);
add_to_page_cache(page, inode, offset, hash);
- set_bit(PG_referenced, &page->flags);
inode->i_op->readpage(file, page);
page_cache = 0;
}
@@ -736,7 +732,7 @@ no_cached_page:
* page..
*/
if (!page_cache) {
- page_cache = get_user_page(pos & PAGE_MASK);
+ page_cache = __get_free_page(GFP_USER);
/*
* That could have slept, so go around to the
* very beginning..
@@ -1002,7 +998,7 @@ found_page:
* extra page -- better to overlap the allocation with the I/O.
*/
if (no_share && !new_page) {
- new_page = get_user_page(address);
+ new_page = __get_free_page(GFP_USER);
if (!new_page)
goto failure;
}
@@ -1039,7 +1035,7 @@ success:
return new_page;
no_cached_page:
- new_page = get_user_page(address);
+ new_page = __get_free_page(GFP_USER);
if (!new_page)
goto no_page;
@@ -1067,8 +1063,7 @@ no_cached_page:
* Do a very limited read-ahead if appropriate
*/
if (PageLocked(page))
- new_page = try_to_read_ahead(file, offset + PAGE_SIZE,
- get_user_page(address + PAGE_SIZE));
+ new_page = try_to_read_ahead(file, offset + PAGE_SIZE, 0);
goto found_page;
page_locked_wait:
@@ -1520,39 +1515,58 @@ generic_file_write(struct file *file, const char *buf,
{
struct dentry *dentry = file->f_dentry;
struct inode *inode = dentry->d_inode;
+ unsigned long pos = *ppos;
+ unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
struct page *page, **hash;
unsigned long page_cache = 0;
- unsigned long pgpos, offset;
- unsigned long bytes, written;
- unsigned long pos;
- long status, sync, didread;
+ unsigned long written;
+ long status, sync;
if (!inode->i_op || !inode->i_op->updatepage)
return -EIO;
sync = file->f_flags & O_SYNC;
- pos = *ppos;
written = 0;
- status = 0;
if (file->f_flags & O_APPEND)
pos = inode->i_size;
+ /*
+ * Check whether we've reached the file size limit.
+ */
+ status = -EFBIG;
+ if (pos >= limit) {
+ send_sig(SIGXFSZ, current, 0);
+ goto out;
+ }
+
+ status = 0;
+ /*
+ * Check whether to truncate the write,
+ * and send the signal if we do.
+ */
+ if (count > limit - pos) {
+ send_sig(SIGXFSZ, current, 0);
+ count = limit - pos;
+ }
+
while (count) {
+ unsigned long bytes, pgpos, offset;
/*
* Try to find the page in the cache. If it isn't there,
* allocate a free page.
*/
offset = (pos & ~PAGE_MASK);
pgpos = pos & PAGE_MASK;
-
- if ((bytes = PAGE_SIZE - offset) > count)
+ bytes = PAGE_SIZE - offset;
+ if (bytes > count)
bytes = count;
hash = page_hash(inode, pgpos);
- if (!(page = __find_page(inode, pgpos, *hash))) {
+ page = __find_page(inode, pgpos, *hash);
+ if (!page) {
if (!page_cache) {
- page_cache = get_user_page(pgpos);
+ page_cache = __get_free_page(GFP_USER);
if (page_cache)
continue;
status = -ENOMEM;
@@ -1563,51 +1577,25 @@ generic_file_write(struct file *file, const char *buf,
page_cache = 0;
}
- /*
- * Note: setting of the PG_locked bit is handled
- * below the i_op->xxx interface.
- */
- didread = 0;
-page_wait:
+ /* Get exclusive IO access to the page.. */
wait_on_page(page);
- if (PageUptodate(page))
- goto do_update_page;
+ set_bit(PG_locked, &page->flags);
/*
- * The page is not up-to-date ... if we're writing less
- * than a full page of data, we may have to read it first.
- * But if the page is past the current end of file, we must
- * clear it before updating.
+ * Do the real work.. If the writer ends up delaying the write,
+ * the writer needs to increment the page use counts until he
+ * is done with the page.
*/
- if (bytes < PAGE_SIZE) {
- if (pgpos < inode->i_size) {
- status = -EIO;
- if (didread >= 2)
- goto done_with_page;
- status = inode->i_op->readpage(file, page);
- if (status < 0)
- goto done_with_page;
- didread++;
- goto page_wait;
- } else {
- /* Must clear for partial writes */
- memset((void *) page_address(page), 0,
- PAGE_SIZE);
- }
- }
- /*
- * N.B. We should defer setting PG_uptodate at least until
- * the data is copied. A failure in i_op->updatepage() could
- * leave the page with garbage data.
- */
- set_bit(PG_uptodate, &page->flags);
-
-do_update_page:
- /* All right, the page is there. Now update it. */
- status = inode->i_op->updatepage(file, page, buf,
- offset, bytes, sync);
-done_with_page:
+ bytes -= copy_from_user((u8*)page_address(page) + offset, buf, bytes);
+ status = -EFAULT;
+ if (bytes)
+ status = inode->i_op->updatepage(file, page, offset, bytes, sync);
+
+ /* Mark it unlocked again and drop the page.. */
+ clear_bit(PG_locked, &page->flags);
+ wake_up(&page->wait);
__free_page(page);
+
if (status < 0)
break;
@@ -1622,6 +1610,7 @@ done_with_page:
if (page_cache)
free_page(page_cache);
+out:
return written ? written : status;
}
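
The rewritten generic_file_write() above checks the caller's RLIMIT_FSIZE before copying anything: a write starting at or beyond the limit fails with -EFBIG and raises SIGXFSZ, while a write that would cross the limit is truncated to it (also with SIGXFSZ). Below is a minimal user-space sketch of just that clamping arithmetic; clamp_write() and the sample sizes are illustrative, not kernel code.

#include <stdio.h>

#define EFBIG 27	/* "File too large", as in the kernel's errno */

/*
 * Return how many bytes of a 'count'-byte write at offset 'pos'
 * are allowed under a file-size limit, or -EFBIG if none are.
 * The kernel additionally sends SIGXFSZ in both clamping cases.
 */
static long clamp_write(unsigned long pos, unsigned long count,
			unsigned long limit)
{
	if (pos >= limit)
		return -EFBIG;		/* already at or past the limit */
	if (count > limit - pos)
		count = limit - pos;	/* truncate the write */
	return (long) count;
}

int main(void)
{
	unsigned long limit = 1024 * 1024;	/* pretend RLIMIT_FSIZE = 1 MB */

	printf("%ld\n", clamp_write(limit - 4096, 65536, limit));	/* 4096 */
	printf("%ld\n", clamp_write(limit, 65536, limit));		/* -27  */
	return 0;
}
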
diff --git a/mm/memory.c b/mm/memory.c
index 388d9ce03..932c35648 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -33,23 +33,13 @@
* Idea by Alex Bligh (alex@cconcepts.co.uk)
*/
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
#include <linux/mm.h>
+#include <linux/mman.h>
#include <linux/swap.h>
-#include <linux/smp.h>
#include <linux/smp_lock.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
-#include <asm/string.h>
unsigned long max_mapnr = 0;
unsigned long num_physpages = 0;
@@ -289,10 +279,6 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
}
if (cow)
pte = pte_wrprotect(pte);
-#if 0 /* No longer needed with the new swap cache code */
- if (delete_from_swap_cache(&mem_map[page_nr]))
- pte = pte_mkdirty(pte);
-#endif
set_pte(dst_pte, pte_mkold(pte));
set_pte(src_pte, pte);
atomic_inc(&mem_map[page_nr].count);
@@ -635,15 +621,15 @@ unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsig
* change only once the write actually happens. This avoids a few races,
* and potentially makes it more efficient.
*/
-static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
- unsigned long address, int write_access, pte_t *page_table)
+static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
+ unsigned long address, pte_t *page_table)
{
pte_t pte;
unsigned long old_page, new_page;
struct page * page_map;
pte = *page_table;
- new_page = get_user_page(address);
+ new_page = __get_free_page(GFP_USER);
/* Did someone else copy this page for us while we slept? */
if (pte_val(*page_table) != pte_val(pte))
goto end_wp_page;
@@ -661,40 +647,42 @@ static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
* Do we need to copy?
*/
if (is_page_shared(page_map)) {
- if (new_page) {
- if (PageReserved(mem_map + MAP_NR(old_page)))
- ++vma->vm_mm->rss;
- copy_cow_page(old_page,new_page);
- flush_page_to_ram(old_page);
- flush_page_to_ram(new_page);
- flush_cache_page(vma, address);
- set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
- free_page(old_page);
- flush_tlb_page(vma, address);
- return;
- }
+ unlock_kernel();
+ if (!new_page)
+ return 0;
+
+ if (PageReserved(mem_map + MAP_NR(old_page)))
+ ++vma->vm_mm->rss;
+ copy_cow_page(old_page,new_page);
+ flush_page_to_ram(old_page);
+ flush_page_to_ram(new_page);
flush_cache_page(vma, address);
- set_pte(page_table, BAD_PAGE);
- flush_tlb_page(vma, address);
+ set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
free_page(old_page);
- oom(tsk);
- return;
+ flush_tlb_page(vma, address);
+ return 1;
}
+
if (PageSwapCache(page_map))
delete_from_swap_cache(page_map);
+
+ /* We can release the kernel lock now.. */
+ unlock_kernel();
+
flush_cache_page(vma, address);
set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
flush_tlb_page(vma, address);
+end_wp_page:
if (new_page)
free_page(new_page);
- return;
+ return 1;
+
bad_wp_page:
printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
send_sig(SIGKILL, tsk, 1);
-end_wp_page:
if (new_page)
free_page(new_page);
- return;
+ return 0;
}
/*
@@ -783,30 +771,53 @@ void vmtruncate(struct inode * inode, unsigned long offset)
}
-static inline void do_swap_page(struct task_struct * tsk,
+/*
+ * This is called with the kernel lock held, we need
+ * to return without it.
+ */
+static int do_swap_page(struct task_struct * tsk,
struct vm_area_struct * vma, unsigned long address,
pte_t * page_table, pte_t entry, int write_access)
{
- pte_t page;
-
if (!vma->vm_ops || !vma->vm_ops->swapin) {
- swap_in(tsk, vma, address, page_table, pte_val(entry), write_access);
+ swap_in(tsk, vma, page_table, pte_val(entry), write_access);
flush_page_to_ram(pte_page(*page_table));
- return;
+ } else {
+ pte_t page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
+ if (pte_val(*page_table) != pte_val(entry)) {
+ free_page(pte_page(page));
+ } else {
+ if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 &&
+ !(vma->vm_flags & VM_SHARED))
+ page = pte_wrprotect(page);
+ ++vma->vm_mm->rss;
+ ++tsk->maj_flt;
+ flush_page_to_ram(pte_page(page));
+ set_pte(page_table, page);
+ }
}
- page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
- if (pte_val(*page_table) != pte_val(entry)) {
- free_page(pte_page(page));
- return;
+ unlock_kernel();
+ return 1;
+}
+
+/*
+ * This only needs the MM semaphore
+ */
+static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
+{
+ pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
+ if (write_access) {
+ unsigned long page = __get_free_page(GFP_USER);
+ if (!page)
+ return 0;
+ clear_page(page);
+ entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+ vma->vm_mm->rss++;
+ tsk->min_flt++;
+ flush_page_to_ram(page);
}
- if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 &&
- !(vma->vm_flags & VM_SHARED))
- page = pte_wrprotect(page);
- ++vma->vm_mm->rss;
- ++tsk->maj_flt;
- flush_page_to_ram(pte_page(page));
- set_pte(page_table, page);
- return;
+ put_page(page_table, entry);
+ return 1;
}
/*
@@ -817,26 +828,34 @@ static inline void do_swap_page(struct task_struct * tsk,
*
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
+ *
+ * This is called with the MM semaphore and the kernel lock held.
+ * We need to release the kernel lock as soon as possible..
*/
-static void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
- unsigned long address, int write_access, pte_t *page_table, pte_t entry)
+static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
+ unsigned long address, int write_access, pte_t *page_table)
{
unsigned long page;
+ pte_t entry;
+
+ if (!vma->vm_ops || !vma->vm_ops->nopage) {
+ unlock_kernel();
+ return do_anonymous_page(tsk, vma, page_table, write_access,
+ address);
+ }
- if (!pte_none(entry))
- goto swap_page;
- address &= PAGE_MASK;
- if (!vma->vm_ops || !vma->vm_ops->nopage)
- goto anonymous_page;
/*
* The third argument is "no_share", which tells the low-level code
* to copy, not share the page even if sharing is possible. It's
- * essentially an early COW detection
+ * essentially an early COW detection.
*/
- page = vma->vm_ops->nopage(vma, address,
+ page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
(vma->vm_flags & VM_SHARED)?0:write_access);
+
+ unlock_kernel();
if (!page)
- goto sigbus;
+ return 0;
+
++tsk->maj_flt;
++vma->vm_mm->rss;
/*
@@ -849,7 +868,6 @@ static void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
* so we can make it writable and dirty to avoid having to
* handle that later.
*/
-/* do_no_page might already have flushed the page ... */
flush_page_to_ram(page);
entry = mk_pte(page, vma->vm_page_prot);
if (write_access) {
@@ -859,32 +877,7 @@ static void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
entry = pte_wrprotect(entry);
put_page(page_table, entry);
/* no need to invalidate: a not-present page shouldn't be cached */
- return;
-
-anonymous_page:
- entry = pte_wrprotect(mk_pte(ZERO_PAGE(address), vma->vm_page_prot));
- if (write_access) {
- unsigned long page = get_user_page(address);
- if (!page)
- goto sigbus;
- clear_page(page);
- entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
- vma->vm_mm->rss++;
- tsk->min_flt++;
- flush_page_to_ram(page);
- }
- put_page(page_table, entry);
- return;
-
-sigbus:
- force_sig(SIGBUS, current);
- put_page(page_table, BAD_PAGE);
- /* no need to invalidate, wasn't present */
- return;
-
-swap_page:
- do_swap_page(tsk, vma, address, page_table, entry, write_access);
- return;
+ return 1;
}
/*
@@ -896,54 +889,57 @@ swap_page:
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*/
-static inline void handle_pte_fault(struct task_struct *tsk,
+static inline int handle_pte_fault(struct task_struct *tsk,
struct vm_area_struct * vma, unsigned long address,
int write_access, pte_t * pte)
{
- pte_t entry = *pte;
+ pte_t entry;
+
+ lock_kernel();
+ entry = *pte;
if (!pte_present(entry)) {
- do_no_page(tsk, vma, address, write_access, pte, entry);
- return;
+ if (pte_none(entry))
+ return do_no_page(tsk, vma, address, write_access, pte);
+ return do_swap_page(tsk, vma, address, pte, entry, write_access);
}
+
entry = pte_mkyoung(entry);
set_pte(pte, entry);
flush_tlb_page(vma, address);
- if (!write_access)
- return;
- if (pte_write(entry)) {
+ if (write_access) {
+ if (!pte_write(entry))
+ return do_wp_page(tsk, vma, address, pte);
+
entry = pte_mkdirty(entry);
set_pte(pte, entry);
flush_tlb_page(vma, address);
- return;
}
- do_wp_page(tsk, vma, address, write_access, pte);
+ unlock_kernel();
+ return 1;
}
/*
* By the time we get here, we already hold the mm semaphore
*/
-void handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
+int handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
unsigned long address, int write_access)
{
pgd_t *pgd;
pmd_t *pmd;
- pte_t *pte;
pgd = pgd_offset(vma->vm_mm, address);
pmd = pmd_alloc(pgd, address);
- if (!pmd)
- goto no_memory;
- pte = pte_alloc(pmd, address);
- if (!pte)
- goto no_memory;
- lock_kernel();
- handle_pte_fault(tsk, vma, address, write_access, pte);
- unlock_kernel();
- update_mmu_cache(vma, address, *pte);
- return;
-no_memory:
- oom(tsk);
+ if (pmd) {
+ pte_t * pte = pte_alloc(pmd, address);
+ if (pte) {
+ if (handle_pte_fault(tsk, vma, address, write_access, pte)) {
+ update_mmu_cache(vma, address, *pte);
+ return 1;
+ }
+ }
+ }
+ return 0;
}
/*
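
The new do_anonymous_page() above splits anonymous faults out of do_no_page(): a read fault maps the global zero page write-protected, while a write fault allocates and clears a private page and maps it writable and dirty. A stand-alone sketch of that decision follows; fake_pte and the malloc-style allocation are stand-ins for the kernel's pte_t and __get_free_page(GFP_USER), chosen only so the example runs on its own.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096u

/* Toy stand-in for a pte: which page is mapped and how. */
struct fake_pte {
	void *page;
	int writable;
	int dirty;
};

/*
 * Read faults map the shared zero page write-protected; write
 * faults get a freshly cleared private page, mapped writable and
 * dirty.  Returns 0 on allocation failure, 1 on success, mirroring
 * the int return the patch gives the fault handlers.
 */
static int do_anonymous_page(struct fake_pte *pte, int write_access,
			     void *zero_page)
{
	if (!write_access) {
		pte->page = zero_page;
		pte->writable = 0;
		pte->dirty = 0;
		return 1;
	}
	pte->page = calloc(1, PAGE_SIZE);
	if (!pte->page)
		return 0;
	pte->writable = 1;
	pte->dirty = 1;
	return 1;
}

int main(void)
{
	static char zero_page[PAGE_SIZE];
	struct fake_pte pte;

	do_anonymous_page(&pte, 0, zero_page);
	printf("read fault:  shared zero page=%d writable=%d\n",
	       pte.page == (void *) zero_page, pte.writable);

	do_anonymous_page(&pte, 1, zero_page);
	printf("write fault: shared zero page=%d writable=%d\n",
	       pte.page == (void *) zero_page, pte.writable);
	free(pte.page);
	return 0;
}
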
diff --git a/mm/mlock.c b/mm/mlock.c
index 527443946..1c9035095 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -3,20 +3,12 @@
*
* (C) Copyright 1995 Linus Torvalds
*/
-#include <linux/stat.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/shm.h>
-#include <linux/errno.h>
#include <linux/mman.h>
-#include <linux/string.h>
-#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <asm/pgtable.h>
static inline int mlock_fixup_all(struct vm_area_struct * vma, int newflags)
diff --git a/mm/mmap.c b/mm/mmap.c
index 77b0c5d62..4cbdbe3ca 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3,24 +3,17 @@
*
* Written by obz.
*/
-#include <linux/stat.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/shm.h>
-#include <linux/errno.h>
#include <linux/mman.h>
-#include <linux/string.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
-#include <linux/smp.h>
+#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/file.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <asm/pgtable.h>
/* description of effects of mapping type and prot in current implementation.
@@ -57,6 +50,12 @@ int vm_enough_memory(long pages)
* simple, it hopefully works in most obvious cases.. Easy to
* fool it, but this should catch most mistakes.
*/
+ /* 23/11/98 NJC: Somewhat less stupid version of algorithm,
+ * which tries to do "TheRightThing". Instead of using half of
+ * (buffers+cache), use the minimum values. Allow an extra 2%
+ * of num_physpages for safety margin.
+ */
+
long free;
/* Sometimes we want to use more memory than we have. */
@@ -65,10 +64,9 @@ int vm_enough_memory(long pages)
free = buffermem >> PAGE_SHIFT;
free += page_cache_size;
- free >>= 1;
free += nr_free_pages;
free += nr_swap_pages;
- free -= num_physpages >> 4;
+ free -= (page_cache.min_percent + buffer_mem.min_percent + 2)*num_physpages/100;
return free > pages;
}
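
The vm_enough_memory() hunk above replaces the old "half of buffers plus cache" guess with the sum of buffer, page-cache, free and swap pages minus a reserve of (page-cache min% + buffer min% + 2%) of physical memory. A stand-alone sketch of that arithmetic; the page counts are made-up sample values, and the 5%/5% minimums are the defaults this patch sets in mm/swap.c.

#include <stdio.h>

/*
 * free = buffer pages + page-cache pages + free pages + swap pages,
 * minus a reserve of (page-cache min% + buffer min% + 2%) of all
 * physical pages; the request succeeds if 'free' exceeds it.
 */
static int vm_enough_memory(long pages,
			    long buffer_pages, long page_cache_size,
			    long nr_free_pages, long nr_swap_pages,
			    long num_physpages,
			    int buffer_min_percent, int pgcache_min_percent)
{
	long free = buffer_pages + page_cache_size
		  + nr_free_pages + nr_swap_pages;

	free -= (long) (pgcache_min_percent + buffer_min_percent + 2)
		* num_physpages / 100;
	return free > pages;
}

int main(void)
{
	/* a 32 MB box: 8192 4K pages, 5%/5% minimums as in mm/swap.c */
	printf("%d\n", vm_enough_memory(2000, 1500, 1200, 900, 4096,
					8192, 5, 5));	/* 1: granted */
	printf("%d\n", vm_enough_memory(7000, 1500, 1200, 900, 4096,
					8192, 5, 5));	/* 0: refused */
	return 0;
}
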
@@ -93,7 +91,21 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
struct mm_struct *mm = current->mm;
down(&mm->mmap_sem);
+
+ /*
+ * This lock-kernel is one of the main contention points for
+ * certain normal loads. And it really should not be here: almost
+ * everything in brk()/mmap()/munmap() is protected sufficiently by
+ * the mmap semaphore that we got above.
+ *
+ * We should move this into the few things that really want the
+ * lock, namely anything that actually touches a file descriptor
+ * etc. We can do all the normal anonymous mapping cases without
+ * ever getting the lock at all - the actual memory management
+ * code is already completely thread-safe.
+ */
lock_kernel();
+
if (brk < mm->end_code)
goto out;
newbrk = PAGE_ALIGN(brk);
@@ -162,7 +174,7 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
{
struct mm_struct * mm = current->mm;
struct vm_area_struct * vma;
- int correct_wcount = 0, error;
+ int error;
if ((len = PAGE_ALIGN(len)) == 0)
return addr;
@@ -286,30 +298,28 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
!vm_enough_memory(len >> PAGE_SHIFT))
goto free_vma;
- error = 0;
if (file) {
+ int correct_wcount = 0;
if (vma->vm_flags & VM_DENYWRITE) {
- if (file->f_dentry->d_inode->i_writecount > 0)
+ if (file->f_dentry->d_inode->i_writecount > 0) {
error = -ETXTBSY;
- else {
- /* f_op->mmap might possibly sleep
- * (generic_file_mmap doesn't, but other code
- * might). In any case, this takes care of any
- * race that this might cause.
- */
- file->f_dentry->d_inode->i_writecount--;
- correct_wcount = 1;
+ goto free_vma;
}
+ /* f_op->mmap might possibly sleep
+ * (generic_file_mmap doesn't, but other code
+ * might). In any case, this takes care of any
+ * race that this might cause.
+ */
+ file->f_dentry->d_inode->i_writecount--;
+ correct_wcount = 1;
}
- if (!error)
- error = file->f_op->mmap(file, vma);
-
+ error = file->f_op->mmap(file, vma);
+ /* Fix up the count if necessary, then check for an error */
+ if (correct_wcount)
+ file->f_dentry->d_inode->i_writecount++;
+ if (error)
+ goto unmap_and_free_vma;
}
- /* Fix up the count if necessary, then check for an error */
- if (correct_wcount)
- file->f_dentry->d_inode->i_writecount++;
- if (error)
- goto free_vma;
/*
* merge_segments may merge our vma, so we can't refer to it
@@ -327,6 +337,11 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
}
return addr;
+unmap_and_free_vma:
+ /* Undo any partial mapping done by a device driver. */
+ flush_cache_range(mm, vma->vm_start, vma->vm_end);
+ zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
+ flush_tlb_range(mm, vma->vm_start, vma->vm_end);
free_vma:
kmem_cache_free(vm_area_cachep, vma);
return error;
@@ -418,6 +433,7 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr,
mpnt->vm_ops = area->vm_ops;
mpnt->vm_offset = area->vm_offset + (end - area->vm_start);
mpnt->vm_file = area->vm_file;
+ mpnt->vm_pte = area->vm_pte;
if (mpnt->vm_file)
mpnt->vm_file->f_count++;
if (mpnt->vm_ops && mpnt->vm_ops->open)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index cc78e10ab..b28237c09 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -3,20 +3,12 @@
*
* (C) Copyright 1994 Linus Torvalds
*/
-#include <linux/stat.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
+#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/shm.h>
-#include <linux/errno.h>
#include <linux/mman.h>
-#include <linux/string.h>
-#include <linux/slab.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <asm/pgtable.h>
static inline void change_pte_range(pmd_t * pmd, unsigned long address,
diff --git a/mm/mremap.c b/mm/mremap.c
index cd7a7eb4a..a10870318 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -4,21 +4,13 @@
* (C) Copyright 1996 Linus Torvalds
*/
-#include <linux/stat.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
+#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/shm.h>
-#include <linux/errno.h>
#include <linux/mman.h>
-#include <linux/string.h>
-#include <linux/slab.h>
#include <linux/swap.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <asm/pgtable.h>
extern int vm_enough_memory(long pages);
@@ -142,7 +134,6 @@ static inline unsigned long move_vma(struct vm_area_struct * vma,
new_vma->vm_start = new_addr;
new_vma->vm_end = new_addr+new_len;
new_vma->vm_offset = vma->vm_offset + (addr - vma->vm_start);
- new_vma->vm_file = vma->vm_file;
if (new_vma->vm_file)
new_vma->vm_file->f_count++;
if (new_vma->vm_ops && new_vma->vm_ops->open)
@@ -151,6 +142,11 @@ static inline unsigned long move_vma(struct vm_area_struct * vma,
merge_segments(current->mm, new_vma->vm_start, new_vma->vm_end);
do_munmap(addr, old_len);
current->mm->total_vm += new_len >> PAGE_SHIFT;
+ if (new_vma->vm_flags & VM_LOCKED) {
+ current->mm->locked_vm += new_len >> PAGE_SHIFT;
+ make_pages_present(new_vma->vm_start,
+ new_vma->vm_end);
+ }
return new_addr;
}
kmem_cache_free(vm_area_cachep, new_vma);
@@ -224,8 +220,11 @@ asmlinkage unsigned long sys_mremap(unsigned long addr,
int pages = (new_len - old_len) >> PAGE_SHIFT;
vma->vm_end = addr + new_len;
current->mm->total_vm += pages;
- if (vma->vm_flags & VM_LOCKED)
+ if (vma->vm_flags & VM_LOCKED) {
current->mm->locked_vm += pages;
+ make_pages_present(addr + old_len,
+ addr + new_len);
+ }
ret = addr;
goto out;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 70cad74eb..7ceec01b9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7,25 +7,16 @@
#include <linux/config.h>
#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
#include <linux/kernel_stat.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/stat.h>
#include <linux/swap.h>
-#include <linux/fs.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <asm/dma.h>
-#include <asm/system.h> /* for cli()/sti() */
#include <asm/uaccess.h> /* for copy_to/from_user */
-#include <asm/bitops.h>
#include <asm/pgtable.h>
-#include <asm/spinlock.h>
int nr_swap_pages = 0;
int nr_free_pages = 0;
@@ -163,9 +154,11 @@ void __free_page(struct page *page)
free_pages_ok(page->map_nr, 0);
return;
}
+#if 0
if (PageSwapCache(page) && atomic_read(&page->count) == 1)
printk(KERN_WARNING "VM: Releasing swap cache page at %p",
__builtin_return_address(0));
+#endif
}
void free_pages(unsigned long addr, unsigned long order)
@@ -182,10 +175,12 @@ void free_pages(unsigned long addr, unsigned long order)
free_pages_ok(map_nr, order);
return;
}
+#if 0
if (PageSwapCache(map) && atomic_read(&map->count) == 1)
printk(KERN_WARNING
"VM: Releasing swap cache pages at %p",
__builtin_return_address(0));
+#endif
}
}
@@ -227,7 +222,6 @@ do { unsigned long size = 1 << high; \
map += size; \
} \
atomic_set(&map->count, 1); \
- map->age = PAGE_INITIAL_AGE; \
} while (0)
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
@@ -264,14 +258,15 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order)
spin_unlock_irqrestore(&page_alloc_lock, flags);
/*
- * If we failed to find anything, we'll return NULL, but we'll
- * wake up kswapd _now_ ad even wait for it synchronously if
- * we can.. This way we'll at least make some forward progress
- * over time.
+ * If we can schedule, do so, and make sure to yield.
+ * We may be a real-time process, and if kswapd is
+ * waiting for us we need to allow it to run a bit.
*/
- wake_up(&kswapd_wait);
- if (gfp_mask & __GFP_WAIT)
+ if (gfp_mask & __GFP_WAIT) {
+ current->policy |= SCHED_YIELD;
schedule();
+ }
+
nopage:
return 0;
}
@@ -372,12 +367,12 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m
* was due to a write access.
*/
void swap_in(struct task_struct * tsk, struct vm_area_struct * vma,
- unsigned long address, pte_t * page_table, unsigned long entry, int write_access)
+ pte_t * page_table, unsigned long entry, int write_access)
{
unsigned long page;
struct page *page_map;
- page_map = read_swap_cache(entry, address);
+ page_map = read_swap_cache(entry);
if (pte_val(*page_table) != entry) {
if (page_map)
@@ -404,8 +399,9 @@ void swap_in(struct task_struct * tsk, struct vm_area_struct * vma,
/* The page is unshared, and we want write access. In this
case, it is safe to tear down the swap cache and give the
page over entirely to this process. */
-
- delete_from_swap_cache(page_map);
+
+ if (PageSwapCache(page_map))
+ delete_from_swap_cache(page_map);
set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
return;
}
diff --git a/mm/page_io.c b/mm/page_io.c
index 44f592df8..2dd24facc 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -10,21 +10,13 @@
*/
#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
#include <linux/kernel_stat.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/stat.h>
#include <linux/swap.h>
-#include <linux/fs.h>
#include <linux/locks.h>
#include <linux/swapctl.h>
#include <asm/dma.h>
-#include <asm/system.h> /* for cli()/sti() */
#include <asm/uaccess.h> /* for copy_to/from_user */
-#include <asm/bitops.h>
#include <asm/pgtable.h>
static struct wait_queue * lock_queue = NULL;
@@ -66,6 +58,11 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
printk("Internal error: bad swap-device\n");
return;
}
+
+ /* Don't allow too many pending pages in flight.. */
+ if (atomic_read(&nr_async_pages) > SWAP_CLUSTER_MAX)
+ wait = 1;
+
p = &swap_info[type];
offset = SWP_OFFSET(entry);
if (offset >= p->max) {
diff --git a/mm/slab.c b/mm/slab.c
index d4be178a2..29680bd68 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -654,9 +654,9 @@ kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp)
}
slabp->s_magic = SLAB_MAGIC_DESTROYED;
- kmem_freepages(cachep, slabp->s_mem-slabp->s_offset);
if (slabp->s_index)
kmem_cache_free(cachep->c_index_cachep, slabp->s_index);
+ kmem_freepages(cachep, slabp->s_mem-slabp->s_offset);
if (SLAB_OFF_SLAB(cachep->c_flags))
kmem_cache_free(cache_slabp, slabp);
}
@@ -1194,7 +1194,6 @@ kmem_cache_grow(kmem_cache_t * cachep, int flags)
cachep->c_dflags = SLAB_CFLGS_GROWN;
cachep->c_growing++;
-re_try:
spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
/* A series of memory allocations for a new slab.
@@ -1261,15 +1260,6 @@ opps1:
kmem_freepages(cachep, objp);
failed:
spin_lock_irq(&cachep->c_spinlock);
- if (local_flags != SLAB_ATOMIC && cachep->c_gfporder) {
- /* For large order (>0) slabs, we try again.
- * Needed because the gfp() functions are not good at giving
- * out contiguous pages unless pushed (but do not push too hard).
- */
- if (cachep->c_failures++ < 4 && cachep->c_freep == kmem_slab_end(cachep))
- goto re_try;
- cachep->c_failures = 1; /* Memory is low, don't try as hard next time. */
- }
cachep->c_growing--;
spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
return 0;
@@ -1448,8 +1438,10 @@ alloc_new_slab:
}
/* Couldn't grow, but some objs may have been freed. */
spin_lock_irq(&cachep->c_spinlock);
- if (cachep->c_freep != kmem_slab_end(cachep))
- goto try_again;
+ if (cachep->c_freep != kmem_slab_end(cachep)) {
+ if ((flags & SLAB_ATOMIC) == 0)
+ goto try_again;
+ }
} else {
/* Very serious error - maybe panic() here? */
kmem_report_alloc_err("Bad slab magic (corrupt)", cachep);
diff --git a/mm/swap.c b/mm/swap.c
index 1788021b9..1e2d8c36b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -14,22 +14,14 @@
*/
#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
#include <linux/kernel_stat.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/stat.h>
#include <linux/swap.h>
-#include <linux/fs.h>
#include <linux/swapctl.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <asm/dma.h>
-#include <asm/system.h> /* for cli()/sti() */
#include <asm/uaccess.h> /* for copy_to/from_user */
-#include <asm/bitops.h>
#include <asm/pgtable.h>
/*
@@ -70,13 +62,13 @@ swapstat_t swapstats = {0};
buffer_mem_t buffer_mem = {
5, /* minimum percent buffer */
- 25, /* borrow percent buffer */
+ 10, /* borrow percent buffer */
60 /* maximum percent buffer */
};
buffer_mem_t page_cache = {
5, /* minimum percent page cache */
- 30, /* borrow percent page cache */
+ 15, /* borrow percent page cache */
75 /* maximum */
};
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2aaf0c46b..e098974b2 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -8,19 +8,12 @@
*/
#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
#include <linux/kernel_stat.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/stat.h>
#include <linux/swap.h>
-#include <linux/fs.h>
#include <linux/swapctl.h>
#include <linux/init.h>
#include <linux/pagemap.h>
-#include <asm/bitops.h>
#include <asm/pgtable.h>
/*
@@ -143,6 +136,50 @@ bad_unused:
goto out;
}
+int swap_count(unsigned long entry)
+{
+ struct swap_info_struct * p;
+ unsigned long offset, type;
+ int retval = 0;
+
+ if (!entry)
+ goto bad_entry;
+ type = SWP_TYPE(entry);
+ if (type & SHM_SWP_TYPE)
+ goto out;
+ if (type >= nr_swapfiles)
+ goto bad_file;
+ p = type + swap_info;
+ offset = SWP_OFFSET(entry);
+ if (offset >= p->max)
+ goto bad_offset;
+ if (!p->swap_map[offset])
+ goto bad_unused;
+ retval = p->swap_map[offset];
+#ifdef DEBUG_SWAP
+ printk("DebugVM: swap_count(entry %08lx, count %d)\n",
+ entry, retval);
+#endif
+out:
+ return retval;
+
+bad_entry:
+ printk(KERN_ERR "swap_count: null entry!\n");
+ goto out;
+bad_file:
+ printk(KERN_ERR
+ "swap_count: entry %08lx, nonexistent swap file!\n", entry);
+ goto out;
+bad_offset:
+ printk(KERN_ERR
+ "swap_count: entry %08lx, offset exceeds max!\n", entry);
+ goto out;
+bad_unused:
+ printk(KERN_ERR
+ "swap_count at %8p: entry %08lx, unused page!\n",
+ __builtin_return_address(0), entry);
+ goto out;
+}
static inline void remove_from_swap_cache(struct page *page)
{
@@ -155,6 +192,7 @@ static inline void remove_from_swap_cache(struct page *page)
printk ("VM: Removing swap cache page with wrong inode hash "
"on page %08lx\n", page_address(page));
}
+#if 0
/*
* This is a legal case, but warn about it.
*/
@@ -163,6 +201,7 @@ static inline void remove_from_swap_cache(struct page *page)
"VM: Removing page cache on unshared page %08lx\n",
page_address(page));
}
+#endif
#ifdef DEBUG_SWAP
printk("DebugVM: remove_from_swap_cache(%08lx count %d)\n",
@@ -173,24 +212,25 @@ static inline void remove_from_swap_cache(struct page *page)
}
+/*
+ * This must be called only on pages that have
+ * been verified to be in the swap cache.
+ */
void delete_from_swap_cache(struct page *page)
{
+ long entry = page->offset;
+
#ifdef SWAP_CACHE_INFO
swap_cache_del_total++;
-#endif
- if (PageSwapCache (page)) {
- long entry = page->offset;
-#ifdef SWAP_CACHE_INFO
- swap_cache_del_success++;
+ swap_cache_del_success++;
#endif
#ifdef DEBUG_SWAP
- printk("DebugVM: delete_from_swap_cache(%08lx count %d, "
- "entry %08lx)\n",
- page_address(page), atomic_read(&page->count), entry);
+ printk("DebugVM: delete_from_swap_cache(%08lx count %d, "
+ "entry %08lx)\n",
+ page_address(page), atomic_read(&page->count), entry);
#endif
- remove_from_swap_cache (page);
- swap_free (entry);
- }
+ remove_from_swap_cache (page);
+ swap_free (entry);
}
/*
@@ -208,7 +248,7 @@ void free_page_and_swap_cache(unsigned long addr)
delete_from_swap_cache(page);
}
- free_user_page(page, addr);
+ free_page(addr);
}
@@ -249,7 +289,7 @@ out_bad:
* the swap entry is no longer in use.
*/
-struct page * read_swap_cache_async(unsigned long entry, unsigned long addr, int wait)
+struct page * read_swap_cache_async(unsigned long entry, int wait)
{
struct page *found_page, *new_page;
unsigned long new_page_addr;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b7446b3b5..c574fb59a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -5,25 +5,16 @@
* Swap reorganised 29.12.95, Stephen Tweedie
*/
-#include <linux/mm.h>
-#include <linux/smp.h>
+#include <linux/malloc.h>
#include <linux/smp_lock.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
#include <linux/kernel_stat.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/stat.h>
#include <linux/swap.h>
-#include <linux/fs.h>
#include <linux/swapctl.h>
-#include <linux/malloc.h>
#include <linux/blkdev.h> /* for blk_size */
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/shm.h>
-#include <asm/bitops.h>
#include <asm/pgtable.h>
unsigned int nr_swapfiles = 0;
@@ -317,14 +308,14 @@ static int try_to_unuse(unsigned int type)
/* Get a page for the entry, using the existing swap
cache page if there is one. Otherwise, get a clean
page and read the swap into it. */
- page_map = read_swap_cache(entry, 0);
+ page_map = read_swap_cache(entry);
if (!page_map) {
/*
* Continue searching if the entry became unused.
*/
if (si->swap_map[i] == 0)
continue;
- return -ENOMEM;
+ return -ENOMEM;
}
page = page_address(page_map);
read_lock(&tasklist_lock);
@@ -559,8 +550,17 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
if (p->swap_device == swap_info[i].swap_device)
goto bad_swap;
}
- } else if (!S_ISREG(swap_dentry->d_inode->i_mode))
+ } else if (S_ISREG(swap_dentry->d_inode->i_mode)) {
+ error = -EBUSY;
+ for (i = 0 ; i < nr_swapfiles ; i++) {
+ if (i == type)
+ continue;
+ if (p->swap_file == swap_info[i].swap_file)
+ goto bad_swap;
+ }
+ } else
goto bad_swap;
+
swap_header = (void *) __get_free_page(GFP_USER);
if (!swap_header) {
printk("Unable to start swapping: out of memory :-)\n");
@@ -627,7 +627,7 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
p->max = swap_header->info.last_page;
if (p->max >= 0x7fffffffL/PAGE_SIZE ||
- (void *) &swap_header->info.badpages[swap_header->info.nr_badpages-1] >= (void *) swap_header->magic.magic) {
+ (void *) &swap_header->info.badpages[(int) swap_header->info.nr_badpages-1] >= (void *) swap_header->magic.magic) {
error = -EINVAL;
goto bad_swap;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e7711c23c..e99ad35fb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -5,11 +5,9 @@
*/
#include <linux/malloc.h>
-#include <linux/swapctl.h>
#include <linux/vmalloc.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
static struct vm_struct * vmlist = NULL;
@@ -38,8 +36,7 @@ static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned lo
if (pte_none(page))
continue;
if (pte_present(page)) {
- free_user_page(mem_map + MAP_NR(pte_page(page)),
- pte_page(page));
+ free_page(pte_page(page));
continue;
}
printk("Whee.. Swapped out page in kernel page table\n");
@@ -97,7 +94,7 @@ static inline int alloc_area_pte(pte_t * pte, unsigned long address, unsigned lo
unsigned long page;
if (!pte_none(*pte))
printk("alloc_area_pte: page already exists\n");
- page = get_user_page(address);
+ page = __get_free_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
set_pte(pte, mk_pte(page, prot));
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 884e67150..c5efa52a2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -10,39 +10,20 @@
* Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
*/
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
+#include <linux/slab.h>
#include <linux/kernel_stat.h>
-#include <linux/errno.h>
-#include <linux/string.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
-#include <linux/slab.h>
-#include <linux/dcache.h>
-#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/init.h>
-#include <asm/bitops.h>
#include <asm/pgtable.h>
/*
- * When are we next due for a page scan?
- */
-static unsigned long next_swap_jiffies = 0;
-
-/*
- * How often do we do a pageout scan during normal conditions?
- * Default is four times a second.
- */
-int swapout_interval = HZ / 4;
-
-/*
* The wait queue for waking up the pageout daemon:
*/
-struct wait_queue * kswapd_wait = NULL;
+static struct task_struct * kswapd_task = NULL;
static void init_swap_timer(void);
@@ -123,8 +104,13 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
}
if (pte_young(pte)) {
+ /*
+ * Transfer the "accessed" bit from the page
+ * tables to the global page map.
+ */
set_pte(page_table, pte_mkold(pte));
- touch_page(page_map);
+ set_bit(PG_referenced, &page_map->flags);
+
/*
* We should test here to see if we want to recover any
* swap cache page here. We do this if the page seeing
@@ -137,10 +123,6 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
return 0;
}
- age_page(page_map);
- if (page_map->age)
- return 0;
-
if (pte_dirty(pte)) {
if (vma->vm_ops && vma->vm_ops->swapout) {
pid_t pid = tsk->pid;
@@ -180,7 +162,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
* copy in memory, so we add it to the swap
* cache. */
if (PageSwapCache(page_map)) {
- free_page_and_swap_cache(page);
+ free_page(page);
return (atomic_read(&page_map->count) == 0);
}
add_to_swap_cache(page_map, entry);
@@ -198,7 +180,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
* asynchronously. That's no problem, shrink_mmap() can
* correctly clean up the occassional unshared page
* which gets left behind in the swap cache. */
- free_page_and_swap_cache(page);
+ free_page(page);
return 1; /* we slept: the process may not exist any more */
}
@@ -212,7 +194,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
set_pte(page_table, __pte(entry));
flush_tlb_page(vma, address);
swap_duplicate(entry);
- free_page_and_swap_cache(page);
+ free_page(page);
return (atomic_read(&page_map->count) == 0);
}
/*
@@ -228,7 +210,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
flush_cache_page(vma, address);
pte_clear(page_table);
flush_tlb_page(vma, address);
- entry = page_unuse(page_map);
+ entry = (atomic_read(&page_map->count) == 1);
__free_page(page_map);
return entry;
}
@@ -310,8 +292,9 @@ static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct *
}
static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
- pgd_t *pgdir, unsigned long start, int gfp_mask)
+ unsigned long address, int gfp_mask)
{
+ pgd_t *pgdir;
unsigned long end;
/* Don't swap out areas like shared memory which have their
@@ -319,12 +302,14 @@ static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
if (vma->vm_flags & (VM_SHM | VM_LOCKED))
return 0;
+ pgdir = pgd_offset(tsk->mm, address);
+
end = vma->vm_end;
- while (start < end) {
- int result = swap_out_pgd(tsk, vma, pgdir, start, end, gfp_mask);
+ while (address < end) {
+ int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
if (result)
return result;
- start = (start + PGDIR_SIZE) & PGDIR_MASK;
+ address = (address + PGDIR_SIZE) & PGDIR_MASK;
pgdir++;
}
return 0;
@@ -344,22 +329,23 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
* Find the proper vm-area
*/
vma = find_vma(p->mm, address);
- if (!vma) {
- p->swap_address = 0;
- return 0;
+ if (vma) {
+ if (address < vma->vm_start)
+ address = vma->vm_start;
+
+ for (;;) {
+ int result = swap_out_vma(p, vma, address, gfp_mask);
+ if (result)
+ return result;
+ vma = vma->vm_next;
+ if (!vma)
+ break;
+ address = vma->vm_start;
+ }
}
- if (address < vma->vm_start)
- address = vma->vm_start;
- for (;;) {
- int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, gfp_mask);
- if (result)
- return result;
- vma = vma->vm_next;
- if (!vma)
- break;
- address = vma->vm_start;
- }
+ /* We didn't find anything for the process */
+ p->swap_cnt = 0;
p->swap_address = 0;
return 0;
}
@@ -420,20 +406,12 @@ static int swap_out(unsigned int priority, int gfp_mask)
}
pbest->swap_cnt--;
- switch (swap_out_process(pbest, gfp_mask)) {
- case 0:
- /*
- * Clear swap_cnt so we don't look at this task
- * again until we've tried all of the others.
- * (We didn't block, so the task is still here.)
- */
- pbest->swap_cnt = 0;
- break;
- case 1:
- return 1;
- default:
- break;
- };
+ /*
+ * Nonzero means we cleared out something, but only "1" means
+ * that we actually free'd up a page as a result.
+ */
+ if (swap_out_process(pbest, gfp_mask) == 1)
+ return 1;
}
out:
return 0;
@@ -448,19 +426,12 @@ static int do_try_to_free_page(int gfp_mask)
{
static int state = 0;
int i=6;
- int stop;
/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);
- /* We try harder if we are waiting .. */
- stop = 3;
- if (gfp_mask & __GFP_WAIT)
- stop = 0;
-
- if (((buffermem >> PAGE_SHIFT) * 100 > buffer_mem.borrow_percent * num_physpages)
- || (page_cache_size * 100 > page_cache.borrow_percent * num_physpages))
- shrink_mmap(i, gfp_mask);
+ if (buffer_over_borrow() || pgcache_over_borrow())
+ state = 0;
switch (state) {
do {
@@ -480,7 +451,7 @@ static int do_try_to_free_page(int gfp_mask)
shrink_dcache_memory(i, gfp_mask);
state = 0;
i--;
- } while ((i - stop) >= 0);
+ } while (i >= 0);
}
return 0;
}
@@ -510,10 +481,9 @@ void __init kswapd_setup(void)
*/
int kswapd(void *unused)
{
- struct wait_queue wait = { current, NULL };
current->session = 1;
current->pgrp = 1;
- sprintf(current->comm, "kswapd");
+ strcpy(current->comm, "kswapd");
sigfillset(&current->blocked);
/*
@@ -523,11 +493,12 @@ int kswapd(void *unused)
*/
lock_kernel();
- /* Give kswapd a realtime priority. */
- current->policy = SCHED_FIFO;
- current->rt_priority = 32; /* Fixme --- we need to standardise our
- namings for POSIX.4 realtime scheduling
- priorities. */
+ /*
+ * Set the base priority to something smaller than a
+ * regular process. We will scale up the priority
+ * dynamically depending on how much memory we need.
+ */
+ current->priority = (DEF_PRIORITY * 2) / 3;
/*
* Tell the memory management that we're a "memory allocator",
@@ -544,9 +515,9 @@ int kswapd(void *unused)
current->flags |= PF_MEMALLOC;
init_swap_timer();
- add_wait_queue(&kswapd_wait, &wait);
+ kswapd_task = current;
while (1) {
- int tries;
+ unsigned long end_time;
current->state = TASK_INTERRUPTIBLE;
flush_signals(current);
@@ -554,39 +525,17 @@ int kswapd(void *unused)
schedule();
swapstats.wakeups++;
- /*
- * Do the background pageout: be
- * more aggressive if we're really
- * low on free memory.
- *
- * We try page_daemon.tries_base times, divided by
- * an 'urgency factor'. In practice this will mean
- * a value of pager_daemon.tries_base / 8 or 4 = 64
- * or 128 pages at a time.
- * This gives us 64 (or 128) * 4k * 4 (times/sec) =
- * 1 (or 2) MB/s swapping bandwidth in low-priority
- * background paging. This number rises to 8 MB/s
- * when the priority is highest (but then we'll be
- * woken up more often and the rate will be even
- * higher).
- */
- tries = pager_daemon.tries_base;
- tries >>= 4*free_memory_available();
-
+ /* max one hundreth of a second */
+ end_time = jiffies + (HZ-1)/100;
do {
- do_try_to_free_page(0);
- /*
- * Syncing large chunks is faster than swapping
- * synchronously (less head movement). -- Rik.
- */
- if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
- run_task_queue(&tq_disk);
- if (free_memory_available() > 1)
+ if (!do_try_to_free_page(0))
break;
- } while (--tries > 0);
+ if (nr_free_pages > freepages.high + SWAP_CLUSTER_MAX)
+ break;
+ } while (time_before_eq(jiffies,end_time));
}
/* As if we could ever get here - maybe we want to make this killable */
- remove_wait_queue(&kswapd_wait, &wait);
+ kswapd_task = NULL;
unlock_kernel();
return 0;
}
@@ -620,42 +569,61 @@ int try_to_free_pages(unsigned int gfp_mask, int count)
return retval;
}
+/*
+ * Wake up kswapd according to the priority
+ * 0 - no wakeup
+ * 1 - wake up as a low-priority process
+ * 2 - wake up as a normal process
+ * 3 - wake up as an almost real-time process
+ *
+ * This plays mind-games with the "goodness()"
+ * function in kernel/sched.c.
+ */
+static inline void kswapd_wakeup(struct task_struct *p, int priority)
+{
+ if (priority) {
+ p->counter = p->priority << priority;
+ wake_up_process(p);
+ }
+}
+
/*
* The swap_tick function gets called on every clock tick.
*/
void swap_tick(void)
{
- unsigned long now, want;
- int want_wakeup = 0;
-
- want = next_swap_jiffies;
- now = jiffies;
+ struct task_struct *p = kswapd_task;
/*
- * Examine the memory queues. Mark memory low
- * if there is nothing available in the three
- * highest queues.
- *
- * Schedule for wakeup if there isn't lots
- * of free memory.
+ * Only bother to try to wake kswapd up
+ * if the task exists and can be woken.
*/
- switch (free_memory_available()) {
- case 0:
- want = now;
- /* Fall through */
- case 1:
- want_wakeup = 1;
- default:
- }
-
- if ((long) (now - want) >= 0) {
- if (want_wakeup || (num_physpages * buffer_mem.max_percent) < (buffermem >> PAGE_SHIFT) * 100
- || (num_physpages * page_cache.max_percent < page_cache_size * 100)) {
- /* Set the next wake-up time */
- next_swap_jiffies = now + swapout_interval;
- wake_up(&kswapd_wait);
- }
+ if (p && (p->state & TASK_INTERRUPTIBLE)) {
+ unsigned int pages;
+ int want_wakeup;
+
+ /*
+ * Schedule for wakeup if there isn't lots
+ * of free memory or if there is too much
+ * of it used for buffers or pgcache.
+ *
+ * "want_wakeup" is our priority: 0 means
+ * not to wake anything up, while 3 means
+ * that we'd better give kswapd a realtime
+ * priority.
+ */
+ want_wakeup = 0;
+ pages = nr_free_pages;
+ if (pages < freepages.high)
+ want_wakeup = 1;
+ if (pages < freepages.low)
+ want_wakeup = 2;
+ if (pages < freepages.min)
+ want_wakeup = 3;
+
+ kswapd_wakeup(p,want_wakeup);
}
+
timer_active |= (1<<SWAP_TIMER);
}
@@ -665,7 +633,7 @@ void swap_tick(void)
void init_swap_timer(void)
{
- timer_table[SWAP_TIMER].expires = 0;
+ timer_table[SWAP_TIMER].expires = jiffies;
timer_table[SWAP_TIMER].fn = swap_tick;
timer_active |= (1<<SWAP_TIMER);
}
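
swap_tick() above now derives a wakeup urgency for kswapd from the free-page level instead of a fixed timer interval, and kswapd_wakeup() scales the task's scheduling counter accordingly. A stand-alone sketch of the threshold mapping; the freepages numbers here are sample values, not the kernel defaults.

#include <stdio.h>

struct freepages {
	unsigned int min, low, high;
};

/*
 * 0 = don't wake kswapd, 1 = low priority, 2 = normal,
 * 3 = almost real-time.  The kernel then does
 * p->counter = p->priority << priority and wake_up_process(p).
 */
static int kswapd_wakeup_priority(unsigned int nr_free_pages,
				  struct freepages fp)
{
	int want_wakeup = 0;

	if (nr_free_pages < fp.high)
		want_wakeup = 1;
	if (nr_free_pages < fp.low)
		want_wakeup = 2;
	if (nr_free_pages < fp.min)
		want_wakeup = 3;
	return want_wakeup;
}

int main(void)
{
	struct freepages fp = { 32, 64, 128 };

	printf("%d %d %d %d\n",
	       kswapd_wakeup_priority(200, fp),	/* 0 */
	       kswapd_wakeup_priority(100, fp),	/* 1 */
	       kswapd_wakeup_priority(40, fp),	/* 2 */
	       kswapd_wakeup_priority(10, fp));	/* 3 */
	return 0;
}
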