From 78c388aed2b7184182c08428db1de6c872d815f5 Mon Sep 17 00:00:00 2001
From: Ralf Baechle
Date: Mon, 4 Jan 1999 16:03:48 +0000
Subject: Merge with Linux 2.1.131 and more MIPS goodies.

(Did I mention that CVS is buggy ...)
---
 mm/filemap.c    | 145 ++++++++++++++++-----------------
 mm/memory.c     | 224 +++++++++++++++++++++++++--------------------------
 mm/mlock.c      |   8 --
 mm/mmap.c       |  74 ++++++++++-------
 mm/mprotect.c   |  10 +--
 mm/mremap.c     |  21 +++--
 mm/page_alloc.c |  36 ++++-----
 mm/page_io.c    |  13 ++-
 mm/slab.c       |  18 ++---
 mm/swap.c       |  12 +--
 mm/swap_state.c |  80 ++++++++++++++-----
 mm/swapfile.c   |  28 +++----
 mm/vmalloc.c    |   7 +-
 mm/vmscan.c     | 244 ++++++++++++++++++++++++--------------------------
 14 files changed, 443 insertions(+), 477 deletions(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index ffda2b7c1..227bcd5a9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -9,26 +9,17 @@
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
-#include
-#include
-#include
-#include
+#include
 #include
-#include
 #include
-#include
-#include
-#include
 #include
 #include
 #include
-#include
 #include
 #include
 #include
 #include
-#include
 #include
 #include
@@ -153,7 +144,7 @@ static inline int shrink_one_page(struct page *page, int gfp_mask)
 } while (tmp != bh);
 /* Refuse to swap out all buffer pages */
- if ((buffermem >> PAGE_SHIFT) * 100 < (buffer_mem.min_percent * num_physpages))
+ if (buffer_under_min())
 goto next;
 }
@@ -167,14 +158,9 @@ static inline int shrink_one_page(struct page *page, int gfp_mask)
 case 1:
 /* is it a swap-cache or page-cache page? */
 if (page->inode) {
- if (test_and_clear_bit(PG_referenced, &page->flags)) {
- touch_page(page);
- break;
- }
- age_page(page);
- if (page->age)
+ if (test_and_clear_bit(PG_referenced, &page->flags))
 break;
- if (page_cache_size * 100 < (page_cache.min_percent * num_physpages))
+ if (pgcache_under_min())
 break;
 if (PageSwapCache(page)) {
 delete_from_swap_cache(page);
@@ -188,6 +174,9 @@ static inline int shrink_one_page(struct page *page, int gfp_mask)
 if (test_and_clear_bit(PG_referenced, &page->flags))
 break;
+ if (buffer_under_min())
+ break;
+
 /* is it a buffer cache page? */
 if (bh && try_to_free_buffer(bh, &bh, 6))
 return 1;
@@ -211,7 +200,7 @@ int shrink_mmap(int priority, int gfp_mask)
 struct page * page;
 int count_max, count_min;
- count_max = (limit<<2) >> (priority>>1);
+ count_max = limit;
 count_min = (limit<<2) >> (priority);
 page = mem_map + clock;
@@ -225,7 +214,15 @@ int shrink_mmap(int priority, int gfp_mask)
 if (shrink_one_page(page, gfp_mask))
 return 1;
 count_max--;
- if (page->inode || page->buffers)
+ /*
+ * If the page we looked at was recyclable but we didn't
+ * reclaim it (presumably due to PG_referenced), don't
+ * count it as scanned. This way, the more referenced
+ * page cache pages we encounter, the more rapidly we
+ * will age them.
+ */ + if (atomic_read(&page->count) != 1 || + (!page->inode && !page->buffers)) count_min--; page++; clock++; @@ -292,7 +289,7 @@ static inline void add_to_page_cache(struct page * page, struct page **hash) { atomic_inc(&page->count); - page->flags &= ~((1 << PG_uptodate) | (1 << PG_error)); + page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced); page->offset = offset; add_page_to_inode_queue(inode, page); __add_page_to_hash_queue(page, hash); @@ -313,7 +310,7 @@ static unsigned long try_to_read_ahead(struct file * file, offset &= PAGE_MASK; switch (page_cache) { case 0: - page_cache = get_user_page(offset); + page_cache = __get_free_page(GFP_USER); if (!page_cache) break; default: @@ -327,7 +324,6 @@ static unsigned long try_to_read_ahead(struct file * file, */ page = mem_map + MAP_NR(page_cache); add_to_page_cache(page, inode, offset, hash); - set_bit(PG_referenced, &page->flags); inode->i_op->readpage(file, page); page_cache = 0; } @@ -736,7 +732,7 @@ no_cached_page: * page.. */ if (!page_cache) { - page_cache = get_user_page(pos & PAGE_MASK); + page_cache = __get_free_page(GFP_USER); /* * That could have slept, so go around to the * very beginning.. @@ -1002,7 +998,7 @@ found_page: * extra page -- better to overlap the allocation with the I/O. */ if (no_share && !new_page) { - new_page = get_user_page(address); + new_page = __get_free_page(GFP_USER); if (!new_page) goto failure; } @@ -1039,7 +1035,7 @@ success: return new_page; no_cached_page: - new_page = get_user_page(address); + new_page = __get_free_page(GFP_USER); if (!new_page) goto no_page; @@ -1067,8 +1063,7 @@ no_cached_page: * Do a very limited read-ahead if appropriate */ if (PageLocked(page)) - new_page = try_to_read_ahead(file, offset + PAGE_SIZE, - get_user_page(address + PAGE_SIZE)); + new_page = try_to_read_ahead(file, offset + PAGE_SIZE, 0); goto found_page; page_locked_wait: @@ -1520,39 +1515,58 @@ generic_file_write(struct file *file, const char *buf, { struct dentry *dentry = file->f_dentry; struct inode *inode = dentry->d_inode; + unsigned long pos = *ppos; + unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; struct page *page, **hash; unsigned long page_cache = 0; - unsigned long pgpos, offset; - unsigned long bytes, written; - unsigned long pos; - long status, sync, didread; + unsigned long written; + long status, sync; if (!inode->i_op || !inode->i_op->updatepage) return -EIO; sync = file->f_flags & O_SYNC; - pos = *ppos; written = 0; - status = 0; if (file->f_flags & O_APPEND) pos = inode->i_size; + /* + * Check whether we've reached the file size limit. + */ + status = -EFBIG; + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + + status = 0; + /* + * Check whether to truncate the write, + * and send the signal if we do. + */ + if (count > limit - pos) { + send_sig(SIGXFSZ, current, 0); + count = limit - pos; + } + while (count) { + unsigned long bytes, pgpos, offset; /* * Try to find the page in the cache. If it isn't there, * allocate a free page. 
*/ offset = (pos & ~PAGE_MASK); pgpos = pos & PAGE_MASK; - - if ((bytes = PAGE_SIZE - offset) > count) + bytes = PAGE_SIZE - offset; + if (bytes > count) bytes = count; hash = page_hash(inode, pgpos); - if (!(page = __find_page(inode, pgpos, *hash))) { + page = __find_page(inode, pgpos, *hash); + if (!page) { if (!page_cache) { - page_cache = get_user_page(pgpos); + page_cache = __get_free_page(GFP_USER); if (page_cache) continue; status = -ENOMEM; @@ -1563,51 +1577,25 @@ generic_file_write(struct file *file, const char *buf, page_cache = 0; } - /* - * Note: setting of the PG_locked bit is handled - * below the i_op->xxx interface. - */ - didread = 0; -page_wait: + /* Get exclusive IO access to the page.. */ wait_on_page(page); - if (PageUptodate(page)) - goto do_update_page; + set_bit(PG_locked, &page->flags); /* - * The page is not up-to-date ... if we're writing less - * than a full page of data, we may have to read it first. - * But if the page is past the current end of file, we must - * clear it before updating. + * Do the real work.. If the writer ends up delaying the write, + * the writer needs to increment the page use counts until he + * is done with the page. */ - if (bytes < PAGE_SIZE) { - if (pgpos < inode->i_size) { - status = -EIO; - if (didread >= 2) - goto done_with_page; - status = inode->i_op->readpage(file, page); - if (status < 0) - goto done_with_page; - didread++; - goto page_wait; - } else { - /* Must clear for partial writes */ - memset((void *) page_address(page), 0, - PAGE_SIZE); - } - } - /* - * N.B. We should defer setting PG_uptodate at least until - * the data is copied. A failure in i_op->updatepage() could - * leave the page with garbage data. - */ - set_bit(PG_uptodate, &page->flags); - -do_update_page: - /* All right, the page is there. Now update it. */ - status = inode->i_op->updatepage(file, page, buf, - offset, bytes, sync); -done_with_page: + bytes -= copy_from_user((u8*)page_address(page) + offset, buf, bytes); + status = -EFAULT; + if (bytes) + status = inode->i_op->updatepage(file, page, offset, bytes, sync); + + /* Mark it unlocked again and drop the page.. */ + clear_bit(PG_locked, &page->flags); + wake_up(&page->wait); __free_page(page); + if (status < 0) break; @@ -1622,6 +1610,7 @@ done_with_page: if (page_cache) free_page(page_cache); +out: return written ? written : status; } diff --git a/mm/memory.c b/mm/memory.c index 388d9ce03..932c35648 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -33,23 +33,13 @@ * Idea by Alex Bligh (alex@cconcepts.co.uk) */ -#include -#include -#include -#include -#include -#include -#include -#include #include +#include #include -#include #include -#include #include #include -#include unsigned long max_mapnr = 0; unsigned long num_physpages = 0; @@ -289,10 +279,6 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; } if (cow) pte = pte_wrprotect(pte); -#if 0 /* No longer needed with the new swap cache code */ - if (delete_from_swap_cache(&mem_map[page_nr])) - pte = pte_mkdirty(pte); -#endif set_pte(dst_pte, pte_mkold(pte)); set_pte(src_pte, pte); atomic_inc(&mem_map[page_nr].count); @@ -635,15 +621,15 @@ unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsig * change only once the write actually happens. This avoids a few races, * and potentially makes it more efficient. 
*/ -static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, - unsigned long address, int write_access, pte_t *page_table) +static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, + unsigned long address, pte_t *page_table) { pte_t pte; unsigned long old_page, new_page; struct page * page_map; pte = *page_table; - new_page = get_user_page(address); + new_page = __get_free_page(GFP_USER); /* Did someone else copy this page for us while we slept? */ if (pte_val(*page_table) != pte_val(pte)) goto end_wp_page; @@ -661,40 +647,42 @@ static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, * Do we need to copy? */ if (is_page_shared(page_map)) { - if (new_page) { - if (PageReserved(mem_map + MAP_NR(old_page))) - ++vma->vm_mm->rss; - copy_cow_page(old_page,new_page); - flush_page_to_ram(old_page); - flush_page_to_ram(new_page); - flush_cache_page(vma, address); - set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); - free_page(old_page); - flush_tlb_page(vma, address); - return; - } + unlock_kernel(); + if (!new_page) + return 0; + + if (PageReserved(mem_map + MAP_NR(old_page))) + ++vma->vm_mm->rss; + copy_cow_page(old_page,new_page); + flush_page_to_ram(old_page); + flush_page_to_ram(new_page); flush_cache_page(vma, address); - set_pte(page_table, BAD_PAGE); - flush_tlb_page(vma, address); + set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); free_page(old_page); - oom(tsk); - return; + flush_tlb_page(vma, address); + return 1; } + if (PageSwapCache(page_map)) delete_from_swap_cache(page_map); + + /* We can release the kernel lock now.. */ + unlock_kernel(); + flush_cache_page(vma, address); set_pte(page_table, pte_mkdirty(pte_mkwrite(pte))); flush_tlb_page(vma, address); +end_wp_page: if (new_page) free_page(new_page); - return; + return 1; + bad_wp_page: printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page); send_sig(SIGKILL, tsk, 1); -end_wp_page: if (new_page) free_page(new_page); - return; + return 0; } /* @@ -783,30 +771,53 @@ void vmtruncate(struct inode * inode, unsigned long offset) } -static inline void do_swap_page(struct task_struct * tsk, +/* + * This is called with the kernel lock held, we need + * to return without it. 
+ */ +static int do_swap_page(struct task_struct * tsk, struct vm_area_struct * vma, unsigned long address, pte_t * page_table, pte_t entry, int write_access) { - pte_t page; - if (!vma->vm_ops || !vma->vm_ops->swapin) { - swap_in(tsk, vma, address, page_table, pte_val(entry), write_access); + swap_in(tsk, vma, page_table, pte_val(entry), write_access); flush_page_to_ram(pte_page(*page_table)); - return; + } else { + pte_t page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry)); + if (pte_val(*page_table) != pte_val(entry)) { + free_page(pte_page(page)); + } else { + if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 && + !(vma->vm_flags & VM_SHARED)) + page = pte_wrprotect(page); + ++vma->vm_mm->rss; + ++tsk->maj_flt; + flush_page_to_ram(pte_page(page)); + set_pte(page_table, page); + } } - page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry)); - if (pte_val(*page_table) != pte_val(entry)) { - free_page(pte_page(page)); - return; + unlock_kernel(); + return 1; +} + +/* + * This only needs the MM semaphore + */ +static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) +{ + pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); + if (write_access) { + unsigned long page = __get_free_page(GFP_USER); + if (!page) + return 0; + clear_page(page); + entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + vma->vm_mm->rss++; + tsk->min_flt++; + flush_page_to_ram(page); } - if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 && - !(vma->vm_flags & VM_SHARED)) - page = pte_wrprotect(page); - ++vma->vm_mm->rss; - ++tsk->maj_flt; - flush_page_to_ram(pte_page(page)); - set_pte(page_table, page); - return; + put_page(page_table, entry); + return 1; } /* @@ -817,26 +828,34 @@ static inline void do_swap_page(struct task_struct * tsk, * * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. + * + * This is called with the MM semaphore and the kernel lock held. + * We need to release the kernel lock as soon as possible.. */ -static void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, - unsigned long address, int write_access, pte_t *page_table, pte_t entry) +static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, + unsigned long address, int write_access, pte_t *page_table) { unsigned long page; + pte_t entry; + + if (!vma->vm_ops || !vma->vm_ops->nopage) { + unlock_kernel(); + return do_anonymous_page(tsk, vma, page_table, write_access, + address); + } - if (!pte_none(entry)) - goto swap_page; - address &= PAGE_MASK; - if (!vma->vm_ops || !vma->vm_ops->nopage) - goto anonymous_page; /* * The third argument is "no_share", which tells the low-level code * to copy, not share the page even if sharing is possible. It's - * essentially an early COW detection + * essentially an early COW detection. */ - page = vma->vm_ops->nopage(vma, address, + page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access); + + unlock_kernel(); if (!page) - goto sigbus; + return 0; + ++tsk->maj_flt; ++vma->vm_mm->rss; /* @@ -849,7 +868,6 @@ static void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, * so we can make it writable and dirty to avoid having to * handle that later. */ -/* do_no_page might already have flushed the page ... 
*/ flush_page_to_ram(page); entry = mk_pte(page, vma->vm_page_prot); if (write_access) { @@ -859,32 +877,7 @@ static void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, entry = pte_wrprotect(entry); put_page(page_table, entry); /* no need to invalidate: a not-present page shouldn't be cached */ - return; - -anonymous_page: - entry = pte_wrprotect(mk_pte(ZERO_PAGE(address), vma->vm_page_prot)); - if (write_access) { - unsigned long page = get_user_page(address); - if (!page) - goto sigbus; - clear_page(page); - entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - vma->vm_mm->rss++; - tsk->min_flt++; - flush_page_to_ram(page); - } - put_page(page_table, entry); - return; - -sigbus: - force_sig(SIGBUS, current); - put_page(page_table, BAD_PAGE); - /* no need to invalidate, wasn't present */ - return; - -swap_page: - do_swap_page(tsk, vma, address, page_table, entry, write_access); - return; + return 1; } /* @@ -896,54 +889,57 @@ swap_page: * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). */ -static inline void handle_pte_fault(struct task_struct *tsk, +static inline int handle_pte_fault(struct task_struct *tsk, struct vm_area_struct * vma, unsigned long address, int write_access, pte_t * pte) { - pte_t entry = *pte; + pte_t entry; + + lock_kernel(); + entry = *pte; if (!pte_present(entry)) { - do_no_page(tsk, vma, address, write_access, pte, entry); - return; + if (pte_none(entry)) + return do_no_page(tsk, vma, address, write_access, pte); + return do_swap_page(tsk, vma, address, pte, entry, write_access); } + entry = pte_mkyoung(entry); set_pte(pte, entry); flush_tlb_page(vma, address); - if (!write_access) - return; - if (pte_write(entry)) { + if (write_access) { + if (!pte_write(entry)) + return do_wp_page(tsk, vma, address, pte); + entry = pte_mkdirty(entry); set_pte(pte, entry); flush_tlb_page(vma, address); - return; } - do_wp_page(tsk, vma, address, write_access, pte); + unlock_kernel(); + return 1; } /* * By the time we get here, we already hold the mm semaphore */ -void handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma, +int handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma, unsigned long address, int write_access) { pgd_t *pgd; pmd_t *pmd; - pte_t *pte; pgd = pgd_offset(vma->vm_mm, address); pmd = pmd_alloc(pgd, address); - if (!pmd) - goto no_memory; - pte = pte_alloc(pmd, address); - if (!pte) - goto no_memory; - lock_kernel(); - handle_pte_fault(tsk, vma, address, write_access, pte); - unlock_kernel(); - update_mmu_cache(vma, address, *pte); - return; -no_memory: - oom(tsk); + if (pmd) { + pte_t * pte = pte_alloc(pmd, address); + if (pte) { + if (handle_pte_fault(tsk, vma, address, write_access, pte)) { + update_mmu_cache(vma, address, *pte); + return 1; + } + } + } + return 0; } /* diff --git a/mm/mlock.c b/mm/mlock.c index 527443946..1c9035095 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -3,20 +3,12 @@ * * (C) Copyright 1995 Linus Torvalds */ -#include -#include -#include -#include #include #include -#include #include -#include -#include #include #include -#include #include static inline int mlock_fixup_all(struct vm_area_struct * vma, int newflags) diff --git a/mm/mmap.c b/mm/mmap.c index 77b0c5d62..4cbdbe3ca 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3,24 +3,17 @@ * * Written by obz. 
*/ -#include -#include -#include -#include #include #include -#include #include -#include #include #include -#include +#include #include #include #include #include -#include #include /* description of effects of mapping type and prot in current implementation. @@ -57,6 +50,12 @@ int vm_enough_memory(long pages) * simple, it hopefully works in most obvious cases.. Easy to * fool it, but this should catch most mistakes. */ + /* 23/11/98 NJC: Somewhat less stupid version of algorithm, + * which tries to do "TheRightThing". Instead of using half of + * (buffers+cache), use the minimum values. Allow an extra 2% + * of num_physpages for safety margin. + */ + long free; /* Sometimes we want to use more memory than we have. */ @@ -65,10 +64,9 @@ int vm_enough_memory(long pages) free = buffermem >> PAGE_SHIFT; free += page_cache_size; - free >>= 1; free += nr_free_pages; free += nr_swap_pages; - free -= num_physpages >> 4; + free -= (page_cache.min_percent + buffer_mem.min_percent + 2)*num_physpages/100; return free > pages; } @@ -93,7 +91,21 @@ asmlinkage unsigned long sys_brk(unsigned long brk) struct mm_struct *mm = current->mm; down(&mm->mmap_sem); + + /* + * This lock-kernel is one of the main contention points for + * certain normal loads. And it really should not be here: almost + * everything in brk()/mmap()/munmap() is protected sufficiently by + * the mmap semaphore that we got above. + * + * We should move this into the few things that really want the + * lock, namely anything that actually touches a file descriptor + * etc. We can do all the normal anonymous mapping cases without + * ever getting the lock at all - the actual memory management + * code is already completely thread-safe. + */ lock_kernel(); + if (brk < mm->end_code) goto out; newbrk = PAGE_ALIGN(brk); @@ -162,7 +174,7 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, { struct mm_struct * mm = current->mm; struct vm_area_struct * vma; - int correct_wcount = 0, error; + int error; if ((len = PAGE_ALIGN(len)) == 0) return addr; @@ -286,30 +298,28 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, !vm_enough_memory(len >> PAGE_SHIFT)) goto free_vma; - error = 0; if (file) { + int correct_wcount = 0; if (vma->vm_flags & VM_DENYWRITE) { - if (file->f_dentry->d_inode->i_writecount > 0) + if (file->f_dentry->d_inode->i_writecount > 0) { error = -ETXTBSY; - else { - /* f_op->mmap might possibly sleep - * (generic_file_mmap doesn't, but other code - * might). In any case, this takes care of any - * race that this might cause. - */ - file->f_dentry->d_inode->i_writecount--; - correct_wcount = 1; + goto free_vma; } + /* f_op->mmap might possibly sleep + * (generic_file_mmap doesn't, but other code + * might). In any case, this takes care of any + * race that this might cause. 
+ */ + file->f_dentry->d_inode->i_writecount--; + correct_wcount = 1; } - if (!error) - error = file->f_op->mmap(file, vma); - + error = file->f_op->mmap(file, vma); + /* Fix up the count if necessary, then check for an error */ + if (correct_wcount) + file->f_dentry->d_inode->i_writecount++; + if (error) + goto unmap_and_free_vma; } - /* Fix up the count if necessary, then check for an error */ - if (correct_wcount) - file->f_dentry->d_inode->i_writecount++; - if (error) - goto free_vma; /* * merge_segments may merge our vma, so we can't refer to it @@ -327,6 +337,11 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, } return addr; +unmap_and_free_vma: + /* Undo any partial mapping done by a device driver. */ + flush_cache_range(mm, vma->vm_start, vma->vm_end); + zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); + flush_tlb_range(mm, vma->vm_start, vma->vm_end); free_vma: kmem_cache_free(vm_area_cachep, vma); return error; @@ -418,6 +433,7 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr, mpnt->vm_ops = area->vm_ops; mpnt->vm_offset = area->vm_offset + (end - area->vm_start); mpnt->vm_file = area->vm_file; + mpnt->vm_pte = area->vm_pte; if (mpnt->vm_file) mpnt->vm_file->f_count++; if (mpnt->vm_ops && mpnt->vm_ops->open) diff --git a/mm/mprotect.c b/mm/mprotect.c index cc78e10ab..b28237c09 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -3,20 +3,12 @@ * * (C) Copyright 1994 Linus Torvalds */ -#include -#include -#include -#include -#include +#include #include #include -#include #include -#include -#include #include -#include #include static inline void change_pte_range(pmd_t * pmd, unsigned long address, diff --git a/mm/mremap.c b/mm/mremap.c index cd7a7eb4a..a10870318 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -4,21 +4,13 @@ * (C) Copyright 1996 Linus Torvalds */ -#include -#include -#include -#include -#include +#include #include #include -#include #include -#include -#include #include #include -#include #include extern int vm_enough_memory(long pages); @@ -142,7 +134,6 @@ static inline unsigned long move_vma(struct vm_area_struct * vma, new_vma->vm_start = new_addr; new_vma->vm_end = new_addr+new_len; new_vma->vm_offset = vma->vm_offset + (addr - vma->vm_start); - new_vma->vm_file = vma->vm_file; if (new_vma->vm_file) new_vma->vm_file->f_count++; if (new_vma->vm_ops && new_vma->vm_ops->open) @@ -151,6 +142,11 @@ static inline unsigned long move_vma(struct vm_area_struct * vma, merge_segments(current->mm, new_vma->vm_start, new_vma->vm_end); do_munmap(addr, old_len); current->mm->total_vm += new_len >> PAGE_SHIFT; + if (new_vma->vm_flags & VM_LOCKED) { + current->mm->locked_vm += new_len >> PAGE_SHIFT; + make_pages_present(new_vma->vm_start, + new_vma->vm_end); + } return new_addr; } kmem_cache_free(vm_area_cachep, new_vma); @@ -224,8 +220,11 @@ asmlinkage unsigned long sys_mremap(unsigned long addr, int pages = (new_len - old_len) >> PAGE_SHIFT; vma->vm_end = addr + new_len; current->mm->total_vm += pages; - if (vma->vm_flags & VM_LOCKED) + if (vma->vm_flags & VM_LOCKED) { current->mm->locked_vm += pages; + make_pages_present(addr + old_len, + addr + new_len); + } ret = addr; goto out; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 70cad74eb..7ceec01b9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7,25 +7,16 @@ #include #include -#include -#include #include -#include -#include -#include #include -#include #include #include #include #include #include -#include /* for cli()/sti() */ #include /* 
for copy_to/from_user */ -#include #include -#include int nr_swap_pages = 0; int nr_free_pages = 0; @@ -163,9 +154,11 @@ void __free_page(struct page *page) free_pages_ok(page->map_nr, 0); return; } +#if 0 if (PageSwapCache(page) && atomic_read(&page->count) == 1) printk(KERN_WARNING "VM: Releasing swap cache page at %p", __builtin_return_address(0)); +#endif } void free_pages(unsigned long addr, unsigned long order) @@ -182,10 +175,12 @@ void free_pages(unsigned long addr, unsigned long order) free_pages_ok(map_nr, order); return; } +#if 0 if (PageSwapCache(map) && atomic_read(&map->count) == 1) printk(KERN_WARNING "VM: Releasing swap cache pages at %p", __builtin_return_address(0)); +#endif } } @@ -227,7 +222,6 @@ do { unsigned long size = 1 << high; \ map += size; \ } \ atomic_set(&map->count, 1); \ - map->age = PAGE_INITIAL_AGE; \ } while (0) unsigned long __get_free_pages(int gfp_mask, unsigned long order) @@ -264,14 +258,15 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order) spin_unlock_irqrestore(&page_alloc_lock, flags); /* - * If we failed to find anything, we'll return NULL, but we'll - * wake up kswapd _now_ ad even wait for it synchronously if - * we can.. This way we'll at least make some forward progress - * over time. + * If we can schedule, do so, and make sure to yield. + * We may be a real-time process, and if kswapd is + * waiting for us we need to allow it to run a bit. */ - wake_up(&kswapd_wait); - if (gfp_mask & __GFP_WAIT) + if (gfp_mask & __GFP_WAIT) { + current->policy |= SCHED_YIELD; schedule(); + } + nopage: return 0; } @@ -372,12 +367,12 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m * was due to a write access. */ void swap_in(struct task_struct * tsk, struct vm_area_struct * vma, - unsigned long address, pte_t * page_table, unsigned long entry, int write_access) + pte_t * page_table, unsigned long entry, int write_access) { unsigned long page; struct page *page_map; - page_map = read_swap_cache(entry, address); + page_map = read_swap_cache(entry); if (pte_val(*page_table) != entry) { if (page_map) @@ -404,8 +399,9 @@ void swap_in(struct task_struct * tsk, struct vm_area_struct * vma, /* The page is unshared, and we want write access. In this case, it is safe to tear down the swap cache and give the page over entirely to this process. */ - - delete_from_swap_cache(page_map); + + if (PageSwapCache(page_map)) + delete_from_swap_cache(page_map); set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)))); return; } diff --git a/mm/page_io.c b/mm/page_io.c index 44f592df8..2dd24facc 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -10,21 +10,13 @@ */ #include -#include -#include #include -#include -#include -#include #include -#include #include #include #include -#include /* for cli()/sti() */ #include /* for copy_to/from_user */ -#include #include static struct wait_queue * lock_queue = NULL; @@ -66,6 +58,11 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) printk("Internal error: bad swap-device\n"); return; } + + /* Don't allow too many pending pages in flight.. 
*/ + if (atomic_read(&nr_async_pages) > SWAP_CLUSTER_MAX) + wait = 1; + p = &swap_info[type]; offset = SWP_OFFSET(entry); if (offset >= p->max) { diff --git a/mm/slab.c b/mm/slab.c index d4be178a2..29680bd68 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -654,9 +654,9 @@ kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp) } slabp->s_magic = SLAB_MAGIC_DESTROYED; - kmem_freepages(cachep, slabp->s_mem-slabp->s_offset); if (slabp->s_index) kmem_cache_free(cachep->c_index_cachep, slabp->s_index); + kmem_freepages(cachep, slabp->s_mem-slabp->s_offset); if (SLAB_OFF_SLAB(cachep->c_flags)) kmem_cache_free(cache_slabp, slabp); } @@ -1194,7 +1194,6 @@ kmem_cache_grow(kmem_cache_t * cachep, int flags) cachep->c_dflags = SLAB_CFLGS_GROWN; cachep->c_growing++; -re_try: spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); /* A series of memory allocations for a new slab. @@ -1261,15 +1260,6 @@ opps1: kmem_freepages(cachep, objp); failed: spin_lock_irq(&cachep->c_spinlock); - if (local_flags != SLAB_ATOMIC && cachep->c_gfporder) { - /* For large order (>0) slabs, we try again. - * Needed because the gfp() functions are not good at giving - * out contiguous pages unless pushed (but do not push too hard). - */ - if (cachep->c_failures++ < 4 && cachep->c_freep == kmem_slab_end(cachep)) - goto re_try; - cachep->c_failures = 1; /* Memory is low, don't try as hard next time. */ - } cachep->c_growing--; spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); return 0; @@ -1448,8 +1438,10 @@ alloc_new_slab: } /* Couldn't grow, but some objs may have been freed. */ spin_lock_irq(&cachep->c_spinlock); - if (cachep->c_freep != kmem_slab_end(cachep)) - goto try_again; + if (cachep->c_freep != kmem_slab_end(cachep)) { + if ((flags & SLAB_ATOMIC) == 0) + goto try_again; + } } else { /* Very serious error - maybe panic() here? 
*/ kmem_report_alloc_err("Bad slab magic (corrupt)", cachep); diff --git a/mm/swap.c b/mm/swap.c index 1788021b9..1e2d8c36b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -14,22 +14,14 @@ */ #include -#include -#include #include -#include -#include -#include #include -#include #include #include #include #include -#include /* for cli()/sti() */ #include /* for copy_to/from_user */ -#include #include /* @@ -70,13 +62,13 @@ swapstat_t swapstats = {0}; buffer_mem_t buffer_mem = { 5, /* minimum percent buffer */ - 25, /* borrow percent buffer */ + 10, /* borrow percent buffer */ 60 /* maximum percent buffer */ }; buffer_mem_t page_cache = { 5, /* minimum percent page cache */ - 30, /* borrow percent page cache */ + 15, /* borrow percent page cache */ 75 /* maximum */ }; diff --git a/mm/swap_state.c b/mm/swap_state.c index 2aaf0c46b..e098974b2 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -8,19 +8,12 @@ */ #include -#include -#include #include -#include -#include -#include #include -#include #include #include #include -#include #include /* @@ -143,6 +136,50 @@ bad_unused: goto out; } +int swap_count(unsigned long entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + int retval = 0; + + if (!entry) + goto bad_entry; + type = SWP_TYPE(entry); + if (type & SHM_SWP_TYPE) + goto out; + if (type >= nr_swapfiles) + goto bad_file; + p = type + swap_info; + offset = SWP_OFFSET(entry); + if (offset >= p->max) + goto bad_offset; + if (!p->swap_map[offset]) + goto bad_unused; + retval = p->swap_map[offset]; +#ifdef DEBUG_SWAP + printk("DebugVM: swap_count(entry %08lx, count %d)\n", + entry, retval); +#endif +out: + return retval; + +bad_entry: + printk(KERN_ERR "swap_count: null entry!\n"); + goto out; +bad_file: + printk(KERN_ERR + "swap_count: entry %08lx, nonexistent swap file!\n", entry); + goto out; +bad_offset: + printk(KERN_ERR + "swap_count: entry %08lx, offset exceeds max!\n", entry); + goto out; +bad_unused: + printk(KERN_ERR + "swap_count at %8p: entry %08lx, unused page!\n", + __builtin_return_address(0), entry); + goto out; +} static inline void remove_from_swap_cache(struct page *page) { @@ -155,6 +192,7 @@ static inline void remove_from_swap_cache(struct page *page) printk ("VM: Removing swap cache page with wrong inode hash " "on page %08lx\n", page_address(page)); } +#if 0 /* * This is a legal case, but warn about it. */ @@ -163,6 +201,7 @@ static inline void remove_from_swap_cache(struct page *page) "VM: Removing page cache on unshared page %08lx\n", page_address(page)); } +#endif #ifdef DEBUG_SWAP printk("DebugVM: remove_from_swap_cache(%08lx count %d)\n", @@ -173,24 +212,25 @@ static inline void remove_from_swap_cache(struct page *page) } +/* + * This must be called only on pages that have + * been verified to be in the swap cache. 
+ */ void delete_from_swap_cache(struct page *page) { + long entry = page->offset; + #ifdef SWAP_CACHE_INFO swap_cache_del_total++; -#endif - if (PageSwapCache (page)) { - long entry = page->offset; -#ifdef SWAP_CACHE_INFO - swap_cache_del_success++; + swap_cache_del_success++; #endif #ifdef DEBUG_SWAP - printk("DebugVM: delete_from_swap_cache(%08lx count %d, " - "entry %08lx)\n", - page_address(page), atomic_read(&page->count), entry); + printk("DebugVM: delete_from_swap_cache(%08lx count %d, " + "entry %08lx)\n", + page_address(page), atomic_read(&page->count), entry); #endif - remove_from_swap_cache (page); - swap_free (entry); - } + remove_from_swap_cache (page); + swap_free (entry); } /* @@ -208,7 +248,7 @@ void free_page_and_swap_cache(unsigned long addr) delete_from_swap_cache(page); } - free_user_page(page, addr); + free_page(addr); } @@ -249,7 +289,7 @@ out_bad: * the swap entry is no longer in use. */ -struct page * read_swap_cache_async(unsigned long entry, unsigned long addr, int wait) +struct page * read_swap_cache_async(unsigned long entry, int wait) { struct page *found_page, *new_page; unsigned long new_page_addr; diff --git a/mm/swapfile.c b/mm/swapfile.c index b7446b3b5..c574fb59a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -5,25 +5,16 @@ * Swap reorganised 29.12.95, Stephen Tweedie */ -#include -#include +#include #include -#include -#include #include -#include -#include -#include #include -#include #include -#include #include /* for blk_size */ #include #include #include -#include #include unsigned int nr_swapfiles = 0; @@ -317,14 +308,14 @@ static int try_to_unuse(unsigned int type) /* Get a page for the entry, using the existing swap cache page if there is one. Otherwise, get a clean page and read the swap into it. */ - page_map = read_swap_cache(entry, 0); + page_map = read_swap_cache(entry); if (!page_map) { /* * Continue searching if the entry became unused. 
*/ if (si->swap_map[i] == 0) continue; - return -ENOMEM; + return -ENOMEM; } page = page_address(page_map); read_lock(&tasklist_lock); @@ -559,8 +550,17 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) if (p->swap_device == swap_info[i].swap_device) goto bad_swap; } - } else if (!S_ISREG(swap_dentry->d_inode->i_mode)) + } else if (S_ISREG(swap_dentry->d_inode->i_mode)) { + error = -EBUSY; + for (i = 0 ; i < nr_swapfiles ; i++) { + if (i == type) + continue; + if (p->swap_file == swap_info[i].swap_file) + goto bad_swap; + } + } else goto bad_swap; + swap_header = (void *) __get_free_page(GFP_USER); if (!swap_header) { printk("Unable to start swapping: out of memory :-)\n"); @@ -627,7 +627,7 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) p->max = swap_header->info.last_page; if (p->max >= 0x7fffffffL/PAGE_SIZE || - (void *) &swap_header->info.badpages[swap_header->info.nr_badpages-1] >= (void *) swap_header->magic.magic) { + (void *) &swap_header->info.badpages[(int) swap_header->info.nr_badpages-1] >= (void *) swap_header->magic.magic) { error = -EINVAL; goto bad_swap; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e7711c23c..e99ad35fb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -5,11 +5,9 @@ */ #include -#include #include #include -#include static struct vm_struct * vmlist = NULL; @@ -38,8 +36,7 @@ static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned lo if (pte_none(page)) continue; if (pte_present(page)) { - free_user_page(mem_map + MAP_NR(pte_page(page)), - pte_page(page)); + free_page(pte_page(page)); continue; } printk("Whee.. Swapped out page in kernel page table\n"); @@ -97,7 +94,7 @@ static inline int alloc_area_pte(pte_t * pte, unsigned long address, unsigned lo unsigned long page; if (!pte_none(*pte)) printk("alloc_area_pte: page already exists\n"); - page = get_user_page(address); + page = __get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; set_pte(pte, mk_pte(page, prot)); diff --git a/mm/vmscan.c b/mm/vmscan.c index 884e67150..c5efa52a2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -10,39 +10,20 @@ * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $ */ -#include -#include -#include +#include #include -#include -#include #include #include #include -#include -#include -#include #include #include -#include #include -/* - * When are we next due for a page scan? - */ -static unsigned long next_swap_jiffies = 0; - -/* - * How often do we do a pageout scan during normal conditions? - * Default is four times a second. - */ -int swapout_interval = HZ / 4; - /* * The wait queue for waking up the pageout daemon: */ -struct wait_queue * kswapd_wait = NULL; +static struct task_struct * kswapd_task = NULL; static void init_swap_timer(void); @@ -123,8 +104,13 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc } if (pte_young(pte)) { + /* + * Transfer the "accessed" bit from the page + * tables to the global page map. + */ set_pte(page_table, pte_mkold(pte)); - touch_page(page_map); + set_bit(PG_referenced, &page_map->flags); + /* * We should test here to see if we want to recover any * swap cache page here. 
We do this if the page seeing @@ -137,10 +123,6 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc return 0; } - age_page(page_map); - if (page_map->age) - return 0; - if (pte_dirty(pte)) { if (vma->vm_ops && vma->vm_ops->swapout) { pid_t pid = tsk->pid; @@ -180,7 +162,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc * copy in memory, so we add it to the swap * cache. */ if (PageSwapCache(page_map)) { - free_page_and_swap_cache(page); + free_page(page); return (atomic_read(&page_map->count) == 0); } add_to_swap_cache(page_map, entry); @@ -198,7 +180,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc * asynchronously. That's no problem, shrink_mmap() can * correctly clean up the occassional unshared page * which gets left behind in the swap cache. */ - free_page_and_swap_cache(page); + free_page(page); return 1; /* we slept: the process may not exist any more */ } @@ -212,7 +194,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc set_pte(page_table, __pte(entry)); flush_tlb_page(vma, address); swap_duplicate(entry); - free_page_and_swap_cache(page); + free_page(page); return (atomic_read(&page_map->count) == 0); } /* @@ -228,7 +210,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc flush_cache_page(vma, address); pte_clear(page_table); flush_tlb_page(vma, address); - entry = page_unuse(page_map); + entry = (atomic_read(&page_map->count) == 1); __free_page(page_map); return entry; } @@ -310,8 +292,9 @@ static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * } static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma, - pgd_t *pgdir, unsigned long start, int gfp_mask) + unsigned long address, int gfp_mask) { + pgd_t *pgdir; unsigned long end; /* Don't swap out areas like shared memory which have their @@ -319,12 +302,14 @@ static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma, if (vma->vm_flags & (VM_SHM | VM_LOCKED)) return 0; + pgdir = pgd_offset(tsk->mm, address); + end = vma->vm_end; - while (start < end) { - int result = swap_out_pgd(tsk, vma, pgdir, start, end, gfp_mask); + while (address < end) { + int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask); if (result) return result; - start = (start + PGDIR_SIZE) & PGDIR_MASK; + address = (address + PGDIR_SIZE) & PGDIR_MASK; pgdir++; } return 0; @@ -344,22 +329,23 @@ static int swap_out_process(struct task_struct * p, int gfp_mask) * Find the proper vm-area */ vma = find_vma(p->mm, address); - if (!vma) { - p->swap_address = 0; - return 0; + if (vma) { + if (address < vma->vm_start) + address = vma->vm_start; + + for (;;) { + int result = swap_out_vma(p, vma, address, gfp_mask); + if (result) + return result; + vma = vma->vm_next; + if (!vma) + break; + address = vma->vm_start; + } } - if (address < vma->vm_start) - address = vma->vm_start; - for (;;) { - int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, gfp_mask); - if (result) - return result; - vma = vma->vm_next; - if (!vma) - break; - address = vma->vm_start; - } + /* We didn't find anything for the process */ + p->swap_cnt = 0; p->swap_address = 0; return 0; } @@ -420,20 +406,12 @@ static int swap_out(unsigned int priority, int gfp_mask) } pbest->swap_cnt--; - switch (swap_out_process(pbest, gfp_mask)) { - case 0: - /* - * Clear swap_cnt so we don't look at this task - * again until we've tried all of the 
others. - * (We didn't block, so the task is still here.) - */ - pbest->swap_cnt = 0; - break; - case 1: - return 1; - default: - break; - }; + /* + * Nonzero means we cleared out something, but only "1" means + * that we actually free'd up a page as a result. + */ + if (swap_out_process(pbest, gfp_mask) == 1) + return 1; } out: return 0; @@ -448,19 +426,12 @@ static int do_try_to_free_page(int gfp_mask) { static int state = 0; int i=6; - int stop; /* Always trim SLAB caches when memory gets low. */ kmem_cache_reap(gfp_mask); - /* We try harder if we are waiting .. */ - stop = 3; - if (gfp_mask & __GFP_WAIT) - stop = 0; - - if (((buffermem >> PAGE_SHIFT) * 100 > buffer_mem.borrow_percent * num_physpages) - || (page_cache_size * 100 > page_cache.borrow_percent * num_physpages)) - shrink_mmap(i, gfp_mask); + if (buffer_over_borrow() || pgcache_over_borrow()) + state = 0; switch (state) { do { @@ -480,7 +451,7 @@ static int do_try_to_free_page(int gfp_mask) shrink_dcache_memory(i, gfp_mask); state = 0; i--; - } while ((i - stop) >= 0); + } while (i >= 0); } return 0; } @@ -510,10 +481,9 @@ void __init kswapd_setup(void) */ int kswapd(void *unused) { - struct wait_queue wait = { current, NULL }; current->session = 1; current->pgrp = 1; - sprintf(current->comm, "kswapd"); + strcpy(current->comm, "kswapd"); sigfillset(¤t->blocked); /* @@ -523,11 +493,12 @@ int kswapd(void *unused) */ lock_kernel(); - /* Give kswapd a realtime priority. */ - current->policy = SCHED_FIFO; - current->rt_priority = 32; /* Fixme --- we need to standardise our - namings for POSIX.4 realtime scheduling - priorities. */ + /* + * Set the base priority to something smaller than a + * regular process. We will scale up the priority + * dynamically depending on how much memory we need. + */ + current->priority = (DEF_PRIORITY * 2) / 3; /* * Tell the memory management that we're a "memory allocator", @@ -544,9 +515,9 @@ int kswapd(void *unused) current->flags |= PF_MEMALLOC; init_swap_timer(); - add_wait_queue(&kswapd_wait, &wait); + kswapd_task = current; while (1) { - int tries; + unsigned long end_time; current->state = TASK_INTERRUPTIBLE; flush_signals(current); @@ -554,39 +525,17 @@ int kswapd(void *unused) schedule(); swapstats.wakeups++; - /* - * Do the background pageout: be - * more aggressive if we're really - * low on free memory. - * - * We try page_daemon.tries_base times, divided by - * an 'urgency factor'. In practice this will mean - * a value of pager_daemon.tries_base / 8 or 4 = 64 - * or 128 pages at a time. - * This gives us 64 (or 128) * 4k * 4 (times/sec) = - * 1 (or 2) MB/s swapping bandwidth in low-priority - * background paging. This number rises to 8 MB/s - * when the priority is highest (but then we'll be - * woken up more often and the rate will be even - * higher). - */ - tries = pager_daemon.tries_base; - tries >>= 4*free_memory_available(); - + /* max one hundreth of a second */ + end_time = jiffies + (HZ-1)/100; do { - do_try_to_free_page(0); - /* - * Syncing large chunks is faster than swapping - * synchronously (less head movement). -- Rik. 
- */ - if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster) - run_task_queue(&tq_disk); - if (free_memory_available() > 1) + if (!do_try_to_free_page(0)) break; - } while (--tries > 0); + if (nr_free_pages > freepages.high + SWAP_CLUSTER_MAX) + break; + } while (time_before_eq(jiffies,end_time)); } /* As if we could ever get here - maybe we want to make this killable */ - remove_wait_queue(&kswapd_wait, &wait); + kswapd_task = NULL; unlock_kernel(); return 0; } @@ -620,42 +569,61 @@ int try_to_free_pages(unsigned int gfp_mask, int count) return retval; } +/* + * Wake up kswapd according to the priority + * 0 - no wakeup + * 1 - wake up as a low-priority process + * 2 - wake up as a normal process + * 3 - wake up as an almost real-time process + * + * This plays mind-games with the "goodness()" + * function in kernel/sched.c. + */ +static inline void kswapd_wakeup(struct task_struct *p, int priority) +{ + if (priority) { + p->counter = p->priority << priority; + wake_up_process(p); + } +} + /* * The swap_tick function gets called on every clock tick. */ void swap_tick(void) { - unsigned long now, want; - int want_wakeup = 0; - - want = next_swap_jiffies; - now = jiffies; + struct task_struct *p = kswapd_task; /* - * Examine the memory queues. Mark memory low - * if there is nothing available in the three - * highest queues. - * - * Schedule for wakeup if there isn't lots - * of free memory. + * Only bother to try to wake kswapd up + * if the task exists and can be woken. */ - switch (free_memory_available()) { - case 0: - want = now; - /* Fall through */ - case 1: - want_wakeup = 1; - default: - } - - if ((long) (now - want) >= 0) { - if (want_wakeup || (num_physpages * buffer_mem.max_percent) < (buffermem >> PAGE_SHIFT) * 100 - || (num_physpages * page_cache.max_percent < page_cache_size * 100)) { - /* Set the next wake-up time */ - next_swap_jiffies = now + swapout_interval; - wake_up(&kswapd_wait); - } + if (p && (p->state & TASK_INTERRUPTIBLE)) { + unsigned int pages; + int want_wakeup; + + /* + * Schedule for wakeup if there isn't lots + * of free memory or if there is too much + * of it used for buffers or pgcache. + * + * "want_wakeup" is our priority: 0 means + * not to wake anything up, while 3 means + * that we'd better give kswapd a realtime + * priority. + */ + want_wakeup = 0; + pages = nr_free_pages; + if (pages < freepages.high) + want_wakeup = 1; + if (pages < freepages.low) + want_wakeup = 2; + if (pages < freepages.min) + want_wakeup = 3; + + kswapd_wakeup(p,want_wakeup); } + timer_active |= (1<