Diffstat (limited to 'mm')
-rw-r--r--  mm/.cvsignore   |   1
-rw-r--r--  mm/filemap.c    | 187
-rw-r--r--  mm/memory.c     | 231
-rw-r--r--  mm/mlock.c      |  10
-rw-r--r--  mm/mmap.c       |  73
-rw-r--r--  mm/mprotect.c   |  10
-rw-r--r--  mm/mremap.c     |   4
-rw-r--r--  mm/page_alloc.c | 128
-rw-r--r--  mm/page_io.c    | 133
-rw-r--r--  mm/simp.c       |   4
-rw-r--r--  mm/slab.c       |  48
-rw-r--r--  mm/swap_state.c | 219
-rw-r--r--  mm/swapfile.c   | 169
-rw-r--r--  mm/vmscan.c     | 253
14 files changed, 954 insertions, 516 deletions
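A recurring pattern in this patch is the replacement of separate priority/dma/wait arguments with a single gfp_mask whose bits gate what the allocator and the reclaim paths are allowed to do: __GFP_WAIT to sleep, __GFP_IO to start disk I/O, __GFP_DMA to restrict the request to DMA-capable pages. The stand-alone sketch below shows the idiom outside the kernel; the flag values and helper names are illustrative placeholders, not the kernel's actual definitions.

#include <stdio.h>

/* Illustrative placeholder values -- not the kernel's real bit layout. */
#define __GFP_WAIT 0x01 /* caller may sleep */
#define __GFP_IO   0x02 /* caller may start disk I/O */
#define __GFP_DMA  0x04 /* only DMA-capable pages are acceptable */

/* Mimics the gates at the top of the reworked shrink_mmap():
 * skip non-DMA pages when DMA memory was requested, and only
 * attempt buffer-cache pages when starting I/O is permitted. */
static int may_reclaim(int gfp_mask, int page_is_dma, int page_has_buffers)
{
	if ((gfp_mask & __GFP_DMA) && !page_is_dma)
		return 0;	/* wrong kind of page for this request */
	if (page_has_buffers && !(gfp_mask & __GFP_IO))
		return 0;	/* freeing buffers may require I/O */
	return 1;
}

int main(void)
{
	int atomic_mask = 0;			/* GFP_ATOMIC-style: no sleep, no I/O */
	int kernel_mask = __GFP_WAIT | __GFP_IO; /* GFP_KERNEL-style request */

	printf("atomic, buffer page:      %d\n", may_reclaim(atomic_mask, 0, 1));
	printf("kernel, buffer page:      %d\n", may_reclaim(kernel_mask, 0, 1));
	printf("kernel DMA, non-DMA page: %d\n",
	       may_reclaim(kernel_mask | __GFP_DMA, 0, 0));
	return 0;
}

Collapsing the three parameters into one mask is what lets callers in this patch shrink to a single argument, as in shrink_mmap(int priority, int gfp_mask), kmem_cache_reap(int gfp_mask), and try_to_free_page(gfp_mask) below.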
diff --git a/mm/.cvsignore b/mm/.cvsignore index 4671378ae..857dd22e9 100644 --- a/mm/.cvsignore +++ b/mm/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/mm/filemap.c b/mm/filemap.c index 6d718c01d..7a4e20e21 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -25,6 +25,8 @@ #include <linux/smp.h> #include <linux/smp_lock.h> #include <linux/blkdev.h> +#include <linux/file.h> +#include <linux/swapctl.h> #include <asm/system.h> #include <asm/pgtable.h> @@ -115,7 +117,7 @@ repeat: } } -int shrink_mmap(int priority, int dma) +int shrink_mmap(int priority, int gfp_mask) { static unsigned long clock = 0; struct page * page; @@ -134,7 +136,7 @@ int shrink_mmap(int priority, int dma) if (PageLocked(page)) goto next; - if (dma && !PageDMA(page)) + if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) goto next; /* First of all, regenerate the page's referenced bit from any buffers in the page */ @@ -158,20 +160,31 @@ int shrink_mmap(int priority, int dma) switch (atomic_read(&page->count)) { case 1: - /* If it has been referenced recently, don't free it */ - if (test_and_clear_bit(PG_referenced, &page->flags)) - break; - - /* is it a page cache page? */ + /* is it a swap-cache or page-cache page? */ if (page->inode) { + if (test_and_clear_bit(PG_referenced, &page->flags)) { + touch_page(page); + break; + } + age_page(page); + if (page->age) + break; + if (PageSwapCache(page)) { + delete_from_swap_cache(page); + return 1; + } remove_page_from_hash_queue(page); remove_page_from_inode_queue(page); __free_page(page); return 1; } + /* It's not a cache page, so we don't do aging. + * If it has been referenced recently, don't free it */ + if (test_and_clear_bit(PG_referenced, &page->flags)) + break; /* is it a buffer cache page? */ - if (bh && try_to_free_buffer(bh, &bh, 6)) + if ((gfp_mask & __GFP_IO) && bh && try_to_free_buffer(bh, &bh, 6)) return 1; break; @@ -208,6 +221,8 @@ unsigned long page_unuse(unsigned long page) return count; if (!p->inode) return count; + if (PageSwapCache(p)) + panic ("Doing a normal page_unuse of a swap cache page"); remove_page_from_hash_queue(p); remove_page_from_inode_queue(p); free_page(page); @@ -260,8 +275,10 @@ static inline void add_to_page_cache(struct page * page, * that we could use for the cache (if it is 0 we can try to create one, * this is all overlapped with the IO on the previous page finishing anyway) */ -static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offset, unsigned long page_cache) +static unsigned long try_to_read_ahead(struct file * file, + unsigned long offset, unsigned long page_cache) { + struct inode *inode = file->f_dentry->d_inode; struct page * page; struct page ** hash; @@ -282,7 +299,7 @@ static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offse */ page = mem_map + MAP_NR(page_cache); add_to_page_cache(page, inode, offset, hash); - inode->i_op->readpage(inode, page); + inode->i_op->readpage(file, page); page_cache = 0; } release_page(page); @@ -299,18 +316,20 @@ static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offse */ void __wait_on_page(struct page *page) { - struct wait_queue wait = { current, NULL }; + struct task_struct *tsk = current; + struct wait_queue wait; + wait.task = tsk; add_wait_queue(&page->wait, &wait); repeat: + tsk->state = TASK_UNINTERRUPTIBLE; run_task_queue(&tq_disk); - current->state = TASK_UNINTERRUPTIBLE; if (PageLocked(page)) { schedule(); goto repeat; } + tsk->state = TASK_RUNNING; remove_wait_queue(&page->wait, &wait); - current->state = 
TASK_RUNNING; } #if 0 @@ -436,16 +455,6 @@ static void profile_readahead(int async, struct file *filp) * 64k if defined (4K page size assumed). */ -#define PageAlignSize(size) (((size) + PAGE_SIZE -1) & PAGE_MASK) - -#if 0 /* small readahead */ -#define MAX_READAHEAD PageAlignSize(4096*7) -#define MIN_READAHEAD PageAlignSize(4096*2) -#else /* large readahead */ -#define MAX_READAHEAD PageAlignSize(4096*18) -#define MIN_READAHEAD PageAlignSize(4096*3) -#endif - static inline int get_max_readahead(struct inode * inode) { if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)]) @@ -453,9 +462,9 @@ static inline int get_max_readahead(struct inode * inode) return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)]; } -static inline unsigned long generic_file_readahead(int reada_ok, struct file * filp, struct inode * inode, - unsigned long ppos, struct page * page, - unsigned long page_cache) +static inline unsigned long generic_file_readahead(int reada_ok, + struct file * filp, struct inode * inode, + unsigned long ppos, struct page * page, unsigned long page_cache) { unsigned long max_ahead, ahead; unsigned long raend; @@ -519,7 +528,8 @@ static inline unsigned long generic_file_readahead(int reada_ok, struct file * f ahead = 0; while (ahead < max_ahead) { ahead += PAGE_SIZE; - page_cache = try_to_read_ahead(inode, raend + ahead, page_cache); + page_cache = try_to_read_ahead(filp, raend + ahead, + page_cache); } /* * If we tried to read ahead some pages, @@ -567,7 +577,8 @@ static inline unsigned long generic_file_readahead(int reada_ok, struct file * f ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos) { - struct inode *inode = filp->f_dentry->d_inode; + struct dentry *dentry = filp->f_dentry; + struct inode *inode = dentry->d_inode; ssize_t error, read; size_t pos, pgpos, page_cache; int reada_ok; @@ -724,7 +735,7 @@ no_cached_page: if (reada_ok && filp->f_ramax > MIN_READAHEAD) filp->f_ramax = MIN_READAHEAD; - error = inode->i_op->readpage(inode, page); + error = inode->i_op->readpage(filp, page); if (!error) goto found_page; release_page(page); @@ -736,7 +747,7 @@ page_read_error: * Try to re-read it _once_. We do this synchronously, * because this happens only if there were errors. */ - error = inode->i_op->readpage(inode, page); + error = inode->i_op->readpage(filp, page); if (!error) { wait_on_page(page); if (PageUptodate(page) && !PageError(page)) @@ -751,7 +762,7 @@ page_read_error: filp->f_reada = 1; if (page_cache) free_page(page_cache); - UPDATE_ATIME(inode) + UPDATE_ATIME(inode); if (!read) read = error; return read; @@ -771,11 +782,11 @@ page_read_error: */ static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share) { -/* XXX: Check the flushes in this code. At least sometimes we do - duplicate flushes. ... 
*/ + struct file * file = area->vm_file; + struct dentry * dentry = file->f_dentry; + struct inode * inode = dentry->d_inode; unsigned long offset; struct page * page, **hash; - struct inode * inode = area->vm_dentry->d_inode; unsigned long old_page, new_page; new_page = 0; @@ -856,14 +867,14 @@ no_cached_page: new_page = 0; add_to_page_cache(page, inode, offset, hash); - if (inode->i_op->readpage(inode, page) != 0) + if (inode->i_op->readpage(file, page) != 0) goto failure; /* * Do a very limited read-ahead if appropriate */ if (PageLocked(page)) - new_page = try_to_read_ahead(inode, offset + PAGE_SIZE, 0); + new_page = try_to_read_ahead(file, offset + PAGE_SIZE, 0); goto found_page; page_locked_wait: @@ -878,7 +889,7 @@ page_read_error: * because there really aren't any performance issues here * and we need to check for errors. */ - if (inode->i_op->readpage(inode, page) != 0) + if (inode->i_op->readpage(file, page) != 0) goto failure; wait_on_page(page); if (PageError(page)) @@ -907,6 +918,7 @@ static inline int do_write_page(struct inode * inode, struct file * file, { int retval; unsigned long size; + loff_t loff = offset; mm_segment_t old_fs; size = offset + PAGE_SIZE; @@ -922,8 +934,7 @@ static inline int do_write_page(struct inode * inode, struct file * file, old_fs = get_fs(); set_fs(KERNEL_DS); retval = -EIO; - if (size == file->f_op->write(file, (const char *) page, - size, &file->f_pos)) + if (size == file->f_op->write(file, (const char *) page, size, &loff)) retval = 0; set_fs(old_fs); return retval; @@ -934,7 +945,7 @@ static int filemap_write_page(struct vm_area_struct * vma, unsigned long page) { int result; - struct file file; + struct file * file; struct dentry * dentry; struct inode * inode; struct buffer_head * bh; @@ -954,27 +965,21 @@ static int filemap_write_page(struct vm_area_struct * vma, return 0; } - dentry = vma->vm_dentry; + file = vma->vm_file; + dentry = file->f_dentry; inode = dentry->d_inode; - file.f_op = inode->i_op->default_file_ops; - if (!file.f_op->write) + if (!file->f_op->write) return -EIO; - file.f_mode = 3; - file.f_flags = 0; - file.f_count = 1; - file.f_dentry = dentry; - file.f_pos = offset; - file.f_reada = 0; /* * If a task terminates while we're swapping the page, the vma and - * and dentry could be released ... increment the count to be safe. + * and file could be released ... increment the count to be safe. 
*/ - dget(dentry); + file->f_count++; down(&inode->i_sem); - result = do_write_page(inode, &file, (const char *) page, offset); + result = do_write_page(inode, file, (const char *) page, offset); up(&inode->i_sem); - dput(dentry); + fput(file); return result; } @@ -1209,7 +1214,8 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) if (!inode->i_op || !inode->i_op->readpage) return -ENOEXEC; UPDATE_ATIME(inode); - vma->vm_dentry = dget(file->f_dentry); + vma->vm_file = file; + file->f_count++; vma->vm_ops = ops; return 0; } @@ -1222,15 +1228,16 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) static int msync_interval(struct vm_area_struct * vma, unsigned long start, unsigned long end, int flags) { - if (vma->vm_dentry && vma->vm_ops && vma->vm_ops->sync) { + if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) { int error; error = vma->vm_ops->sync(vma, start, end-start, flags); if (!error && (flags & MS_SYNC)) { - struct dentry * dentry = vma->vm_dentry; - if (dentry) { + struct file * file = vma->vm_file; + if (file) { + struct dentry * dentry = file->f_dentry; struct inode * inode = dentry->d_inode; down(&inode->i_sem); - error = file_fsync(NULL,dentry); + error = file_fsync(file, dentry); up(&inode->i_sem); } } @@ -1315,7 +1322,8 @@ ssize_t generic_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) { - struct inode *inode = file->f_dentry->d_inode; + struct dentry *dentry = file->f_dentry; + struct inode *inode = dentry->d_inode; struct page *page, **hash; unsigned long page_cache = 0; unsigned long pgpos, offset; @@ -1349,11 +1357,10 @@ generic_file_write(struct file *file, const char *buf, if (!(page = __find_page(inode, pgpos, *hash))) { if (!page_cache) { page_cache = __get_free_page(GFP_KERNEL); - if (!page_cache) { - status = -ENOMEM; - break; - } - continue; + if (page_cache) + continue; + status = -ENOMEM; + break; } page = mem_map + MAP_NR(page_cache); add_to_page_cache(page, inode, pgpos, hash); @@ -1361,36 +1368,47 @@ generic_file_write(struct file *file, const char *buf, } /* - * WSH 06/05/97: restructured slightly to make sure we release - * the page on an error exit. Removed explicit setting of - * PG_locked, as that's handled below the i_op->xxx interface. + * Note: setting of the PG_locked bit is handled + * below the i_op->xxx interface. */ didread = 0; page_wait: wait_on_page(page); + if (PageUptodate(page)) + goto do_update_page; /* - * If the page is not uptodate, and we're writing less + * The page is not up-to-date ... if we're writing less * than a full page of data, we may have to read it first. - * However, don't bother with reading the page when it's - * after the current end of file. + * But if the page is past the current end of file, we must + * clear it before updating. */ - if (!PageUptodate(page)) { - if (bytes < PAGE_SIZE && pgpos < inode->i_size) { - if (didread < 2) - status = inode->i_op->readpage(inode, page); - else - status = -EIO; /* two tries ... error out */ + if (bytes < PAGE_SIZE) { + if (pgpos < inode->i_size) { + status = -EIO; + if (didread >= 2) + goto done_with_page; + status = inode->i_op->readpage(file, page); if (status < 0) goto done_with_page; didread++; goto page_wait; + } else { + /* Must clear for partial writes */ + memset((void *) page_address(page), 0, + PAGE_SIZE); } - set_bit(PG_uptodate, &page->flags); } + /* + * N.B. We should defer setting PG_uptodate at least until + * the data is copied. 
A failure in i_op->updatepage() could + * leave the page with garbage data. + */ + set_bit(PG_uptodate, &page->flags); +do_update_page: /* Alright, the page is there. Now update it. */ - status = inode->i_op->updatepage(inode, page, buf, + status = inode->i_op->updatepage(file, page, buf, offset, bytes, sync); done_with_page: __free_page(page); @@ -1408,9 +1426,7 @@ done_with_page: if (page_cache) free_page(page_cache); - if (written) - return written; - return status; + return written ? written : status; } /* @@ -1429,7 +1445,7 @@ unsigned long get_cached_page(struct inode * inode, unsigned long offset, { struct page * page; struct page ** hash; - unsigned long page_cache; + unsigned long page_cache = 0; hash = page_hash(inode, offset); page = __find_page(inode, offset, *hash); @@ -1443,14 +1459,15 @@ unsigned long get_cached_page(struct inode * inode, unsigned long offset, add_to_page_cache(page, inode, offset, hash); } if (atomic_read(&page->count) != 2) - printk("get_cached_page: page count=%d\n", + printk(KERN_ERR "get_cached_page: page count=%d\n", atomic_read(&page->count)); if (test_bit(PG_locked, &page->flags)) - printk("get_cached_page: page already locked!\n"); + printk(KERN_ERR "get_cached_page: page already locked!\n"); set_bit(PG_locked, &page->flags); + page_cache = page_address(page); out: - return page_address(page); + return page_cache; } /* diff --git a/mm/memory.c b/mm/memory.c index 82ed6c986..66cdf0bc1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -175,100 +175,16 @@ int new_page_tables(struct task_struct * tsk) return 0; } -static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte, int cow) -{ - pte_t pte = *old_pte; - unsigned long page_nr; - - if (pte_none(pte)) - return; - if (!pte_present(pte)) { - swap_duplicate(pte_val(pte)); - set_pte(new_pte, pte); - return; - } - page_nr = MAP_NR(pte_page(pte)); - if (page_nr >= max_mapnr || PageReserved(mem_map+page_nr)) { - set_pte(new_pte, pte); - return; - } - if (cow) - pte = pte_wrprotect(pte); - if (delete_from_swap_cache(&mem_map[page_nr])) - pte = pte_mkdirty(pte); - set_pte(new_pte, pte_mkold(pte)); - set_pte(old_pte, pte); - atomic_inc(&mem_map[page_nr].count); -} - -static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size, int cow) -{ - pte_t * src_pte, * dst_pte; - unsigned long end; - - if (pmd_none(*src_pmd)) - return 0; - if (pmd_bad(*src_pmd)) { - printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd)); - pmd_clear(src_pmd); - return 0; - } - src_pte = pte_offset(src_pmd, address); - if (pmd_none(*dst_pmd)) { - if (!pte_alloc(dst_pmd, 0)) - return -ENOMEM; - } - dst_pte = pte_offset(dst_pmd, address); - address &= ~PMD_MASK; - end = address + size; - if (end >= PMD_SIZE) - end = PMD_SIZE; - do { - /* I would like to switch arguments here, to make it - * consistent with copy_xxx_range and memcpy syntax. 
- */ - copy_one_pte(src_pte++, dst_pte++, cow); - address += PAGE_SIZE; - } while (address < end); - return 0; -} - -static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size, int cow) -{ - pmd_t * src_pmd, * dst_pmd; - unsigned long end; - int error = 0; - - if (pgd_none(*src_pgd)) - return 0; - if (pgd_bad(*src_pgd)) { - printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd)); - pgd_clear(src_pgd); - return 0; - } - src_pmd = pmd_offset(src_pgd, address); - if (pgd_none(*dst_pgd)) { - if (!pmd_alloc(dst_pgd, 0)) - return -ENOMEM; - } - dst_pmd = pmd_offset(dst_pgd, address); - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - do { - error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address, cow); - if (error) - break; - address = (address + PMD_SIZE) & PMD_MASK; - } while (address < end); - return error; -} +#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t)) +#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. + * + * 08Jan98 Merged into one routine from several inline routines to reduce + * variable count and make things faster. -jj */ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) @@ -276,18 +192,105 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, pgd_t * src_pgd, * dst_pgd; unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; - int error = 0, cow; + unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE; + + src_pgd = pgd_offset(src, address)-1; + dst_pgd = pgd_offset(dst, address)-1; + + for (;;) { + pmd_t * src_pmd, * dst_pmd; + + src_pgd++; dst_pgd++; + + /* copy_pmd_range */ + + if (pgd_none(*src_pgd)) + goto skip_copy_pmd_range; + if (pgd_bad(*src_pgd)) { + printk("copy_pmd_range: bad pgd (%08lx)\n", + pgd_val(*src_pgd)); + pgd_clear(src_pgd); +skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; + if (address >= end) + goto out; + continue; + } + if (pgd_none(*dst_pgd)) { + if (!pmd_alloc(dst_pgd, 0)) + goto nomem; + } + + src_pmd = pmd_offset(src_pgd, address); + dst_pmd = pmd_offset(dst_pgd, address); + + do { + pte_t * src_pte, * dst_pte; + + /* copy_pte_range */ + + if (pmd_none(*src_pmd)) + goto skip_copy_pte_range; + if (pmd_bad(*src_pmd)) { + printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd)); + pmd_clear(src_pmd); +skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; + if (address >= end) + goto out; + goto cont_copy_pmd_range; + } + if (pmd_none(*dst_pmd)) { + if (!pte_alloc(dst_pmd, 0)) + goto nomem; + } + + src_pte = pte_offset(src_pmd, address); + dst_pte = pte_offset(dst_pmd, address); + + do { + pte_t pte = *src_pte; + unsigned long page_nr; + + /* copy_one_pte */ + + if (pte_none(pte)) + goto cont_copy_pte_range; + if (!pte_present(pte)) { + swap_duplicate(pte_val(pte)); + set_pte(dst_pte, pte); + goto cont_copy_pte_range; + } + page_nr = MAP_NR(pte_page(pte)); + if (page_nr >= max_mapnr || + PageReserved(mem_map+page_nr)) { + set_pte(dst_pte, pte); + goto cont_copy_pte_range; + } + if (cow) + pte = pte_wrprotect(pte); +#if 0 /* No longer needed with the new swap cache code */ + if (delete_from_swap_cache(&mem_map[page_nr])) + pte = pte_mkdirty(pte); +#endif + set_pte(dst_pte, pte_mkold(pte)); + set_pte(src_pte, pte); + 
atomic_inc(&mem_map[page_nr].count); + +cont_copy_pte_range: address += PAGE_SIZE; + if (address >= end) + goto out; + src_pte++; + dst_pte++; + } while ((unsigned long)src_pte & PTE_TABLE_MASK); + +cont_copy_pmd_range: src_pmd++; + dst_pmd++; + } while ((unsigned long)src_pmd & PMD_TABLE_MASK); + } +out: + return 0; - cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE; - src_pgd = pgd_offset(src, address); - dst_pgd = pgd_offset(dst, address); - while (address < end) { - error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address, cow); - if (error) - break; - address = (address + PGDIR_SIZE) & PGDIR_MASK; - } - return error; +nomem: + return -ENOMEM; } /* @@ -299,7 +302,11 @@ static inline int free_pte(pte_t page) unsigned long addr = pte_page(page); if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr))) return 0; - free_page(addr); + /* + * free_page() used to be able to clear swap cache + * entries. We may now have to do it manually. + */ + free_page_and_swap_cache(addr); return 1; } swap_free(pte_val(page)); @@ -542,7 +549,7 @@ int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long static void put_page(pte_t * page_table, pte_t pte) { if (!pte_none(*page_table)) { - free_page(pte_page(pte)); + free_page_and_swap_cache(pte_page(pte)); return; } /* no need for flush_tlb */ @@ -609,9 +616,13 @@ static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, { pte_t pte; unsigned long old_page, new_page; - - new_page = __get_free_page(GFP_KERNEL); + struct page * page_map; + pte = *page_table; + new_page = __get_free_page(GFP_KERNEL); + /* Did someone else copy this page for us while we slept? */ + if (pte_val(*page_table) != pte_val(pte)) + goto end_wp_page; if (!pte_present(pte)) goto end_wp_page; if (pte_write(pte)) @@ -620,10 +631,12 @@ static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, if (MAP_NR(old_page) >= max_mapnr) goto bad_wp_page; tsk->min_flt++; + page_map = mem_map + MAP_NR(old_page); + /* * Do we need to copy? 
*/ - if (atomic_read(&mem_map[MAP_NR(old_page)].count) != 1) { + if (is_page_shared(page_map)) { if (new_page) { if (PageReserved(mem_map + MAP_NR(old_page))) ++vma->vm_mm->rss; @@ -643,6 +656,8 @@ static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, oom(tsk); return; } + if (PageSwapCache(page_map)) + delete_from_swap_cache(page_map); flush_cache_page(vma, address); set_pte(page_table, pte_mkdirty(pte_mkwrite(pte))); flush_tlb_page(vma, address); @@ -867,12 +882,14 @@ static inline void handle_pte_fault(struct task_struct *tsk, do_no_page(tsk, vma, address, write_access, pte, entry); return; } - set_pte(pte, pte_mkyoung(entry)); + entry = pte_mkyoung(entry); + set_pte(pte, entry); flush_tlb_page(vma, address); if (!write_access) return; if (pte_write(entry)) { - set_pte(pte, pte_mkdirty(*pte)); + entry = pte_mkdirty(entry); + set_pte(pte, entry); flush_tlb_page(vma, address); return; } diff --git a/mm/mlock.c b/mm/mlock.c index eea100add..5bffab93f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -38,7 +38,8 @@ static inline int mlock_fixup_start(struct vm_area_struct * vma, n->vm_end = end; vma->vm_offset += vma->vm_start - n->vm_start; n->vm_flags = newflags; - n->vm_dentry = dget(vma->vm_dentry); + if (n->vm_file) + n->vm_file->f_count++; if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); insert_vm_struct(current->mm, n); @@ -58,7 +59,8 @@ static inline int mlock_fixup_end(struct vm_area_struct * vma, n->vm_start = start; n->vm_offset += n->vm_start - vma->vm_start; n->vm_flags = newflags; - n->vm_dentry = dget(vma->vm_dentry); + if (n->vm_file) + n->vm_file->f_count++; if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); insert_vm_struct(current->mm, n); @@ -87,8 +89,8 @@ static inline int mlock_fixup_middle(struct vm_area_struct * vma, vma->vm_offset += vma->vm_start - left->vm_start; right->vm_offset += right->vm_start - left->vm_start; vma->vm_flags = newflags; - if (vma->vm_dentry) - vma->vm_dentry->d_count += 2; + if (vma->vm_file) + vma->vm_file->f_count += 2; if (vma->vm_ops && vma->vm_ops->open) { vma->vm_ops->open(left); @@ -17,6 +17,7 @@ #include <linux/smp.h> #include <linux/smp_lock.h> #include <linux/init.h> +#include <linux/file.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -74,11 +75,11 @@ int vm_enough_memory(long pages) /* Remove one vm structure from the inode's i_mmap ring. */ static inline void remove_shared_vm_struct(struct vm_area_struct *vma) { - struct dentry * dentry = vma->vm_dentry; + struct file * file = vma->vm_file; - if (dentry) { + if (file) { if (vma->vm_flags & VM_DENYWRITE) - dentry->d_inode->i_writecount++; + file->f_dentry->d_inode->i_writecount++; if(vma->vm_next_share) vma->vm_next_share->vm_pprev_share = vma->vm_pprev_share; *vma->vm_pprev_share = vma->vm_next_share; @@ -173,6 +174,10 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, if (off + len < off) return -EINVAL; + /* Too many mappings? */ + if (mm->map_count > MAX_MAP_COUNT) + return -ENOMEM; + /* mlock MCL_FUTURE? 
*/ if (mm->def_flags & VM_LOCKED) { unsigned long locked = mm->locked_vm << PAGE_SHIFT; @@ -257,7 +262,7 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f]; vma->vm_ops = NULL; vma->vm_offset = off; - vma->vm_dentry = NULL; + vma->vm_file = NULL; vma->vm_pte = 0; /* Clear old maps */ @@ -390,8 +395,8 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr, if (addr == area->vm_start && end == area->vm_end) { if (area->vm_ops && area->vm_ops->close) area->vm_ops->close(area); - if (area->vm_dentry) - dput(area->vm_dentry); + if (area->vm_file) + fput(area->vm_file); return 0; } @@ -414,7 +419,9 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr, mpnt->vm_flags = area->vm_flags; mpnt->vm_ops = area->vm_ops; mpnt->vm_offset = area->vm_offset + (end - area->vm_start); - mpnt->vm_dentry = dget(area->vm_dentry); + mpnt->vm_file = area->vm_file; + if (mpnt->vm_file) + mpnt->vm_file->f_count++; if (mpnt->vm_ops && mpnt->vm_ops->open) mpnt->vm_ops->open(mpnt); area->vm_end = addr; /* Truncate area */ @@ -452,6 +459,7 @@ asmlinkage int sys_munmap(unsigned long addr, size_t len) */ int do_munmap(unsigned long addr, size_t len) { + struct mm_struct * mm; struct vm_area_struct *mpnt, *next, *free, *extra; int freed; @@ -466,7 +474,8 @@ int do_munmap(unsigned long addr, size_t len) * every area affected in some way (by any overlap) is put * on the list. If nothing is put on, nothing is affected. */ - mpnt = current->mm->mmap; + mm = current->mm; + mpnt = mm->mmap; while(mpnt && mpnt->vm_end <= addr) mpnt = mpnt->vm_next; if (!mpnt) @@ -496,6 +505,13 @@ int do_munmap(unsigned long addr, size_t len) mpnt = next; } + if (free && (free->vm_start < addr) && (free->vm_end > addr+len)) { + if (mm->map_count > MAX_MAP_COUNT) { + kmem_cache_free(vm_area_cachep, extra); + return -ENOMEM; + } + } + /* Ok - we have the memory areas we should free on the 'free' list, * so release them, and unmap the page range.. * If the one of the segments is only being partially unmapped, @@ -508,6 +524,7 @@ int do_munmap(unsigned long addr, size_t len) free = free->vm_next; freed = 1; + mm->map_count--; remove_shared_vm_struct(mpnt); st = addr < mpnt->vm_start ? mpnt->vm_start : addr; @@ -518,9 +535,9 @@ int do_munmap(unsigned long addr, size_t len) if (mpnt->vm_ops && mpnt->vm_ops->unmap) mpnt->vm_ops->unmap(mpnt, st, size); - flush_cache_range(current->mm, st, end); - zap_page_range(current->mm, st, size); - flush_tlb_range(current->mm, st, end); + flush_cache_range(mm, st, end); + zap_page_range(mm, st, size); + flush_tlb_range(mm, st, end); /* * Fix the mapping, and free the old area if it wasn't reused. @@ -534,7 +551,7 @@ int do_munmap(unsigned long addr, size_t len) kmem_cache_free(vm_area_cachep, extra); if (freed) - current->mm->mmap_cache = NULL; /* Kill the cache. */ + mm->mmap_cache = NULL; /* Kill the cache. 
*/ return 0; } @@ -560,13 +577,18 @@ void exit_mmap(struct mm_struct * mm) if (mpnt->vm_ops->close) mpnt->vm_ops->close(mpnt); } + mm->map_count--; remove_shared_vm_struct(mpnt); zap_page_range(mm, start, size); - if (mpnt->vm_dentry) - dput(mpnt->vm_dentry); + if (mpnt->vm_file) + fput(mpnt->vm_file); kmem_cache_free(vm_area_cachep, mpnt); mpnt = next; } + + /* This is just debugging */ + if (mm->map_count) + printk("exit_mmap: map count is %d\n", mm->map_count); } /* Insert vm structure into process list sorted by address @@ -575,7 +597,9 @@ void exit_mmap(struct mm_struct * mm) void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp) { struct vm_area_struct **pprev = &mm->mmap; - struct dentry * dentry; + struct file * file; + + mm->map_count++; /* Find where to link it in. */ while(*pprev && (*pprev)->vm_start <= vmp->vm_start) @@ -587,9 +611,9 @@ void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp) *pprev = vmp; vmp->vm_pprev = pprev; - dentry = vmp->vm_dentry; - if (dentry) { - struct inode * inode = dentry->d_inode; + file = vmp->vm_file; + if (file) { + struct inode * inode = file->f_dentry->d_inode; if (vmp->vm_flags & VM_DENYWRITE) inode->i_writecount--; @@ -636,8 +660,8 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l for ( ; mpnt && prev->vm_start < end_addr ; prev = mpnt, mpnt = next) { next = mpnt->vm_next; - /* To share, we must have the same dentry, operations.. */ - if ((mpnt->vm_dentry != prev->vm_dentry)|| + /* To share, we must have the same file, operations.. */ + if ((mpnt->vm_file != prev->vm_file)|| (mpnt->vm_pte != prev->vm_pte) || (mpnt->vm_ops != prev->vm_ops) || (mpnt->vm_flags != prev->vm_flags) || @@ -645,10 +669,10 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l continue; /* - * If we have a dentry or it's a shared memory area + * If we have a file or it's a shared memory area * the offsets must be contiguous.. 
*/ - if ((mpnt->vm_dentry != NULL) || (mpnt->vm_flags & VM_SHM)) { + if ((mpnt->vm_file != NULL) || (mpnt->vm_flags & VM_SHM)) { unsigned long off = prev->vm_offset+prev->vm_end-prev->vm_start; if (off != mpnt->vm_offset) continue; @@ -668,9 +692,10 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l mpnt->vm_start = mpnt->vm_end; mpnt->vm_ops->close(mpnt); } + mm->map_count--; remove_shared_vm_struct(mpnt); - if (mpnt->vm_dentry) - dput(mpnt->vm_dentry); + if (mpnt->vm_file) + fput(mpnt->vm_file); kmem_cache_free(vm_area_cachep, mpnt); mpnt = prev; } diff --git a/mm/mprotect.c b/mm/mprotect.c index ddf4f4ed6..a34225d83 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -110,7 +110,8 @@ static inline int mprotect_fixup_start(struct vm_area_struct * vma, vma->vm_offset += vma->vm_start - n->vm_start; n->vm_flags = newflags; n->vm_page_prot = prot; - n->vm_dentry = dget(n->vm_dentry); + if (n->vm_file) + n->vm_file->f_count++; if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); insert_vm_struct(current->mm, n); @@ -132,7 +133,8 @@ static inline int mprotect_fixup_end(struct vm_area_struct * vma, n->vm_offset += n->vm_start - vma->vm_start; n->vm_flags = newflags; n->vm_page_prot = prot; - n->vm_dentry = dget(n->vm_dentry); + if (n->vm_file) + n->vm_file->f_count++; if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); insert_vm_struct(current->mm, n); @@ -163,8 +165,8 @@ static inline int mprotect_fixup_middle(struct vm_area_struct * vma, right->vm_offset += right->vm_start - left->vm_start; vma->vm_flags = newflags; vma->vm_page_prot = prot; - if (vma->vm_dentry) - vma->vm_dentry->d_count += 2; + if (vma->vm_file) + vma->vm_file->f_count += 2; if (vma->vm_ops && vma->vm_ops->open) { vma->vm_ops->open(left); vma->vm_ops->open(right); diff --git a/mm/mremap.c b/mm/mremap.c index aaabde322..a31a0ae14 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -140,7 +140,9 @@ static inline unsigned long move_vma(struct vm_area_struct * vma, new_vma->vm_start = new_addr; new_vma->vm_end = new_addr+new_len; new_vma->vm_offset = vma->vm_offset + (addr - vma->vm_start); - new_vma->vm_dentry = dget(vma->vm_dentry); + new_vma->vm_file = vma->vm_file; + if (new_vma->vm_file) + new_vma->vm_file->f_count++; if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); insert_vm_struct(current->mm, new_vma); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07264f81e..ed748bbfb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -19,6 +19,7 @@ #include <linux/swapctl.h> #include <linux/interrupt.h> #include <linux/init.h> +#include <linux/pagemap.h> #include <asm/dma.h> #include <asm/system.h> /* for cli()/sti() */ @@ -101,6 +102,46 @@ static inline void remove_mem_queue(struct page * entry) static spinlock_t page_alloc_lock; #endif +/* + * This routine is used by the kernel swap deamon to determine + * whether we have "enough" free pages. It is fairly arbitrary, + * but this had better return false if any reasonable "get_free_page()" + * allocation could currently fail.. 
+ * + * Currently we approve of the following situations: + * - the highest memory order has two entries + * - the highest memory order has one free entry and: + * - the next-highest memory order has two free entries + * - the highest memory order has one free entry and: + * - the next-highest memory order has one free entry + * - the next-next-highest memory order has two free entries + * + * [previously, there had to be two entries of the highest memory + * order, but this lead to problems on large-memory machines.] + */ +int free_memory_available(void) +{ + int i, retval = 0; + unsigned long flags; + struct free_area_struct * list = NULL; + + spin_lock_irqsave(&page_alloc_lock, flags); + /* We fall through the loop if the list contains one + * item. -- thanks to Colin Plumb <colin@nyx.net> + */ + for (i = 1; i < 4; ++i) { + list = free_area + NR_MEM_LISTS - i; + if (list->next == memory_head(list)) + break; + if (list->next->next == memory_head(list)) + continue; + retval = 1; + break; + } + spin_unlock_irqrestore(&page_alloc_lock, flags); + return retval; +} + static inline void free_pages_ok(unsigned long map_nr, unsigned long order) { struct free_area_struct *area = free_area + order; @@ -133,9 +174,12 @@ static inline void free_pages_ok(unsigned long map_nr, unsigned long order) void __free_page(struct page *page) { if (!PageReserved(page) && atomic_dec_and_test(&page->count)) { - delete_from_swap_cache(page); + if (PageSwapCache(page)) + panic ("Freeing swap cache page"); free_pages_ok(page->map_nr, 0); } + if (PageSwapCache(page) && atomic_read(&page->count) == 1) + panic ("Releasing swap cache page"); } void free_pages(unsigned long addr, unsigned long order) @@ -147,10 +191,14 @@ void free_pages(unsigned long addr, unsigned long order) if (PageReserved(map)) return; if (atomic_dec_and_test(&map->count)) { - delete_from_swap_cache(map); + if (PageSwapCache(map)) + panic ("Freeing swap cache pages"); free_pages_ok(map_nr, order); return; } + if (PageSwapCache(map) && atomic_read(&map->count) == 1) + panic ("Releasing swap cache pages at %p", + __builtin_return_address(0)); } } @@ -161,11 +209,13 @@ void free_pages(unsigned long addr, unsigned long order) change_bit((index) >> (1+(order)), (area)->map) #define CAN_DMA(x) (PageDMA(x)) #define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT)) -#define RMQUEUE(order, dma) \ +#define RMQUEUE(order, maxorder, dma) \ do { struct free_area_struct * area = free_area+order; \ unsigned long new_order = order; \ - do { struct page *prev = memory_head(area), *ret; \ - while (memory_head(area) != (ret = prev->next)) { \ + do { struct page *prev = memory_head(area), *ret = prev->next; \ + while (memory_head(area) != ret) { \ + if (new_order >= maxorder && ret->next == prev) \ + break; \ if (!dma || CAN_DMA(ret)) { \ unsigned long map_nr = ret->map_nr; \ (prev->next = ret->next)->prev = prev; \ @@ -176,6 +226,7 @@ do { struct free_area_struct * area = free_area+order; \ return ADDRESS(map_nr); \ } \ prev = ret; \ + ret = ret->next; \ } \ new_order++; area++; \ } while (new_order < NR_MEM_LISTS); \ @@ -194,36 +245,40 @@ do { unsigned long size = 1 << high; \ map->age = PAGE_INITIAL_AGE; \ } while (0) -unsigned long __get_free_pages(int priority, unsigned long order, int dma) +unsigned long __get_free_pages(int gfp_mask, unsigned long order) { - unsigned long flags; - int reserved_pages; + unsigned long flags, maxorder; if (order >= NR_MEM_LISTS) - return 0; + goto nopage; - if (in_interrupt() && priority != GFP_ATOMIC) { + /* + * "maxorder" is the 
highest order number that we're allowed + * to empty in order to find a free page.. + */ + maxorder = order + NR_MEM_LISTS/3; + if (gfp_mask & __GFP_MED) + maxorder += NR_MEM_LISTS/3; + if ((gfp_mask & __GFP_HIGH) || maxorder > NR_MEM_LISTS) + maxorder = NR_MEM_LISTS; + + if (in_interrupt() && (gfp_mask & __GFP_WAIT)) { static int count = 0; if (++count < 5) { printk("gfp called nonatomically from interrupt %p\n", - return_address()); - priority = GFP_ATOMIC; + return_address()); + gfp_mask &= ~__GFP_WAIT; } } - reserved_pages = 5; - if (priority != GFP_NFS) - reserved_pages = min_free_pages; repeat: spin_lock_irqsave(&page_alloc_lock, flags); - if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) { - RMQUEUE(order, dma); - spin_unlock_irqrestore(&page_alloc_lock, flags); - return 0; - } + RMQUEUE(order, maxorder, (gfp_mask & GFP_DMA)); spin_unlock_irqrestore(&page_alloc_lock, flags); - if (priority != GFP_BUFFER && try_to_free_page(priority, dma, 1)) + if ((gfp_mask & __GFP_WAIT) && try_to_free_page(gfp_mask)) goto repeat; + +nopage: return 0; } @@ -315,31 +370,38 @@ __initfunc(unsigned long free_area_init(unsigned long start_mem, unsigned long e void swap_in(struct task_struct * tsk, struct vm_area_struct * vma, pte_t * page_table, unsigned long entry, int write_access) { - unsigned long page = __get_free_page(GFP_KERNEL); + unsigned long page; + struct page *page_map; + + page_map = read_swap_cache(entry); if (pte_val(*page_table) != entry) { - free_page(page); + if (page_map) + free_page_and_swap_cache(page_address(page_map)); return; } - if (!page) { + if (!page_map) { set_pte(page_table, BAD_PAGE); swap_free(entry); oom(tsk); return; } - read_swap_page(entry, (char *) page); - if (pte_val(*page_table) != entry) { - free_page(page); - return; - } + + page = page_address(page_map); vma->vm_mm->rss++; - tsk->maj_flt++; - if (!write_access && add_to_swap_cache(&mem_map[MAP_NR(page)], entry)) { - /* keep swap page allocated for the moment (swap cache) */ + tsk->min_flt++; + swap_free(entry); + + if (!write_access || is_page_shared(page_map)) { set_pte(page_table, mk_pte(page, vma->vm_page_prot)); return; } + + /* The page is unshared, and we want write access. In this + case, it is safe to tear down the swap cache and give the + page over entirely to this process. */ + + delete_from_swap_cache(page_map); set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)))); - swap_free(entry); return; } diff --git a/mm/page_io.c b/mm/page_io.c index 5ebea3f09..e02565def 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -6,6 +6,7 @@ * Swap reorganised 29.12.95, * Asynchronous swapping added 30.12.95. Stephen Tweedie * Removed race in async swapping. 14.4.1996. Bruno Haible + * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie */ #include <linux/mm.h> @@ -27,26 +28,38 @@ #include <asm/bitops.h> #include <asm/pgtable.h> -static struct wait_queue * lock_queue = NULL; - /* * Reads or writes a swap page. * wait=1: start I/O and wait for completion. wait=0: start asynchronous I/O. + * All IO to swap files (as opposed to swap partitions) is done + * synchronously. * - * Important prevention of race condition: The first thing we do is set a lock - * on this swap page, which lasts until I/O completes. This way a - * write_swap_page(entry) immediately followed by a read_swap_page(entry) - * on the same entry will first complete the write_swap_page(). Fortunately, - * not more than one write_swap_page() request can be pending per entry. 
So - * all races the caller must catch are: multiple read_swap_page() requests - * on the same entry. + * Important prevention of race condition: the caller *must* atomically + * create a unique swap cache entry for this swap page before calling + * rw_swap_page, and must lock that page. By ensuring that there is a + * single page of memory reserved for the swap entry, the normal VM page + * lock on that page also doubles as a lock on swap entries. Having only + * one lock to deal with per swap entry (rather than locking swap and memory + * independently) also makes it easier to make certain swapping operations + * atomic, which is particularly important when we are trying to ensure + * that shared pages stay shared while being swapped. */ + void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) { unsigned long type, offset; struct swap_info_struct * p; - struct page *page; - + struct page *page = mem_map + MAP_NR(buf); + +#ifdef DEBUG_SWAP + printk ("DebugVM: %s_swap_page entry %08lx, page %p (count %d), %s\n", + (rw == READ) ? "read" : "write", + entry, buf, atomic_read(&page->count), + wait ? "wait" : "nowait"); +#endif + + if (page->inode && page->inode != &swapper_inode) + panic ("Tried to swap a non-swapper page"); type = SWP_TYPE(entry); if (type >= nr_swapfiles) { printk("Internal error: bad swap-device\n"); @@ -59,33 +72,49 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) return; } if (p->swap_map && !p->swap_map[offset]) { - printk("Hmm.. Trying to use unallocated swap (%08lx)\n", entry); + printk("Hmm.. Trying to %s unallocated swap (%08lx)\n", + (rw == READ) ? "read" : "write", + entry); return; } if (!(p->flags & SWP_USED)) { printk("Trying to swap to unused swap-device\n"); return; } - /* Make sure we are the only process doing I/O with this swap page. */ - while (test_and_set_bit(offset,p->swap_lockmap)) { - run_task_queue(&tq_disk); - sleep_on(&lock_queue); + + if (!PageLocked(page)) { + printk("VM: swap page is unlocked\n"); + return; } - if (rw == READ) + + if (rw == READ) { + clear_bit(PG_uptodate, &page->flags); kstat.pswpin++; - else + } else kstat.pswpout++; - page = mem_map + MAP_NR(buf); + atomic_inc(&page->count); - wait_on_page(page); + /* + * Make sure that we have a swap cache association for this + * page. We need this to find which swap page to unlock once + * the swap IO has completed to the physical page. If the page + * is not already in the cache, just overload the offset entry + * as if it were: we are not allowed to manipulate the inode + * hashing for locked pages. + */ + if (!PageSwapCache(page)) { + printk("VM: swap page is not in swap cache\n"); + return; + } + if (page->offset != entry) { + printk ("swap entry mismatch"); + return; + } + if (p->swap_device) { if (!wait) { set_bit(PG_free_after, &page->flags); set_bit(PG_decr_after, &page->flags); - set_bit(PG_swap_unlock_after, &page->flags); - /* swap-cache shouldn't be set, but play safe */ - PageClearSwapCache(page); - page->pg_swap_entry = entry; atomic_inc(&nr_async_pages); } ll_rw_page(rw,p->swap_device,offset,buf); @@ -132,39 +161,55 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize) if (!(zones[i] = bmap(swapf,block++))) { printk("rw_swap_page: bad swap file\n"); + return; } } ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf); + /* Unlike ll_rw_page, ll_rw_swap_file won't unlock the + page for us. 
*/ + clear_bit(PG_locked, &page->flags); + wake_up(&page->wait); } else printk("rw_swap_page: no swap file or device\n"); + atomic_dec(&page->count); - if (offset && !test_and_clear_bit(offset,p->swap_lockmap)) - printk("rw_swap_page: lock already cleared\n"); - wake_up(&lock_queue); +#ifdef DEBUG_SWAP + printk ("DebugVM: %s_swap_page finished on page %p (count %d)\n", + (rw == READ) ? "read" : "write", + buf, atomic_read(&page->count)); +#endif } -/* This is run when asynchronous page I/O has completed. */ -void swap_after_unlock_page (unsigned long entry) +/* + * Setting up a new swap file needs a simple wrapper just to read the + * swap signature. SysV shared memory also needs a simple wrapper. + */ +void rw_swap_page_nocache(int rw, unsigned long entry, char *buffer) { - unsigned long type, offset; - struct swap_info_struct * p; - - type = SWP_TYPE(entry); - if (type >= nr_swapfiles) { - printk("swap_after_unlock_page: bad swap-device\n"); + struct page *page; + + page = mem_map + MAP_NR((unsigned long) buffer); + wait_on_page(page); + set_bit(PG_locked, &page->flags); + if (test_and_set_bit(PG_swap_cache, &page->flags)) { + printk ("VM: read_swap_page: page already in swap cache!\n"); return; } - p = &swap_info[type]; - offset = SWP_OFFSET(entry); - if (offset >= p->max) { - printk("swap_after_unlock_page: weirdness\n"); + if (page->inode) { + printk ("VM: read_swap_page: page already in page cache!\n"); return; } - if (!test_and_clear_bit(offset,p->swap_lockmap)) - printk("swap_after_unlock_page: lock already cleared\n"); - wake_up(&lock_queue); + page->inode = &swapper_inode; + page->offset = entry; + atomic_inc(&page->count); /* Protect from shrink_mmap() */ + rw_swap_page(rw, entry, buffer, 1); + atomic_dec(&page->count); + page->inode = 0; + clear_bit(PG_swap_cache, &page->flags); } + + /* * Swap partitions are now read via brw_page. ll_rw_page is an * asynchronous function now --- we must call wait_on_page afterwards @@ -189,7 +234,7 @@ void ll_rw_page(int rw, kdev_t dev, unsigned long offset, char * buffer) panic("ll_rw_page: bad block dev cmd, must be R/W"); } page = mem_map + MAP_NR(buffer); - if (test_and_set_bit(PG_locked, &page->flags)) - panic ("ll_rw_page: page already locked"); + if (!PageLocked(page)) + panic ("ll_rw_page: page not already locked"); brw_page(rw, page, dev, &block, PAGE_SIZE, 0); } @@ -115,7 +115,7 @@ struct simp * simp_create(char * name, long size, if(!global) { #ifdef __SMP__ - global = (struct global_data*)__get_free_pages(GFP_KERNEL, ORDER, 0); + global = (struct global_data*)__get_free_pages(GFP_KERNEL, ORDER); memset(global, 0, CHUNK_SIZE); #else global = (struct global_data*)get_free_page(GFP_KERNEL); @@ -167,7 +167,7 @@ static void alloc_header(struct simp * simp) spin_unlock(&simp->lock); for(;;) { - hdr = (struct header*)__get_free_pages(GFP_KERNEL, ORDER, 0); + hdr = (struct header*)__get_free_pages(GFP_KERNEL, ORDER); if(hdr) break; if(!simp_garbage()) @@ -506,8 +506,7 @@ kmem_getpages(kmem_cache_t *cachep, unsigned long flags, unsigned int *dma) void *addr; *dma = flags & SLAB_DMA; - addr = (void*) __get_free_pages(flags & SLAB_LEVEL_MASK, - cachep->c_gfporder, *dma); + addr = (void*) __get_free_pages(flags, cachep->c_gfporder); /* Assume that now we have the pages no one else can legally * messes with the 'struct page's. * However vm_scan() might try to test the structure to see if @@ -1732,19 +1731,18 @@ kmem_find_general_cachep(size_t size) * This function _cannot_ be called within a int, but it * can be interrupted. 
*/ -int -kmem_cache_reap(int pri, int dma, int wait) +void +kmem_cache_reap(int gfp_mask) { kmem_slab_t *slabp; kmem_cache_t *searchp; kmem_cache_t *best_cachep; unsigned int scan; unsigned int reap_level; - static unsigned long call_count = 0; if (in_interrupt()) { printk("kmem_cache_reap() called within int!\n"); - return 0; + return; } /* We really need a test semphore op so we can avoid sleeping when @@ -1752,28 +1750,8 @@ kmem_cache_reap(int pri, int dma, int wait) */ down(&cache_chain_sem); - scan = 10-pri; - if (pri == 6 && !dma) { - if (++call_count == 199) { - /* Hack Alert! - * Occassionally we try hard to reap a slab. - */ - call_count = 0UL; - reap_level = 0; - scan += 2; - } else - reap_level = 3; - } else { - if (pri >= 5) { - /* We also come here for dma==1 at pri==6, just - * to try that bit harder (assumes that there are - * less DMAable pages in a system - not always true, - * but this doesn't hurt). - */ - reap_level = 2; - } else - reap_level = 0; - } + scan = 10; + reap_level = 0; best_cachep = NULL; searchp = clock_searchp; @@ -1812,7 +1790,7 @@ kmem_cache_reap(int pri, int dma, int wait) } spin_unlock_irq(&searchp->c_spinlock); - if (dma && !dma_flag) + if ((gfp_mask & GFP_DMA) && !dma_flag) goto next; if (full_free) { @@ -1825,10 +1803,6 @@ kmem_cache_reap(int pri, int dma, int wait) * more than one page per slab (as it can be difficult * to get high orders from gfp()). */ - if (pri == 6) { /* magic '6' from try_to_free_page() */ - if (searchp->c_gfporder || searchp->c_ctor) - full_free--; - } if (full_free >= reap_level) { reap_level = full_free; best_cachep = searchp; @@ -1846,12 +1820,12 @@ next: if (!best_cachep) { /* couldn't find anthying to reap */ - return 0; + return; } spin_lock_irq(&best_cachep->c_spinlock); if (!best_cachep->c_growing && !(slabp = best_cachep->c_lastp)->s_inuse && slabp != kmem_slab_end(best_cachep)) { - if (dma) { + if (gfp_mask & GFP_DMA) { do { if (slabp->s_dma) goto good_dma; @@ -1874,11 +1848,11 @@ good_dma: */ spin_unlock_irq(&best_cachep->c_spinlock); kmem_slab_destroy(best_cachep, slabp); - return 1; + return; } dma_fail: spin_unlock_irq(&best_cachep->c_spinlock); - return 0; + return; } #if SLAB_SELFTEST diff --git a/mm/swap_state.c b/mm/swap_state.c index 75f284124..4ebc5c05f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -3,6 +3,8 @@ * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie + * + * Rewritten to use page cache, (C) 1998 Stephen Tweedie */ #include <linux/mm.h> @@ -17,6 +19,7 @@ #include <linux/fs.h> #include <linux/swapctl.h> #include <linux/init.h> +#include <linux/pagemap.h> #include <asm/bitops.h> #include <asm/pgtable.h> @@ -29,6 +32,18 @@ unsigned long swap_cache_del_success = 0; unsigned long swap_cache_find_total = 0; unsigned long swap_cache_find_success = 0; +/* + * Keep a reserved false inode which we will use to mark pages in the + * page cache are acting as swap cache instead of file cache. + * + * We only need a unique pointer to satisfy the page cache, but we'll + * reserve an entire zeroed inode structure for the purpose just to + * ensure that any mistaken dereferences of this structure cause a + * kernel oops. 
+ */ +struct inode swapper_inode; + + void show_swap_cache_info(void) { printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n", @@ -40,21 +55,33 @@ void show_swap_cache_info(void) int add_to_swap_cache(struct page *page, unsigned long entry) { - struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)]; - #ifdef SWAP_CACHE_INFO swap_cache_add_total++; #endif - if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { - page->pg_swap_entry = entry; - if (PageTestandSetSwapCache(page)) - printk("swap_cache: replacing non-empty entry\n"); -#ifdef SWAP_CACHE_INFO - swap_cache_add_success++; +#ifdef DEBUG_SWAP + printk("DebugVM: add_to_swap_cache(%08lx count %d, entry %08lx)\n", + page_address(page), atomic_read(&page->count), entry); #endif - return 1; + if (PageTestandSetSwapCache(page)) { + printk("swap_cache: replacing non-empty entry %08lx " + "on page %08lx", + page->offset, page_address(page)); + return 0; } - return 0; + if (page->inode) { + printk("swap_cache: replacing page-cached entry " + "on page %08lx", page_address(page)); + return 0; + } + atomic_inc(&page->count); + page->inode = &swapper_inode; + page->offset = entry; + add_page_to_hash_queue(page, &swapper_inode, entry); + add_page_to_inode_queue(&swapper_inode, page); +#ifdef SWAP_CACHE_INFO + swap_cache_add_success++; +#endif + return 1; } /* @@ -87,6 +114,10 @@ void swap_duplicate(unsigned long entry) entry, p->swap_map[offset]); p->swap_map[offset] = 127; } +#ifdef DEBUG_SWAP + printk("DebugVM: swap_duplicate(entry %08lx, count now %d)\n", + entry, p->swap_map[offset]); +#endif out: return; @@ -97,7 +128,173 @@ bad_offset: printk("swap_duplicate: offset exceeds max\n"); goto out; bad_unused: - printk("swap_duplicate: unused page\n"); + printk("swap_duplicate at %8p: unused page\n", + __builtin_return_address(0)); goto out; } + +void remove_from_swap_cache(struct page *page) +{ + if (!page->inode) { + printk ("VM: Removing swap cache page with zero inode hash " + "on page %08lx", page_address(page)); + return; + } + if (page->inode != &swapper_inode) { + printk ("VM: Removing swap cache page with wrong inode hash " + "on page %08lx", page_address(page)); + } + /* + * This will be a legal case once we have a more mature swap cache. + */ + if (atomic_read(&page->count) == 1) { + printk ("VM: Removing page cache on unshared page %08lx", + page_address(page)); + return; + } + + +#ifdef DEBUG_SWAP + printk("DebugVM: remove_from_swap_cache(%08lx count %d)\n", + page_address(page), atomic_read(&page->count)); +#endif + remove_page_from_hash_queue (page); + remove_page_from_inode_queue (page); + PageClearSwapCache (page); + __free_page (page); +} + + +long find_in_swap_cache(struct page *page) +{ +#ifdef SWAP_CACHE_INFO + swap_cache_find_total++; +#endif + if (PageSwapCache (page)) { + long entry = page->offset; +#ifdef SWAP_CACHE_INFO + swap_cache_find_success++; +#endif + remove_from_swap_cache (page); + return entry; + } + return 0; +} + +int delete_from_swap_cache(struct page *page) +{ +#ifdef SWAP_CACHE_INFO + swap_cache_del_total++; +#endif + if (PageSwapCache (page)) { + long entry = page->offset; +#ifdef SWAP_CACHE_INFO + swap_cache_del_success++; +#endif +#ifdef DEBUG_SWAP + printk("DebugVM: delete_from_swap_cache(%08lx count %d, " + "entry %08lx)\n", + page_address(page), atomic_read(&page->count), entry); +#endif + remove_from_swap_cache (page); + swap_free (entry); + return 1; + } + return 0; +} + +/* + * Perform a free_page(), also freeing any swap cache associated with + * this page if it is the last user of the page. 
+ */ + +void free_page_and_swap_cache(unsigned long addr) +{ + struct page *page = mem_map + MAP_NR(addr); + /* + * If we are the only user, then free up the swap cache. + */ + if (PageSwapCache(page) && !is_page_shared(page)) { + delete_from_swap_cache(page); + } + + free_page(addr); +} + + +/* + * Lookup a swap entry in the swap cache. We need to be careful about + * locked pages. A found page will be returned with its refcount + * incremented. + */ + +static struct page * lookup_swap_cache(unsigned long entry) +{ + struct page *found; + + while (1) { + found = find_page(&swapper_inode, entry); + if (!found) + return 0; + if (found->inode != &swapper_inode + || !PageSwapCache(found)) { + __free_page(found); + printk ("VM: Found a non-swapper swap page!\n"); + return 0; + } + if (!PageLocked(found)) + return found; + __free_page(found); + __wait_on_page(found); + } +} + +/* + * Locate a page of swap in physical memory, reserving swap cache space + * and reading the disk if it is not already cached. If wait==0, we are + * only doing readahead, so don't worry if the page is already locked. + */ + +struct page * read_swap_cache_async(unsigned long entry, int wait) +{ + struct page *found_page, *new_page = 0; + unsigned long new_page_addr = 0; + +#ifdef DEBUG_SWAP + printk("DebugVM: read_swap_cache_async entry %08lx%s\n", + entry, wait ? ", wait" : ""); +#endif +repeat: + found_page = lookup_swap_cache(entry); + if (found_page) { + if (new_page) + __free_page(new_page); + return found_page; + } + + /* The entry is not present. Lock down a new page, add it to + * the swap cache and read its contents. */ + if (!new_page) { + new_page_addr = __get_free_page(GFP_KERNEL); + if (!new_page_addr) + return 0; /* Out of memory */ + new_page = mem_map + MAP_NR(new_page_addr); + goto repeat; /* We might have stalled */ + } + + if (!add_to_swap_cache(new_page, entry)) { + free_page(new_page_addr); + return 0; + } + swap_duplicate(entry); /* Account for the swap cache */ + set_bit(PG_locked, &new_page->flags); + rw_swap_page(READ, entry, (char *) new_page_addr, wait); +#ifdef DEBUG_SWAP + printk("DebugVM: read_swap_cache_async created " + "entry %08lx at %p\n", + entry, (char *) page_address(new_page)); +#endif + return new_page; +} + diff --git a/mm/swapfile.c b/mm/swapfile.c index 13d2436ba..8608db8d8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -21,6 +21,7 @@ #include <linux/malloc.h> #include <linux/blkdev.h> /* for blk_size */ #include <linux/vmalloc.h> +#include <linux/pagemap.h> #include <asm/bitops.h> #include <asm/pgtable.h> @@ -51,8 +52,6 @@ static inline int scan_swap_map(struct swap_info_struct *si) offset = si->cluster_next++; if (si->swap_map[offset]) continue; - if (test_bit(offset, si->swap_lockmap)) - continue; si->cluster_nr--; goto got_page; } @@ -61,8 +60,6 @@ static inline int scan_swap_map(struct swap_info_struct *si) for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { if (si->swap_map[offset]) continue; - if (test_bit(offset, si->swap_lockmap)) - continue; si->lowest_bit = offset; got_page: si->swap_map[offset] = 1; @@ -129,6 +126,7 @@ void swap_free(unsigned long entry) if (!entry) goto out; + type = SWP_TYPE(entry); if (type & SHM_SWP_TYPE) goto out; @@ -152,6 +150,10 @@ void swap_free(unsigned long entry) if (!--p->swap_map[offset]) nr_swap_pages++; } +#ifdef DEBUG_SWAP + printk("DebugVM: swap_free(entry %08lx, count now %d)\n", + entry, p->swap_map[offset]); +#endif out: return; @@ -172,42 +174,38 @@ bad_free: /* * The swap entry has been read in advance, 
and we return 1 to indicate * that the page has been used or is no longer needed. + * + * Always set the resulting pte to be nowrite (the same as COW pages + * after one process has exited). We don't know just how many ptes will + * share this swap entry, so be cautious and let do_wp_page work out + * what to do if a write is requested later. */ -static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address, +static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, pte_t *dir, unsigned long entry, unsigned long page) { pte_t pte = *dir; if (pte_none(pte)) - return 0; + return; if (pte_present(pte)) { - struct page *pg; - unsigned long page_nr = MAP_NR(pte_page(pte)); - unsigned long pg_swap_entry; - - if (page_nr >= max_mapnr) - return 0; - pg = mem_map + page_nr; - if (!(pg_swap_entry = in_swap_cache(pg))) - return 0; - if (SWP_TYPE(pg_swap_entry) != SWP_TYPE(entry)) - return 0; - delete_from_swap_cache(pg); + /* If this entry is swap-cached, then page must already + hold the right address for any copies in physical + memory */ + if (pte_page(pte) != page) + return; + /* We will be removing the swap cache in a moment, so... */ set_pte(dir, pte_mkdirty(pte)); - if (pg_swap_entry != entry) - return 0; - free_page(page); - return 1; + return; } if (pte_val(pte) != entry) - return 0; - set_pte(dir, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)))); - ++vma->vm_mm->rss; + return; + set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot))); swap_free(entry); - return 1; + atomic_inc(&mem_map[MAP_NR(page)].count); + ++vma->vm_mm->rss; } -static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, +static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long size, unsigned long offset, unsigned long entry, unsigned long page) { @@ -215,11 +213,11 @@ static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long end; if (pmd_none(*dir)) - return 0; + return; if (pmd_bad(*dir)) { printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir)); pmd_clear(dir); - return 0; + return; } pte = pte_offset(dir, address); offset += address & PMD_MASK; @@ -228,16 +226,13 @@ static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, if (end > PMD_SIZE) end = PMD_SIZE; do { - if (unuse_pte(vma, offset+address-vma->vm_start, pte, entry, - page)) - return 1; + unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page); address += PAGE_SIZE; pte++; } while (address < end); - return 0; } -static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, +static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long size, unsigned long entry, unsigned long page) { @@ -245,11 +240,11 @@ static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long offset, end; if (pgd_none(*dir)) - return 0; + return; if (pgd_bad(*dir)) { printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir)); pgd_clear(dir); - return 0; + return; } pmd = pmd_offset(dir, address); offset = address & PGDIR_MASK; @@ -258,30 +253,26 @@ static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - if (unuse_pmd(vma, pmd, address, end - address, offset, entry, - page)) - return 1; + unuse_pmd(vma, pmd, address, end - address, offset, entry, + page); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address < end); - return 0; } -static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, 
+static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, unsigned long entry, unsigned long page) { unsigned long start = vma->vm_start, end = vma->vm_end; while (start < end) { - if (unuse_pgd(vma, pgdir, start, end - start, entry, page)) - return 1; + unuse_pgd(vma, pgdir, start, end - start, entry, page); start = (start + PGDIR_SIZE) & PGDIR_MASK; pgdir++; } - return 0; } -static int unuse_process(struct mm_struct * mm, unsigned long entry, +static void unuse_process(struct mm_struct * mm, unsigned long entry, unsigned long page) { struct vm_area_struct* vma; @@ -290,13 +281,12 @@ static int unuse_process(struct mm_struct * mm, unsigned long entry, * Go through process' page directory. */ if (!mm || mm == &init_mm) - return 0; + return; for (vma = mm->mmap; vma; vma = vma->vm_next) { pgd_t * pgd = pgd_offset(mm, vma->vm_start); - if (unuse_vma(vma, pgd, entry, page)) - return 1; + unuse_vma(vma, pgd, entry, page); } - return 0; + return; } /* @@ -309,19 +299,14 @@ static int try_to_unuse(unsigned int type) struct swap_info_struct * si = &swap_info[type]; struct task_struct *p; unsigned long page = 0; + struct page *page_map; unsigned long entry; int i; while (1) { - if (!page) { - page = __get_free_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - } - /* - * Find a swap page in use and read it in. - */ + * Find a swap page in use and read it in. + */ for (i = 1 , entry = 0; i < si->max ; i++) { if (si->swap_map[i] > 0 && si->swap_map[i] != 0x80) { entry = SWP_ENTRY(type, i); @@ -330,36 +315,31 @@ static int try_to_unuse(unsigned int type) } if (!entry) break; - read_swap_page(entry, (char *) page); + /* Get a page for the entry, using the existing swap + cache page if there is one. Otherwise, get a clean + page and read the swap into it. */ + page_map = read_swap_cache(entry); + if (!page_map) + return -ENOMEM; + page = page_address(page_map); read_lock(&tasklist_lock); - for_each_task(p) { - if (unuse_process(p->mm, entry, page)) { - page = 0; - goto unlock; - } - } - unlock: + for_each_task(p) + unuse_process(p->mm, entry, page); read_unlock(&tasklist_lock); - if (page) { - /* - * If we couldn't find an entry, there are several - * possible reasons: someone else freed it first, - * we freed the last reference to an overflowed entry, - * or the system has lost track of the use counts. - */ - if (si->swap_map[i] != 0) { - if (si->swap_map[i] != 127) - printk("try_to_unuse: entry %08lx " - "not in use\n", entry); - si->swap_map[i] = 0; - nr_swap_pages++; - } + /* Now get rid of the extra reference to the temporary + page we've been using. 
*/ + if (PageSwapCache(page_map)) + delete_from_swap_cache(page_map); + free_page(page); + if (si->swap_map[i] != 0) { + if (si->swap_map[i] != 127) + printk("try_to_unuse: entry %08lx " + "not in use\n", entry); + si->swap_map[i] = 0; + nr_swap_pages++; } } - - if (page) - free_page(page); return 0; } @@ -370,7 +350,7 @@ asmlinkage int sys_swapoff(const char * specialfile) struct file filp; int i, type, prev; int err = -EPERM; - + lock_kernel(); if (!suser()) goto out; @@ -444,8 +424,6 @@ asmlinkage int sys_swapoff(const char * specialfile) p->swap_device = 0; vfree(p->swap_map); p->swap_map = NULL; - free_page((long) p->swap_lockmap); - p->swap_lockmap = NULL; p->flags = 0; err = 0; out: @@ -505,6 +483,7 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) int error = -EPERM; struct file filp; static int least_priority = 0; + unsigned char *avail_map = 0; lock_kernel(); if (!suser()) @@ -522,7 +501,6 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) p->swap_file = NULL; p->swap_device = 0; p->swap_map = NULL; - p->swap_lockmap = NULL; p->lowest_bit = 0; p->highest_bit = 0; p->cluster_nr = 0; @@ -565,24 +543,24 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) } } else if (!S_ISREG(swap_dentry->d_inode->i_mode)) goto bad_swap; - p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER); - if (!p->swap_lockmap) { + avail_map = (unsigned char *) get_free_page(GFP_USER); + if (!avail_map) { printk("Unable to start swapping: out of memory :-)\n"); error = -ENOMEM; goto bad_swap; } - read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap); - if (memcmp("SWAP-SPACE",p->swap_lockmap+PAGE_SIZE-10,10)) { + rw_swap_page_nocache(READ, SWP_ENTRY(type,0), (char *) avail_map); + if (memcmp("SWAP-SPACE",avail_map+PAGE_SIZE-10,10)) { printk("Unable to find swap-space signature\n"); error = -EINVAL; goto bad_swap; } - memset(p->swap_lockmap+PAGE_SIZE-10,0,10); + memset(avail_map+PAGE_SIZE-10,0,10); j = 0; p->lowest_bit = 0; p->highest_bit = 0; for (i = 1 ; i < 8*PAGE_SIZE ; i++) { - if (test_bit(i,p->swap_lockmap)) { + if (test_bit(i,avail_map)) { if (!p->lowest_bit) p->lowest_bit = i; p->highest_bit = i; @@ -601,13 +579,12 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) goto bad_swap; } for (i = 1 ; i < p->max ; i++) { - if (test_bit(i,p->swap_lockmap)) + if (test_bit(i,avail_map)) p->swap_map[i] = 0; else p->swap_map[i] = 0x80; } p->swap_map[0] = 0x80; - clear_page(p->swap_lockmap); p->flags = SWP_WRITEOK; p->pages = j; nr_swap_pages += j; @@ -634,15 +611,15 @@ bad_swap: if(filp.f_op && filp.f_op->release) filp.f_op->release(filp.f_dentry->d_inode,&filp); bad_swap_2: - free_page((long) p->swap_lockmap); vfree(p->swap_map); dput(p->swap_file); p->swap_device = 0; p->swap_file = NULL; p->swap_map = NULL; - p->swap_lockmap = NULL; p->flags = 0; out: + if (avail_map) + free_page((long) avail_map); unlock_kernel(); return error; } diff --git a/mm/vmscan.c b/mm/vmscan.c index a50684973..ebef7a362 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7,7 +7,7 @@ * kswapd added: 7.1.96 sct * Removed kswapd_ctl limits, and swap out as many pages as needed * to bring the system back to free_pages_high: 2.4.97, Rik van Riel. - * Version: $Id: vmscan.c,v 1.23 1997/04/12 04:31:05 davem Exp $ + * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $ */ #include <linux/mm.h> @@ -61,7 +61,7 @@ static void init_swap_timer(void); * have died while we slept). 
 */
 static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
-	unsigned long address, pte_t * page_table, int dma, int wait)
+	unsigned long address, pte_t * page_table, int gfp_mask)
 {
 	pte_t pte;
 	unsigned long entry;
@@ -78,20 +78,62 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
 	page_map = mem_map + MAP_NR(page);
 
 	if (PageReserved(page_map)
 	    || PageLocked(page_map)
-	    || (dma && !PageDMA(page_map)))
+	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)))
 		return 0;
-	/* Deal with page aging.  Pages age from being unused; they
-	 * rejuvenate on being accessed.  Only swap old pages (age==0
-	 * is oldest). */
-	if ((pte_dirty(pte) && delete_from_swap_cache(page_map))
-	    || pte_young(pte)) {
+
+	/*
+	 * Deal with page aging.  There are several special cases to
+	 * consider:
+	 *
+	 * Page has been accessed, but is swap cached.  If the page is
+	 * getting sufficiently "interesting" --- its age is getting
+	 * high --- then if we are sufficiently short of free swap
+	 * pages, then delete the swap cache.  We can only do this if
+	 * the swap page's reference count is one: ie. there are no
+	 * other references to it beyond the swap cache (as there must
+	 * still be pte's pointing to it if count > 1).
+	 *
+	 * If the page has NOT been touched, and its age reaches zero,
+	 * then we are swapping it out:
+	 *
+	 * If there is already a swap cache page for this page, then
+	 * another process has already allocated swap space, so just
+	 * dereference the physical page and copy in the swap entry
+	 * from the swap cache.
+	 *
+	 * Note, we rely on all pages read in from swap either having
+	 * the swap cache flag set, OR being marked writable in the pte,
+	 * but NEVER BOTH.  (It IS legal to be neither cached nor dirty,
+	 * however.)
+	 *
+	 * -- Stephen Tweedie 1998 */
+
+	if (PageSwapCache(page_map)) {
+		if (pte_write(pte)) {
+			printk ("VM: Found a writable swap-cached page!\n");
+			return 0;
+		}
+	}
+
+	if (pte_young(pte)) {
 		set_pte(page_table, pte_mkold(pte));
 		touch_page(page_map);
+		/*
+		 * We should test here to see if we want to recover any
+		 * swap cache page.  We do this if the page is seeing
+		 * enough activity, AND we are sufficiently low on swap.
+		 *
+		 * We need to track both the number of available swap
+		 * pages and the total number present before we can do
+		 * this...
+		 */
 		return 0;
 	}
+
 	age_page(page_map);
 	if (page_map->age)
 		return 0;
+
 	if (pte_dirty(pte)) {
 		if (vma->vm_ops && vma->vm_ops->swapout) {
 			pid_t pid = tsk->pid;
@@ -99,33 +141,83 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
 			if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
 				kill_proc(pid, SIGBUS, 1);
 		} else {
-			if (atomic_read(&page_map->count) != 1)
-				return 0;
-			if (!(entry = get_swap_page()))
-				return 0;
+			/*
+			 * This is a dirty, swappable page.  First of all,
+			 * get a suitable swap entry for it, and make sure
+			 * we have the swap cache set up to associate the
+			 * page with that swap entry.
+			 */
+			if (PageSwapCache(page_map)) {
+				entry = page_map->offset;
+			} else {
+				entry = get_swap_page();
+				if (!entry)
+					return 0; /* No swap space left */
+			}
+
+			vma->vm_mm->rss--;
+			tsk->nswap++;
 			flush_cache_page(vma, address);
 			set_pte(page_table, __pte(entry));
 			flush_tlb_page(vma, address);
-			tsk->nswap++;
-			rw_swap_page(WRITE, entry, (char *) page, wait);
+			swap_duplicate(entry);
+
+			/* Now to write back the page.  We have two
+			 * cases: if the page is already part of the
+			 * swap cache, then it is already on disk.  Just
+			 * free the page and return (we release the swap
+			 * cache on the last accessor too).
+			 *
+			 * If we have made a new swap entry, then we
+			 * start the write out to disk.  If the page is
+			 * shared, however, we still need to keep the
+			 * copy in memory, so we add it to the swap
+			 * cache. */
+			if (PageSwapCache(page_map)) {
+				free_page_and_swap_cache(page);
+				return (atomic_read(&page_map->count) == 0);
+			}
+			add_to_swap_cache(page_map, entry);
+			/* We checked we were unlocked way up above, and we
+			   have been careful not to stall until here */
+			set_bit(PG_locked, &page_map->flags);
+
+			/* OK, do a physical write to swap.  */
+			rw_swap_page(WRITE, entry, (char *) page, (gfp_mask & __GFP_WAIT));
 		}
-		free_page(page);
+
+		/* Now we can free the current physical page.  We also
+		 * free up the swap cache if this is the last use of the
+		 * page.  Note that there is a race here: the page may
+		 * still be shared COW by another process, but that
+		 * process may exit while we are writing out the page
+		 * asynchronously.  That's no problem, shrink_mmap() can
+		 * correctly clean up the occasional unshared page
+		 * which gets left behind in the swap cache. */
+		free_page_and_swap_cache(page);
 		return 1;	/* we slept: the process may not exist any more */
 	}
 
-	if ((entry = find_in_swap_cache(page_map)))  {
-		if (atomic_read(&page_map->count) != 1) {
-			set_pte(page_table, pte_mkdirty(pte));
-			printk("Aiee.. duplicated cached swap-cache entry\n");
-			return 0;
-		}
+
+	/* The page was _not_ dirty, but still has a zero age.  It must
+	 * already be uptodate on disk.  If it is in the swap cache,
	 * then we can just unlink the page now.  Remove the swap cache
+	 * too if this is the last user.  */
+	if ((entry = in_swap_cache(page_map))) {
 		vma->vm_mm->rss--;
 		flush_cache_page(vma, address);
 		set_pte(page_table, __pte(entry));
 		flush_tlb_page(vma, address);
-		free_page(page);
-		return 1;
+		swap_duplicate(entry);
+		free_page_and_swap_cache(page);
+		return (atomic_read(&page_map->count) == 0);
 	}
+
+	/*
+	 * A clean page to be discarded?  Must be mmap()ed from
+	 * somewhere.  Unlink the pte, and tell the filemap code to
+	 * discard any cached backing page if this is the last user.
+	 */
+	if (PageSwapCache(page_map)) {
+		printk ("VM: How can this page _still_ be cached?\n");
+		return 0;
+	}
 	vma->vm_mm->rss--;
 	flush_cache_page(vma, address);
 	pte_clear(page_table);
@@ -150,7 +242,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
  */
 static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pmd_t *dir, unsigned long address, unsigned long end, int dma, int wait)
+	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 {
 	pte_t * pte;
 	unsigned long pmd_end;
@@ -172,7 +264,7 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct *
 	do {
 		int result;
 		tsk->swap_address = address + PAGE_SIZE;
-		result = try_to_swap_out(tsk, vma, address, pte, dma, wait);
+		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
 		if (result)
 			return result;
 		address += PAGE_SIZE;
@@ -182,7 +274,7 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct *
 }
 
 static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pgd_t *dir, unsigned long address, unsigned long end, int dma, int wait)
+	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 {
 	pmd_t * pmd;
 	unsigned long pgd_end;
@@ -202,7 +294,7 @@ static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct *
 		end = pgd_end;
 
 	do {
-		int result = swap_out_pmd(tsk, vma, pmd, address, end, dma, wait);
+		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
 		if (result)
 			return result;
 		address = (address + PMD_SIZE) & PMD_MASK;
@@ -212,7 +304,7 @@ static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct *
 }
 
 static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
-	pgd_t *pgdir, unsigned long start, int dma, int wait)
+	pgd_t *pgdir, unsigned long start, int gfp_mask)
 {
 	unsigned long end;
 
@@ -223,7 +315,7 @@ static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
 	end = vma->vm_end;
 	while (start < end) {
-		int result = swap_out_pgd(tsk, vma, pgdir, start, end, dma, wait);
+		int result = swap_out_pgd(tsk, vma, pgdir, start, end, gfp_mask);
 		if (result)
 			return result;
 		start = (start + PGDIR_SIZE) & PGDIR_MASK;
@@ -232,7 +324,7 @@ static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
 	return 0;
 }
 
-static int swap_out_process(struct task_struct * p, int dma, int wait)
+static int swap_out_process(struct task_struct * p, int gfp_mask)
 {
 	unsigned long address;
 	struct vm_area_struct* vma;
@@ -241,19 +333,20 @@ static int swap_out_process(struct task_struct * p, int dma, int wait)
 	 * Go through process' page directory.
 	 */
 	address = p->swap_address;
-	p->swap_address = 0;
 
 	/*
 	 * Find the proper vm-area
 	 */
 	vma = find_vma(p->mm, address);
-	if (!vma)
+	if (!vma) {
+		p->swap_address = 0;
 		return 0;
+	}
 	if (address < vma->vm_start)
 		address = vma->vm_start;
 
 	for (;;) {
-		int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, dma, wait);
+		int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, gfp_mask);
 		if (result)
 			return result;
 		vma = vma->vm_next;
@@ -270,7 +363,7 @@ static int swap_out_process(struct task_struct * p, int dma, int wait)
 * N.B. This function returns only 0 or 1.  Return values != 1 from
 * the lower level routines result in continued processing.
 */
-static int swap_out(unsigned int priority, int dma, int wait)
+static int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p, * pbest;
 	int counter, assign, max_cnt;
@@ -321,7 +414,7 @@ static int swap_out(unsigned int priority, int dma, int wait)
 		}
 		pbest->swap_cnt--;
 
-		switch (swap_out_process(pbest, dma, wait)) {
+		switch (swap_out_process(pbest, gfp_mask)) {
 		case 0:
			/*
			 * Clear swap_cnt so we don't look at this task
@@ -345,7 +438,7 @@ out:
 * to be.  This works out OK, because we now do proper aging on page
 * contents.
 */
-static inline int do_try_to_free_page(int priority, int dma, int wait)
+static inline int do_try_to_free_page(int gfp_mask)
 {
 	static int state = 0;
 	int i=6;
@@ -353,25 +446,27 @@ static inline int do_try_to_free_page(int priority, int dma, int wait)
 
 	/* Let the dcache know we're looking for memory ... */
 	shrink_dcache_memory();
+
 	/* Always trim SLAB caches when memory gets low. */
-	(void) kmem_cache_reap(0, dma, wait);
+	kmem_cache_reap(gfp_mask);
 
-	/* we don't try as hard if we're not waiting.. */
+	/* We try harder if we are waiting. */
 	stop = 3;
-	if (wait)
+	if (gfp_mask & __GFP_WAIT)
 		stop = 0;
+
 	switch (state) {
 		do {
 		case 0:
-			if (shrink_mmap(i, dma))
+			if (shrink_mmap(i, gfp_mask))
 				return 1;
 			state = 1;
 		case 1:
-			if (shm_swap(i, dma))
+			if ((gfp_mask & __GFP_IO) && shm_swap(i, gfp_mask))
 				return 1;
 			state = 2;
 		default:
-			if (swap_out(i, dma, wait))
+			if (swap_out(i, gfp_mask))
 				return 1;
 			state = 0;
 		i--;
@@ -387,12 +482,12 @@ static inline int do_try_to_free_page(int priority, int dma, int wait)
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 */
-int try_to_free_page(int priority, int dma, int wait)
+int try_to_free_page(int gfp_mask)
 {
 	int retval;
 
 	lock_kernel();
-	retval = do_try_to_free_page(priority,dma,wait);
+	retval = do_try_to_free_page(gfp_mask);
 	unlock_kernel();
 	return retval;
 }
@@ -406,7 +501,7 @@ int try_to_free_page(int priority, int dma, int wait)
 void kswapd_setup(void)
 {
        int i;
-       char *revision="$Revision: 1.23 $", *s, *e;
+       char *revision="$Revision: 1.5 $", *s, *e;
 
        if ((s = strchr(revision, ':')) &&
            (e = strchr(s, '$')))
@@ -423,6 +518,7 @@ void kswapd_setup(void)
 */
 int kswapd(void *unused)
 {
+	struct wait_queue wait = { current, NULL };
 	current->session = 1;
 	current->pgrp = 1;
 	sprintf(current->comm, "kswapd");
@@ -442,42 +538,63 @@ int kswapd(void *unused)
 	   priorities.  */
 
 	init_swap_timer();
-
+	add_wait_queue(&kswapd_wait, &wait);
 	while (1) {
-		int fail;
+		int tries;
 
 		kswapd_awake = 0;
 		flush_signals(current);
 		run_task_queue(&tq_disk);
-		interruptible_sleep_on(&kswapd_wait);
+		schedule();
+		current->state = TASK_INTERRUPTIBLE;
 		kswapd_awake = 1;
 		swapstats.wakeups++;
 		/* Do the background pageout:
-		 * We now only swap out as many pages as needed.
-		 * When we are truly low on memory, we swap out
-		 * synchronously (WAIT == 1).  -- Rik.
-		 * If we've had too many consecutive failures,
-		 * go back to sleep to let other tasks run.
+		 * When we've got loads of memory, we try
+		 * (free_pages_high - nr_free_pages) times to
+		 * free memory.  As memory gets tighter, kswapd
+		 * gets more and more aggressive. -- Rik.
		 */
-		for (fail = 0; fail++ < MAX_SWAP_FAIL;) {
-			int pages, wait;
+		tries = free_pages_high - nr_free_pages;
+		if (tries < min_free_pages) {
+			tries = min_free_pages;
+		}
+		else if (nr_free_pages < (free_pages_high + free_pages_low) / 2) {
+			tries <<= 1;
+			if (nr_free_pages < free_pages_low) {
+				tries <<= 1;
+				if (nr_free_pages <= min_free_pages) {
+					tries <<= 1;
+				}
+			}
+		}
+		while (tries--) {
+			int gfp_mask;
 
-			pages = nr_free_pages;
-			if (nr_free_pages >= min_free_pages)
-				pages += atomic_read(&nr_async_pages);
-			if (pages >= free_pages_high)
+			if (free_memory_available())
 				break;
-			wait = (pages < free_pages_low);
-			if (try_to_free_page(GFP_KERNEL, 0, wait))
-				fail = 0;
+			gfp_mask = __GFP_IO;
+			try_to_free_page(gfp_mask);
+			/*
+			 * Syncing large chunks is faster than swapping
+			 * synchronously (less head movement). -- Rik.
+			 */
+			if (atomic_read(&nr_async_pages) >= SWAP_CLUSTER_MAX)
+				run_task_queue(&tq_disk);
+
 		}
-		/*
-		 * Report failure if we couldn't reach the minimum goal.
-		 */
-		if (nr_free_pages < min_free_pages)
-			printk("kswapd: failed, got %d of %d\n",
-				nr_free_pages, min_free_pages);
+#if 0
+		/*
+		 * Report failure if we couldn't even reach min_free_pages.
		 */
+		if (nr_free_pages < min_free_pages)
+			printk("kswapd: failed, got %d of %d\n",
+				nr_free_pages, min_free_pages);
+#endif
 	}
+	/* As if we could ever get here - maybe we want to make this killable */
+	remove_wait_queue(&kswapd_wait, &wait);
+	return 0;
 }
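
To make the new kswapd balancing concrete: instead of retrying until MAX_SWAP_FAIL consecutive failures, kswapd now derives an explicit number of freeing attempts from the free-page watermarks and doubles it each time free memory falls through another threshold. Below is a minimal userspace sketch of just that calculation, not kernel code: the function parameters stand in for the kernel globals of the same names, and the watermarks in main() are invented for illustration.

#include <stdio.h>

/* Mirror of the "tries" computation in kswapd() above. */
static int kswapd_tries(int nr_free_pages, int min_free_pages,
			int free_pages_low, int free_pages_high)
{
	int tries = free_pages_high - nr_free_pages;

	if (tries < min_free_pages) {
		tries = min_free_pages;
	} else if (nr_free_pages < (free_pages_high + free_pages_low) / 2) {
		tries <<= 1;			/* below the midpoint: 2x */
		if (nr_free_pages < free_pages_low) {
			tries <<= 1;		/* below the low mark: 4x */
			if (nr_free_pages <= min_free_pages)
				tries <<= 1;	/* nearly out: 8x */
		}
	}
	return tries;
}

int main(void)
{
	int free[] = { 55, 40, 25, 8 };		/* sample nr_free_pages values */
	int i;

	/* Invented watermarks: min 10, low 30, high 60 pages. */
	for (i = 0; i < 4; i++)
		printf("free=%2d -> tries=%d\n",
		       free[i], kswapd_tries(free[i], 10, 30, 60));
	return 0;
}

With those invented watermarks the sketch prints 10, 40, 140 and 416 attempts, which is the point of the change: the effort ramps up sharply as memory gets scarce, rather than depending on a failure counter.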
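
Further up, the rewritten try_to_swap_out() reduces to a four-way decision once the aging is done, governed by the invariant stated in Stephen's comment: a page read from swap is either swap-cached or pte-writable, never both. The sketch below models that decision in userspace; struct fake_page and its fields are stand-ins, not the kernel's struct page, and all the locking, pte and flushing detail is elided.

#include <stdio.h>

struct fake_page {
	int swap_cached;	/* PageSwapCache()                 */
	int pte_write;		/* pte allows writes               */
	int pte_young;		/* referenced since the last scan  */
	int pte_dirty;		/* modified since it was read in   */
	int age;		/* page age; 0 is oldest           */
};

static const char *swap_out_decision(struct fake_page *p)
{
	/* The invariant: swap-cached pages are never left writable. */
	if (p->swap_cached && p->pte_write)
		return "BUG: writable swap-cached page";

	if (p->pte_young) {
		p->age++;		/* touch_page(): rejuvenate */
		return "keep: referenced recently";
	}
	if (p->age) {
		p->age--;		/* age_page(): grow older */
		if (p->age)
			return "keep: not old enough yet";
	}
	if (p->pte_dirty) {
		if (p->swap_cached)	/* a copy is already on disk */
			return "free page and drop swap cache entry";
		return "allocate swap entry, add to cache, write out";
	}
	if (p->swap_cached)		/* clean, and already on disk */
		return "unlink pte to the swap entry, free page";
	return "discard: clean mmap()ed page, the file has a copy";
}

int main(void)
{
	struct fake_page dirty_uncached = { 0, 0, 0, 1, 1 };
	struct fake_page clean_cached   = { 1, 0, 0, 0, 1 };

	printf("%s\n", swap_out_decision(&dirty_uncached));
	printf("%s\n", swap_out_decision(&clean_cached));
	return 0;
}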
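
Finally, the read side. read_swap_cache_async() uses a classic optimistic lookup-or-allocate loop: allocate the new page first (the allocation may block), then re-check the cache, because another task may have installed the entry while we slept ("We might have stalled"). Here is the same shape as a self-contained userspace sketch; cache_lookup(), cache_insert() and the tiny direct-mapped table are hypothetical stand-ins for the swapper_inode page hash, malloc() stands in for __get_free_page(), and, being single-threaded, it only imitates the race.

#include <stdio.h>
#include <stdlib.h>

#define NSLOTS 8

static void *cache[NSLOTS];	/* entry -> page, NULL if absent */

static void *cache_lookup(unsigned int entry)
{
	return cache[entry % NSLOTS];
}

static void cache_insert(unsigned int entry, void *page)
{
	cache[entry % NSLOTS] = page;
}

/* Return the cached page for a swap entry, or read one in. */
static void *read_swap_cache(unsigned int entry)
{
	void *new_page = NULL;

	for (;;) {
		void *found = cache_lookup(entry);
		if (found) {
			free(new_page);	/* lost the race: discard ours */
			return found;
		}
		if (!new_page) {
			new_page = malloc(4096);	/* may block... */
			if (!new_page)
				return NULL;		/* out of memory */
			continue;	/* ...so re-check the cache */
		}
		cache_insert(entry, new_page);
		/* The kernel would now swap_duplicate() the entry, lock
		 * the page and start the disk read with rw_swap_page(). */
		return new_page;
	}
}

int main(void)
{
	void *a = read_swap_cache(42);
	void *b = read_swap_cache(42);	/* second call hits the cache */

	printf("same page: %s\n", a == b ? "yes" : "no");
	return 0;
}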