Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c    | 262
-rw-r--r-- | mm/memory.c     | 113
-rw-r--r-- | mm/mmap.c       | 254
-rw-r--r-- | mm/mmap_avl.c   | 374
-rw-r--r-- | mm/page_alloc.c | 153
-rw-r--r-- | mm/page_io.c    | 215
-rw-r--r-- | mm/swap.c       |  39
-rw-r--r-- | mm/swap_state.c |  60
-rw-r--r-- | mm/swapfile.c   |  13
-rw-r--r-- | mm/vmalloc.c    |   4
-rw-r--r-- | mm/vmscan.c     | 526
11 files changed, 1191 insertions, 822 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 227bcd5a9..3c15ea63b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -118,140 +118,83 @@ void remove_inode_page(struct page *page) __free_page(page); } -/* - * Check whether we can free this page. - */ -static inline int shrink_one_page(struct page *page, int gfp_mask) -{ - struct buffer_head *tmp, *bh; - - if (PageLocked(page)) - goto next; - if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) - goto next; - /* First of all, regenerate the page's referenced bit - * from any buffers in the page - */ - bh = page->buffers; - if (bh) { - tmp = bh; - do { - if (buffer_touched(tmp)) { - clear_bit(BH_Touched, &tmp->b_state); - set_bit(PG_referenced, &page->flags); - } - tmp = tmp->b_this_page; - } while (tmp != bh); - - /* Refuse to swap out all buffer pages */ - if (buffer_under_min()) - goto next; - } - - /* We can't throw away shared pages, but we do mark - them as referenced. This relies on the fact that - no page is currently in both the page cache and the - buffer cache; we'd have to modify the following - test to allow for that case. */ - - switch (atomic_read(&page->count)) { - case 1: - /* is it a swap-cache or page-cache page? */ - if (page->inode) { - if (test_and_clear_bit(PG_referenced, &page->flags)) - break; - if (pgcache_under_min()) - break; - if (PageSwapCache(page)) { - delete_from_swap_cache(page); - return 1; - } - remove_inode_page(page); - return 1; - } - /* It's not a cache page, so we don't do aging. - * If it has been referenced recently, don't free it */ - if (test_and_clear_bit(PG_referenced, &page->flags)) - break; - - if (buffer_under_min()) - break; - - /* is it a buffer cache page? */ - if (bh && try_to_free_buffer(bh, &bh, 6)) - return 1; - break; - - default: - /* more than one user: we can't throw it away */ - set_bit(PG_referenced, &page->flags); - /* fall through */ - case 0: - /* nothing */ - } -next: - return 0; -} - int shrink_mmap(int priority, int gfp_mask) { static unsigned long clock = 0; unsigned long limit = num_physpages; struct page * page; - int count_max, count_min; + int count; - count_max = limit; - count_min = (limit<<2) >> (priority); + count = limit >> priority; page = mem_map + clock; do { - if (PageSkip(page)) { - /* next_hash is overloaded for PageSkip */ - page = page->next_hash; - clock = page->map_nr; - } - - if (shrink_one_page(page, gfp_mask)) - return 1; - count_max--; - /* - * If the page we looked at was recyclable but we didn't - * reclaim it (presumably due to PG_referenced), don't - * count it as scanned. This way, the more referenced - * page cache pages we encounter, the more rapidly we - * will age them. + int referenced; + + /* This works even in the presence of PageSkip because + * the first two entries at the beginning of a hole will + * be marked, not just the first. */ - if (atomic_read(&page->count) != 1 || - (!page->inode && !page->buffers)) - count_min--; page++; clock++; if (clock >= max_mapnr) { clock = 0; page = mem_map; } - } while (count_max > 0 && count_min > 0); - return 0; -} + if (PageSkip(page)) { + /* next_hash is overloaded for PageSkip */ + page = page->next_hash; + clock = page - mem_map; + } + + referenced = test_and_clear_bit(PG_referenced, &page->flags); -/* - * This is called from try_to_swap_out() when we try to get rid of some - * pages.. If we're unmapping the last occurrence of this page, we also - * free it from the page hash-queues etc, as we don't want to keep it - * in-core unnecessarily. 
- */ -unsigned long page_unuse(struct page * page) -{ - int count = atomic_read(&page->count); - - if (count != 2) - return count; - if (!page->inode) - return count; - if (PageSwapCache(page)) - panic ("Doing a normal page_unuse of a swap cache page"); - remove_inode_page(page); - return 1; + if (PageLocked(page)) + continue; + + if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) + continue; + + /* We can't free pages unless there's just one user */ + if (atomic_read(&page->count) != 1) + continue; + + count--; + + /* + * Is it a page swap page? If so, we want to + * drop it if it is no longer used, even if it + * were to be marked referenced.. + */ + if (PageSwapCache(page)) { + if (referenced && swap_count(page->offset) != 1) + continue; + delete_from_swap_cache(page); + return 1; + } + + if (referenced) + continue; + + /* Is it a buffer page? */ + if (page->buffers) { + if (buffer_under_min()) + continue; + if (!try_to_free_buffers(page)) + continue; + return 1; + } + + /* is it a page-cache page? */ + if (page->inode) { + if (pgcache_under_min()) + continue; + remove_inode_page(page); + return 1; + } + + } while (count > 0); + return 0; } /* @@ -974,7 +917,7 @@ static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long struct file * file = area->vm_file; struct dentry * dentry = file->f_dentry; struct inode * inode = dentry->d_inode; - unsigned long offset; + unsigned long offset, reada, i; struct page * page, **hash; unsigned long old_page, new_page; @@ -1035,7 +978,18 @@ success: return new_page; no_cached_page: - new_page = __get_free_page(GFP_USER); + /* + * Try to read in an entire cluster at once. + */ + reada = offset; + reada >>= PAGE_SHIFT + page_cluster; + reada <<= PAGE_SHIFT + page_cluster; + + for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_SIZE) + new_page = try_to_read_ahead(file, reada, new_page); + + if (!new_page) + new_page = __get_free_page(GFP_USER); if (!new_page) goto no_page; @@ -1059,11 +1013,6 @@ no_cached_page: if (inode->i_op->readpage(file, page) != 0) goto failure; - /* - * Do a very limited read-ahead if appropriate - */ - if (PageLocked(page)) - new_page = try_to_read_ahead(file, offset + PAGE_SIZE, 0); goto found_page; page_locked_wait: @@ -1137,22 +1086,6 @@ static int filemap_write_page(struct vm_area_struct * vma, struct file * file; struct dentry * dentry; struct inode * inode; - struct buffer_head * bh; - - bh = mem_map[MAP_NR(page)].buffers; - if (bh) { - /* whee.. just mark the buffer heads dirty */ - struct buffer_head * tmp = bh; - do { - /* - * WSH: There's a race here: mark_buffer_dirty() - * could block, and the buffers aren't pinned down. - */ - mark_buffer_dirty(tmp, 0); - tmp = tmp->b_this_page; - } while (tmp != bh); - return 0; - } file = vma->vm_file; dentry = file->f_dentry; @@ -1174,50 +1107,15 @@ static int filemap_write_page(struct vm_area_struct * vma, /* - * Swapping to a shared file: while we're busy writing out the page - * (and the page still exists in memory), we save the page information - * in the page table, so that "filemap_swapin()" can re-use the page - * immediately if it is called while we're busy swapping it out.. - * - * Once we've written it all out, we mark the page entry "empty", which - * will result in a normal page-in (instead of a swap-in) from the now - * up-to-date disk file. + * The page cache takes care of races between somebody + * trying to swap something out and swap something in + * at the same time.. 
*/ -int filemap_swapout(struct vm_area_struct * vma, - unsigned long offset, - pte_t *page_table) -{ - int error; - unsigned long page = pte_page(*page_table); - unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page)); - - flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset)); - set_pte(page_table, __pte(entry)); - flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset)); - error = filemap_write_page(vma, offset, page); - if (pte_val(*page_table) == entry) - pte_clear(page_table); - return error; -} - -/* - * filemap_swapin() is called only if we have something in the page - * tables that is non-zero (but not present), which we know to be the - * page index of a page that is busy being swapped out (see above). - * So we just use it directly.. - */ -static pte_t filemap_swapin(struct vm_area_struct * vma, - unsigned long offset, - unsigned long entry) +int filemap_swapout(struct vm_area_struct * vma, struct page * page) { - unsigned long page = SWP_OFFSET(entry); - - atomic_inc(&mem_map[page].count); - page = (page << PAGE_SHIFT) + PAGE_OFFSET; - return mk_pte(page,vma->vm_page_prot); + return filemap_write_page(vma, page->offset, page_address(page)); } - static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { @@ -1358,7 +1256,7 @@ static struct vm_operations_struct file_shared_mmap = { filemap_nopage, /* nopage */ NULL, /* wppage */ filemap_swapout, /* swapout */ - filemap_swapin, /* swapin */ + NULL, /* swapin */ }; /* @@ -1637,7 +1535,7 @@ unsigned long get_cached_page(struct inode * inode, unsigned long offset, if (!page) { if (!new) goto out; - page_cache = get_free_page(GFP_KERNEL); + page_cache = get_free_page(GFP_USER); if (!page_cache) goto out; page = mem_map + MAP_NR(page_cache); diff --git a/mm/memory.c b/mm/memory.c index 932c35648..f788163c2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -126,48 +126,36 @@ int check_pgt_cache(void) * This function clears all user-level page tables of a process - this * is needed by execve(), so that old pages aren't in the way. */ -void clear_page_tables(struct task_struct * tsk) +void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr) { - pgd_t * page_dir = tsk->mm->pgd; - int i; - - if (!page_dir || page_dir == swapper_pg_dir) - goto out_bad; - for (i = 0 ; i < USER_PTRS_PER_PGD ; i++) - free_one_pgd(page_dir + i); + pgd_t * page_dir = mm->pgd; - /* keep the page table cache within bounds */ - check_pgt_cache(); - return; + if (page_dir && page_dir != swapper_pg_dir) { + page_dir += first; + do { + free_one_pgd(page_dir); + page_dir++; + } while (--nr); -out_bad: - printk(KERN_ERR - "clear_page_tables: %s trying to clear kernel pgd\n", - tsk->comm); - return; + /* keep the page table cache within bounds */ + check_pgt_cache(); + } } /* - * This function frees up all page tables of a process when it exits. It - * is the same as "clear_page_tables()", except it also frees the old - * page table directory. + * This function just free's the page directory - the + * pages tables themselves have been freed earlier by + * clear_page_tables(). 
*/ void free_page_tables(struct mm_struct * mm) { pgd_t * page_dir = mm->pgd; - int i; - - if (!page_dir) - goto out; - if (page_dir == swapper_pg_dir) - goto out_bad; - for (i = 0 ; i < USER_PTRS_PER_PGD ; i++) - free_one_pgd(page_dir + i); - pgd_free(page_dir); - - /* keep the page table cache within bounds */ - check_pgt_cache(); -out: + + if (page_dir) { + if (page_dir == swapper_pg_dir) + goto out_bad; + pgd_free(page_dir); + } return; out_bad: @@ -204,7 +192,7 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, pgd_t * src_pgd, * dst_pgd; unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; - unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE; + unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; @@ -277,10 +265,15 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; set_pte(dst_pte, pte); goto cont_copy_pte_range; } - if (cow) + /* If it's a COW mapping, write protect it both in the parent and the child */ + if (cow) { pte = pte_wrprotect(pte); + set_pte(src_pte, pte); + } + /* If it's a shared mapping, mark it clean in the child */ + if (vma->vm_flags & VM_SHARED) + pte = pte_mkclean(pte); set_pte(dst_pte, pte_mkold(pte)); - set_pte(src_pte, pte); atomic_inc(&mem_map[page_nr].count); cont_copy_pte_range: address += PAGE_SIZE; @@ -644,37 +637,47 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, page_map = mem_map + MAP_NR(old_page); /* - * Do we need to copy? + * We can avoid the copy if: + * - we're the only user (count == 1) + * - the only other user is the swap cache, + * and the only swap cache user is itself, + * in which case we can remove the page + * from the swap cache. */ - if (is_page_shared(page_map)) { + switch (atomic_read(&page_map->count)) { + case 2: + if (!PageSwapCache(page_map)) + break; + if (swap_count(page_map->offset) != 1) + break; + delete_from_swap_cache(page_map); + /* FallThrough */ + case 1: + /* We can release the kernel lock now.. */ unlock_kernel(); - if (!new_page) - return 0; - if (PageReserved(mem_map + MAP_NR(old_page))) - ++vma->vm_mm->rss; - copy_cow_page(old_page,new_page); - flush_page_to_ram(old_page); - flush_page_to_ram(new_page); flush_cache_page(vma, address); - set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); - free_page(old_page); + set_pte(page_table, pte_mkdirty(pte_mkwrite(pte))); flush_tlb_page(vma, address); +end_wp_page: + if (new_page) + free_page(new_page); return 1; } - - if (PageSwapCache(page_map)) - delete_from_swap_cache(page_map); - - /* We can release the kernel lock now.. */ + unlock_kernel(); + if (!new_page) + return 0; + if (PageReserved(mem_map + MAP_NR(old_page))) + ++vma->vm_mm->rss; + copy_cow_page(old_page,new_page); + flush_page_to_ram(old_page); + flush_page_to_ram(new_page); flush_cache_page(vma, address); - set_pte(page_table, pte_mkdirty(pte_mkwrite(pte))); + set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); + free_page(old_page); flush_tlb_page(vma, address); -end_wp_page: - if (new_page) - free_page(new_page); return 1; bad_wp_page: @@ -371,6 +371,100 @@ unsigned long get_unmapped_area(unsigned long addr, unsigned long len) } } +#define vm_avl_empty (struct vm_area_struct *) NULL + +#include "mmap_avl.c" + +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. 
*/ +struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) +{ + struct vm_area_struct *vma = NULL; + + if (mm) { + /* Check the cache first. */ + /* (Cache hit rate is typically around 35%.) */ + vma = mm->mmap_cache; + if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { + if (!mm->mmap_avl) { + /* Go through the linear list. */ + vma = mm->mmap; + while (vma && vma->vm_end <= addr) + vma = vma->vm_next; + } else { + /* Then go through the AVL tree quickly. */ + struct vm_area_struct * tree = mm->mmap_avl; + vma = NULL; + for (;;) { + if (tree == vm_avl_empty) + break; + if (tree->vm_end > addr) { + vma = tree; + if (tree->vm_start <= addr) + break; + tree = tree->vm_avl_left; + } else + tree = tree->vm_avl_right; + } + } + if (vma) + mm->mmap_cache = vma; + } + } + return vma; +} + +/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ +struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct **pprev) +{ + if (mm) { + if (!mm->mmap_avl) { + /* Go through the linear list. */ + struct vm_area_struct * prev = NULL; + struct vm_area_struct * vma = mm->mmap; + while (vma && vma->vm_end <= addr) { + prev = vma; + vma = vma->vm_next; + } + *pprev = prev; + return vma; + } else { + /* Go through the AVL tree quickly. */ + struct vm_area_struct * vma = NULL; + struct vm_area_struct * last_turn_right = NULL; + struct vm_area_struct * prev = NULL; + struct vm_area_struct * tree = mm->mmap_avl; + for (;;) { + if (tree == vm_avl_empty) + break; + if (tree->vm_end > addr) { + vma = tree; + prev = last_turn_right; + if (tree->vm_start <= addr) + break; + tree = tree->vm_avl_left; + } else { + last_turn_right = tree; + tree = tree->vm_avl_right; + } + } + if (vma) { + if (vma->vm_avl_left != vm_avl_empty) { + prev = vma->vm_avl_left; + while (prev->vm_avl_right != vm_avl_empty) + prev = prev->vm_avl_right; + } + if ((prev ? prev->vm_next : mm->mmap) != vma) + printk("find_vma_prev: tree inconsistent with list\n"); + *pprev = prev; + return vma; + } + } + } + *pprev = NULL; + return NULL; +} + /* Normal function to fix up a mapping * This function is the default for when an area has no specific * function. This may be used as part of a more specific routine. @@ -446,6 +540,57 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr, return 1; } +/* + * Try to free as many page directory entries as we can, + * without having to work very hard at actually scanning + * the page tables themselves. + * + * Right now we try to free page tables if we have a nice + * PGDIR-aligned area that got free'd up. We could be more + * granular if we want to, but this is fast and simple, + * and covers the bad cases. + * + * "prev", if it exists, points to a vma before the one + * we just free'd - but there's no telling how much before. 
+ */ +static void free_pgtables(struct mm_struct * mm, struct vm_area_struct *prev, + unsigned long start, unsigned long end) +{ + unsigned long first = start & PGDIR_MASK; + unsigned long last = (end + PGDIR_SIZE - 1) & PGDIR_MASK; + + if (!prev) { + prev = mm->mmap; + if (!prev) + goto no_mmaps; + if (prev->vm_end > start) { + if (last > prev->vm_end) + last = prev->vm_end; + goto no_mmaps; + } + } + for (;;) { + struct vm_area_struct *next = prev->vm_next; + + if (next) { + if (next->vm_start < start) { + prev = next; + continue; + } + if (last > next->vm_start) + last = next->vm_start; + } + if (prev->vm_end > first) + first = prev->vm_end + PGDIR_SIZE - 1; + break; + } +no_mmaps: + first = first >> PGDIR_SHIFT; + last = last >> PGDIR_SHIFT; + if (last > first) + clear_page_tables(mm, first, last-first); +} + /* Munmap is split into 2 main parts -- this part which finds * what needs doing, and the areas themselves, which do the * work. This now handles partial unmappings. @@ -454,8 +599,7 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr, int do_munmap(unsigned long addr, size_t len) { struct mm_struct * mm; - struct vm_area_struct *mpnt, *free, *extra; - int freed; + struct vm_area_struct *mpnt, *prev, **npp, *free, *extra; if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr) return -EINVAL; @@ -469,15 +613,17 @@ int do_munmap(unsigned long addr, size_t len) * on the list. If nothing is put on, nothing is affected. */ mm = current->mm; - mpnt = mm->mmap; - while(mpnt && mpnt->vm_end <= addr) - mpnt = mpnt->vm_next; + mpnt = find_vma_prev(mm, addr, &prev); if (!mpnt) return 0; + /* we have addr < mpnt->vm_end */ + + if (mpnt->vm_start >= addr+len) + return 0; /* If we'll make "hole", check the vm areas limit */ - if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) && - mm->map_count > MAX_MAP_COUNT) + if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) + && mm->map_count >= MAX_MAP_COUNT) return -ENOMEM; /* @@ -488,18 +634,14 @@ int do_munmap(unsigned long addr, size_t len) if (!extra) return -ENOMEM; - /* we have addr < mpnt->vm_end */ + npp = (prev ? &prev->vm_next : &mm->mmap); free = NULL; - for ( ; mpnt && mpnt->vm_start < addr+len; ) { - struct vm_area_struct *next = mpnt->vm_next; - - if(mpnt->vm_next) - mpnt->vm_next->vm_pprev = mpnt->vm_pprev; - *mpnt->vm_pprev = mpnt->vm_next; - + for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) { + *npp = mpnt->vm_next; mpnt->vm_next = free; free = mpnt; - mpnt = next; + if (mm->mmap_avl) + avl_remove(mpnt, &mm->mmap_avl); } /* Ok - we have the memory areas we should free on the 'free' list, @@ -507,15 +649,10 @@ int do_munmap(unsigned long addr, size_t len) * If the one of the segments is only being partially unmapped, * it will put new vm_area_struct(s) into the address space. */ - freed = 0; while ((mpnt = free) != NULL) { unsigned long st, end, size; free = free->vm_next; - freed = 1; - - mm->map_count--; - remove_shared_vm_struct(mpnt); st = addr < mpnt->vm_start ? mpnt->vm_start : addr; end = addr+len; @@ -525,6 +662,9 @@ int do_munmap(unsigned long addr, size_t len) if (mpnt->vm_ops && mpnt->vm_ops->unmap) mpnt->vm_ops->unmap(mpnt, st, size); + remove_shared_vm_struct(mpnt); + mm->map_count--; + flush_cache_range(mm, st, end); zap_page_range(mm, st, size); flush_tlb_range(mm, st, end); @@ -540,8 +680,9 @@ int do_munmap(unsigned long addr, size_t len) if (extra) kmem_cache_free(vm_area_cachep, extra); - if (freed) - mm->mmap_cache = NULL; /* Kill the cache. 
*/ + free_pgtables(mm, prev, addr, addr+len); + + mm->mmap_cache = NULL; /* Kill the cache. */ return 0; } @@ -557,13 +698,23 @@ asmlinkage int sys_munmap(unsigned long addr, size_t len) return ret; } +/* Build the AVL tree corresponding to the VMA list. */ +void build_mmap_avl(struct mm_struct * mm) +{ + struct vm_area_struct * vma; + + mm->mmap_avl = NULL; + for (vma = mm->mmap; vma; vma = vma->vm_next) + avl_insert(vma, &mm->mmap_avl); +} + /* Release all mmaps. */ void exit_mmap(struct mm_struct * mm) { struct vm_area_struct * mpnt; mpnt = mm->mmap; - mm->mmap = mm->mmap_cache = NULL; + mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL; mm->rss = 0; mm->total_vm = 0; mm->locked_vm = 0; @@ -591,6 +742,8 @@ void exit_mmap(struct mm_struct * mm) /* This is just debugging */ if (mm->map_count) printk("exit_mmap: map count is %d\n", mm->map_count); + + clear_page_tables(mm, 0, USER_PTRS_PER_PGD); } /* Insert vm structure into process list sorted by address @@ -598,20 +751,26 @@ void exit_mmap(struct mm_struct * mm) */ void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp) { - struct vm_area_struct **pprev = &mm->mmap; + struct vm_area_struct **pprev; struct file * file; - mm->map_count++; - - /* Find where to link it in. */ - while(*pprev && (*pprev)->vm_start <= vmp->vm_start) - pprev = &(*pprev)->vm_next; - - /* Insert it. */ - if((vmp->vm_next = *pprev) != NULL) - (*pprev)->vm_pprev = &vmp->vm_next; + if (!mm->mmap_avl) { + pprev = &mm->mmap; + while (*pprev && (*pprev)->vm_start <= vmp->vm_start) + pprev = &(*pprev)->vm_next; + } else { + struct vm_area_struct *prev, *next; + avl_insert_neighbours(vmp, &mm->mmap_avl, &prev, &next); + pprev = (prev ? &prev->vm_next : &mm->mmap); + if (*pprev != next) + printk("insert_vm_struct: tree inconsistent with list\n"); + } + vmp->vm_next = *pprev; *pprev = vmp; - vmp->vm_pprev = pprev; + + mm->map_count++; + if (mm->map_count >= AVL_MIN_MAP_COUNT && !mm->mmap_avl) + build_mmap_avl(mm); file = vmp->vm_file; if (file) { @@ -637,23 +796,17 @@ void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp) */ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) { - struct vm_area_struct *prev, *mpnt, *next; + struct vm_area_struct *prev, *mpnt, *next, *prev1; - prev = NULL; - mpnt = mm->mmap; - while(mpnt && mpnt->vm_end <= start_addr) { - prev = mpnt; - mpnt = mpnt->vm_next; - } + mpnt = find_vma_prev(mm, start_addr, &prev1); if (!mpnt) return; - next = mpnt->vm_next; - - /* we have prev->vm_next == mpnt && mpnt->vm_next = next */ - if (!prev) { + if (prev1) { + prev = prev1; + } else { prev = mpnt; - mpnt = next; + mpnt = mpnt->vm_next; } /* prev and mpnt cycle through the list, as long as @@ -684,11 +837,10 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l * big segment can possibly merge with the next one. * The old unused mpnt is freed. */ - if(mpnt->vm_next) - mpnt->vm_next->vm_pprev = mpnt->vm_pprev; - *mpnt->vm_pprev = mpnt->vm_next; - + if (mm->mmap_avl) + avl_remove(mpnt, &mm->mmap_avl); prev->vm_end = mpnt->vm_end; + prev->vm_next = mpnt->vm_next; if (mpnt->vm_ops && mpnt->vm_ops->close) { mpnt->vm_offset += mpnt->vm_end - mpnt->vm_start; mpnt->vm_start = mpnt->vm_end; diff --git a/mm/mmap_avl.c b/mm/mmap_avl.c new file mode 100644 index 000000000..5a48ce89b --- /dev/null +++ b/mm/mmap_avl.c @@ -0,0 +1,374 @@ +/* + * Searching a VMA in the linear list task->mm->mmap is horribly slow. 
+ * Use an AVL (Adelson-Velskii and Landis) tree to speed up this search + * from O(n) to O(log n), where n is the number of VMAs of the task + * n is typically around 6, but may reach 3000 in some cases: object-oriented + * databases, persistent store, generational garbage collection (Java, Lisp), + * ElectricFence. + * Written by Bruno Haible <haible@ma2s2.mathematik.uni-karlsruhe.de>. + */ + +/* We keep the list and tree sorted by address. */ +#define vm_avl_key vm_end +#define vm_avl_key_t unsigned long /* typeof(vma->avl_key) */ + +/* + * task->mm->mmap_avl is the AVL tree corresponding to task->mm->mmap + * or, more exactly, its root. + * A vm_area_struct has the following fields: + * vm_avl_left left son of a tree node + * vm_avl_right right son of a tree node + * vm_avl_height 1+max(heightof(left),heightof(right)) + * The empty tree is represented as NULL. + */ + +/* Since the trees are balanced, their height will never be large. */ +#define avl_maxheight 41 /* why this? a small exercise */ +#define heightof(tree) ((tree) == vm_avl_empty ? 0 : (tree)->vm_avl_height) +/* + * Consistency and balancing rules: + * 1. tree->vm_avl_height == 1+max(heightof(tree->vm_avl_left),heightof(tree->vm_avl_right)) + * 2. abs( heightof(tree->vm_avl_left) - heightof(tree->vm_avl_right) ) <= 1 + * 3. foreach node in tree->vm_avl_left: node->vm_avl_key <= tree->vm_avl_key, + * foreach node in tree->vm_avl_right: node->vm_avl_key >= tree->vm_avl_key. + */ + +#ifdef DEBUG_AVL + +/* Look up the nodes at the left and at the right of a given node. */ +static void avl_neighbours (struct vm_area_struct * node, struct vm_area_struct * tree, struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right) +{ + vm_avl_key_t key = node->vm_avl_key; + + *to_the_left = *to_the_right = NULL; + for (;;) { + if (tree == vm_avl_empty) { + printk("avl_neighbours: node not found in the tree\n"); + return; + } + if (key == tree->vm_avl_key) + break; + if (key < tree->vm_avl_key) { + *to_the_right = tree; + tree = tree->vm_avl_left; + } else { + *to_the_left = tree; + tree = tree->vm_avl_right; + } + } + if (tree != node) { + printk("avl_neighbours: node not exactly found in the tree\n"); + return; + } + if (tree->vm_avl_left != vm_avl_empty) { + struct vm_area_struct * node; + for (node = tree->vm_avl_left; node->vm_avl_right != vm_avl_empty; node = node->vm_avl_right) + continue; + *to_the_left = node; + } + if (tree->vm_avl_right != vm_avl_empty) { + struct vm_area_struct * node; + for (node = tree->vm_avl_right; node->vm_avl_left != vm_avl_empty; node = node->vm_avl_left) + continue; + *to_the_right = node; + } + if ((*to_the_left && ((*to_the_left)->vm_next != node)) || (node->vm_next != *to_the_right)) + printk("avl_neighbours: tree inconsistent with list\n"); +} + +#endif + +/* + * Rebalance a tree. + * After inserting or deleting a node of a tree we have a sequence of subtrees + * nodes[0]..nodes[k-1] such that + * nodes[0] is the root and nodes[i+1] = nodes[i]->{vm_avl_left|vm_avl_right}. 
+ */ +static void avl_rebalance (struct vm_area_struct *** nodeplaces_ptr, int count) +{ + for ( ; count > 0 ; count--) { + struct vm_area_struct ** nodeplace = *--nodeplaces_ptr; + struct vm_area_struct * node = *nodeplace; + struct vm_area_struct * nodeleft = node->vm_avl_left; + struct vm_area_struct * noderight = node->vm_avl_right; + int heightleft = heightof(nodeleft); + int heightright = heightof(noderight); + if (heightright + 1 < heightleft) { + /* */ + /* * */ + /* / \ */ + /* n+2 n */ + /* */ + struct vm_area_struct * nodeleftleft = nodeleft->vm_avl_left; + struct vm_area_struct * nodeleftright = nodeleft->vm_avl_right; + int heightleftright = heightof(nodeleftright); + if (heightof(nodeleftleft) >= heightleftright) { + /* */ + /* * n+2|n+3 */ + /* / \ / \ */ + /* n+2 n --> / n+1|n+2 */ + /* / \ | / \ */ + /* n+1 n|n+1 n+1 n|n+1 n */ + /* */ + node->vm_avl_left = nodeleftright; nodeleft->vm_avl_right = node; + nodeleft->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightleftright); + *nodeplace = nodeleft; + } else { + /* */ + /* * n+2 */ + /* / \ / \ */ + /* n+2 n --> n+1 n+1 */ + /* / \ / \ / \ */ + /* n n+1 n L R n */ + /* / \ */ + /* L R */ + /* */ + nodeleft->vm_avl_right = nodeleftright->vm_avl_left; + node->vm_avl_left = nodeleftright->vm_avl_right; + nodeleftright->vm_avl_left = nodeleft; + nodeleftright->vm_avl_right = node; + nodeleft->vm_avl_height = node->vm_avl_height = heightleftright; + nodeleftright->vm_avl_height = heightleft; + *nodeplace = nodeleftright; + } + } + else if (heightleft + 1 < heightright) { + /* similar to the above, just interchange 'left' <--> 'right' */ + struct vm_area_struct * noderightright = noderight->vm_avl_right; + struct vm_area_struct * noderightleft = noderight->vm_avl_left; + int heightrightleft = heightof(noderightleft); + if (heightof(noderightright) >= heightrightleft) { + node->vm_avl_right = noderightleft; noderight->vm_avl_left = node; + noderight->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightrightleft); + *nodeplace = noderight; + } else { + noderight->vm_avl_left = noderightleft->vm_avl_right; + node->vm_avl_right = noderightleft->vm_avl_left; + noderightleft->vm_avl_right = noderight; + noderightleft->vm_avl_left = node; + noderight->vm_avl_height = node->vm_avl_height = heightrightleft; + noderightleft->vm_avl_height = heightright; + *nodeplace = noderightleft; + } + } + else { + int height = (heightleft<heightright ? heightright : heightleft) + 1; + if (height == node->vm_avl_height) + break; + node->vm_avl_height = height; + } + } +} + +/* Insert a node into a tree. */ +static inline void avl_insert (struct vm_area_struct * new_node, struct vm_area_struct ** ptree) +{ + vm_avl_key_t key = new_node->vm_avl_key; + struct vm_area_struct ** nodeplace = ptree; + struct vm_area_struct ** stack[avl_maxheight]; + int stack_count = 0; + struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */ + for (;;) { + struct vm_area_struct * node = *nodeplace; + if (node == vm_avl_empty) + break; + *stack_ptr++ = nodeplace; stack_count++; + if (key < node->vm_avl_key) + nodeplace = &node->vm_avl_left; + else + nodeplace = &node->vm_avl_right; + } + new_node->vm_avl_left = vm_avl_empty; + new_node->vm_avl_right = vm_avl_empty; + new_node->vm_avl_height = 1; + *nodeplace = new_node; + avl_rebalance(stack_ptr,stack_count); +} + +/* Insert a node into a tree, and + * return the node to the left of it and the node to the right of it. 
+ */ +static inline void avl_insert_neighbours (struct vm_area_struct * new_node, struct vm_area_struct ** ptree, + struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right) +{ + vm_avl_key_t key = new_node->vm_avl_key; + struct vm_area_struct ** nodeplace = ptree; + struct vm_area_struct ** stack[avl_maxheight]; + int stack_count = 0; + struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */ + *to_the_left = *to_the_right = NULL; + for (;;) { + struct vm_area_struct * node = *nodeplace; + if (node == vm_avl_empty) + break; + *stack_ptr++ = nodeplace; stack_count++; + if (key < node->vm_avl_key) { + *to_the_right = node; + nodeplace = &node->vm_avl_left; + } else { + *to_the_left = node; + nodeplace = &node->vm_avl_right; + } + } + new_node->vm_avl_left = vm_avl_empty; + new_node->vm_avl_right = vm_avl_empty; + new_node->vm_avl_height = 1; + *nodeplace = new_node; + avl_rebalance(stack_ptr,stack_count); +} + +/* Removes a node out of a tree. */ +static void avl_remove (struct vm_area_struct * node_to_delete, struct vm_area_struct ** ptree) +{ + vm_avl_key_t key = node_to_delete->vm_avl_key; + struct vm_area_struct ** nodeplace = ptree; + struct vm_area_struct ** stack[avl_maxheight]; + int stack_count = 0; + struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */ + struct vm_area_struct ** nodeplace_to_delete; + for (;;) { + struct vm_area_struct * node = *nodeplace; +#ifdef DEBUG_AVL + if (node == vm_avl_empty) { + /* what? node_to_delete not found in tree? */ + printk("avl_remove: node to delete not found in tree\n"); + return; + } +#endif + *stack_ptr++ = nodeplace; stack_count++; + if (key == node->vm_avl_key) + break; + if (key < node->vm_avl_key) + nodeplace = &node->vm_avl_left; + else + nodeplace = &node->vm_avl_right; + } + nodeplace_to_delete = nodeplace; + /* Have to remove node_to_delete = *nodeplace_to_delete. 
*/ + if (node_to_delete->vm_avl_left == vm_avl_empty) { + *nodeplace_to_delete = node_to_delete->vm_avl_right; + stack_ptr--; stack_count--; + } else { + struct vm_area_struct *** stack_ptr_to_delete = stack_ptr; + struct vm_area_struct ** nodeplace = &node_to_delete->vm_avl_left; + struct vm_area_struct * node; + for (;;) { + node = *nodeplace; + if (node->vm_avl_right == vm_avl_empty) + break; + *stack_ptr++ = nodeplace; stack_count++; + nodeplace = &node->vm_avl_right; + } + *nodeplace = node->vm_avl_left; + /* node replaces node_to_delete */ + node->vm_avl_left = node_to_delete->vm_avl_left; + node->vm_avl_right = node_to_delete->vm_avl_right; + node->vm_avl_height = node_to_delete->vm_avl_height; + *nodeplace_to_delete = node; /* replace node_to_delete */ + *stack_ptr_to_delete = &node->vm_avl_left; /* replace &node_to_delete->vm_avl_left */ + } + avl_rebalance(stack_ptr,stack_count); +} + +#ifdef DEBUG_AVL + +/* print a list */ +static void printk_list (struct vm_area_struct * vma) +{ + printk("["); + while (vma) { + printk("%08lX-%08lX", vma->vm_start, vma->vm_end); + vma = vma->vm_next; + if (!vma) + break; + printk(" "); + } + printk("]"); +} + +/* print a tree */ +static void printk_avl (struct vm_area_struct * tree) +{ + if (tree != vm_avl_empty) { + printk("("); + if (tree->vm_avl_left != vm_avl_empty) { + printk_avl(tree->vm_avl_left); + printk("<"); + } + printk("%08lX-%08lX", tree->vm_start, tree->vm_end); + if (tree->vm_avl_right != vm_avl_empty) { + printk(">"); + printk_avl(tree->vm_avl_right); + } + printk(")"); + } +} + +static char *avl_check_point = "somewhere"; + +/* check a tree's consistency and balancing */ +static void avl_checkheights (struct vm_area_struct * tree) +{ + int h, hl, hr; + + if (tree == vm_avl_empty) + return; + avl_checkheights(tree->vm_avl_left); + avl_checkheights(tree->vm_avl_right); + h = tree->vm_avl_height; + hl = heightof(tree->vm_avl_left); + hr = heightof(tree->vm_avl_right); + if ((h == hl+1) && (hr <= hl) && (hl <= hr+1)) + return; + if ((h == hr+1) && (hl <= hr) && (hr <= hl+1)) + return; + printk("%s: avl_checkheights: heights inconsistent\n",avl_check_point); +} + +/* check that all values stored in a tree are < key */ +static void avl_checkleft (struct vm_area_struct * tree, vm_avl_key_t key) +{ + if (tree == vm_avl_empty) + return; + avl_checkleft(tree->vm_avl_left,key); + avl_checkleft(tree->vm_avl_right,key); + if (tree->vm_avl_key < key) + return; + printk("%s: avl_checkleft: left key %lu >= top key %lu\n",avl_check_point,tree->vm_avl_key,key); +} + +/* check that all values stored in a tree are > key */ +static void avl_checkright (struct vm_area_struct * tree, vm_avl_key_t key) +{ + if (tree == vm_avl_empty) + return; + avl_checkright(tree->vm_avl_left,key); + avl_checkright(tree->vm_avl_right,key); + if (tree->vm_avl_key > key) + return; + printk("%s: avl_checkright: right key %lu <= top key %lu\n",avl_check_point,tree->vm_avl_key,key); +} + +/* check that all values are properly increasing */ +static void avl_checkorder (struct vm_area_struct * tree) +{ + if (tree == vm_avl_empty) + return; + avl_checkorder(tree->vm_avl_left); + avl_checkorder(tree->vm_avl_right); + avl_checkleft(tree->vm_avl_left,tree->vm_avl_key); + avl_checkright(tree->vm_avl_right,tree->vm_avl_key); +} + +/* all checks */ +static void avl_check (struct task_struct * task, char *caller) +{ + avl_check_point = caller; +/* printk("task \"%s\", %s\n",task->comm,caller); */ +/* printk("task \"%s\" list: ",task->comm); printk_list(task->mm->mmap); 
printk("\n"); */ +/* printk("task \"%s\" tree: ",task->comm); printk_avl(task->mm->mmap_avl); printk("\n"); */ + avl_checkheights(task->mm->mmap_avl); + avl_checkorder(task->mm->mmap_avl); +} + +#endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7ceec01b9..4a956c085 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -90,33 +90,6 @@ static inline void remove_mem_queue(struct page * entry) */ spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED; -/* - * This routine is used by the kernel swap daemon to determine - * whether we have "enough" free pages. It is fairly arbitrary, - * having a low-water and high-water mark. - * - * This returns: - * 0 - urgent need for memory - * 1 - need some memory, but do it slowly in the background - * 2 - no need to even think about it. - */ -int free_memory_available(void) -{ - static int available = 1; - - if (nr_free_pages < freepages.low) { - available = 0; - return 0; - } - - if (nr_free_pages > freepages.high) { - available = 1; - return 2; - } - - return available; -} - static inline void free_pages_ok(unsigned long map_nr, unsigned long order) { struct free_area_struct *area = free_area + order; @@ -151,14 +124,10 @@ void __free_page(struct page *page) if (!PageReserved(page) && atomic_dec_and_test(&page->count)) { if (PageSwapCache(page)) panic ("Freeing swap cache page"); - free_pages_ok(page->map_nr, 0); + page->flags &= ~(1 << PG_referenced); + free_pages_ok(page - mem_map, 0); return; } -#if 0 - if (PageSwapCache(page) && atomic_read(&page->count) == 1) - printk(KERN_WARNING "VM: Releasing swap cache page at %p", - __builtin_return_address(0)); -#endif } void free_pages(unsigned long addr, unsigned long order) @@ -172,15 +141,10 @@ void free_pages(unsigned long addr, unsigned long order) if (atomic_dec_and_test(&map->count)) { if (PageSwapCache(map)) panic ("Freeing swap cache pages"); + map->flags &= ~(1 << PG_referenced); free_pages_ok(map_nr, order); return; } -#if 0 - if (PageSwapCache(map) && atomic_read(&map->count) == 1) - printk(KERN_WARNING - "VM: Releasing swap cache pages at %p", - __builtin_return_address(0)); -#endif } } @@ -191,14 +155,15 @@ void free_pages(unsigned long addr, unsigned long order) change_bit((index) >> (1+(order)), (area)->map) #define CAN_DMA(x) (PageDMA(x)) #define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT)) -#define RMQUEUE(order, dma) \ +#define RMQUEUE(order, gfp_mask) \ do { struct free_area_struct * area = free_area+order; \ unsigned long new_order = order; \ do { struct page *prev = memory_head(area), *ret = prev->next; \ while (memory_head(area) != ret) { \ - if (!dma || CAN_DMA(ret)) { \ - unsigned long map_nr = ret->map_nr; \ + if (!(gfp_mask & __GFP_DMA) || CAN_DMA(ret)) { \ + unsigned long map_nr; \ (prev->next = ret->next)->prev = prev; \ + map_nr = ret - mem_map; \ MARK_USED(map_nr, new_order, area); \ nr_free_pages -= 1 << order; \ EXPAND(ret, map_nr, order, new_order, area); \ @@ -224,6 +189,8 @@ do { unsigned long size = 1 << high; \ atomic_set(&map->count, 1); \ } while (0) +int low_on_memory = 0; + unsigned long __get_free_pages(int gfp_mask, unsigned long order) { unsigned long flags; @@ -231,30 +198,45 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order) if (order >= NR_MEM_LISTS) goto nopage; - if (gfp_mask & __GFP_WAIT) { - if (in_interrupt()) { - static int count = 0; - if (++count < 5) { - printk("gfp called nonatomically from interrupt %p\n", - __builtin_return_address(0)); - } - goto nopage; +#ifdef ATOMIC_MEMORY_DEBUGGING + if ((gfp_mask & __GFP_WAIT) && 
in_interrupt()) { + static int count = 0; + if (++count < 5) { + printk("gfp called nonatomically from interrupt %p\n", + __builtin_return_address(0)); } + goto nopage; + } +#endif - if (freepages.min > nr_free_pages) { - int freed; - freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX); - /* - * Low priority (user) allocations must not - * succeed if we didn't have enough memory - * and we couldn't get more.. - */ - if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH))) - goto nopage; + /* + * If this is a recursive call, we'd better + * do our best to just allocate things without + * further thought. + */ + if (!(current->flags & PF_MEMALLOC)) { + int freed; + + if (nr_free_pages > freepages.min) { + if (!low_on_memory) + goto ok_to_allocate; + if (nr_free_pages >= freepages.high) { + low_on_memory = 0; + goto ok_to_allocate; + } } + + low_on_memory = 1; + current->flags |= PF_MEMALLOC; + freed = try_to_free_pages(gfp_mask); + current->flags &= ~PF_MEMALLOC; + + if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH))) + goto nopage; } +ok_to_allocate: spin_lock_irqsave(&page_alloc_lock, flags); - RMQUEUE(order, (gfp_mask & GFP_DMA)); + RMQUEUE(order, gfp_mask); spin_unlock_irqrestore(&page_alloc_lock, flags); /* @@ -341,7 +323,6 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m --p; atomic_set(&p->count, 0); p->flags = (1 << PG_DMA) | (1 << PG_reserved); - p->map_nr = p - mem_map; } while (p > mem_map); for (i = 0 ; i < NR_MEM_LISTS ; i++) { @@ -359,6 +340,46 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m return start_mem; } +/* + * Primitive swap readahead code. We simply read an aligned block of + * (1 << page_cluster) entries in the swap area. This method is chosen + * because it doesn't cost us any seek time. We also make sure to queue + * the 'original' request together with the readahead ones... + */ +void swapin_readahead(unsigned long entry) +{ + int i; + struct page *new_page; + unsigned long offset = SWP_OFFSET(entry); + struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info; + + offset = (offset >> page_cluster) << page_cluster; + + i = 1 << page_cluster; + do { + /* Don't read-ahead past the end of the swap area */ + if (offset >= swapdev->max) + break; + /* Don't block on I/O for read-ahead */ + if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster) + break; + /* Don't read in bad or busy pages */ + if (!swapdev->swap_map[offset]) + break; + if (swapdev->swap_map[offset] == SWAP_MAP_BAD) + break; + if (test_bit(offset, swapdev->swap_lockmap)) + break; + + /* Ok, do the async read-ahead now */ + new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0); + if (new_page != NULL) + __free_page(new_page); + offset++; + } while (--i); + return; +} + /* * The tests may look silly, but it essentially makes sure that * no other process did a swap-in on us just as we were waiting. 
@@ -370,10 +391,12 @@ void swap_in(struct task_struct * tsk, struct vm_area_struct * vma, pte_t * page_table, unsigned long entry, int write_access) { unsigned long page; - struct page *page_map; - - page_map = read_swap_cache(entry); + struct page *page_map = lookup_swap_cache(entry); + if (!page_map) { + swapin_readahead(entry); + page_map = read_swap_cache(entry); + } if (pte_val(*page_table) != entry) { if (page_map) free_page_and_swap_cache(page_address(page_map)); diff --git a/mm/page_io.c b/mm/page_io.c index 2dd24facc..498e4f63d 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -7,6 +7,7 @@ * Asynchronous swapping added 30.12.95. Stephen Tweedie * Removed race in async swapping. 14.4.1996. Bruno Haible * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie + * Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman */ #include <linux/mm.h> @@ -15,8 +16,6 @@ #include <linux/locks.h> #include <linux/swapctl.h> -#include <asm/dma.h> -#include <asm/uaccess.h> /* for copy_to/from_user */ #include <asm/pgtable.h> static struct wait_queue * lock_queue = NULL; @@ -24,8 +23,6 @@ static struct wait_queue * lock_queue = NULL; /* * Reads or writes a swap page. * wait=1: start I/O and wait for completion. wait=0: start asynchronous I/O. - * All IO to swap files (as opposed to swap partitions) is done - * synchronously. * * Important prevention of race condition: the caller *must* atomically * create a unique swap cache entry for this swap page before calling @@ -38,21 +35,22 @@ static struct wait_queue * lock_queue = NULL; * that shared pages stay shared while being swapped. */ -void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) +static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, int wait) { unsigned long type, offset; struct swap_info_struct * p; - struct page *page = mem_map + MAP_NR(buf); + int zones[PAGE_SIZE/512]; + int zones_used; + kdev_t dev = 0; + int block_size; #ifdef DEBUG_SWAP printk ("DebugVM: %s_swap_page entry %08lx, page %p (count %d), %s\n", (rw == READ) ? "read" : "write", - entry, buf, atomic_read(&page->count), + entry, (char *) page_address(page), atomic_read(&page->count), wait ? "wait" : "nowait"); #endif - if (page->inode && page->inode != &swapper_inode) - panic ("Tried to swap a non-swapper page"); type = SWP_TYPE(entry); if (type >= nr_swapfiles) { printk("Internal error: bad swap-device\n"); @@ -60,7 +58,7 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) } /* Don't allow too many pending pages in flight.. */ - if (atomic_read(&nr_async_pages) > SWAP_CLUSTER_MAX) + if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster) wait = 1; p = &swap_info[type]; @@ -85,13 +83,27 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) printk(KERN_ERR "VM: swap page is unlocked\n"); return; } - - /* Make sure we are the only process doing I/O with this swap page. */ - while (test_and_set_bit(offset,p->swap_lockmap)) { - run_task_queue(&tq_disk); - sleep_on(&lock_queue); + + if (PageSwapCache(page)) { + /* Make sure we are the only process doing I/O with this swap page. */ + while (test_and_set_bit(offset,p->swap_lockmap)) { + run_task_queue(&tq_disk); + sleep_on(&lock_queue); + } + + /* + * Make sure that we have a swap cache association for this + * page. We need this to find which swap page to unlock once + * the swap IO has completed to the physical page. 
If the page + * is not already in the cache, just overload the offset entry + * as if it were: we are not allowed to manipulate the inode + * hashing for locked pages. + */ + if (page->offset != entry) { + printk ("swap entry mismatch"); + return; + } } - if (rw == READ) { clear_bit(PG_uptodate, &page->flags); kstat.pswpin++; @@ -99,54 +111,25 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) kstat.pswpout++; atomic_inc(&page->count); - /* - * Make sure that we have a swap cache association for this - * page. We need this to find which swap page to unlock once - * the swap IO has completed to the physical page. If the page - * is not already in the cache, just overload the offset entry - * as if it were: we are not allowed to manipulate the inode - * hashing for locked pages. - */ - if (!PageSwapCache(page)) { - printk(KERN_ERR "VM: swap page is not in swap cache\n"); - return; - } - if (page->offset != entry) { - printk (KERN_ERR "VM: swap entry mismatch\n"); - return; - } - if (p->swap_device) { - if (!wait) { - set_bit(PG_free_after, &page->flags); - set_bit(PG_decr_after, &page->flags); - set_bit(PG_swap_unlock_after, &page->flags); - atomic_inc(&nr_async_pages); - } - ll_rw_page(rw,p->swap_device,offset,buf); - /* - * NOTE! We don't decrement the page count if we - * don't wait - that will happen asynchronously - * when the IO completes. - */ - if (!wait) - return; - wait_on_page(page); + zones[0] = offset; + zones_used = 1; + dev = p->swap_device; + block_size = PAGE_SIZE; } else if (p->swap_file) { struct inode *swapf = p->swap_file->d_inode; - unsigned int zones[PAGE_SIZE/512]; int i; if (swapf->i_op->bmap == NULL && swapf->i_op->smap != NULL){ /* - With MS-DOS, we use msdos_smap which return + With MS-DOS, we use msdos_smap which returns a sector number (not a cluster or block number). It is a patch to enable the UMSDOS project. Other people are working on better solution. It sounds like ll_rw_swap_file defined - it operation size (sector size) based on - PAGE_SIZE and the number of block to read. + its operation size (sector size) based on + PAGE_SIZE and the number of blocks to read. So using bmap or smap should work even if smap will require more blocks. */ @@ -159,39 +142,72 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) return; } } + block_size = 512; }else{ int j; unsigned int block = offset << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits); - for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize) + block_size = swapf->i_sb->s_blocksize; + for (i=0, j=0; j< PAGE_SIZE ; i++, j += block_size) if (!(zones[i] = bmap(swapf,block++))) { printk("rw_swap_page: bad swap file\n"); return; } + zones_used = i; + dev = swapf->i_dev; } - ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf); - /* Unlike ll_rw_page, ll_rw_swap_file won't unlock the - page for us. */ - clear_bit(PG_locked, &page->flags); - wake_up(&page->wait); - } else + } else { printk(KERN_ERR "rw_swap_page: no swap file or device\n"); + /* Do some cleaning up so if this ever happens we can hopefully + * trigger controlled shutdown. + */ + if (PageSwapCache(page)) { + if (!test_and_clear_bit(offset,p->swap_lockmap)) + printk("swap_after_unlock_page: lock already cleared\n"); + wake_up(&lock_queue); + } + atomic_dec(&page->count); + return; + } + if (!wait) { + set_bit(PG_decr_after, &page->flags); + atomic_inc(&nr_async_pages); + } + if (PageSwapCache(page)) { + /* only lock/unlock swap cache pages! 
*/ + set_bit(PG_swap_unlock_after, &page->flags); + } + set_bit(PG_free_after, &page->flags); + /* block_size == PAGE_SIZE/zones_used */ + brw_page(rw, page, dev, zones, block_size, 0); + + /* Note! For consistency we do all of the logic, + * decrementing the page count, and unlocking the page in the + * swap lock map - in the IO completion handler. + */ + if (!wait) + return; + wait_on_page(page); /* This shouldn't happen, but check to be sure. */ - if (atomic_read(&page->count) == 1) + if (atomic_read(&page->count) == 0) printk(KERN_ERR "rw_swap_page: page unused while waiting!\n"); - atomic_dec(&page->count); - if (offset && !test_and_clear_bit(offset,p->swap_lockmap)) - printk(KERN_ERR "rw_swap_page: lock already cleared\n"); - wake_up(&lock_queue); + #ifdef DEBUG_SWAP printk ("DebugVM: %s_swap_page finished on page %p (count %d)\n", (rw == READ) ? "read" : "write", - buf, atomic_read(&page->count)); + (char *) page_adddress(page), + atomic_read(&page->count)); #endif } +/* Note: We could remove this totally asynchronous function, + * and improve swap performance, and remove the need for the swap lock map, + * by not removing pages from the swap cache until after I/O has been + * processed and letting remove_from_page_cache decrement the swap count + * just before it removes the page from the page cache. + */ /* This is run when asynchronous page I/O has completed. */ void swap_after_unlock_page (unsigned long entry) { @@ -214,6 +230,35 @@ void swap_after_unlock_page (unsigned long entry) wake_up(&lock_queue); } +/* A simple wrapper so the base function doesn't need to enforce + * that all swap pages go through the swap cache! + */ +void rw_swap_page(int rw, unsigned long entry, char *buf, int wait) +{ + struct page *page = mem_map + MAP_NR(buf); + + if (page->inode && page->inode != &swapper_inode) + panic ("Tried to swap a non-swapper page"); + + /* + * Make sure that we have a swap cache association for this + * page. We need this to find which swap page to unlock once + * the swap IO has completed to the physical page. If the page + * is not already in the cache, just overload the offset entry + * as if it were: we are not allowed to manipulate the inode + * hashing for locked pages. + */ + if (!PageSwapCache(page)) { + printk("VM: swap page is not in swap cache\n"); + return; + } + if (page->offset != entry) { + printk ("swap entry mismatch"); + return; + } + rw_swap_page_base(rw, entry, page, wait); +} + /* * Setting up a new swap file needs a simple wrapper just to read the * swap signature. SysV shared memory also needs a simple wrapper. @@ -242,33 +287,23 @@ void rw_swap_page_nocache(int rw, unsigned long entry, char *buffer) clear_bit(PG_swap_cache, &page->flags); } - - /* - * Swap partitions are now read via brw_page. ll_rw_page is an - * asynchronous function now --- we must call wait_on_page afterwards - * if synchronous IO is required. + * shmfs needs a version that doesn't put the page in the page cache! + * The swap lock map insists that pages be in the page cache! + * Therefore we can't use it. Later when we can remove the need for the + * lock map and we can reduce the number of functions exported. 
*/ -void ll_rw_page(int rw, kdev_t dev, unsigned long offset, char * buffer) +void rw_swap_page_nolock(int rw, unsigned long entry, char *buffer, int wait) { - int block = offset; - struct page *page; - - switch (rw) { - case READ: - break; - case WRITE: - if (is_read_only(dev)) { - printk("Can't page to read-only device %s\n", - kdevname(dev)); - return; - } - break; - default: - panic("ll_rw_page: bad block dev cmd, must be R/W"); + struct page *page = mem_map + MAP_NR((unsigned long) buffer); + + if (!PageLocked(page)) { + printk("VM: rw_swap_page_nolock: page not locked!\n"); + return; + } + if (PageSwapCache(page)) { + printk ("VM: rw_swap_page_nolock: page in swap cache!\n"); + return; } - page = mem_map + MAP_NR(buffer); - if (!PageLocked(page)) - panic ("ll_rw_page: page not already locked"); - brw_page(rw, page, dev, &block, PAGE_SIZE, 0); + rw_swap_page_base(rw, entry, page, wait); } @@ -39,35 +39,21 @@ freepages_t freepages = { 144 /* freepages.high */ }; +/* How many pages do we try to swap or page in/out together? */ +int page_cluster = 4; /* Default value modified in swap_setup() */ + /* We track the number of pages currently being asynchronously swapped out, so that we don't try to swap TOO many pages out at once */ atomic_t nr_async_pages = ATOMIC_INIT(0); -/* - * Constants for the page aging mechanism: the maximum age (actually, - * the maximum "youthfulness"); the quanta by which pages rejuvenate - * and age; and the initial age for new pages. - * - * The "pageout_weight" is strictly a fixedpoint number with the - * ten low bits being the fraction (ie 8192 really means "8.0"). - */ -swap_control_t swap_control = { - 20, 3, 1, 3, /* Page aging */ - 32, 4, /* Aging cluster */ - 8192, /* sc_pageout_weight aka PAGEOUT_WEIGHT */ - 8192, /* sc_bufferout_weight aka BUFFEROUT_WEIGHT */ -}; - -swapstat_t swapstats = {0}; - buffer_mem_t buffer_mem = { - 5, /* minimum percent buffer */ + 2, /* minimum percent buffer */ 10, /* borrow percent buffer */ 60 /* maximum percent buffer */ }; buffer_mem_t page_cache = { - 5, /* minimum percent page cache */ + 2, /* minimum percent page cache */ 15, /* borrow percent page cache */ 75 /* maximum */ }; @@ -77,3 +63,18 @@ pager_daemon_t pager_daemon = { SWAP_CLUSTER_MAX, /* minimum number of tries */ SWAP_CLUSTER_MAX, /* do swap I/O in clusters of this size */ }; + +/* + * Perform any setup for the swap system + */ + +void __init swap_setup(void) +{ + /* Use a smaller cluster for memory <16MB or <32MB */ + if (num_physpages < ((16 * 1024 * 1024) >> PAGE_SHIFT)) + page_cluster = 2; + else if (num_physpages < ((32 * 1024 * 1024) >> PAGE_SHIFT)) + page_cluster = 3; + else + page_cluster = 4; +} diff --git a/mm/swap_state.c b/mm/swap_state.c index e098974b2..8c5e7176c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -29,18 +29,16 @@ struct inode swapper_inode; #ifdef SWAP_CACHE_INFO unsigned long swap_cache_add_total = 0; -unsigned long swap_cache_add_success = 0; unsigned long swap_cache_del_total = 0; -unsigned long swap_cache_del_success = 0; unsigned long swap_cache_find_total = 0; unsigned long swap_cache_find_success = 0; void show_swap_cache_info(void) { - printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n", - swap_cache_add_total, swap_cache_add_success, - swap_cache_del_total, swap_cache_del_success, - swap_cache_find_total, swap_cache_find_success); + printk("Swap cache: add %ld, delete %ld, find %ld/%ld\n", + swap_cache_add_total, + swap_cache_del_total, + swap_cache_find_success, swap_cache_find_total); } #endif @@ 
-69,9 +67,6 @@ int add_to_swap_cache(struct page *page, unsigned long entry) page->offset = entry; add_page_to_hash_queue(page, &swapper_inode, entry); add_page_to_inode_queue(&swapper_inode, page); -#ifdef SWAP_CACHE_INFO - swap_cache_add_success++; -#endif return 1; } @@ -192,16 +187,6 @@ static inline void remove_from_swap_cache(struct page *page) printk ("VM: Removing swap cache page with wrong inode hash " "on page %08lx\n", page_address(page)); } -#if 0 - /* - * This is a legal case, but warn about it. - */ - if (atomic_read(&page->count) == 1) { - printk (KERN_WARNING - "VM: Removing page cache on unshared page %08lx\n", - page_address(page)); - } -#endif #ifdef DEBUG_SWAP printk("DebugVM: remove_from_swap_cache(%08lx count %d)\n", @@ -222,7 +207,6 @@ void delete_from_swap_cache(struct page *page) #ifdef SWAP_CACHE_INFO swap_cache_del_total++; - swap_cache_del_success++; #endif #ifdef DEBUG_SWAP printk("DebugVM: delete_from_swap_cache(%08lx count %d, " @@ -241,6 +225,7 @@ void delete_from_swap_cache(struct page *page) void free_page_and_swap_cache(unsigned long addr) { struct page *page = mem_map + MAP_NR(addr); + /* * If we are the only user, then free up the swap cache. */ @@ -248,7 +233,7 @@ void free_page_and_swap_cache(unsigned long addr) delete_from_swap_cache(page); } - free_page(addr); + __free_page(page); } @@ -258,18 +243,25 @@ void free_page_and_swap_cache(unsigned long addr) * incremented. */ -static struct page * lookup_swap_cache(unsigned long entry) +struct page * lookup_swap_cache(unsigned long entry) { struct page *found; - + +#ifdef SWAP_CACHE_INFO + swap_cache_find_total++; +#endif while (1) { found = find_page(&swapper_inode, entry); if (!found) return 0; if (found->inode != &swapper_inode || !PageSwapCache(found)) goto out_bad; - if (!PageLocked(found)) + if (!PageLocked(found)) { +#ifdef SWAP_CACHE_INFO + swap_cache_find_success++; +#endif return found; + } __free_page(found); __wait_on_page(found); } @@ -291,7 +283,7 @@ out_bad: struct page * read_swap_cache_async(unsigned long entry, int wait) { - struct page *found_page, *new_page; + struct page *found_page = 0, *new_page; unsigned long new_page_addr; #ifdef DEBUG_SWAP @@ -299,15 +291,20 @@ struct page * read_swap_cache_async(unsigned long entry, int wait) entry, wait ? ", wait" : ""); #endif /* + * Make sure the swap entry is still in use. + */ + if (!swap_duplicate(entry)) /* Account for the swap cache */ + goto out; + /* * Look for the page in the swap cache. */ found_page = lookup_swap_cache(entry); if (found_page) - goto out; + goto out_free_swap; - new_page_addr = __get_free_page(GFP_KERNEL); + new_page_addr = __get_free_page(GFP_USER); if (!new_page_addr) - goto out; /* Out of memory */ + goto out_free_swap; /* Out of memory */ new_page = mem_map + MAP_NR(new_page_addr); /* @@ -316,11 +313,6 @@ struct page * read_swap_cache_async(unsigned long entry, int wait) found_page = lookup_swap_cache(entry); if (found_page) goto out_free_page; - /* - * Make sure the swap entry is still in use. - */ - if (!swap_duplicate(entry)) /* Account for the swap cache */ - goto out_free_page; /* * Add it to the swap cache and read its contents. 
*/ @@ -338,6 +330,8 @@ struct page * read_swap_cache_async(unsigned long entry, int wait) out_free_page: __free_page(new_page); +out_free_swap: + swap_free(entry); out: return found_page; } diff --git a/mm/swapfile.c b/mm/swapfile.c index c574fb59a..dd66701b5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -23,6 +23,7 @@ struct swap_list_t swap_list = {-1, -1}; struct swap_info_struct swap_info[MAX_SWAPFILES]; +#define SWAPFILE_CLUSTER 256 static inline int scan_swap_map(struct swap_info_struct *si) { @@ -30,7 +31,7 @@ static inline int scan_swap_map(struct swap_info_struct *si) /* * We try to cluster swap pages by allocating them * sequentially in swap. Once we've allocated - * SWAP_CLUSTER_MAX pages this way, however, we resort to + * SWAPFILE_CLUSTER pages this way, however, we resort to * first-free allocation, starting a new cluster. This * prevents us from scattering swap pages all over the entire * swap partition, so that we reduce overall disk seek times @@ -46,7 +47,7 @@ static inline int scan_swap_map(struct swap_info_struct *si) goto got_page; } } - si->cluster_nr = SWAP_CLUSTER_MAX; + si->cluster_nr = SWAPFILE_CLUSTER; for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { if (si->swap_map[offset]) continue; @@ -626,11 +627,11 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) p->highest_bit = swap_header->info.last_page - 1; p->max = swap_header->info.last_page; - if (p->max >= 0x7fffffffL/PAGE_SIZE || - (void *) &swap_header->info.badpages[(int) swap_header->info.nr_badpages-1] >= (void *) swap_header->magic.magic) { - error = -EINVAL; + error = -EINVAL; + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) + goto bad_swap; + if (p->max >= SWP_OFFSET(SWP_ENTRY(0,~0UL))) goto bad_swap; - } /* OK, set up the swap map and apply the bad block list */ if (!(p->swap_map = vmalloc (p->max * sizeof(short)))) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e99ad35fb..7063b2df1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -161,8 +161,10 @@ struct vm_struct * get_vm_area(unsigned long size) for (p = &vmlist; (tmp = *p) ; p = &tmp->next) { if (size + addr < (unsigned long) tmp->addr) break; - if (addr > VMALLOC_END-size) + if (addr > VMALLOC_END-size) { + kfree(area); return NULL; + } addr = tmp->size + (unsigned long) tmp->addr; } area->addr = (void *)addr; diff --git a/mm/vmscan.c b/mm/vmscan.c index c5efa52a2..116096153 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -20,13 +20,6 @@ #include <asm/pgtable.h> -/* - * The wait queue for waking up the pageout daemon: - */ -static struct task_struct * kswapd_task = NULL; - -static void init_swap_timer(void); - /* * The swap-out functions return 1 if they successfully * threw something out, and we got a free page. It returns @@ -38,7 +31,7 @@ static void init_swap_timer(void); * using a process that no longer actually exists (it might * have died while we slept). */ -static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma, +static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask) { pte_t pte; @@ -59,50 +52,6 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map))) return 0; - /* - * Deal with page aging. There are several special cases to - * consider: - * - * Page has been accessed, but is swap cached. 
If the page is - * getting sufficiently "interesting" --- its age is getting - * high --- then if we are sufficiently short of free swap - * pages, then delete the swap cache. We can only do this if - * the swap page's reference count is one: ie. there are no - * other references to it beyond the swap cache (as there must - * still be PTEs pointing to it if count > 1). - * - * If the page has NOT been touched, and its age reaches zero, - * then we are swapping it out: - * - * If there is already a swap cache page for this page, then - * another process has already allocated swap space, so just - * dereference the physical page and copy in the swap entry - * from the swap cache. - * - * Note, we rely on all pages read in from swap either having - * the swap cache flag set, OR being marked writable in the pte, - * but NEVER BOTH. (It IS legal to be neither cached nor dirty, - * however.) - * - * -- Stephen Tweedie 1998 */ - - if (PageSwapCache(page_map)) { - if (pte_write(pte)) { - struct page *found; - printk ("VM: Found a writable swap-cached page!\n"); - /* Try to diagnose the problem ... */ - found = find_page(&swapper_inode, page_map->offset); - if (found) { - printk("page=%p@%08lx, found=%p, count=%d\n", - page_map, page_map->offset, - found, atomic_read(&found->count)); - __free_page(found); - } else - printk ("Spurious, page not in cache\n"); - return 0; - } - } - if (pte_young(pte)) { /* * Transfer the "accessed" bit from the page @@ -110,109 +59,111 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc */ set_pte(page_table, pte_mkold(pte)); set_bit(PG_referenced, &page_map->flags); + return 0; + } - /* - * We should test here to see if we want to recover any - * swap cache page here. We do this if the page seeing - * enough activity, AND we are sufficiently low on swap - * - * We need to track both the number of available swap - * pages and the total number present before we can do - * this... - */ + /* + * Is the page already in the swap cache? If so, then + * we can just drop our reference to it without doing + * any IO - it's already up-to-date on disk. + * + * Return 0, as we didn't actually free any real + * memory, and we should just continue our scan. + */ + if (PageSwapCache(page_map)) { + entry = page_map->offset; + swap_duplicate(entry); + set_pte(page_table, __pte(entry)); +drop_pte: + vma->vm_mm->rss--; + flush_tlb_page(vma, address); + __free_page(page_map); return 0; } - if (pte_dirty(pte)) { - if (vma->vm_ops && vma->vm_ops->swapout) { - pid_t pid = tsk->pid; - vma->vm_mm->rss--; - if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table)) - kill_proc(pid, SIGBUS, 1); - } else { - /* - * This is a dirty, swappable page. First of all, - * get a suitable swap entry for it, and make sure - * we have the swap cache set up to associate the - * page with that swap entry. - */ - entry = in_swap_cache(page_map); - if (!entry) { - entry = get_swap_page(); - if (!entry) - return 0; /* No swap space left */ - } - - vma->vm_mm->rss--; - tsk->nswap++; - flush_cache_page(vma, address); - set_pte(page_table, __pte(entry)); - flush_tlb_page(vma, address); - swap_duplicate(entry); - - /* Now to write back the page. We have two - * cases: if the page is already part of the - * swap cache, then it is already on disk. Just - * free the page and return (we release the swap - * cache on the last accessor too). - * - * If we have made a new swap entry, then we - * start the write out to disk. 
If the page is - * shared, however, we still need to keep the - * copy in memory, so we add it to the swap - * cache. */ - if (PageSwapCache(page_map)) { - free_page(page); - return (atomic_read(&page_map->count) == 0); - } - add_to_swap_cache(page_map, entry); - /* We checked we were unlocked way up above, and we - have been careful not to stall until here */ - set_bit(PG_locked, &page_map->flags); - /* OK, do a physical write to swap. */ - rw_swap_page(WRITE, entry, (char *) page, (gfp_mask & __GFP_WAIT)); - } - /* Now we can free the current physical page. We also - * free up the swap cache if this is the last use of the - * page. Note that there is a race here: the page may - * still be shared COW by another process, but that - * process may exit while we are writing out the page - * asynchronously. That's no problem, shrink_mmap() can - * correctly clean up the occassional unshared page - * which gets left behind in the swap cache. */ - free_page(page); - return 1; /* we slept: the process may not exist any more */ + /* + * Is it a clean page? Then it must be recoverable + * by just paging it in again, and we can just drop + * it.. + * + * However, this won't actually free any real + * memory, as the page will just be in the page cache + * somewhere, and as such we should just continue + * our scan. + * + * Basically, this just makes it possible for us to do + * some real work in the future in "shrink_mmap()". + */ + if (!pte_dirty(pte)) { + pte_clear(page_table); + goto drop_pte; } - /* The page was _not_ dirty, but still has a zero age. It must - * already be uptodate on disk. If it is in the swap cache, - * then we can just unlink the page now. Remove the swap cache - * too if this is the last user. */ - if ((entry = in_swap_cache(page_map))) { - vma->vm_mm->rss--; - flush_cache_page(vma, address); - set_pte(page_table, __pte(entry)); - flush_tlb_page(vma, address); - swap_duplicate(entry); - free_page(page); - return (atomic_read(&page_map->count) == 0); - } - /* - * A clean page to be discarded? Must be mmap()ed from - * somewhere. Unlink the pte, and tell the filemap code to - * discard any cached backing page if this is the last user. + /* + * Don't go down into the swap-out stuff if + * we cannot do I/O! Avoid recursing on FS + * locks etc. */ - if (PageSwapCache(page_map)) { - printk ("VM: How can this page _still_ be cached?"); + if (!(gfp_mask & __GFP_IO)) return 0; + + /* + * Ok, it's really dirty. That means that + * we should either create a new swap cache + * entry for it, or we should write it back + * to its own backing store. + * + * Note that in neither case do we actually + * know that we make a page available, but + * as we potentially sleep we can no longer + * continue scanning, so we might as well + * assume we free'd something. + * + * NOTE NOTE NOTE! This should just set a + * dirty bit in page_map, and just drop the + * pte. All the hard work would be done by + * shrink_mmap(). + * + * That would get rid of a lot of problems. + */ + flush_cache_page(vma, address); + if (vma->vm_ops && vma->vm_ops->swapout) { + pid_t pid = tsk->pid; + pte_clear(page_table); + flush_tlb_page(vma, address); + vma->vm_mm->rss--; + + if (vma->vm_ops->swapout(vma, page_map)) + kill_proc(pid, SIGBUS, 1); + __free_page(page_map); + return 1; } + + /* + * This is a dirty, swappable page. First of all, + * get a suitable swap entry for it, and make sure + * we have the swap cache set up to associate the + * page with that swap entry. 
+ */ + entry = get_swap_page(); + if (!entry) + return 0; /* No swap space left */ + vma->vm_mm->rss--; - flush_cache_page(vma, address); - pte_clear(page_table); + tsk->nswap++; + set_pte(page_table, __pte(entry)); flush_tlb_page(vma, address); - entry = (atomic_read(&page_map->count) == 1); + swap_duplicate(entry); /* One for the process, one for the swap cache */ + add_to_swap_cache(page_map, entry); + /* We checked we were unlocked way up above, and we + have been careful not to stall until here */ + set_bit(PG_locked, &page_map->flags); + + /* OK, do a physical asynchronous write to swap. */ + rw_swap_page(WRITE, entry, (char *) page, 0); + __free_page(page_map); - return entry; + return 1; } /* @@ -363,13 +314,23 @@ static int swap_out(unsigned int priority, int gfp_mask) /* * We make one or two passes through the task list, indexed by * assign = {0, 1}: - * Pass 1: select the swappable task with maximal swap_cnt. - * Pass 2: assign new swap_cnt values, then select as above. + * Pass 1: select the swappable task with maximal RSS that has + * not yet been swapped out. + * Pass 2: re-assign rss swap_cnt values, then select as above. + * * With this approach, there's no need to remember the last task * swapped out. If the swap-out fails, we clear swap_cnt so the * task won't be selected again until all others have been tried. + * + * Think of swap_cnt as a "shadow rss" - it tells us which process + * we want to page out (always try largest first). */ - counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority; + counter = nr_tasks / (priority+1); + if (counter < 1) + counter = 1; + if (counter > nr_tasks) + counter = nr_tasks; + for (; counter >= 0; counter--) { assign = 0; max_cnt = 0; @@ -382,15 +343,9 @@ static int swap_out(unsigned int priority, int gfp_mask) continue; if (p->mm->rss <= 0) continue; - if (assign) { - /* - * If we didn't select a task on pass 1, - * assign each task a new swap_cnt. - * Normalise the number of pages swapped - * by multiplying by (RSS / 1MB) - */ - p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss); - } + /* Refresh swap_cnt? */ + if (assign) + p->swap_cnt = p->mm->rss; if (p->swap_cnt > max_cnt) { max_cnt = p->swap_cnt; pbest = p; @@ -404,56 +359,60 @@ static int swap_out(unsigned int priority, int gfp_mask) } goto out; } - pbest->swap_cnt--; - /* - * Nonzero means we cleared out something, but only "1" means - * that we actually free'd up a page as a result. - */ - if (swap_out_process(pbest, gfp_mask) == 1) - return 1; + if (swap_out_process(pbest, gfp_mask)) + return 1; } out: return 0; } /* - * We are much more aggressive about trying to swap out than we used - * to be. This works out OK, because we now do proper aging on page - * contents. + * We need to make the locks finer granularity, but right + * now we need this so that we can do page allocations + * without holding the kernel lock etc. + * + * We want to try to free "count" pages, and we need to + * cluster them so that we get good swap-out behaviour. See + * the "free_memory()" macro for details. */ -static int do_try_to_free_page(int gfp_mask) +static int do_try_to_free_pages(unsigned int gfp_mask) { - static int state = 0; - int i=6; + int priority; + int count = SWAP_CLUSTER_MAX; + + lock_kernel(); /* Always trim SLAB caches when memory gets low. 
*/ kmem_cache_reap(gfp_mask); - if (buffer_over_borrow() || pgcache_over_borrow()) - state = 0; + priority = 6; + do { + while (shrink_mmap(priority, gfp_mask)) { + if (!--count) + goto done; + } - switch (state) { - do { - case 0: - if (shrink_mmap(i, gfp_mask)) - return 1; - state = 1; - case 1: - if (shm_swap(i, gfp_mask)) - return 1; - state = 2; - case 2: - if (swap_out(i, gfp_mask)) - return 1; - state = 3; - case 3: - shrink_dcache_memory(i, gfp_mask); - state = 0; - i--; - } while (i >= 0); - } - return 0; + /* Try to get rid of some shared memory pages.. */ + if (gfp_mask & __GFP_IO) { + while (shm_swap(priority, gfp_mask)) { + if (!--count) + goto done; + } + } + + /* Then, try to page stuff out.. */ + while (swap_out(priority, gfp_mask)) { + if (!--count) + goto done; + } + + shrink_dcache_memory(priority, gfp_mask); + } while (--priority >= 0); +done: + unlock_kernel(); + + return priority >= 0; } /* @@ -467,6 +426,8 @@ void __init kswapd_setup(void) int i; char *revision="$Revision: 1.5 $", *s, *e; + swap_setup(); + if ((s = strchr(revision, ':')) && (e = strchr(s, '$'))) s++, i = e - s; @@ -475,35 +436,36 @@ void __init kswapd_setup(void) printk ("Starting kswapd v%.*s\n", i, s); } +static struct task_struct *kswapd_process; + /* - * The background pageout daemon. - * Started as a kernel thread from the init process. + * The background pageout daemon, started as a kernel thread + * from the init process. + * + * This basically executes once a second, trickling out pages + * so that we have _some_ free memory available even if there + * is no other activity that frees anything up. This is needed + * for things like routing etc, where we otherwise might have + * all activity going on in asynchronous contexts that cannot + * page things out. + * + * If there are applications that are active memory-allocators + * (most normal use), this basically shouldn't matter. */ int kswapd(void *unused) { - current->session = 1; - current->pgrp = 1; - strcpy(current->comm, "kswapd"); - sigfillset(¤t->blocked); - - /* - * As a kernel thread we want to tamper with system buffers - * and other internals and thus be subject to the SMP locking - * rules. (On a uniprocessor box this does nothing). - */ - lock_kernel(); - - /* - * Set the base priority to something smaller than a - * regular process. We will scale up the priority - * dynamically depending on how much memory we need. - */ - current->priority = (DEF_PRIORITY * 2) / 3; + struct task_struct *tsk = current; + kswapd_process = tsk; + tsk->session = 1; + tsk->pgrp = 1; + strcpy(tsk->comm, "kswapd"); + sigfillset(&tsk->blocked); + /* * Tell the memory management that we're a "memory allocator", * and that if we need more memory we should get access to it - * regardless (see "try_to_free_pages()"). "kswapd" should + * regardless (see "__get_free_pages()"). "kswapd" should * never get caught in the normal page freeing logic. * * (Kswapd normally doesn't need memory anyway, but sometimes @@ -512,128 +474,52 @@ int kswapd(void *unused) * us from recursively trying to free more memory as we're * trying to free the first piece of memory in the first place). 
*/ - current->flags |= PF_MEMALLOC; + tsk->flags |= PF_MEMALLOC; - init_swap_timer(); - kswapd_task = current; while (1) { - unsigned long end_time; - - current->state = TASK_INTERRUPTIBLE; - flush_signals(current); - run_task_queue(&tq_disk); - schedule(); - swapstats.wakeups++; - - /* max one hundreth of a second */ - end_time = jiffies + (HZ-1)/100; + /* + * Wake up once a second to see if we need to make + * more memory available. + * + * If we actually get into a low-memory situation, + * the processes needing more memory will wake us + * up on a more timely basis. + */ do { - if (!do_try_to_free_page(0)) + if (nr_free_pages >= freepages.high) break; - if (nr_free_pages > freepages.high + SWAP_CLUSTER_MAX) + + if (!do_try_to_free_pages(GFP_KSWAPD)) break; - } while (time_before_eq(jiffies,end_time)); + } while (!tsk->need_resched); + run_task_queue(&tq_disk); + tsk->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ); } - /* As if we could ever get here - maybe we want to make this killable */ - kswapd_task = NULL; - unlock_kernel(); - return 0; } /* - * We need to make the locks finer granularity, but right - * now we need this so that we can do page allocations - * without holding the kernel lock etc. + * Called by non-kswapd processes when they want more + * memory. + * + * In a perfect world, this should just wake up kswapd + * and return. We don't actually want to swap stuff out + * from user processes, because the locking issues are + * nasty to the extreme (file write locks, and MM locking) * - * The "PF_MEMALLOC" flag protects us against recursion: - * if we need more memory as part of a swap-out effort we - * will just silently return "success" to tell the page - * allocator to accept the allocation. + * One option might be to let kswapd do all the page-out + * and VM page table scanning that needs locking, and this + * process thread could do just the mmap shrink stage that + * can be done by just dropping cached pages without having + * any deadlock issues. */ -int try_to_free_pages(unsigned int gfp_mask, int count) +int try_to_free_pages(unsigned int gfp_mask) { int retval = 1; - lock_kernel(); - if (!(current->flags & PF_MEMALLOC)) { - current->flags |= PF_MEMALLOC; - do { - retval = do_try_to_free_page(gfp_mask); - if (!retval) - break; - count--; - } while (count > 0); - current->flags &= ~PF_MEMALLOC; - } - unlock_kernel(); + wake_up_process(kswapd_process); + if (gfp_mask & __GFP_WAIT) + retval = do_try_to_free_pages(gfp_mask); return retval; } - -/* - * Wake up kswapd according to the priority - * 0 - no wakeup - * 1 - wake up as a low-priority process - * 2 - wake up as a normal process - * 3 - wake up as an almost real-time process - * - * This plays mind-games with the "goodness()" - * function in kernel/sched.c. - */ -static inline void kswapd_wakeup(struct task_struct *p, int priority) -{ - if (priority) { - p->counter = p->priority << priority; - wake_up_process(p); - } -} - -/* - * The swap_tick function gets called on every clock tick. - */ -void swap_tick(void) -{ - struct task_struct *p = kswapd_task; - - /* - * Only bother to try to wake kswapd up - * if the task exists and can be woken. - */ - if (p && (p->state & TASK_INTERRUPTIBLE)) { - unsigned int pages; - int want_wakeup; - - /* - * Schedule for wakeup if there isn't lots - * of free memory or if there is too much - * of it used for buffers or pgcache. - * - * "want_wakeup" is our priority: 0 means - * not to wake anything up, while 3 means - * that we'd better give kswapd a realtime - * priority. 
- */ - want_wakeup = 0; - pages = nr_free_pages; - if (pages < freepages.high) - want_wakeup = 1; - if (pages < freepages.low) - want_wakeup = 2; - if (pages < freepages.min) - want_wakeup = 3; - kswapd_wakeup(p,want_wakeup); - } - - timer_active |= (1<<SWAP_TIMER); -} - -/* - * Initialise the swap timer - */ - -void init_swap_timer(void) -{ - timer_table[SWAP_TIMER].expires = jiffies; - timer_table[SWAP_TIMER].fn = swap_tick; - timer_active |= (1<<SWAP_TIMER); -}
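
To make the clustering policy in the scan_swap_map() hunk above concrete, here is a small self-contained C model of the described behaviour: allocate swap slots sequentially inside the current cluster of SWAPFILE_CLUSTER entries, and fall back to a first-free scan (which starts a new cluster) once it is exhausted. The map size, the name scan_swap_map_model() and the main() driver are illustrative only; this is a sketch of the policy, not the kernel code.

/* Illustrative model of the clustered swap-slot allocator (not kernel code). */
#include <stdio.h>

#define NSLOTS           4096
#define SWAPFILE_CLUSTER 256

static unsigned char swap_map[NSLOTS];  /* 0 = free, non-zero = in use */
static int cluster_next, cluster_nr;

/* Return a free slot index, or -1 if the map is full. */
static int scan_swap_map_model(void)
{
	int offset;

	if (cluster_nr) {
		/* Keep allocating sequentially inside the current cluster. */
		while (cluster_next < NSLOTS) {
			offset = cluster_next++;
			if (!swap_map[offset]) {
				cluster_nr--;
				swap_map[offset] = 1;
				return offset;
			}
		}
	}
	/* Cluster exhausted: a first-free scan starts a new cluster. */
	cluster_nr = SWAPFILE_CLUSTER;
	for (offset = 0; offset < NSLOTS; offset++) {
		if (!swap_map[offset]) {
			swap_map[offset] = 1;
			cluster_next = offset + 1;
			return offset;
		}
	}
	return -1;
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		printf("allocated slot %d\n", scan_swap_map_model());
	return 0;
}

Keeping allocations contiguous within a cluster is what reduces seek time on rotating swap devices, which is the stated reason for replacing SWAP_CLUSTER_MAX with the larger, dedicated SWAPFILE_CLUSTER.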
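The swap_out() hunk above replaces the page-aging weights with a "largest resident set first" victim policy: swap_cnt acts as a shadow of rss, is consumed as the task is scanned, and is refreshed on a second pass once every candidate has been used up. A minimal stand-alone sketch of that two-pass selection follows; the toy task table and function names are assumptions for illustration, not part of the patch.

/* Illustrative model of the two-pass "largest RSS first" victim selection. */
#include <stdio.h>

struct task {
	const char *comm;
	long rss;        /* resident pages */
	long swap_cnt;   /* shadow rss, consumed as the task is scanned */
};

static struct task tasks[] = {
	{ "editor",  120, 0 },
	{ "browser", 900, 0 },
	{ "shell",    40, 0 },
};
#define NR_TASKS (sizeof(tasks) / sizeof(tasks[0]))

static struct task *select_victim(void)
{
	int assign = 0;

	for (;;) {
		struct task *best = NULL;
		long max_cnt = 0;

		for (unsigned i = 0; i < NR_TASKS; i++) {
			struct task *p = &tasks[i];
			if (p->rss <= 0)
				continue;
			if (assign)             /* pass 2: refresh the shadow rss */
				p->swap_cnt = p->rss;
			if (p->swap_cnt > max_cnt) {
				max_cnt = p->swap_cnt;
				best = p;
			}
		}
		if (best)
			return best;
		if (assign)                 /* both passes failed: nothing left */
			return NULL;
		assign = 1;                 /* retry, assigning fresh swap_cnt */
	}
}

int main(void)
{
	struct task *victim = select_victim();
	if (victim)
		printf("would page out from '%s' (rss %ld)\n", victim->comm, victim->rss);
	return 0;
}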
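Finally, the new do_try_to_free_pages() walks a priority level from 6 down to 0, trying the cheap reclaim paths first and escalating only while the target of SWAP_CLUSTER_MAX freed pages has not been reached. The sketch below models just that control flow; the *_stub() functions stand in for shrink_mmap(), shm_swap() and swap_out() and are placeholders, not the real implementations.

/* Control-flow model of the priority-escalating reclaim loop. */
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32

/* Stubs standing in for shrink_mmap(), shm_swap() and swap_out().
 * Each returns 1 if it managed to free a page at this priority. */
static int shrink_mmap_stub(int priority) { return priority < 4; }
static int shm_swap_stub(int priority)    { return 0; }
static int swap_out_stub(int priority)    { return priority < 2; }

static int try_to_free_pages_model(void)
{
	int count = SWAP_CLUSTER_MAX;
	int priority = 6;

	do {
		/* Cheapest first: drop clean page-cache/buffer pages. */
		while (shrink_mmap_stub(priority))
			if (!--count)
				goto done;
		/* Then shared memory, then dirty process pages. */
		while (shm_swap_stub(priority))
			if (!--count)
				goto done;
		while (swap_out_stub(priority))
			if (!--count)
				goto done;
	} while (--priority >= 0);
done:
	return priority >= 0;   /* non-zero if the target was met early */
}

int main(void)
{
	printf("freed target reached: %d\n", try_to_free_pages_model());
	return 0;
}

The same structure explains the new try_to_free_pages() calling convention in the patch: callers that cannot sleep merely wake kswapd, while __GFP_WAIT callers run this loop synchronously themselves.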