9 files changed, 246 insertions, 339 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index dedd7911e..4c89ad3e9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -143,7 +143,8 @@ void __set_page_dirty(struct page *page)
 	list_add(&page->list, &mapping->dirty_pages);
 	spin_unlock(&pagecache_lock);
 
-	mark_inode_dirty_pages(mapping->host);
+	if (mapping->host)
+		mark_inode_dirty_pages(mapping->host);
 }
 
 /**
@@ -306,7 +307,7 @@ inside:
 	 */
 	age_page_up(page);
 	if (inactive_shortage() > inactive_target / 2 && free_shortage())
-			wakeup_kswapd(0);
+			wakeup_kswapd();
 not_found:
 	return page;
 }
@@ -974,10 +975,6 @@ static void generic_file_readahead(int reada_ok,
  *   accessed sequentially.
  */
 	if (ahead) {
-		if (reada_ok == 2) {
-			run_task_queue(&tq_disk);
-		}
-
 		filp->f_ralen += ahead;
 		filp->f_rawin += filp->f_ralen;
 		filp->f_raend = raend + ahead + 1;
@@ -1835,7 +1832,8 @@ static long madvise_fixup_start(struct vm_area_struct * vma,
 	n->vm_end = end;
 	setup_read_behavior(n, behavior);
 	n->vm_raend = 0;
-	get_file(n->vm_file);
+	if (n->vm_file)
+		get_file(n->vm_file);
 	if (n->vm_ops && n->vm_ops->open)
 		n->vm_ops->open(n);
 	lock_vma_mappings(vma);
@@ -1861,7 +1859,8 @@ static long madvise_fixup_end(struct vm_area_struct * vma,
 	n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
 	setup_read_behavior(n, behavior);
 	n->vm_raend = 0;
-	get_file(n->vm_file);
+	if (n->vm_file)
+		get_file(n->vm_file);
 	if (n->vm_ops && n->vm_ops->open)
 		n->vm_ops->open(n);
 	lock_vma_mappings(vma);
@@ -1893,7 +1892,8 @@ static long madvise_fixup_middle(struct vm_area_struct * vma,
 	right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
 	left->vm_raend = 0;
 	right->vm_raend = 0;
-	atomic_add(2, &vma->vm_file->f_count);
+	if (vma->vm_file)
+		atomic_add(2, &vma->vm_file->f_count);
 
 	if (vma->vm_ops && vma->vm_ops->open) {
 		vma->vm_ops->open(left);
diff --git a/mm/memory.c b/mm/memory.c
index 6f1f318a3..7fc8de5eb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -207,7 +207,8 @@ skip_copy_pte_range:		address = (address + PMD_SIZE) & PMD_MASK;
 			
 			src_pte = pte_offset(src_pmd, address);
 			dst_pte = pte_offset(dst_pmd, address);
-			
+
+			spin_lock(&src->page_table_lock);			
 			do {
 				pte_t pte = *src_pte;
 				struct page *ptepage;
@@ -240,10 +241,11 @@ skip_copy_pte_range:		address = (address + PMD_SIZE) & PMD_MASK;
 cont_copy_pte_range:		set_pte(dst_pte, pte);
 cont_copy_pte_range_noset:	address += PAGE_SIZE;
 				if (address >= end)
-					goto out;
+					goto out_unlock;
 				src_pte++;
 				dst_pte++;
 			} while ((unsigned long)src_pte & PTE_TABLE_MASK);
+			spin_unlock(&src->page_table_lock);
 		
 cont_copy_pmd_range:	src_pmd++;
 			dst_pmd++;
@@ -252,6 +254,10 @@ cont_copy_pmd_range:	src_pmd++;
 out:
 	return 0;
 
+out_unlock:
+	spin_unlock(&src->page_table_lock);
+	return 0;
+
 nomem:
 	return -ENOMEM;
 }
@@ -939,7 +945,6 @@ void vmtruncate(struct inode * inode, loff_t offset)
 	if (inode->i_size < offset)
 		goto do_expand;
 	inode->i_size = offset;
-	truncate_inode_pages(mapping, offset);
 	spin_lock(&mapping->i_shared_lock);
 	if (!mapping->i_mmap && !mapping->i_mmap_shared)
 		goto out_unlock;
@@ -954,8 +959,7 @@ void vmtruncate(struct inode * inode, loff_t offset)
 
 out_unlock:
 	spin_unlock(&mapping->i_shared_lock);
-	/* this should go into ->truncate */
-	inode->i_size = offset;
+	truncate_inode_pages(mapping, offset);
 	if (inode->i_op && inode->i_op->truncate)
 		inode->i_op->truncate(inode);
 	return;
diff --git a/mm/mmap.c b/mm/mmap.c
index e5b3a989e..e1faba3c7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -883,6 +883,8 @@ void exit_mmap(struct mm_struct * mm)
 	mm->rss = 0;
 	mm->total_vm = 0;
 	mm->locked_vm = 0;
+
+	flush_cache_mm(mm);
 	while (mpnt) {
 		struct vm_area_struct * next = mpnt->vm_next;
 		unsigned long start = mpnt->vm_start;
@@ -895,13 +897,13 @@ void exit_mmap(struct mm_struct * mm)
 		}
 		mm->map_count--;
 		remove_shared_vm_struct(mpnt);
-		flush_cache_range(mm, start, end);
 		zap_page_range(mm, start, size);
 		if (mpnt->vm_file)
 			fput(mpnt->vm_file);
 		kmem_cache_free(vm_area_cachep, mpnt);
 		mpnt = next;
 	}
+	flush_tlb_mm(mm);
 
 	/* This is just debugging */
 	if (mm->map_count)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b67aa4913..09ac27284 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -16,6 +16,7 @@
 #include <linux/interrupt.h>
 #include <linux/pagemap.h>
 #include <linux/bootmem.h>
+#include <linux/slab.h>
 
 int nr_swap_pages;
 int nr_active_pages;
@@ -303,7 +304,7 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
 	 * an inactive page shortage, wake up kswapd.
 	 */
 	if (inactive_shortage() > inactive_target / 2 && free_shortage())
-		wakeup_kswapd(0);
+		wakeup_kswapd();
 	/*
 	 * If we are about to get low on free pages and cleaning
 	 * the inactive_dirty pages would fix the situation,
@@ -379,7 +380,7 @@ try_again:
 	 * - if we don't have __GFP_IO set, kswapd may be
 	 *   able to free some memory we can't free ourselves
 	 */
-	wakeup_kswapd(0);
+	wakeup_kswapd();
 	if (gfp_mask & __GFP_WAIT) {
 		__set_current_state(TASK_RUNNING);
 		current->policy |= SCHED_YIELD;
@@ -404,7 +405,7 @@ try_again:
 	 * - we're doing a higher-order allocation
 	 * 	--> move pages to the free list until we succeed
 	 * - we're /really/ tight on memory
-	 * 	--> wait on the kswapd waitqueue until memory is freed
+	 * 	--> try to free pages ourselves with page_launder
 	 */
 	if (!(current->flags & PF_MEMALLOC)) {
 		/*
@@ -443,36 +444,20 @@ try_again:
 		/*
 		 * When we arrive here, we are really tight on memory.
 		 *
-		 * We wake up kswapd and sleep until kswapd wakes us
-		 * up again. After that we loop back to the start.
-		 *
-		 * We have to do this because something else might eat
-		 * the memory kswapd frees for us and we need to be
-		 * reliable. Note that we don't loop back for higher
-		 * order allocations since it is possible that kswapd
-		 * simply cannot free a large enough contiguous area
-		 * of memory *ever*.
+		 * We try to free pages ourselves by:
+		 * 	- shrinking the i/d caches.
+		 * 	- reclaiming unused memory from the slab caches.
+		 * 	- swapping/syncing pages to disk (done by page_launder)
+		 * 	- moving clean pages from the inactive dirty list to
+		 * 	  the inactive clean list. (done by page_launder)
 		 */
-		if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {
-			wakeup_kswapd(1);
+		if (gfp_mask & __GFP_WAIT) {
 			memory_pressure++;
-			if (!order)
-				goto try_again;
-		/*
-		 * If __GFP_IO isn't set, we can't wait on kswapd because
-		 * kswapd just might need some IO locks /we/ are holding ...
-		 *
-		 * SUBTLE: The scheduling point above makes sure that
-		 * kswapd does get the chance to free memory we can't
-		 * free ourselves...
-		 */
-		} else if (gfp_mask & __GFP_WAIT) {
 			try_to_free_pages(gfp_mask);
-			memory_pressure++;
+			wakeup_bdflush(0);
 			if (!order)
 				goto try_again;
 		}
-
 	}
 
 	/*
@@ -554,14 +539,8 @@ void __free_pages(struct page *page, unsigned long order)
 
 void free_pages(unsigned long addr, unsigned long order)
 {
-	struct page *fpage;
-
-#ifdef CONFIG_DISCONTIGMEM
-	if (addr == 0) return;
-#endif
-	fpage = virt_to_page(addr);
-	if (VALID_PAGE(fpage))
-		__free_pages(fpage, order);
+	if (addr != 0)
+		__free_pages(virt_to_page(addr), order);
 }
 
 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index a81a74659..00426ca27 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -117,11 +117,43 @@ shmem_truncate_part (swp_entry_t * dir, unsigned long size,
 	return 0;
 }
 
+/*
+ * shmem_recalc_inode - recalculate the block count of an inode
+ *
+ * @inode: inode to recalc
+ *
+ * We have to calculate the free blocks since the mm can drop pages
+ * behind our back.
+ *
+ * But we know that normally
+ * inode->i_blocks == inode->i_mapping->nrpages + inode->u.shmem_i.swapped
+ *
+ * So the mm freed
+ * inode->i_blocks - (inode->i_mapping->nrpages + inode->u.shmem_i.swapped)
+ * blocks; they are taken off i_blocks and added to the sb free_blocks.
+ * It has to be called with the inode->u.shmem_i.lock spinlock held.
+ */
+
+static void shmem_recalc_inode(struct inode * inode)
+{
+	unsigned long freed;
+
+	freed = inode->i_blocks -
+		(inode->i_mapping->nrpages + inode->u.shmem_i.swapped);
+	if (freed){
+		struct shmem_sb_info * info = &inode->i_sb->u.shmem_sb;
+		inode->i_blocks -= freed;
+		spin_lock (&info->stat_lock);	/* taken inside the caller's inode lock */
+		info->free_blocks += freed;	/* give the blocks back to the superblock */
+		spin_unlock (&info->stat_lock);
+	}
+}
+
 static void shmem_truncate (struct inode * inode)
 {
 	int clear_base;
 	unsigned long start;
-	unsigned long mmfreed, freed = 0;
+	unsigned long freed = 0;
 	swp_entry_t **base, **ptr;
 	struct shmem_inode_info * info = &inode->u.shmem_i;
 
@@ -154,26 +186,9 @@ static void shmem_truncate (struct inode * inode)
 	info->i_indirect = 0;
 
 out:
-
-	/*
-	 * We have to calculate the free blocks since we do not know
-	 * how many pages the mm discarded
-	 *
-	 * But we know that normally
-	 * inodes->i_blocks == inode->i_mapping->nrpages + info->swapped
-	 *
-	 * So the mm freed 
-	 * inodes->i_blocks - (inode->i_mapping->nrpages + info->swapped)
-	 */
-
-	mmfreed = inode->i_blocks - (inode->i_mapping->nrpages + info->swapped);
 	info->swapped -= freed;
-	inode->i_blocks -= freed + mmfreed;
+	shmem_recalc_inode(inode);
 	spin_unlock (&info->lock);
-
-	spin_lock (&inode->i_sb->u.shmem_sb.stat_lock);
-	inode->i_sb->u.shmem_sb.free_blocks += freed + mmfreed;
-	spin_unlock (&inode->i_sb->u.shmem_sb.stat_lock);
 }
 
 static void shmem_delete_inode(struct inode * inode)
@@ -201,13 +216,15 @@ static int shmem_writepage(struct page * page)
 	swp_entry_t *entry, swap;
 
 	info = &page->mapping->host->u.shmem_i;
-	if (info->locked)
-		return 1;
 	swap = __get_swap_page(2);
-	if (!swap.val)
-		return 1;
+	if (!swap.val) {
+		set_page_dirty(page);
+		UnlockPage(page);
+		return -ENOMEM;
+	}
 
 	spin_lock(&info->lock);
+	shmem_recalc_inode(page->mapping->host);
 	entry = shmem_swp_entry (info, page->index);
 	if (!entry)	/* this had been allocted on page allocation */
 		BUG();
@@ -269,6 +286,9 @@ struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, i
 	entry = shmem_swp_entry (info, idx);
 	if (!entry)
 		goto oom;
+	spin_lock (&info->lock);
+	shmem_recalc_inode(inode);
+	spin_unlock (&info->lock);
 	if (entry->val) {
 		unsigned long flags;
 
@@ -310,6 +330,8 @@ struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, i
 	}
 	/* We have the page */
 	SetPageUptodate (page);
+	if (info->locked)
+		page_cache_get(page);
 
 cached_page:
 	UnlockPage (page);
@@ -374,8 +396,7 @@ struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev)
 			inode->i_fop = &shmem_dir_operations;
 			break;
 		case S_IFLNK:
-			inode->i_op = &page_symlink_inode_operations;
-			break;
+			BUG();
 		}
 		spin_lock (&shmem_ilock);
 		list_add (&inode->u.shmem_i.list, &shmem_inodes);
@@ -401,6 +422,32 @@ static int shmem_statfs(struct super_block *sb, struct statfs *buf)
 	return 0;
 }
 
+/* Set the lock state of a shm file; see the per-page reference notes below. */
+void shmem_lock(struct file * file, int lock)
+{
+	struct inode * inode = file->f_dentry->d_inode;
+	struct shmem_inode_info * info = &inode->u.shmem_i;
+	struct page * page;
+	unsigned long idx, size;
+
+	down(&inode->i_sem);
+	size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (info->locked != lock) {	/* tested under i_sem: no double unpin */
+		info->locked = lock;
+		for (idx = 0; idx < size; idx++) {
+			page = find_lock_page(inode->i_mapping, idx);
+			if (!page)
+				continue;
+			if (!lock) {	/* drop the extra count and our reference */
+				page_cache_release(page);
+				page_cache_release(page);
+			}	/* when locking, our reference stays as the extra count */
+			UnlockPage(page);
+		}
+	}
+	up(&inode->i_sem);
+}
+
 /*
  * Lookup the data. This is trivial - if the dentry didn't already
  * exist, we know it is negative.
@@ -528,19 +575,6 @@ static int shmem_rename(struct inode * old_dir, struct dentry *old_dentry, struc
 	return error;
 }
 
-static int shmem_symlink(struct inode * dir, struct dentry *dentry, const char * symname)
-{
-	int error;
-
-	error = shmem_mknod(dir, dentry, S_IFLNK | S_IRWXUGO, 0);
-	if (!error) {
-		int l = strlen(symname)+1;
-		struct inode *inode = dentry->d_inode;
-		error = block_symlink(inode, symname, l);
-	}
-	return error;
-}
-
 static int shmem_mmap(struct file * file, struct vm_area_struct * vma)
 {
 	struct vm_operations_struct * ops;
@@ -677,7 +711,6 @@ static struct inode_operations shmem_dir_inode_operations = {
 	lookup:		shmem_lookup,
 	link:		shmem_link,
 	unlink:		shmem_unlink,
-	symlink:	shmem_symlink,
 	mkdir:		shmem_mkdir,
 	rmdir:		shmem_rmdir,
 	mknod:		shmem_mknod,
diff --git a/mm/slab.c b/mm/slab.c
index b3bd852d1..f6f8be1db 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1702,7 +1702,7 @@ static void enable_all_cpucaches (void)
  * kmem_cache_reap - Reclaim memory from caches.
  * @gfp_mask: the type of memory required.
  *
- * Called from try_to_free_page().
+ * Called from do_try_to_free_pages() and __alloc_pages()
  */
 void kmem_cache_reap (int gfp_mask)
 {
diff --git a/mm/swap.c b/mm/swap.c
index 693773ccd..b1a6640bc 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -30,8 +30,7 @@
  * start background swapping if we fall below freepages.high free
  * pages, and we begin intensive swapping below freepages.low.
  *
- * Actual initialization is done in mm/page_alloc.c or 
- * arch/sparc(64)/mm/init.c.
+ * Actual initialization is done in mm/page_alloc.c
  */
 freepages_t freepages = {
 	0,	/* freepages.min */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 62ce5f1ff..93edab662 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -9,6 +9,7 @@
 #include <linux/malloc.h>
 #include <linux/vmalloc.h>
 #include <linux/spinlock.h>
+#include <linux/highmem.h>
 #include <linux/smp_lock.h>
 
 #include <asm/uaccess.h>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index afa5261c1..f41c53328 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -35,45 +35,21 @@
  * using a process that no longer actually exists (it might
  * have died while we slept).
  */
-static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
+static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page)
 {
 	pte_t pte;
 	swp_entry_t entry;
-	struct page * page;
-	int onlist;
-
-	pte = *page_table;
-	if (!pte_present(pte))
-		goto out_failed;
-	page = pte_page(pte);
-	if ((!VALID_PAGE(page)) || PageReserved(page))
-		goto out_failed;
-
-	if (!mm->swap_cnt)
-		return 1;
-
-	mm->swap_cnt--;
 
-	onlist = PageActive(page);
 	/* Don't look at this pte if it's been accessed recently. */
 	if (ptep_test_and_clear_young(page_table)) {
-		age_page_up(page);
-		goto out_failed;
+		page->age += PAGE_AGE_ADV;
+		if (page->age > PAGE_AGE_MAX)
+			page->age = PAGE_AGE_MAX;
+		return;
 	}
-	if (!onlist)
-		/* The page is still mapped, so it can't be freeable... */
-		age_page_down_ageonly(page);
-
-	/*
-	 * If the page is in active use by us, or if the page
-	 * is in active use by others, don't unmap it or
-	 * (worse) start unneeded IO.
-	 */
-	if (page->age > 0)
-		goto out_failed;
 
 	if (TryLockPage(page))
-		goto out_failed;
+		return;
 
 	/* From this point on, the odds are that we're going to
 	 * nuke this pte, so read and clear the pte.  This hook
@@ -87,9 +63,6 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un
 	 * Is the page already in the swap cache? If so, then
 	 * we can just drop our reference to it without doing
 	 * any IO - it's already up-to-date on disk.
-	 *
-	 * Return 0, as we didn't actually free any real
-	 * memory, and we should just continue our scan.
 	 */
 	if (PageSwapCache(page)) {
 		entry.val = page->index;
@@ -99,12 +72,12 @@ set_swap_pte:
 		swap_duplicate(entry);
 		set_pte(page_table, swp_entry_to_pte(entry));
 drop_pte:
-		UnlockPage(page);
 		mm->rss--;
-		deactivate_page(page);
+		if (!page->age)
+			deactivate_page(page);
+		UnlockPage(page);
 		page_cache_release(page);
-out_failed:
-		return 0;
+		return;
 	}
 
 	/*
@@ -153,34 +126,20 @@ out_failed:
 out_unlock_restore:
 	set_pte(page_table, pte);
 	UnlockPage(page);
-	return 0;
+	return;
 }
 
-/*
- * A new implementation of swap_out().  We do not swap complete processes,
- * but only a small number of blocks, before we continue with the next
- * process.  The number of blocks actually swapped is determined on the
- * number of page faults, that this process actually had in the last time,
- * so we won't swap heavily used processes all the time ...
- *
- * Note: the priority argument is a hint on much CPU to waste with the
- *       swap block search, not a hint, of how much blocks to swap with
- *       each process.
- *
- * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
- */
-
-static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count)
 {
 	pte_t * pte;
 	unsigned long pmd_end;
 
 	if (pmd_none(*dir))
-		return 0;
+		return count;
 	if (pmd_bad(*dir)) {
 		pmd_ERROR(*dir);
 		pmd_clear(dir);
-		return 0;
+		return count;
 	}
 	
 	pte = pte_offset(dir, address);
@@ -190,28 +149,33 @@ static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vm
 		end = pmd_end;
 
 	do {
-		int result;
-		mm->swap_address = address + PAGE_SIZE;
-		result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
-		if (result)
-			return result;
+		if (pte_present(*pte)) {
+			struct page *page = pte_page(*pte);
+
+			if (VALID_PAGE(page) && !PageReserved(page)) {
+				try_to_swap_out(mm, vma, address, pte, page);
+				if (!--count)
+					break;
+			}
+		}
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
-	return 0;
+	mm->swap_address = address + PAGE_SIZE;
+	return count;
 }
 
-static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count)
 {
 	pmd_t * pmd;
 	unsigned long pgd_end;
 
 	if (pgd_none(*dir))
-		return 0;
+		return count;
 	if (pgd_bad(*dir)) {
 		pgd_ERROR(*dir);
 		pgd_clear(dir);
-		return 0;
+		return count;
 	}
 
 	pmd = pmd_offset(dir, address);
@@ -221,23 +185,23 @@ static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vm
 		end = pgd_end;
 	
 	do {
-		int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
-		if (result)
-			return result;
+		count = swap_out_pmd(mm, vma, pmd, address, end, count);
+		if (!count)
+			break;
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
-	return 0;
+	return count;
 }
 
-static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
+static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count)
 {
 	pgd_t *pgdir;
 	unsigned long end;
 
 	/* Don't swap out areas which are locked down */
 	if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
-		return 0;
+		return count;
 
 	pgdir = pgd_offset(mm, address);
 
@@ -245,18 +209,17 @@ static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsi
 	if (address >= end)
 		BUG();
 	do {
-		int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
-		if (result)
-			return result;
+		count = swap_out_pgd(mm, vma, pgdir, address, end, count);
+		if (!count)
+			break;
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
 	} while (address && (address < end));
-	return 0;
+	return count;
 }
 
-static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
+static int swap_out_mm(struct mm_struct * mm, int count)
 {
-	int result = 0;
 	unsigned long address;
 	struct vm_area_struct* vma;
 
@@ -276,8 +239,8 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
 			address = vma->vm_start;
 
 		for (;;) {
-			result = swap_out_vma(mm, vma, address, gfp_mask);
-			if (result)
+			count = swap_out_vma(mm, vma, address, count);
+			if (!count)
 				goto out_unlock;
 			vma = vma->vm_next;
 			if (!vma)
@@ -287,94 +250,63 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
 	}
 	/* Reset to 0 when we reach the end of address space */
 	mm->swap_address = 0;
-	mm->swap_cnt = 0;
 
 out_unlock:
 	spin_unlock(&mm->page_table_lock);
-	return result;
+	return !count;
 }
 
 /*
- * Select the task with maximal swap_cnt and try to swap out a page.
  * N.B. This function returns only 0 or 1.  Return values != 1 from
  * the lower level routines result in continued processing.
  */
 #define SWAP_SHIFT 5
 #define SWAP_MIN 8
 
+static inline int swap_amount(struct mm_struct *mm)
+{
+	int quota = mm->rss >> SWAP_SHIFT;	/* rss scaled down by SWAP_SHIFT */
+	return (quota >= SWAP_MIN) ? quota : SWAP_MIN;	/* never below SWAP_MIN */
+}
+
 static int swap_out(unsigned int priority, int gfp_mask)
 {
 	int counter;
-	int __ret = 0;
-
-	/* 
-	 * We make one or two passes through the task list, indexed by 
-	 * assign = {0, 1}:
-	 *   Pass 1: select the swappable task with maximal RSS that has
-	 *         not yet been swapped out. 
-	 *   Pass 2: re-assign rss swap_cnt values, then select as above.
-	 *
-	 * With this approach, there's no need to remember the last task
-	 * swapped out.  If the swap-out fails, we clear swap_cnt so the 
-	 * task won't be selected again until all others have been tried.
-	 *
-	 * Think of swap_cnt as a "shadow rss" - it tells us which process
-	 * we want to page out (always try largest first).
-	 */
-	counter = (nr_threads << SWAP_SHIFT) >> priority;
-	if (counter < 1)
-		counter = 1;
+	int retval = 0;
+	struct mm_struct *mm = current->mm;
 
-	for (; counter >= 0; counter--) {
+	/* Always start by trying to penalize the process that is allocating memory */
+	if (mm)
+		retval = swap_out_mm(mm, swap_amount(mm));
+
+	/* Then, look at the other mm's */
+	counter = mmlist_nr >> priority;
+	do {
 		struct list_head *p;
-		unsigned long max_cnt = 0;
-		struct mm_struct *best = NULL;
-		int assign = 0;
-		int found_task = 0;
-	select:
+
 		spin_lock(&mmlist_lock);
 		p = init_mm.mmlist.next;
-		for (; p != &init_mm.mmlist; p = p->next) {
-			struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist);
-	 		if (mm->rss <= 0)
-				continue;
-			found_task++;
-			/* Refresh swap_cnt? */
-			if (assign == 1) {
-				mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
-				if (mm->swap_cnt < SWAP_MIN)
-					mm->swap_cnt = SWAP_MIN;
-			}
-			if (mm->swap_cnt > max_cnt) {
-				max_cnt = mm->swap_cnt;
-				best = mm;
-			}
-		}
+		if (p == &init_mm.mmlist)
+			goto empty;
+
+		/* Move it to the back of the queue.. */
+		list_del(p);
+		list_add_tail(p, &init_mm.mmlist);
+		mm = list_entry(p, struct mm_struct, mmlist);
 
-		/* Make sure it doesn't disappear */
-		if (best)
-			atomic_inc(&best->mm_users);
+		/* Make sure the mm doesn't disappear when we drop the lock.. */
+		atomic_inc(&mm->mm_users);
 		spin_unlock(&mmlist_lock);
 
-		/*
-		 * We have dropped the tasklist_lock, but we
-		 * know that "mm" still exists: we are running
-		 * with the big kernel lock, and exit_mm()
-		 * cannot race with us.
-		 */
-		if (!best) {
-			if (!assign && found_task > 0) {
-				assign = 1;
-				goto select;
-			}
-			break;
-		} else {
-			__ret = swap_out_mm(best, gfp_mask);
-			mmput(best);
-			break;
-		}
-	}
-	return __ret;
+		/* Scan about 1/32 (see SWAP_SHIFT) of the mm's resident pages */
+		retval |= swap_out_mm(mm, swap_amount(mm));
+		mmput(mm);
+	} while (--counter >= 0);
+	return retval;
+
+empty:
+	spin_unlock(&mmlist_lock);
+	return 0;
 }
 
 
@@ -540,7 +472,6 @@ dirty_page_rescan:
 		 */
 		if (PageDirty(page)) {
 			int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
-			int result;
 
 			if (!writepage)
 				goto page_active;
@@ -558,16 +489,12 @@ dirty_page_rescan:
 			page_cache_get(page);
 			spin_unlock(&pagemap_lru_lock);
 
-			result = writepage(page);
+			writepage(page);
 			page_cache_release(page);
 
 			/* And re-start the thing.. */
 			spin_lock(&pagemap_lru_lock);
-			if (result != 1)
-				continue;
-			/* writepage refused to do anything */
-			set_page_dirty(page);
-			goto page_active;
+			continue;
 		}
 
 		/*
@@ -808,6 +735,9 @@ int free_shortage(void)
 int inactive_shortage(void)
 {
 	int shortage = 0;
+	pg_data_t *pgdat = pgdat_list;
+
+	/* Is the inactive dirty list too small? */
 
 	shortage += freepages.high;
 	shortage += inactive_target;
@@ -818,7 +748,27 @@ int inactive_shortage(void)
 	if (shortage > 0)
 		return shortage;
 
-	return 0;
+	/* If not, do we have enough per-zone pages on the inactive list? */
+
+	shortage = 0;
+
+	do {
+		int i;
+		for(i = 0; i < MAX_NR_ZONES; i++) {
+			int zone_shortage;
+			zone_t *zone = pgdat->node_zones+ i;
+
+			zone_shortage = zone->pages_high;
+			zone_shortage -= zone->inactive_dirty_pages;
+			zone_shortage -= zone->inactive_clean_pages;
+			zone_shortage -= zone->free_pages;
+			if (zone_shortage > 0)
+				shortage += zone_shortage;
+		}
+		pgdat = pgdat->node_next;
+	} while (pgdat);
+
+	return shortage;
 }
 
 /*
@@ -833,72 +783,35 @@ int inactive_shortage(void)
  * really care about latency. In that case we don't try
  * to free too many pages.
  */
+#define DEF_PRIORITY (6)
 static int refill_inactive(unsigned int gfp_mask, int user)
 {
-	int priority, count, start_count, made_progress;
+	int count, start_count, maxtry;
 
 	count = inactive_shortage() + free_shortage();
 	if (user)
 		count = (1 << page_cluster);
 	start_count = count;
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(gfp_mask);
-
-	priority = 6;
+	maxtry = 6;
 	do {
-		made_progress = 0;
-
 		if (current->need_resched) {
 			__set_current_state(TASK_RUNNING);
 			schedule();
 		}
 
-		while (refill_inactive_scan(priority, 1)) {
-			made_progress = 1;
-			if (--count <= 0)
-				goto done;
-		}
-
-		/*
-		 * don't be too light against the d/i cache since
-	   	 * refill_inactive() almost never fail when there's
-	   	 * really plenty of memory free. 
-		 */
-		shrink_dcache_memory(priority, gfp_mask);
-		shrink_icache_memory(priority, gfp_mask);
-
-		/*
-		 * Then, try to page stuff out..
-		 */
-		while (swap_out(priority, gfp_mask)) {
-			made_progress = 1;
+		while (refill_inactive_scan(DEF_PRIORITY, 1)) {
 			if (--count <= 0)
 				goto done;
 		}
 
-		/*
-		 * If we either have enough free memory, or if
-		 * page_launder() will be able to make enough
-		 * free memory, then stop.
-		 */
-		if (!inactive_shortage() || !free_shortage())
-			goto done;
+		/* If refill_inactive_scan failed, try to page stuff out.. */
+		swap_out(DEF_PRIORITY, gfp_mask);
 
-		/*
-		 * Only switch to a lower "priority" if we
-		 * didn't make any useful progress in the
-		 * last loop.
-		 */
-		if (!made_progress)
-			priority--;
-	} while (priority >= 0);
-
-	/* Always end on a refill_inactive.., may sleep... */
-	while (refill_inactive_scan(0, 1)) {
-		if (--count <= 0)
-			goto done;
-	}
+		if (--maxtry <= 0)
+				return 0;
+		
+	} while (inactive_shortage());
 
 done:
 	return (count < start_count);
@@ -922,20 +835,29 @@ static int do_try_to_free_pages(unsigned int gfp_mask, int user)
 
 	/*
 	 * If needed, we move pages from the active list
-	 * to the inactive list. We also "eat" pages from
-	 * the inode and dentry cache whenever we do this.
+	 * to the inactive list.
 	 */
-	if (free_shortage() || inactive_shortage()) {
-		shrink_dcache_memory(6, gfp_mask);
-		shrink_icache_memory(6, gfp_mask);
+	if (inactive_shortage())
 		ret += refill_inactive(gfp_mask, user);
+
+	/*
+	 * Under a free-page shortage, shrink the dentry and inode
+	 * caches; otherwise reap unused slab cache memory instead.
+	 */
+	if (free_shortage()) {
+		shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
+		shrink_icache_memory(DEF_PRIORITY, gfp_mask);
 	} else {
 		/*
-		 * Reclaim unused slab cache memory.
+		 * Illogical, but true. At least for now.
+		 *
+		 * If we're _not_ under shortage any more, we
+		 * reap the caches. Why? Because a noticeable
+		 * part of the caches are the buffer-heads, 
+		 * which we'll want to keep if under shortage.
 		 */
 		kmem_cache_reap(gfp_mask);
-		ret = 1;
-	}
+	} 
 
 	return ret;
 }
@@ -988,13 +910,8 @@ int kswapd(void *unused)
 		static int recalc = 0;
 
 		/* If needed, try to free some memory. */
-		if (inactive_shortage() || free_shortage()) {
-			int wait = 0;
-			/* Do we need to do some synchronous flushing? */
-			if (waitqueue_active(&kswapd_done))
-				wait = 1;
-			do_try_to_free_pages(GFP_KSWAPD, wait);
-		}
+		if (inactive_shortage() || free_shortage()) 
+			do_try_to_free_pages(GFP_KSWAPD, 0);
 
 		/*
 		 * Do some (very minimal) background scanning. This
@@ -1002,7 +919,7 @@ int kswapd(void *unused)
 		 * every minute. This clears old referenced bits
 		 * and moves unused pages to the inactive list.
 		 */
-		refill_inactive_scan(6, 0);
+		refill_inactive_scan(DEF_PRIORITY, 0);
 
 		/* Once a second, recalculate some VM stats. */
 		if (time_after(jiffies, recalc + HZ)) {
@@ -1010,11 +927,6 @@ int kswapd(void *unused)
 			recalculate_vm_stats();
 		}
 
-		/*
-		 * Wake up everybody waiting for free memory
-		 * and unplug the disk queue.
-		 */
-		wake_up_all(&kswapd_done);
 		run_task_queue(&tq_disk);
 
 		/* 
@@ -1045,33 +957,10 @@ int kswapd(void *unused)
 	}
 }
 
-void wakeup_kswapd(int block)
+void wakeup_kswapd(void)
 {
-	DECLARE_WAITQUEUE(wait, current);
-
-	if (current == kswapd_task)
-		return;
-
-	if (!block) {
-		if (waitqueue_active(&kswapd_wait))
-			wake_up(&kswapd_wait);
-		return;
-	}
-
-	/*
-	 * Kswapd could wake us up before we get a chance
-	 * to sleep, so we have to be very careful here to
-	 * prevent SMP races...
-	 */
-	__set_current_state(TASK_UNINTERRUPTIBLE);
-	add_wait_queue(&kswapd_done, &wait);
-
-	if (waitqueue_active(&kswapd_wait))
-		wake_up(&kswapd_wait);
-	schedule();
-
-	remove_wait_queue(&kswapd_done, &wait);
-	__set_current_state(TASK_RUNNING);
+	if (current != kswapd_task)
+		wake_up_process(kswapd_task);
 }
 
 /*
@@ -1096,7 +985,7 @@ DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
 /*
  * Kreclaimd will move pages from the inactive_clean list to the
  * free list, in order to keep atomic allocations possible under
- * all circumstances. Even when kswapd is blocked on IO.
+ * all circumstances.
  */
 int kreclaimd(void *unused)
 {