Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile         2
-rw-r--r--  mm/bootmem.c       69
-rw-r--r--  mm/filemap.c      237
-rw-r--r--  mm/highmem.c        7
-rw-r--r--  mm/memory.c        36
-rw-r--r--  mm/mlock.c         24
-rw-r--r--  mm/mmap.c          28
-rw-r--r--  mm/mprotect.c      31
-rw-r--r--  mm/mremap.c        22
-rw-r--r--  mm/numa.c          50
-rw-r--r--  mm/oom_kill.c     210
-rw-r--r--  mm/page_alloc.c    91
-rw-r--r--  mm/swap.c           1
-rw-r--r--  mm/swapfile.c       8
-rw-r--r--  mm/vmalloc.c       14
-rw-r--r--  mm/vmscan.c        70
16 files changed, 598 insertions, 302 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 56e93693b..d74cdec48 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -10,7 +10,7 @@
O_TARGET := mm.o
O_OBJS := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
- page_alloc.o swap_state.o swapfile.o numa.o
+ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o
ifeq ($(CONFIG_HIGHMEM),y)
O_OBJS += highmem.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 0a8d37ba2..e9e9ef7bc 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -41,11 +41,15 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages)
/*
* Called once to set up the allocator itself.
*/
-static unsigned long __init init_bootmem_core (bootmem_data_t *bdata,
+static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
unsigned long mapstart, unsigned long start, unsigned long end)
{
+ bootmem_data_t *bdata = pgdat->bdata;
unsigned long mapsize = ((end - start)+7)/8;
+ pgdat->node_next = pgdat_list;
+ pgdat_list = pgdat;
+
mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL);
bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
bdata->node_boot_start = (start << PAGE_SHIFT);
@@ -172,10 +176,6 @@ restart_scan:
preferred = 0;
goto restart_scan;
}
- /*
- * Whoops, we cannot satisfy the allocation request.
- */
- BUG();
found:
if (start >= eidx)
BUG();
@@ -221,15 +221,15 @@ found:
return ret;
}
-static unsigned long __init free_all_bootmem_core(int nid, bootmem_data_t *bdata)
+static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
{
- struct page * page;
+ struct page *page = pgdat->node_mem_map;
+ bootmem_data_t *bdata = pgdat->bdata;
unsigned long i, count, total = 0;
unsigned long idx;
if (!bdata->node_bootmem_map) BUG();
- page = NODE_MEM_MAP(nid);
count = 0;
idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
for (i = 0; i < idx; i++, page++) {
@@ -260,59 +260,78 @@ static unsigned long __init free_all_bootmem_core(int nid, bootmem_data_t *bdata
return total;
}
-unsigned long __init init_bootmem_node (int nid, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn)
+unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn)
{
- return(init_bootmem_core(NODE_DATA(nid)->bdata, freepfn, startpfn, endpfn));
+ return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn));
}
-void __init reserve_bootmem_node (int nid, unsigned long physaddr, unsigned long size)
+void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
{
- reserve_bootmem_core(NODE_DATA(nid)->bdata, physaddr, size);
+ reserve_bootmem_core(pgdat->bdata, physaddr, size);
}
-void __init free_bootmem_node (int nid, unsigned long physaddr, unsigned long size)
+void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
{
- return(free_bootmem_core(NODE_DATA(nid)->bdata, physaddr, size));
+ return(free_bootmem_core(pgdat->bdata, physaddr, size));
}
-unsigned long __init free_all_bootmem_node (int nid)
+unsigned long __init free_all_bootmem_node (pg_data_t *pgdat)
{
- return(free_all_bootmem_core(nid, NODE_DATA(nid)->bdata));
+ return(free_all_bootmem_core(pgdat));
}
unsigned long __init init_bootmem (unsigned long start, unsigned long pages)
{
max_low_pfn = pages;
min_low_pfn = start;
- return(init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages));
+ return(init_bootmem_core(&contig_page_data, start, 0, pages));
}
void __init reserve_bootmem (unsigned long addr, unsigned long size)
{
- reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
+ reserve_bootmem_core(contig_page_data.bdata, addr, size);
}
void __init free_bootmem (unsigned long addr, unsigned long size)
{
- return(free_bootmem_core(NODE_DATA(0)->bdata, addr, size));
+ return(free_bootmem_core(contig_page_data.bdata, addr, size));
}
unsigned long __init free_all_bootmem (void)
{
- return(free_all_bootmem_core(0, NODE_DATA(0)->bdata));
+ return(free_all_bootmem_core(&contig_page_data));
}
void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal)
{
+ pg_data_t *pgdat = pgdat_list;
+ void *ptr;
+
+ while (pgdat) {
+ if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+ align, goal)))
+ return(ptr);
+ pgdat = pgdat->node_next;
+ }
/*
- * In the discontigmem case, all non-node specific allocations come
- * from the first node, node 0.
+ * Whoops, we cannot satisfy the allocation request.
*/
- return(__alloc_bootmem_core(NODE_DATA(0)->bdata, size, align, goal));
+ BUG();
+ return NULL;
}
-void * __init __alloc_bootmem_node (int nid, unsigned long size, unsigned long align, unsigned long goal)
+void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal)
{
- return(__alloc_bootmem_core(NODE_DATA(nid)->bdata, size, align, goal));
+ void *ptr;
+
+ ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal);
+ if (ptr)
+ return (ptr);
+
+ /*
+ * Whoops, we cannot satisfy the allocation request.
+ */
+ BUG();
+ return NULL;
}
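
The bootmem interface above now takes pg_data_t pointers, and __alloc_bootmem() walks the global pgdat_list so a node-agnostic allocation can fall back to any node that has room, hitting BUG() only after every node has been tried. A minimal user-space sketch of that fallback walk (hypothetical struct node / node_alloc names, not the kernel API):

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for pg_data_t/bootmem_data_t: each "node" owns a
 * fixed pool, and nodes are chained the way init_bootmem_core() now chains
 * pgdat_list. */
struct node {
	struct node *next;
	size_t avail;               /* bytes still free in this node's pool */
};

static void *node_alloc(struct node *n, size_t size)
{
	if (size > n->avail)
		return NULL;        /* this node cannot satisfy the request */
	n->avail -= size;
	return malloc(size);        /* stand-in for carving out boot memory */
}

/* Mirrors the shape of the new __alloc_bootmem(): try every node in turn,
 * and treat total failure as fatal (the kernel BUG()s here). */
static void *alloc_any_node(struct node *list, size_t size)
{
	for (struct node *n = list; n; n = n->next) {
		void *p = node_alloc(n, size);
		if (p)
			return p;
	}
	fprintf(stderr, "cannot satisfy allocation of %zu bytes\n", size);
	abort();
}

int main(void)
{
	struct node n1 = { NULL, 64 };
	struct node n0 = { &n1, 16 };            /* head of the "pgdat_list" */

	void *p = alloc_any_node(&n0, 32);       /* falls through to the second node */
	printf("allocated 32 bytes from the fallback node: %p\n", p);
	free(p);
	return 0;
}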
diff --git a/mm/filemap.c b/mm/filemap.c
index 6aca16409..b19f4c5b3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -135,6 +135,12 @@ void invalidate_inode_pages(struct inode * inode)
if (TryLockPage(page))
continue;
+ /* Neither can we invalidate something in use.. */
+ if (page_count(page) != 1) {
+ UnlockPage(page);
+ continue;
+ }
+
__lru_cache_del(page);
__remove_inode_page(page);
UnlockPage(page);
@@ -156,6 +162,7 @@ static inline void truncate_partial_page(struct page *page, unsigned partial)
static inline void truncate_complete_page(struct page *page)
{
+ /* Leave it on the LRU if it gets converted into anonymous buffers */
if (!page->buffers || block_flushpage(page, 0))
lru_cache_del(page);
@@ -167,6 +174,7 @@ static inline void truncate_complete_page(struct page *page)
* all sorts of fun problems ...
*/
ClearPageDirty(page);
+ ClearPageUptodate(page);
remove_inode_page(page);
page_cache_release(page);
}
@@ -495,20 +503,46 @@ void ___wait_on_page(struct page *page)
}
/*
- * Get an exclusive lock on the page..
+ * Get a lock on the page, assuming we need to sleep
+ * to get it..
*/
-void lock_page(struct page *page)
+static void __lock_page(struct page *page)
{
- while (TryLockPage(page))
- ___wait_on_page(page);
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
+
+ add_wait_queue_exclusive(&page->wait, &wait);
+ for (;;) {
+ sync_page(page);
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ if (PageLocked(page)) {
+ run_task_queue(&tq_disk);
+ schedule();
+ continue;
+ }
+ if (!TryLockPage(page))
+ break;
+ }
+ tsk->state = TASK_RUNNING;
+ remove_wait_queue(&page->wait, &wait);
}
+
+/*
+ * Get an exclusive lock on the page, optimistically
+ * assuming it's not locked..
+ */
+void lock_page(struct page *page)
+{
+ if (TryLockPage(page))
+ __lock_page(page);
+}
/*
* a rather lightweight function, finding and getting a reference to a
* hashed page atomically, waiting for it if it's locked.
*/
-struct page * __find_get_page (struct address_space *mapping,
+static struct page * __find_get_page(struct address_space *mapping,
unsigned long offset, struct page **hash)
{
struct page *page;
@@ -517,41 +551,11 @@ struct page * __find_get_page (struct address_space *mapping,
* We scan the hash list read-only. Addition to and removal from
* the hash-list needs a held write-lock.
*/
-repeat:
spin_lock(&pagecache_lock);
page = __find_page_nolock(mapping, offset, *hash);
if (page)
page_cache_get(page);
spin_unlock(&pagecache_lock);
-
- /* Found the page, sleep if locked. */
- if (page && PageLocked(page)) {
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
-
- sync_page(page);
-
- __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- add_wait_queue(&page->wait, &wait);
-
- if (PageLocked(page))
- schedule();
- __set_task_state(tsk, TASK_RUNNING);
- remove_wait_queue(&page->wait, &wait);
-
- /*
- * The page might have been unhashed meanwhile. It's
- * not freed though because we hold a reference to it.
- * If this is the case then it will be freed _here_,
- * and we recheck the hash anyway.
- */
- page_cache_release(page);
- goto repeat;
- }
- /*
- * It's not locked so we can return the page and we hold
- * a reference to it.
- */
return page;
}
@@ -570,39 +574,23 @@ struct page * __find_lock_page (struct address_space *mapping,
repeat:
spin_lock(&pagecache_lock);
page = __find_page_nolock(mapping, offset, *hash);
- if (page)
+ if (page) {
page_cache_get(page);
- spin_unlock(&pagecache_lock);
-
- /* Found the page, sleep if locked. */
- if (page && TryLockPage(page)) {
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
-
- sync_page(page);
+ spin_unlock(&pagecache_lock);
- __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- add_wait_queue(&page->wait, &wait);
+ lock_page(page);
- if (PageLocked(page))
- schedule();
- __set_task_state(tsk, TASK_RUNNING);
- remove_wait_queue(&page->wait, &wait);
+ /* Is the page still hashed? Ok, good.. */
+ if (page->mapping)
+ return page;
- /*
- * The page might have been unhashed meanwhile. It's
- * not freed though because we hold a reference to it.
- * If this is the case then it will be freed _here_,
- * and we recheck the hash anyway.
- */
+ /* Nope: we raced. Release and try again.. */
+ UnlockPage(page);
page_cache_release(page);
goto repeat;
}
- /*
- * It's not locked so we can return the page and we hold
- * a reference to it.
- */
- return page;
+ spin_unlock(&pagecache_lock);
+ return NULL;
}
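
__find_lock_page() above, and the new "did it get unhashed?" checks added throughout this file, all follow one pattern: take a reference under pagecache_lock, sleep for the page lock, then re-check page->mapping and retry from scratch if the page was truncated while we slept. A single-threaded sketch of that lookup/lock/revalidate loop, with a toy entry type standing in for struct page:

#include <stddef.h>
#include <stdio.h>

/* Toy stand-in for a page-cache entry: "mapping" is non-NULL while the
 * entry is still hashed; truncate clears it.  Purely illustrative. */
struct entry {
	const char *mapping;
	int refcount;
};

static struct entry stale = { NULL, 0 };        /* already truncated        */
static struct entry fresh = { "inode", 0 };     /* what a re-lookup finds   */
static int lookups;

static struct entry *lookup(void)               /* "hash lookup" + get ref  */
{
	struct entry *e = (lookups++ == 0) ? &stale : &fresh;
	e->refcount++;
	return e;
}

static void lock_entry(struct entry *e)   { (void)e; /* may sleep */ }
static void unlock_entry(struct entry *e) { (void)e; }
static void put_entry(struct entry *e)    { e->refcount--; }

/* The find-lock-revalidate loop from __find_lock_page(): if the entry lost
 * its mapping while we slept for the lock, drop it and retry. */
static struct entry *find_lock(void)
{
	for (;;) {
		struct entry *e = lookup();
		lock_entry(e);
		if (e->mapping)
			return e;          /* still hashed: ours to use */
		unlock_entry(e);           /* raced with truncate...    */
		put_entry(e);              /* ...release and try again  */
	}
}

int main(void)
{
	struct entry *e = find_lock();
	printf("got entry with mapping=%s (after %d lookups)\n",
	       e->mapping, lookups);
	return 0;
}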
#if 0
@@ -993,7 +981,7 @@ page_ok:
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
- if (page->mapping->i_mmap_shared != NULL)
+ if (mapping->i_mmap_shared != NULL)
flush_dcache_page(page);
/*
@@ -1027,6 +1015,15 @@ page_not_up_to_date:
/* Get exclusive access to the page ... */
lock_page(page);
+
+ /* Did it get unhashed before we got the lock? */
+ if (!page->mapping) {
+ UnlockPage(page);
+ page_cache_release(page);
+ continue;
+ }
+
+ /* Did somebody else fill it already? */
if (Page_Uptodate(page)) {
UnlockPage(page);
goto page_ok;
@@ -1323,16 +1320,16 @@ struct page * filemap_nopage(struct vm_area_struct * area,
struct inode *inode = file->f_dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
struct page *page, **hash, *old_page;
- unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ unsigned long size, pgoff;
- unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+ pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+retry_all:
/*
- * Semantics for shared and private memory areas are different
- * past the end of the file. A shared mapping past the last page
- * of the file is an error and results in a SIGBUS, while a
- * private mapping just maps in a zero page.
+ * An external ptracer can access pages that normally aren't
+ * accessible..
*/
+ size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if ((pgoff >= size) && (area->vm_mm == current->mm))
return NULL;
@@ -1411,6 +1408,15 @@ no_cached_page:
page_not_uptodate:
lock_page(page);
+
+ /* Did it get unhashed while we waited for it? */
+ if (!page->mapping) {
+ UnlockPage(page);
+ page_cache_release(page);
+ goto retry_all;
+ }
+
+ /* Did somebody else get it up-to-date? */
if (Page_Uptodate(page)) {
UnlockPage(page);
goto success;
@@ -1429,6 +1435,15 @@ page_not_uptodate:
* and we need to check for errors.
*/
lock_page(page);
+
+ /* Somebody truncated the page on us? */
+ if (!page->mapping) {
+ UnlockPage(page);
+ page_cache_release(page);
+ goto retry_all;
+ }
+
+ /* Somebody else successfully read it in? */
if (Page_Uptodate(page)) {
UnlockPage(page);
goto success;
@@ -1448,17 +1463,25 @@ page_not_uptodate:
return NULL;
}
+/*
+ * If a task terminates while we're swapping the page, the vma and
+ * file could be released: try_to_swap_out has done a get_file.
+ * vma/file is guaranteed to exist in the unmap/sync cases because
+ * mmap_sem is held.
+ *
+ * The "mapping" test takes care of somebody having truncated the
+ * page and thus made this write-page a no-op..
+ */
static int filemap_write_page(struct file *file,
struct page * page,
int wait)
{
- /*
- * If a task terminates while we're swapping the page, the vma and
- * and file could be released: try_to_swap_out has done a get_file.
- * vma/file is guaranteed to exist in the unmap/sync cases because
- * mmap_sem is held.
- */
- return page->mapping->a_ops->writepage(file, page);
+ struct address_space * mapping = page->mapping;
+ int error = 0;
+
+ if (mapping)
+ error = mapping->a_ops->writepage(file, page);
+ return error;
}
@@ -1475,39 +1498,47 @@ int filemap_swapout(struct page * page, struct file * file)
return retval;
}
+/* Called with mm->page_table_lock held to protect against other
+ * threads/the swapper from ripping pte's out from under us.
+ */
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
unsigned long pgoff;
- pte_t pte = *ptep;
+ pte_t pte;
struct page *page;
int error;
+ pte = *ptep;
+
if (!(flags & MS_INVALIDATE)) {
if (!pte_present(pte))
- return 0;
- if (!pte_dirty(pte))
- return 0;
+ goto out;
+ if (!ptep_test_and_clear_dirty(ptep))
+ goto out;
flush_page_to_ram(pte_page(pte));
flush_cache_page(vma, address);
- set_pte(ptep, pte_mkclean(pte));
flush_tlb_page(vma, address);
page = pte_page(pte);
page_cache_get(page);
} else {
if (pte_none(pte))
- return 0;
+ goto out;
flush_cache_page(vma, address);
- pte_clear(ptep);
+
+ pte = ptep_get_and_clear(ptep);
flush_tlb_page(vma, address);
+
if (!pte_present(pte)) {
+ spin_unlock(&vma->vm_mm->page_table_lock);
swap_free(pte_to_swp_entry(pte));
- return 0;
+ spin_lock(&vma->vm_mm->page_table_lock);
+ goto out;
}
page = pte_page(pte);
if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
page_cache_free(page);
- return 0;
+ goto out;
}
}
pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
@@ -1516,11 +1547,20 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
}
+
+ spin_unlock(&vma->vm_mm->page_table_lock);
lock_page(page);
+
error = filemap_write_page(vma->vm_file, page, 1);
+
UnlockPage(page);
page_cache_free(page);
+
+ spin_lock(&vma->vm_mm->page_table_lock);
return error;
+
+out:
+ return 0;
}
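
filemap_sync_pte() now runs with mm->page_table_lock held, so the operations that may sleep or take other locks — swap_free(), lock_page() and the filemap_write_page() call — are bracketed by an explicit unlock/relock of that spinlock. The shape of that "drop the lock around the blocking part" idiom, as a small pthread sketch (hypothetical names, a mutex standing in for the spinlock):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for lock_page()/writepage(): anything that can sleep and must
 * therefore not run with the (spin)lock held. */
static void blocking_writeback(int id)
{
	usleep(1000);
	printf("wrote back entry %d\n", id);
}

/* Mirrors the shape of the new filemap_sync_pte(): called with the lock
 * held, drops it around the blocking part, retakes it before returning. */
static void sync_one_entry(int id)
{
	/* ... examine and clear the entry under table_lock ... */
	pthread_mutex_unlock(&table_lock);
	blocking_writeback(id);
	pthread_mutex_lock(&table_lock);
	/* ... caller continues the walk with the lock held again ... */
}

int main(void)
{
	pthread_mutex_lock(&table_lock);
	sync_one_entry(1);
	sync_one_entry(2);
	pthread_mutex_unlock(&table_lock);
	return 0;
}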
static inline int filemap_sync_pte_range(pmd_t * pmd,
@@ -1590,6 +1630,11 @@ int filemap_sync(struct vm_area_struct * vma, unsigned long address,
unsigned long end = address + size;
int error = 0;
+ /* Acquire the lock early; it may be possible to avoid dropping
+ * and reacquiring it repeatedly.
+ */
+ spin_lock(&vma->vm_mm->page_table_lock);
+
dir = pgd_offset(vma->vm_mm, address);
flush_cache_range(vma->vm_mm, end - size, end);
if (address >= end)
@@ -1600,6 +1645,9 @@ int filemap_sync(struct vm_area_struct * vma, unsigned long address,
dir++;
} while (address && (address < end));
flush_tlb_range(vma->vm_mm, end - size, end);
+
+ spin_unlock(&vma->vm_mm->page_table_lock);
+
return error;
}
@@ -1766,11 +1814,11 @@ static long madvise_fixup_start(struct vm_area_struct * vma,
get_file(n->vm_file);
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = end;
insert_vm_struct(current->mm, n);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -1790,10 +1838,10 @@ static long madvise_fixup_end(struct vm_area_struct * vma,
get_file(n->vm_file);
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_end = start;
insert_vm_struct(current->mm, n);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -1823,7 +1871,7 @@ static long madvise_fixup_middle(struct vm_area_struct * vma,
vma->vm_ops->open(left);
vma->vm_ops->open(right);
}
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = start;
vma->vm_end = end;
@@ -1831,7 +1879,7 @@ static long madvise_fixup_middle(struct vm_area_struct * vma,
vma->vm_raend = 0;
insert_vm_struct(current->mm, left);
insert_vm_struct(current->mm, right);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -2270,13 +2318,20 @@ struct page *read_cache_page(struct address_space *mapping,
int (*filler)(void *,struct page*),
void *data)
{
- struct page *page = __read_cache_page(mapping, index, filler, data);
+ struct page *page;
int err;
+retry:
+ page = __read_cache_page(mapping, index, filler, data);
if (IS_ERR(page) || Page_Uptodate(page))
goto out;
lock_page(page);
+ if (!page->mapping) {
+ UnlockPage(page);
+ page_cache_release(page);
+ goto retry;
+ }
if (Page_Uptodate(page)) {
UnlockPage(page);
goto out;
diff --git a/mm/highmem.c b/mm/highmem.c
index 3be601c6f..d83d9bb87 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -130,10 +130,9 @@ static void flush_all_zero_pkmaps(void)
if (pkmap_count[i] != 1)
continue;
pkmap_count[i] = 0;
- pte = pkmap_page_table[i];
+ pte = ptep_get_and_clear(pkmap_page_table+i);
if (pte_none(pte))
BUG();
- pte_clear(pkmap_page_table+i);
page = pte_page(pte);
page->virtual = NULL;
}
@@ -310,7 +309,7 @@ struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig)
repeat_bh:
bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER);
if (!bh) {
- wakeup_bdflush(1);
+ wakeup_bdflush(1); /* Sets task->state to TASK_RUNNING */
current->policy |= SCHED_YIELD;
schedule();
goto repeat_bh;
@@ -324,7 +323,7 @@ repeat_bh:
repeat_page:
page = alloc_page(GFP_BUFFER);
if (!page) {
- wakeup_bdflush(1);
+ wakeup_bdflush(1); /* Sets task->state to TASK_RUNNING */
current->policy |= SCHED_YIELD;
schedule();
goto repeat_page;
diff --git a/mm/memory.c b/mm/memory.c
index 6b047821d..11048ddce 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -215,30 +215,30 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
/* copy_one_pte */
if (pte_none(pte))
- goto cont_copy_pte_range;
+ goto cont_copy_pte_range_noset;
if (!pte_present(pte)) {
swap_duplicate(pte_to_swp_entry(pte));
- set_pte(dst_pte, pte);
goto cont_copy_pte_range;
}
ptepage = pte_page(pte);
if ((!VALID_PAGE(ptepage)) ||
- PageReserved(ptepage)) {
- set_pte(dst_pte, pte);
+ PageReserved(ptepage))
goto cont_copy_pte_range;
- }
+
/* If it's a COW mapping, write protect it both in the parent and the child */
if (cow) {
- pte = pte_wrprotect(pte);
- set_pte(src_pte, pte);
+ ptep_clear_wrprotect(src_pte);
+ pte = *src_pte;
}
+
/* If it's a shared mapping, mark it clean in the child */
if (vma->vm_flags & VM_SHARED)
pte = pte_mkclean(pte);
- set_pte(dst_pte, pte_mkold(pte));
+ pte = pte_mkold(pte);
get_page(ptepage);
-
-cont_copy_pte_range: address += PAGE_SIZE;
+
+cont_copy_pte_range: set_pte(dst_pte, pte);
+cont_copy_pte_range_noset: address += PAGE_SIZE;
if (address >= end)
goto out;
src_pte++;
@@ -306,10 +306,9 @@ static inline int zap_pte_range(struct mm_struct *mm, pmd_t * pmd, unsigned long
pte_t page;
if (!size)
break;
- page = *pte;
+ page = ptep_get_and_clear(pte);
pte++;
size--;
- pte_clear(pte-1);
if (pte_none(page))
continue;
freed += free_pte(page);
@@ -642,7 +641,7 @@ static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
end = PMD_SIZE;
do {
pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot));
- pte_t oldpage = *pte;
+ pte_t oldpage = ptep_get_and_clear(pte);
set_pte(pte, zero_pte);
forget_pte(oldpage);
address += PAGE_SIZE;
@@ -712,8 +711,8 @@ static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned
end = PMD_SIZE;
do {
struct page *page;
- pte_t oldpage = *pte;
- pte_clear(pte);
+ pte_t oldpage;
+ oldpage = ptep_get_and_clear(pte);
page = virt_to_page(__va(phys_addr));
if ((!VALID_PAGE(page)) || PageReserved(page))
@@ -746,6 +745,7 @@ static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned l
return 0;
}
+/* Note: this is only safe if the mm semaphore is held when called. */
int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
int error = 0;
@@ -781,8 +781,8 @@ int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long
*/
static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry)
{
- flush_tlb_page(vma, address);
set_pte(page_table, entry);
+ flush_tlb_page(vma, address);
update_mmu_cache(vma, address, entry);
}
@@ -867,7 +867,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
/*
* Re-check the pte - we dropped the lock
*/
- if (pte_val(*page_table) == pte_val(pte)) {
+ if (pte_same(*page_table, pte)) {
if (PageReserved(old_page))
++mm->rss;
break_cow(vma, old_page, new_page, address, page_table);
@@ -1214,7 +1214,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
* didn't change from under us..
*/
spin_lock(&mm->page_table_lock);
- if (pte_val(entry) == pte_val(*pte)) {
+ if (pte_same(entry, *pte)) {
if (write_access) {
if (!pte_write(entry))
return do_wp_page(mm, vma, address, pte, entry);
diff --git a/mm/mlock.c b/mm/mlock.c
index a3d10ff99..f684a3c60 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -14,9 +14,9 @@
static inline int mlock_fixup_all(struct vm_area_struct * vma, int newflags)
{
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_flags = newflags;
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -36,11 +36,11 @@ static inline int mlock_fixup_start(struct vm_area_struct * vma,
get_file(n->vm_file);
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = end;
insert_vm_struct(current->mm, n);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -61,10 +61,10 @@ static inline int mlock_fixup_end(struct vm_area_struct * vma,
get_file(n->vm_file);
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_end = start;
insert_vm_struct(current->mm, n);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -96,7 +96,7 @@ static inline int mlock_fixup_middle(struct vm_area_struct * vma,
vma->vm_ops->open(left);
vma->vm_ops->open(right);
}
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = start;
vma->vm_end = end;
@@ -104,7 +104,7 @@ static inline int mlock_fixup_middle(struct vm_area_struct * vma,
vma->vm_raend = 0;
insert_vm_struct(current->mm, left);
insert_vm_struct(current->mm, right);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -183,9 +183,9 @@ static int do_mlock(unsigned long start, size_t len, int on)
break;
}
}
- vmlist_modify_lock(current->mm);
+ spin_lock(&current->mm->page_table_lock);
merge_segments(current->mm, start, end);
- vmlist_modify_unlock(current->mm);
+ spin_unlock(&current->mm->page_table_lock);
return error;
}
@@ -257,9 +257,9 @@ static int do_mlockall(int flags)
if (error)
break;
}
- vmlist_modify_lock(current->mm);
+ spin_lock(&current->mm->page_table_lock);
merge_segments(current->mm, 0, TASK_SIZE);
- vmlist_modify_unlock(current->mm);
+ spin_unlock(&current->mm->page_table_lock);
return error;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 9c0027563..c50de6ed8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -317,12 +317,12 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned lon
*/
flags = vma->vm_flags;
addr = vma->vm_start; /* can addr have changed?? */
- vmlist_modify_lock(mm);
+ spin_lock(&mm->page_table_lock);
insert_vm_struct(mm, vma);
if (correct_wcount)
atomic_inc(&file->f_dentry->d_inode->i_writecount);
merge_segments(mm, vma->vm_start, vma->vm_end);
- vmlist_modify_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {
@@ -534,11 +534,11 @@ static struct vm_area_struct * unmap_fixup(struct mm_struct *mm,
/* Work out to one of the ends. */
if (end == area->vm_end) {
area->vm_end = addr;
- vmlist_modify_lock(mm);
+ spin_lock(&mm->page_table_lock);
} else if (addr == area->vm_start) {
area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT;
area->vm_start = end;
- vmlist_modify_lock(mm);
+ spin_lock(&mm->page_table_lock);
} else {
/* Unmapping a hole: area->vm_start < addr <= end < area->vm_end */
/* Add end mapping -- leave beginning for below */
@@ -560,12 +560,12 @@ static struct vm_area_struct * unmap_fixup(struct mm_struct *mm,
if (mpnt->vm_ops && mpnt->vm_ops->open)
mpnt->vm_ops->open(mpnt);
area->vm_end = addr; /* Truncate area */
- vmlist_modify_lock(mm);
+ spin_lock(&mm->page_table_lock);
insert_vm_struct(mm, mpnt);
}
insert_vm_struct(mm, area);
- vmlist_modify_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
return extra;
}
@@ -670,7 +670,7 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
npp = (prev ? &prev->vm_next : &mm->mmap);
free = NULL;
- vmlist_modify_lock(mm);
+ spin_lock(&mm->page_table_lock);
for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) {
*npp = mpnt->vm_next;
mpnt->vm_next = free;
@@ -679,7 +679,7 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
avl_remove(mpnt, &mm->mmap_avl);
}
mm->mmap_cache = NULL; /* Kill the cache. */
- vmlist_modify_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
/* Ok - we have the memory areas we should free on the 'free' list,
* so release them, and unmap the page range..
@@ -811,10 +811,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
flags = vma->vm_flags;
addr = vma->vm_start;
- vmlist_modify_lock(mm);
+ spin_lock(&mm->page_table_lock);
insert_vm_struct(mm, vma);
merge_segments(mm, vma->vm_start, vma->vm_end);
- vmlist_modify_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {
@@ -840,10 +840,10 @@ void exit_mmap(struct mm_struct * mm)
struct vm_area_struct * mpnt;
release_segments(mm);
+ spin_lock(&mm->page_table_lock);
mpnt = mm->mmap;
- vmlist_modify_lock(mm);
mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL;
- vmlist_modify_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
mm->rss = 0;
mm->total_vm = 0;
mm->locked_vm = 0;
@@ -985,9 +985,9 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l
if (mpnt->vm_ops && mpnt->vm_ops->close) {
mpnt->vm_pgoff += (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
mpnt->vm_start = mpnt->vm_end;
- vmlist_modify_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
mpnt->vm_ops->close(mpnt);
- vmlist_modify_lock(mm);
+ spin_lock(&mm->page_table_lock);
}
mm->map_count--;
remove_shared_vm_struct(mpnt);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 53fc53acb..7b61abb3e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -30,9 +30,16 @@ static inline void change_pte_range(pmd_t * pmd, unsigned long address,
if (end > PMD_SIZE)
end = PMD_SIZE;
do {
- pte_t entry = *pte;
- if (pte_present(entry))
+ if (pte_present(*pte)) {
+ pte_t entry;
+
+ /* Avoid an SMP race with hardware updated dirty/clean
+ * bits by wiping the pte and then setting the new pte
+ * into place.
+ */
+ entry = ptep_get_and_clear(pte);
set_pte(pte, pte_modify(entry, newprot));
+ }
address += PAGE_SIZE;
pte++;
} while (address && (address < end));
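
The comment above is the heart of this change: on CPUs that set the accessed/dirty bits in hardware, a plain "read pte, modify, write back" can overwrite a dirty bit the MMU set in between, so the pte is first wiped with ptep_get_and_clear() (the entry becomes not-present, so the hardware faults instead of updating it) and only then is the new value installed. A loose user-space analogue with C11 atomics, just to show which ordering can lose the bit (hypothetical flag values; the real guarantee comes from the not-present window, which a user-space atomic cannot reproduce):

#include <stdatomic.h>
#include <stdio.h>

#define PTE_DIRTY 0x1u
#define PTE_RW    0x2u

static _Atomic unsigned int pte = PTE_RW;        /* clean, writable "pte" */

/* Racy version: if another agent (the MMU) ORs in PTE_DIRTY after the load
 * but before the store, that dirty bit is silently lost. */
static void change_prot_racy(unsigned int newprot)
{
	unsigned int old = atomic_load(&pte);
	atomic_store(&pte, (old & PTE_DIRTY) | newprot);
}

/* The diff's approach: atomically take the old value out, capturing any
 * dirty bit already set, then install the new value carrying that state.
 * In the kernel the cleared entry additionally forces the MMU to fault
 * rather than update a stale copy. */
static void change_prot_safe(unsigned int newprot)
{
	unsigned int old = atomic_exchange(&pte, 0);
	atomic_store(&pte, (old & PTE_DIRTY) | newprot);
}

int main(void)
{
	change_prot_racy(PTE_RW);    /* harmless single-threaded...           */
	change_prot_safe(0);         /* ...but only this form is MMU-safe     */
	printf("pte is now %#x\n", atomic_load(&pte));
	return 0;
}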
@@ -86,10 +93,10 @@ static void change_protection(unsigned long start, unsigned long end, pgprot_t n
static inline int mprotect_fixup_all(struct vm_area_struct * vma,
int newflags, pgprot_t prot)
{
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_flags = newflags;
vma->vm_page_prot = prot;
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -111,11 +118,11 @@ static inline int mprotect_fixup_start(struct vm_area_struct * vma,
get_file(n->vm_file);
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = end;
insert_vm_struct(current->mm, n);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -138,10 +145,10 @@ static inline int mprotect_fixup_end(struct vm_area_struct * vma,
get_file(n->vm_file);
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_end = start;
insert_vm_struct(current->mm, n);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -172,7 +179,7 @@ static inline int mprotect_fixup_middle(struct vm_area_struct * vma,
vma->vm_ops->open(left);
vma->vm_ops->open(right);
}
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = start;
vma->vm_end = end;
@@ -181,7 +188,7 @@ static inline int mprotect_fixup_middle(struct vm_area_struct * vma,
vma->vm_page_prot = prot;
insert_vm_struct(current->mm, left);
insert_vm_struct(current->mm, right);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -263,9 +270,9 @@ asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot
break;
}
}
- vmlist_modify_lock(current->mm);
+ spin_lock(&current->mm->page_table_lock);
merge_segments(current->mm, start, end);
- vmlist_modify_unlock(current->mm);
+ spin_unlock(&current->mm->page_table_lock);
out:
up(&current->mm->mmap_sem);
return error;
diff --git a/mm/mremap.c b/mm/mremap.c
index d1f6a7b8b..719ca1ec1 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -63,14 +63,14 @@ static inline int copy_one_pte(struct mm_struct *mm, pte_t * src, pte_t * dst)
pte_t pte;
spin_lock(&mm->page_table_lock);
- pte = *src;
- if (!pte_none(pte)) {
- error++;
- if (dst) {
- pte_clear(src);
- set_pte(dst, pte);
- error--;
+ if (!pte_none(*src)) {
+ pte = ptep_get_and_clear(src);
+ if (!dst) {
+ /* No dest? We must put it back. */
+ dst = src;
+ error++;
}
+ set_pte(dst, pte);
}
spin_unlock(&mm->page_table_lock);
return error;
@@ -141,10 +141,10 @@ static inline unsigned long move_vma(struct vm_area_struct * vma,
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
- vmlist_modify_lock(current->mm);
+ spin_lock(&current->mm->page_table_lock);
insert_vm_struct(current->mm, new_vma);
merge_segments(current->mm, new_vma->vm_start, new_vma->vm_end);
- vmlist_modify_unlock(current->mm);
+ spin_unlock(&current->mm->page_table_lock);
do_munmap(current->mm, addr, old_len);
current->mm->total_vm += new_len >> PAGE_SHIFT;
if (new_vma->vm_flags & VM_LOCKED) {
@@ -258,9 +258,9 @@ unsigned long do_mremap(unsigned long addr,
/* can we just expand the current mapping? */
if (max_addr - addr >= new_len) {
int pages = (new_len - old_len) >> PAGE_SHIFT;
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_end = addr + new_len;
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
current->mm->total_vm += pages;
if (vma->vm_flags & VM_LOCKED) {
current->mm->locked_vm += pages;
diff --git a/mm/numa.c b/mm/numa.c
index 06ad9ec63..47cb72ec6 100644
--- a/mm/numa.c
+++ b/mm/numa.c
@@ -11,11 +11,11 @@
int numnodes = 1; /* Initialized for UMA platforms */
-#ifndef CONFIG_DISCONTIGMEM
-
static bootmem_data_t contig_bootmem_data;
pg_data_t contig_page_data = { bdata: &contig_bootmem_data };
+#ifndef CONFIG_DISCONTIGMEM
+
/*
* This is meant to be invoked by platforms whose physical memory starts
* at a considerably higher value than 0. Examples are Super-H, ARM, m68k.
@@ -25,7 +25,7 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
unsigned long *zones_size, unsigned long zone_start_paddr,
unsigned long *zholes_size)
{
- free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size,
+ free_area_init_core(0, &contig_page_data, &mem_map, zones_size,
zone_start_paddr, zholes_size, pmap);
}
@@ -33,7 +33,11 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order)
{
+#ifdef CONFIG_NUMA
return __alloc_pages(NODE_DATA(nid)->node_zonelists + gfp_mask, order);
+#else
+ return alloc_pages(gfp_mask, order);
+#endif
}
#ifdef CONFIG_DISCONTIGMEM
@@ -42,13 +46,12 @@ struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order)
static spinlock_t node_lock = SPIN_LOCK_UNLOCKED;
-void show_free_areas_node(int nid)
+void show_free_areas_node(pg_data_t *pgdat)
{
unsigned long flags;
spin_lock_irqsave(&node_lock, flags);
- printk("Memory information for node %d:\n", nid);
- show_free_areas_core(nid);
+ show_free_areas_core(pgdat);
spin_unlock_irqrestore(&node_lock, flags);
}
@@ -75,10 +78,16 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
for (i = 0; i < MAX_NR_ZONES; i++)
size += zones_size[i];
size = LONG_ALIGN((size + 7) >> 3);
- pgdat->valid_addr_bitmap = (unsigned long *)alloc_bootmem_node(nid, size);
+ pgdat->valid_addr_bitmap = (unsigned long *)alloc_bootmem_node(pgdat, size);
memset(pgdat->valid_addr_bitmap, 0, size);
}
+static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask,
+ unsigned long order)
+{
+ return __alloc_pages(pgdat->node_zonelists + gfp_mask, order);
+}
+
/*
* This can be refined. Currently, tries to do round robin, instead
* should do concentric circle search, starting from the current node.
@@ -86,33 +95,34 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
struct page * alloc_pages(int gfp_mask, unsigned long order)
{
struct page *ret = 0;
- int startnode, tnode;
+ pg_data_t *start, *temp;
#ifndef CONFIG_NUMA
unsigned long flags;
- static int nextnid = 0;
+ static pg_data_t *next = 0;
#endif
if (order >= MAX_ORDER)
return NULL;
#ifdef CONFIG_NUMA
- tnode = numa_node_id();
+ temp = NODE_DATA(numa_node_id());
#else
spin_lock_irqsave(&node_lock, flags);
- tnode = nextnid;
- nextnid++;
- if (nextnid == numnodes)
- nextnid = 0;
+ if (!next) next = pgdat_list;
+ temp = next;
+ next = next->node_next;
spin_unlock_irqrestore(&node_lock, flags);
#endif
- startnode = tnode;
- while (tnode < numnodes) {
- if ((ret = alloc_pages_node(tnode++, gfp_mask, order)))
+ start = temp;
+ while (temp) {
+ if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
return(ret);
+ temp = temp->node_next;
}
- tnode = 0;
- while (tnode != startnode) {
- if ((ret = alloc_pages_node(tnode++, gfp_mask, order)))
+ temp = pgdat_list;
+ while (temp != start) {
+ if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
return(ret);
+ temp = temp->node_next;
}
return(0);
}
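
alloc_pages() now does its round robin over the pgdat_list chain rather than over node numbers: start at a chosen node, walk forward to the end of the list, then wrap around from the head until the walk is back at the starting node. The two-loop search reduced to a user-space sketch over a singly linked list (hypothetical struct node standing in for pg_data_t):

#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *next;
	int id;
	int free;                /* 1 if this node can satisfy the request */
};

/* Same shape as the new alloc_pages(): forward from "start" to the end of
 * the list, then from the head back up to (but not including) start. */
static struct node *search_from(struct node *head, struct node *start)
{
	struct node *n;

	for (n = start; n; n = n->next)
		if (n->free)
			return n;
	for (n = head; n != start; n = n->next)
		if (n->free)
			return n;
	return NULL;
}

int main(void)
{
	struct node n2 = { NULL, 2, 0 };
	struct node n1 = { &n2, 1, 0 };
	struct node n0 = { &n1, 0, 1 };           /* only node 0 has memory */

	struct node *hit = search_from(&n0, &n1); /* start at node 1, wrap to 0 */
	printf("allocated from node %d\n", hit ? hit->id : -1);
	return 0;
}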
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
new file mode 100644
index 000000000..9882fe7cd
--- /dev/null
+++ b/mm/oom_kill.c
@@ -0,0 +1,210 @@
+/*
+ * linux/mm/oom_kill.c
+ *
+ * Copyright (C) 1998,2000 Rik van Riel
+ * Thanks go out to Claus Fischer for some serious inspiration and
+ * for goading me into coding this file...
+ *
+ * The routines in this file are used to kill a process when
+ * we're seriously out of memory. This gets called from kswapd()
+ * in linux/mm/vmscan.c when we really run out of memory.
+ *
+ * Since we won't call these routines often (on a well-configured
+ * machine) this file will double as a 'coding guide' and a signpost
+ * for newbie kernel hackers. It features several pointers to major
+ * kernel subsystems and hints as to where to find out what things do.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/swap.h>
+#include <linux/swapctl.h>
+#include <linux/timex.h>
+
+/* #define DEBUG */
+
+/**
+ * int_sqrt - oom_kill.c internal function, rough approximation to sqrt
+ * @x: integer of which to calculate the sqrt
+ *
+ * A very rough approximation to the sqrt() function.
+ */
+static unsigned int int_sqrt(unsigned int x)
+{
+ unsigned int out = x;
+ while (x & ~(unsigned int)1) x >>=2, out >>=1;
+ if (x) out -= out >> 2;
+ return (out ? out : 1);
+}
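
To get a feel for how rough this is: feeding 100 through the loop shifts x down 100 -> 25 -> 6 -> 1 while out is halved 100 -> 50 -> 25 -> 12, and the final correction subtracts a quarter to give 9 — close enough to the exact square root of 10 for the heuristic below.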
+
+/**
+ * oom_badness - calculate a numeric value for how bad this task has been
+ * @p: task struct of the task whose badness we should calculate
+ *
+ * The formula used is relatively simple and documented inline in the
+ * function. The main rationale is that we want to select a good task
+ * to kill when we run out of memory.
+ *
+ * Good in this context means that:
+ * 1) we lose the minimum amount of work done
+ * 2) we recover a large amount of memory
+ * 3) we don't kill anything innocent of eating tons of memory
+ * 4) we want to kill the minimum number of processes (one)
+ * 5) we try to kill the process the user expects us to kill; this
+ * algorithm has been meticulously tuned to meet the principle
+ * of least surprise ... (be careful when you change it)
+ */
+
+static int badness(struct task_struct *p)
+{
+ int points, cpu_time, run_time;
+
+ if (!p->mm)
+ return 0;
+ /*
+ * The memory size of the process is the basis for the badness.
+ */
+ points = p->mm->total_vm;
+
+ /*
+ * CPU time is in seconds and run time is in minutes. There is no
+ * particular reason for this other than that it turned out to work
+ * very well in practice. This is not safe against jiffie wraps
+ * but we don't care _that_ much...
+ */
+ cpu_time = (p->times.tms_utime + p->times.tms_stime) >> (SHIFT_HZ + 3);
+ run_time = (jiffies - p->start_time) >> (SHIFT_HZ + 10);
+
+ points /= int_sqrt(cpu_time);
+ points /= int_sqrt(int_sqrt(run_time));
+
+ /*
+ * Niced processes are most likely less important, so double
+ * their badness points.
+ */
+ if (p->nice > 0)
+ points *= 2;
+
+ /*
+ * Superuser processes are usually more important, so we make it
+ * less likely that we kill those.
+ */
+ if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
+ p->uid == 0 || p->euid == 0)
+ points /= 4;
+
+ /*
+ * We don't want to kill a process with direct hardware access.
+ * Not only could that mess up the hardware, but usually users
+ * tend to only have this flag set on applications they think
+ * of as important.
+ */
+ if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
+ points /= 4;
+#ifdef DEBUG
+ printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
+ p->pid, p->comm, points);
+#endif
+ return points;
+}
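
The scoring above boils down to points = total_vm / int_sqrt(cpu_time) / int_sqrt(int_sqrt(run_time)), then doubled for niced tasks and quartered for each of the root/CAP_SYS_ADMIN and CAP_SYS_RAWIO cases. A stand-alone sketch of that arithmetic on made-up processes (user-space C; the struct fields mimic, but are not, the kernel's task_struct, and the time values are assumed to be pre-scaled to the seconds/minutes units badness() derives from jiffies):

#include <stdbool.h>
#include <stdio.h>

/* Same rough square root as oom_kill.c uses. */
static unsigned int int_sqrt(unsigned int x)
{
	unsigned int out = x;
	while (x & ~(unsigned int)1) x >>= 2, out >>= 1;
	if (x) out -= out >> 2;
	return out ? out : 1;
}

/* Hypothetical snapshot of a task. */
struct task_snapshot {
	unsigned int total_vm;      /* pages mapped                   */
	unsigned int cpu_seconds;
	unsigned int run_minutes;
	int          nice;
	bool         is_root;       /* uid 0 / CAP_SYS_ADMIN          */
	bool         raw_io;        /* CAP_SYS_RAWIO                  */
};

static int score(const struct task_snapshot *t)
{
	int points = t->total_vm;

	points /= int_sqrt(t->cpu_seconds);
	points /= int_sqrt(int_sqrt(t->run_minutes));
	if (t->nice > 0)
		points *= 2;
	if (t->is_root)
		points /= 4;
	if (t->raw_io)
		points /= 4;
	return points;
}

int main(void)
{
	/* A large, young, unprivileged memory hog vs. a long-running root
	 * daemon of the same size: the hog scores far higher. */
	struct task_snapshot hog     = { 50000, 1,   1,    0, false, false };
	struct task_snapshot rootd   = { 50000, 400, 3000, 0, true,  false };

	printf("hog scores %d, root daemon scores %d\n",
	       score(&hog), score(&rootd));
	return 0;
}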
+
+/*
+ * Simple selection loop. We chose the process with the highest
+ * number of 'points'. We need the locks to make sure that the
+ * list of task structs doesn't change while we look the other way.
+ *
+ * (not docbooked, we don't want this one cluttering up the manual)
+ */
+static struct task_struct * select_bad_process(void)
+{
+ int points = 0, maxpoints = 0;
+ struct task_struct *p = NULL;
+ struct task_struct *chosen = NULL;
+
+ read_lock(&tasklist_lock);
+ for_each_task(p)
+ {
+ if (p->pid)
+ points = badness(p);
+ if (points > maxpoints) {
+ chosen = p;
+ maxpoints = points;
+ }
+ }
+ read_unlock(&tasklist_lock);
+ return chosen;
+}
+
+/**
+ * oom_kill - kill the "best" process when we run out of memory
+ *
+ * If we run out of memory, we have the choice of either
+ * killing a random task (bad), letting the system crash (worse),
+ * or trying to be smart about which process to kill. Note that we
+ * don't have to be perfect here, we just have to be good.
+ *
+ * We must be careful, though, never to send SIGKILL to a process with
+ * CAP_SYS_RAWIO set; send SIGTERM instead (but it's unlikely that
+ * we select a process with CAP_SYS_RAWIO set).
+ */
+void oom_kill(void)
+{
+
+ struct task_struct *p = select_bad_process();
+
+ /* Found nothing?!?! Either we hang forever, or we panic. */
+ if (p == NULL)
+ panic("Out of memory and no killable processes...\n");
+
+ printk(KERN_ERR "Out of Memory: Killed process %d (%s).", p->pid, p->comm);
+
+ /*
+ * We give our sacrificial lamb high priority and access to
+ * all the memory it needs. That way it should be able to
+ * exit() and clear out its resources quickly...
+ */
+ p->counter = 5 * HZ;
+ p->flags |= PF_MEMALLOC;
+
+ /* This process has hardware access, be more careful. */
+ if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
+ force_sig(SIGTERM, p);
+ } else {
+ force_sig(SIGKILL, p);
+ }
+
+ /*
+ * Make kswapd go out of the way, so "p" has a good chance of
+ * killing itself before someone else gets the chance to ask
+ * for more memory.
+ */
+ current->policy |= SCHED_YIELD;
+ schedule();
+ return;
+}
+
+/**
+ * out_of_memory - is the system out of memory?
+ *
+ * Returns 0 if there is still enough memory left,
+ * 1 otherwise (we are out of memory).
+ */
+int out_of_memory(void)
+{
+ struct sysinfo swp_info;
+
+ /* Enough free memory? Not OOM. */
+ if (nr_free_pages() > freepages.min)
+ return 0;
+
+ if (nr_free_pages() + nr_inactive_clean_pages() > freepages.low)
+ return 0;
+
+ /* Enough swap space left? Not OOM. */
+ si_swapinfo(&swp_info);
+ if (swp_info.freeswap > 0)
+ return 0;
+
+ /* Else... */
+ return 1;
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0b5990a11..90c077439 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -17,13 +17,6 @@
#include <linux/pagemap.h>
#include <linux/bootmem.h>
-/* Use NUMNODES instead of numnodes for better code inside kernel APIs */
-#ifndef CONFIG_DISCONTIGMEM
-#define NUMNODES 1
-#else
-#define NUMNODES numnodes
-#endif
-
int nr_swap_pages;
int nr_active_pages;
int nr_inactive_dirty_pages;
@@ -294,7 +287,7 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
zone_t **zone;
int direct_reclaim = 0;
unsigned int gfp_mask = zonelist->gfp_mask;
- struct page * page = NULL;
+ struct page * page;
/*
* Allocations put pressure on the VM subsystem.
@@ -329,7 +322,7 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
* wake up bdflush.
*/
else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
- && nr_inactive_dirty_pages > freepages.high)
+ && nr_inactive_dirty_pages >= freepages.high)
wakeup_bdflush(0);
try_again:
@@ -347,7 +340,7 @@ try_again:
if (!z->size)
BUG();
- if (z->free_pages > z->pages_low) {
+ if (z->free_pages >= z->pages_low) {
page = rmqueue(z, order);
if (page)
return page;
@@ -517,17 +510,17 @@ try_again:
* happen when the OOM killer selects this task for
* instant execution...
*/
- if (direct_reclaim)
+ if (direct_reclaim) {
page = reclaim_page(z);
- if (page)
- return page;
+ if (page)
+ return page;
+ }
/* XXX: is pages_min/4 a good amount to reserve for this? */
if (z->free_pages < z->pages_min / 4 &&
!(current->flags & PF_MEMALLOC))
continue;
- if (!page)
- page = rmqueue(z, order);
+ page = rmqueue(z, order);
if (page)
return page;
}
@@ -588,12 +581,14 @@ unsigned int nr_free_pages (void)
{
unsigned int sum;
zone_t *zone;
- int i;
+ pg_data_t *pgdat = pgdat_list;
sum = 0;
- for (i = 0; i < NUMNODES; i++)
- for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++)
+ while (pgdat) {
+ for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
sum += zone->free_pages;
+ pgdat = pgdat->node_next;
+ }
return sum;
}
@@ -604,12 +599,14 @@ unsigned int nr_inactive_clean_pages (void)
{
unsigned int sum;
zone_t *zone;
- int i;
+ pg_data_t *pgdat = pgdat_list;
sum = 0;
- for (i = 0; i < NUMNODES; i++)
- for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++)
+ while (pgdat) {
+ for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
sum += zone->inactive_clean_pages;
+ pgdat = pgdat->node_next;
+ }
return sum;
}
@@ -644,11 +641,13 @@ unsigned int nr_free_buffer_pages (void)
#if CONFIG_HIGHMEM
unsigned int nr_free_highpages (void)
{
- int i;
+ pg_data_t *pgdat = pgdat_list;
unsigned int pages = 0;
- for (i = 0; i < NUMNODES; i++)
- pages += NODE_DATA(i)->node_zones[ZONE_HIGHMEM].free_pages;
+ while (pgdat) {
+ pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+ pgdat = pgdat->node_next;
+ }
return pages;
}
#endif
@@ -658,7 +657,7 @@ unsigned int nr_free_highpages (void)
* We also calculate the percentage fragmentation. We do this by counting the
* memory on each free list with the exception of the first item on the list.
*/
-void show_free_areas_core(int nid)
+void show_free_areas_core(pg_data_t *pgdat)
{
unsigned long order;
unsigned type;
@@ -678,7 +677,7 @@ void show_free_areas_core(int nid)
for (type = 0; type < MAX_NR_ZONES; type++) {
struct list_head *head, *curr;
- zone_t *zone = NODE_DATA(nid)->node_zones + type;
+ zone_t *zone = pgdat->node_zones + type;
unsigned long nr, total, flags;
total = 0;
@@ -710,7 +709,7 @@ void show_free_areas_core(int nid)
void show_free_areas(void)
{
- show_free_areas_core(0);
+ show_free_areas_core(pgdat_list);
}
/*
@@ -780,9 +779,6 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
unsigned long totalpages, offset, realtotalpages;
unsigned int cumulative = 0;
- pgdat->node_next = pgdat_list;
- pgdat_list = pgdat;
-
totalpages = 0;
for (i = 0; i < MAX_NR_ZONES; i++) {
unsigned long size = zones_size[i];
@@ -795,21 +791,6 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
printk("On node %d totalpages: %lu\n", nid, realtotalpages);
- /*
- * Select nr of pages we try to keep free for important stuff
- * with a minimum of 10 pages and a maximum of 256 pages, so
- * that we don't waste too much memory on large systems.
- * This is fairly arbitrary, but based on some behaviour
- * analysis.
- */
- i = realtotalpages >> 7;
- if (i < 10)
- i = 10;
- if (i > 256)
- i = 256;
- freepages.min += i;
- freepages.low += i * 2;
- freepages.high += i * 3;
memlist_init(&active_list);
memlist_init(&inactive_dirty_list);
@@ -822,7 +803,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
*/
map_size = (totalpages + 1)*sizeof(struct page);
if (lmem_map == (struct page *)0) {
- lmem_map = (struct page *) alloc_bootmem_node(nid, map_size);
+ lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
lmem_map = (struct page *)(PAGE_OFFSET +
MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
}
@@ -875,6 +856,20 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
zone->pages_min = mask;
zone->pages_low = mask*2;
zone->pages_high = mask*3;
+ /*
+ * Add these free targets to the global free target;
+ * we have to be SURE that freepages.high is higher
+ * than SUM [zone->pages_min] for all zones, otherwise
+ * we may have bad bad problems.
+ *
+ * This means we cannot make the freepages array writable
+ * in /proc, but have to add a separate extra_free_target
+ * for people who require it to catch load spikes in eg.
+ * gigabit ethernet routing...
+ */
+ freepages.min += mask;
+ freepages.low += mask*2;
+ freepages.high += mask*3;
zone->zone_mem_map = mem_map + offset;
zone->zone_start_mapnr = offset;
zone->zone_start_paddr = zone_start_paddr;
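
Concretely: each zone sets pages_min = mask and adds mask, 2*mask and 3*mask to freepages.min, .low and .high, so across all zones freepages.high grows by 3 * SUM[zone->pages_min] over its initial value — comfortably above the SUM[zone->pages_min] floor the comment above insists on.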
@@ -900,7 +895,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
bitmap_size = (bitmap_size + 7) >> 3;
bitmap_size = LONG_ALIGN(bitmap_size);
zone->free_area[i].map =
- (unsigned int *) alloc_bootmem_node(nid, bitmap_size);
+ (unsigned int *) alloc_bootmem_node(pgdat, bitmap_size);
}
}
build_zonelists(pgdat);
@@ -908,7 +903,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
void __init free_area_init(unsigned long *zones_size)
{
- free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size, 0, 0, 0);
+ free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
}
static int __init setup_mem_frac(char *str)
diff --git a/mm/swap.c b/mm/swap.c
index 8cb160b81..b4b9f76be 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -174,6 +174,7 @@ void deactivate_page_nolock(struct page * page)
*/
int maxcount = (page->buffers ? 3 : 2);
page->age = 0;
+ ClearPageReferenced(page);
/*
* Don't touch it if it's not on the active list.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fa4cb133e..688e2fcdd 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -223,10 +223,10 @@ static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
if (pte_page(pte) != page)
return;
/* We will be removing the swap cache in a moment, so... */
- set_pte(dir, pte_mkdirty(pte));
+ ptep_mkdirty(dir);
return;
}
- if (pte_val(pte) != entry.val)
+ if (pte_to_swp_entry(pte).val != entry.val)
return;
set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
@@ -315,12 +315,12 @@ static void unuse_process(struct mm_struct * mm,
*/
if (!mm)
return;
- vmlist_access_lock(mm);
+ spin_lock(&mm->page_table_lock);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
pgd_t * pgd = pgd_offset(mm, vma->vm_start);
unuse_vma(vma, pgd, entry, page);
}
- vmlist_access_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
return;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e8c557e04..15261612e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -34,8 +34,8 @@ static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned lo
if (end > PMD_SIZE)
end = PMD_SIZE;
do {
- pte_t page = *pte;
- pte_clear(pte);
+ pte_t page;
+ page = ptep_get_and_clear(pte);
address += PAGE_SIZE;
pte++;
if (pte_none(page))
@@ -142,15 +142,14 @@ inline int vmalloc_area_pages (unsigned long address, unsigned long size,
flush_cache_all();
do {
pmd_t *pmd;
- pgd_t olddir = *dir;
pmd = pmd_alloc_kernel(dir, address);
if (!pmd)
return -ENOMEM;
+
if (alloc_area_pmd(pmd, address, end - address, gfp_mask, prot))
return -ENOMEM;
- if (pgd_val(olddir) != pgd_val(*dir))
- set_pgdir(address, *dir);
+
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
} while (address && (address < end));
@@ -222,14 +221,11 @@ void * __vmalloc (unsigned long size, int gfp_mask, pgprot_t prot)
return NULL;
}
area = get_vm_area(size, VM_ALLOC);
- if (!area) {
- BUG();
+ if (!area)
return NULL;
- }
addr = area->addr;
if (vmalloc_area_pages(VMALLOC_VMADDR(addr), size, gfp_mask, prot)) {
vfree(addr);
- BUG();
return NULL;
}
return addr;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index aacd9a5b0..d7fd0aca8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -55,22 +55,8 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un
onlist = PageActive(page);
/* Don't look at this pte if it's been accessed recently. */
- if (pte_young(pte)) {
- set_pte(page_table, pte_mkold(pte));
- if (onlist) {
- /*
- * Transfer the "accessed" bit from the page
- * tables to the global page map. Page aging
- * will be done by refill_inactive_scan().
- */
- SetPageReferenced(page);
- } else {
- /*
- * The page is not on the active list, so
- * we have to do the page aging ourselves.
- */
- age_page_up(page);
- }
+ if (ptep_test_and_clear_young(page_table)) {
+ age_page_up(page);
goto out_failed;
}
if (!onlist)
@@ -88,6 +74,13 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un
if (TryLockPage(page))
goto out_failed;
+ /* From this point on, the odds are that we're going to
+ * nuke this pte, so read and clear the pte. This hook
+ * is needed on CPUs which update the accessed and dirty
+ * bits in hardware.
+ */
+ pte = ptep_get_and_clear(page_table);
+
/*
* Is the page already in the swap cache? If so, then
* we can just drop our reference to it without doing
@@ -124,7 +117,6 @@ drop_pte:
*/
if (!pte_dirty(pte)) {
flush_cache_page(vma, address);
- pte_clear(page_table);
goto drop_pte;
}
@@ -134,7 +126,7 @@ drop_pte:
* locks etc.
*/
if (!(gfp_mask & __GFP_IO))
- goto out_unlock;
+ goto out_unlock_restore;
/*
* Don't do any of the expensive stuff if
@@ -143,7 +135,7 @@ drop_pte:
if (page->zone->free_pages + page->zone->inactive_clean_pages
+ page->zone->inactive_dirty_pages
> page->zone->pages_high + inactive_target)
- goto out_unlock;
+ goto out_unlock_restore;
/*
* Ok, it's really dirty. That means that
@@ -169,10 +161,10 @@ drop_pte:
int error;
struct file *file = vma->vm_file;
if (file) get_file(file);
- pte_clear(page_table);
+
mm->rss--;
flush_tlb_page(vma, address);
- vmlist_access_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
error = swapout(page, file);
UnlockPage(page);
if (file) fput(file);
@@ -191,7 +183,7 @@ drop_pte:
*/
entry = get_swap_page();
if (!entry.val)
- goto out_unlock; /* No swap space left */
+ goto out_unlock_restore; /* No swap space left */
if (!(page = prepare_highmem_swapout(page)))
goto out_swap_free;
@@ -205,7 +197,7 @@ drop_pte:
mm->rss--;
set_pte(page_table, swp_entry_to_pte(entry));
flush_tlb_page(vma, address);
- vmlist_access_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
/* OK, do a physical asynchronous write to swap. */
rw_swap_page(WRITE, page, 0);
@@ -215,10 +207,12 @@ out_free_success:
page_cache_release(page);
return 1;
out_swap_free:
+ set_pte(page_table, pte);
swap_free(entry);
out_failed:
return 0;
-out_unlock:
+out_unlock_restore:
+ set_pte(page_table, pte);
UnlockPage(page);
return 0;
}
@@ -307,7 +301,7 @@ static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsi
unsigned long end;
/* Don't swap out areas which are locked down */
- if (vma->vm_flags & VM_LOCKED)
+ if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
return 0;
pgdir = pgd_offset(mm, address);
@@ -341,7 +335,7 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
* Find the proper vm-area after freezing the vma chain
* and ptes.
*/
- vmlist_access_lock(mm);
+ spin_lock(&mm->page_table_lock);
vma = find_vma(mm, address);
if (vma) {
if (address < vma->vm_start)
@@ -364,7 +358,7 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
mm->swap_cnt = 0;
out_unlock:
- vmlist_access_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
/* We didn't find anything for the process */
return 0;
@@ -790,7 +784,8 @@ int refill_inactive_scan(unsigned int priority, int oneshot)
*
* SUBTLE: we can have buffer pages with count 1.
*/
- if (page_count(page) <= (page->buffers ? 2 : 1)) {
+ if (page->age == 0 && page_count(page) <=
+ (page->buffers ? 2 : 1)) {
deactivate_page_nolock(page);
page_active = 0;
} else {
@@ -837,8 +832,9 @@ int free_shortage(void)
for(i = 0; i < MAX_NR_ZONES; i++) {
zone_t *zone = pgdat->node_zones+ i;
if (zone->size && (zone->inactive_clean_pages +
- zone->free_pages < zone->pages_min)) {
- sum += zone->pages_min;
+ zone->free_pages < zone->pages_min+1)) {
+ /* + 1 to have overlap with alloc_pages() !! */
+ sum += zone->pages_min + 1;
sum -= zone->free_pages;
sum -= zone->inactive_clean_pages;
}
@@ -1095,12 +1091,20 @@ int kswapd(void *unused)
* We go to sleep for one second, but if it's needed
* we'll be woken up earlier...
*/
- if (!free_shortage() || !inactive_shortage())
+ if (!free_shortage() || !inactive_shortage()) {
interruptible_sleep_on_timeout(&kswapd_wait, HZ);
/*
- * TODO: insert out of memory check & oom killer
- * invocation in an else branch here.
+ * If we couldn't free enough memory, we see if it was
+ * due to the system just not having enough memory.
+ * If that is the case, the only solution is to kill
+ * a process (the alternative is eternal deadlock).
+ *
+ * If there still is enough memory around, we just loop
+ * and try to free some more memory...
*/
+ } else if (out_of_memory()) {
+ oom_kill();
+ }
}
}