author    Ralf Baechle <ralf@linux-mips.org>    2000-10-05 01:18:40 +0000
committer Ralf Baechle <ralf@linux-mips.org>    2000-10-05 01:18:40 +0000
commit    012bb3e61e5eced6c610f9e036372bf0c8def2d1 (patch)
tree      87efc733f9b164e8c85c0336f92c8fb7eff6d183 /mm
parent    625a1589d3d6464b5d90b8a0918789e3afffd220 (diff)
Merge with Linux 2.4.0-test9. Please check DECstation; I had a number
of rejects to fix up while integrating Linus' patches. I also found that this kernel will only boot SMP on Origin; the UP kernel freezes soon after bootup with SCSI timeout messages. I'm committing this anyway, since I found that the last CVS versions had the same problem.
Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c     303
-rw-r--r--  mm/memory.c       71
-rw-r--r--  mm/mmap.c         14
-rw-r--r--  mm/mremap.c        4
-rw-r--r--  mm/numa.c          8
-rw-r--r--  mm/page_alloc.c  400
-rw-r--r--  mm/page_io.c       3
-rw-r--r--  mm/slab.c        126
-rw-r--r--  mm/swap.c        254
-rw-r--r--  mm/swap_state.c    6
-rw-r--r--  mm/vmalloc.c       2
-rw-r--r--  mm/vmscan.c      821
12 files changed, 1480 insertions(+), 532 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 977225432..6aca16409 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -44,9 +44,8 @@
atomic_t page_cache_size = ATOMIC_INIT(0);
unsigned int page_hash_bits;
struct page **page_hash_table;
-struct list_head lru_cache;
-static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
/*
* NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
* the pagemap_lru_lock held.
@@ -92,7 +91,7 @@ static inline int sync_page(struct page *page)
* sure the page is locked and that nobody else uses it - or that usage
* is safe.
*/
-static inline void __remove_inode_page(struct page *page)
+void __remove_inode_page(struct page *page)
{
remove_page_from_inode_queue(page);
remove_page_from_hash_queue(page);
@@ -146,9 +145,40 @@ void invalidate_inode_pages(struct inode * inode)
spin_unlock(&pagecache_lock);
}
-/*
+static inline void truncate_partial_page(struct page *page, unsigned partial)
+{
+ memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
+
+ if (page->buffers)
+ block_flushpage(page, partial);
+
+}
+
+static inline void truncate_complete_page(struct page *page)
+{
+ if (!page->buffers || block_flushpage(page, 0))
+ lru_cache_del(page);
+
+ /*
+ * We remove the page from the page cache _after_ we have
+ * destroyed all buffer-cache references to it. Otherwise some
+ * other process might think this inode page is not in the
+ * page cache and creates a buffer-cache alias to it causing
+ * all sorts of fun problems ...
+ */
+ ClearPageDirty(page);
+ remove_inode_page(page);
+ page_cache_release(page);
+}
+
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
* Truncate the page cache at a set offset, removing the pages
* that are beyond that offset (and zeroing out partial pages).
+ * If any page is locked we wait for it to become unlocked.
*/
void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
{
@@ -168,11 +198,10 @@ repeat:
page = list_entry(curr, struct page, list);
curr = curr->next;
-
offset = page->index;
- /* page wholly truncated - free it */
- if (offset >= start) {
+	/* Is this one of the pages to truncate? */
+ if ((offset >= start) || (partial && (offset + 1) == start)) {
if (TryLockPage(page)) {
page_cache_get(page);
spin_unlock(&pagecache_lock);
@@ -183,23 +212,14 @@ repeat:
page_cache_get(page);
spin_unlock(&pagecache_lock);
- if (!page->buffers || block_flushpage(page, 0))
- lru_cache_del(page);
-
- /*
- * We remove the page from the page cache
- * _after_ we have destroyed all buffer-cache
- * references to it. Otherwise some other process
- * might think this inode page is not in the
- * page cache and creates a buffer-cache alias
- * to it causing all sorts of fun problems ...
- */
- remove_inode_page(page);
- ClearPageDirty(page);
+ if (partial && (offset + 1) == start) {
+ truncate_partial_page(page, partial);
+ partial = 0;
+ } else
+ truncate_complete_page(page);
UnlockPage(page);
page_cache_release(page);
- page_cache_release(page);
/*
* We have done things without the pagecache lock,
@@ -210,176 +230,10 @@ repeat:
*/
goto repeat;
}
- /*
- * there is only one partial page possible.
- */
- if (!partial)
- continue;
-
- /* and it's the one preceeding the first wholly truncated page */
- if ((offset + 1) != start)
- continue;
-
- /* partial truncate, clear end of page */
- if (TryLockPage(page)) {
- spin_unlock(&pagecache_lock);
- goto repeat;
- }
- page_cache_get(page);
- spin_unlock(&pagecache_lock);
-
- memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
- if (page->buffers)
- block_flushpage(page, partial);
-
- partial = 0;
-
- /*
- * we have dropped the spinlock so we have to
- * restart.
- */
- UnlockPage(page);
- page_cache_release(page);
- goto repeat;
}
spin_unlock(&pagecache_lock);
}
-/*
- * nr_dirty represents the number of dirty pages that we will write async
- * before doing sync writes. We can only do sync writes if we can
- * wait for IO (__GFP_IO set).
- */
-int shrink_mmap(int priority, int gfp_mask)
-{
- int ret = 0, count, nr_dirty;
- struct list_head * page_lru;
- struct page * page = NULL;
-
- count = nr_lru_pages / (priority + 1);
- nr_dirty = priority;
-
- /* we need pagemap_lru_lock for list_del() ... subtle code below */
- spin_lock(&pagemap_lru_lock);
- while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
- page = list_entry(page_lru, struct page, lru);
- list_del(page_lru);
-
- if (PageTestandClearReferenced(page))
- goto dispose_continue;
-
- count--;
- /*
- * Avoid unscalable SMP locking for pages we can
- * immediate tell are untouchable..
- */
- if (!page->buffers && page_count(page) > 1)
- goto dispose_continue;
-
- if (TryLockPage(page))
- goto dispose_continue;
-
- /* Release the pagemap_lru lock even if the page is not yet
- queued in any lru queue since we have just locked down
- the page so nobody else may SMP race with us running
- a lru_cache_del() (lru_cache_del() always run with the
- page locked down ;). */
- spin_unlock(&pagemap_lru_lock);
-
- /* avoid freeing the page while it's locked */
- page_cache_get(page);
-
- /*
- * Is it a buffer page? Try to clean it up regardless
- * of zone - it's old.
- */
- if (page->buffers) {
- int wait;
- /*
- * 0 - free it if can do so without IO
- * 1 - start write-out of dirty buffers
- * 2 - wait for locked buffers
- */
- wait = (gfp_mask & __GFP_IO) ? (nr_dirty-- < 0) ? 2 : 1 : 0;
- if (!try_to_free_buffers(page, wait))
- goto unlock_continue;
- /* page was locked, inode can't go away under us */
- if (!page->mapping) {
- atomic_dec(&buffermem_pages);
- goto made_buffer_progress;
- }
- }
-
- /* Take the pagecache_lock spinlock held to avoid
- other tasks to notice the page while we are looking at its
- page count. If it's a pagecache-page we'll free it
- in one atomic transaction after checking its page count. */
- spin_lock(&pagecache_lock);
-
- /*
- * We can't free pages unless there's just one user
- * (count == 2 because we added one ourselves above).
- */
- if (page_count(page) != 2)
- goto cache_unlock_continue;
-
- /*
- * Is it a page swap page? If so, we want to
- * drop it if it is no longer used, even if it
- * were to be marked referenced..
- */
- if (PageSwapCache(page)) {
- spin_unlock(&pagecache_lock);
- __delete_from_swap_cache(page);
- goto made_inode_progress;
- }
-
- /*
- * Page is from a zone we don't care about.
- * Don't drop page cache entries in vain.
- */
- if (page->zone->free_pages > page->zone->pages_high)
- goto cache_unlock_continue;
-
- /* is it a page-cache page? */
- if (page->mapping) {
- if (!PageDirty(page) && !pgcache_under_min()) {
- __remove_inode_page(page);
- spin_unlock(&pagecache_lock);
- goto made_inode_progress;
- }
- goto cache_unlock_continue;
- }
-
- printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
-
-cache_unlock_continue:
- spin_unlock(&pagecache_lock);
-unlock_continue:
- spin_lock(&pagemap_lru_lock);
- UnlockPage(page);
- page_cache_release(page);
-dispose_continue:
- list_add(page_lru, &lru_cache);
- }
- goto out;
-
-made_inode_progress:
- page_cache_release(page);
-made_buffer_progress:
- UnlockPage(page);
- page_cache_release(page);
- ret = 1;
- spin_lock(&pagemap_lru_lock);
- /* nr_lru_pages needs the spinlock */
- nr_lru_pages--;
-
-out:
- spin_unlock(&pagemap_lru_lock);
-
- return ret;
-}
-
static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
{
goto inside;
@@ -394,7 +248,14 @@ inside:
if (page->index == offset)
break;
}
- SetPageReferenced(page);
+ /*
+ * Touching the page may move it to the active list.
+ * If we end up with too few inactive pages, we wake
+ * up kswapd.
+ */
+ age_page_up(page);
+ if (inactive_shortage() > inactive_target / 2 && free_shortage())
+ wakeup_kswapd(0);
not_found:
return page;
}
@@ -626,6 +487,7 @@ void ___wait_on_page(struct page *page)
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!PageLocked(page))
break;
+ run_task_queue(&tq_disk);
schedule();
} while (PageLocked(page));
tsk->state = TASK_RUNNING;
@@ -749,6 +611,53 @@ repeat:
#endif
/*
+ * We combine this with read-ahead to deactivate pages when we
+ * think there's sequential IO going on. Note that this is
+ * harmless since we don't actually evict the pages from memory
+ * but just move them to the inactive list.
+ *
+ * TODO:
+ * - make the readahead code smarter
+ * - move readahead to the VMA level so we can do the same
+ * trick with mmap()
+ *
+ * Rik van Riel, 2000
+ */
+static void drop_behind(struct file * file, unsigned long index)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+ struct page **hash;
+ struct page *page;
+ unsigned long start;
+
+ /* Nothing to drop-behind if we're on the first page. */
+ if (!index)
+ return;
+
+ if (index > file->f_rawin)
+ start = index - file->f_rawin;
+ else
+ start = 0;
+
+ /*
+ * Go backwards from index-1 and drop all pages in the
+ * readahead window. Since the readahead window may have
+ * been increased since the last time we were called, we
+ * stop when the page isn't there.
+ */
+ spin_lock(&pagecache_lock);
+ while (--index >= start) {
+ hash = page_hash(mapping, index);
+ page = __find_page_nolock(mapping, index, *hash);
+ if (!page)
+ break;
+ deactivate_page(page);
+ }
+ spin_unlock(&pagecache_lock);
+}
+
+/*
* Read-ahead profiling information
* --------------------------------
* Every PROFILE_MAXREADCOUNT, the following information is written
@@ -971,6 +880,12 @@ static void generic_file_readahead(int reada_ok,
if (filp->f_ramax > max_readahead)
filp->f_ramax = max_readahead;
+ /*
+ * Move the pages that have already been passed
+ * to the inactive list.
+ */
+ drop_behind(filp, index);
+
#ifdef PROFILE_READAHEAD
profile_readahead((reada_ok == 2), filp);
#endif
@@ -1074,6 +989,13 @@ found_page:
goto page_not_up_to_date;
generic_file_readahead(reada_ok, filp, inode, page);
page_ok:
+ /* If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (page->mapping->i_mmap_shared != NULL)
+ flush_dcache_page(page);
+
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
@@ -2002,10 +1924,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
* data it wants to keep. Be sure to free swap resources too. The
- * zap_page_range call sets things up for shrink_mmap to actually free
+ * zap_page_range call sets things up for refill_inactive to actually free
* these pages later if no one else has touched them in the meantime,
* although we could add these pages to a global reuse list for
- * shrink_mmap to pick up before reclaiming other pages.
+ * refill_inactive to pick up before reclaiming other pages.
*
* NB: This interface discards data rather than pushes it out to swap,
* as some implementations do. This has performance implications for
@@ -2530,6 +2452,7 @@ generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
unlock:
/* Mark it unlocked again and drop the page.. */
UnlockPage(page);
+ deactivate_page(page);
page_cache_release(page);
if (status < 0)
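
The filemap.c hunks above fold the old two-pass truncate loop into a single pass: pages wholly past the new end-of-file go through truncate_complete_page(), and the single page that straddles the new size goes through truncate_partial_page(). A minimal standalone sketch of that index arithmetic follows; the 4 KB page size and the classify() helper are assumptions for illustration, not kernel interfaces.

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

enum action { KEEP, TRIM_TAIL, DROP };

/* Same test as the patched loop: whole pages at or past 'start' are
 * dropped; the page just before 'start' is trimmed when the new size
 * ends inside it (partial != 0). */
static enum action classify(unsigned long offset, unsigned long start,
                             unsigned long partial)
{
	if (offset >= start)
		return DROP;
	if (partial && offset + 1 == start)
		return TRIM_TAIL;
	return KEEP;
}

int main(void)
{
	unsigned long lstart = 10000;	/* new file size in bytes */
	unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	unsigned long partial = lstart & (PAGE_CACHE_SIZE - 1);
	unsigned long offset;

	for (offset = 0; offset < 5; offset++) {
		enum action a = classify(offset, start, partial);
		if (a == DROP)
			printf("page %lu: truncate_complete_page()\n", offset);
		else if (a == TRIM_TAIL)
			printf("page %lu: truncate_partial_page(), zero from byte %lu\n",
			       offset, partial);
		else
			printf("page %lu: keep\n", offset);
	}
	return 0;
}

With a 4 KB page size and lstart = 10000, start is 3 and partial is 1808, so pages 3 and up are dropped and page 2 is zeroed from byte 1808 onward.
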
diff --git a/mm/memory.c b/mm/memory.c
index 83fc97cb3..6b047821d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -67,7 +67,7 @@ static inline void copy_cow_page(struct page * from, struct page * to, unsigned
copy_user_highpage(to, from, address);
}
-mem_map_t * mem_map = NULL;
+mem_map_t * mem_map;
/*
* Note: this doesn't free the actual pages themselves. That
@@ -924,33 +924,9 @@ static void partial_clear(struct vm_area_struct *vma, unsigned long address)
memclear_highpage_flush(page, offset, PAGE_SIZE - offset);
}
-/*
- * Handle all mappings that got truncated by a "truncate()"
- * system call.
- *
- * NOTE! We have to be ready to update the memory sharing
- * between the file and the memory map for a potential last
- * incomplete page. Ugly, but necessary.
- */
-void vmtruncate(struct inode * inode, loff_t offset)
+static void vmtruncate_list(struct vm_area_struct *mpnt,
+ unsigned long pgoff, unsigned long partial)
{
- unsigned long partial, pgoff;
- struct vm_area_struct * mpnt;
- struct address_space *mapping = inode->i_mapping;
- unsigned long limit;
-
- if (inode->i_size < offset)
- goto do_expand;
- inode->i_size = offset;
- truncate_inode_pages(mapping, offset);
- spin_lock(&mapping->i_shared_lock);
- if (!mapping->i_mmap)
- goto out_unlock;
-
- pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- partial = (unsigned long)offset & (PAGE_CACHE_SIZE - 1);
-
- mpnt = mapping->i_mmap;
do {
struct mm_struct *mm = mpnt->vm_mm;
unsigned long start = mpnt->vm_start;
@@ -983,6 +959,39 @@ void vmtruncate(struct inode * inode, loff_t offset)
zap_page_range(mm, start, len);
flush_tlb_range(mm, start, end);
} while ((mpnt = mpnt->vm_next_share) != NULL);
+}
+
+
+/*
+ * Handle all mappings that got truncated by a "truncate()"
+ * system call.
+ *
+ * NOTE! We have to be ready to update the memory sharing
+ * between the file and the memory map for a potential last
+ * incomplete page. Ugly, but necessary.
+ */
+void vmtruncate(struct inode * inode, loff_t offset)
+{
+ unsigned long partial, pgoff;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long limit;
+
+ if (inode->i_size < offset)
+ goto do_expand;
+ inode->i_size = offset;
+ truncate_inode_pages(mapping, offset);
+ spin_lock(&mapping->i_shared_lock);
+ if (!mapping->i_mmap && !mapping->i_mmap_shared)
+ goto out_unlock;
+
+ pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ partial = (unsigned long)offset & (PAGE_CACHE_SIZE - 1);
+
+ if (mapping->i_mmap != NULL)
+ vmtruncate_list(mapping->i_mmap, pgoff, partial);
+ if (mapping->i_mmap_shared != NULL)
+ vmtruncate_list(mapping->i_mmap_shared, pgoff, partial);
+
out_unlock:
spin_unlock(&mapping->i_shared_lock);
/* this should go into ->truncate */
@@ -1031,7 +1040,8 @@ void swapin_readahead(swp_entry_t entry)
num = valid_swaphandles(entry, &offset);
for (i = 0; i < num; offset++, i++) {
/* Don't block on I/O for read-ahead */
- if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster) {
+ if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster
+ * (1 << page_cluster)) {
while (i++ < num)
swap_free(SWP_ENTRY(SWP_TYPE(entry), offset++));
break;
@@ -1095,15 +1105,12 @@ static int do_swap_page(struct mm_struct * mm,
*/
static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
{
- int high = 0;
struct page *page = NULL;
pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
if (write_access) {
page = alloc_page(GFP_HIGHUSER);
if (!page)
return -1;
- if (PageHighMem(page))
- high = 1;
clear_user_highpage(page, addr);
entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
mm->rss++;
@@ -1233,7 +1240,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
pgd = pgd_offset(mm, address);
pmd = pmd_alloc(pgd, address);
-
+
if (pmd) {
pte_t * pte = pte_alloc(pmd, address);
if (pte)
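
The memory.c change above splits the share-list walk out of vmtruncate() into vmtruncate_list(), so that both mapping->i_mmap and the new mapping->i_mmap_shared chain get their mappings unmapped. A rough userspace sketch of that shape; the 'area' struct, the names and the printf stand in for the real vma walk, and the kernel helper uses a do/while because its callers guarantee a non-NULL head.

#include <stdio.h>
#include <stddef.h>

struct area {
	const char *name;
	struct area *next_share;	/* like vma->vm_next_share */
};

/* Walk one share chain; stands in for vmtruncate_list(). */
static void truncate_list(struct area *head)
{
	struct area *a;
	for (a = head; a != NULL; a = a->next_share)
		printf("unmapping truncated range of %s\n", a->name);
}

int main(void)
{
	struct area shared = { "shared mapping",    NULL };
	struct area privb  = { "private mapping B", NULL };
	struct area priva  = { "private mapping A", &privb };

	struct area *i_mmap        = &priva;	/* private mappings */
	struct area *i_mmap_shared = &shared;	/* VM_SHARED mappings */

	/* Mirrors the new NULL checks in vmtruncate(). */
	if (i_mmap != NULL)
		truncate_list(i_mmap);
	if (i_mmap_shared != NULL)
		truncate_list(i_mmap_shared);
	return 0;
}
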
diff --git a/mm/mmap.c b/mm/mmap.c
index 9667d19db..9c0027563 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -906,15 +906,21 @@ void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp)
if (file) {
struct inode * inode = file->f_dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
+ struct vm_area_struct **head;
+
if (vmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
+
+ head = &mapping->i_mmap;
+ if (vmp->vm_flags & VM_SHARED)
+ head = &mapping->i_mmap_shared;
/* insert vmp into inode's share list */
spin_lock(&mapping->i_shared_lock);
- if((vmp->vm_next_share = mapping->i_mmap) != NULL)
- mapping->i_mmap->vm_pprev_share = &vmp->vm_next_share;
- mapping->i_mmap = vmp;
- vmp->vm_pprev_share = &mapping->i_mmap;
+ if((vmp->vm_next_share = *head) != NULL)
+ (*head)->vm_pprev_share = &vmp->vm_next_share;
+ *head = vmp;
+ vmp->vm_pprev_share = head;
spin_unlock(&mapping->i_shared_lock);
}
}
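
insert_vm_struct() now files a VMA on mapping->i_mmap_shared rather than mapping->i_mmap when VM_SHARED is set, using the same pprev-style head insertion as before. A small sketch of that insertion pattern; struct node, FLAG_SHARED and insert_share() are invented for the example.

#include <stdio.h>
#include <stddef.h>

#define FLAG_SHARED 0x1

struct node {
	int flags;
	struct node *next;	/* vm_next_share  */
	struct node **pprev;	/* vm_pprev_share */
};

static void insert_share(struct node **i_mmap, struct node **i_mmap_shared,
                         struct node *n)
{
	struct node **head = i_mmap;

	if (n->flags & FLAG_SHARED)
		head = i_mmap_shared;

	/* Classic pprev insertion: link n in front of *head and keep the
	 * back-pointer so a later unlink needs no list walk. */
	if ((n->next = *head) != NULL)
		(*head)->pprev = &n->next;
	*head = n;
	n->pprev = head;
}

int main(void)
{
	struct node *i_mmap = NULL, *i_mmap_shared = NULL;
	struct node a = { 0, NULL, NULL }, b = { FLAG_SHARED, NULL, NULL };

	insert_share(&i_mmap, &i_mmap_shared, &a);
	insert_share(&i_mmap, &i_mmap_shared, &b);

	printf("private head: %p, shared head: %p\n",
	       (void *)i_mmap, (void *)i_mmap_shared);
	return 0;
}

The pprev back-pointer is what lets a later unlink run in constant time without knowing which of the two lists the node sits on.
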
diff --git a/mm/mremap.c b/mm/mremap.c
index a48125178..d1f6a7b8b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -225,6 +225,10 @@ unsigned long do_mremap(unsigned long addr,
/* We can't remap across vm area boundaries */
if (old_len > vma->vm_end - addr)
goto out;
+ if (vma->vm_flags & VM_DONTEXPAND) {
+ if (new_len > old_len)
+ goto out;
+ }
if (vma->vm_flags & VM_LOCKED) {
unsigned long locked = current->mm->locked_vm << PAGE_SHIFT;
locked += new_len - old_len;
diff --git a/mm/numa.c b/mm/numa.c
index bbe9ec6fb..06ad9ec63 100644
--- a/mm/numa.c
+++ b/mm/numa.c
@@ -21,12 +21,12 @@ pg_data_t contig_page_data = { bdata: &contig_bootmem_data };
* at a considerably higher value than 0. Examples are Super-H, ARM, m68k.
* Should be invoked with paramters (0, 0, unsigned long *[], start_paddr).
*/
-void __init free_area_init_node(int nid, pg_data_t *pgdat,
+void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
unsigned long *zones_size, unsigned long zone_start_paddr,
unsigned long *zholes_size)
{
free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size,
- zone_start_paddr, zholes_size);
+ zone_start_paddr, zholes_size, pmap);
}
#endif /* !CONFIG_DISCONTIGMEM */
@@ -55,7 +55,7 @@ void show_free_areas_node(int nid)
/*
* Nodes can be initialized parallely, in no particular order.
*/
-void __init free_area_init_node(int nid, pg_data_t *pgdat,
+void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
unsigned long *zones_size, unsigned long zone_start_paddr,
unsigned long *zholes_size)
{
@@ -66,7 +66,7 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat,
mem_map = (mem_map_t *)PAGE_OFFSET;
free_area_init_core(nid, pgdat, &discard, zones_size, zone_start_paddr,
- zholes_size);
+ zholes_size, pmap);
pgdat->node_id = nid;
/*
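
The numa.c wrappers now take a pmap argument and pass it through to free_area_init_core(), so a platform can hand in a mem_map it allocated itself; passing 0/NULL keeps the old behaviour of allocating the map from bootmem (see the lmem_map hunk in page_alloc.c below). A sketch of that caller-may-supply-the-buffer pattern, with calloc() standing in for alloc_bootmem_node() and all names invented.

#include <stdio.h>
#include <stdlib.h>

struct page_stub { unsigned long flags; };

static struct page_stub *init_map(struct page_stub *pmap, size_t npages)
{
	if (pmap == NULL)		/* old behaviour: allocate the map here */
		pmap = calloc(npages, sizeof(*pmap));
	return pmap;			/* new behaviour: use the caller's map */
}

int main(void)
{
	struct page_stub preallocated[16];

	struct page_stub *a = init_map(NULL, 16);		/* fallback path   */
	struct page_stub *b = init_map(preallocated, 16);	/* caller-supplied */

	printf("allocated=%p caller=%p\n", (void *)a, (void *)b);
	free(a);
	return 0;
}
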
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8b74a73db..0b5990a11 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -25,7 +25,8 @@
#endif
int nr_swap_pages;
-int nr_lru_pages;
+int nr_active_pages;
+int nr_inactive_dirty_pages;
pg_data_t *pgdat_list;
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -33,6 +34,8 @@ static int zone_balance_ratio[MAX_NR_ZONES] = { 32, 128, 128, };
static int zone_balance_min[MAX_NR_ZONES] = { 10 , 10, 10, };
static int zone_balance_max[MAX_NR_ZONES] = { 255 , 255, 255, };
+struct list_head active_list;
+struct list_head inactive_dirty_list;
/*
* Free_page() adds the page to the free lists. This is optimized for
* fast normal cases (no error jumps taken normally).
@@ -96,7 +99,16 @@ static void __free_pages_ok (struct page *page, unsigned long order)
BUG();
if (PageDirty(page))
BUG();
+ if (PageActive(page))
+ BUG();
+ if (PageInactiveDirty(page))
+ BUG();
+ if (PageInactiveClean(page))
+ BUG();
+ page->flags &= ~(1<<PG_referenced);
+ page->age = PAGE_AGE_START;
+
zone = page->zone;
mask = (~0UL) << order;
@@ -142,10 +154,13 @@ static void __free_pages_ok (struct page *page, unsigned long order)
spin_unlock_irqrestore(&zone->lock, flags);
- if (zone->free_pages > zone->pages_high) {
- zone->zone_wake_kswapd = 0;
- zone->low_on_memory = 0;
- }
+ /*
+ * We don't want to protect this variable from race conditions
+ * since it's nothing important, but we do want to make sure
+ * it never gets negative.
+ */
+ if (memory_pressure > NR_CPUS)
+ memory_pressure--;
}
#define MARK_USED(index, order, area) \
@@ -203,6 +218,7 @@ static struct page * rmqueue(zone_t *zone, unsigned long order)
set_page_count(page, 1);
if (BAD_RANGE(zone,page))
BUG();
+ DEBUG_ADD_PAGE
return page;
}
curr_order++;
@@ -213,13 +229,77 @@ static struct page * rmqueue(zone_t *zone, unsigned long order)
return NULL;
}
+#define PAGES_MIN 0
+#define PAGES_LOW 1
+#define PAGES_HIGH 2
+
+/*
+ * This function does the dirty work for __alloc_pages
+ * and is separated out to keep the code size smaller.
+ * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
+ */
+static struct page * __alloc_pages_limit(zonelist_t *zonelist,
+ unsigned long order, int limit, int direct_reclaim)
+{
+ zone_t **zone = zonelist->zones;
+
+ for (;;) {
+ zone_t *z = *(zone++);
+ unsigned long water_mark;
+
+ if (!z)
+ break;
+ if (!z->size)
+ BUG();
+
+ /*
+ * We allocate if the number of free + inactive_clean
+ * pages is above the watermark.
+ */
+ switch (limit) {
+ default:
+ case PAGES_MIN:
+ water_mark = z->pages_min;
+ break;
+ case PAGES_LOW:
+ water_mark = z->pages_low;
+ break;
+ case PAGES_HIGH:
+ water_mark = z->pages_high;
+ }
+
+ if (z->free_pages + z->inactive_clean_pages > water_mark) {
+ struct page *page = NULL;
+ /* If possible, reclaim a page directly. */
+ if (direct_reclaim && z->free_pages < z->pages_min + 8)
+ page = reclaim_page(z);
+ /* If that fails, fall back to rmqueue. */
+ if (!page)
+ page = rmqueue(z, order);
+ if (page)
+ return page;
+ }
+ }
+
+ /* Found nothing. */
+ return NULL;
+}
+
+
/*
* This is the 'heart' of the zoned buddy allocator:
*/
struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
{
zone_t **zone;
- extern wait_queue_head_t kswapd_wait;
+ int direct_reclaim = 0;
+ unsigned int gfp_mask = zonelist->gfp_mask;
+ struct page * page = NULL;
+
+ /*
+ * Allocations put pressure on the VM subsystem.
+ */
+ memory_pressure++;
/*
* (If anyone calls gfp from interrupts nonatomically then it
@@ -229,6 +309,36 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
* in a higher zone fails.
*/
+ /*
+ * Can we take pages directly from the inactive_clean
+ * list?
+ */
+ if (order == 0 && (gfp_mask & __GFP_WAIT) &&
+ !(current->flags & PF_MEMALLOC))
+ direct_reclaim = 1;
+
+ /*
+ * If we are about to get low on free pages and we also have
+ * an inactive page shortage, wake up kswapd.
+ */
+ if (inactive_shortage() > inactive_target / 2 && free_shortage())
+ wakeup_kswapd(0);
+ /*
+ * If we are about to get low on free pages and cleaning
+ * the inactive_dirty pages would fix the situation,
+ * wake up bdflush.
+ */
+ else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
+ && nr_inactive_dirty_pages > freepages.high)
+ wakeup_bdflush(0);
+
+try_again:
+ /*
+ * First, see if we have any zones with lots of free memory.
+ *
+ * We allocate free memory first because it doesn't contain
+ * any data ... DUH!
+ */
zone = zonelist->zones;
for (;;) {
zone_t *z = *(zone++);
@@ -237,82 +347,193 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
if (!z->size)
BUG();
- /* Are we supposed to free memory? Don't make it worse.. */
- if (!z->zone_wake_kswapd) {
- struct page *page = rmqueue(z, order);
- if (z->free_pages < z->pages_low) {
- z->zone_wake_kswapd = 1;
- if (waitqueue_active(&kswapd_wait))
- wake_up_interruptible(&kswapd_wait);
- }
+ if (z->free_pages > z->pages_low) {
+ page = rmqueue(z, order);
if (page)
return page;
+ } else if (z->free_pages < z->pages_min &&
+ waitqueue_active(&kreclaimd_wait)) {
+ wake_up_interruptible(&kreclaimd_wait);
}
}
- /* Three possibilities to get here
- * - Previous alloc_pages resulted in last zone set to have
- * zone_wake_kswapd and start it. kswapd has not been able
- * to release enough pages so that one zone does not have
- * zone_wake_kswapd set.
- * - Different sets of zones (zonelist)
- * previous did not have all zones with zone_wake_kswapd but
- * this one has... should kswapd be woken up? it will run once.
- * - SMP race, kswapd went to sleep slightly after it as running
- * in 'if (waitqueue_active(...))' above.
- * + anyway the test is very cheap to do...
+ /*
+ * Try to allocate a page from a zone with a HIGH
+ * amount of free + inactive_clean pages.
+ *
+ * If there is a lot of activity, inactive_target
+ * will be high and we'll have a good chance of
+ * finding a page using the HIGH limit.
*/
- if (waitqueue_active(&kswapd_wait))
- wake_up_interruptible(&kswapd_wait);
+ page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
+ if (page)
+ return page;
/*
- * Ok, we don't have any zones that don't need some
- * balancing.. See if we have any that aren't critical..
+ * Then try to allocate a page from a zone with more
+ * than zone->pages_low free + inactive_clean pages.
+ *
+ * When the working set is very large and VM activity
+ * is low, we're most likely to have our allocation
+ * succeed here.
*/
- zone = zonelist->zones;
- for (;;) {
- zone_t *z = *(zone++);
- if (!z)
- break;
- if (!z->low_on_memory) {
- struct page *page = rmqueue(z, order);
- if (z->free_pages < z->pages_min)
- z->low_on_memory = 1;
- if (page)
- return page;
- }
+ page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
+ if (page)
+ return page;
+
+ /*
+ * OK, none of the zones on our zonelist has lots
+ * of pages free.
+ *
+ * We wake up kswapd, in the hope that kswapd will
+ * resolve this situation before memory gets tight.
+ *
+ * We also yield the CPU, because that:
+ * - gives kswapd a chance to do something
+ * - slows down allocations, in particular the
+ * allocations from the fast allocator that's
+ * causing the problems ...
+ * - ... which minimises the impact the "bad guys"
+ * have on the rest of the system
+ * - if we don't have __GFP_IO set, kswapd may be
+ * able to free some memory we can't free ourselves
+ */
+ wakeup_kswapd(0);
+ if (gfp_mask & __GFP_WAIT) {
+ __set_current_state(TASK_RUNNING);
+ current->policy |= SCHED_YIELD;
+ schedule();
}
/*
- * Uhhuh. All the zones have been critical, which means that
- * we'd better do some synchronous swap-out. kswapd has not
- * been able to cope..
+ * After waking up kswapd, we try to allocate a page
+ * from any zone which isn't critical yet.
+ *
+ * Kswapd should, in most situations, bring the situation
+ * back to normal in no time.
+ */
+ page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
+ if (page)
+ return page;
+
+ /*
+ * Damn, we didn't succeed.
+ *
+ * This can be due to 2 reasons:
+ * - we're doing a higher-order allocation
+ * --> move pages to the free list until we succeed
+ * - we're /really/ tight on memory
+ * --> wait on the kswapd waitqueue until memory is freed
*/
if (!(current->flags & PF_MEMALLOC)) {
- int gfp_mask = zonelist->gfp_mask;
- if (!try_to_free_pages(gfp_mask)) {
- if (!(gfp_mask & __GFP_HIGH))
- goto fail;
+ /*
+ * Are we dealing with a higher order allocation?
+ *
+ * Move pages from the inactive_clean to the free list
+ * in the hope of creating a large, physically contiguous
+ * piece of free memory.
+ */
+ if (order > 0 && (gfp_mask & __GFP_WAIT)) {
+ zone = zonelist->zones;
+ /* First, clean some dirty pages. */
+ page_launder(gfp_mask, 1);
+ for (;;) {
+ zone_t *z = *(zone++);
+ if (!z)
+ break;
+ if (!z->size)
+ continue;
+ while (z->inactive_clean_pages) {
+ struct page * page;
+ /* Move one page to the free list. */
+ page = reclaim_page(z);
+ if (!page)
+ break;
+ __free_page(page);
+ /* Try if the allocation succeeds. */
+ page = rmqueue(z, order);
+ if (page)
+ return page;
+ }
+ }
}
+ /*
+ * When we arrive here, we are really tight on memory.
+ *
+ * We wake up kswapd and sleep until kswapd wakes us
+ * up again. After that we loop back to the start.
+ *
+ * We have to do this because something else might eat
+ * the memory kswapd frees for us and we need to be
+ * reliable. Note that we don't loop back for higher
+ * order allocations since it is possible that kswapd
+ * simply cannot free a large enough contiguous area
+ * of memory *ever*.
+ */
+ if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {
+ wakeup_kswapd(1);
+ memory_pressure++;
+ if (!order)
+ goto try_again;
+ /*
+ * If __GFP_IO isn't set, we can't wait on kswapd because
+ * kswapd just might need some IO locks /we/ are holding ...
+ *
+ * SUBTLE: The scheduling point above makes sure that
+ * kswapd does get the chance to free memory we can't
+ * free ourselves...
+ */
+ } else if (gfp_mask & __GFP_WAIT) {
+ try_to_free_pages(gfp_mask);
+ memory_pressure++;
+ if (!order)
+ goto try_again;
+ }
+
}
/*
* Final phase: allocate anything we can!
+ *
+ * Higher order allocations, GFP_ATOMIC allocations and
+ * recursive allocations (PF_MEMALLOC) end up here.
+ *
+ * Only recursive allocations can use the very last pages
+ * in the system, otherwise it would be just too easy to
+ * deadlock the system...
*/
zone = zonelist->zones;
for (;;) {
- struct page *page;
-
zone_t *z = *(zone++);
+ struct page * page = NULL;
if (!z)
break;
- page = rmqueue(z, order);
+ if (!z->size)
+ BUG();
+
+ /*
+ * SUBTLE: direct_reclaim is only possible if the task
+ * becomes PF_MEMALLOC while looping above. This will
+ * happen when the OOM killer selects this task for
+ * instant execution...
+ */
+ if (direct_reclaim)
+ page = reclaim_page(z);
+ if (page)
+ return page;
+
+ /* XXX: is pages_min/4 a good amount to reserve for this? */
+ if (z->free_pages < z->pages_min / 4 &&
+ !(current->flags & PF_MEMALLOC))
+ continue;
+ if (!page)
+ page = rmqueue(z, order);
if (page)
return page;
}
-fail:
/* No luck.. */
+ printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
return NULL;
}
@@ -377,18 +598,46 @@ unsigned int nr_free_pages (void)
}
/*
- * Amount of free RAM allocatable as buffer memory:
+ * Total amount of inactive_clean (allocatable) RAM:
*/
-unsigned int nr_free_buffer_pages (void)
+unsigned int nr_inactive_clean_pages (void)
{
unsigned int sum;
zone_t *zone;
int i;
- sum = nr_lru_pages / 3;
+ sum = 0;
for (i = 0; i < NUMNODES; i++)
- for (zone = NODE_DATA(i)->node_zones; zone <= NODE_DATA(i)->node_zones+ZONE_NORMAL; zone++)
- sum += zone->free_pages;
+ for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++)
+ sum += zone->inactive_clean_pages;
+ return sum;
+}
+
+/*
+ * Amount of free RAM allocatable as buffer memory:
+ */
+unsigned int nr_free_buffer_pages (void)
+{
+ unsigned int sum;
+
+ sum = nr_free_pages();
+ sum += nr_inactive_clean_pages();
+ sum += nr_inactive_dirty_pages;
+
+ /*
+ * Keep our write behind queue filled, even if
+ * kswapd lags a bit right now.
+ */
+ if (sum < freepages.high + inactive_target)
+ sum = freepages.high + inactive_target;
+ /*
+ * We don't want dirty page writebehind to put too
+ * much pressure on the working set, but we want it
+ * to be possible to have some dirty pages in the
+ * working set without upsetting the writebehind logic.
+ */
+ sum += nr_active_pages >> 4;
+
return sum;
}
@@ -418,9 +667,11 @@ void show_free_areas_core(int nid)
nr_free_pages() << (PAGE_SHIFT-10),
nr_free_highpages() << (PAGE_SHIFT-10));
- printk("( Free: %d, lru_cache: %d (%d %d %d) )\n",
+ printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d (%d %d %d) )\n",
+ nr_active_pages,
+ nr_inactive_dirty_pages,
+ nr_inactive_clean_pages(),
nr_free_pages(),
- nr_lru_pages,
freepages.min,
freepages.low,
freepages.high);
@@ -430,17 +681,6 @@ void show_free_areas_core(int nid)
zone_t *zone = NODE_DATA(nid)->node_zones + type;
unsigned long nr, total, flags;
- printk(" %c%d%d %s: ",
- (zone->free_pages > zone->pages_low
- ? (zone->free_pages > zone->pages_high
- ? ' '
- : 'H')
- : (zone->free_pages > zone->pages_min
- ? 'M'
- : 'L')),
- zone->zone_wake_kswapd, zone->low_on_memory,
- zone->name);
-
total = 0;
if (zone->size) {
spin_lock_irqsave(&zone->lock, flags);
@@ -532,9 +772,9 @@ static inline void build_zonelists(pg_data_t *pgdat)
*/
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
unsigned long *zones_size, unsigned long zone_start_paddr,
- unsigned long *zholes_size)
+ unsigned long *zholes_size, struct page *lmem_map)
{
- struct page *p, *lmem_map;
+ struct page *p;
unsigned long i, j;
unsigned long map_size;
unsigned long totalpages, offset, realtotalpages;
@@ -570,7 +810,8 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
freepages.min += i;
freepages.low += i * 2;
freepages.high += i * 3;
- memlist_init(&lru_cache);
+ memlist_init(&active_list);
+ memlist_init(&inactive_dirty_list);
/*
* Some architectures (with lots of mem and discontinous memory
@@ -580,9 +821,11 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
* boundary, so that MAP_NR works.
*/
map_size = (totalpages + 1)*sizeof(struct page);
- lmem_map = (struct page *) alloc_bootmem_node(nid, map_size);
- lmem_map = (struct page *)(PAGE_OFFSET +
+ if (lmem_map == (struct page *)0) {
+ lmem_map = (struct page *) alloc_bootmem_node(nid, map_size);
+ lmem_map = (struct page *)(PAGE_OFFSET +
MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
+ }
*gmap = pgdat->node_mem_map = lmem_map;
pgdat->node_size = totalpages;
pgdat->node_start_paddr = zone_start_paddr;
@@ -616,6 +859,9 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
zone->lock = SPIN_LOCK_UNLOCKED;
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
+ zone->inactive_clean_pages = 0;
+ zone->inactive_dirty_pages = 0;
+ memlist_init(&zone->inactive_clean_list);
if (!size)
continue;
@@ -629,8 +875,6 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
zone->pages_min = mask;
zone->pages_low = mask*2;
zone->pages_high = mask*3;
- zone->low_on_memory = 0;
- zone->zone_wake_kswapd = 0;
zone->zone_mem_map = mem_map + offset;
zone->zone_start_mapnr = offset;
zone->zone_start_paddr = zone_start_paddr;
@@ -664,7 +908,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
void __init free_area_init(unsigned long *zones_size)
{
- free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size, 0, 0);
+ free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size, 0, 0, 0);
}
static int __init setup_mem_frac(char *str)
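
The heart of the page_alloc.c rewrite is the watermark ladder: __alloc_pages() first tries every zone against PAGES_HIGH, then PAGES_LOW, wakes kswapd and yields, and only then falls back to PAGES_MIN and the really-tight paths, each time counting free plus inactive_clean pages against the mark. A standalone sketch of that ladder; the zone numbers and the try_alloc() helper are made up, only the ordering and the "free + inactive_clean > watermark" test mirror the patch.

#include <stdio.h>

struct zone_stub {
	const char *name;
	long free, inactive_clean;
	long pages_min, pages_low, pages_high;
};

static int try_alloc(struct zone_stub *z, long watermark)
{
	/* Allocate only if free + instantly-reclaimable pages stay above
	 * the requested watermark. */
	return z->free + z->inactive_clean > watermark;
}

int main(void)
{
	struct zone_stub z = { "Normal", 120, 40, 64, 128, 192 };
	long marks[3] = { z.pages_high, z.pages_low, z.pages_min };
	const char *names[3] = { "PAGES_HIGH", "PAGES_LOW", "PAGES_MIN" };
	int i;

	for (i = 0; i < 3; i++) {
		if (i == 2)
			printf("waking kswapd before the last, critical attempt\n");
		if (try_alloc(&z, marks[i])) {
			printf("allocated from %s at %s\n", z.name, names[i]);
			return 0;
		}
		printf("%s refused at %s (%ld+%ld <= %ld)\n",
		       z.name, names[i], z.free, z.inactive_clean, marks[i]);
	}
	printf("falling through to the really-tight-on-memory path\n");
	return 0;
}

With the numbers above the allocation succeeds at PAGES_LOW; shrink the zone's counters and the sketch walks further down the ladder, just as the real allocator does.
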
diff --git a/mm/page_io.c b/mm/page_io.c
index 25ed62221..185e19247 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -43,7 +43,8 @@ static int rw_swap_page_base(int rw, swp_entry_t entry, struct page *page, int w
struct inode *swapf = 0;
/* Don't allow too many pending pages in flight.. */
- if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster)
+ if ((rw == WRITE) && atomic_read(&nr_async_pages) >
+ pager_daemon.swap_cluster * (1 << page_cluster))
wait = 1;
if (rw == READ) {
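
The page_io.c change throttles only swap writes and raises the in-flight limit from swap_cluster pages to swap_cluster * (1 << page_cluster). A worked example of the new limit, with both values assumed rather than read from a live system: swap_cluster is set to 8 in the mm/swap.c hunk below, and page_cluster is chosen by swap_setup() from the machine's memory size.

#include <stdio.h>

int main(void)
{
	int swap_cluster = 8;	/* pager_daemon.swap_cluster after this patch */
	int page_cluster = 4;	/* assumed; picked by swap_setup() at boot */
	int limit = swap_cluster * (1 << page_cluster);

	/* A writer starts waiting once more than 'limit' async swap pages
	 * are in flight; reads are no longer throttled by this test. */
	printf("async swap write limit: %d pages\n", limit);	/* 128 */
	return 0;
}
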
diff --git a/mm/slab.c b/mm/slab.c
index ed5d018f1..b3bd852d1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -579,7 +579,6 @@ static void kmem_slab_destroy (kmem_cache_t *cachep, slab_t *slabp)
kmem_cache_free(cachep->slabp_cache, slabp);
}
-
/**
* kmem_cache_create - Create a cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -838,48 +837,60 @@ static int is_chained_kmem_cache(kmem_cache_t * cachep)
}
#ifdef CONFIG_SMP
-static DECLARE_MUTEX(cache_drain_sem);
-static kmem_cache_t *cache_to_drain = NULL;
-static DECLARE_WAIT_QUEUE_HEAD(cache_drain_wait);
-unsigned long slab_cache_drain_mask;
-
/*
- * Waits for all CPUs to execute slab_drain_local_cache().
- * Caller must be holding cache_drain_sem.
+ * Waits for all CPUs to execute func().
*/
-static void slab_drain_all_sync(void)
+static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
{
- DECLARE_WAITQUEUE(wait, current);
-
local_irq_disable();
- slab_drain_local_cache();
+ func(arg);
local_irq_enable();
- add_wait_queue(&cache_drain_wait, &wait);
- current->state = TASK_UNINTERRUPTIBLE;
- while (slab_cache_drain_mask != 0UL)
- schedule();
- current->state = TASK_RUNNING;
- remove_wait_queue(&cache_drain_wait, &wait);
+ if (smp_call_function(func, arg, 1, 1))
+ BUG();
+}
+typedef struct ccupdate_struct_s
+{
+ kmem_cache_t *cachep;
+ cpucache_t *new[NR_CPUS];
+} ccupdate_struct_t;
+
+static void do_ccupdate_local(void *info)
+{
+ ccupdate_struct_t *new = (ccupdate_struct_t *)info;
+ cpucache_t *old = cc_data(new->cachep);
+
+ cc_data(new->cachep) = new->new[smp_processor_id()];
+ new->new[smp_processor_id()] = old;
}
+static void free_block (kmem_cache_t* cachep, void** objpp, int len);
+
static void drain_cpu_caches(kmem_cache_t *cachep)
{
- unsigned long cpu_mask = 0;
+ ccupdate_struct_t new;
int i;
- for (i = 0; i < smp_num_cpus; i++)
- cpu_mask |= (1UL << cpu_logical_map(i));
+ memset(&new.new,0,sizeof(new.new));
- down(&cache_drain_sem);
+ new.cachep = cachep;
- cache_to_drain = cachep;
- slab_cache_drain_mask = cpu_mask;
- slab_drain_all_sync();
- cache_to_drain = NULL;
+ down(&cache_chain_sem);
+ smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
- up(&cache_drain_sem);
+ for (i = 0; i < smp_num_cpus; i++) {
+ cpucache_t* ccold = new.new[cpu_logical_map(i)];
+ if (!ccold || (ccold->avail == 0))
+ continue;
+ local_irq_disable();
+ free_block(cachep, cc_entry(ccold), ccold->avail);
+ local_irq_enable();
+ ccold->avail = 0;
+ }
+ smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
+ up(&cache_chain_sem);
}
+
#else
#define drain_cpu_caches(cachep) do { } while (0)
#endif
@@ -1593,56 +1604,6 @@ kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags)
#ifdef CONFIG_SMP
-typedef struct ccupdate_struct_s
-{
- kmem_cache_t *cachep;
- cpucache_t *new[NR_CPUS];
-} ccupdate_struct_t;
-
-static ccupdate_struct_t *ccupdate_state = NULL;
-
-/* Called from per-cpu timer interrupt. */
-void slab_drain_local_cache(void)
-{
- if (ccupdate_state != NULL) {
- ccupdate_struct_t *new = ccupdate_state;
- cpucache_t *old = cc_data(new->cachep);
-
- cc_data(new->cachep) = new->new[smp_processor_id()];
- new->new[smp_processor_id()] = old;
- } else {
- kmem_cache_t *cachep = cache_to_drain;
- cpucache_t *cc = cc_data(cachep);
-
- if (cc && cc->avail) {
- free_block(cachep, cc_entry(cc), cc->avail);
- cc->avail = 0;
- }
- }
-
- clear_bit(smp_processor_id(), &slab_cache_drain_mask);
- if (slab_cache_drain_mask == 0)
- wake_up(&cache_drain_wait);
-}
-
-static void do_ccupdate(ccupdate_struct_t *data)
-{
- unsigned long cpu_mask = 0;
- int i;
-
- for (i = 0; i < smp_num_cpus; i++)
- cpu_mask |= (1UL << cpu_logical_map(i));
-
- down(&cache_drain_sem);
-
- ccupdate_state = data;
- slab_cache_drain_mask = cpu_mask;
- slab_drain_all_sync();
- ccupdate_state = NULL;
-
- up(&cache_drain_sem);
-}
-
/* called with cache_chain_sem acquired. */
static int kmem_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount)
{
@@ -1666,7 +1627,6 @@ static int kmem_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount)
for (i = 0; i< smp_num_cpus; i++) {
cpucache_t* ccnew;
-
ccnew = kmalloc(sizeof(void*)*limit+
sizeof(cpucache_t), GFP_KERNEL);
if (!ccnew)
@@ -1681,7 +1641,7 @@ static int kmem_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount)
cachep->batchcount = batchcount;
spin_unlock_irq(&cachep->spinlock);
- do_ccupdate(&new);
+ smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
for (i = 0; i < smp_num_cpus; i++) {
cpucache_t* ccold = new.new[cpu_logical_map(i)];
@@ -1772,14 +1732,6 @@ void kmem_cache_reap (int gfp_mask)
/* It's safe to test this without holding the cache-lock. */
if (searchp->flags & SLAB_NO_REAP)
goto next;
- /* FIXME: is this really a good idea? */
- if (gfp_mask & GFP_DMA) {
- if (!(searchp->gfpflags & GFP_DMA))
- goto next;
- } else {
- if (searchp->gfpflags & GFP_DMA)
- goto next;
- }
spin_lock_irq(&searchp->spinlock);
if (searchp->growing)
goto next_unlock;
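
The slab.c rework replaces the timer-driven per-CPU drain with a direct cross-call: do_ccupdate_local() swaps each CPU's cpucache pointer with a replacement, after which the old caches belong to the caller and can be drained without further locking. A serial sketch of that swap-then-drain pattern; a plain loop stands in for smp_call_function(), and the struct and names are invented.

#include <stdio.h>

#define NCPUS 4

struct cpucache_stub { int avail; };

static struct cpucache_stub *cc_data[NCPUS];	/* per-CPU pointers */

static void do_ccupdate_one(int cpu, struct cpucache_stub **newv)
{
	/* The per-CPU step: publish the new cache, hand back the old one. */
	struct cpucache_stub *old = cc_data[cpu];
	cc_data[cpu] = newv[cpu];
	newv[cpu] = old;
}

int main(void)
{
	struct cpucache_stub oldcc[NCPUS] = { {3}, {0}, {5}, {1} };
	struct cpucache_stub *newv[NCPUS] = { NULL, NULL, NULL, NULL };
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		cc_data[cpu] = &oldcc[cpu];

	/* "Cross-call": swap in the (here: NULL) replacement caches. */
	for (cpu = 0; cpu < NCPUS; cpu++)
		do_ccupdate_one(cpu, newv);

	/* Now the old caches are private to this thread and can be drained
	 * without racing against the other CPUs. */
	for (cpu = 0; cpu < NCPUS; cpu++)
		if (newv[cpu] && newv[cpu]->avail)
			printf("cpu %d: freeing %d cached objects\n",
			       cpu, newv[cpu]->avail);
	return 0;
}
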
diff --git a/mm/swap.c b/mm/swap.c
index 460707ff7..8cb160b81 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -40,7 +40,18 @@ freepages_t freepages = {
};
/* How many pages do we try to swap or page in/out together? */
-int page_cluster = 4; /* Default value modified in swap_setup() */
+int page_cluster;
+
+/*
+ * This variable contains the amount of page steals the system
+ * is doing, averaged over a minute. We use this to determine how
+ * many inactive pages we should have.
+ *
+ * In reclaim_page and __alloc_pages: memory_pressure++
+ * In __free_pages_ok: memory_pressure--
+ * In recalculate_vm_stats the value is decayed (once a second)
+ */
+int memory_pressure;
/* We track the number of pages currently being asynchronously swapped
out, so that we don't try to swap TOO many pages out at once */
@@ -61,13 +72,250 @@ buffer_mem_t page_cache = {
pager_daemon_t pager_daemon = {
512, /* base number for calculating the number of tries */
SWAP_CLUSTER_MAX, /* minimum number of tries */
- SWAP_CLUSTER_MAX, /* do swap I/O in clusters of this size */
+ 8, /* do swap I/O in clusters of this size */
};
+/**
+ * age_page_{up,down} - page aging helper functions
+ * @page - the page we want to age
+ * @nolock - are we already holding the pagelist_lru_lock?
+ *
+ * If the page is on one of the lists (active, inactive_dirty or
+ * inactive_clean), we will grab the pagelist_lru_lock as needed.
+ * If you're already holding the lock, call this function with the
+ * nolock argument non-zero.
+ */
+void age_page_up_nolock(struct page * page)
+{
+ /*
+ * We're dealing with an inactive page, move the page
+ * to the active list.
+ */
+ if (!page->age)
+ activate_page_nolock(page);
+
+ /* The actual page aging bit */
+ page->age += PAGE_AGE_ADV;
+ if (page->age > PAGE_AGE_MAX)
+ page->age = PAGE_AGE_MAX;
+}
+
/*
- * Perform any setup for the swap system
+ * We use this (minimal) function in the case where we
+ * know we can't deactivate the page (yet).
*/
+void age_page_down_ageonly(struct page * page)
+{
+ page->age /= 2;
+}
+
+void age_page_down_nolock(struct page * page)
+{
+ /* The actual page aging bit */
+ page->age /= 2;
+
+ /*
+ * The page is now an old page. Move to the inactive
+ * list (if possible ... see below).
+ */
+ if (!page->age)
+ deactivate_page_nolock(page);
+}
+void age_page_up(struct page * page)
+{
+ /*
+ * We're dealing with an inactive page, move the page
+ * to the active list.
+ */
+ if (!page->age)
+ activate_page(page);
+
+ /* The actual page aging bit */
+ page->age += PAGE_AGE_ADV;
+ if (page->age > PAGE_AGE_MAX)
+ page->age = PAGE_AGE_MAX;
+}
+
+void age_page_down(struct page * page)
+{
+ /* The actual page aging bit */
+ page->age /= 2;
+
+ /*
+ * The page is now an old page. Move to the inactive
+ * list (if possible ... see below).
+ */
+ if (!page->age)
+ deactivate_page(page);
+}
+
+
+/**
+ * (de)activate_page - move pages from/to active and inactive lists
+ * @page: the page we want to move
+ * @nolock - are we already holding the pagemap_lru_lock?
+ *
+ * Deactivate_page will move an active page to the right
+ * inactive list, while activate_page will move a page back
+ * from one of the inactive lists to the active list. If
+ * called on a page which is not on any of the lists, the
+ * page is left alone.
+ */
+void deactivate_page_nolock(struct page * page)
+{
+ /*
+ * One for the cache, one for the extra reference the
+ * caller has and (maybe) one for the buffers.
+ *
+ * This isn't perfect, but works for just about everything.
+ * Besides, as long as we don't move unfreeable pages to the
+ * inactive_clean list it doesn't need to be perfect...
+ */
+ int maxcount = (page->buffers ? 3 : 2);
+ page->age = 0;
+
+ /*
+ * Don't touch it if it's not on the active list.
+ * (some pages aren't on any list at all)
+ */
+ if (PageActive(page) && page_count(page) <= maxcount &&
+ !page_ramdisk(page)) {
+
+ /*
+ * We can move the page to the inactive_dirty list
+ * if we have the strong suspicion that they might
+ * become freeable in the near future.
+ *
+ * That is, the page has buffer heads attached (that
+ * need to be cleared away) and/or the function calling
+ * us has an extra reference count on the page.
+ */
+ if (page->buffers || page_count(page) == 2) {
+ del_page_from_active_list(page);
+ add_page_to_inactive_dirty_list(page);
+ /*
+ * Only if we are SURE the page is clean and immediately
+ * reusable, we move it to the inactive_clean list.
+ */
+ } else if (page->mapping && !PageDirty(page) &&
+ !PageLocked(page)) {
+ del_page_from_active_list(page);
+ add_page_to_inactive_clean_list(page);
+ }
+ /*
+ * OK, we cannot free the page. Leave it alone.
+ */
+ }
+}
+
+void deactivate_page(struct page * page)
+{
+ spin_lock(&pagemap_lru_lock);
+ deactivate_page_nolock(page);
+ spin_unlock(&pagemap_lru_lock);
+}
+
+/*
+ * Move an inactive page to the active list.
+ */
+void activate_page_nolock(struct page * page)
+{
+ if (PageInactiveDirty(page)) {
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_active_list(page);
+ } else if (PageInactiveClean(page)) {
+ del_page_from_inactive_clean_list(page);
+ add_page_to_active_list(page);
+ } else {
+ /*
+ * The page was not on any list, so we take care
+ * not to do anything.
+ */
+ }
+
+ /* Make sure the page gets a fair chance at staying active. */
+ if (page->age < PAGE_AGE_START)
+ page->age = PAGE_AGE_START;
+}
+
+void activate_page(struct page * page)
+{
+ spin_lock(&pagemap_lru_lock);
+ activate_page_nolock(page);
+ spin_unlock(&pagemap_lru_lock);
+}
+
+/**
+ * lru_cache_add: add a page to the page lists
+ * @page: the page to add
+ */
+void lru_cache_add(struct page * page)
+{
+ spin_lock(&pagemap_lru_lock);
+ if (!PageLocked(page))
+ BUG();
+ DEBUG_ADD_PAGE
+ add_page_to_active_list(page);
+ /* This should be relatively rare */
+ if (!page->age)
+ deactivate_page_nolock(page);
+ spin_unlock(&pagemap_lru_lock);
+}
+
+/**
+ * __lru_cache_del: remove a page from the page lists
+ * @page: the page to add
+ *
+ * This function is for when the caller already holds
+ * the pagemap_lru_lock.
+ */
+void __lru_cache_del(struct page * page)
+{
+ if (PageActive(page)) {
+ del_page_from_active_list(page);
+ } else if (PageInactiveDirty(page)) {
+ del_page_from_inactive_dirty_list(page);
+ } else if (PageInactiveClean(page)) {
+ del_page_from_inactive_clean_list(page);
+ } else {
+ printk("VM: __lru_cache_del, found unknown page ?!\n");
+ }
+ DEBUG_ADD_PAGE
+}
+
+/**
+ * lru_cache_del: remove a page from the page lists
+ * @page: the page to remove
+ */
+void lru_cache_del(struct page * page)
+{
+ if (!PageLocked(page))
+ BUG();
+ spin_lock(&pagemap_lru_lock);
+ __lru_cache_del(page);
+ spin_unlock(&pagemap_lru_lock);
+}
+
+/**
+ * recalculate_vm_stats - recalculate VM statistics
+ *
+ * This function should be called once a second to recalculate
+ * some useful statistics the VM subsystem uses to determine
+ * its behaviour.
+ */
+void recalculate_vm_stats(void)
+{
+ /*
+ * Subtract one second's worth of memory_pressure from
+ * memory_pressure.
+ */
+ memory_pressure -= (memory_pressure >> INACTIVE_SHIFT);
+}
+
+/*
+ * Perform any setup for the swap system
+ */
void __init swap_setup(void)
{
/* Use a smaller cluster for memory <16MB or <32MB */
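
swap.c introduces explicit page aging: age_page_up() bumps page->age by PAGE_AGE_ADV (capped at PAGE_AGE_MAX) on every reference, age_page_down() halves it on every scan, and a page whose age reaches zero gets deactivated onto the inactive lists. A tiny simulation of that arithmetic; the constants are assumptions taken from the 2.4 headers rather than from this diff, and 'touched' is just a made-up reference pattern.

#include <stdio.h>

#define PAGE_AGE_START 2
#define PAGE_AGE_ADV   3
#define PAGE_AGE_MAX   64

int main(void)
{
	int age = PAGE_AGE_START;
	int touched[10] = { 1, 1, 0, 1, 0, 0, 0, 0, 0, 0 };
	int scan;

	for (scan = 0; scan < 10; scan++) {
		if (touched[scan]) {
			age += PAGE_AGE_ADV;	/* age_page_up()   */
			if (age > PAGE_AGE_MAX)
				age = PAGE_AGE_MAX;
		} else {
			age /= 2;		/* age_page_down() */
		}
		printf("scan %d: age=%d%s\n", scan, age,
		       age == 0 ? "  -> deactivate (move to inactive list)" : "");
	}
	return 0;
}

The halving means a page has to keep being referenced to stay active; a handful of idle scans is enough to push it onto the inactive side, where page_launder() and reclaim_page() can get at it.
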
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 506160354..d26c66f54 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -73,7 +73,7 @@ static inline void remove_from_swap_cache(struct page *page)
PAGE_BUG(page);
PageClearSwapCache(page);
- remove_inode_page(page);
+ __remove_inode_page(page);
}
/*
@@ -105,7 +105,9 @@ void delete_from_swap_cache_nolock(struct page *page)
if (block_flushpage(page, 0))
lru_cache_del(page);
+ spin_lock(&pagecache_lock);
__delete_from_swap_cache(page);
+ spin_unlock(&pagecache_lock);
page_cache_release(page);
}
@@ -164,7 +166,7 @@ repeat:
return 0;
/*
* Though the "found" page was in the swap cache an instant
- * earlier, it might have been removed by shrink_mmap etc.
+ * earlier, it might have been removed by refill_inactive etc.
* Re search ... Since find_lock_page grabs a reference on
* the page, it can not be reused for anything else, namely
* it can not be associated with another swaphandle, so it
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 817a3966b..e8c557e04 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -14,7 +14,7 @@
#include <asm/pgalloc.h>
rwlock_t vmlist_lock = RW_LOCK_UNLOCKED;
-struct vm_struct * vmlist = NULL;
+struct vm_struct * vmlist;
static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size)
{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 95098e4d1..aacd9a5b0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -9,6 +9,7 @@
* to bring the system back to freepages.high: 2.4.97, Rik van Riel.
* Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
* Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
+ * Multiqueue VM started 5.8.00, Rik van Riel.
*/
#include <linux/slab.h>
@@ -40,6 +41,7 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un
swp_entry_t entry;
struct page * page;
int (*swapout)(struct page *, struct file *);
+ int onlist;
pte = *page_table;
if (!pte_present(pte))
@@ -51,16 +53,37 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un
if (mm->swap_cnt)
mm->swap_cnt--;
+ onlist = PageActive(page);
/* Don't look at this pte if it's been accessed recently. */
if (pte_young(pte)) {
- /*
- * Transfer the "accessed" bit from the page
- * tables to the global page map.
- */
set_pte(page_table, pte_mkold(pte));
- SetPageReferenced(page);
+ if (onlist) {
+ /*
+ * Transfer the "accessed" bit from the page
+ * tables to the global page map. Page aging
+ * will be done by refill_inactive_scan().
+ */
+ SetPageReferenced(page);
+ } else {
+ /*
+ * The page is not on the active list, so
+ * we have to do the page aging ourselves.
+ */
+ age_page_up(page);
+ }
goto out_failed;
}
+ if (!onlist)
+ /* The page is still mapped, so it can't be freeable... */
+ age_page_down_ageonly(page);
+
+ /*
+ * If the page is in active use by us, or if the page
+ * is in active use by others, don't unmap it or
+ * (worse) start unneeded IO.
+ */
+ if (page->age > 0)
+ goto out_failed;
if (TryLockPage(page))
goto out_failed;
@@ -79,8 +102,9 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un
set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
UnlockPage(page);
- vma->vm_mm->rss--;
+ mm->rss--;
flush_tlb_page(vma, address);
+ deactivate_page(page);
page_cache_release(page);
goto out_failed;
}
@@ -96,7 +120,7 @@ drop_pte:
* our scan.
*
* Basically, this just makes it possible for us to do
- * some real work in the future in "shrink_mmap()".
+ * some real work in the future in "refill_inactive()".
*/
if (!pte_dirty(pte)) {
flush_cache_page(vma, address);
@@ -116,7 +140,9 @@ drop_pte:
* Don't do any of the expensive stuff if
* we're not really interested in this zone.
*/
- if (page->zone->free_pages > page->zone->pages_high)
+ if (page->zone->free_pages + page->zone->inactive_clean_pages
+ + page->zone->inactive_dirty_pages
+ > page->zone->pages_high + inactive_target)
goto out_unlock;
/*
@@ -134,7 +160,7 @@ drop_pte:
* NOTE NOTE NOTE! This should just set a
* dirty bit in 'page', and just drop the
* pte. All the hard work would be done by
- * shrink_mmap().
+ * refill_inactive().
*
* That would get rid of a lot of problems.
*/
@@ -144,14 +170,15 @@ drop_pte:
struct file *file = vma->vm_file;
if (file) get_file(file);
pte_clear(page_table);
- vma->vm_mm->rss--;
+ mm->rss--;
flush_tlb_page(vma, address);
- vmlist_access_unlock(vma->vm_mm);
+ vmlist_access_unlock(mm);
error = swapout(page, file);
UnlockPage(page);
if (file) fput(file);
if (!error)
goto out_free_success;
+ deactivate_page(page);
page_cache_release(page);
return error;
}
@@ -175,13 +202,14 @@ drop_pte:
add_to_swap_cache(page, entry);
/* Put the swap entry into the pte after the page is in swapcache */
- vma->vm_mm->rss--;
+ mm->rss--;
set_pte(page_table, swp_entry_to_pte(entry));
flush_tlb_page(vma, address);
- vmlist_access_unlock(vma->vm_mm);
+ vmlist_access_unlock(mm);
/* OK, do a physical asynchronous write to swap. */
rw_swap_page(WRITE, page, 0);
+ deactivate_page(page);
out_free_success:
page_cache_release(page);
@@ -230,7 +258,7 @@ static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vm
do {
int result;
- vma->vm_mm->swap_address = address + PAGE_SIZE;
+ mm->swap_address = address + PAGE_SIZE;
result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
if (result)
return result;
@@ -282,7 +310,7 @@ static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsi
if (vma->vm_flags & VM_LOCKED)
return 0;
- pgdir = pgd_offset(vma->vm_mm, address);
+ pgdir = pgd_offset(mm, address);
end = vma->vm_end;
if (address >= end)
@@ -323,17 +351,22 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
int result = swap_out_vma(mm, vma, address, gfp_mask);
if (result)
return result;
+ if (!mm->swap_cnt)
+ goto out_unlock;
vma = vma->vm_next;
if (!vma)
break;
address = vma->vm_start;
}
}
+ /* Reset to 0 when we reach the end of address space */
+ mm->swap_address = 0;
+ mm->swap_cnt = 0;
+
+out_unlock:
vmlist_access_unlock(mm);
/* We didn't find anything for the process */
- mm->swap_cnt = 0;
- mm->swap_address = 0;
return 0;
}
@@ -342,7 +375,10 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
* N.B. This function returns only 0 or 1. Return values != 1 from
* the lower level routines result in continued processing.
*/
-static int swap_out(unsigned int priority, int gfp_mask)
+#define SWAP_SHIFT 5
+#define SWAP_MIN 8
+
+static int swap_out(unsigned int priority, int gfp_mask, unsigned long idle_time)
{
struct task_struct * p;
int counter;
@@ -363,7 +399,7 @@ static int swap_out(unsigned int priority, int gfp_mask)
* Think of swap_cnt as a "shadow rss" - it tells us which process
* we want to page out (always try largest first).
*/
- counter = (nr_threads << 2) >> (priority >> 2);
+ counter = (nr_threads << SWAP_SHIFT) >> priority;
if (counter < 1)
counter = 1;
@@ -372,6 +408,7 @@ static int swap_out(unsigned int priority, int gfp_mask)
struct mm_struct *best = NULL;
int pid = 0;
int assign = 0;
+ int found_task = 0;
select:
read_lock(&tasklist_lock);
p = init_task.next_task;
@@ -381,9 +418,17 @@ static int swap_out(unsigned int priority, int gfp_mask)
continue;
if (mm->rss <= 0)
continue;
+ /* Skip tasks which haven't slept long enough yet when idle-swapping. */
+ if (idle_time && !assign && (!(p->state & TASK_INTERRUPTIBLE) ||
+ time_after(p->sleep_time + idle_time * HZ, jiffies)))
+ continue;
+ found_task++;
/* Refresh swap_cnt? */
- if (assign == 1)
- mm->swap_cnt = mm->rss;
+ if (assign == 1) {
+ mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
+ if (mm->swap_cnt < SWAP_MIN)
+ mm->swap_cnt = SWAP_MIN;
+ }
if (mm->swap_cnt > max_cnt) {
max_cnt = mm->swap_cnt;
best = mm;
@@ -392,7 +437,7 @@ static int swap_out(unsigned int priority, int gfp_mask)
}
read_unlock(&tasklist_lock);
if (!best) {
- if (!assign) {
+ if (!assign && found_task > 0) {
assign = 1;
goto select;
}
@@ -418,50 +463,409 @@ out:
return __ret;
}
-/*
- * Check if there is any memory pressure (free_pages < pages_low)
+
+/**
+ * reclaim_page - reclaims one page from the inactive_clean list
+ * @zone: reclaim a page from this zone
+ *
+ * The pages on the inactive_clean can be instantly reclaimed.
+ * The tests look impressive, but most of the time we'll grab
+ * the first page of the list and exit successfully.
*/
-static inline int memory_pressure(void)
+struct page * reclaim_page(zone_t * zone)
{
- pg_data_t *pgdat = pgdat_list;
+ struct page * page = NULL;
+ struct list_head * page_lru;
+ int maxscan;
- do {
- int i;
- for(i = 0; i < MAX_NR_ZONES; i++) {
- zone_t *zone = pgdat->node_zones+ i;
- if (zone->size &&
- zone->free_pages < zone->pages_low)
- return 1;
+ /*
+ * We only need the pagemap_lru_lock if we don't reclaim the page,
+ * but we have to grab the pagecache_lock before the pagemap_lru_lock
+ * to avoid deadlocks and most of the time we'll succeed anyway.
+ */
+ spin_lock(&pagecache_lock);
+ spin_lock(&pagemap_lru_lock);
+ maxscan = zone->inactive_clean_pages;
+ while ((page_lru = zone->inactive_clean_list.prev) !=
+ &zone->inactive_clean_list && maxscan--) {
+ page = list_entry(page_lru, struct page, lru);
+
+ /* Wrong page on list?! (list corruption, should not happen) */
+ if (!PageInactiveClean(page)) {
+ printk("VM: reclaim_page, wrong page on list.\n");
+ list_del(page_lru);
+ page->zone->inactive_clean_pages--;
+ continue;
}
- pgdat = pgdat->node_next;
- } while (pgdat);
- return 0;
+ /* Page is or was in use? Move it to the active list. */
+ if (PageTestandClearReferenced(page) || page->age > 0 ||
+ (!page->buffers && page_count(page) > 1)) {
+ del_page_from_inactive_clean_list(page);
+ add_page_to_active_list(page);
+ continue;
+ }
+
+ /* The page is dirty or locked; move it to the inactive_dirty list. */
+ if (page->buffers || TryLockPage(page)) {
+ del_page_from_inactive_clean_list(page);
+ add_page_to_inactive_dirty_list(page);
+ continue;
+ }
+
+ /* OK, remove the page from the caches. */
+ if (PageSwapCache(page)) {
+ __delete_from_swap_cache(page);
+ goto found_page;
+ }
+
+ if (page->mapping) {
+ __remove_inode_page(page);
+ goto found_page;
+ }
+
+ /* We should never ever get here. */
+ printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
+ list_del(page_lru);
+ zone->inactive_clean_pages--;
+ UnlockPage(page);
+ }
+ /* Reset page pointer, maybe we encountered an unfreeable page. */
+ page = NULL;
+ goto out;
+
+found_page:
+ del_page_from_inactive_clean_list(page);
+ UnlockPage(page);
+ page->age = PAGE_AGE_START;
+ if (page_count(page) != 1)
+ printk("VM: reclaim_page, found page with count %d!\n",
+ page_count(page));
+out:
+ spin_unlock(&pagemap_lru_lock);
+ spin_unlock(&pagecache_lock);
+ memory_pressure++;
+ return page;
+}
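
The order of the tests in reclaim_page() is the interesting part: referenced, aged or shared pages go back to the active list, buffer or locked pages go to inactive_dirty, and only clean cache pages are actually handed out. The sketch below is a standalone userspace model of that ordering; struct fake_page, classify() and the verdict enum are invented stand-ins, not kernel interfaces:

	#include <stdbool.h>

	/* Simplified stand-ins for the fields reclaim_page() inspects. */
	struct fake_page {
		bool referenced;	/* PageTestandClearReferenced() */
		int  age;
		int  count;		/* page_count() */
		bool has_buffers;	/* page->buffers != NULL */
		bool locked;		/* TryLockPage() would fail */
		bool swap_cache;	/* PageSwapCache() */
		bool has_mapping;	/* page->mapping != NULL */
	};

	enum verdict { MOVE_TO_ACTIVE, MOVE_TO_INACTIVE_DIRTY, RECLAIM, UNKNOWN };

	/* Mirrors the order of the tests inside the scan loop. */
	static enum verdict classify(const struct fake_page *p)
	{
		/* Recently used, aged, or mapped by someone else: keep it. */
		if (p->referenced || p->age > 0 || (!p->has_buffers && p->count > 1))
			return MOVE_TO_ACTIVE;

		/* Dirty buffers or a held lock: page_launder() has to run first. */
		if (p->has_buffers || p->locked)
			return MOVE_TO_INACTIVE_DIRTY;

		/* Clean and in a known cache: can be handed out right away. */
		if (p->swap_cache || p->has_mapping)
			return RECLAIM;

		return UNKNOWN;		/* "should never ever get here" */
	}

	int main(void)
	{
		struct fake_page clean = { .count = 1, .has_mapping = true };
		return classify(&clean) == RECLAIM ? 0 : 1;
	}
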
+
+/**
+ * page_launder - clean dirty inactive pages, move to inactive_clean list
+ * @gfp_mask: what operations we are allowed to do
+ * @sync: should we wait synchronously for the cleaning of pages
+ *
+ * When this function is called, we are most likely low on free +
+ * inactive_clean pages. Since we want to refill those pages as
+ * soon as possible, we'll make two loops over the inactive list,
+ * one to move the already cleaned pages to the inactive_clean lists
+ * and one to (often asynchronously) clean the dirty inactive pages.
+ *
+ * In situations where kswapd cannot keep up, user processes will
+ * end up calling this function. Since the user process needs to
+ * have a page before it can continue with its allocation, we'll
+ * do synchronous page flushing in that case.
+ *
+ * This code is heavily inspired by the FreeBSD source code. Thanks
+ * go out to Matthew Dillon.
+ */
+#define MAX_LAUNDER (4 * (1 << page_cluster))
+int page_launder(int gfp_mask, int sync)
+{
+ int launder_loop, maxscan, cleaned_pages, maxlaunder;
+ int can_get_io_locks;
+ struct list_head * page_lru;
+ struct page * page;
+
+ /*
+ * We can only grab the IO locks (eg. for flushing dirty
+ * buffers to disk) if __GFP_IO is set.
+ */
+ can_get_io_locks = gfp_mask & __GFP_IO;
+
+ launder_loop = 0;
+ maxlaunder = 0;
+ cleaned_pages = 0;
+
+dirty_page_rescan:
+ spin_lock(&pagemap_lru_lock);
+ maxscan = nr_inactive_dirty_pages;
+ while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
+ maxscan-- > 0) {
+ page = list_entry(page_lru, struct page, lru);
+
+ /* Wrong page on list?! (list corruption, should not happen) */
+ if (!PageInactiveDirty(page)) {
+ printk("VM: page_launder, wrong page on list.\n");
+ list_del(page_lru);
+ nr_inactive_dirty_pages--;
+ page->zone->inactive_dirty_pages--;
+ continue;
+ }
+
+ /* Page is or was in use? Move it to the active list. */
+ if (PageTestandClearReferenced(page) || page->age > 0 ||
+ (!page->buffers && page_count(page) > 1) ||
+ page_ramdisk(page)) {
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_active_list(page);
+ continue;
+ }
+
+ /*
+ * The page is locked. IO in progress?
+ * Move it to the back of the list.
+ */
+ if (TryLockPage(page)) {
+ list_del(page_lru);
+ list_add(page_lru, &inactive_dirty_list);
+ continue;
+ }
+
+ /*
+ * If the page has buffers, try to free the buffer mappings
+ * associated with this page. If we succeed we either free
+ * the page (in case it was a buffercache only page) or we
+ * move the page to the inactive_clean list.
+ *
+ * On the first round, we should free all previously cleaned
+ * buffer pages.
+ */
+ if (page->buffers) {
+ int wait, clearedbuf;
+ int freed_page = 0;
+ /*
+ * Since we might be doing disk IO, we have to
+ * drop the spinlock and take an extra reference
+ * on the page so it doesn't go away from under us.
+ */
+ del_page_from_inactive_dirty_list(page);
+ page_cache_get(page);
+ spin_unlock(&pagemap_lru_lock);
+
+ /* Will we do (asynchronous) IO? */
+ if (launder_loop && maxlaunder == 0 && sync)
+ wait = 2; /* Synchronous IO */
+ else if (launder_loop && maxlaunder-- > 0)
+ wait = 1; /* Async IO */
+ else
+ wait = 0; /* No IO */
+
+ /* Try to free the page buffers. */
+ clearedbuf = try_to_free_buffers(page, wait);
+
+ /*
+ * Re-take the spinlock. Note that we cannot
+ * unlock the page yet since we're still
+ * accessing the struct page here...
+ */
+ spin_lock(&pagemap_lru_lock);
+
+ /* The buffers were not freed. */
+ if (!clearedbuf) {
+ add_page_to_inactive_dirty_list(page);
+
+ /* The page was only in the buffer cache. */
+ } else if (!page->mapping) {
+ atomic_dec(&buffermem_pages);
+ freed_page = 1;
+ cleaned_pages++;
+
+ /* The page has more users besides the cache and us. */
+ } else if (page_count(page) > 2) {
+ add_page_to_active_list(page);
+
+ /* OK, we "created" a freeable page. */
+ } else /* page->mapping && page_count(page) == 2 */ {
+ add_page_to_inactive_clean_list(page);
+ cleaned_pages++;
+ }
+
+ /*
+ * Unlock the page and drop the extra reference.
+ * We can only do it here because we are accessing
+ * the page struct above.
+ */
+ UnlockPage(page);
+ page_cache_release(page);
+
+ /*
+ * If we're freeing buffer cache pages, stop when
+ * we've got enough free memory.
+ */
+ if (freed_page && !free_shortage())
+ break;
+ continue;
+ } else if (page->mapping && !PageDirty(page)) {
+ /*
+ * If a page had an extra reference in
+ * deactivate_page(), we will find it here.
+ * Now the page is really freeable, so we
+ * move it to the inactive_clean list.
+ */
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_inactive_clean_list(page);
+ UnlockPage(page);
+ cleaned_pages++;
+ } else {
+ /*
+ * OK, we don't know what to do with the page.
+ * It's no use keeping it here, so we move it to
+ * the active list.
+ */
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_active_list(page);
+ UnlockPage(page);
+ }
+ }
+ spin_unlock(&pagemap_lru_lock);
+
+ /*
+ * If we don't have enough free pages, we loop back once
+ * to queue the dirty pages for writeout. When we were called
+ * by a user process (that /needs/ a free page) and we didn't
+ * free anything yet, we wait synchronously on the writeout of
+ * the remaining dirty pages.
+ *
+ * We also wake up bdflush, since bdflush should, under most
+ * loads, flush out the dirty pages before we have to wait on
+ * IO.
+ */
+ if (can_get_io_locks && !launder_loop && free_shortage()) {
+ launder_loop = 1;
+ /* If we cleaned pages, never do synchronous IO. */
+ if (cleaned_pages)
+ sync = 0;
+ /* We only do a few "out of order" flushes. */
+ maxlaunder = MAX_LAUNDER;
+ /* Kflushd takes care of the rest. */
+ wakeup_bdflush(0);
+ goto dirty_page_rescan;
+ }
+
+ /* Return the number of pages moved to the inactive_clean list. */
+ return cleaned_pages;
+}
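
The core of the laundering policy is how the IO mode is chosen per buffer page: no IO on the first pass, asynchronous writeout on the second pass while the MAX_LAUNDER budget lasts, and synchronous writeout only when the caller asked for it and the budget is gone. A minimal userspace sketch of just that selection (pick_wait_mode() and the maxlaunder value of 2 are invented for the example):

	#include <stdio.h>

	/* Userspace model of how page_launder() picks the IO mode for a
	 * buffer page (0 = no IO, 1 = async writeout, 2 = synchronous). */
	static int pick_wait_mode(int launder_loop, int *maxlaunder, int sync)
	{
		if (launder_loop && *maxlaunder == 0 && sync)
			return 2;		/* last resort: wait for the writeout */
		if (launder_loop && (*maxlaunder)-- > 0)
			return 1;		/* queue async IO, budget permitting */
		return 0;			/* first pass: only pick up clean pages */
	}

	int main(void)
	{
		int maxlaunder = 2;	/* pretend MAX_LAUNDER evaluated to 2 */

		/* The first pass never issues IO... */
		printf("%d\n", pick_wait_mode(0, &maxlaunder, 1));	/* 0 */
		/* ...the second pass does async IO until the budget runs out... */
		printf("%d\n", pick_wait_mode(1, &maxlaunder, 1));	/* 1 */
		printf("%d\n", pick_wait_mode(1, &maxlaunder, 1));	/* 1 */
		/* ...and only then falls back to synchronous IO (if sync). */
		printf("%d\n", pick_wait_mode(1, &maxlaunder, 1));	/* 2 */
		return 0;
	}
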
+
+/**
+ * refill_inactive_scan - scan the active list and find pages to deactivate
+ * @priority: the priority at which to scan
+ * @oneshot: exit after deactivating one page
+ *
+ * This function will scan a portion of the active list to find
+ * unused pages; those pages will then be moved to the inactive list.
+ */
+int refill_inactive_scan(unsigned int priority, int oneshot)
+{
+ struct list_head * page_lru;
+ struct page * page;
+ int maxscan, page_active = 0;
+ int ret = 0;
+
+ /* Take the lock while messing with the list... */
+ spin_lock(&pagemap_lru_lock);
+ maxscan = nr_active_pages >> priority;
+ while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
+ page = list_entry(page_lru, struct page, lru);
+
+ /* Wrong page on list?! (list corruption, should not happen) */
+ if (!PageActive(page)) {
+ printk("VM: refill_inactive, wrong page on list.\n");
+ list_del(page_lru);
+ nr_active_pages--;
+ continue;
+ }
+
+ /* Do aging on the pages. */
+ if (PageTestandClearReferenced(page)) {
+ age_page_up_nolock(page);
+ page_active = 1;
+ } else {
+ age_page_down_ageonly(page);
+ /*
+ * Since we don't hold a reference on the page
+ * ourselves, we have to do our test a bit more
+ * strictly than deactivate_page() does. This is needed
+ * since otherwise the system could hang shuffling
+ * unfreeable pages from the active list to the
+ * inactive_dirty list and back again...
+ *
+ * SUBTLE: we can have buffer pages with count 1.
+ */
+ if (page_count(page) <= (page->buffers ? 2 : 1)) {
+ deactivate_page_nolock(page);
+ page_active = 0;
+ } else {
+ page_active = 1;
+ }
+ }
+ /*
+ * If the page is still on the active list, move it
+ * to the other end of the list. Otherwise it was
+ * deactivated by age_page_down and we exit successfully.
+ */
+ if (page_active || PageActive(page)) {
+ list_del(page_lru);
+ list_add(page_lru, &active_list);
+ } else {
+ ret = 1;
+ if (oneshot)
+ break;
+ }
+ }
+ spin_unlock(&pagemap_lru_lock);
+
+ return ret;
}
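
Per page, refill_inactive_scan() either ages the page up (when the referenced bit was set) or ages it down and tries to deactivate it, taking care not to deactivate pages someone else still holds. Below is a rough userspace model of that step; struct fake_page and the aging constants only mirror the shape of the real age_page_up_nolock()/age_page_down_ageonly()/deactivate_page_nolock() helpers, whose actual values live in the VM headers:

	#include <stdbool.h>

	/* Stand-in aging parameters; not the authoritative kernel values. */
	#define PAGE_AGE_MAX	64
	#define PAGE_AGE_ADV	3

	struct fake_page {
		bool referenced;	/* PageTestandClearReferenced() */
		int  age;
		int  count;		/* page_count() */
		bool has_buffers;
	};

	/* Rough model of one step of the scan: returns true if the page
	 * stays on the active list, false if it would be deactivated. */
	static bool age_one_page(struct fake_page *p)
	{
		if (p->referenced) {
			p->age += PAGE_AGE_ADV;		/* age_page_up_nolock() */
			if (p->age > PAGE_AGE_MAX)
				p->age = PAGE_AGE_MAX;
			return true;
		}
		p->age /= 2;				/* age_page_down_ageonly() */
		/* Only deactivate pages nobody else holds; buffer pages carry
		 * one extra reference for their buffer heads. */
		if (p->age == 0 && p->count <= (p->has_buffers ? 2 : 1))
			return false;			/* deactivate_page_nolock() */
		return true;
	}

	int main(void)
	{
		struct fake_page p = { .age = 1, .count = 1 };
		return age_one_page(&p) ? 1 : 0;	/* deactivated -> exit 0 */
	}
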
/*
- * Check if all zones have recently had memory_pressure (zone_wake_kswapd)
+ * Check if there are zones with a severe shortage of free pages,
+ * or if all zones have a minor shortage.
*/
-static inline int keep_kswapd_awake(void)
+int free_shortage(void)
{
- int all_recent = 1;
pg_data_t *pgdat = pgdat_list;
+ int sum = 0;
+ int freeable = nr_free_pages() + nr_inactive_clean_pages();
+ int freetarget = freepages.high + inactive_target / 3;
+ /* Are we low on free pages globally? */
+ if (freeable < freetarget)
+ return freetarget - freeable;
+
+ /* If not, are we very low on any particular zone? */
do {
int i;
for(i = 0; i < MAX_NR_ZONES; i++) {
zone_t *zone = pgdat->node_zones+ i;
- if (zone->size) {
- if (zone->free_pages < zone->pages_min)
- return 1;
- if (!zone->zone_wake_kswapd)
- all_recent = 0;
+ if (zone->size && (zone->inactive_clean_pages +
+ zone->free_pages < zone->pages_min)) {
+ sum += zone->pages_min;
+ sum -= zone->free_pages;
+ sum -= zone->inactive_clean_pages;
}
}
pgdat = pgdat->node_next;
} while (pgdat);
- return all_recent;
+ return sum;
+}
+
+/*
+ * How many inactive pages are we short?
+ */
+int inactive_shortage(void)
+{
+ int shortage = 0;
+
+ shortage += freepages.high;
+ shortage += inactive_target;
+ shortage -= nr_free_pages();
+ shortage -= nr_inactive_clean_pages();
+ shortage -= nr_inactive_dirty_pages;
+
+ if (shortage > 0)
+ return shortage;
+
+ return 0;
}
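
With some invented numbers, the two shortage calculations look like this (only the global part of free_shortage() is modelled; the per-zone fallback is omitted):

	#include <stdio.h>

	/* All numbers are invented, just to show the arithmetic. */
	int main(void)
	{
		int freepages_high = 1024, inactive_target = 3072;
		int nr_free = 600, nr_inactive_clean = 300, nr_inactive_dirty = 1000;

		/* Global part of free_shortage(): freeable pages vs. the target. */
		int freeable = nr_free + nr_inactive_clean;
		int freetarget = freepages_high + inactive_target / 3;
		int free_short = freeable < freetarget ? freetarget - freeable : 0;

		/* inactive_shortage(): distance from the inactive target. */
		int inact_short = freepages_high + inactive_target
				- nr_free - nr_inactive_clean - nr_inactive_dirty;
		if (inact_short < 0)
			inact_short = 0;

		printf("free shortage:     %d pages\n", free_short);	/* 1148 */
		printf("inactive shortage: %d pages\n", inact_short);	/* 2196 */
		return 0;
	}
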
/*
@@ -472,96 +876,140 @@ static inline int keep_kswapd_awake(void)
* We want to try to free "count" pages, and we want to
* cluster them so that we get good swap-out behaviour.
*
- * Don't try _too_ hard, though. We don't want to have bad
- * latency.
- *
- * Note: only called by kswapd and try_to_free_pages
- * both can WAIT at top level.
+ * OTOH, if we're a user process (and not kswapd), we
+ * really care about latency. In that case we don't try
+ * to free too many pages.
*/
-#define FREE_COUNT 8
-#define SWAP_COUNT 16
-static int do_try_to_free_pages(unsigned int gfp_mask)
+static int refill_inactive(unsigned int gfp_mask, int user)
{
- int priority;
- int count = FREE_COUNT;
- int swap_count;
+ int priority, count, start_count, made_progress;
+ unsigned long idle_time;
+
+ count = inactive_shortage() + free_shortage();
+ if (user)
+ count = (1 << page_cluster);
+ start_count = count;
/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);
- priority = 64;
+ /*
+ * Calculate the minimum time (in seconds) a process must
+ * have slept before we consider it for idle swapping.
+ * This must be the number of seconds it takes to go through
+ * all of the cache. Doing this idle swapping makes the VM
+ * smoother once we start hitting swap.
+ */
+ idle_time = atomic_read(&page_cache_size);
+ idle_time += atomic_read(&buffermem_pages);
+ idle_time /= (inactive_target + 1);
+
+ priority = 6;
do {
+ made_progress = 0;
+
if (current->need_resched) {
+ __set_current_state(TASK_RUNNING);
schedule();
- /* time has passed - pressure too? */
- if (!memory_pressure())
- goto done;
}
- while (shrink_mmap(priority, gfp_mask)) {
- if (!--count)
+ while (refill_inactive_scan(priority, 1) ||
+ swap_out(priority, gfp_mask, idle_time)) {
+ made_progress = 1;
+ if (--count <= 0)
goto done;
}
- /* check if mission completed */
- if (!keep_kswapd_awake())
- goto done;
+ /*
+ * Don't be too light against the d/i cache, since
+ * refill_inactive() almost never fails when there's
+ * really plenty of memory free.
+ */
+ shrink_dcache_memory(priority, gfp_mask);
+ shrink_icache_memory(priority, gfp_mask);
/* Try to get rid of some shared memory pages.. */
- if (gfp_mask & __GFP_IO) {
- /*
- * don't be too light against the d/i cache since
- * shrink_mmap() almost never fail when there's
- * really plenty of memory free.
- */
- count -= shrink_dcache_memory(priority, gfp_mask);
- count -= shrink_icache_memory(priority, gfp_mask);
- /*
- * Not currently working, see fixme in shrink_?cache_memory
- * In the inner funtions there is a comment:
- * "To help debugging, a zero exit status indicates
- * all slabs were released." (-arca?)
- * lets handle it in a primitive but working way...
- * if (count <= 0)
- * goto done;
- */
- if (!keep_kswapd_awake())
+ while (shm_swap(priority, gfp_mask)) {
+ made_progress = 1;
+ if (--count <= 0)
goto done;
-
- while (shm_swap(priority, gfp_mask)) {
- if (!--count)
- goto done;
- }
}
/*
* Then, try to page stuff out..
- *
- * This will not actually free any pages (they get
- * put in the swap cache), so we must not count this
- * as a "count" success.
*/
- swap_count = SWAP_COUNT;
- while (swap_out(priority, gfp_mask))
- if (--swap_count < 0)
- break;
+ while (swap_out(priority, gfp_mask, 0)) {
+ made_progress = 1;
+ if (--count <= 0)
+ goto done;
+ }
- } while (--priority >= 0);
+ /*
+ * If we either have enough free memory, or if
+ * page_launder() will be able to make enough
+ * free memory, then stop.
+ */
+ if (!inactive_shortage() || !free_shortage())
+ goto done;
+
+ /*
+ * Only switch to a lower "priority" if we
+ * didn't make any useful progress in the
+ * last loop.
+ */
+ if (!made_progress)
+ priority--;
+ } while (priority >= 0);
- /* Always end on a shrink_mmap.., may sleep... */
- while (shrink_mmap(0, gfp_mask)) {
- if (!--count)
+ /* Always end on a refill_inactive.., may sleep... */
+ while (refill_inactive_scan(0, 1)) {
+ if (--count <= 0)
goto done;
}
- /* Return 1 if any page is freed, or
- * there are no more memory pressure */
- return (count < FREE_COUNT || !keep_kswapd_awake());
-
+
done:
- return 1;
+ return (count < start_count);
+}
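
The setup at the top of refill_inactive() boils down to two things: how many pages this call should try to move (the full shortage for kswapd, a single page cluster for a user process) and how long a task must have been asleep before the idle-swap pass in swap_out() will touch it. A small userspace sketch with made-up page counts:

	#include <stdio.h>

	/* All page counts are invented example inputs. */
	int main(void)
	{
		int page_cluster = 4;		/* stand-in for the global tunable */
		long page_cache_size = 20000, buffermem_pages = 4000;
		long inactive_target = 3000;
		int shortage = 2500;	/* inactive_shortage() + free_shortage() */

		/* kswapd works off the whole shortage; a user process that just
		 * needs one allocation frees only a small cluster of pages. */
		int count_kswapd = shortage;
		int count_user = 1 << page_cluster;

		/* Idle-swap threshold: seconds a task must have slept before the
		 * idle pass of swap_out() will touch it. */
		long idle_time = (page_cache_size + buffermem_pages)
				/ (inactive_target + 1);

		printf("kswapd target: %d pages, user target: %d pages\n",
		       count_kswapd, count_user);
		printf("idle-swap threshold: %ld seconds\n", idle_time);	/* 7 */
		return 0;
	}
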
+
+static int do_try_to_free_pages(unsigned int gfp_mask, int user)
+{
+ int ret = 0;
+
+ /*
+ * If we're low on free pages, move pages from the
+ * inactive_dirty list to the inactive_clean list.
+ *
+ * Usually bdflush will have pre-cleaned the pages
+ * before we get around to moving them to the other
+ * list, so this is a relatively cheap operation.
+ */
+ if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +
+ nr_inactive_clean_pages())
+ ret += page_launder(gfp_mask, user);
+
+ /*
+ * If needed, we move pages from the active list
+ * to the inactive list. We also "eat" pages from
+ * the inode and dentry cache whenever we do this.
+ */
+ if (free_shortage() || inactive_shortage()) {
+ shrink_dcache_memory(6, gfp_mask);
+ shrink_icache_memory(6, gfp_mask);
+ ret += refill_inactive(gfp_mask, user);
+ } else {
+ /*
+ * Reclaim unused slab cache memory.
+ */
+ kmem_cache_reap(gfp_mask);
+ ret = 1;
+ }
+
+ return ret;
}
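
The decision order in do_try_to_free_pages() can be modelled as a tiny standalone program; the inputs below are invented shortage values, not real kernel state:

	#include <stdio.h>

	/* Toy model of the decision order; the inputs are invented values. */
	static void free_pages_model(int free_short, int inact_short,
				     int inactive_dirty, int free, int inactive_clean)
	{
		if (free_short || inactive_dirty > free + inactive_clean)
			puts("page_launder(): move pre-cleaned dirty pages across");
		if (free_short || inact_short)
			puts("shrink d/i caches, then refill_inactive()");
		else
			puts("no shortage: just kmem_cache_reap()");
	}

	int main(void)
	{
		/* Plenty of memory: only the slab caches get trimmed. */
		free_pages_model(0, 0, 100, 5000, 1000);
		/* Under pressure: laundering and refilling both run. */
		free_pages_model(800, 1200, 3000, 200, 100);
		return 0;
	}
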
DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
+DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
+struct task_struct *kswapd_task;
/*
* The background pageout daemon, started as a kernel thread
@@ -584,6 +1032,7 @@ int kswapd(void *unused)
tsk->pgrp = 1;
strcpy(tsk->comm, "kswapd");
sigfillset(&tsk->blocked);
+ kswapd_task = tsk;
/*
* Tell the memory management that we're a "memory allocator",
@@ -599,54 +1048,166 @@ int kswapd(void *unused)
*/
tsk->flags |= PF_MEMALLOC;
+ /*
+ * Kswapd main loop.
+ */
for (;;) {
- if (!keep_kswapd_awake()) {
- interruptible_sleep_on(&kswapd_wait);
+ static int recalc = 0;
+
+ /* If needed, try to free some memory. */
+ if (inactive_shortage() || free_shortage()) {
+ int wait = 0;
+ /* Do we need to do some synchronous flushing? */
+ if (waitqueue_active(&kswapd_done))
+ wait = 1;
+ do_try_to_free_pages(GFP_KSWAPD, wait);
+ }
+
+ /*
+ * Do some (very minimal) background scanning. This
+ * will scan all pages on the active list once
+ * every minute. This clears old referenced bits
+ * and moves unused pages to the inactive list.
+ */
+ refill_inactive_scan(6, 0);
+
+ /* Once a second, recalculate some VM stats. */
+ if (time_after(jiffies, recalc + HZ)) {
+ recalc = jiffies;
+ recalculate_vm_stats();
}
- do_try_to_free_pages(GFP_KSWAPD);
+ /*
+ * Wake up everybody waiting for free memory
+ * and unplug the disk queue.
+ */
+ wake_up_all(&kswapd_done);
+ run_task_queue(&tq_disk);
+
+ /*
+ * We go to sleep if either the free page shortage
+ * or the inactive page shortage is gone. We do this
+ * because:
+ * 1) we need no more free pages or
+ * 2) the inactive pages need to be flushed to disk,
+ * it wouldn't help to eat CPU time now ...
+ *
+ * We go to sleep for one second, but if it's needed
+ * we'll be woken up earlier...
+ */
+ if (!free_shortage() || !inactive_shortage())
+ interruptible_sleep_on_timeout(&kswapd_wait, HZ);
+ /*
+ * TODO: insert out of memory check & oom killer
+ * invocation in an else branch here.
+ */
}
}
+void wakeup_kswapd(int block)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ if (current == kswapd_task)
+ return;
+
+ if (!block) {
+ if (waitqueue_active(&kswapd_wait))
+ wake_up(&kswapd_wait);
+ return;
+ }
+
+ /*
+ * Kswapd could wake us up before we get a chance
+ * to sleep, so we have to be very careful here to
+ * prevent SMP races...
+ */
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue(&kswapd_done, &wait);
+
+ if (waitqueue_active(&kswapd_wait))
+ wake_up(&kswapd_wait);
+ schedule();
+
+ remove_wait_queue(&kswapd_done, &wait);
+ __set_current_state(TASK_RUNNING);
+}
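
wakeup_kswapd() registers the caller on kswapd_done and sets its task state before kicking kswapd, so the wakeup from wake_up_all() cannot slip in between "wake kswapd" and "go to sleep". A loose userspace analogue of the same lost-wakeup problem, using a pthread condition variable rather than the kernel's wait queues (the worker thread stands in for kswapd):

	/* Only an analogy: the requester takes the lock and re-checks the
	 * predicate before sleeping, so the completion cannot be missed. */
	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
	static int work_done;

	static void *worker(void *unused)
	{
		(void)unused;
		pthread_mutex_lock(&lock);
		work_done = 1;			/* "kswapd freed some memory" */
		pthread_cond_broadcast(&done);	/* wake_up_all(&kswapd_done)   */
		pthread_mutex_unlock(&lock);
		return NULL;
	}

	int main(void)
	{
		pthread_t tid;

		pthread_mutex_lock(&lock);	/* register interest first...  */
		pthread_create(&tid, NULL, worker, NULL);
		while (!work_done)		/* ...then sleep; the wakeup   */
			pthread_cond_wait(&done, &lock);	/* can't be lost */
		pthread_mutex_unlock(&lock);

		pthread_join(tid, NULL);
		puts("woken after the work was done");
		return 0;
	}
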
+
/*
* Called by non-kswapd processes when they want more
- * memory.
- *
- * In a perfect world, this should just wake up kswapd
- * and return. We don't actually want to swap stuff out
- * from user processes, because the locking issues are
- * nasty to the extreme (file write locks, and MM locking)
- *
- * One option might be to let kswapd do all the page-out
- * and VM page table scanning that needs locking, and this
- * process thread could do just the mmap shrink stage that
- * can be done by just dropping cached pages without having
- * any deadlock issues.
+ * memory but are unable to sleep on kswapd because
+ * they might be holding some IO locks ...
*/
int try_to_free_pages(unsigned int gfp_mask)
{
- int retval = 1;
+ int ret = 1;
if (gfp_mask & __GFP_WAIT) {
- current->state = TASK_RUNNING;
current->flags |= PF_MEMALLOC;
- retval = do_try_to_free_pages(gfp_mask);
+ ret = do_try_to_free_pages(gfp_mask, 1);
current->flags &= ~PF_MEMALLOC;
}
- /* someone needed memory that kswapd had not provided
- * make sure kswapd runs, should not happen often */
- if (waitqueue_active(&kswapd_wait))
- wake_up_interruptible(&kswapd_wait);
+ return ret;
+}
+
+DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
+/*
+ * Kreclaimd will move pages from the inactive_clean list to the
+ * free list, in order to keep atomic allocations possible under
+ * all circumstances, even when kswapd is blocked on IO.
+ */
+int kreclaimd(void *unused)
+{
+ struct task_struct *tsk = current;
+ pg_data_t *pgdat;
- return retval;
+ tsk->session = 1;
+ tsk->pgrp = 1;
+ strcpy(tsk->comm, "kreclaimd");
+ sigfillset(&tsk->blocked);
+ current->flags |= PF_MEMALLOC;
+
+ while (1) {
+
+ /*
+ * We sleep until someone wakes us up from
+ * page_alloc.c::__alloc_pages().
+ */
+ interruptible_sleep_on(&kreclaimd_wait);
+
+ /*
+ * Move some pages from the inactive_clean lists to
+ * the free lists, if it is needed.
+ */
+ pgdat = pgdat_list;
+ do {
+ int i;
+ for(i = 0; i < MAX_NR_ZONES; i++) {
+ zone_t *zone = pgdat->node_zones + i;
+ if (!zone->size)
+ continue;
+
+ while (zone->free_pages < zone->pages_low) {
+ struct page * page;
+ page = reclaim_page(zone);
+ if (!page)
+ break;
+ __free_page(page);
+ }
+ }
+ pgdat = pgdat->node_next;
+ } while (pgdat);
+ }
}
+
static int __init kswapd_init(void)
{
- printk("Starting kswapd v1.7\n");
+ printk("Starting kswapd v1.8\n");
swap_setup();
- kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+ kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+ kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
return 0;
}