author     Ralf Baechle <ralf@linux-mips.org>  2000-05-12 21:05:59 +0000
committer  Ralf Baechle <ralf@linux-mips.org>  2000-05-12 21:05:59 +0000
commit     ba2dacab305c598cd4c34a604f8e276bf5bab5ff (patch)
tree       78670a0139bf4d5ace617b29b7eba82bbc74d602 /mm
parent     b77bf69998121e689c5e86cc5630d39a0a9ee6ca (diff)
Merge with Linux 2.3.99-pre7 and various other bits.
Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c    | 208
-rw-r--r--  mm/highmem.c    |  17
-rw-r--r--  mm/memory.c     |  16
-rw-r--r--  mm/page_alloc.c | 144
-rw-r--r--  mm/page_io.c    |   7
-rw-r--r--  mm/slab.c       |   3
-rw-r--r--  mm/swap_state.c |  12
-rw-r--r--  mm/swapfile.c   |  75
-rw-r--r--  mm/vmscan.c     | 151
9 files changed, 298 insertions, 335 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index d0df8bd2c..acafb3353 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -46,7 +46,7 @@ unsigned int page_hash_bits;
 struct page **page_hash_table;
 
 struct list_head lru_cache;
 
-spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
 /*
  * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
  *	 the pagemap_lru_lock held.
@@ -67,7 +67,7 @@ void __add_page_to_hash_queue(struct page * page, struct page **p)
 		PAGE_BUG(page);
 }
 
-static void remove_page_from_hash_queue(struct page * page)
+static inline void remove_page_from_hash_queue(struct page * page)
 {
 	if(page->pprev_hash) {
 		if(page->next_hash)
@@ -92,47 +92,71 @@ static inline int sync_page(struct page *page)
  * sure the page is locked and that nobody else uses it - or that usage
  * is safe.
  */
+static inline void __remove_inode_page(struct page *page)
+{
+	remove_page_from_inode_queue(page);
+	remove_page_from_hash_queue(page);
+	page->mapping = NULL;
+}
+
 void remove_inode_page(struct page *page)
 {
 	if (!PageLocked(page))
 		PAGE_BUG(page);
 
-	/* Initiate completion of any async operations */
-	sync_page(page);
-
 	spin_lock(&pagecache_lock);
-	remove_page_from_inode_queue(page);
-	remove_page_from_hash_queue(page);
-	page->mapping = NULL;
+	__remove_inode_page(page);
 	spin_unlock(&pagecache_lock);
 }
 
+#define ITERATIONS 100
+
 void invalidate_inode_pages(struct inode * inode)
 {
 	struct list_head *head, *curr;
 	struct page * page;
+	int count;
 
-repeat:
 	head = &inode->i_mapping->pages;
-	spin_lock(&pagecache_lock);
-	curr = head->next;
-	while (curr != head) {
-		page = list_entry(curr, struct page, list);
-		curr = curr->next;
+	while (head != head->next) {
+		spin_lock(&pagecache_lock);
+		spin_lock(&pagemap_lru_lock);
+		head = &inode->i_mapping->pages;
+		curr = head->next;
+		count = 0;
 
-		/* We cannot invalidate a locked page */
-		if (TryLockPage(page))
-			continue;
-		spin_unlock(&pagecache_lock);
+		while ((curr != head) && (count++ < ITERATIONS)) {
+			page = list_entry(curr, struct page, list);
+			curr = curr->next;
 
-		lru_cache_del(page);
-		remove_inode_page(page);
-		UnlockPage(page);
-		page_cache_release(page);
-		goto repeat;
+			/* We cannot invalidate a locked page */
+			if (TryLockPage(page))
+				continue;
+
+			__lru_cache_del(page);
+			__remove_inode_page(page);
+			UnlockPage(page);
+			page_cache_release(page);
+		}
+
+		/* At this stage we have passed through the list
+		 * once, and there may still be locked pages. */
+
+		if (head->next!=head) {
+			page = list_entry(head->next, struct page, list);
+			get_page(page);
+			spin_unlock(&pagemap_lru_lock);
+			spin_unlock(&pagecache_lock);
+			/* We need to block */
+			lock_page(page);
+			UnlockPage(page);
+			page_cache_release(page);
+		} else {
+			spin_unlock(&pagemap_lru_lock);
+			spin_unlock(&pagecache_lock);
+		}
 	}
-	spin_unlock(&pagecache_lock);
 }
 
 /*
@@ -163,10 +187,10 @@ repeat:
 		/* page wholly truncated - free it */
 		if (offset >= start) {
 			if (TryLockPage(page)) {
-				spin_unlock(&pagecache_lock);
 				get_page(page);
+				spin_unlock(&pagecache_lock);
 				wait_on_page(page);
-				put_page(page);
+				page_cache_release(page);
 				goto repeat;
 			}
 			get_page(page);
@@ -236,57 +260,47 @@ repeat:
 	spin_unlock(&pagecache_lock);
 }
 
-int shrink_mmap(int priority, int gfp_mask, zone_t *zone)
+int shrink_mmap(int priority, int gfp_mask)
 {
-	int ret = 0, loop = 0, count;
-	LIST_HEAD(young);
+	int ret = 0, count;
 	LIST_HEAD(old);
-	LIST_HEAD(forget);
 	struct list_head * page_lru, * dispose;
 	struct page * page = NULL;
-	struct zone_struct * p_zone;
-	int maxloop = 256 >> priority;
 
-	if (!zone)
-		BUG();
-
-	count = nr_lru_pages >> priority;
-	if (!count)
-		return ret;
+	count = nr_lru_pages / (priority + 1);
 
-	spin_lock(&pagemap_lru_lock);
-again:
 	/* we need pagemap_lru_lock for list_del() ... subtle code below */
+	spin_lock(&pagemap_lru_lock);
 	while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
 		page = list_entry(page_lru, struct page, lru);
 		list_del(page_lru);
-		p_zone = page->zone;
 
-		/*
-		 * These two tests are there to make sure we don't free too
-		 * many pages from the "wrong" zone. We free some anyway,
-		 * they are the least recently used pages in the system.
-		 * When we don't free them, leave them in &old.
-		 */
-		dispose = &old;
-		if (p_zone != zone && (loop > (maxloop / 4) ||
-				p_zone->free_pages > p_zone->pages_high))
+		dispose = &lru_cache;
+		if (PageTestandClearReferenced(page))
 			goto dispose_continue;
 
-		/* The page is in use, or was used very recently, put it in
-		 * &young to make sure that we won't try to free it the next
-		 * time */
-		dispose = &young;
+		count--;
 
-		if (test_and_clear_bit(PG_referenced, &page->flags))
-			goto dispose_continue;
+		/*
+		 * I'm ambivalent on this one.. Should we try to
+		 * maintain LRU on the LRU list, and put pages that
+		 * are old at the end of the queue, even if that
		 * means that we'll re-scan then again soon and
+		 * often waste CPU time? Or should be just let any
+		 * pages we do not want to touch now for one reason
+		 * or another percolate to be "young"?
+		 *
+		dispose = &old;
+		 *
+		 */
 
-		count--;
+		/*
+		 * Avoid unscalable SMP locking for pages we can
+		 * immediate tell are untouchable..
+		 */
 		if (!page->buffers && page_count(page) > 1)
 			goto dispose_continue;
 
-		/* Page not used -> free it; if that fails -> &old */
-		dispose = &old;
 		if (TryLockPage(page))
 			goto dispose_continue;
 
@@ -300,7 +314,10 @@ again:
 		/* avoid freeing the page while it's locked */
 		get_page(page);
 
-		/* Is it a buffer page? */
+		/*
+		 * Is it a buffer page? Try to clean it up regardless
+		 * of zone - it's old.
+		 */
 		if (page->buffers) {
 			if (!try_to_free_buffers(page))
 				goto unlock_continue;
@@ -335,19 +352,23 @@ again:
 			goto made_inode_progress;
 		}
 
+		/*
+		 * Page is from a zone we don't care about.
+		 * Don't drop page cache entries in vain.
+		 */
+		if (page->zone->free_pages > page->zone->pages_high)
+			goto cache_unlock_continue;
+
 		/* is it a page-cache page? */
 		if (page->mapping) {
 			if (!PageDirty(page) && !pgcache_under_min()) {
-				remove_page_from_inode_queue(page);
-				remove_page_from_hash_queue(page);
-				page->mapping = NULL;
+				__remove_inode_page(page);
 				spin_unlock(&pagecache_lock);
 				goto made_inode_progress;
 			}
 			goto cache_unlock_continue;
 		}
 
-		dispose = &forget;
 		printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
 
 cache_unlock_continue:
@@ -356,10 +377,6 @@ unlock_continue:
 		spin_lock(&pagemap_lru_lock);
 		UnlockPage(page);
 		put_page(page);
-		list_add(page_lru, dispose);
-		continue;
-
-		/* we're holding pagemap_lru_lock, so we can just loop again */
 dispose_continue:
 		list_add(page_lru, dispose);
 	}
@@ -375,13 +392,7 @@ made_buffer_progress:
 	/* nr_lru_pages needs the spinlock */
 	nr_lru_pages--;
 
-	loop++;
-	/* wrong zone? not looped too often? roll again... */
-	if (page->zone != zone && loop < maxloop)
-		goto again;
-
 out:
-	list_splice(&young, &lru_cache);
 	list_splice(&old, lru_cache.prev);
 
 	spin_unlock(&pagemap_lru_lock);
@@ -403,7 +414,7 @@ inside:
 		if (page->index == offset)
 			break;
 	}
-	set_bit(PG_referenced, &page->flags);
+	SetPageReferenced(page);
 
 not_found:
 	return page;
 }
@@ -495,6 +506,26 @@ int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsig
 }
 
 /*
+ * Add a page to the inode page cache.
+ *
+ * The caller must have locked the page and
+ * set all the page flags correctly..
+ */
+void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
+{
+	if (!PageLocked(page))
+		BUG();
+
+	get_page(page);
+	spin_lock(&pagecache_lock);
+	page->index = index;
+	add_page_to_inode_queue(mapping, page);
+	__add_page_to_hash_queue(page, page_hash(mapping, index));
+	lru_cache_add(page);
+	spin_unlock(&pagecache_lock);
+}
+
+/*
  * This adds a page to the page cache, starting out as locked,
 * owned by us, referenced, but not uptodate and with no errors.
 */
@@ -569,7 +600,7 @@ static inline int page_cache_read(struct file * file, unsigned long offset)
 		return -ENOMEM;
 
 	if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
-		int error = mapping->a_ops->readpage(file->f_dentry, page);
+		int error = mapping->a_ops->readpage(file, page);
 		page_cache_release(page);
 		return error;
 	}
@@ -1104,7 +1135,7 @@ page_not_up_to_date:
 readpage:
 		/* ... and start the actual read. The read will unlock the page. */
-		error = mapping->a_ops->readpage(filp->f_dentry, page);
+		error = mapping->a_ops->readpage(filp, page);
 
 		if (!error) {
 			if (Page_Uptodate(page))
@@ -1486,7 +1517,7 @@ page_not_uptodate:
 		goto success;
 	}
 
-	if (!mapping->a_ops->readpage(file->f_dentry, page)) {
+	if (!mapping->a_ops->readpage(file, page)) {
 		wait_on_page(page);
 		if (Page_Uptodate(page))
 			goto success;
@@ -1504,7 +1535,7 @@ page_not_uptodate:
 		goto success;
 	}
 	ClearPageError(page);
-	if (!mapping->a_ops->readpage(file->f_dentry, page)) {
+	if (!mapping->a_ops->readpage(file, page)) {
 		wait_on_page(page);
 		if (Page_Uptodate(page))
 			goto success;
@@ -1519,27 +1550,16 @@ page_not_uptodate:
 }
 
 static int filemap_write_page(struct file *file,
-			      unsigned long index,
 			      struct page * page,
 			      int wait)
 {
-	int result;
-	struct dentry * dentry;
-	struct inode * inode;
-
-	dentry = file->f_dentry;
-	inode = dentry->d_inode;
-
 	/*
 	 * If a task terminates while we're swapping the page, the vma and
 	 * and file could be released: try_to_swap_out has done a get_file.
 	 * vma/file is guaranteed to exist in the unmap/sync cases because
 	 * mmap_sem is held.
 	 */
-	lock_page(page);
-	result = inode->i_mapping->a_ops->writepage(file, dentry, page);
-	UnlockPage(page);
-	return result;
+	return page->mapping->a_ops->writepage(file, page);
 }
 
@@ -1551,7 +1571,7 @@ static int filemap_write_page(struct file *file,
 extern void wakeup_bdflush(int);
 int filemap_swapout(struct page * page, struct file * file)
 {
-	int retval = filemap_write_page(file, page->index, page, 0);
+	int retval = filemap_write_page(file, page, 0);
 	wakeup_bdflush(0);
 	return retval;
 }
@@ -1597,7 +1617,9 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
 		printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
 			pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
 	}
-	error = filemap_write_page(vma->vm_file, pgoff, page, 1);
+	lock_page(page);
+	error = filemap_write_page(vma->vm_file, page, 1);
+	UnlockPage(page);
 	page_cache_free(page);
 	return error;
 }
diff --git a/mm/highmem.c b/mm/highmem.c
index 691e3df1f..3e028dced 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -24,8 +24,12 @@
 unsigned long highmem_mapnr;
 
+/*
+ * Take one locked page, return another low-memory locked page.
+ */
 struct page * prepare_highmem_swapout(struct page * page)
 {
+	struct page *new_page;
 	unsigned long regular_page;
 	unsigned long vaddr;
 	/*
@@ -36,6 +40,14 @@ struct page * prepare_highmem_swapout(struct page * page)
 	if (!PageHighMem(page))
 		return page;
 
+	/*
+	 * Here we break the page lock, and we split the
+	 * dirty page into two. We can unlock the old page,
+	 * and we'll now have two of them. Too bad, it would
+	 * have been nice to continue to potentially share
+	 * across a fork().
+	 */
+	UnlockPage(page);
 	regular_page = __get_free_page(GFP_ATOMIC);
 	if (!regular_page)
 		return NULL;
@@ -49,8 +61,9 @@ struct page * prepare_highmem_swapout(struct page * page)
 	 * we stored its data into the new regular_page.
 	 */
 	__free_page(page);
-
-	return mem_map + MAP_NR(regular_page);
+	new_page = mem_map + MAP_NR(regular_page);
+	LockPage(new_page);
+	return new_page;
 }
 
 struct page * replace_with_highmem(struct page * page)
diff --git a/mm/memory.c b/mm/memory.c
index 84ecb57b5..f0baed69f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -824,7 +824,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 	map_nr = pte_pagenr(pte);
 	if (map_nr >= max_mapnr)
 		goto bad_wp_page;
-	mm->min_flt++;
 	old_page = mem_map + map_nr;
 
 	/*
@@ -855,7 +854,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 			flush_cache_page(vma, address);
 			establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
 			spin_unlock(&mm->page_table_lock);
-			return 1;
+			return 1;	/* Minor fault */
 	}
 
 	/*
@@ -880,7 +879,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 	}
 	spin_unlock(&mm->page_table_lock);
 	__free_page(new_page);
-	return 1;
+	return 1;	/* Minor fault */
 
 bad_wp_page:
 	spin_unlock(&mm->page_table_lock);
@@ -1049,12 +1048,9 @@ static int do_swap_page(struct mm_struct * mm,
 	}
 
 	mm->rss++;
-	mm->min_flt++;
 	pte = mk_pte(page, vma->vm_page_prot);
 
-	SetPageSwapEntry(page);
-
 	/*
 	 * Freeze the "shared"ness of the page, ie page_count + swap_count.
 	 * Must lock page before transferring our swap count to already
@@ -1074,7 +1070,7 @@ static int do_swap_page(struct mm_struct * mm,
 	set_pte(page_table, pte);
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, address, pte);
-	return 1;
+	return 1;	/* Minor fault */
 }
 
 /*
@@ -1094,13 +1090,12 @@ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma,
 		clear_user_highpage(page, addr);
 		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 		mm->rss++;
-		mm->min_flt++;
 		flush_page_to_ram(page);
 	}
 	set_pte(page_table, entry);
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, addr, entry);
-	return 1;
+	return 1;	/* Minor fault */
 }
 
 /*
@@ -1133,7 +1128,6 @@ static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
 		return 0;
 	if (new_page == NOPAGE_OOM)
 		return -1;
-	++mm->maj_flt;
 	++mm->rss;
 	/*
 	 * This silly early PAGE_DIRTY setting removes a race
@@ -1156,7 +1150,7 @@ static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
 	set_pte(page_table, entry);
 
 	/* no need to invalidate: a not-present page shouldn't be cached */
 	update_mmu_cache(vma, address, entry);
-	return 1;
+	return 2;	/* Major fault */
 }
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ba5ba3013..c3ea96efc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -58,23 +58,6 @@ static int zone_balance_max[MAX_NR_ZONES] = { 255 , 255, 255, };
 */
 #define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->offset) || (((x)-mem_map) >= (zone)->offset+(zone)->size))
 
-#if 0
-
-static inline unsigned long classfree(zone_t *zone)
-{
-	unsigned long free = 0;
-	zone_t *z = zone->zone_pgdat->node_zones;
-
-	while (z != zone) {
-		free += z->free_pages;
-		z++;
-	}
-	free += zone->free_pages;
-	return(free);
-}
-
-#endif
-
 /*
 * Buddy system. Hairy. You really aren't expected to understand this
 *
@@ -227,67 +210,13 @@ static struct page * rmqueue(zone_t *zone, unsigned long order)
 	return NULL;
 }
 
-static int zone_balance_memory(zonelist_t *zonelist)
-{
-	int tried = 0, freed = 0;
-	zone_t **zone;
-	int gfp_mask = zonelist->gfp_mask;
-	extern wait_queue_head_t kswapd_wait;
-
-	zone = zonelist->zones;
-	for (;;) {
-		zone_t *z = *(zone++);
-		if (!z)
-			break;
-		if (z->free_pages > z->pages_low)
-			continue;
-
-		z->zone_wake_kswapd = 1;
-		wake_up_interruptible(&kswapd_wait);
-
-		/* Are we reaching the critical stage? */
-		if (!z->low_on_memory) {
-			/* Not yet critical, so let kswapd handle it.. */
-			if (z->free_pages > z->pages_min)
-				continue;
-			z->low_on_memory = 1;
-		}
-		/*
-		 * In the atomic allocation case we only 'kick' the
-		 * state machine, but do not try to free pages
-		 * ourselves.
-		 */
-		tried = 1;
-		freed |= try_to_free_pages(gfp_mask, z);
-	}
-	if (tried && !freed) {
-		if (!(gfp_mask & __GFP_HIGH))
-			return 0;
-	}
-	return 1;
-}
-
 /*
 * This is the 'heart' of the zoned buddy allocator:
 */
 struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
 {
 	zone_t **zone = zonelist->zones;
-	int gfp_mask = zonelist->gfp_mask;
-	static int low_on_memory;
-
-	/*
-	 * If this is a recursive call, we'd better
-	 * do our best to just allocate things without
-	 * further thought.
-	 */
-	if (current->flags & PF_MEMALLOC)
-		goto allocate_ok;
-
-	/* If we're a memory hog, unmap some pages */
-	if (current->hog && low_on_memory &&
-	    (gfp_mask & __GFP_WAIT))
-		swap_out(4, gfp_mask);
+	extern wait_queue_head_t kswapd_wait;
 
 	/*
 	 * (If anyone calls gfp from interrupts nonatomically then it
@@ -304,38 +233,67 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
 			BUG();
 
 		/* Are we supposed to free memory? Don't make it worse.. */
-		if (!z->zone_wake_kswapd && z->free_pages > z->pages_low) {
+		if (!z->zone_wake_kswapd) {
 			struct page *page = rmqueue(z, order);
-			low_on_memory = 0;
+			if (z->free_pages < z->pages_low) {
+				z->zone_wake_kswapd = 1;
+				if (waitqueue_active(&kswapd_wait))
+					wake_up_interruptible(&kswapd_wait);
+			}
 			if (page)
 				return page;
 		}
 	}
 
-	low_on_memory = 1;
 	/*
-	 * Ok, no obvious zones were available, start
-	 * balancing things a bit..
+	 * Ok, we don't have any zones that don't need some
+	 * balancing.. See if we have any that aren't critical..
 	 */
-	if (zone_balance_memory(zonelist)) {
-		zone = zonelist->zones;
-allocate_ok:
-		for (;;) {
-			zone_t *z = *(zone++);
-			if (!z)
-				break;
-			if (z->free_pages) {
-				struct page *page = rmqueue(z, order);
-				if (page)
-					return page;
-			}
+	zone = zonelist->zones;
+	for (;;) {
+		zone_t *z = *(zone++);
+		if (!z)
+			break;
+		if (!z->low_on_memory) {
+			struct page *page = rmqueue(z, order);
+			if (z->free_pages < z->pages_min)
+				z->low_on_memory = 1;
+			if (page)
+				return page;
 		}
 	}
-	return NULL;
 
-/*
- * The main chunk of the balancing code is in this offline branch:
- */
+	/*
+	 * Uhhuh. All the zones have been critical, which means that
+	 * we'd better do some synchronous swap-out. kswapd has not
+	 * been able to cope..
+	 */
+	if (!(current->flags & PF_MEMALLOC)) {
+		int gfp_mask = zonelist->gfp_mask;
+		if (!try_to_free_pages(gfp_mask)) {
+			if (!(gfp_mask & __GFP_HIGH))
+				goto fail;
+		}
+	}
+
+	/*
+	 * Final phase: allocate anything we can!
+	 */
+	zone = zonelist->zones;
+	for (;;) {
+		struct page *page;
+
+		zone_t *z = *(zone++);
+		if (!z)
+			break;
+		page = rmqueue(z, order);
+		if (page)
+			return page;
+	}
+
+fail:
+	/* No luck.. */
+	return NULL;
 }
 
 /*
diff --git a/mm/page_io.c b/mm/page_io.c
index 23acf5af4..b2b6359d0 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -74,7 +74,7 @@ static int rw_swap_page_base(int rw, swp_entry_t entry, struct page *page, int w
 		return 0;
 	}
 	if (!wait) {
-		set_bit(PG_decr_after, &page->flags);
+		SetPageDecrAfter(page);
 		atomic_inc(&nr_async_pages);
 	}
@@ -132,6 +132,11 @@ void rw_swap_page_nolock(int rw, swp_entry_t entry, char *buf, int wait)
 		PAGE_BUG(page);
 	if (PageSwapCache(page))
 		PAGE_BUG(page);
+	if (page->mapping)
+		PAGE_BUG(page);
+	/* needs sync_page to wait I/O completation */
+	page->mapping = &swapper_space;
 	if (!rw_swap_page_base(rw, entry, page, wait))
 		UnlockPage(page);
+	page->mapping = NULL;
 }
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -687,6 +687,9 @@ kmem_cache_create(const char *name, size_t size, size_t offset,
 	size_t left_over;
 	size_t align;
 
+#if SLAB_DEBUG_SUPPORT
+	flags |= SLAB_POISON;
+#endif
 	/* Sanity checks... */
 #if SLAB_MGMT_CHECKS
 	if (!name) {
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 29ba0d78b..ad686e4c3 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -47,14 +47,20 @@ void show_swap_cache_info(void)
 
 void add_to_swap_cache(struct page *page, swp_entry_t entry)
 {
+	unsigned long flags;
+
 #ifdef SWAP_CACHE_INFO
 	swap_cache_add_total++;
 #endif
+	if (!PageLocked(page))
+		BUG();
 	if (PageTestandSetSwapCache(page))
 		BUG();
 	if (page->mapping)
 		BUG();
-	add_to_page_cache(page, &swapper_space, entry.val);
+	flags = page->flags & ~((1 << PG_error) | (1 << PG_dirty));
+	page->flags = flags | (1 << PG_referenced) | (1 << PG_uptodate);
+	add_to_page_cache_locked(page, &swapper_space, entry.val);
 }
 
 static inline void remove_from_swap_cache(struct page *page)
@@ -130,9 +136,6 @@ void free_page_and_swap_cache(struct page *page)
 		}
 		UnlockPage(page);
 	}
-
-	ClearPageSwapEntry(page);
-
 	__free_page(page);
 }
@@ -228,6 +231,7 @@ struct page * read_swap_cache_async(swp_entry_t entry, int wait)
 	/*
 	 * Add it to the swap cache and read its contents.
 	 */
+	lock_page(new_page);
 	add_to_swap_cache(new_page, entry);
 	rw_swap_page(READ, new_page, wait);
 	return new_page;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index da2dd9147..c5f8db242 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -200,49 +200,6 @@ bad_count:
 	goto out;
 }
 
-/* needs the big kernel lock */
-swp_entry_t acquire_swap_entry(struct page *page)
-{
-	struct swap_info_struct * p;
-	unsigned long offset, type;
-	swp_entry_t entry;
-
-	if (!PageSwapEntry(page))
-		goto new_swap_entry;
-
-	/* We have the old entry in the page offset still */
-	if (!page->index)
-		goto new_swap_entry;
-	entry.val = page->index;
-	type = SWP_TYPE(entry);
-	if (type >= nr_swapfiles)
-		goto new_swap_entry;
-	p = type + swap_info;
-	if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
-		goto new_swap_entry;
-	offset = SWP_OFFSET(entry);
-	if (offset >= p->max)
-		goto new_swap_entry;
-	/* Has it been re-used for something else? */
-	swap_list_lock();
-	swap_device_lock(p);
-	if (p->swap_map[offset])
-		goto unlock_new_swap_entry;
-
-	/* We're cool, we can just use the old one */
-	p->swap_map[offset] = 1;
-	swap_device_unlock(p);
-	nr_swap_pages--;
-	swap_list_unlock();
-	return entry;
-
-unlock_new_swap_entry:
-	swap_device_unlock(p);
-	swap_list_unlock();
-new_swap_entry:
-	return get_swap_page();
-}
-
 /*
 * The swap entry has been read in advance, and we return 1 to indicate
 * that the page has been used or is no longer needed.
@@ -443,8 +400,7 @@ static int try_to_unuse(unsigned int type)
 asmlinkage long sys_swapoff(const char * specialfile)
 {
 	struct swap_info_struct * p = NULL;
-	struct dentry * dentry;
-	struct vfsmount *mnt;
+	struct nameidata nd;
 	int i, type, prev;
 	int err;
@@ -452,9 +408,8 @@ asmlinkage long sys_swapoff(const char * specialfile)
 		return -EPERM;
 
 	lock_kernel();
-	dentry = namei(specialfile);
-	err = PTR_ERR(dentry);
-	if (IS_ERR(dentry))
+	err = user_path_walk(specialfile, &nd);
+	if (err)
 		goto out;
 
 	prev = -1;
@@ -463,11 +418,11 @@ asmlinkage long sys_swapoff(const char * specialfile)
 		p = swap_info + type;
 		if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
 			if (p->swap_file) {
-				if (p->swap_file == dentry)
+				if (p->swap_file == nd.dentry)
 					break;
 			} else {
-				if (S_ISBLK(dentry->d_inode->i_mode)
-				    && (p->swap_device == dentry->d_inode->i_rdev))
+				if (S_ISBLK(nd.dentry->d_inode->i_mode)
+				    && (p->swap_device == nd.dentry->d_inode->i_rdev))
 					break;
 			}
 		}
@@ -509,22 +464,21 @@ asmlinkage long sys_swapoff(const char * specialfile)
 		goto out_dput;
 	}
 	if (p->swap_device)
-		blkdev_put(dentry->d_inode->i_bdev, BDEV_SWAP);
-	dput(dentry);
+		blkdev_put(nd.dentry->d_inode->i_bdev, BDEV_SWAP);
+	path_release(&nd);
 
-	dentry = p->swap_file;
+	nd.dentry = p->swap_file;
 	p->swap_file = NULL;
-	mnt = p->swap_vfsmnt;
+	nd.mnt = p->swap_vfsmnt;
 	p->swap_vfsmnt = NULL;
 	p->swap_device = 0;
 	vfree(p->swap_map);
 	p->swap_map = NULL;
 	p->flags = 0;
 	err = 0;
-	mntput(mnt);
 
 out_dput:
-	dput(dentry);
+	path_release(&nd);
 out:
 	unlock_kernel();
 	return err;
@@ -637,8 +591,8 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
 	if (IS_ERR(name))
 		goto bad_swap_2;
 	error = 0;
-	if (walk_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
-		error = walk_name(name, &nd);
+	if (path_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
+		error = path_walk(name, &nd);
 	putname(name);
 	if (error)
 		goto bad_swap_2;
@@ -835,8 +789,7 @@ bad_swap_2:
 	p->flags = 0;
 	if (!(swap_flags & SWAP_FLAG_PREFER))
 		++least_priority;
-	dput(nd.dentry);
-	mntput(nd.mnt);
+	path_release(&nd);
 out:
 	if (swap_header)
 		free_page((long) swap_header);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 691d47f18..2c07830d0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,7 +48,6 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un
 	if ((page-mem_map >= max_mapnr) || PageReserved(page))
 		goto out_failed;
 
-	mm->swap_cnt--;
 	/* Don't look at this pte if it's been accessed recently. */
 	if (pte_young(pte)) {
 		/*
@@ -56,11 +55,11 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un
 		 * tables to the global page map.
 		 */
 		set_pte(page_table, pte_mkold(pte));
-		set_bit(PG_referenced, &page->flags);
+		SetPageReferenced(page);
 		goto out_failed;
 	}
 
-	if (PageLocked(page))
+	if (TryLockPage(page))
 		goto out_failed;
 
 	/*
@@ -76,6 +75,8 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un
 			swap_duplicate(entry);
 			set_pte(page_table, swp_entry_to_pte(entry));
 drop_pte:
+			UnlockPage(page);
+			mm->swap_cnt--;
 			vma->vm_mm->rss--;
 			flush_tlb_page(vma, address);
 			__free_page(page);
@@ -107,7 +108,14 @@ drop_pte:
 	 * locks etc.
 	 */
 	if (!(gfp_mask & __GFP_IO))
-		goto out_failed;
+		goto out_unlock;
+
+	/*
+	 * Don't do any of the expensive stuff if
+	 * we're not really interested in this zone.
+	 */
+	if (page->zone->free_pages > page->zone->pages_high)
+		goto out_unlock;
 
 	/*
	 * Ok, it's really dirty. That means that
@@ -134,10 +142,12 @@ drop_pte:
 		struct file *file = vma->vm_file;
 		if (file) get_file(file);
 		pte_clear(page_table);
+		mm->swap_cnt--;
 		vma->vm_mm->rss--;
 		flush_tlb_page(vma, address);
 		vmlist_access_unlock(vma->vm_mm);
 		error = swapout(page, file);
+		UnlockPage(page);
 		if (file) fput(file);
 		if (!error)
 			goto out_free_success;
@@ -151,18 +161,20 @@ drop_pte:
 	 * we have the swap cache set up to associate the
 	 * page with that swap entry.
 	 */
-	entry = acquire_swap_entry(page);
+	entry = get_swap_page();
 	if (!entry.val)
-		goto out_failed;	/* No swap space left */
-
+		goto out_unlock;	/* No swap space left */
+
 	if (!(page = prepare_highmem_swapout(page)))
 		goto out_swap_free;
 
 	swap_duplicate(entry);	/* One for the process, one for the swap cache */
 
-	/* This will also lock the page */
+	/* Add it to the swap cache */
 	add_to_swap_cache(page, entry);
 
+	/* Put the swap entry into the pte after the page is in swapcache */
+	mm->swap_cnt--;
 	vma->vm_mm->rss--;
 	set_pte(page_table, swp_entry_to_pte(entry));
 	flush_tlb_page(vma, address);
@@ -178,7 +190,9 @@ out_swap_free:
 	swap_free(entry);
 out_failed:
 	return 0;
-
+out_unlock:
+	UnlockPage(page);
+	return 0;
 }
 
 /*
@@ -328,12 +342,11 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
 * N.B. This function returns only 0 or 1.  Return values != 1 from
 * the lower level routines result in continued processing.
 */
-int swap_out(unsigned int priority, int gfp_mask)
+static int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p;
 	int counter;
 	int __ret = 0;
-	int assign = 0;
 
 	lock_kernel();
 	/*
@@ -350,7 +363,7 @@ int swap_out(unsigned int priority, int gfp_mask)
 	 * Think of swap_cnt as a "shadow rss" - it tells us which process
 	 * we want to page out (always try largest first).
 	 */
-	counter = nr_threads / (priority+1);
+	counter = (nr_threads << 1) >> (priority >> 1);
 	if (counter < 1)
 		counter = 1;
@@ -358,12 +371,12 @@ int swap_out(unsigned int priority, int gfp_mask)
 		unsigned long max_cnt = 0;
 		struct mm_struct *best = NULL;
 		int pid = 0;
+		int assign = 0;
 	select:
 		read_lock(&tasklist_lock);
 		p = init_task.next_task;
 		for (; p != &init_task; p = p->next_task) {
 			struct mm_struct *mm = p->mm;
-			p->hog = 0;
 			if (!p->swappable || !mm)
 				continue;
 			if (mm->rss <= 0)
@@ -377,25 +390,6 @@ int swap_out(unsigned int priority, int gfp_mask)
 				pid = p->pid;
 			}
 		}
-		if (assign == 1) {
-			/* we just assigned swap_cnt, normalise values */
-			assign = 2;
-			p = init_task.next_task;
-			for (; p != &init_task; p = p->next_task) {
-				int i = 0;
-				struct mm_struct *mm = p->mm;
-				if (!p->swappable || !mm || mm->rss <= 0)
-					continue;
-				/* small processes are swapped out less */
-				while ((mm->swap_cnt << 2 * (i + 1) < max_cnt))
-					i++;
-				mm->swap_cnt >>= i;
-				mm->swap_cnt += i;	/* if swap_cnt reaches 0 */
-				/* we're big -> hog treatment */
-				if (!i)
-					p->hog = 1;
-			}
-		}
 		read_unlock(&tasklist_lock);
 		if (!best) {
 			if (!assign) {
@@ -429,22 +423,25 @@ out:
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
-* We want to try to free "count" pages, and we need to
-* cluster them so that we get good swap-out behaviour. See
-* the "free_memory()" macro for details.
+* We want to try to free "count" pages, and we want to
+* cluster them so that we get good swap-out behaviour.
+*
+* Don't try _too_ hard, though. We don't want to have bad
+* latency.
 */
-static int do_try_to_free_pages(unsigned int gfp_mask, zone_t *zone)
+#define FREE_COUNT	8
+#define SWAP_COUNT	8
+static int do_try_to_free_pages(unsigned int gfp_mask)
 {
 	int priority;
-	int count = SWAP_CLUSTER_MAX;
-	int ret;
+	int count = FREE_COUNT;
 
 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);
 
 	priority = 6;
 	do {
-		while ((ret = shrink_mmap(priority, gfp_mask, zone))) {
+		while (shrink_mmap(priority, gfp_mask)) {
 			if (!--count)
 				goto done;
 		}
@@ -457,27 +454,41 @@
 		 * shrink_mmap() almost never fail when there's
 		 * really plenty of memory free.
 		 */
-		count -= shrink_dcache_memory(priority, gfp_mask, zone);
-		count -= shrink_icache_memory(priority, gfp_mask, zone);
+		count -= shrink_dcache_memory(priority, gfp_mask);
+		count -= shrink_icache_memory(priority, gfp_mask);
 		if (count <= 0)
 			goto done;
 
-		while (shm_swap(priority, gfp_mask, zone)) {
+		while (shm_swap(priority, gfp_mask)) {
 			if (!--count)
 				goto done;
 		}
 	}
 
-	/* Then, try to page stuff out..
-	 * We use swapcount here because this doesn't actually
-	 * free pages */
-	while (swap_out(priority, gfp_mask)) {
-		if (!--count)
-			goto done;
+	/*
+	 * Then, try to page stuff out..
+	 *
+	 * This will not actually free any pages (they get
+	 * put in the swap cache), so we must not count this
+	 * as a "count" success.
+	 */
+	{
+		int swap_count = SWAP_COUNT;
+		while (swap_out(priority, gfp_mask))
+			if (--swap_count < 0)
+				break;
 	}
 	} while (--priority >= 0);
 
-done:
-	return priority >= 0;
+	/* Always end on a shrink_mmap.. */
+	while (shrink_mmap(0, gfp_mask)) {
+		if (!--count)
+			goto done;
+	}
+
+	return 0;
+
+done:
+	return 1;
 }
 
 DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
@@ -497,10 +508,7 @@ DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
 */
 int kswapd(void *unused)
 {
-	int i;
 	struct task_struct *tsk = current;
-	pg_data_t *pgdat;
-	zone_t *zone;
 
 	tsk->session = 1;
 	tsk->pgrp = 1;
@@ -521,27 +529,30 @@ int kswapd(void *unused)
 	 */
 	tsk->flags |= PF_MEMALLOC;
 
-	while (1) {
-		/*
-		 * If we actually get into a low-memory situation,
-		 * the processes needing more memory will wake us
-		 * up on a more timely basis.
-		 */
+	for (;;) {
+		pg_data_t *pgdat;
+		int something_to_do = 0;
+
 		pgdat = pgdat_list;
-		while (pgdat) {
-			for (i = 0; i < MAX_NR_ZONES; i++) {
-				zone = pgdat->node_zones + i;
+		do {
+			int i;
+			for(i = 0; i < MAX_NR_ZONES; i++) {
+				zone_t *zone = pgdat->node_zones+ i;
+				if (!zone->size || !zone->zone_wake_kswapd)
+					continue;
+				something_to_do = 1;
+				do_try_to_free_pages(GFP_KSWAPD);
 				if (tsk->need_resched)
 					schedule();
-				if ((!zone->size) || (!zone->zone_wake_kswapd))
-					continue;
-				do_try_to_free_pages(GFP_KSWAPD, zone);
 			}
+			run_task_queue(&tq_disk);
 			pgdat = pgdat->node_next;
+		} while (pgdat);
+
+		if (!something_to_do) {
+			tsk->state = TASK_INTERRUPTIBLE;
+			interruptible_sleep_on(&kswapd_wait);
 		}
-		run_task_queue(&tq_disk);
-		tsk->state = TASK_INTERRUPTIBLE;
-		interruptible_sleep_on(&kswapd_wait);
 	}
 }
@@ -560,13 +571,13 @@ int kswapd(void *unused)
 * can be done by just dropping cached pages without having
 * any deadlock issues.
 */
-int try_to_free_pages(unsigned int gfp_mask, zone_t *zone)
+int try_to_free_pages(unsigned int gfp_mask)
 {
 	int retval = 1;
 
 	if (gfp_mask & __GFP_WAIT) {
 		current->flags |= PF_MEMALLOC;
-		retval = do_try_to_free_pages(gfp_mask, zone);
+		retval = do_try_to_free_pages(gfp_mask);
 		current->flags &= ~PF_MEMALLOC;
 	}
 	return retval;