From 6d403070f28cd44860fdb3a53be5da0275c65cf4 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Mon, 19 Jun 2000 22:45:37 +0000
Subject: Merge with 2.4.0-test1-ac21 + pile of MIPS cleanups to make merging
 possible.  Chainsawed RM200 kernel to compile again.  Jazz machine status
 unknown.

---
 mm/filemap.c    | 194 ++++++++++++++++++++++++++++++++++++++------------------
 mm/memory.c     |   4 +-
 mm/mmap.c       |  29 +++++----
 mm/mremap.c     |   2 +-
 mm/page_alloc.c |  65 ++++++++++++++++---
 mm/slab.c       |  19 +-----
 mm/swap_state.c |   6 +-
 mm/swapfile.c   |  14 +---
 mm/vmscan.c     |  37 ++++++++---
 9 files changed, 244 insertions(+), 126 deletions(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index b1e2b8547..ba0048cb8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -56,6 +56,8 @@ spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
 #define CLUSTER_PAGES		(1 << page_cluster)
 #define CLUSTER_OFFSET(x)	(((x) >> page_cluster) << page_cluster)
 
+#define min(a,b)		((a) < (b) ? (a) : (b)) /* evals args twice */
+
 void __add_page_to_hash_queue(struct page * page, struct page **p)
 {
 	atomic_inc(&page_cache_size);
@@ -90,10 +92,16 @@ static inline int sync_page(struct page *page)
 /*
  * Remove a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe.
+ * is safe. The page must not have any buffers attached.
  */
 static inline void __remove_inode_page(struct page *page)
 {
+	if (!PageLocked(page))
+		PAGE_BUG(page);
+
+	if (page->buffers)
+		BUG();
+
 	remove_page_from_inode_queue(page);
 	remove_page_from_hash_queue(page);
 	page->mapping = NULL;
@@ -101,9 +109,6 @@ static inline void __remove_inode_page(struct page *page)
 
 void remove_inode_page(struct page *page)
 {
-	if (!PageLocked(page))
-		PAGE_BUG(page);
-
 	spin_lock(&pagecache_lock);
 	__remove_inode_page(page);
 	spin_unlock(&pagecache_lock);
@@ -114,16 +119,16 @@ void remove_inode_page(struct page *page)
  * @inode: the inode which pages we want to invalidate
  *
  * This function only removes the unlocked pages, if you want to
- * remove all the pages of one inode, you must call truncate_inode_pages.
+ * remove all the pages of one inode, you must call
+ * truncate_inode_pages.  This function is not supposed to be called
+ * by block based filesystems.
  */
-
 void invalidate_inode_pages(struct inode * inode)
 {
 	struct list_head *head, *curr;
 	struct page * page;
 
 	head = &inode->i_mapping->pages;
-
 	spin_lock(&pagecache_lock);
 	spin_lock(&pagemap_lru_lock);
 	curr = head->next;
@@ -135,20 +140,53 @@ void invalidate_inode_pages(struct inode * inode)
 		/* We cannot invalidate a locked page */
 		if (TryLockPage(page))
 			continue;
+		/* We _should not be called_ by block based filesystems */
+		if (page->buffers) 
+			BUG();
 
-		__lru_cache_del(page);
 		__remove_inode_page(page);
+		__lru_cache_del(page);
 		UnlockPage(page);
 		page_cache_release(page);
 	}
-
 	spin_unlock(&pagemap_lru_lock);
 	spin_unlock(&pagecache_lock);
 }
 
-/*
+static inline void truncate_partial_page(struct page *page, unsigned partial)
+{
+	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
+	/* Page tail cleared; flush any buffers from the same offset. */
+	if (page->buffers)
+		block_flushpage(page, partial);
+
+}
+
+static inline void truncate_complete_page(struct page *page)
+{
+	if (page->buffers)
+		block_destroy_buffers(page);
+	lru_cache_del(page);
+	
+	/*
+	 * We remove the page from the page cache _after_ we have
+	 * destroyed all buffer-cache references to it. Otherwise some
+	 * other process might think this inode page is not in the
+	 * page cache and create a buffer-cache alias to it, causing
+	 * all sorts of fun problems ...
+	 */
+	remove_inode_page(page);
+	page_cache_release(page);
+}
+
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
  * Truncate the page cache at a set offset, removing the pages
  * that are beyond that offset (and zeroing out partial pages).
+ * If any page is locked we wait for it to become unlocked.
  */
 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
 {
@@ -168,11 +206,10 @@ repeat:
 
 		page = list_entry(curr, struct page, list);
 		curr = curr->next;
-
 		offset = page->index;
 
-		/* page wholly truncated - free it */
-		if (offset >= start) {
+		/* Is this one of the pages to truncate? */
+		if ((offset >= start) || (partial && (offset + 1) == start)) {
 			if (TryLockPage(page)) {
 				page_cache_get(page);
 				spin_unlock(&pagecache_lock);
@@ -183,22 +220,14 @@ repeat:
 			page_cache_get(page);
 			spin_unlock(&pagecache_lock);
 
-			if (!page->buffers || block_flushpage(page, 0))
-				lru_cache_del(page);
-
-			/*
-			 * We remove the page from the page cache
-			 * _after_ we have destroyed all buffer-cache
-			 * references to it. Otherwise some other process
-			 * might think this inode page is not in the
-			 * page cache and creates a buffer-cache alias
-			 * to it causing all sorts of fun problems ...
-			 */
-			remove_inode_page(page);
+			if (partial && (offset + 1) == start) {
+				truncate_partial_page(page, partial);
+				partial = 0;
+			} else 
+				truncate_complete_page(page);
 
 			UnlockPage(page);
 			page_cache_release(page);
-			page_cache_release(page);
 
 			/*
 			 * We have done things without the pagecache lock,
@@ -209,38 +238,59 @@ repeat:
 			 */
 			goto repeat;
 		}
-		/*
-		 * there is only one partial page possible.
-		 */
-		if (!partial)
-			continue;
+	}
+	spin_unlock(&pagecache_lock);
+}
 
-		/* and it's the one preceeding the first wholly truncated page */
-		if ((offset + 1) != start)
-			continue;
+/**
+ * truncate_all_inode_pages - truncate *all* the pages
+ * @mapping: mapping to truncate
+ *
+ * Truncate all the inode pages.  If any page is locked we wait for it
+ * to become unlocked. This function can block.
+ */
+void truncate_all_inode_pages(struct address_space * mapping)
+{
+	struct list_head *head, *curr;
+	struct page * page;
+
+	head = &mapping->pages;
+repeat:
+	spin_lock(&pagecache_lock);
+	spin_lock(&pagemap_lru_lock);
+	curr = head->next;
+
+	while (curr != head) {
+		page = list_entry(curr, struct page, list);
+		curr = curr->next;
 
-		/* partial truncate, clear end of page */
 		if (TryLockPage(page)) {
+			page_cache_get(page);
+			spin_unlock(&pagemap_lru_lock);
 			spin_unlock(&pagecache_lock);
+			wait_on_page(page);
+			page_cache_release(page);
 			goto repeat;
 		}
-		page_cache_get(page);
-		spin_unlock(&pagecache_lock);
-
-		memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
-		if (page->buffers)
-			block_flushpage(page, partial);
-
-		partial = 0;
-
-		/*
-		 * we have dropped the spinlock so we have to
-		 * restart.
-		 */
+		if (page->buffers) {
+			page_cache_get(page);
+			spin_unlock(&pagemap_lru_lock);
+			spin_unlock(&pagecache_lock);
+			block_destroy_buffers(page);
+			remove_inode_page(page);
+			lru_cache_del(page);
+			page_cache_release(page);
+			UnlockPage(page);
+			page_cache_release(page);
+			goto repeat;
+		}
+		__lru_cache_del(page);
+		__remove_inode_page(page);
 		UnlockPage(page);
 		page_cache_release(page);
-		goto repeat;
 	}
+
+	spin_unlock(&pagemap_lru_lock);
 	spin_unlock(&pagecache_lock);
 }
 
@@ -264,7 +314,15 @@ int shrink_mmap(int priority, int gfp_mask)
 		page = list_entry(page_lru, struct page, lru);
 		list_del(page_lru);
 
-		if (PageTestandClearReferenced(page))
+		if (PageTestandClearReferenced(page)) {
+			page->age += PG_AGE_ADV;
+			if (page->age > PG_AGE_MAX)
+				page->age = PG_AGE_MAX;
+			goto dispose_continue;
+		}
+		page->age -= min(PG_AGE_DECL, page->age);
+
+		if (page->age)
 			goto dispose_continue;
 
 		count--;
@@ -322,17 +380,23 @@ int shrink_mmap(int priority, int gfp_mask)
 		 * were to be marked referenced..
 		 */
 		if (PageSwapCache(page)) {
-			spin_unlock(&pagecache_lock);
-			__delete_from_swap_cache(page);
-			goto made_inode_progress;
-		}	
-
-		/*
-		 * Page is from a zone we don't care about.
-		 * Don't drop page cache entries in vain.
-		 */
-		if (page->zone->free_pages > page->zone->pages_high)
+			if (!PageDirty(page)) {
+				spin_unlock(&pagecache_lock);
+				__delete_from_swap_cache(page);
+				goto made_inode_progress;
+			}
+			/* PageDeferswap -> we swap out the page now. */
+			if (gfp_mask & __GFP_IO) {
+				spin_unlock(&pagecache_lock);
+				/* Do NOT unlock the page ... brw_page does. */
+				ClearPageDirty(page);
+				rw_swap_page(WRITE, page, 0);
+				spin_lock(&pagemap_lru_lock);
+				page_cache_release(page);
+				goto dispose_continue;
+			}
 			goto cache_unlock_continue;
+		}
 
 		/* is it a page-cache page? */
 		if (page->mapping) {
@@ -1744,7 +1808,7 @@ static int msync_interval(struct vm_area_struct * vma,
 		if (!error && (flags & MS_SYNC)) {
 			struct file * file = vma->vm_file;
 			if (file && file->f_op && file->f_op->fsync)
-				error = file->f_op->fsync(file, file->f_dentry);
+				error = file->f_op->fsync(file, file->f_dentry, 1);
 		}
 		return error;
 	}
@@ -2483,7 +2547,7 @@ generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
 	if (count) {
 		remove_suid(inode);
 		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-		mark_inode_dirty(inode);
+		mark_inode_dirty_sync(inode);
 	}
 
 	while (count) {
@@ -2540,7 +2604,13 @@ unlock:
 	if (cached_page)
 		page_cache_free(cached_page);
 
+	/* For now, when the user asks for O_SYNC, we'll actually
+	 * provide O_DSYNC. */
+	if ((status >= 0) && (file->f_flags & O_SYNC))
+		status = generic_osync_inode(inode, 1); /* 1 means datasync */
+	
 	err = written ? written : status;
+
 out:
 	up(&inode->i_sem);
 	return err;
diff --git a/mm/memory.c b/mm/memory.c
index de7dc07f8..e2609758e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -847,7 +847,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 			UnlockPage(old_page);
 			break;
 		}
-		delete_from_swap_cache_nolock(old_page);
+		SetPageDirty(old_page);
 		UnlockPage(old_page);
 		/* FallThrough */
 	case 1:
@@ -1058,7 +1058,7 @@ static int do_swap_page(struct mm_struct * mm,
 	 */
 	lock_page(page);
 	swap_free(entry);
-	if (write_access && !is_page_shared(page)) {
+	if (write_access && !is_page_shared(page) && nr_free_highpages()) {
 		delete_from_swap_cache_nolock(page);
 		UnlockPage(page);
 		page = replace_with_highmem(page);
diff --git a/mm/mmap.c b/mm/mmap.c
index 8a81bfb20..9edabc02e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -166,6 +166,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned lon
 {
 	struct mm_struct * mm = current->mm;
 	struct vm_area_struct * vma;
+	int correct_wcount = 0;
 	int error;
 
 	if (file && (!file->f_op || !file->f_op->mmap))
@@ -296,26 +297,15 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned lon
 		goto free_vma;
 
 	if (file) {
-		int correct_wcount = 0;
 		if (vma->vm_flags & VM_DENYWRITE) {
-			if (atomic_read(&file->f_dentry->d_inode->i_writecount) > 0) {
-				error = -ETXTBSY;
+			error = deny_write_access(file);
+			if (error)
 				goto free_vma;
-			}
-	        	/* f_op->mmap might possibly sleep
-			 * (generic_file_mmap doesn't, but other code
-			 * might). In any case, this takes care of any
-			 * race that this might cause.
-			 */
-			atomic_dec(&file->f_dentry->d_inode->i_writecount);
 			correct_wcount = 1;
 		}
 		vma->vm_file = file;
 		get_file(file);
 		error = file->f_op->mmap(file, vma);
-		/* Fix up the count if necessary, then check for an error */
-		if (correct_wcount)
-			atomic_inc(&file->f_dentry->d_inode->i_writecount);
 		if (error)
 			goto unmap_and_free_vma;
 	} else if (flags & MAP_SHARED) {
@@ -330,6 +320,8 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned lon
 	addr = vma->vm_start; /* can addr have changed?? */
 	vmlist_modify_lock(mm);
 	insert_vm_struct(mm, vma);
+	if (correct_wcount)
+		atomic_inc(&file->f_dentry->d_inode->i_writecount);
 	merge_segments(mm, vma->vm_start, vma->vm_end);
 	vmlist_modify_unlock(mm);
 	
@@ -341,6 +333,8 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned lon
 	return addr;
 
 unmap_and_free_vma:
+	if (correct_wcount)
+		atomic_inc(&file->f_dentry->d_inode->i_writecount);
 	vma->vm_file = NULL;
 	fput(file);
 	/* Undo any partial mapping done by a device driver. */
@@ -692,9 +686,11 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
 	 * so release them, and unmap the page range..
 	 * If the one of the segments is only being partially unmapped,
 	 * it will put new vm_area_struct(s) into the address space.
+	 * In that case we have to be careful with VM_DENYWRITE.
 	 */
 	while ((mpnt = free) != NULL) {
 		unsigned long st, end, size;
+		struct file *file = NULL;
 
 		free = free->vm_next;
 
@@ -706,6 +702,11 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
 		if (mpnt->vm_ops && mpnt->vm_ops->unmap)
 			mpnt->vm_ops->unmap(mpnt, st, size);
 
+		if (mpnt->vm_flags & VM_DENYWRITE &&
+		    (st != mpnt->vm_start || end != mpnt->vm_end) &&
+		    (file = mpnt->vm_file) != NULL) {
+			atomic_dec(&file->f_dentry->d_inode->i_writecount);
+		}
 		remove_shared_vm_struct(mpnt);
 		mm->map_count--;
 
@@ -717,6 +718,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
 		 * Fix the mapping, and free the old area if it wasn't reused.
 		 */
 		extra = unmap_fixup(mm, mpnt, st, size, extra);
+		if (file)
+			atomic_inc(&file->f_dentry->d_inode->i_writecount);
 	}
 
 	/* Release the extra vma struct if it wasn't used */
diff --git a/mm/mremap.c b/mm/mremap.c
index 0404dd795..a48125178 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -144,7 +144,7 @@ static inline unsigned long move_vma(struct vm_area_struct * vma,
 			vmlist_modify_lock(current->mm);
 			insert_vm_struct(current->mm, new_vma);
 			merge_segments(current->mm, new_vma->vm_start, new_vma->vm_end);
-			vmlist_modify_unlock(vma->vm_mm);
+			vmlist_modify_unlock(current->mm);
 			do_munmap(current->mm, addr, old_len);
 			current->mm->total_vm += new_len >> PAGE_SHIFT;
 			if (new_vma->vm_flags & VM_LOCKED) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 926364499..4766127b2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -29,7 +29,7 @@ int nr_lru_pages;
 pg_data_t *pgdat_list;
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
-static int zone_balance_ratio[MAX_NR_ZONES] = { 128, 128, 128, };
+static int zone_balance_ratio[MAX_NR_ZONES] = { 128, 128, 512, };
 static int zone_balance_min[MAX_NR_ZONES] = { 10 , 10, 10, };
 static int zone_balance_max[MAX_NR_ZONES] = { 255 , 255, 255, };
 
@@ -93,6 +93,8 @@ void __free_pages_ok (struct page *page, unsigned long order)
 		BUG();
 	if (PageDecrAfter(page))
 		BUG();
+	if (PageDirty(page))
+		BUG();
 
 	zone = page->zone;
 
@@ -139,10 +141,13 @@ void __free_pages_ok (struct page *page, unsigned long order)
 
 	spin_unlock_irqrestore(&zone->lock, flags);
 
-	if (zone->free_pages > zone->pages_high) {
-		zone->zone_wake_kswapd = 0;
+	if (zone->free_pages >= zone->pages_low) {
 		zone->low_on_memory = 0;
 	}
+
+	if (zone->free_pages >= zone->pages_high) {
+		zone->zone_wake_kswapd = 0;
+	}
 }
 
 #define MARK_USED(index, order, area) \
@@ -217,6 +222,9 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
 {
 	zone_t **zone = zonelist->zones;
 	extern wait_queue_head_t kswapd_wait;
+	static int last_woke_kswapd;
+	static int kswapd_pause = HZ;
+	int gfp_mask = zonelist->gfp_mask;
 
 	/*
 	 * (If anyone calls gfp from interrupts nonatomically then it
@@ -237,18 +245,34 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
 			struct page *page = rmqueue(z, order);
 			if (z->free_pages < z->pages_low) {
 				z->zone_wake_kswapd = 1;
-				if (waitqueue_active(&kswapd_wait))
-					wake_up_interruptible(&kswapd_wait);
 			}
 			if (page)
 				return page;
 		}
 	}
 
+	/*
+	 * Kswapd should be freeing enough memory to satisfy all allocations
+	 * immediately.  Calling try_to_free_pages from processes will slow
+	 * down the system a lot.  On the other hand, waking up kswapd too
+	 * often means wasted memory and cpu time.
+	 *
+	 * We tune the kswapd pause interval in such a way that kswapd is
+	 * always just aggressive enough to free the amount of memory we
+	 * want freed.
+	 */
+	if (waitqueue_active(&kswapd_wait) &&
+			time_after(jiffies, last_woke_kswapd + kswapd_pause)) {
+		kswapd_pause++;
+		last_woke_kswapd = jiffies;
+		wake_up_interruptible(&kswapd_wait);
+	}
+
 	/*
 	 * Ok, we don't have any zones that don't need some
 	 * balancing.. See if we have any that aren't critical..
 	 */
+again:
 	zone = zonelist->zones;
 	for (;;) {
 		zone_t *z = *(zone++);
@@ -256,20 +280,33 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
 			break;
 		if (!z->low_on_memory) {
 			struct page *page = rmqueue(z, order);
-			if (z->free_pages < z->pages_min)
+			if (z->free_pages < (z->pages_min + z->pages_low) / 2)
 				z->low_on_memory = 1;
 			if (page)
 				return page;
+		} else {
+			if (kswapd_pause > 0)
+				kswapd_pause--;
 		}
 	}
 
+	/* We didn't kick kswapd often enough... */
+	kswapd_pause /= 2;
+	if (waitqueue_active(&kswapd_wait))
+		wake_up_interruptible(&kswapd_wait);
+	/* If we're low priority, we just wait a bit and try again later. */
+	if ((gfp_mask & __GFP_WAIT) && current->need_resched &&
+				current->state == TASK_RUNNING) {
+		schedule();
+		goto again;
+	}
+
 	/*
 	 * Uhhuh. All the zones have been critical, which means that
 	 * we'd better do some synchronous swap-out. kswapd has not
 	 * been able to cope..
 	 */
 	if (!(current->flags & PF_MEMALLOC)) {
-		int gfp_mask = zonelist->gfp_mask;
 		if (!try_to_free_pages(gfp_mask)) {
 			if (!(gfp_mask & __GFP_HIGH))
 				goto fail;
@@ -277,7 +314,7 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
 	}
 
 	/*
-	 * Final phase: allocate anything we can!
+	 * We freed something, so we're allowed to allocate anything we can!
 	 */
 	zone = zonelist->zones;
 	for (;;) {
@@ -292,6 +329,18 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
 	}
 
 fail:
+	/* Last try, zone->low_on_memory isn't reset until we hit pages_low */
+	zone = zonelist->zones;
+	for (;;) {
+		zone_t *z = *(zone++);
+		if (!z)
+			break;
+		if (z->free_pages > z->pages_min) {
+			struct page *page = rmqueue(z, order);
+			if (page)
+				return page;
+		}
+	}
 	/* No luck.. */
 	return NULL;
 }
diff --git a/mm/slab.c b/mm/slab.c
index 64f33cb33..f3d04da8e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -106,11 +106,6 @@
 #include	<linux/slab.h>
 #include	<linux/interrupt.h>
 #include	<linux/init.h>
-#ifdef __mips__
-#include	<asm/pgtable.h>
-#include	<asm/addrspace.h>
-#include	<asm/pgalloc.h>
-#endif
 
 /* If there is a different PAGE_SIZE around, and it works with this allocator,
  * then change the following.
@@ -1691,19 +1686,11 @@ void *
 kmalloc(size_t size, int flags)
 {
 	cache_sizes_t	*csizep = cache_sizes;
-	unsigned long	addr;
 
 	for (; csizep->cs_size; csizep++) {
 		if (size > csizep->cs_size)
 			continue;
-		addr = __kmem_cache_alloc(csizep->cs_cachep, flags);
-#ifdef __mips__
-		if (addr && (flags & GFP_UNCACHED)) {
-			flush_cache_all(); /* Ouch ... */
-			addr = KSEG1ADDR(addr);
-		}
-#endif /* __mips__ */
-		return addr;
+		return __kmem_cache_alloc(csizep->cs_cachep, flags);
 	}
 	printk(KERN_ERR "kmalloc: Size (%lu) too large\n", (unsigned long) size);
 	return NULL;
@@ -1717,10 +1704,6 @@ kfree(const void *objp)
 
 	if (!objp)
 		goto null_ptr;
-#ifdef __mips__
-	if (KSEGX(objp) == KSEG1)
-		objp = KSEG0ADDR(objp);
-#endif __mips__
 	nr = MAP_NR(objp);
 	if (nr >= max_mapnr)
 		goto bad_ptr;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2405aba2f..87ecc0c10 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -73,6 +73,7 @@ static inline void remove_from_swap_cache(struct page *page)
 		PAGE_BUG(page);
 
 	PageClearSwapCache(page);
+	ClearPageDirty(page);
 	remove_inode_page(page);
 }
 
@@ -102,9 +103,10 @@ void delete_from_swap_cache_nolock(struct page *page)
 	if (!PageLocked(page))
 		BUG();
 
-	if (block_flushpage(page, 0))
-		lru_cache_del(page);
+	if (page->buffers)
+ 		block_destroy_buffers(page);
 
+	lru_cache_del(page);
 	__delete_from_swap_cache(page);
 	page_cache_release(page);
 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 55ef476a3..5d3a7f23e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -407,11 +407,11 @@ asmlinkage long sys_swapoff(const char * specialfile)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	lock_kernel();
 	err = user_path_walk(specialfile, &nd);
 	if (err)
 		goto out;
 
+	lock_kernel();
 	prev = -1;
 	swap_list_lock();
 	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
@@ -478,9 +478,9 @@ asmlinkage long sys_swapoff(const char * specialfile)
 	err = 0;
 
 out_dput:
+	unlock_kernel();
 	path_release(&nd);
 out:
-	unlock_kernel();
 	return err;
 }
 
@@ -555,7 +555,6 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
 	unsigned long maxpages;
 	int swapfilesize;
 	struct block_device *bdev = NULL;
-	char *name;
 	
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -586,14 +585,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
 	} else {
 		p->prio = --least_priority;
 	}
-	name = getname(specialfile);
-	error = PTR_ERR(name);
-	if (IS_ERR(name))
-		goto bad_swap_2;
-	error = 0;
-	if (path_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
-		error = path_walk(name, &nd);
-	putname(name);
+	error = user_path_walk(specialfile, &nd);
 	if (error)
 		goto bad_swap_2;
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1919c0961..597a1b093 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -62,6 +62,10 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un
 		goto out_failed;
 	}
 
+	/* Can only do this if we age all active pages. */
+	if (PageActive(page) && page->age > 1)
+		goto out_failed;
+
 	if (TryLockPage(page))
 		goto out_failed;
 
@@ -74,6 +78,8 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un
 	 * memory, and we should just continue our scan.
 	 */
 	if (PageSwapCache(page)) {
+		if (pte_dirty(pte))
+			SetPageDirty(page);
 		entry.val = page->index;
 		swap_duplicate(entry);
 		set_pte(page_table, swp_entry_to_pte(entry));
@@ -181,7 +187,10 @@ drop_pte:
 	vmlist_access_unlock(vma->vm_mm);
 
 	/* OK, do a physical asynchronous write to swap.  */
-	rw_swap_page(WRITE, page, 0);
+	// rw_swap_page(WRITE, page, 0);
+	/* Let shrink_mmap handle this swapout. */
+	SetPageDirty(page);
+	UnlockPage(page);
 
 out_free_success:
 	page_cache_release(page);
@@ -430,12 +439,12 @@ out:
  * latency.
  */
 #define FREE_COUNT	8
-#define SWAP_COUNT	16
 static int do_try_to_free_pages(unsigned int gfp_mask)
 {
 	int priority;
 	int count = FREE_COUNT;
-	int swap_count;
+	int swap_count = 0;
+	int ret = 0;
 
 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);
@@ -443,6 +452,7 @@ static int do_try_to_free_pages(unsigned int gfp_mask)
 	priority = 64;
 	do {
 		while (shrink_mmap(priority, gfp_mask)) {
+			ret = 1;
 			if (!--count)
 				goto done;
 		}
@@ -457,9 +467,12 @@ static int do_try_to_free_pages(unsigned int gfp_mask)
 			 */
 			count -= shrink_dcache_memory(priority, gfp_mask);
 			count -= shrink_icache_memory(priority, gfp_mask);
-			if (count <= 0)
+			if (count <= 0) {
+				ret = 1;
 				goto done;
+			}
 			while (shm_swap(priority, gfp_mask)) {
+				ret = 1;
 				if (!--count)
 					goto done;
 			}
@@ -471,24 +484,30 @@ static int do_try_to_free_pages(unsigned int gfp_mask)
 		 * This will not actually free any pages (they get
 		 * put in the swap cache), so we must not count this
 		 * as a "count" success.
+		 *
+		 * The amount we page out is the amount of pages we're
+		 * short freeing, amplified by the number of times we
+		 * failed above. This generates a negative feedback loop:
+		 * the more difficult it was to free pages, the easier we
+		 * will make it.
 		 */
-		swap_count = SWAP_COUNT;
-		while (swap_out(priority, gfp_mask))
+		swap_count += count;
+		while (swap_out(priority, gfp_mask)) {
 			if (--swap_count < 0)
 				break;
+		}
 
 	} while (--priority >= 0);
 
 	/* Always end on a shrink_mmap.. */
 	while (shrink_mmap(0, gfp_mask)) {
+		ret = 1;
 		if (!--count)
 			goto done;
 	}
-	/* We return 1 if we are freed some page */
-	return (count != FREE_COUNT);
 
 done:
-	return 1;
+	return ret;
 }
 
 DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
-- 
cgit v1.2.3