Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c     959
-rw-r--r--  mm/memory.c       26
-rw-r--r--  mm/mmap.c          6
-rw-r--r--  mm/page_alloc.c   26
-rw-r--r--  mm/page_io.c      53
-rw-r--r--  mm/swap_state.c  106
-rw-r--r--  mm/swapfile.c      2
-rw-r--r--  mm/vmscan.c        2
8 files changed, 797 insertions, 383 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 455f334f3..4e885758f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1,7 +1,7 @@
/*
* linux/mm/filemap.c
*
- * Copyright (C) 1994, 1995 Linus Torvalds
+ * Copyright (C) 1994-1999 Linus Torvalds
*/
/*
@@ -29,9 +29,12 @@
* though.
*
* Shared mappings now work. 15.8.1995 Bruno.
+ *
+ * finished 'unifying' the page and buffer cache and SMP-threaded the
+ * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
*/
-unsigned long page_cache_size = 0;
+atomic_t page_cache_size = ATOMIC_INIT(0);
struct page * page_hash_table[PAGE_HASH_SIZE];
/*
@@ -50,38 +53,97 @@ static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
static kmem_cache_t *pio_request_cache;
static DECLARE_WAIT_QUEUE_HEAD(pio_wait);
+spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
+
+
static inline void
make_pio_request(struct file *, unsigned long, unsigned long);
+void __add_page_to_hash_queue(struct page * page, struct page **p){
+ atomic_inc(&page_cache_size);
+ if((page->next_hash = *p) != NULL)
+ (*p)->pprev_hash = &page->next_hash;
+ *p = page;
+ page->pprev_hash = p;
+ if (page->buffers)
+ PAGE_BUG(page);
+}
+
+static void remove_page_from_hash_queue(struct page * page)
+{
+ if(page->pprev_hash) {
+ if(page->next_hash)
+ page->next_hash->pprev_hash = page->pprev_hash;
+ *page->pprev_hash = page->next_hash;
+ page->pprev_hash = NULL;
+ }
+ atomic_dec(&page_cache_size);
+}
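
The two helpers above use the pprev trick that later became the kernel's hlist: each node stores the address of the pointer that points at it, so removal needs no list walk and no special case for the head. A minimal, self-contained userspace model of the same chain (hypothetical names, not the kernel code):

#include <assert.h>
#include <stddef.h>

struct node {
	struct node *next;	/* next element in the chain */
	struct node **pprev;	/* address of the pointer pointing at us */
};

static void chain_add(struct node *n, struct node **head)
{
	if ((n->next = *head) != NULL)
		(*head)->pprev = &n->next;
	*head = n;
	n->pprev = head;
}

static void chain_del(struct node *n)
{
	if (n->pprev) {
		if (n->next)
			n->next->pprev = n->pprev;
		*n->pprev = n->next;
		n->pprev = NULL;
	}
}

int main(void)
{
	struct node *head = NULL, a = { NULL, NULL }, b = { NULL, NULL };

	chain_add(&a, &head);
	chain_add(&b, &head);	/* head -> b -> a */
	chain_del(&b);		/* removal without walking the chain */
	assert(head == &a && a.next == NULL);
	return 0;
}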
+
+static void remove_page_from_inode_queue(struct page * page)
+{
+ struct inode * inode = page->inode;
+ struct page *prev, *next;
+
+ inode->i_nrpages--;
+ next = page->next;
+ prev = page->prev;
+ if (inode->i_pages == page)
+ inode->i_pages = next;
+ if (next)
+ next->prev = prev;
+ if (prev)
+ prev->next = next;
+ page->next = NULL;
+ page->prev = NULL;
+}
/*
- * Invalidate the pages of an inode, removing all pages that aren't
- * locked down (those are sure to be up-to-date anyway, so we shouldn't
- * invalidate them).
+ * Remove a page from the page cache and free it. Caller has to make
+ * sure the page is locked and that nobody else uses it - or that usage
+ * is safe.
*/
+void remove_inode_page(struct page *page)
+{
+ if (!PageLocked(page))
+ PAGE_BUG(page);
+
+ spin_lock(&pagecache_lock);
+ remove_page_from_inode_queue(page);
+ remove_page_from_hash_queue(page);
+ page->inode = NULL;
+ spin_unlock(&pagecache_lock);
+}
+
void invalidate_inode_pages(struct inode * inode)
{
struct page ** p;
struct page * page;
+repeat:
+ spin_lock(&pagecache_lock);
p = &inode->i_pages;
while ((page = *p) != NULL) {
- if (PageLocked(page)) {
- p = &page->next;
- continue;
+ get_page(page);
+ if (TryLockPage(page)) {
+ spin_unlock(&pagecache_lock);
+ wait_on_page(page);
+ page_cache_release(page);
+ goto repeat;
}
- inode->i_nrpages--;
- if ((*p = page->next) != NULL)
- (*p)->prev = page->prev;
- page->next = NULL;
- page->prev = NULL;
+ if (page_count(page) != 2)
+ printk("hm, busy page invalidated? (not necesserily a bug)\n");
+
+ remove_page_from_inode_queue(page);
remove_page_from_hash_queue(page);
page->inode = NULL;
+ UnlockPage(page);
+ page_cache_release(page);
page_cache_release(page);
- continue;
+
}
+ spin_unlock(&pagecache_lock);
}
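
The repeat/TryLockPage dance above is the central idiom of this patch: never sleep on a page while holding the pagecache spinlock; instead trylock, and on failure drop the spinlock, wait, and rescan from the top (the reference taken with get_page() keeps the page alive meanwhile). A hedged userspace analogue of that control flow, with C11 atomics standing in for the page lock and sched_yield() for the wait queue (all names hypothetical):

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_bool locked;
	struct obj *next;
};

/* pthread_spin_init(&list_lock, 0) is assumed done at startup */
static pthread_spinlock_t list_lock;	/* plays the pagecache_lock role */
static struct obj *list_head;

/* nonzero if the object was already locked, like TryLockPage() */
static int obj_trylock(struct obj *o)
{
	return atomic_exchange(&o->locked, true);
}

static void obj_wait_unlocked(struct obj *o)
{
	while (atomic_load(&o->locked))
		sched_yield();	/* the kernel sleeps on page->wait instead */
}

static struct obj *grab_first_locked(void)
{
	struct obj *o;
repeat:
	pthread_spin_lock(&list_lock);
	o = list_head;
	if (o && obj_trylock(o)) {
		/* never sleep under the spinlock: drop it, wait for the
		 * object, then rescan - the list may have changed */
		pthread_spin_unlock(&list_lock);
		obj_wait_unlocked(o);
		goto repeat;
	}
	pthread_spin_unlock(&list_lock);
	return o;	/* NULL, or locked and owned by the caller */
}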
-
/*
* Truncate the page cache at a set offset, removing the pages
* that are beyond that offset (and zeroing out partial pages).
@@ -90,55 +152,90 @@ void truncate_inode_pages(struct inode * inode, unsigned long start)
{
struct page ** p;
struct page * page;
+ int partial = 0;
repeat:
+ spin_lock(&pagecache_lock);
p = &inode->i_pages;
while ((page = *p) != NULL) {
unsigned long offset = page->offset;
/* page wholly truncated - free it */
if (offset >= start) {
- if (PageLocked(page)) {
- wait_on_page(page);
- goto repeat;
- }
- inode->i_nrpages--;
- if ((*p = page->next) != NULL)
- (*p)->prev = page->prev;
- page->next = NULL;
- page->prev = NULL;
- remove_page_from_hash_queue(page);
- page->inode = NULL;
+ get_page(page);
+ spin_unlock(&pagecache_lock);
+
+ lock_page(page);
+
+ if (inode->i_op->flushpage)
+ inode->i_op->flushpage(inode, page, 0);
+
+ /*
+ * We remove the page from the page cache
+ * _after_ we have destroyed all buffer-cache
+ * references to it. Otherwise some other process
+ * might think this inode page is not in the
+ * page cache and creates a buffer-cache alias
+ * to it causing all sorts of fun problems ...
+ */
+ remove_inode_page(page);
+
+ UnlockPage(page);
page_cache_release(page);
- continue;
+ page_cache_release(page);
+
+ /*
+ * We have done things without the pagecache lock,
+ * so we'll have to repeat the scan.
+ * It's not possible to deadlock here because
+ * we are guaranteed to make progress. (ie. we have
+ * just removed a page)
+ */
+ goto repeat;
}
p = &page->next;
+ /*
+ * there is only one partial page possible.
+ */
+ if (partial)
+ continue;
+
offset = start - offset;
/* partial truncate, clear end of page */
if (offset < PAGE_CACHE_SIZE) {
- unsigned long address = page_address(page);
+ unsigned long address;
+ get_page(page);
+ spin_unlock(&pagecache_lock);
+
+ lock_page(page);
+ partial = 1;
+
+ address = page_address(page);
memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
flush_page_to_ram(address);
+
+ if (inode->i_op->flushpage)
+ inode->i_op->flushpage(inode, page, offset);
+ /*
+ * we have dropped the spinlock so we have to
+ * restart.
+ */
+ UnlockPage(page);
+ page_cache_release(page);
+ goto repeat;
}
}
+ spin_unlock(&pagecache_lock);
}
-/*
- * Remove a page from the page cache and free it.
- */
-void remove_inode_page(struct page *page)
-{
- remove_page_from_hash_queue(page);
- remove_page_from_inode_queue(page);
- page_cache_release(page);
-}
+extern atomic_t too_many_dirty_buffers;
int shrink_mmap(int priority, int gfp_mask)
{
static unsigned long clock = 0;
unsigned long limit = num_physpages;
struct page * page;
- int count;
+ int count, users;
count = limit >> priority;
@@ -164,15 +261,67 @@ int shrink_mmap(int priority, int gfp_mask)
referenced = test_and_clear_bit(PG_referenced, &page->flags);
- if (PageLocked(page))
+ if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
continue;
- if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
+ /*
+ * Some common cases that we just short-circuit without
+ * getting the locks - we need to re-check this once we
+ * have the lock, but that's fine.
+ */
+ users = page_count(page);
+ if (!users)
+ continue;
+ if (!page->buffers) {
+ if (!page->inode)
+ continue;
+ if (users > 1)
+ continue;
+ }
+
+ /*
+ * ok, now the page looks interesting. Re-check things
+ * and keep the lock.
+ */
+ spin_lock(&pagecache_lock);
+ if (!page->inode && !page->buffers) {
+ spin_unlock(&pagecache_lock);
continue;
+ }
+ if (!page_count(page)) {
+// BUG();
+ spin_unlock(&pagecache_lock);
+ continue;
+ }
+ get_page(page);
+ if (TryLockPage(page)) {
+ spin_unlock(&pagecache_lock);
+ goto put_continue;
+ }
+
+ /*
+ * we keep pagecache_lock locked and unlock it in
+ * each branch, so that the page->inode case doesn't
+ * have to re-grab it. Here comes the 'real' logic
+ * to free memory:
+ */
+
+ /* Is it a buffer page? */
+ if (page->buffers) {
+ kdev_t dev = page->buffers->b_dev;
+ spin_unlock(&pagecache_lock);
+ if (try_to_free_buffers(page))
+ goto made_progress;
+ if (!atomic_read(&too_many_dirty_buffers)) {
+ atomic_set(&too_many_dirty_buffers, 1);
+ balance_dirty(dev);
+ }
+ goto unlock_continue;
+ }
/* We can't free pages unless there's just one user */
- if (atomic_read(&page->count) != 1)
- continue;
+ if (page_count(page) != 2)
+ goto spin_unlock_continue;
count--;
@@ -182,77 +331,180 @@ int shrink_mmap(int priority, int gfp_mask)
* were to be marked referenced..
*/
if (PageSwapCache(page)) {
- if (referenced && swap_count(page->offset) != 1)
- continue;
- delete_from_swap_cache(page);
- return 1;
+ spin_unlock(&pagecache_lock);
+ if (referenced && swap_count(page->offset) != 2)
+ goto unlock_continue;
+ __delete_from_swap_cache(page);
+ page_cache_release(page);
+ goto made_progress;
}
- if (referenced)
- continue;
-
- /* Is it a buffer page? */
- if (page->buffers) {
- if (buffer_under_min())
- continue;
- if (!try_to_free_buffers(page))
- continue;
- return 1;
- }
-
/* is it a page-cache page? */
- if (page->inode) {
- if (pgcache_under_min())
- continue;
- remove_inode_page(page);
- return 1;
- }
+ if (!referenced && page->inode && !pgcache_under_min()) {
+ remove_page_from_inode_queue(page);
+ remove_page_from_hash_queue(page);
+ page->inode = NULL;
+ spin_unlock(&pagecache_lock);
+ page_cache_release(page);
+ goto made_progress;
+ }
+spin_unlock_continue:
+ spin_unlock(&pagecache_lock);
+unlock_continue:
+ UnlockPage(page);
+put_continue:
+ put_page(page);
} while (count > 0);
return 0;
+made_progress:
+ UnlockPage(page);
+ put_page(page);
+ return 1;
+}
+
+static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page)
+{
+ goto inside;
+
+ for (;;) {
+ page = page->next_hash;
+inside:
+ if (!page)
+ goto not_found;
+ if (page->inode != inode)
+ continue;
+ if (page->offset == offset)
+ break;
+ }
+not_found:
+ return page;
}
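
The jump into the middle of the loop only saves one pointer chase on the first iteration; for readability, the same scan in conventional form (a sketch, hypothetical name):

static inline struct page * __find_page_simple(struct inode * inode,
	unsigned long offset, struct page *page)
{
	for (; page; page = page->next_hash)
		if (page->inode == inode && page->offset == offset)
			break;
	return page;	/* NULL if not found */
}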
/*
- * Update a page cache copy, when we're doing a "write()" system call
- * See also "update_vm_cache()".
+ * By the time this is called, the page is locked and
+ * we don't have to worry about any races any more.
+ *
+ * Start the IO..
*/
-void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
+static int writeout_one_page(struct page *page)
{
- unsigned long offset, len;
+ struct buffer_head *bh, *head = page->buffers;
- offset = (pos & ~PAGE_CACHE_MASK);
- pos = pos & PAGE_CACHE_MASK;
- len = PAGE_CACHE_SIZE - offset;
+ bh = head;
do {
- struct page * page;
+ if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
+ continue;
- if (len > count)
- len = count;
- page = find_page(inode, pos);
- if (page) {
- wait_on_page(page);
- memcpy((void *) (offset + page_address(page)), buf, len);
- page_cache_release(page);
- }
- count -= len;
- buf += len;
- len = PAGE_CACHE_SIZE;
- offset = 0;
- pos += PAGE_CACHE_SIZE;
- } while (count);
+ bh->b_flushtime = 0;
+ ll_rw_block(WRITE, 1, &bh);
+ } while ((bh = bh->b_this_page) != head);
+ return 0;
+}
+
+static int waitfor_one_page(struct page *page)
+{
+ int error = 0;
+ struct buffer_head *bh, *head = page->buffers;
+
+ bh = head;
+ do {
+ wait_on_buffer(bh);
+ if (buffer_req(bh) && !buffer_uptodate(bh))
+ error = -EIO;
+ } while ((bh = bh->b_this_page) != head);
+ return error;
+}
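
Both helpers walk the page's buffer ring: b_this_page links every buffer_head on the page into a circular list, so a do/while that stops when it comes back to head visits each buffer exactly once. A self-contained model of that traversal (toy types, not the kernel's):

#include <assert.h>

struct ring {			/* stands in for struct buffer_head */
	int visited;
	struct ring *next;	/* stands in for b_this_page */
};

static int visit_all(struct ring *head)
{
	struct ring *cur = head;
	int n = 0;
	do {
		cur->visited = 1;
		n++;
	} while ((cur = cur->next) != head);
	return n;
}

int main(void)
{
	struct ring a, b, c;

	a.next = &b; b.next = &c; c.next = &a;	/* close the ring */
	a.visited = b.visited = c.visited = 0;
	assert(visit_all(&a) == 3 && c.visited);
	return 0;
}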
+
+static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
+{
+ struct page *next;
+ int retval = 0;
+
+ start &= PAGE_MASK;
+
+ spin_lock(&pagecache_lock);
+ next = inode->i_pages;
+ while (next) {
+ struct page *page = next;
+ next = page->next;
+ if (!page->buffers)
+ continue;
+ if (page->offset >= end)
+ continue;
+ if (page->offset < start)
+ continue;
+
+ get_page(page);
+ spin_unlock(&pagecache_lock);
+ lock_page(page);
+
+ /* The buffers could have been freed while we waited for the page lock */
+ if (page->buffers)
+ retval |= fn(page);
+
+ UnlockPage(page);
+ spin_lock(&pagecache_lock);
+ next = page->next;
+ page_cache_release(page);
+ }
+ spin_unlock(&pagecache_lock);
+
+ return retval;
+}
+
+/*
+ * Two-stage data sync: first start the IO, then go back and
+ * collect the information..
+ */
+int generic_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end)
+{
+ int retval;
+
+ retval = do_buffer_fdatasync(inode, start, end, writeout_one_page);
+ retval |= do_buffer_fdatasync(inode, start, end, waitfor_one_page);
+ return retval;
}
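
As a usage sketch, a filesystem could route its fsync() through this helper to write out and then wait on a whole file; the caller name and the 0..i_size range below are illustrative assumptions, not part of this patch:

/* hypothetical caller - start IO, then collect errors, for the whole file */
static int myfs_fsync(struct file *file, struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;

	return generic_buffer_fdatasync(inode, 0, inode->i_size);
}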
-static inline void add_to_page_cache(struct page * page,
+/*
+ * This adds a page to the page cache, starting out as locked,
+ * owned by us, referenced, but not uptodate and with no errors.
+ */
+static inline void __add_to_page_cache(struct page * page,
struct inode * inode, unsigned long offset,
struct page **hash)
{
- atomic_inc(&page->count);
- page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced);
+ unsigned long flags;
+
+ flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error));
+ page->flags = flags | ((1 << PG_locked) | (1 << PG_referenced));
+ page->owner = (int)current; /* REMOVEME */
+ get_page(page);
page->offset = offset;
add_page_to_inode_queue(inode, page);
__add_page_to_hash_queue(page, hash);
}
+int add_to_page_cache_unique(struct page * page,
+ struct inode * inode, unsigned long offset,
+ struct page **hash)
+{
+ int err;
+ struct page *alias;
+
+ spin_lock(&pagecache_lock);
+ alias = __find_page_nolock(inode, offset, *hash);
+
+ err = 1;
+ if (!alias) {
+ __add_to_page_cache(page,inode,offset,hash);
+ err = 0;
+ }
+
+ spin_unlock(&pagecache_lock);
+ return err;
+}
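
Callers pair this with an optimistic allocation done outside the spinlock and simply retry the lookup if another CPU won the race; condensed from the generic_file_write() path later in this patch (a fragment, not a complete function):

repeat_find:
	page = __find_lock_page(inode, offset, hash);
	if (!page) {
		if (!page_cache) {
			page_cache = page_cache_alloc();
			if (!page_cache)
				return -ENOMEM;
			goto repeat_find;	/* re-check for a racer */
		}
		page = page_cache_entry(page_cache);
		if (add_to_page_cache_unique(page, inode, offset, hash))
			goto repeat_find;	/* lost the race - retry */
		page_cache = 0;		/* our page is in the cache, locked */
	}
	/* here: page is locked and referenced, ready for IO */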
+
/*
* Try to read ahead in the file. "page_cache" is a potentially free page
* that we could use for the cache (if it is 0 we can try to create one,
@@ -275,45 +527,173 @@ static unsigned long try_to_read_ahead(struct file * file,
if (offset >= inode->i_size)
break;
hash = page_hash(inode, offset);
- page = __find_page(inode, offset, *hash);
- if (!page) {
+ page = page_cache_entry(page_cache);
+ if (!add_to_page_cache_unique(page, inode, offset, hash)) {
/*
- * Ok, add the new page to the hash-queues...
+ * We do not have to check the return value here
+ * because it's a readahead.
*/
- page = page_cache_entry(page_cache);
- add_to_page_cache(page, inode, offset, hash);
inode->i_op->readpage(file, page);
page_cache = 0;
+ page_cache_release(page);
}
- page_cache_release(page);
}
return page_cache;
}
/*
- * Wait for IO to complete on a locked page.
+ * Wait for a page to get unlocked.
*
* This must be called with the caller "holding" the page,
* ie with increased "page->count" so that the page won't
* go away during the wait..
*/
-void __wait_on_page(struct page *page)
+void ___wait_on_page(struct page *page)
{
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);
add_wait_queue(&page->wait, &wait);
-repeat:
- tsk->state = TASK_UNINTERRUPTIBLE;
- run_task_queue(&tq_disk);
- if (PageLocked(page)) {
+ do {
+ tsk->state = TASK_UNINTERRUPTIBLE;
+ run_task_queue(&tq_disk);
+ if (!PageLocked(page))
+ break;
schedule();
- goto repeat;
- }
+ } while (PageLocked(page));
tsk->state = TASK_RUNNING;
remove_wait_queue(&page->wait, &wait);
}
+/*
+ * Get an exclusive lock on the page..
+ */
+void lock_page(struct page *page)
+{
+ if (TryLockPage(page)) {
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, current);
+
+ run_task_queue(&tq_disk);
+ add_wait_queue(&page->wait, &wait);
+ tsk->state = TASK_UNINTERRUPTIBLE;
+
+ while (TryLockPage(page)) {
+ run_task_queue(&tq_disk);
+ schedule();
+ tsk->state = TASK_UNINTERRUPTIBLE;
+ }
+
+ remove_wait_queue(&page->wait, &wait);
+ tsk->state = TASK_RUNNING;
+ }
+}
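
lock_page() is the sleeping counterpart of TryLockPage(): keep trying, and between attempts sleep on the page's wait queue, re-setting TASK_UNINTERRUPTIBLE before each retry so a wakeup between the test and the schedule() is not lost. A userspace analogue built on a condition variable instead of the wait queue (a sketch under that substitution, not the kernel implementation):

#include <pthread.h>
#include <stdbool.h>

struct upage {
	bool locked;
	pthread_mutex_t m;
	pthread_cond_t unlocked;
};

#define UPAGE_INIT { false, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER }

static void upage_lock(struct upage *p)
{
	pthread_mutex_lock(&p->m);
	while (p->locked)	/* re-test after every wakeup, as above */
		pthread_cond_wait(&p->unlocked, &p->m);
	p->locked = true;
	pthread_mutex_unlock(&p->m);
}

static void upage_unlock(struct upage *p)
{
	pthread_mutex_lock(&p->m);
	p->locked = false;
	pthread_mutex_unlock(&p->m);
	pthread_cond_broadcast(&p->unlocked);	/* like wake_up(&page->wait) */
}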
+
+
+/*
+ * a rather lightweight function, finding and getting a reference to a
+ * hashed page atomically, waiting for it if it's locked.
+ */
+struct page * __find_get_page (struct inode * inode,
+ unsigned long offset, struct page **hash)
+{
+ struct page *page;
+
+ /*
+ * We scan the hash list read-only. Addition to and removal from
+ * the hash-list needs a held write-lock.
+ */
+repeat:
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(inode, offset, *hash);
+ if (page)
+ get_page(page);
+ spin_unlock(&pagecache_lock);
+
+ /* Found the page, sleep if locked. */
+ if (page && PageLocked(page)) {
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
+
+ add_wait_queue(&page->wait, &wait);
+ tsk->state = TASK_UNINTERRUPTIBLE;
+
+ run_task_queue(&tq_disk);
+ if (PageLocked(page))
+ schedule();
+ tsk->state = TASK_RUNNING;
+ remove_wait_queue(&page->wait, &wait);
+
+ /*
+ * The page might have been unhashed meanwhile. It's
+ * not freed though because we hold a reference to it.
+ * If this is the case then it will be freed _here_,
+ * and we recheck the hash anyway.
+ */
+ page_cache_release(page);
+ goto repeat;
+ }
+ /*
+ * It's not locked so we can return the page and we hold
+ * a reference to it.
+ */
+ return page;
+}
+
+/*
+ * Get the lock to a page atomically.
+ */
+struct page * __find_lock_page (struct inode * inode,
+ unsigned long offset, struct page **hash)
+{
+ int locked;
+ struct page *page;
+
+ /*
+ * We scan the hash list read-only. Addition to and removal from
+ * the hash-list needs a held write-lock.
+ */
+repeat:
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(inode, offset, *hash);
+ locked = 0;
+ if (page) {
+ get_page(page);
+ if (TryLockPage(page))
+ locked = 1;
+ }
+ spin_unlock(&pagecache_lock);
+
+ /* Found the page, sleep if locked. */
+ if (page && locked) {
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
+
+ add_wait_queue(&page->wait, &wait);
+ tsk->state = TASK_UNINTERRUPTIBLE;
+
+ run_task_queue(&tq_disk);
+ if (PageLocked(page))
+ schedule();
+ tsk->state = TASK_RUNNING;
+ remove_wait_queue(&page->wait, &wait);
+
+ /*
+ * The page might have been unhashed meanwhile. It's
+ * not freed though because we hold a reference to it.
+ * If this is the case then it will be freed _here_,
+ * and we recheck the hash anyway.
+ */
+ page_cache_release(page);
+ goto repeat;
+ }
+ /*
+ * It's not locked so we can return the page and we hold
+ * a reference to it.
+ */
+ return page;
+}
+
#if 0
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD
@@ -386,14 +766,14 @@ static void profile_readahead(int async, struct file *filp)
* -------------------
* The read ahead context fields of the "struct file" are the following:
* - f_raend : position of the first byte after the last page we tried to
- * read ahead.
+ * read ahead.
* - f_ramax : current read-ahead maximum size.
* - f_ralen : length of the current IO read block we tried to read-ahead.
* - f_rawin : length of the current read-ahead window.
- * if last read-ahead was synchronous then
- * f_rawin = f_ralen
- * otherwise (was asynchronous)
- * f_rawin = previous value of f_ralen + f_ralen
+ * if last read-ahead was synchronous then
+ * f_rawin = f_ralen
+ * otherwise (was asynchronous)
+ * f_rawin = previous value of f_ralen + f_ralen
*
* Read-ahead limits:
* ------------------
@@ -485,7 +865,7 @@ static inline unsigned long generic_file_readahead(int reada_ok,
* We will later force unplug device in order to force asynchronous read IO.
*/
else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
- ppos <= raend && ppos + filp->f_ralen >= raend) {
+ ppos <= raend && ppos + filp->f_ralen >= raend) {
/*
* Add ONE page to max_ahead in order to try to have about the same IO max size
* as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
@@ -578,6 +958,7 @@ static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descript
struct inode *inode = dentry->d_inode;
size_t pos, pgpos, page_cache;
int reada_ok;
+ int error;
int max_readahead = get_max_readahead(inode);
page_cache = 0;
@@ -633,33 +1014,22 @@ static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descript
* Try to find the data in the page cache..
*/
hash = page_hash(inode, pos & PAGE_CACHE_MASK);
- page = __find_page(inode, pos & PAGE_CACHE_MASK, *hash);
+
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
if (!page)
goto no_cached_page;
-
found_page:
-/*
- * Try to read ahead only if the current page is filled or being filled.
- * Otherwise, if we were reading ahead, decrease max read ahead size to
- * the minimum value.
- * In this context, that seems to may happen only on some read error or if
- * the page has been rewritten.
- */
- if (PageUptodate(page) || PageLocked(page))
- page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
- else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
- filp->f_ramax = MIN_READAHEAD;
-
- wait_on_page(page);
-
- if (!PageUptodate(page))
- goto page_read_error;
+ get_page(page);
+ spin_unlock(&pagecache_lock);
-success:
- /*
- * Ok, we have the page, it's up-to-date and ok,
- * so now we can finally copy it to user space...
- */
+ if (!Page_Uptodate(page))
+ goto page_not_up_to_date;
+page_ok:
+ /*
+ * Ok, we have the page, and it's up-to-date, so
+ * now we can copy it to user space...
+ */
{
unsigned long offset, nr;
@@ -683,75 +1053,77 @@ success:
break;
}
+/*
+ * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
+ */
+page_not_up_to_date:
+ page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
+
+ if (Page_Uptodate(page))
+ goto page_ok;
+
+ /* Get exclusive access to the page ... */
+ lock_page(page);
+ if (Page_Uptodate(page)) {
+ UnlockPage(page);
+ goto page_ok;
+ }
+
+readpage:
+ /* ... and start the actual read. The read will unlock the page. */
+ error = inode->i_op->readpage(filp, page);
+
+ if (!error) {
+ if (Page_Uptodate(page))
+ goto page_ok;
+
+ /* Again, try some read-ahead while waiting for the page to finish.. */
+ page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
+ wait_on_page(page);
+ if (Page_Uptodate(page))
+ goto page_ok;
+ error = -EIO;
+ }
+
+ /* UHHUH! A synchronous read error occurred. Report it */
+ desc->error = error;
+ page_cache_release(page);
+ break;
+
no_cached_page:
/*
* Ok, it wasn't cached, so we need to create a new
* page..
+ *
+ * We get here with the page cache lock held.
*/
if (!page_cache) {
+ spin_unlock(&pagecache_lock);
page_cache = page_cache_alloc();
+ if (!page_cache) {
+ desc->error = -ENOMEM;
+ break;
+ }
+
/*
- * That could have slept, so go around to the
- * very beginning..
+ * Somebody may have added the page while we
+ * dropped the page cache lock. Check for that.
*/
- if (page_cache)
- continue;
- desc->error = -ENOMEM;
- break;
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
+ if (page)
+ goto found_page;
}
/*
* Ok, add the new page to the hash-queues...
*/
page = page_cache_entry(page_cache);
- page_cache = 0;
- add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);
-
- /*
- * Error handling is tricky. If we get a read error,
- * the cached page stays in the cache (but uptodate=0),
- * and the next process that accesses it will try to
- * re-read it. This is needed for NFS etc, where the
- * identity of the reader can decide if we can read the
- * page or not..
- */
-/*
- * We have to read the page.
- * If we were reading ahead, we had previously tried to read this page,
- * That means that the page has probably been removed from the cache before
- * the application process needs it, or has been rewritten.
- * Decrease max readahead size to the minimum value in that situation.
- */
- if (reada_ok && filp->f_ramax > MIN_READAHEAD)
- filp->f_ramax = MIN_READAHEAD;
-
- {
- int error = inode->i_op->readpage(filp, page);
- if (!error)
- goto found_page;
- desc->error = error;
- page_cache_release(page);
- break;
- }
+ __add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);
+ spin_unlock(&pagecache_lock);
-page_read_error:
- /*
- * We found the page, but it wasn't up-to-date.
- * Try to re-read it _once_. We do this synchronously,
- * because this happens only if there were errors.
- */
- {
- int error = inode->i_op->readpage(filp, page);
- if (!error) {
- wait_on_page(page);
- if (PageUptodate(page) && !PageError(page))
- goto success;
- error = -EIO; /* Some unspecified error occurred.. */
- }
- desc->error = error;
- page_cache_release(page);
- break;
- }
+ page_cache = 0;
+ goto readpage;
}
*ppos = pos;
@@ -787,6 +1159,7 @@ ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *
{
ssize_t retval;
+ unlock_kernel();
retval = -EFAULT;
if (access_ok(VERIFY_WRITE, buf, count)) {
retval = 0;
@@ -804,6 +1177,7 @@ ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *
retval = desc.error;
}
}
+ lock_kernel();
return retval;
}
@@ -812,17 +1186,14 @@ static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned
ssize_t written;
unsigned long count = desc->count;
struct file *file = (struct file *) desc->buf;
- struct inode *inode = file->f_dentry->d_inode;
mm_segment_t old_fs;
if (size > count)
size = count;
- down(&inode->i_sem);
old_fs = get_fs();
set_fs(KERNEL_DS);
written = file->f_op->write(file, area, size, &file->f_pos);
set_fs(old_fs);
- up(&inode->i_sem);
if (written < 0) {
desc->error = written;
written = 0;
@@ -878,6 +1249,7 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t cou
if (retval)
goto fput_out;
+ unlock_kernel();
retval = 0;
if (count) {
read_descriptor_t desc;
@@ -887,7 +1259,7 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t cou
ppos = &in_file->f_pos;
if (offset) {
if (get_user(pos, offset))
- goto fput_out;
+ goto fput_out_lock;
ppos = &pos;
}
@@ -904,7 +1276,8 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t cou
put_user(pos, offset);
}
-
+fput_out_lock:
+ lock_kernel();
fput_out:
fput(out_file);
fput_in:
@@ -934,17 +1307,21 @@ static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long
unsigned long offset, reada, i;
struct page * page, **hash;
unsigned long old_page, new_page;
+ int error;
new_page = 0;
offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
- goto no_page;
+ goto no_page_nolock;
+
+ unlock_kernel();
/*
* Do we have something in the page cache already?
*/
hash = page_hash(inode, offset);
- page = __find_page(inode, offset, *hash);
+retry_find:
+ page = __find_get_page(inode, offset, hash);
if (!page)
goto no_cached_page;
@@ -960,15 +1337,17 @@ found_page:
goto failure;
}
- if (PageLocked(page))
- goto page_locked_wait;
- if (!PageUptodate(page))
- goto page_read_error;
+ if (!Page_Uptodate(page)) {
+ lock_page(page);
+ if (!Page_Uptodate(page))
+ goto page_not_uptodate;
+ UnlockPage(page);
+ }
success:
/*
- * Found the page, need to check sharing and possibly
- * copy it over to another page..
+ * Found the page and have a reference on it, need to check sharing
+ * and possibly copy it over to another page..
*/
old_page = page_address(page);
if (!no_share) {
@@ -980,6 +1359,7 @@ success:
page_cache_free(new_page);
flush_page_to_ram(old_page);
+ lock_kernel();
return old_page;
}
@@ -989,6 +1369,7 @@ success:
copy_page(new_page, old_page);
flush_page_to_ram(new_page);
page_cache_release(page);
+ lock_kernel();
return new_page;
no_cached_page:
@@ -1013,7 +1394,7 @@ no_cached_page:
* cache.. The page we just got may be useful if we
* can't share, so don't get rid of it here.
*/
- page = find_page(inode, offset);
+ page = __find_get_page(inode, offset, hash);
if (page)
goto found_page;
@@ -1021,19 +1402,24 @@ no_cached_page:
* Now, create a new page-cache page from the page we got
*/
page = page_cache_entry(new_page);
- new_page = 0;
- add_to_page_cache(page, inode, offset, hash);
+ if (add_to_page_cache_unique(page, inode, offset, hash))
+ goto retry_find;
- if (inode->i_op->readpage(file, page) != 0)
- goto failure;
+ /*
+ * Now it's ours and locked, we can do initial IO to it:
+ */
+ new_page = 0;
- goto found_page;
+page_not_uptodate:
+ error = inode->i_op->readpage(file, page);
-page_locked_wait:
- __wait_on_page(page);
- if (PageUptodate(page))
+ if (!error) {
+ wait_on_page(page);
+ if (PageError(page))
+ goto page_read_error;
goto success;
-
+ }
+
page_read_error:
/*
* Umm, take care of errors if the page isn't up-to-date.
@@ -1041,12 +1427,14 @@ page_read_error:
* because there really aren't any performance issues here
* and we need to check for errors.
*/
- if (inode->i_op->readpage(file, page) != 0)
+ if (!PageLocked(page))
+ PAGE_BUG(page);
+ ClearPageError(page);
+ error = inode->i_op->readpage(file, page);
+ if (error)
goto failure;
wait_on_page(page);
- if (PageError(page))
- goto failure;
- if (PageUptodate(page))
+ if (Page_Uptodate(page))
goto success;
/*
@@ -1058,6 +1446,8 @@ failure:
if (new_page)
page_cache_free(new_page);
no_page:
+ lock_kernel();
+no_page_nolock:
return 0;
}
@@ -1066,12 +1456,13 @@ no_page:
* if the disk is full.
*/
static inline int do_write_page(struct inode * inode, struct file * file,
- const char * page, unsigned long offset)
+ const char * page_addr, unsigned long offset)
{
int retval;
unsigned long size;
loff_t loff = offset;
- mm_segment_t old_fs;
+ int (*writepage) (struct file *, struct page *);
+ struct page * page;
size = offset + PAGE_SIZE;
/* refuse to extend file size.. */
@@ -1083,12 +1474,21 @@ static inline int do_write_page(struct inode * inode, struct file * file,
return -EIO;
}
size -= offset;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
retval = -EIO;
- if (size == file->f_op->write(file, (const char *) page, size, &loff))
- retval = 0;
- set_fs(old_fs);
+ writepage = inode->i_op->writepage;
+ page = mem_map + MAP_NR(page_addr);
+ lock_page(page);
+
+ if (writepage) {
+ retval = writepage(file, page);
+ } else {
+ mm_segment_t old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ if (size == file->f_op->write(file, page_addr, size, &loff))
+ retval = 0;
+ set_fs(old_fs);
+ }
+ UnlockPage(page);
return retval;
}
@@ -1124,9 +1524,7 @@ static int filemap_write_page(struct vm_area_struct * vma,
return 0;
}
- down(&inode->i_sem);
result = do_write_page(inode, file, (const char *) page, offset);
- up(&inode->i_sem);
fput(file);
return result;
}
@@ -1146,7 +1544,8 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
pte_t pte = *ptep;
- unsigned long page;
+ unsigned long pageaddr;
+ struct page *page;
int error;
if (!(flags & MS_INVALIDATE)) {
@@ -1158,8 +1557,9 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
flush_cache_page(vma, address);
set_pte(ptep, pte_mkclean(pte));
flush_tlb_page(vma, address);
- page = pte_page(pte);
- atomic_inc(&page_cache_entry(page)->count);
+ pageaddr = pte_page(pte);
+ page = page_cache_entry(pageaddr);
+ get_page(page);
} else {
if (pte_none(pte))
return 0;
@@ -1170,14 +1570,14 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
swap_free(pte_val(pte));
return 0;
}
- page = pte_page(pte);
+ pageaddr = pte_page(pte);
if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
- page_cache_free(page);
+ page_cache_free(pageaddr);
return 0;
}
}
- error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1);
- page_cache_free(page);
+ error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, pageaddr, 1);
+ page_cache_free(pageaddr);
return error;
}
@@ -1338,10 +1738,7 @@ static int msync_interval(struct vm_area_struct * vma,
struct file * file = vma->vm_file;
if (file) {
struct dentry * dentry = file->f_dentry;
- struct inode * inode = dentry->d_inode;
- down(&inode->i_sem);
error = file_fsync(file, dentry);
- up(&inode->i_sem);
}
}
return error;
@@ -1436,11 +1833,12 @@ generic_file_write(struct file *file, const char *buf,
unsigned long page_cache = 0;
unsigned long written;
long status;
+ int err;
- if (file->f_error) {
- int error = file->f_error;
+ err = file->f_error;
+ if (err) {
file->f_error = 0;
- return error;
+ goto out;
}
written = 0;
@@ -1451,7 +1849,7 @@ generic_file_write(struct file *file, const char *buf,
/*
* Check whether we've reached the file size limit.
*/
- status = -EFBIG;
+ err = -EFBIG;
if (pos >= limit) {
send_sig(SIGXFSZ, current, 0);
goto out;
@@ -1467,6 +1865,8 @@ generic_file_write(struct file *file, const char *buf,
count = limit - pos;
}
+ unlock_kernel();
+
while (count) {
unsigned long bytes, pgpos, offset;
/*
@@ -1480,29 +1880,36 @@ generic_file_write(struct file *file, const char *buf,
bytes = count;
hash = page_hash(inode, pgpos);
- page = __find_page(inode, pgpos, *hash);
+repeat_find:
+ page = __find_lock_page(inode, pgpos, hash);
if (!page) {
if (!page_cache) {
page_cache = page_cache_alloc();
if (page_cache)
- continue;
+ goto repeat_find;
status = -ENOMEM;
break;
}
page = page_cache_entry(page_cache);
- add_to_page_cache(page, inode, pgpos, hash);
+ if (add_to_page_cache_unique(page,inode,pgpos,hash))
+ goto repeat_find;
+
page_cache = 0;
}
- /* Get exclusive IO access to the page.. */
- wait_on_page(page);
- set_bit(PG_locked, &page->flags);
+ /* We have exclusive IO access to the page.. */
+ if (!PageLocked(page)) {
+ PAGE_BUG(page);
+ } else {
+ if (page->owner != (int)current) {
+ PAGE_BUG(page);
+ }
+ }
status = write_one_page(file, page, offset, bytes, buf);
/* Mark it unlocked again and drop the page.. */
- clear_bit(PG_locked, &page->flags);
- wake_up(&page->wait);
+ UnlockPage(page);
page_cache_release(page);
if (status < 0)
@@ -1519,51 +1926,16 @@ generic_file_write(struct file *file, const char *buf,
if (page_cache)
page_cache_free(page_cache);
+
+ err = written ? written : status;
+ lock_kernel();
out:
- return written ? written : status;
+ return err;
}
/*
- * Support routines for directory cacheing using the page cache.
- */
-
-/*
- * Finds the page at the specified offset, installing a new page
- * if requested. The count is incremented and the page is locked.
- *
- * Note: we don't have to worry about races here, as the caller
- * is holding the inode semaphore.
+ * Support routines for directory caching using the page cache.
*/
-unsigned long get_cached_page(struct inode * inode, unsigned long offset,
- int new)
-{
- struct page * page;
- struct page ** hash;
- unsigned long page_cache = 0;
-
- hash = page_hash(inode, offset);
- page = __find_page(inode, offset, *hash);
- if (!page) {
- if (!new)
- goto out;
- page_cache = page_cache_alloc();
- if (!page_cache)
- goto out;
- clear_page(page_cache);
- page = page_cache_entry(page_cache);
- add_to_page_cache(page, inode, offset, hash);
- }
- if (atomic_read(&page->count) != 2)
- printk(KERN_ERR "get_cached_page: page count=%d\n",
- atomic_read(&page->count));
- if (test_bit(PG_locked, &page->flags))
- printk(KERN_ERR "get_cached_page: page already locked!\n");
- set_bit(PG_locked, &page->flags);
- page_cache = page_address(page);
-
-out:
- return page_cache;
-}
/*
* Unlock and free a page.
@@ -1572,13 +1944,10 @@ void put_cached_page(unsigned long addr)
{
struct page * page = page_cache_entry(addr);
- if (!test_bit(PG_locked, &page->flags))
- printk("put_cached_page: page not locked!\n");
- if (atomic_read(&page->count) != 2)
- printk("put_cached_page: page count=%d\n",
- atomic_read(&page->count));
- clear_bit(PG_locked, &page->flags);
- wake_up(&page->wait);
+ UnlockPage(page);
+ if (page_count(page) != 2)
+ panic("put_cached_page: page count=%d\n",
+ page_count(page));
page_cache_release(page);
}
@@ -1607,11 +1976,13 @@ static inline struct pio_request * get_pio_request(void)
static inline void make_pio_request(struct file *file,
unsigned long offset,
- unsigned long page)
+ unsigned long pageaddr)
{
struct pio_request *p;
+ struct page *page;
- atomic_inc(&page_cache_entry(page)->count);
+ page = page_cache_entry(pageaddr);
+ get_page(page);
/*
* We need to allocate without causing any recursive IO in the
@@ -1634,7 +2005,7 @@ static inline void make_pio_request(struct file *file,
p->file = file;
p->offset = offset;
- p->page = page;
+ p->page = pageaddr;
put_pio_request(p);
wake_up(&pio_wait);
@@ -1694,10 +2065,8 @@ int kpiod(void * unused)
dentry = p->file->f_dentry;
inode = dentry->d_inode;
- down(&inode->i_sem);
do_write_page(inode, p->file,
(const char *) p->page, p->offset);
- up(&inode->i_sem);
fput(p->file);
page_cache_free(p->page);
kmem_cache_free(pio_request_cache, p);
diff --git a/mm/memory.c b/mm/memory.c
index ae56831b3..aac203bbb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -272,7 +272,7 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
if (vma->vm_flags & VM_SHARED)
pte = pte_mkclean(pte);
set_pte(dst_pte, pte_mkold(pte));
- atomic_inc(&mem_map[page_nr].count);
+ get_page(mem_map + page_nr);
cont_copy_pte_range: address += PAGE_SIZE;
if (address >= end)
@@ -556,7 +556,7 @@ unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsig
if (MAP_NR(page) >= max_mapnr)
printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
- if (atomic_read(&mem_map[MAP_NR(page)].count) != 1)
+ if (page_count(mem_map + MAP_NR(page)) != 1)
printk("mem_map disagrees with %08lx at %08lx\n",page,address);
pgd = pgd_offset(tsk->mm,address);
pmd = pmd_alloc(pgd, address);
@@ -604,17 +604,17 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
unsigned long address, pte_t *page_table, pte_t pte)
{
unsigned long old_page, new_page;
- struct page * page_map;
+ struct page * page;
new_page = __get_free_page(GFP_USER);
- /* Did swap_out() unmapped the protected page while we slept? */
+ /* Did swap_out() unmap the protected page while we slept? */
if (pte_val(*page_table) != pte_val(pte))
goto end_wp_page;
old_page = pte_page(pte);
if (MAP_NR(old_page) >= max_mapnr)
goto bad_wp_page;
tsk->min_flt++;
- page_map = mem_map + MAP_NR(old_page);
+ page = mem_map + MAP_NR(old_page);
/*
* We can avoid the copy if:
@@ -624,13 +624,13 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
* in which case we can remove the page
* from the swap cache.
*/
- switch (atomic_read(&page_map->count)) {
+ switch (page_count(page)) {
case 2:
- if (!PageSwapCache(page_map))
+ if (!PageSwapCache(page))
break;
- if (swap_count(page_map->offset) != 1)
+ if (swap_count(page->offset) != 1)
break;
- delete_from_swap_cache(page_map);
+ delete_from_swap_cache(page);
/* FallThrough */
case 1:
flush_cache_page(vma, address);
@@ -652,7 +652,7 @@ end_wp_page:
if (!new_page)
goto no_new_page;
- if (PageReserved(page_map))
+ if (PageReserved(page))
++vma->vm_mm->rss;
copy_cow_page(old_page,new_page);
flush_page_to_ram(old_page);
@@ -661,7 +661,7 @@ end_wp_page:
set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
flush_tlb_page(vma, address);
unlock_kernel();
- __free_page(page_map);
+ __free_page(page);
return 1;
bad_wp_page:
@@ -776,7 +776,7 @@ static int do_swap_page(struct task_struct * tsk,
if (pte_val(*page_table) != pte_val(entry)) {
free_page(pte_page(page));
} else {
- if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 &&
+ if (page_count(mem_map + MAP_NR(pte_page(page))) > 1 &&
!(vma->vm_flags & VM_SHARED))
page = pte_wrprotect(page);
++vma->vm_mm->rss;
@@ -861,7 +861,7 @@ static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
entry = mk_pte(page, vma->vm_page_prot);
if (write_access) {
entry = pte_mkwrite(pte_mkdirty(entry));
- } else if (atomic_read(&mem_map[MAP_NR(page)].count) > 1 &&
+ } else if (page_count(mem_map+MAP_NR(page)) > 1 &&
!(vma->vm_flags & VM_SHARED))
entry = pte_wrprotect(entry);
set_pte(page_table, entry);
diff --git a/mm/mmap.c b/mm/mmap.c
index 6e5eda00d..e179a2932 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -63,7 +63,7 @@ int vm_enough_memory(long pages)
return 1;
free = buffermem >> PAGE_SHIFT;
- free += page_cache_size;
+ free += atomic_read(&page_cache_size);
free += nr_free_pages;
free += nr_swap_pages;
free -= (page_cache.min_percent + buffer_mem.min_percent + 2)*num_physpages/100;
@@ -728,6 +728,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
struct vm_area_struct * vma;
unsigned long flags, retval;
+ len = PAGE_ALIGN(len);
+ if (!len)
+ return addr;
+
/*
* mlock MCL_FUTURE?
*/
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8826b9af1..fad87ba27 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -119,33 +119,33 @@ static inline void free_pages_ok(unsigned long map_nr, unsigned long order)
spin_unlock_irqrestore(&page_alloc_lock, flags);
}
-void __free_page(struct page *page)
+int __free_page(struct page *page)
{
- if (!PageReserved(page) && atomic_dec_and_test(&page->count)) {
+ if (!PageReserved(page) && put_page_testzero(page)) {
if (PageSwapCache(page))
- panic ("Freeing swap cache page");
+ PAGE_BUG(page);
page->flags &= ~(1 << PG_referenced);
free_pages_ok(page - mem_map, 0);
- return;
+ return 1;
}
+ return 0;
}
-void free_pages(unsigned long addr, unsigned long order)
+int free_pages(unsigned long addr, unsigned long order)
{
unsigned long map_nr = MAP_NR(addr);
if (map_nr < max_mapnr) {
mem_map_t * map = mem_map + map_nr;
- if (PageReserved(map))
- return;
- if (atomic_dec_and_test(&map->count)) {
+ if (!PageReserved(map) && put_page_testzero(map)) {
if (PageSwapCache(map))
- panic ("Freeing swap cache pages");
+ PAGE_BUG(map);
map->flags &= ~(1 << PG_referenced);
free_pages_ok(map_nr, order);
- return;
+ return 1;
}
}
+ return 0;
}
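
Both free routines now hinge on put_page_testzero(): atomically drop the reference count and report whether this caller took it to zero, so that exactly one path performs the actual free. A C11-atomics model of that primitive (the kernel uses its own atomic_dec_and_test):

#include <stdatomic.h>
#include <stdbool.h>

/* true only for the caller that moves the count from 1 to 0 */
static bool put_testzero(atomic_int *count)
{
	/* fetch_sub returns the old value; old == 1 means we hit zero */
	return atomic_fetch_sub(count, 1) == 1;
}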
/*
@@ -167,7 +167,7 @@ do { struct free_area_struct * area = free_area+order; \
MARK_USED(map_nr, new_order, area); \
nr_free_pages -= 1 << order; \
EXPAND(ret, map_nr, order, new_order, area); \
- spin_unlock_irqrestore(&page_alloc_lock, flags); \
+ spin_unlock_irqrestore(&page_alloc_lock,flags);\
return ADDRESS(map_nr); \
} \
prev = ret; \
@@ -186,7 +186,7 @@ do { unsigned long size = 1 << high; \
index += size; \
map += size; \
} \
- atomic_set(&map->count, 1); \
+ set_page_count(map, 1); \
} while (0)
int low_on_memory = 0;
@@ -321,7 +321,7 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m
memset(mem_map, 0, start_mem - (unsigned long) mem_map);
do {
--p;
- atomic_set(&p->count, 0);
+ set_page_count(p, 0);
p->flags = (1 << PG_DMA) | (1 << PG_reserved);
init_waitqueue_head(&p->wait);
} while (p > mem_map);
diff --git a/mm/page_io.c b/mm/page_io.c
index 9f5e82446..2226c2c9d 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -47,7 +47,7 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in
#ifdef DEBUG_SWAP
printk ("DebugVM: %s_swap_page entry %08lx, page %p (count %d), %s\n",
(rw == READ) ? "read" : "write",
- entry, (char *) page_address(page), atomic_read(&page->count),
+ entry, (char *) page_address(page), page_count(page),
wait ? "wait" : "nowait");
#endif
@@ -105,12 +105,12 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in
}
}
if (rw == READ) {
- clear_bit(PG_uptodate, &page->flags);
+ ClearPageUptodate(page);
kstat.pswpin++;
} else
kstat.pswpout++;
- atomic_inc(&page->count);
+ get_page(page);
if (p->swap_device) {
zones[0] = offset;
zones_used = 1;
@@ -167,7 +167,7 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in
printk("swap_after_unlock_page: lock already cleared\n");
wake_up(&lock_queue);
}
- atomic_dec(&page->count);
+ put_page(page);
return;
}
if (!wait) {
@@ -182,23 +182,24 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in
/* block_size == PAGE_SIZE/zones_used */
brw_page(rw, page, dev, zones, block_size, 0);
-
+
/* Note! For consistency we do all of the logic,
* decrementing the page count, and unlocking the page in the
* swap lock map - in the IO completion handler.
*/
- if (!wait)
+ if (!wait) {
return;
+ }
wait_on_page(page);
/* This shouldn't happen, but check to be sure. */
- if (atomic_read(&page->count) == 0)
+ if (page_count(page) == 0)
printk(KERN_ERR "rw_swap_page: page unused while waiting!\n");
#ifdef DEBUG_SWAP
printk ("DebugVM: %s_swap_page finished on page %p (count %d)\n",
(rw == READ) ? "read" : "write",
- (char *) page_adddress(page),
- atomic_read(&page->count));
+ (char *) page_address(page),
+ page_count(page));
#endif
}
@@ -238,7 +239,7 @@ void rw_swap_page(int rw, unsigned long entry, char *buf, int wait)
struct page *page = mem_map + MAP_NR(buf);
if (page->inode && page->inode != &swapper_inode)
- panic ("Tried to swap a non-swapper page");
+ PAGE_BUG(page);
/*
* Make sure that we have a swap cache association for this
@@ -268,23 +269,27 @@ void rw_swap_page_nocache(int rw, unsigned long entry, char *buffer)
struct page *page;
page = mem_map + MAP_NR((unsigned long) buffer);
- wait_on_page(page);
- set_bit(PG_locked, &page->flags);
- if (test_and_set_bit(PG_swap_cache, &page->flags)) {
- printk ("VM: read_swap_page: page already in swap cache!\n");
- return;
- }
- if (page->inode) {
- printk ("VM: read_swap_page: page already in page cache!\n");
- return;
- }
+
+ if (TryLockPage(page))
+ PAGE_BUG(page);
+ if (test_and_set_bit(PG_swap_cache, &page->flags))
+ PAGE_BUG(page);
+ if (page->inode)
+ PAGE_BUG(page);
+ get_page(page); /* Protect from shrink_mmap() */
page->inode = &swapper_inode;
page->offset = entry;
- atomic_inc(&page->count); /* Protect from shrink_mmap() */
rw_swap_page(rw, entry, buffer, 1);
- atomic_dec(&page->count);
- page->inode = 0;
- clear_bit(PG_swap_cache, &page->flags);
+
+ /*
+ * and now remove it from the pagecache ...
+ */
+ if (TryLockPage(page))
+ PAGE_BUG(page);
+ PageClearSwapCache(page);
+ remove_inode_page(page);
+ page_cache_release(page);
+ UnlockPage(page);
}
/*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8c5e7176c..21723c1db 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -25,7 +25,31 @@
* ensure that any mistaken dereferences of this structure cause a
* kernel oops.
*/
-struct inode swapper_inode;
+
+static struct inode_operations swapper_inode_operations = {
+ NULL, /* default file operations */
+ NULL, /* create */
+ NULL, /* lookup */
+ NULL, /* link */
+ NULL, /* unlink */
+ NULL, /* symlink */
+ NULL, /* mkdir */
+ NULL, /* rmdir */
+ NULL, /* mknod */
+ NULL, /* rename */
+ NULL, /* readlink */
+ NULL, /* follow_link */
+ NULL, /* bmap */
+ NULL, /* readpage */
+ NULL, /* writepage */
+ block_flushpage, /* flushpage */
+ NULL, /* truncate */
+ NULL, /* permission */
+ NULL, /* smap */
+ NULL /* revalidate */
+};
+
+struct inode swapper_inode = { i_op: &swapper_inode_operations };
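
The swapper_inode line uses GCC's labeled-element initializer (the old "field:" spelling of today's ".field ="), which is why only i_op needs naming. The ops table above could be written the same way, since unnamed members are zero-filled; both forms are equivalent here:

/* equivalent form of the table above, using the same GCC extension */
static struct inode_operations swapper_inode_operations = {
	flushpage: block_flushpage,
};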
#ifdef SWAP_CACHE_INFO
unsigned long swap_cache_add_total = 0;
@@ -49,20 +73,20 @@ int add_to_swap_cache(struct page *page, unsigned long entry)
#endif
#ifdef DEBUG_SWAP
printk("DebugVM: add_to_swap_cache(%08lx count %d, entry %08lx)\n",
- page_address(page), atomic_read(&page->count), entry);
+ page_address(page), page_count(page), entry);
#endif
if (PageTestandSetSwapCache(page)) {
printk(KERN_ERR "swap_cache: replacing non-empty entry %08lx "
- "on page %08lx\n",
- page->offset, page_address(page));
+ "on page %08lx\n",
+ page->offset, page_address(page));
return 0;
}
if (page->inode) {
printk(KERN_ERR "swap_cache: replacing page-cached entry "
- "on page %08lx\n", page_address(page));
+ "on page %08lx\n", page_address(page));
return 0;
}
- atomic_inc(&page->count);
+ get_page(page);
page->inode = &swapper_inode;
page->offset = entry;
add_page_to_hash_queue(page, &swapper_inode, entry);
@@ -111,7 +135,7 @@ int swap_duplicate(unsigned long entry)
result = 1;
#ifdef DEBUG_SWAP
printk("DebugVM: swap_duplicate(entry %08lx, count now %d)\n",
- entry, p->swap_map[offset]);
+ entry, p->swap_map[offset]);
#endif
out:
return result;
@@ -127,7 +151,7 @@ bad_offset:
bad_unused:
printk(KERN_ERR
"swap_duplicate at %8p: entry %08lx, unused page\n",
- __builtin_return_address(0), entry);
+ __builtin_return_address(0), entry);
goto out;
}
@@ -153,7 +177,7 @@ int swap_count(unsigned long entry)
retval = p->swap_map[offset];
#ifdef DEBUG_SWAP
printk("DebugVM: swap_count(entry %08lx, count %d)\n",
- entry, retval);
+ entry, retval);
#endif
out:
return retval;
@@ -163,16 +187,16 @@ bad_entry:
goto out;
bad_file:
printk(KERN_ERR
- "swap_count: entry %08lx, nonexistent swap file!\n", entry);
+ "swap_count: entry %08lx, nonexistent swap file!\n", entry);
goto out;
bad_offset:
printk(KERN_ERR
- "swap_count: entry %08lx, offset exceeds max!\n", entry);
+ "swap_count: entry %08lx, offset exceeds max!\n", entry);
goto out;
bad_unused:
printk(KERN_ERR
- "swap_count at %8p: entry %08lx, unused page!\n",
- __builtin_return_address(0), entry);
+ "swap_count at %8p: entry %08lx, unused page!\n",
+ __builtin_return_address(0), entry);
goto out;
}
@@ -190,18 +214,17 @@ static inline void remove_from_swap_cache(struct page *page)
#ifdef DEBUG_SWAP
printk("DebugVM: remove_from_swap_cache(%08lx count %d)\n",
- page_address(page), atomic_read(&page->count));
+ page_address(page), page_count(page));
#endif
- PageClearSwapCache (page);
+ PageClearSwapCache(page);
remove_inode_page(page);
}
-
/*
* This must be called only on pages that have
* been verified to be in the swap cache.
*/
-void delete_from_swap_cache(struct page *page)
+void __delete_from_swap_cache(struct page *page)
{
long entry = page->offset;
@@ -210,13 +233,27 @@ void delete_from_swap_cache(struct page *page)
#endif
#ifdef DEBUG_SWAP
printk("DebugVM: delete_from_swap_cache(%08lx count %d, "
- "entry %08lx)\n",
- page_address(page), atomic_read(&page->count), entry);
+ "entry %08lx)\n",
+ page_address(page), page_count(page), entry);
#endif
remove_from_swap_cache (page);
swap_free (entry);
}
+/*
+ * This must be called only on pages that have
+ * been verified to be in the swap cache.
+ */
+void delete_from_swap_cache(struct page *page)
+{
+ lock_page(page);
+
+ __delete_from_swap_cache(page);
+
+ UnlockPage(page);
+ page_cache_release(page);
+}
+
/*
* Perform a free_page(), also freeing any swap cache associated with
* this page if it is the last user of the page.
@@ -229,18 +266,18 @@ void free_page_and_swap_cache(unsigned long addr)
/*
* If we are the only user, then free up the swap cache.
*/
- if (PageSwapCache(page) && !is_page_shared(page)) {
+ if (PageSwapCache(page) && !is_page_shared(page))
delete_from_swap_cache(page);
- }
__free_page(page);
}
/*
- * Lookup a swap entry in the swap cache. We need to be careful about
- * locked pages. A found page will be returned with its refcount
- * incremented.
+ * Lookup a swap entry in the swap cache. A found page will be returned
+ * unlocked and with its refcount incremented - we rely on the kernel
+ * lock to keep page table operations atomic even if we drop the page
+ * lock before returning.
*/
struct page * lookup_swap_cache(unsigned long entry)
@@ -251,23 +288,21 @@ struct page * lookup_swap_cache(unsigned long entry)
swap_cache_find_total++;
#endif
while (1) {
- found = find_page(&swapper_inode, entry);
+ found = find_lock_page(&swapper_inode, entry);
if (!found)
return 0;
if (found->inode != &swapper_inode || !PageSwapCache(found))
goto out_bad;
- if (!PageLocked(found)) {
#ifdef SWAP_CACHE_INFO
- swap_cache_find_success++;
+ swap_cache_find_success++;
#endif
- return found;
- }
- __free_page(found);
- __wait_on_page(found);
+ UnlockPage(found);
+ return found;
}
out_bad:
printk (KERN_ERR "VM: Found a non-swapper swap page!\n");
+ UnlockPage(found);
__free_page(found);
return 0;
}
@@ -288,7 +323,7 @@ struct page * read_swap_cache_async(unsigned long entry, int wait)
#ifdef DEBUG_SWAP
printk("DebugVM: read_swap_cache_async entry %08lx%s\n",
- entry, wait ? ", wait" : "");
+ entry, wait ? ", wait" : "");
#endif
/*
* Make sure the swap entry is still in use.
@@ -319,12 +354,12 @@ struct page * read_swap_cache_async(unsigned long entry, int wait)
if (!add_to_swap_cache(new_page, entry))
goto out_free_page;
- set_bit(PG_locked, &new_page->flags);
+ LockPage(new_page);
rw_swap_page(READ, entry, (char *) new_page_addr, wait);
#ifdef DEBUG_SWAP
printk("DebugVM: read_swap_cache_async created "
- "entry %08lx at %p\n",
- entry, (char *) page_address(new_page));
+ "entry %08lx at %p\n",
+ entry, (char *) page_address(new_page));
#endif
return new_page;
@@ -335,3 +370,4 @@ out_free_swap:
out:
return found_page;
}
+
diff --git a/mm/swapfile.c b/mm/swapfile.c
index de29f1006..794e39aff 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -192,7 +192,7 @@ static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
return;
set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
- atomic_inc(&mem_map[MAP_NR(page)].count);
+ get_page(mem_map + MAP_NR(page));
++vma->vm_mm->rss;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d651e6f94..9ca4988e4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -157,7 +157,7 @@ drop_pte:
add_to_swap_cache(page_map, entry);
/* We checked we were unlocked way up above, and we
have been careful not to stall until here */
- set_bit(PG_locked, &page_map->flags);
+ LockPage(page_map);
/* OK, do a physical asynchronous write to swap. */
rw_swap_page(WRITE, entry, (char *) page, 0);