author     Ralf Baechle <ralf@linux-mips.org>          1999-10-09 00:00:47 +0000
committer  Ralf Baechle <ralf@linux-mips.org>          1999-10-09 00:00:47 +0000
commit     d6434e1042f3b0a6dfe1b1f615af369486f9b1fa (patch)
tree       e2be02f33984c48ec019c654051d27964e42c441 /mm
parent     609d1e803baf519487233b765eb487f9ec227a18 (diff)

    Merge with 2.3.19.
Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile     |    4
-rw-r--r--  mm/bigmem.c     |   71
-rw-r--r--  mm/filemap.c    |  533
-rw-r--r--  mm/memory.c     |  245
-rw-r--r--  mm/mlock.c      |   10
-rw-r--r--  mm/mmap.c       |   11
-rw-r--r--  mm/mprotect.c   |    2
-rw-r--r--  mm/mremap.c     |    2
-rw-r--r--  mm/page_alloc.c |   95
-rw-r--r--  mm/page_io.c    |   82
-rw-r--r--  mm/slab.c       |  152
-rw-r--r--  mm/swap_state.c |   19
-rw-r--r--  mm/swapfile.c   |   98
-rw-r--r--  mm/vmalloc.c    |    3
-rw-r--r--  mm/vmscan.c     |  171
15 files changed, 951 insertions, 547 deletions
diff --git a/mm/Makefile b/mm/Makefile
index c64eefbd2..68404aa67 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,4 +12,8 @@ O_OBJS := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
vmalloc.o slab.o \
swap.o vmscan.o page_io.o page_alloc.o swap_state.o swapfile.o
+ifeq ($(CONFIG_BIGMEM),y)
+O_OBJS += bigmem.o
+endif
+
include $(TOPDIR)/Rules.make
diff --git a/mm/bigmem.c b/mm/bigmem.c
new file mode 100644
index 000000000..af63e860c
--- /dev/null
+++ b/mm/bigmem.c
@@ -0,0 +1,71 @@
+/*
+ * BIGMEM common code and variables.
+ *
+ * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
+ * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
+ */
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/bigmem.h>
+
+unsigned long bigmem_mapnr;
+int nr_free_bigpages = 0;
+
+struct page * prepare_bigmem_swapout(struct page * page)
+{
+ /* if this is a bigmem page so it can't be swapped out directly
+ otherwise the b_data buffer addresses will break
+ the lowlevel device drivers. */
+ if (PageBIGMEM(page)) {
+ unsigned long regular_page;
+ unsigned long vaddr;
+
+ regular_page = __get_free_page(GFP_ATOMIC);
+ if (!regular_page)
+ return NULL;
+
+ vaddr = kmap(page_address(page), KM_READ);
+ copy_page(regular_page, vaddr);
+ kunmap(vaddr, KM_READ);
+
+ /* ok, we can just forget about our bigmem page since
+ we stored its data into the new regular_page. */
+ __free_page(page);
+
+ page = MAP_NR(regular_page) + mem_map;
+ }
+ return page;
+}
+
+struct page * replace_with_bigmem(struct page * page)
+{
+ if (!PageBIGMEM(page) && nr_free_bigpages) {
+ unsigned long kaddr;
+
+ kaddr = __get_free_page(GFP_ATOMIC|GFP_BIGMEM);
+ if (kaddr) {
+ struct page * bigmem_page;
+
+ bigmem_page = MAP_NR(kaddr) + mem_map;
+ if (PageBIGMEM(bigmem_page)) {
+ unsigned long vaddr;
+
+ vaddr = kmap(kaddr, KM_WRITE);
+ copy_page(vaddr, page_address(page));
+ kunmap(vaddr, KM_WRITE);
+
+ /* Preserve the caching of the swap_entry. */
+ bigmem_page->offset = page->offset;
+
+ /* We can just forget the old page since
+ we stored its data into the new
+ bigmem_page. */
+ __free_page(page);
+
+ page = bigmem_page;
+ }
+ }
+ }
+ return page;
+}
diff --git a/mm/filemap.c b/mm/filemap.c
index 668c6c99f..5efa9aaf7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,8 @@
*
* finished 'unifying' the page and buffer cache and SMP-threaded the
* page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
+ *
+ * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
*/
atomic_t page_cache_size = ATOMIC_INIT(0);
@@ -40,7 +42,16 @@ unsigned int page_hash_bits;
struct page **page_hash_table;
spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
+/*
+ * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
+ * the pagemap_lru_lock held.
+ */
+spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
+#define CLUSTER_PAGES (1 << page_cluster)
+#define CLUSTER_SHIFT (PAGE_CACHE_SHIFT + page_cluster)
+#define CLUSTER_BYTES (1 << CLUSTER_SHIFT)
+#define CLUSTER_OFFSET(x) (((x) >> CLUSTER_SHIFT) << CLUSTER_SHIFT)
void __add_page_to_hash_queue(struct page * page, struct page **p)
{
@@ -117,6 +128,7 @@ repeat:
}
if (page_count(page) != 2)
printk("hm, busy page invalidated? (not necesserily a bug)\n");
+ lru_cache_del(page);
remove_page_from_inode_queue(page);
remove_page_from_hash_queue(page);
@@ -151,8 +163,9 @@ repeat:
lock_page(page);
- if (inode->i_op->flushpage)
- inode->i_op->flushpage(inode, page, 0);
+ if (!inode->i_op->flushpage ||
+ inode->i_op->flushpage(inode, page, 0))
+ lru_cache_del(page);
/*
* We remove the page from the page cache
@@ -212,93 +225,75 @@ repeat:
spin_unlock(&pagecache_lock);
}
-extern atomic_t too_many_dirty_buffers;
-
int shrink_mmap(int priority, int gfp_mask)
{
- static unsigned long clock = 0;
- unsigned long limit = num_physpages << 1;
+ int ret = 0, count;
+ LIST_HEAD(young);
+ LIST_HEAD(old);
+ LIST_HEAD(forget);
+ struct list_head * page_lru, * dispose;
struct page * page;
- int count, users;
- count = limit >> priority;
+ count = nr_lru_pages / (priority+1);
- page = mem_map + clock;
- do {
- int referenced;
+ spin_lock(&pagemap_lru_lock);
- /* This works even in the presence of PageSkip because
- * the first two entries at the beginning of a hole will
- * be marked, not just the first.
- */
- page++;
- clock++;
- if (clock >= max_mapnr) {
- clock = 0;
- page = mem_map;
- }
- if (PageSkip(page)) {
- /* next_hash is overloaded for PageSkip */
- page = page->next_hash;
- clock = page - mem_map;
- }
-
- referenced = test_and_clear_bit(PG_referenced, &page->flags);
+ while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
+ page = list_entry(page_lru, struct page, lru);
+ list_del(page_lru);
+ dispose = &lru_cache;
+ if (test_and_clear_bit(PG_referenced, &page->flags))
+ /* Roll the page at the top of the lru list,
+ * we could also be more aggressive putting
+ * the page in the young-dispose-list, so
+ * avoiding to free young pages in each pass.
+ */
+ goto dispose_continue;
+
+ dispose = &old;
+ /* don't account passes over not DMA pages */
if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
- continue;
+ goto dispose_continue;
+ if (!(gfp_mask & __GFP_BIGMEM) && PageBIGMEM(page))
+ goto dispose_continue;
count--;
- /*
- * Some common cases that we just short-circuit without
- * getting the locks - we need to re-check this once we
- * have the lock, but that's fine.
- */
- users = page_count(page);
- if (!users)
- continue;
- if (!page->buffers) {
- if (!page->inode)
- continue;
- if (users > 1)
- continue;
- }
-
- /*
- * ok, now the page looks interesting. Re-check things
- * and keep the lock.
- */
+ dispose = &young;
+ if (TryLockPage(page))
+ goto dispose_continue;
+
+ /* Release the pagemap_lru lock even if the page is not yet
+ queued in any lru queue since we have just locked down
+ the page so nobody else may SMP race with us running
+ a lru_cache_del() (lru_cache_del() always run with the
+ page locked down ;). */
+ spin_unlock(&pagemap_lru_lock);
+
+ /* avoid unscalable SMP locking */
+ if (!page->buffers && page_count(page) > 1)
+ goto unlock_noput_continue;
+
+ /* Take the pagecache_lock spinlock held to avoid
+ other tasks to notice the page while we are looking at its
+ page count. If it's a pagecache-page we'll free it
+ in one atomic transaction after checking its page count. */
spin_lock(&pagecache_lock);
- if (!page->inode && !page->buffers) {
- spin_unlock(&pagecache_lock);
- continue;
- }
- if (!page_count(page)) {
- spin_unlock(&pagecache_lock);
- BUG();
- continue;
- }
- get_page(page);
- if (TryLockPage(page)) {
- spin_unlock(&pagecache_lock);
- goto put_continue;
- }
- /*
- * we keep pagecache_lock locked and unlock it in
- * each branch, so that the page->inode case doesnt
- * have to re-grab it. Here comes the 'real' logic
- * to free memory:
- */
+ /* avoid freeing the page while it's locked */
+ get_page(page);
/* Is it a buffer page? */
if (page->buffers) {
- int mem = page->inode ? 0 : PAGE_CACHE_SIZE;
spin_unlock(&pagecache_lock);
if (!try_to_free_buffers(page))
goto unlock_continue;
- atomic_sub(mem, &buffermem);
+ /* page was locked, inode can't go away under us */
+ if (!page->inode) {
+ atomic_sub(PAGE_CACHE_SIZE, &buffermem);
+ goto made_buffer_progress;
+ }
spin_lock(&pagecache_lock);
}
@@ -307,7 +302,7 @@ int shrink_mmap(int priority, int gfp_mask)
* (count == 2 because we added one ourselves above).
*/
if (page_count(page) != 2)
- goto spin_unlock_continue;
+ goto cache_unlock_continue;
/*
* Is it a page swap page? If so, we want to
@@ -316,35 +311,68 @@ int shrink_mmap(int priority, int gfp_mask)
*/
if (PageSwapCache(page)) {
spin_unlock(&pagecache_lock);
- if (referenced && swap_count(page->offset) != 2)
- goto unlock_continue;
__delete_from_swap_cache(page);
- page_cache_release(page);
- goto made_progress;
+ goto made_inode_progress;
}
/* is it a page-cache page? */
- if (!referenced && page->inode && !pgcache_under_min()) {
- remove_page_from_inode_queue(page);
- remove_page_from_hash_queue(page);
- page->inode = NULL;
- spin_unlock(&pagecache_lock);
-
- page_cache_release(page);
- goto made_progress;
+ if (page->inode)
+ {
+ dispose = &old;
+ if (!pgcache_under_min())
+ {
+ remove_page_from_inode_queue(page);
+ remove_page_from_hash_queue(page);
+ page->inode = NULL;
+ spin_unlock(&pagecache_lock);
+ goto made_inode_progress;
+ }
+ goto cache_unlock_continue;
}
-spin_unlock_continue:
+
+ dispose = &forget;
+ printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
+
+cache_unlock_continue:
spin_unlock(&pagecache_lock);
unlock_continue:
UnlockPage(page);
-put_continue:
put_page(page);
- } while (count > 0);
- return 0;
-made_progress:
+dispose_relock_continue:
+ /* even if the dispose list is local, a truncate_inode_page()
+ may remove a page from its queue so always
+ synchronize with the lru lock while accesing the
+ page->lru field */
+ spin_lock(&pagemap_lru_lock);
+ list_add(page_lru, dispose);
+ continue;
+
+unlock_noput_continue:
+ UnlockPage(page);
+ goto dispose_relock_continue;
+
+dispose_continue:
+ list_add(page_lru, dispose);
+ }
+ goto out;
+
+made_inode_progress:
+ page_cache_release(page);
+made_buffer_progress:
UnlockPage(page);
put_page(page);
- return 1;
+ ret = 1;
+ spin_lock(&pagemap_lru_lock);
+ /* nr_lru_pages needs the spinlock */
+ nr_lru_pages--;
+
+out:
+ list_splice(&young, &lru_cache);
+ list_splice(&old, lru_cache.prev);
+
+ spin_unlock(&pagemap_lru_lock);
+
+ return ret;
}
static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page)
@@ -461,13 +489,14 @@ static inline void __add_to_page_cache(struct page * page,
{
unsigned long flags;
- flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error));
- page->flags = flags | ((1 << PG_locked) | (1 << PG_referenced));
- page->owner = (int)current; /* REMOVEME */
+ flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
+ page->flags = flags | (1 << PG_locked);
+ page->owner = current; /* REMOVEME */
get_page(page);
page->offset = offset;
add_page_to_inode_queue(inode, page);
__add_page_to_hash_queue(page, hash);
+ lru_cache_add(page);
}
void add_to_page_cache(struct page * page, struct inode * inode, unsigned long offset)
@@ -498,39 +527,58 @@ int add_to_page_cache_unique(struct page * page,
}
/*
- * Try to read ahead in the file. "page_cache" is a potentially free page
- * that we could use for the cache (if it is 0 we can try to create one,
- * this is all overlapped with the IO on the previous page finishing anyway)
+ * This adds the requested page to the page cache if it isn't already there,
+ * and schedules an I/O to read in its contents from disk.
*/
-static unsigned long try_to_read_ahead(struct file * file,
- unsigned long offset, unsigned long page_cache)
+static inline void page_cache_read(struct file * file, unsigned long offset)
{
+ unsigned long new_page;
struct inode *inode = file->f_dentry->d_inode;
- struct page * page;
- struct page ** hash;
+ struct page ** hash = page_hash(inode, offset);
+ struct page * page;
- offset &= PAGE_CACHE_MASK;
- switch (page_cache) {
- case 0:
- page_cache = page_cache_alloc();
- if (!page_cache)
- break;
- default:
- if (offset >= inode->i_size)
- break;
- hash = page_hash(inode, offset);
- page = page_cache_entry(page_cache);
- if (!add_to_page_cache_unique(page, inode, offset, hash)) {
- /*
- * We do not have to check the return value here
- * because it's a readahead.
- */
- inode->i_op->readpage(file, page);
- page_cache = 0;
- page_cache_release(page);
- }
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(inode, offset, *hash);
+ spin_unlock(&pagecache_lock);
+ if (page)
+ return;
+
+ new_page = page_cache_alloc();
+ if (!new_page)
+ return;
+ page = page_cache_entry(new_page);
+
+ if (!add_to_page_cache_unique(page, inode, offset, hash)) {
+ inode->i_op->readpage(file, page);
+ page_cache_release(page);
+ return;
+ }
+
+ /*
+ * We arrive here in the unlikely event that someone
+ * raced with us and added our page to the cache first.
+ */
+ page_cache_free(new_page);
+ return;
+}
+
+/*
+ * Read in an entire cluster at once. A cluster is usually a 64k-
+ * aligned block that includes the address requested in "offset."
+ */
+static void read_cluster_nonblocking(struct file * file,
+ unsigned long offset)
+{
+ off_t filesize = file->f_dentry->d_inode->i_size;
+ unsigned long pages = CLUSTER_PAGES;
+
+ offset = CLUSTER_OFFSET(offset);
+ while ((pages-- > 0) && (offset < filesize)) {
+ page_cache_read(file, offset);
+ offset += PAGE_CACHE_SIZE;
}
- return page_cache;
+
+ return;
}
/*
@@ -547,8 +595,8 @@ void ___wait_on_page(struct page *page)
add_wait_queue(&page->wait, &wait);
do {
- tsk->state = TASK_UNINTERRUPTIBLE;
run_task_queue(&tq_disk);
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!PageLocked(page))
break;
schedule();
@@ -562,23 +610,8 @@ void ___wait_on_page(struct page *page)
*/
void lock_page(struct page *page)
{
- if (TryLockPage(page)) {
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, current);
-
- run_task_queue(&tq_disk);
- add_wait_queue(&page->wait, &wait);
- tsk->state = TASK_UNINTERRUPTIBLE;
-
- while (TryLockPage(page)) {
- run_task_queue(&tq_disk);
- schedule();
- tsk->state = TASK_UNINTERRUPTIBLE;
- }
-
- remove_wait_queue(&page->wait, &wait);
- tsk->state = TASK_RUNNING;
- }
+ while (TryLockPage(page))
+ ___wait_on_page(page);
}
@@ -607,13 +640,14 @@ repeat:
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);
+ run_task_queue(&tq_disk);
+
+ __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
add_wait_queue(&page->wait, &wait);
- tsk->state = TASK_UNINTERRUPTIBLE;
- run_task_queue(&tq_disk);
if (PageLocked(page))
schedule();
- tsk->state = TASK_RUNNING;
+ __set_task_state(tsk, TASK_RUNNING);
remove_wait_queue(&page->wait, &wait);
/*
@@ -656,13 +690,14 @@ repeat:
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);
+ run_task_queue(&tq_disk);
+
+ __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
add_wait_queue(&page->wait, &wait);
- tsk->state = TASK_UNINTERRUPTIBLE;
- run_task_queue(&tq_disk);
if (PageLocked(page))
schedule();
- tsk->state = TASK_RUNNING;
+ __set_task_state(tsk, TASK_RUNNING);
remove_wait_queue(&page->wait, &wait);
/*
@@ -811,9 +846,9 @@ static inline int get_max_readahead(struct inode * inode)
return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
}
-static inline unsigned long generic_file_readahead(int reada_ok,
+static void generic_file_readahead(int reada_ok,
struct file * filp, struct inode * inode,
- unsigned long ppos, struct page * page, unsigned long page_cache)
+ unsigned long ppos, struct page * page)
{
unsigned long max_ahead, ahead;
unsigned long raend;
@@ -877,8 +912,7 @@ static inline unsigned long generic_file_readahead(int reada_ok,
ahead = 0;
while (ahead < max_ahead) {
ahead += PAGE_CACHE_SIZE;
- page_cache = try_to_read_ahead(filp, raend + ahead,
- page_cache);
+ page_cache_read(filp, raend + ahead);
}
/*
* If we tried to read ahead some pages,
@@ -910,26 +944,9 @@ static inline unsigned long generic_file_readahead(int reada_ok,
#endif
}
- return page_cache;
+ return;
}
-/*
- * "descriptor" for what we're up to with a read.
- * This allows us to use the same read code yet
- * have multiple different users of the data that
- * we read from a file.
- *
- * The simplest case just copies the data to user
- * mode.
- */
-typedef struct {
- size_t written;
- size_t count;
- char * buf;
- int error;
-} read_descriptor_t;
-
-typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
/*
* This is a generic file read routine, and uses the
@@ -939,7 +956,7 @@ typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
*/
-static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
+void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
struct dentry *dentry = filp->f_dentry;
struct inode *inode = dentry->d_inode;
@@ -1044,7 +1061,8 @@ page_ok:
* Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
*/
page_not_up_to_date:
- page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
+ generic_file_readahead(reada_ok, filp, inode,
+ pos & PAGE_CACHE_MASK, page);
if (Page_Uptodate(page))
goto page_ok;
@@ -1065,7 +1083,8 @@ readpage:
goto page_ok;
/* Again, try some read-ahead while waiting for the page to finish.. */
- page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
+ generic_file_readahead(reada_ok, filp, inode,
+ pos & PAGE_CACHE_MASK, page);
wait_on_page(page);
if (Page_Uptodate(page))
goto page_ok;
@@ -1267,31 +1286,36 @@ out:
}
/*
- * Semantics for shared and private memory areas are different past the end
- * of the file. A shared mapping past the last page of the file is an error
- * and results in a SIGBUS, while a private mapping just maps in a zero page.
+ * filemap_nopage() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
*
* The goto's are kind of ugly, but this streamlines the normal case of having
* it in the page cache, and handles the special cases reasonably without
* having a lot of duplicated code.
*
- * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
- * ahead of the wait if we're sure to need it.
+ * XXX - at some point, this should return unique values to indicate to
+ * the caller whether this is EIO, OOM, or SIGBUS.
*/
-static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
+static unsigned long filemap_nopage(struct vm_area_struct * area,
+ unsigned long address, int no_share)
{
struct file * file = area->vm_file;
struct dentry * dentry = file->f_dentry;
struct inode * inode = dentry->d_inode;
- unsigned long offset, reada, i;
struct page * page, **hash;
- unsigned long old_page, new_page;
- int error;
+ unsigned long old_page;
+
+ unsigned long offset = address - area->vm_start + area->vm_offset;
- new_page = 0;
- offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
- if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
- goto no_page;
+ /*
+ * Semantics for shared and private memory areas are different
+ * past the end of the file. A shared mapping past the last page
+ * of the file is an error and results in a SIGBUS, while a
+ * private mapping just maps in a zero page.
+ */
+ if ((offset >= inode->i_size) &&
+ (area->vm_flags & VM_SHARED) && (area->vm_mm == current->mm))
+ return 0;
/*
* Do we have something in the page cache already?
@@ -1302,24 +1326,12 @@ retry_find:
if (!page)
goto no_cached_page;
-found_page:
/*
* Ok, found a page in the page cache, now we need to check
- * that it's up-to-date. First check whether we'll need an
- * extra page -- better to overlap the allocation with the I/O.
+ * that it's up-to-date.
*/
- if (no_share && !new_page) {
- new_page = page_cache_alloc();
- if (!new_page)
- goto failure;
- }
-
- if (!Page_Uptodate(page)) {
- lock_page(page);
- if (!Page_Uptodate(page))
- goto page_not_uptodate;
- UnlockPage(page);
- }
+ if (!Page_Uptodate(page))
+ goto page_not_uptodate;
success:
/*
@@ -1327,100 +1339,76 @@ success:
* and possibly copy it over to another page..
*/
old_page = page_address(page);
- if (!no_share) {
- /*
- * Ok, we can share the cached page directly.. Get rid
- * of any potential extra pages.
- */
- if (new_page)
- page_cache_free(new_page);
+ if (no_share) {
+ unsigned long new_page = page_cache_alloc();
- flush_page_to_ram(old_page);
- return old_page;
+ if (new_page) {
+ copy_page(new_page, old_page);
+ flush_page_to_ram(new_page);
+ }
+ page_cache_release(page);
+ return new_page;
}
-
- /*
- * No sharing ... copy to the new page.
- */
- copy_page(new_page, old_page);
- flush_page_to_ram(new_page);
- page_cache_release(page);
- return new_page;
+
+ flush_page_to_ram(old_page);
+ return old_page;
no_cached_page:
/*
- * Try to read in an entire cluster at once.
- */
- reada = offset;
- reada >>= PAGE_CACHE_SHIFT + page_cluster;
- reada <<= PAGE_CACHE_SHIFT + page_cluster;
-
- for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE)
- new_page = try_to_read_ahead(file, reada, new_page);
-
- if (!new_page)
- new_page = page_cache_alloc();
- if (!new_page)
- goto no_page;
-
- /*
- * During getting the above page we might have slept,
- * so we need to re-check the situation with the page
- * cache.. The page we just got may be useful if we
- * can't share, so don't get rid of it here.
- */
- page = __find_get_page(inode, offset, hash);
- if (page)
- goto found_page;
-
- /*
- * Now, create a new page-cache page from the page we got
+ * If the requested offset is within our file, try to read a whole
+ * cluster of pages at once.
+ *
+ * Otherwise, we're off the end of a privately mapped file,
+ * so we need to map a zero page.
*/
- page = page_cache_entry(new_page);
- if (add_to_page_cache_unique(page, inode, offset, hash))
- goto retry_find;
+ if (offset < inode->i_size)
+ read_cluster_nonblocking(file, offset);
+ else
+ page_cache_read(file, offset);
/*
- * Now it's ours and locked, we can do initial IO to it:
+ * The page we want has now been added to the page cache.
+ * In the unlikely event that someone removed it in the
+ * meantime, we'll just come back here and read it again.
*/
- new_page = 0;
+ goto retry_find;
page_not_uptodate:
- error = inode->i_op->readpage(file, page);
+ lock_page(page);
+ if (Page_Uptodate(page)) {
+ UnlockPage(page);
+ goto success;
+ }
- if (!error) {
+ if (!inode->i_op->readpage(file, page)) {
wait_on_page(page);
- if (PageError(page))
- goto page_read_error;
- goto success;
+ if (Page_Uptodate(page))
+ goto success;
}
-page_read_error:
/*
* Umm, take care of errors if the page isn't up-to-date.
* Try to re-read it _once_. We do this synchronously,
* because there really aren't any performance issues here
* and we need to check for errors.
*/
- if (!PageLocked(page))
- PAGE_BUG(page);
- ClearPageError(page);
- error = inode->i_op->readpage(file, page);
- if (error)
- goto failure;
- wait_on_page(page);
- if (Page_Uptodate(page))
+ lock_page(page);
+ if (Page_Uptodate(page)) {
+ UnlockPage(page);
goto success;
+ }
+ ClearPageError(page);
+ if (!inode->i_op->readpage(file, page)) {
+ wait_on_page(page);
+ if (Page_Uptodate(page))
+ goto success;
+ }
/*
* Things didn't work out. Return zero to tell the
* mm layer so, possibly freeing the page cache page first.
*/
-failure:
page_cache_release(page);
- if (new_page)
- page_cache_free(new_page);
-no_page:
return 0;
}
@@ -1702,7 +1690,7 @@ static int msync_interval(struct vm_area_struct * vma,
return 0;
}
-asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
+asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
{
unsigned long end;
struct vm_area_struct * vma;
@@ -1855,28 +1843,29 @@ repeat_find:
if (!PageLocked(page)) {
PAGE_BUG(page);
} else {
- if (page->owner != (int)current) {
+ if (page->owner != current) {
PAGE_BUG(page);
}
}
status = write_one_page(file, page, offset, bytes, buf);
+ if (status >= 0) {
+ written += status;
+ count -= status;
+ pos += status;
+ buf += status;
+ if (pos > inode->i_size)
+ inode->i_size = pos;
+ }
/* Mark it unlocked again and drop the page.. */
UnlockPage(page);
page_cache_release(page);
if (status < 0)
break;
-
- written += status;
- count -= status;
- pos += status;
- buf += status;
}
*ppos = pos;
- if (pos > inode->i_size)
- inode->i_size = pos;
if (page_cache)
page_cache_free(page_cache);
diff --git a/mm/memory.c b/mm/memory.c
index a31e862b2..5498dbcf0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -31,6 +31,9 @@
/*
* 05.04.94 - Multi-page memory management added for v1.1.
* Idea by Alex Bligh (alex@cconcepts.co.uk)
+ *
+ * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
+ * (Gerhard.Wichert@pdb.siemens.de)
*/
#include <linux/mm.h>
@@ -39,6 +42,8 @@
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/swapctl.h>
+#include <linux/iobuf.h>
+#include <linux/bigmem.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -55,10 +60,10 @@ void * high_memory = NULL;
static inline void copy_cow_page(unsigned long from, unsigned long to)
{
if (from == ZERO_PAGE(to)) {
- clear_page(to);
+ clear_bigpage(to);
return;
}
- copy_page(to, from);
+ copy_bigpage(to, from);
}
mem_map_t * mem_map = NULL;
@@ -142,39 +147,6 @@ void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
check_pgt_cache();
}
-/*
- * This function just free's the page directory - the
- * pages tables themselves have been freed earlier by
- * clear_page_tables().
- */
-void free_page_tables(struct mm_struct * mm)
-{
- pgd_t * page_dir = mm->pgd;
-
- if (page_dir) {
- if (page_dir == swapper_pg_dir)
- goto out_bad;
- pgd_free(page_dir);
- }
- return;
-
-out_bad:
- printk(KERN_ERR
- "free_page_tables: Trying to free kernel pgd\n");
- return;
-}
-
-int new_page_tables(struct task_struct * tsk)
-{
- pgd_t * new_pg;
-
- if (!(new_pg = pgd_alloc()))
- return -ENOMEM;
- SET_PAGE_DIR(tsk, new_pg);
- tsk->mm->pgd = new_pg;
- return 0;
-}
-
#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t))
#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t))
@@ -417,6 +389,192 @@ void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long s
}
}
+
+/*
+ * Do a quick page-table lookup for a single page.
+ */
+static unsigned long follow_page(unsigned long address)
+{
+ pgd_t *pgd;
+ pmd_t *pmd;
+
+ pgd = pgd_offset(current->mm, address);
+ pmd = pmd_offset(pgd, address);
+ if (pmd) {
+ pte_t * pte = pte_offset(pmd, address);
+ if (pte && pte_present(*pte)) {
+ return pte_page(*pte);
+ }
+ }
+
+ printk(KERN_ERR "Missing page in follow_page\n");
+ return 0;
+}
+
+/*
+ * Given a physical address, is there a useful struct page pointing to it?
+ */
+
+static struct page * get_page_map(unsigned long page)
+{
+ struct page *map;
+
+ if (MAP_NR(page) >= max_mapnr)
+ return 0;
+ if (page == ZERO_PAGE(page))
+ return 0;
+ map = mem_map + MAP_NR(page);
+ if (PageReserved(map))
+ return 0;
+ return map;
+}
+
+/*
+ * Force in an entire range of pages from the current process's user VA,
+ * and pin and lock the pages for IO.
+ */
+
+#define dprintk(x...)
+int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len)
+{
+ unsigned long ptr, end;
+ int err;
+ struct mm_struct * mm;
+ struct vm_area_struct * vma = 0;
+ unsigned long page;
+ struct page * map;
+ int doublepage = 0;
+ int repeat = 0;
+ int i;
+
+ /* Make sure the iobuf is not already mapped somewhere. */
+ if (iobuf->nr_pages)
+ return -EINVAL;
+
+ mm = current->mm;
+ dprintk ("map_user_kiobuf: begin\n");
+
+ ptr = va & PAGE_MASK;
+ end = (va + len + PAGE_SIZE - 1) & PAGE_MASK;
+ err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT);
+ if (err)
+ return err;
+
+ repeat:
+ down(&mm->mmap_sem);
+
+ err = -EFAULT;
+ iobuf->locked = 1;
+ iobuf->offset = va & ~PAGE_MASK;
+ iobuf->length = len;
+
+ i = 0;
+
+ /*
+ * First of all, try to fault in all of the necessary pages
+ */
+ while (ptr < end) {
+ if (!vma || ptr >= vma->vm_end) {
+ vma = find_vma(current->mm, ptr);
+ if (!vma)
+ goto out_unlock;
+ }
+ if (handle_mm_fault(current, vma, ptr, (rw==READ)) <= 0)
+ goto out_unlock;
+ spin_lock(&mm->page_table_lock);
+ page = follow_page(ptr);
+ if (!page) {
+ dprintk (KERN_ERR "Missing page in map_user_kiobuf\n");
+ map = NULL;
+ goto retry;
+ }
+ map = get_page_map(page);
+ if (map) {
+ if (TryLockPage(map)) {
+ goto retry;
+ }
+ atomic_inc(&map->count);
+ }
+ spin_unlock(&mm->page_table_lock);
+ dprintk ("Installing page %p %p: %d\n", (void *)page, map, i);
+ iobuf->pagelist[i] = page;
+ iobuf->maplist[i] = map;
+ iobuf->nr_pages = ++i;
+
+ ptr += PAGE_SIZE;
+ }
+
+ up(&mm->mmap_sem);
+ dprintk ("map_user_kiobuf: end OK\n");
+ return 0;
+
+ out_unlock:
+ up(&mm->mmap_sem);
+ unmap_kiobuf(iobuf);
+ dprintk ("map_user_kiobuf: end %d\n", err);
+ return err;
+
+ retry:
+
+ /*
+ * Undo the locking so far, wait on the page we got to, and try again.
+ */
+ spin_unlock(&mm->page_table_lock);
+ unmap_kiobuf(iobuf);
+ up(&mm->mmap_sem);
+
+ /*
+ * Did the release also unlock the page we got stuck on?
+ */
+ if (map) {
+ if (!PageLocked(map)) {
+ /* If so, we may well have the page mapped twice
+ * in the IO address range. Bad news. Of
+ * course, it _might_ * just be a coincidence,
+ * but if it happens more than * once, chances
+ * are we have a double-mapped page. */
+ if (++doublepage >= 3) {
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Try again...
+ */
+ wait_on_page(map);
+ }
+
+ if (++repeat < 16) {
+ ptr = va & PAGE_MASK;
+ goto repeat;
+ }
+ return -EAGAIN;
+}
+
+
+/*
+ * Unmap all of the pages referenced by a kiobuf. We release the pages,
+ * and unlock them if they were locked.
+ */
+
+void unmap_kiobuf (struct kiobuf *iobuf)
+{
+ int i;
+ struct page *map;
+
+ for (i = 0; i < iobuf->nr_pages; i++) {
+ map = iobuf->maplist[i];
+
+ if (map && iobuf->locked) {
+ __free_page(map);
+ UnlockPage(map);
+ }
+ }
+
+ iobuf->nr_pages = 0;
+ iobuf->locked = 0;
+}
+
static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
unsigned long size, pgprot_t prot)
{
@@ -655,7 +813,7 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
* Ok, we need to copy. Oh, well..
*/
spin_unlock(&tsk->mm->page_table_lock);
- new_page = __get_free_page(GFP_USER);
+ new_page = __get_free_page(GFP_BIGUSER);
if (!new_page)
return -1;
spin_lock(&tsk->mm->page_table_lock);
@@ -667,7 +825,6 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
if (PageReserved(page))
++vma->vm_mm->rss;
copy_cow_page(old_page,new_page);
- flush_page_to_ram(old_page);
flush_page_to_ram(new_page);
flush_cache_page(vma, address);
set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
@@ -681,6 +838,7 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
return 1;
bad_wp_page:
+ spin_unlock(&tsk->mm->page_table_lock);
printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
return -1;
}
@@ -781,7 +939,7 @@ out_unlock:
* because it doesn't cost us any seek time. We also make sure to queue
* the 'original' request together with the readahead ones...
*/
-static void swapin_readahead(unsigned long entry)
+void swapin_readahead(unsigned long entry)
{
int i;
struct page *new_page;
@@ -833,12 +991,17 @@ static int do_swap_page(struct task_struct * tsk,
vma->vm_mm->rss++;
tsk->min_flt++;
+ lock_kernel();
swap_free(entry);
+ unlock_kernel();
pte = mk_pte(page_address(page), vma->vm_page_prot);
+ set_bit(PG_swap_entry, &page->flags);
if (write_access && !is_page_shared(page)) {
delete_from_swap_cache(page);
+ page = replace_with_bigmem(page);
+ pte = mk_pte(page_address(page), vma->vm_page_prot);
pte = pte_mkwrite(pte_mkdirty(pte));
}
set_pte(page_table, pte);
@@ -854,10 +1017,10 @@ static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * v
{
pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
if (write_access) {
- unsigned long page = __get_free_page(GFP_USER);
+ unsigned long page = __get_free_page(GFP_BIGUSER);
if (!page)
return -1;
- clear_page(page);
+ clear_bigpage(page);
entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
vma->vm_mm->rss++;
tsk->min_flt++;
@@ -898,6 +1061,8 @@ static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
if (!page)
return 0; /* SIGBUS - but we _really_ should know whether it is OOM or SIGBUS */
+ if (page == -1)
+ return -1; /* OOM */
++tsk->maj_flt;
++vma->vm_mm->rss;
diff --git a/mm/mlock.c b/mm/mlock.c
index d6b19cfb1..be5e07cbf 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -130,7 +130,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
struct vm_area_struct * vma, * next;
int error;
- if (!capable(CAP_IPC_LOCK))
+ if (on && !capable(CAP_IPC_LOCK))
return -EPERM;
len = (len + ~PAGE_MASK) & PAGE_MASK;
end = start + len;
@@ -172,7 +172,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
return error;
}
-asmlinkage int sys_mlock(unsigned long start, size_t len)
+asmlinkage long sys_mlock(unsigned long start, size_t len)
{
unsigned long locked;
unsigned long lock_limit;
@@ -203,7 +203,7 @@ out:
return error;
}
-asmlinkage int sys_munlock(unsigned long start, size_t len)
+asmlinkage long sys_munlock(unsigned long start, size_t len)
{
int ret;
@@ -244,7 +244,7 @@ static int do_mlockall(int flags)
return error;
}
-asmlinkage int sys_mlockall(int flags)
+asmlinkage long sys_mlockall(int flags)
{
unsigned long lock_limit;
int ret = -EINVAL;
@@ -271,7 +271,7 @@ out:
return ret;
}
-asmlinkage int sys_munlockall(void)
+asmlinkage long sys_munlockall(void)
{
int ret;
diff --git a/mm/mmap.c b/mm/mmap.c
index 61826cfa2..3bddb5c18 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -275,7 +275,7 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
vma->vm_ops = NULL;
vma->vm_offset = off;
vma->vm_file = NULL;
- vma->vm_pte = 0;
+ vma->vm_private_data = NULL;
/* Clear old maps */
error = -ENOMEM;
@@ -547,7 +547,7 @@ static struct vm_area_struct * unmap_fixup(struct vm_area_struct *area,
mpnt->vm_ops = area->vm_ops;
mpnt->vm_offset = area->vm_offset + (end - area->vm_start);
mpnt->vm_file = area->vm_file;
- mpnt->vm_pte = area->vm_pte;
+ mpnt->vm_private_data = area->vm_private_data;
if (mpnt->vm_file)
get_file(mpnt->vm_file);
if (mpnt->vm_ops && mpnt->vm_ops->open)
@@ -707,7 +707,7 @@ int do_munmap(unsigned long addr, size_t len)
return 0;
}
-asmlinkage int sys_munmap(unsigned long addr, size_t len)
+asmlinkage long sys_munmap(unsigned long addr, size_t len)
{
int ret;
@@ -778,7 +778,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
vma->vm_ops = NULL;
vma->vm_offset = 0;
vma->vm_file = NULL;
- vma->vm_pte = 0;
+ vma->vm_private_data = NULL;
/*
* merge_segments may merge our vma, so we can't refer to it
@@ -813,6 +813,7 @@ void exit_mmap(struct mm_struct * mm)
{
struct vm_area_struct * mpnt;
+ release_segments(mm);
mpnt = mm->mmap;
mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL;
mm->rss = 0;
@@ -919,7 +920,7 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l
/* To share, we must have the same file, operations.. */
if ((mpnt->vm_file != prev->vm_file)||
- (mpnt->vm_pte != prev->vm_pte) ||
+ (mpnt->vm_private_data != prev->vm_private_data) ||
(mpnt->vm_ops != prev->vm_ops) ||
(mpnt->vm_flags != prev->vm_flags) ||
(prev->vm_end != mpnt->vm_start))
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b1504af83..61ef3116d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -194,7 +194,7 @@ static int mprotect_fixup(struct vm_area_struct * vma,
return 0;
}
-asmlinkage int sys_mprotect(unsigned long start, size_t len, unsigned long prot)
+asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot)
{
unsigned long nstart, end, tmp;
struct vm_area_struct * vma, * next;
diff --git a/mm/mremap.c b/mm/mremap.c
index 2852f9b06..95f4b4f90 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -118,7 +118,7 @@ oops_we_failed:
flush_cache_range(mm, new_addr, new_addr + len);
while ((offset += PAGE_SIZE) < len)
move_one_page(mm, new_addr + offset, old_addr + offset);
- zap_page_range(mm, new_addr, new_addr + len);
+ zap_page_range(mm, new_addr, len);
flush_tlb_range(mm, new_addr, new_addr + len);
return -1;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 22ce7ac00..b62783c72 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3,6 +3,7 @@
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
* Swap reorganised 29.12.95, Stephen Tweedie
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
*/
#include <linux/config.h>
@@ -13,6 +14,7 @@
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/pagemap.h>
+#include <linux/bigmem.h> /* export bigmem vars */
#include <asm/dma.h>
#include <asm/uaccess.h> /* for copy_to/from_user */
@@ -20,6 +22,8 @@
int nr_swap_pages = 0;
int nr_free_pages = 0;
+int nr_lru_pages;
+LIST_HEAD(lru_cache);
/*
* Free area management
@@ -45,7 +49,12 @@ struct free_area_struct {
#define memory_head(x) ((struct page *)(x))
+#ifdef CONFIG_BIGMEM
+#define BIGMEM_LISTS_OFFSET NR_MEM_LISTS
+static struct free_area_struct free_area[NR_MEM_LISTS*2];
+#else
static struct free_area_struct free_area[NR_MEM_LISTS];
+#endif
static inline void init_mem_queue(struct free_area_struct * head)
{
@@ -101,6 +110,12 @@ static inline void free_pages_ok(unsigned long map_nr, unsigned long order)
#define list(x) (mem_map+(x))
+#ifdef CONFIG_BIGMEM
+ if (map_nr >= bigmem_mapnr) {
+ area += BIGMEM_LISTS_OFFSET;
+ nr_free_bigpages -= mask;
+ }
+#endif
map_nr &= mask;
nr_free_pages -= mask;
while (mask + (1 << (NR_MEM_LISTS-1))) {
@@ -127,7 +142,6 @@ int __free_page(struct page *page)
if (PageLocked(page))
PAGE_BUG(page);
- page->flags &= ~(1 << PG_referenced);
free_pages_ok(page - mem_map, 0);
return 1;
}
@@ -145,7 +159,6 @@ int free_pages(unsigned long addr, unsigned long order)
PAGE_BUG(map);
if (PageLocked(map))
PAGE_BUG(map);
- map->flags &= ~(1 << PG_referenced);
free_pages_ok(map_nr, order);
return 1;
}
@@ -160,6 +173,29 @@ int free_pages(unsigned long addr, unsigned long order)
change_bit((index) >> (1+(order)), (area)->map)
#define CAN_DMA(x) (PageDMA(x))
#define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT))
+
+#ifdef CONFIG_BIGMEM
+#define RMQUEUEBIG(order, gfp_mask) \
+if (gfp_mask & __GFP_BIGMEM) { \
+ struct free_area_struct * area = free_area+order+BIGMEM_LISTS_OFFSET; \
+ unsigned long new_order = order; \
+ do { struct page *prev = memory_head(area), *ret = prev->next; \
+ if (memory_head(area) != ret) { \
+ unsigned long map_nr; \
+ (prev->next = ret->next)->prev = prev; \
+ map_nr = ret - mem_map; \
+ MARK_USED(map_nr, new_order, area); \
+ nr_free_pages -= 1 << order; \
+ nr_free_bigpages -= 1 << order; \
+ EXPAND(ret, map_nr, order, new_order, area); \
+ spin_unlock_irqrestore(&page_alloc_lock, flags); \
+ return ADDRESS(map_nr); \
+ } \
+ new_order++; area++; \
+ } while (new_order < NR_MEM_LISTS); \
+}
+#endif
+
#define RMQUEUE(order, gfp_mask) \
do { struct free_area_struct * area = free_area+order; \
unsigned long new_order = order; \
@@ -194,8 +230,6 @@ do { unsigned long size = 1 << high; \
set_page_count(map, 1); \
} while (0)
-int low_on_memory = 0;
-
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
unsigned long flags;
@@ -221,7 +255,9 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order)
*/
if (!(current->flags & PF_MEMALLOC)) {
int freed;
+ static int low_on_memory = 0;
+#ifndef CONFIG_BIGMEM
if (nr_free_pages > freepages.min) {
if (!low_on_memory)
goto ok_to_allocate;
@@ -232,6 +268,32 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order)
}
low_on_memory = 1;
+#else
+ static int low_on_bigmemory = 0;
+
+ if (gfp_mask & __GFP_BIGMEM)
+ {
+ if (nr_free_pages > freepages.min) {
+ if (!low_on_bigmemory)
+ goto ok_to_allocate;
+ if (nr_free_pages >= freepages.high) {
+ low_on_bigmemory = 0;
+ goto ok_to_allocate;
+ }
+ }
+ low_on_bigmemory = 1;
+ } else {
+ if (nr_free_pages-nr_free_bigpages > freepages.min) {
+ if (!low_on_memory)
+ goto ok_to_allocate;
+ if (nr_free_pages-nr_free_bigpages >= freepages.high) {
+ low_on_memory = 0;
+ goto ok_to_allocate;
+ }
+ }
+ low_on_memory = 1;
+ }
+#endif
current->flags |= PF_MEMALLOC;
freed = try_to_free_pages(gfp_mask);
current->flags &= ~PF_MEMALLOC;
@@ -241,6 +303,9 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order)
}
ok_to_allocate:
spin_lock_irqsave(&page_alloc_lock, flags);
+#ifdef CONFIG_BIGMEM
+ RMQUEUEBIG(order, gfp_mask);
+#endif
RMQUEUE(order, gfp_mask);
spin_unlock_irqrestore(&page_alloc_lock, flags);
@@ -268,9 +333,12 @@ void show_free_areas(void)
unsigned long order, flags;
unsigned long total = 0;
- printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
- printk("Free: %d (%d %d %d)\n",
+ printk("Free pages: %6dkB (%6dkB BigMem)\n ( ",
+ nr_free_pages<<(PAGE_SHIFT-10),
+ nr_free_bigpages<<(PAGE_SHIFT-10));
+ printk("Free: %d, lru_cache: %d (%d %d %d)\n",
nr_free_pages,
+ nr_lru_pages,
freepages.min,
freepages.low,
freepages.high);
@@ -281,6 +349,13 @@ void show_free_areas(void)
for (tmp = free_area[order].next ; tmp != memory_head(free_area+order) ; tmp = tmp->next) {
nr ++;
}
+#ifdef CONFIG_BIGMEM
+ for (tmp = free_area[BIGMEM_LISTS_OFFSET+order].next;
+ tmp != memory_head(free_area+BIGMEM_LISTS_OFFSET+order);
+ tmp = tmp->next) {
+ nr ++;
+ }
+#endif
total += nr * ((PAGE_SIZE>>10) << order);
printk("%lu*%lukB ", nr, (unsigned long)((PAGE_SIZE>>10) << order));
}
@@ -334,6 +409,9 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m
for (i = 0 ; i < NR_MEM_LISTS ; i++) {
unsigned long bitmap_size;
init_mem_queue(free_area+i);
+#ifdef CONFIG_BIGMEM
+ init_mem_queue(free_area+BIGMEM_LISTS_OFFSET+i);
+#endif
mask += mask;
end_mem = (end_mem + ~mask) & mask;
bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
@@ -342,6 +420,11 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m
free_area[i].map = (unsigned int *) start_mem;
memset((void *) start_mem, 0, bitmap_size);
start_mem += bitmap_size;
+#ifdef CONFIG_BIGMEM
+ free_area[BIGMEM_LISTS_OFFSET+i].map = (unsigned int *) start_mem;
+ memset((void *) start_mem, 0, bitmap_size);
+ start_mem += bitmap_size;
+#endif
}
return start_mem;
}
diff --git a/mm/page_io.c b/mm/page_io.c
index 0f7e6d199..72e8cb95a 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,8 +18,6 @@
#include <asm/pgtable.h>
-static DECLARE_WAIT_QUEUE_HEAD(lock_queue);
-
/*
* Reads or writes a swap page.
* wait=1: start I/O and wait for completion. wait=0: start asynchronous I/O.
@@ -35,7 +33,7 @@ static DECLARE_WAIT_QUEUE_HEAD(lock_queue);
* that shared pages stay shared while being swapped.
*/
-static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, int wait, int dolock)
+static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, int wait)
{
unsigned long type, offset;
struct swap_info_struct * p;
@@ -90,7 +88,6 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in
} else
kstat.pswpout++;
- get_page(page);
if (p->swap_device) {
zones[0] = offset;
zones_used = 1;
@@ -99,58 +96,26 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in
} else if (p->swap_file) {
struct inode *swapf = p->swap_file->d_inode;
int i;
- if (swapf->i_op->get_block == NULL
- && swapf->i_op->smap != NULL){
- /*
- With MS-DOS, we use msdos_smap which returns
- a sector number (not a cluster or block number).
- It is a patch to enable the UMSDOS project.
- Other people are working on better solution.
-
- It sounds like ll_rw_swap_file defined
- its operation size (sector size) based on
- PAGE_SIZE and the number of blocks to read.
- So using get_block or smap should work even if
- smap will require more blocks.
- */
- int j;
- unsigned int block = offset << 3;
-
- for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){
- if (!(zones[i] = swapf->i_op->smap(swapf,block++))) {
- printk("rw_swap_page: bad swap file\n");
- return;
- }
+ int j;
+ unsigned int block = offset
+ << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits);
+
+ block_size = swapf->i_sb->s_blocksize;
+ for (i=0, j=0; j< PAGE_SIZE ; i++, j += block_size)
+ if (!(zones[i] = bmap(swapf,block++))) {
+ printk("rw_swap_page: bad swap file\n");
+ return;
}
- block_size = 512;
- }else{
- int j;
- unsigned int block = offset
- << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits);
-
- block_size = swapf->i_sb->s_blocksize;
- for (i=0, j=0; j< PAGE_SIZE ; i++, j += block_size)
- if (!(zones[i] = bmap(swapf,block++))) {
- printk("rw_swap_page: bad swap file\n");
- return;
- }
- zones_used = i;
- dev = swapf->i_dev;
- }
+ zones_used = i;
+ dev = swapf->i_dev;
} else {
printk(KERN_ERR "rw_swap_page: no swap file or device\n");
- put_page(page);
return;
}
if (!wait) {
set_bit(PG_decr_after, &page->flags);
atomic_inc(&nr_async_pages);
}
- if (dolock) {
- set_bit(PG_free_swap_after, &page->flags);
- p->swap_map[offset]++;
- }
- set_bit(PG_free_after, &page->flags);
/* block_size == PAGE_SIZE/zones_used */
brw_page(rw, page, dev, zones, block_size, 0);
@@ -192,29 +157,10 @@ void rw_swap_page(int rw, struct page *page, int wait)
PAGE_BUG(page);
if (page->inode != &swapper_inode)
PAGE_BUG(page);
- rw_swap_page_base(rw, entry, page, wait, 1);
-}
-
-/*
- * Setting up a new swap file needs a simple wrapper just to read the
- * swap signature. SysV shared memory also needs a simple wrapper.
- */
-void rw_swap_page_nocache(int rw, unsigned long entry, char *buf)
-{
- struct page *page = mem_map + MAP_NR(buf);
-
- if (TryLockPage(page))
- PAGE_BUG(page);
- if (PageSwapCache(page))
- PAGE_BUG(page);
- if (page->inode)
- PAGE_BUG(page);
- page->offset = entry;
- rw_swap_page_base(rw, entry, page, 1, 1);
+ rw_swap_page_base(rw, entry, page, wait);
}
/*
- * shmfs needs a version that doesn't put the page in the page cache!
* The swap lock map insists that pages be in the page cache!
* Therefore we can't use it. Later when we can remove the need for the
* lock map and we can reduce the number of functions exported.
@@ -227,5 +173,5 @@ void rw_swap_page_nolock(int rw, unsigned long entry, char *buf, int wait)
PAGE_BUG(page);
if (PageSwapCache(page))
PAGE_BUG(page);
- rw_swap_page_base(rw, entry, page, wait, 0);
+ rw_swap_page_base(rw, entry, page, wait);
}
diff --git a/mm/slab.c b/mm/slab.c
index ef7ec9279..0350f3370 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3,6 +3,8 @@
* Written by Mark Hemment, 1996/97.
* (markhe@nextd.demon.co.uk)
*
+ * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
+ *
* 11 April '97. Started multi-threading - markhe
* The global cache-chain is protected by the semaphore 'cache_chain_sem'.
* The sem is only needed when accessing/extending the cache-chain, which
@@ -100,16 +102,10 @@
* is less than 512 (PAGE_SIZE<<3), but greater than 256.
*/
-#include <linux/mm.h>
+#include <linux/config.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
-#include <linux/config.h>
#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/system.h>
-#include <asm/atomic.h>
-#include <asm/spinlock.h>
#ifdef __mips__
#include <asm/pgtable.h>
#include <asm/addrspace.h>
@@ -989,6 +985,58 @@ opps:
return cachep;
}
+/*
+ * This check if the kmem_cache_t pointer is chained in the cache_cache
+ * list. -arca
+ */
+static int is_chained_kmem_cache(kmem_cache_t * cachep)
+{
+ kmem_cache_t * searchp;
+ int ret = 0;
+
+ /* Find the cache in the chain of caches. */
+ down(&cache_chain_sem);
+ for (searchp = &cache_cache; searchp->c_nextp != &cache_cache;
+ searchp = searchp->c_nextp) {
+ if (searchp->c_nextp != cachep)
+ continue;
+
+ /* Accessing clock_searchp is safe - we hold the mutex. */
+ if (cachep == clock_searchp)
+ clock_searchp = cachep->c_nextp;
+ ret = 1;
+ break;
+ }
+ up(&cache_chain_sem);
+
+ return ret;
+}
+
+/* returns 0 if every slab is been freed -arca */
+static int __kmem_cache_shrink(kmem_cache_t *cachep)
+{
+ kmem_slab_t *slabp;
+ int ret;
+
+ spin_lock_irq(&cachep->c_spinlock);
+
+ /* If the cache is growing, stop shrinking. */
+ while (!cachep->c_growing) {
+ slabp = cachep->c_lastp;
+ if (slabp->s_inuse || slabp == kmem_slab_end(cachep))
+ break;
+ kmem_slab_unlink(slabp);
+ spin_unlock_irq(&cachep->c_spinlock);
+ kmem_slab_destroy(cachep, slabp);
+ spin_lock_irq(&cachep->c_spinlock);
+ }
+ ret = 1;
+ if (cachep->c_lastp == kmem_slab_end(cachep))
+ ret = 0; /* Cache is empty. */
+ spin_unlock_irq(&cachep->c_spinlock);
+ return ret;
+}
+
/* Shrink a cache. Releases as many slabs as possible for a cache.
* It is expected this function will be called by a module when it is
* unloaded. The cache is _not_ removed, this creates too many problems and
@@ -1000,10 +1048,6 @@ opps:
int
kmem_cache_shrink(kmem_cache_t *cachep)
{
- kmem_cache_t *searchp;
- kmem_slab_t *slabp;
- int ret;
-
if (!cachep) {
printk(KERN_ERR "kmem_shrink: NULL ptr\n");
return 2;
@@ -1013,43 +1057,73 @@ kmem_cache_shrink(kmem_cache_t *cachep)
return 2;
}
+ if (!is_chained_kmem_cache(cachep)) {
+ printk(KERN_ERR "kmem_shrink: Invalid cache addr %p\n",
+ cachep);
+ return 2;
+ }
+
+ return __kmem_cache_shrink(cachep);
+}
+
+/*
+ * Remove a kmem_cache_t object from the slab cache. When returns 0 it
+ * completed succesfully. -arca
+ */
+int kmem_cache_destroy(kmem_cache_t * cachep)
+{
+ kmem_cache_t * prev;
+ int ret;
+
+ if (!cachep) {
+ printk(KERN_ERR "kmem_destroy: NULL ptr\n");
+ return 1;
+ }
+ if (in_interrupt()) {
+ printk(KERN_ERR "kmem_destroy: Called during int - %s\n",
+ cachep->c_name);
+ return 1;
+ }
+
+ ret = 0;
/* Find the cache in the chain of caches. */
- down(&cache_chain_sem); /* Semaphore is needed. */
- searchp = &cache_cache;
- for (;searchp->c_nextp != &cache_cache; searchp = searchp->c_nextp) {
- if (searchp->c_nextp != cachep)
+ down(&cache_chain_sem);
+ for (prev = &cache_cache; prev->c_nextp != &cache_cache;
+ prev = prev->c_nextp) {
+ if (prev->c_nextp != cachep)
continue;
/* Accessing clock_searchp is safe - we hold the mutex. */
if (cachep == clock_searchp)
clock_searchp = cachep->c_nextp;
- goto found;
+
+ /* remove the cachep from the cache_cache list. -arca */
+ prev->c_nextp = cachep->c_nextp;
+
+ ret = 1;
+ break;
}
up(&cache_chain_sem);
- printk(KERN_ERR "kmem_shrink: Invalid cache addr %p\n", cachep);
- return 2;
-found:
- /* Release the semaphore before getting the cache-lock. This could
- * mean multiple engines are shrinking the cache, but so what.
- */
- up(&cache_chain_sem);
- spin_lock_irq(&cachep->c_spinlock);
- /* If the cache is growing, stop shrinking. */
- while (!cachep->c_growing) {
- slabp = cachep->c_lastp;
- if (slabp->s_inuse || slabp == kmem_slab_end(cachep))
- break;
- kmem_slab_unlink(slabp);
- spin_unlock_irq(&cachep->c_spinlock);
- kmem_slab_destroy(cachep, slabp);
- spin_lock_irq(&cachep->c_spinlock);
+ if (!ret) {
+ printk(KERN_ERR "kmem_destroy: Invalid cache addr %p\n",
+ cachep);
+ return 1;
}
- ret = 1;
- if (cachep->c_lastp == kmem_slab_end(cachep))
- ret--; /* Cache is empty. */
- spin_unlock_irq(&cachep->c_spinlock);
- return ret;
+
+ if (__kmem_cache_shrink(cachep)) {
+ printk(KERN_ERR "kmem_destroy: Can't free all objects %p\n",
+ cachep);
+ down(&cache_chain_sem);
+ cachep->c_nextp = cache_cache.c_nextp;
+ cache_cache.c_nextp = cachep;
+ up(&cache_chain_sem);
+ return 1;
+ }
+
+ kmem_cache_free(&cache_cache, cachep);
+
+ return 0;
}
/* Get the memory for a slab management obj. */
@@ -1587,7 +1661,7 @@ bad_slab:
#if 1
/* FORCE A KERNEL DUMP WHEN THIS HAPPENS. SPEAK IN ALL CAPS. GET THE CALL CHAIN. */
-*(int *) 0 = 0;
+ BUG();
#endif
return;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2aa17d3a4..5cfc686dd 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -214,8 +214,6 @@ static inline void remove_from_swap_cache(struct page *page)
page_address(page), page_count(page));
#endif
PageClearSwapCache(page);
- if (inode->i_op->flushpage)
- inode->i_op->flushpage(inode, page, 0);
remove_inode_page(page);
}
@@ -239,6 +237,15 @@ void __delete_from_swap_cache(struct page *page)
swap_free (entry);
}
+static void delete_from_swap_cache_nolock(struct page *page)
+{
+ if (!swapper_inode.i_op->flushpage ||
+ swapper_inode.i_op->flushpage(&swapper_inode, page, 0))
+ lru_cache_del(page);
+
+ __delete_from_swap_cache(page);
+}
+
/*
* This must be called only on pages that have
* been verified to be in the swap cache.
@@ -247,7 +254,7 @@ void delete_from_swap_cache(struct page *page)
{
lock_page(page);
- __delete_from_swap_cache(page);
+ delete_from_swap_cache_nolock(page);
UnlockPage(page);
page_cache_release(page);
@@ -267,13 +274,13 @@ void free_page_and_swap_cache(unsigned long addr)
*/
lock_page(page);
if (PageSwapCache(page) && !is_page_shared(page)) {
- long entry = page->offset;
- remove_from_swap_cache(page);
- swap_free(entry);
+ delete_from_swap_cache_nolock(page);
page_cache_release(page);
}
UnlockPage(page);
+ clear_bit(PG_swap_entry, &page->flags);
+
__free_page(page);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ce18f34f5..c4ce5377d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -46,16 +46,36 @@ static inline int scan_swap_map(struct swap_info_struct *si)
}
}
si->cluster_nr = SWAPFILE_CLUSTER;
+
+ /* try to find an empty (even not aligned) cluster. */
+ offset = si->lowest_bit;
+ check_next_cluster:
+ if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
+ {
+ int nr;
+ for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
+ if (si->swap_map[nr])
+ {
+ offset = nr+1;
+ goto check_next_cluster;
+ }
+ /* We found a completly empty cluster, so start
+ * using it.
+ */
+ goto got_page;
+ }
+ /* No luck, so now go finegrined as usual. -Andrea */
for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
if (si->swap_map[offset])
continue;
- si->lowest_bit = offset;
-got_page:
- si->swap_map[offset] = 1;
- nr_swap_pages--;
+ got_page:
+ if (offset == si->lowest_bit)
+ si->lowest_bit++;
if (offset == si->highest_bit)
si->highest_bit--;
- si->cluster_next = offset;
+ si->swap_map[offset] = 1;
+ nr_swap_pages--;
+ si->cluster_next = offset+1;
return offset;
}
return 0;
@@ -81,12 +101,9 @@ unsigned long get_swap_page(void)
entry = SWP_ENTRY(type,offset);
type = swap_info[type].next;
if (type < 0 ||
- p->prio != swap_info[type].prio)
- {
+ p->prio != swap_info[type].prio) {
swap_list.next = swap_list.head;
- }
- else
- {
+ } else {
swap_list.next = type;
}
return entry;
@@ -126,15 +143,16 @@ void swap_free(unsigned long entry)
offset = SWP_OFFSET(entry);
if (offset >= p->max)
goto bad_offset;
- if (offset < p->lowest_bit)
- p->lowest_bit = offset;
- if (offset > p->highest_bit)
- p->highest_bit = offset;
if (!p->swap_map[offset])
goto bad_free;
if (p->swap_map[offset] < SWAP_MAP_MAX) {
- if (!--p->swap_map[offset])
+ if (!--p->swap_map[offset]) {
+ if (offset < p->lowest_bit)
+ p->lowest_bit = offset;
+ if (offset > p->highest_bit)
+ p->highest_bit = offset;
nr_swap_pages++;
+ }
}
#ifdef DEBUG_SWAP
printk("DebugVM: swap_free(entry %08lx, count now %d)\n",
@@ -157,6 +175,44 @@ bad_free:
goto out;
}
+/* needs the big kernel lock */
+unsigned long acquire_swap_entry(struct page *page)
+{
+ struct swap_info_struct * p;
+ unsigned long offset, type;
+ unsigned long entry;
+
+ if (!test_bit(PG_swap_entry, &page->flags))
+ goto new_swap_entry;
+
+ /* We have the old entry in the page offset still */
+ entry = page->offset;
+ if (!entry)
+ goto new_swap_entry;
+ type = SWP_TYPE(entry);
+ if (type & SHM_SWP_TYPE)
+ goto new_swap_entry;
+ if (type >= nr_swapfiles)
+ goto new_swap_entry;
+ p = type + swap_info;
+ if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
+ goto new_swap_entry;
+ offset = SWP_OFFSET(entry);
+ if (offset >= p->max)
+ goto new_swap_entry;
+ /* Has it been re-used for something else? */
+ if (p->swap_map[offset])
+ goto new_swap_entry;
+
+ /* We're cool, we can just use the old one */
+ p->swap_map[offset] = 1;
+ nr_swap_pages--;
+ return entry;
+
+new_swap_entry:
+ return get_swap_page();
+}
+
/*
* The swap entry has been read in advance, and we return 1 to indicate
* that the page has been used or is no longer needed.
@@ -266,7 +322,7 @@ static void unuse_process(struct mm_struct * mm, unsigned long entry,
/*
* Go through process' page directory.
*/
- if (!mm || mm == &init_mm)
+ if (!mm)
return;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
pgd_t * pgd = pgd_offset(mm, vma->vm_start);
@@ -340,7 +396,7 @@ static int try_to_unuse(unsigned int type)
return 0;
}
-asmlinkage int sys_swapoff(const char * specialfile)
+asmlinkage long sys_swapoff(const char * specialfile)
{
struct swap_info_struct * p = NULL;
struct dentry * dentry;
@@ -484,7 +540,7 @@ int is_swap_partition(kdev_t dev) {
*
* The swapon system call
*/
-asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
+asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
{
struct swap_info_struct * p;
struct dentry * swap_dentry;
@@ -495,7 +551,6 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
static int least_priority = 0;
union swap_header *swap_header = 0;
int swap_header_version;
- int lock_map_size = PAGE_SIZE;
int nr_good_pages = 0;
unsigned long maxpages;
int swapfilesize;
@@ -661,8 +716,9 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
else
p->swap_map[page] = SWAP_MAP_BAD;
}
- nr_good_pages = swap_header->info.last_page - i;
- lock_map_size = (p->max + 7) / 8;
+ nr_good_pages = swap_header->info.last_page -
+ swap_header->info.nr_badpages -
+ 1 /* header page */;
if (error)
goto bad_swap;
}
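
As a worked example of the corrected accounting above (numbers invented for illustration): a version-2 swap header reporting last_page = 10000 and nr_badpages = 12 now yields nr_good_pages = 10000 - 12 - 1 = 9987, the final 1 being the header page itself, instead of deriving the count from the loop index as the old "last_page - i" expression did.
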
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a4eeb1dc5..9bd4142c3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2,6 +2,7 @@
* linux/mm/vmalloc.c
*
* Copyright (C) 1993 Linus Torvalds
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
*/
#include <linux/malloc.h>
@@ -94,7 +95,7 @@ static inline int alloc_area_pte(pte_t * pte, unsigned long address, unsigned lo
unsigned long page;
if (!pte_none(*pte))
printk("alloc_area_pte: page already exists\n");
- page = __get_free_page(GFP_KERNEL);
+ page = __get_free_page(GFP_KERNEL|GFP_BIGMEM);
if (!page)
return -ENOMEM;
set_pte(pte, mk_pte(page, prot));
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1ae052b94..8ee000fc0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -17,6 +17,7 @@
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
+#include <linux/bigmem.h>
#include <asm/pgtable.h>
@@ -31,8 +32,7 @@
* using a process that no longer actually exists (it might
* have died while we slept).
*/
-static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
- unsigned long address, pte_t * page_table, int gfp_mask)
+static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
pte_t pte;
unsigned long entry;
@@ -47,15 +47,12 @@ static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
goto out_failed;
page = mem_map + MAP_NR(page_addr);
- spin_lock(&tsk->mm->page_table_lock);
+ spin_lock(&vma->vm_mm->page_table_lock);
if (pte_val(pte) != pte_val(*page_table))
goto out_failed_unlock;
- /*
- * Dont be too eager to get aging right if
- * memory is dangerously low.
- */
- if (!low_on_memory && pte_young(pte)) {
+ /* Don't look at this pte if it's been accessed recently. */
+ if (pte_young(pte)) {
/*
* Transfer the "accessed" bit from the page
* tables to the global page map.
@@ -67,7 +64,8 @@ static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
if (PageReserved(page)
|| PageLocked(page)
- || ((gfp_mask & __GFP_DMA) && !PageDMA(page)))
+ || ((gfp_mask & __GFP_DMA) && !PageDMA(page))
+ || (!(gfp_mask & __GFP_BIGMEM) && PageBIGMEM(page)))
goto out_failed_unlock;
/*
@@ -136,15 +134,16 @@ drop_pte:
*/
flush_cache_page(vma, address);
if (vma->vm_ops && vma->vm_ops->swapout) {
- pid_t pid = tsk->pid;
+ int error;
pte_clear(page_table);
- spin_unlock(&tsk->mm->page_table_lock);
+ spin_unlock(&vma->vm_mm->page_table_lock);
flush_tlb_page(vma, address);
vma->vm_mm->rss--;
-
- if (vma->vm_ops->swapout(vma, page))
- kill_proc(pid, SIGBUS, 1);
- goto out_free_success;
+ error = vma->vm_ops->swapout(vma, page);
+ if (!error)
+ goto out_free_success;
+ __free_page(page);
+ return error;
}
/*
@@ -153,14 +152,16 @@ drop_pte:
* we have the swap cache set up to associate the
* page with that swap entry.
*/
- entry = get_swap_page();
+ entry = acquire_swap_entry(page);
if (!entry)
- goto out_failed; /* No swap space left */
+ goto out_failed_unlock; /* No swap space left */
+ if (!(page = prepare_bigmem_swapout(page)))
+ goto out_swap_free_unlock;
+
vma->vm_mm->rss--;
- tsk->nswap++;
set_pte(page_table, __pte(entry));
- spin_unlock(&tsk->mm->page_table_lock);
+ spin_unlock(&vma->vm_mm->page_table_lock);
flush_tlb_page(vma, address);
swap_duplicate(entry); /* One for the process, one for the swap cache */
@@ -175,9 +176,14 @@ out_free_success:
__free_page(page);
return 1;
out_failed_unlock:
- spin_unlock(&tsk->mm->page_table_lock);
+ spin_unlock(&vma->vm_mm->page_table_lock);
out_failed:
return 0;
+out_swap_free_unlock:
+ swap_free(entry);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return 0;
+
}
/*
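
The reworked tail of try_to_swap_out() above now allocates the swap entry while the page-table lock is still held and hands it back with swap_free() if the BIGMEM page cannot be bounced into a regular page, rather than leaking the entry. A small userspace sketch of that acquire-then-undo-on-failure ordering, with made-up helpers in place of the kernel primitives:

#include <stdio.h>
#include <stdlib.h>

/* Made-up stand-ins: a slot allocator and a bounce copy that can fail. */
static int alloc_slot(void)             { return 42; }
static void free_slot(int slot)         { printf("slot %d given back\n", slot); }
static void *bounce_copy(int fail)      { return fail ? NULL : malloc(16); }

/* Mirrors the new ordering at the tail of try_to_swap_out(): take the
 * swap slot first, then attempt the bounce copy, and undo the slot
 * allocation if the copy cannot be made (like swap_free(entry) on the
 * out_swap_free_unlock path). */
static int swap_one(int make_copy_fail)
{
        int slot = alloc_slot();
        void *copy = bounce_copy(make_copy_fail);

        if (!copy) {
                free_slot(slot);
                return 0;               /* nothing swapped out */
        }
        free(copy);
        return 1;                       /* page went to 'slot' */
}

int main(void)
{
        printf("success path -> %d\n", swap_one(0));
        printf("failure path -> %d\n", swap_one(1));
        return 0;
}
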
@@ -194,8 +200,7 @@ out_failed:
* (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
*/
-static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
- pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
pte_t * pte;
unsigned long pmd_end;
@@ -216,8 +221,8 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct *
do {
int result;
- tsk->mm->swap_address = address + PAGE_SIZE;
- result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
+ vma->vm_mm->swap_address = address + PAGE_SIZE;
+ result = try_to_swap_out(vma, address, pte, gfp_mask);
if (result)
return result;
address += PAGE_SIZE;
@@ -226,8 +231,7 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct *
return 0;
}
-static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
- pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
pmd_t * pmd;
unsigned long pgd_end;
@@ -247,7 +251,7 @@ static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct *
end = pgd_end;
do {
- int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
+ int result = swap_out_pmd(vma, pmd, address, end, gfp_mask);
if (result)
return result;
address = (address + PMD_SIZE) & PMD_MASK;
@@ -256,8 +260,7 @@ static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct *
return 0;
}
-static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
- unsigned long address, int gfp_mask)
+static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
pgd_t *pgdir;
unsigned long end;
@@ -266,11 +269,11 @@ static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
if (vma->vm_flags & VM_LOCKED)
return 0;
- pgdir = pgd_offset(tsk->mm, address);
+ pgdir = pgd_offset(vma->vm_mm, address);
end = vma->vm_end;
while (address < end) {
- int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
+ int result = swap_out_pgd(vma, pgdir, address, end, gfp_mask);
if (result)
return result;
address = (address + PGDIR_SIZE) & PGDIR_MASK;
@@ -279,7 +282,7 @@ static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
return 0;
}
-static int swap_out_process(struct task_struct * p, int gfp_mask)
+static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
{
unsigned long address;
struct vm_area_struct* vma;
@@ -287,18 +290,18 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
/*
* Go through process' page directory.
*/
- address = p->mm->swap_address;
+ address = mm->swap_address;
/*
* Find the proper vm-area
*/
- vma = find_vma(p->mm, address);
+ vma = find_vma(mm, address);
if (vma) {
if (address < vma->vm_start)
address = vma->vm_start;
for (;;) {
- int result = swap_out_vma(p, vma, address, gfp_mask);
+ int result = swap_out_vma(vma, address, gfp_mask);
if (result)
return result;
vma = vma->vm_next;
@@ -309,8 +312,8 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
}
/* We didn't find anything for the process */
- p->mm->swap_cnt = 0;
- p->mm->swap_address = 0;
+ mm->swap_cnt = 0;
+ mm->swap_address = 0;
return 0;
}
@@ -321,9 +324,11 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
*/
static int swap_out(unsigned int priority, int gfp_mask)
{
- struct task_struct * p, * pbest;
- int counter, assign, max_cnt;
+ struct task_struct * p;
+ int counter;
+ int __ret = 0;
+ lock_kernel();
/*
* We make one or two passes through the task list, indexed by
* assign = {0, 1}:
@@ -338,46 +343,61 @@ static int swap_out(unsigned int priority, int gfp_mask)
* Think of swap_cnt as a "shadow rss" - it tells us which process
* we want to page out (always try largest first).
*/
- counter = nr_tasks / (priority+1);
+ counter = nr_threads / (priority+1);
if (counter < 1)
counter = 1;
- if (counter > nr_tasks)
- counter = nr_tasks;
+ if (counter > nr_threads)
+ counter = nr_threads;
for (; counter >= 0; counter--) {
- assign = 0;
- max_cnt = 0;
- pbest = NULL;
+ int assign = 0;
+ int max_cnt = 0;
+ struct mm_struct *best = NULL;
+ int pid = 0;
select:
read_lock(&tasklist_lock);
p = init_task.next_task;
for (; p != &init_task; p = p->next_task) {
- if (!p->swappable)
+ struct mm_struct *mm = p->mm;
+ if (!p->swappable || !mm)
continue;
- if (p->mm->rss <= 0)
+ if (mm->rss <= 0)
continue;
/* Refresh swap_cnt? */
if (assign)
- p->mm->swap_cnt = p->mm->rss;
- if (p->mm->swap_cnt > max_cnt) {
- max_cnt = p->mm->swap_cnt;
- pbest = p;
+ mm->swap_cnt = mm->rss;
+ if (mm->swap_cnt > max_cnt) {
+ max_cnt = mm->swap_cnt;
+ best = mm;
+ pid = p->pid;
}
}
read_unlock(&tasklist_lock);
- if (!pbest) {
+ if (!best) {
if (!assign) {
assign = 1;
goto select;
}
goto out;
- }
+ } else {
+ int ret;
+
+ atomic_inc(&best->mm_count);
+ ret = swap_out_mm(best, gfp_mask);
+ mmdrop(best);
+
+ if (!ret)
+ continue;
- if (swap_out_process(pbest, gfp_mask))
- return 1;
+ if (ret < 0)
+ kill_proc(pid, SIGBUS, 1);
+ __ret = 1;
+ goto out;
+ }
}
out:
- return 0;
+ unlock_kernel();
+ return __ret;
}
/*
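
The rewritten swap_out() above picks an mm_struct rather than a task, pins it with atomic_inc(&best->mm_count) for the duration of swap_out_mm(), releases it with mmdrop(), and converts a negative return (a failed vma swapout()) into a SIGBUS for the remembered pid. A userspace sketch of that pin/work/drop-and-propagate shape (the struct, get_ref/drop_ref and do_work are stand-ins, not the kernel's types):

#include <stdio.h>

/* Illustrative stand-in for an mm_struct with a bare reference count. */
struct victim {
        int refcount;
        long rss;
};

static void get_ref(struct victim *v)   { v->refcount++; }
static void drop_ref(struct victim *v)  { v->refcount--; }

/* Stand-in for swap_out_mm(): >0 progress, 0 nothing to do, <0 hard error. */
static int do_work(struct victim *v)
{
        if (v->rss <= 0)
                return 0;
        v->rss--;
        return 1;
}

int main(void)
{
        struct victim victims[2] = { { 1, 0 }, { 1, 3 } };
        int i;

        for (i = 0; i < 2; i++) {
                struct victim *best = &victims[i];
                int ret;

                get_ref(best);          /* keep the victim alive while working on it */
                ret = do_work(best);
                drop_ref(best);         /* like mmdrop(best) */

                if (!ret)
                        continue;       /* nothing reclaimed, try the next candidate */
                if (ret < 0)
                        fprintf(stderr, "swapout failed; kernel would SIGBUS the owner\n");
                printf("made progress on victim %d\n", i);
                break;
        }
        return 0;
}
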
@@ -394,8 +414,6 @@ static int do_try_to_free_pages(unsigned int gfp_mask)
int priority;
int count = SWAP_CLUSTER_MAX;
- lock_kernel();
-
/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);
@@ -423,32 +441,10 @@ static int do_try_to_free_pages(unsigned int gfp_mask)
shrink_dcache_memory(priority, gfp_mask);
} while (--priority >= 0);
done:
- unlock_kernel();
return priority >= 0;
}
-/*
- * Before we start the kernel thread, print out the
- * kswapd initialization message (otherwise the init message
- * may be printed in the middle of another driver's init
- * message). It looks very bad when that happens.
- */
-void __init kswapd_setup(void)
-{
- int i;
- char *revision="$Revision: 1.5 $", *s, *e;
-
- swap_setup();
-
- if ((s = strchr(revision, ':')) &&
- (e = strchr(s, '$')))
- s++, i = e - s;
- else
- s = revision, i = -1;
- printk ("Starting kswapd v%.*s\n", i, s);
-}
-
static struct task_struct *kswapd_process;
/*
@@ -499,7 +495,9 @@ int kswapd(void *unused)
* up on a more timely basis.
*/
do {
- if (nr_free_pages >= freepages.high)
+ /* kswapd is critical for providing GFP_ATOMIC
+ allocations (not GFP_BIGMEM ones). */
+ if (nr_free_pages - nr_free_bigpages >= freepages.high)
break;
if (!do_try_to_free_pages(GFP_KSWAPD))
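
With the change above, kswapd keeps reclaiming until the pool of regular pages reaches freepages.high, because GFP_ATOMIC allocations cannot be served from BIGMEM and so only nr_free_pages - nr_free_bigpages counts toward that goal. A tiny sketch of the wakeup condition with invented numbers (field names are stand-ins for the kernel globals):

#include <stdio.h>

/* Illustrative stand-ins for the kernel's global free-page counters. */
struct counters {
        int nr_free_pages;
        int nr_free_bigpages;
        int freepages_high;
};

/* kswapd keeps working while the pool usable for GFP_ATOMIC allocations
 * (total free pages minus free BIGMEM pages) is below the high-water mark. */
static int keep_reclaiming(const struct counters *c)
{
        return c->nr_free_pages - c->nr_free_bigpages < c->freepages_high;
}

int main(void)
{
        struct counters c = { 300, 250, 128 };

        /* 300 pages look free, but only 50 of them can back GFP_ATOMIC */
        printf("keep reclaiming: %d\n", keep_reclaiming(&c));
        return 0;
}
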
@@ -535,4 +533,13 @@ int try_to_free_pages(unsigned int gfp_mask)
retval = do_try_to_free_pages(gfp_mask);
return retval;
}
-
+
+static int __init kswapd_init(void)
+{
+ printk("Starting kswapd v1.6\n");
+ swap_setup();
+ kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+ return 0;
+}
+
+module_init(kswapd_init)