author     Ralf Baechle <ralf@linux-mips.org>    1998-08-25 09:12:35 +0000
committer  Ralf Baechle <ralf@linux-mips.org>    1998-08-25 09:12:35 +0000
commit     c7fc24dc4420057f103afe8fc64524ebc25c5d37 (patch)
tree       3682407a599b8f9f03fc096298134cafba1c9b2f /mm
parent     1d793fade8b063fde3cf275bf1a5c2d381292cd9 (diff)
o Merge with Linux 2.1.116.
o New Newport console code.
o New G364 console code.
Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c      433
-rw-r--r--  mm/memory.c        90
-rw-r--r--  mm/mlock.c         17
-rw-r--r--  mm/mmap.c          96
-rw-r--r--  mm/mprotect.c      15
-rw-r--r--  mm/mremap.c        11
-rw-r--r--  mm/page_alloc.c   150
-rw-r--r--  mm/page_io.c       24
-rw-r--r--  mm/simp.c         435
-rw-r--r--  mm/slab.c          97
-rw-r--r--  mm/swap.c           8
-rw-r--r--  mm/swap_state.c    27
-rw-r--r--  mm/swapfile.c      10
-rw-r--r--  mm/vmalloc.c       39
-rw-r--r--  mm/vmscan.c       101
15 files changed, 680 insertions, 873 deletions
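
One visible addition in this merge is the sendfile(2) system call in mm/filemap.c (sys_sendfile(), built on do_generic_file_read() and file_send_actor()). The sketch below is illustrative only and not part of the commit: a minimal user-space caller, assuming a libc that exposes a sendfile() wrapper via <sys/sendfile.h>; the file arguments and error handling are invented for the example.

/* Illustrative user-space sketch of the new sendfile() call (not part of
 * this patch).  Assumes a libc sendfile() wrapper in <sys/sendfile.h>. */
#include <sys/sendfile.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int in_fd, out_fd;
	struct stat st;
	off_t offset = 0;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return 1;
	}
	in_fd = open(argv[1], O_RDONLY);
	out_fd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (in_fd < 0 || out_fd < 0 || fstat(in_fd, &st) < 0) {
		perror("open/fstat");
		return 1;
	}
	/* The kernel reads the source through the page cache and hands each
	 * chunk to file_send_actor(), which writes it to the output file;
	 * "offset" is updated to the position reached in the source. */
	if (sendfile(out_fd, in_fd, &offset, st.st_size) < 0) {
		perror("sendfile");
		return 1;
	}
	close(in_fd);
	close(out_fd);
	return 0;
}
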
diff --git a/mm/filemap.c b/mm/filemap.c
index 3f2632a15..d0bf1270f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -117,12 +117,100 @@ repeat:
}
}
+/*
+ * Remove a page from the page cache and free it.
+ */
+void remove_inode_page(struct page *page)
+{
+ remove_page_from_hash_queue(page);
+ remove_page_from_inode_queue(page);
+ __free_page(page);
+}
+
+/*
+ * Check whether we can free this page.
+ */
+static inline int shrink_one_page(struct page *page, int gfp_mask)
+{
+ struct buffer_head *tmp, *bh;
+
+ if (PageLocked(page))
+ goto next;
+ if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
+ goto next;
+ /* First of all, regenerate the page's referenced bit
+ * from any buffers in the page
+ */
+ bh = page->buffers;
+ if (bh) {
+ tmp = bh;
+ do {
+ if (buffer_touched(tmp)) {
+ clear_bit(BH_Touched, &tmp->b_state);
+ set_bit(PG_referenced, &page->flags);
+ }
+ tmp = tmp->b_this_page;
+ } while (tmp != bh);
+
+ /* Refuse to swap out all buffer pages */
+ if ((buffermem >> PAGE_SHIFT) * 100 < (buffer_mem.min_percent * num_physpages))
+ goto next;
+ }
+
+ /* We can't throw away shared pages, but we do mark
+ them as referenced. This relies on the fact that
+ no page is currently in both the page cache and the
+ buffer cache; we'd have to modify the following
+ test to allow for that case. */
+
+ switch (atomic_read(&page->count)) {
+ case 1:
+ /* is it a swap-cache or page-cache page? */
+ if (page->inode) {
+ if (test_and_clear_bit(PG_referenced, &page->flags)) {
+ touch_page(page);
+ break;
+ }
+ age_page(page);
+#if 0
+ if (page->age)
+ break;
+ if (page_cache_size * 100 < (page_cache.min_percent * num_physpages))
+ break;
+#endif
+ if (PageSwapCache(page)) {
+ delete_from_swap_cache(page);
+ return 1;
+ }
+ remove_inode_page(page);
+ return 1;
+ }
+ /* It's not a cache page, so we don't do aging.
+ * If it has been referenced recently, don't free it */
+ if (test_and_clear_bit(PG_referenced, &page->flags))
+ break;
+
+ /* is it a buffer cache page? */
+ if (bh && try_to_free_buffer(bh, &bh, 6))
+ return 1;
+ break;
+
+ default:
+ /* more than one user: we can't throw it away */
+ set_bit(PG_referenced, &page->flags);
+ /* fall through */
+ case 0:
+ /* nothing */
+ }
+next:
+ return 0;
+}
+
int shrink_mmap(int priority, int gfp_mask)
{
static unsigned long clock = 0;
- struct page * page;
unsigned long limit = num_physpages;
- struct buffer_head *tmp, *bh;
+ struct page * page;
int count_max, count_min;
count_max = (limit<<1) >> (priority>>1);
@@ -130,79 +218,20 @@ int shrink_mmap(int priority, int gfp_mask)
page = mem_map + clock;
do {
+ if (PageSkip(page)) {
+ /* next_hash is overloaded for PageSkip */
+ page = page->next_hash;
+ clock = page->map_nr;
+ }
+
+ if (shrink_one_page(page, gfp_mask))
+ return 1;
count_max--;
if (page->inode || page->buffers)
count_min--;
-
- if (PageLocked(page))
- goto next;
- if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
- goto next;
- /* First of all, regenerate the page's referenced bit
- from any buffers in the page */
- bh = page->buffers;
- if (bh) {
- tmp = bh;
- do {
- if (buffer_touched(tmp)) {
- clear_bit(BH_Touched, &tmp->b_state);
- set_bit(PG_referenced, &page->flags);
- }
- tmp = tmp->b_this_page;
- } while (tmp != bh);
-
- /* Refuse to swap out all buffer pages */
- if ((buffermem >> PAGE_SHIFT) * 100 < (buffer_mem.min_percent * num_physpages))
- goto next;
- }
-
- /* We can't throw away shared pages, but we do mark
- them as referenced. This relies on the fact that
- no page is currently in both the page cache and the
- buffer cache; we'd have to modify the following
- test to allow for that case. */
-
- switch (atomic_read(&page->count)) {
- case 1:
- /* is it a swap-cache or page-cache page? */
- if (page->inode) {
- if (test_and_clear_bit(PG_referenced, &page->flags)) {
- touch_page(page);
- break;
- }
- age_page(page);
- if (page->age || page_cache_size * 100 < (page_cache.min_percent * num_physpages))
- break;
- if (PageSwapCache(page)) {
- delete_from_swap_cache(page);
- return 1;
- }
- remove_page_from_hash_queue(page);
- remove_page_from_inode_queue(page);
- __free_page(page);
- return 1;
- }
- /* It's not a cache page, so we don't do aging.
- * If it has been referenced recently, don't free it */
- if (test_and_clear_bit(PG_referenced, &page->flags))
- break;
-
- /* is it a buffer cache page? */
- if ((gfp_mask & __GFP_IO) && bh && try_to_free_buffer(bh, &bh, 6))
- return 1;
- break;
-
- default:
- /* more than one users: we can't throw it away */
- set_bit(PG_referenced, &page->flags);
- /* fall through */
- case 0:
- /* nothing */
- }
-next:
page++;
clock++;
- if (clock >= limit) {
+ if (clock >= max_mapnr) {
clock = 0;
page = mem_map;
}
@@ -216,20 +245,17 @@ next:
* free it from the page hash-queues etc, as we don't want to keep it
* in-core unnecessarily.
*/
-unsigned long page_unuse(unsigned long page)
+unsigned long page_unuse(struct page * page)
{
- struct page * p = mem_map + MAP_NR(page);
- int count = atomic_read(&p->count);
+ int count = atomic_read(&page->count);
if (count != 2)
return count;
- if (!p->inode)
+ if (!page->inode)
return count;
- if (PageSwapCache(p))
+ if (PageSwapCache(page))
panic ("Doing a normal page_unuse of a swap cache page");
- remove_page_from_hash_queue(p);
- remove_page_from_inode_queue(p);
- free_page(page);
+ remove_inode_page(page);
return 1;
}
@@ -303,6 +329,7 @@ static unsigned long try_to_read_ahead(struct file * file,
*/
page = mem_map + MAP_NR(page_cache);
add_to_page_cache(page, inode, offset, hash);
+ set_bit(PG_referenced, &page->flags);
inode->i_op->readpage(file, page);
page_cache = 0;
}
@@ -568,6 +595,23 @@ static inline unsigned long generic_file_readahead(int reada_ok,
return page_cache;
}
+/*
+ * "descriptor" for what we're up to with a read.
+ * This allows us to use the same read code yet
+ * have multiple different users of the data that
+ * we read from a file.
+ *
+ * The simplest case just copies the data to user
+ * mode.
+ */
+typedef struct {
+ size_t written;
+ size_t count;
+ char * buf;
+ int error;
+} read_descriptor_t;
+
+typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
/*
* This is a generic file read routine, and uses the
@@ -577,23 +621,14 @@ static inline unsigned long generic_file_readahead(int reada_ok,
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
*/
-
-ssize_t generic_file_read(struct file * filp, char * buf,
- size_t count, loff_t *ppos)
+static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
struct dentry *dentry = filp->f_dentry;
struct inode *inode = dentry->d_inode;
- ssize_t error, read;
size_t pos, pgpos, page_cache;
int reada_ok;
int max_readahead = get_max_readahead(inode);
- if (!access_ok(VERIFY_WRITE, buf, count))
- return -EFAULT;
- if (!count)
- return 0;
- error = 0;
- read = 0;
page_cache = 0;
pos = *ppos;
@@ -621,12 +656,12 @@ ssize_t generic_file_read(struct file * filp, char * buf,
* Then, at least MIN_READAHEAD if read ahead is ok,
* and at most MAX_READAHEAD in all cases.
*/
- if (pos + count <= (PAGE_SIZE >> 1)) {
+ if (pos + desc->count <= (PAGE_SIZE >> 1)) {
filp->f_ramax = 0;
} else {
unsigned long needed;
- needed = ((pos + count) & PAGE_MASK) - pgpos;
+ needed = ((pos + desc->count) & PAGE_MASK) - pgpos;
if (filp->f_ramax < needed)
filp->f_ramax = needed;
@@ -679,20 +714,20 @@ success:
offset = pos & ~PAGE_MASK;
nr = PAGE_SIZE - offset;
- if (nr > count)
- nr = count;
if (nr > inode->i_size - pos)
nr = inode->i_size - pos;
- nr -= copy_to_user(buf, (void *) (page_address(page) + offset), nr);
- release_page(page);
- error = -EFAULT;
- if (!nr)
- break;
- buf += nr;
+
+ /*
+ * The actor routine returns how many bytes were actually used..
+ * NOTE! This may not be the same as how much of a user buffer
+ * we filled up (we may be padding etc), so we can only update
+ * "pos" here (the actor routine has to update the user buffer
+ * pointers and the remaining count).
+ */
+ nr = actor(desc, (const char *) (page_address(page) + offset), nr);
pos += nr;
- read += nr;
- count -= nr;
- if (count)
+ release_page(page);
+ if (nr && desc->count)
continue;
break;
}
@@ -710,7 +745,7 @@ no_cached_page:
*/
if (page_cache)
continue;
- error = -ENOMEM;
+ desc->error = -ENOMEM;
break;
}
@@ -739,11 +774,14 @@ no_cached_page:
if (reada_ok && filp->f_ramax > MIN_READAHEAD)
filp->f_ramax = MIN_READAHEAD;
- error = inode->i_op->readpage(filp, page);
- if (!error)
- goto found_page;
- release_page(page);
- break;
+ {
+ int error = inode->i_op->readpage(filp, page);
+ if (!error)
+ goto found_page;
+ desc->error = error;
+ release_page(page);
+ break;
+ }
page_read_error:
/*
@@ -751,15 +789,18 @@ page_read_error:
* Try to re-read it _once_. We do this synchronously,
* because this happens only if there were errors.
*/
- error = inode->i_op->readpage(filp, page);
- if (!error) {
- wait_on_page(page);
- if (PageUptodate(page) && !PageError(page))
- goto success;
- error = -EIO; /* Some unspecified error occurred.. */
+ {
+ int error = inode->i_op->readpage(filp, page);
+ if (!error) {
+ wait_on_page(page);
+ if (PageUptodate(page) && !PageError(page))
+ goto success;
+ error = -EIO; /* Some unspecified error occurred.. */
+ }
+ desc->error = error;
+ release_page(page);
+ break;
}
- release_page(page);
- break;
}
*ppos = pos;
@@ -767,9 +808,159 @@ page_read_error:
if (page_cache)
free_page(page_cache);
UPDATE_ATIME(inode);
- if (!read)
- read = error;
- return read;
+}
+
+static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
+{
+ unsigned long left;
+ unsigned long count = desc->count;
+
+ if (size > count)
+ size = count;
+ left = __copy_to_user(desc->buf, area, size);
+ if (left) {
+ size -= left;
+ desc->error = -EFAULT;
+ }
+ desc->count = count - size;
+ desc->written += size;
+ desc->buf += size;
+ return size;
+}
+
+/*
+ * This is the "read()" routine for all filesystems
+ * that can use the page cache directly.
+ */
+ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
+{
+ ssize_t retval;
+
+ retval = -EFAULT;
+ if (access_ok(VERIFY_WRITE, buf, count)) {
+ retval = 0;
+ if (count) {
+ read_descriptor_t desc;
+
+ desc.written = 0;
+ desc.count = count;
+ desc.buf = buf;
+ desc.error = 0;
+ do_generic_file_read(filp, ppos, &desc, file_read_actor);
+
+ retval = desc.written;
+ if (!retval)
+ retval = desc.error;
+ }
+ }
+ return retval;
+}
+
+static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
+{
+ ssize_t written;
+ unsigned long count = desc->count;
+ struct file *file = (struct file *) desc->buf;
+ struct inode *inode = file->f_dentry->d_inode;
+ mm_segment_t old_fs;
+
+ if (size > count)
+ size = count;
+ down(&inode->i_sem);
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ written = file->f_op->write(file, area, size, &file->f_pos);
+ set_fs(old_fs);
+ up(&inode->i_sem);
+ if (written < 0) {
+ desc->error = written;
+ written = 0;
+ }
+ desc->count = count - written;
+ desc->written += written;
+ return written;
+}
+
+asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
+{
+ ssize_t retval;
+ struct file * in_file, * out_file;
+ struct inode * in_inode, * out_inode;
+
+ lock_kernel();
+
+ /*
+ * Get input file, and verify that it is ok..
+ */
+ retval = -EBADF;
+ in_file = fget(in_fd);
+ if (!in_file)
+ goto out;
+ if (!(in_file->f_mode & FMODE_READ))
+ goto fput_in;
+ retval = -EINVAL;
+ in_inode = in_file->f_dentry->d_inode;
+ if (!in_inode)
+ goto fput_in;
+ if (!in_inode->i_op || !in_inode->i_op->readpage)
+ goto fput_in;
+ retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
+ if (retval)
+ goto fput_in;
+
+ /*
+ * Get output file, and verify that it is ok..
+ */
+ retval = -EBADF;
+ out_file = fget(out_fd);
+ if (!out_file)
+ goto fput_in;
+ if (!(out_file->f_mode & FMODE_WRITE))
+ goto fput_out;
+ retval = -EINVAL;
+ if (!out_file->f_op || !out_file->f_op->write)
+ goto fput_out;
+ out_inode = out_file->f_dentry->d_inode;
+ if (!out_inode)
+ goto fput_out;
+ retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
+ if (retval)
+ goto fput_out;
+
+ retval = 0;
+ if (count) {
+ read_descriptor_t desc;
+ loff_t pos = 0, *ppos;
+
+ retval = -EFAULT;
+ ppos = &in_file->f_pos;
+ if (offset) {
+ if (get_user(pos, offset))
+ goto fput_out;
+ ppos = &pos;
+ }
+
+ desc.written = 0;
+ desc.count = count;
+ desc.buf = (char *) out_file;
+ desc.error = 0;
+ do_generic_file_read(in_file, ppos, &desc, file_send_actor);
+
+ retval = desc.written;
+ if (!retval)
+ retval = desc.error;
+ if (offset)
+ put_user(pos, offset);
+ }
+
+
+fput_out:
+ fput(out_file);
+fput_in:
+ fput(in_file);
+out:
+ unlock_kernel();
+ return retval;
}
/*
@@ -903,7 +1094,7 @@ page_read_error:
goto success;
/*
- * Uhhuh.. Things didn't work out. Return zero to tell the
+ * Things didn't work out. Return zero to tell the
* mm layer so, possibly freeing the page cache page first.
*/
failure:
@@ -1257,6 +1448,7 @@ asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
struct vm_area_struct * vma;
int unmapped_error, error = -EINVAL;
+ down(&current->mm->mmap_sem);
lock_kernel();
if (start & ~PAGE_MASK)
goto out;
@@ -1304,6 +1496,7 @@ asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
}
out:
unlock_kernel();
+ up(&current->mm->mmap_sem);
return error;
}
@@ -1412,7 +1605,7 @@ page_wait:
set_bit(PG_uptodate, &page->flags);
do_update_page:
- /* Alright, the page is there. Now update it. */
+ /* All right, the page is there. Now update it. */
status = inode->i_op->updatepage(file, page, buf,
offset, bytes, sync);
done_with_page:
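
The read_descriptor_t / read_actor_t split above lets do_generic_file_read() feed page-cache data to different consumers (file_read_actor() for read(2), file_send_actor() for sendfile(2)). Below is a small user-space model of that pattern, not kernel code and not part of the patch: only the two typedefs mirror the diff, while copy_actor(), generic_read() and the driver in main() are invented to show how an actor consumes chunks and updates the descriptor.

/* User-space model of the descriptor/actor pattern (illustration only). */
#include <stdio.h>
#include <string.h>

typedef struct {
	size_t written;
	size_t count;
	char *buf;
	int error;
} read_descriptor_t;

typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);

/* Actor that copies into desc->buf, like file_read_actor() minus the
 * user-space copy and fault handling. */
static int copy_actor(read_descriptor_t *desc, const char *area, unsigned long size)
{
	if (size > desc->count)
		size = desc->count;
	memcpy(desc->buf, area, size);
	desc->count -= size;
	desc->written += size;
	desc->buf += size;
	return size;		/* bytes actually consumed */
}

/* Generic loop: hand fixed-size chunks to the actor until it is done. */
static void generic_read(const char *src, size_t len,
			 read_descriptor_t *desc, read_actor_t actor)
{
	const unsigned long chunk = 8;	/* stands in for PAGE_SIZE */
	size_t pos = 0;

	while (pos < len) {
		unsigned long nr = (len - pos < chunk) ? len - pos : chunk;
		nr = actor(desc, src + pos, nr);
		pos += nr;
		if (!nr || !desc->count)
			break;
	}
}

int main(void)
{
	const char data[] = "data flowing through the actor";
	char out[64];
	read_descriptor_t desc = { 0, sizeof(out) - 1, out, 0 };

	generic_read(data, strlen(data), &desc, copy_actor);
	out[desc.written] = '\0';
	printf("%zu bytes: %s\n", desc.written, out);
	return 0;
}
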
diff --git a/mm/memory.c b/mm/memory.c
index af4297702..77a814f07 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -44,6 +44,8 @@
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -68,8 +70,6 @@ static inline void copy_cow_page(unsigned long from, unsigned long to)
copy_page(to, from);
}
-#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
-
mem_map_t * mem_map = NULL;
/*
@@ -121,22 +121,41 @@ static inline void free_one_pgd(pgd_t * dir)
pmd_free(pmd);
}
+/* Low and high watermarks for page table cache.
+ The system should try to have pgt_water[0] <= cache elements <= pgt_water[1]
+ */
+int pgt_cache_water[2] = { 25, 50 };
+
+/* Returns the number of pages freed */
+int check_pgt_cache(void)
+{
+ return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
+}
+
+
/*
* This function clears all user-level page tables of a process - this
* is needed by execve(), so that old pages aren't in the way.
*/
void clear_page_tables(struct task_struct * tsk)
{
+ pgd_t * page_dir = tsk->mm->pgd;
int i;
- pgd_t * page_dir;
- page_dir = tsk->mm->pgd;
- if (!page_dir || page_dir == swapper_pg_dir) {
- printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
- return;
- }
+ if (!page_dir || page_dir == swapper_pg_dir)
+ goto out_bad;
for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
free_one_pgd(page_dir + i);
+
+ /* keep the page table cache within bounds */
+ check_pgt_cache();
+ return;
+
+out_bad:
+ printk(KERN_ERR
+ "clear_page_tables: %s trying to clear kernel pgd\n",
+ tsk->comm);
+ return;
}
/*
@@ -146,30 +165,34 @@ void clear_page_tables(struct task_struct * tsk)
*/
void free_page_tables(struct mm_struct * mm)
{
+ pgd_t * page_dir = mm->pgd;
int i;
- pgd_t * page_dir;
- page_dir = mm->pgd;
- if (page_dir) {
- if (page_dir == swapper_pg_dir) {
- printk("free_page_tables: Trying to free kernel pgd\n");
- return;
- }
- for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
- free_one_pgd(page_dir + i);
- pgd_free(page_dir);
- }
+ if (!page_dir)
+ goto out;
+ if (page_dir == swapper_pg_dir)
+ goto out_bad;
+ for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
+ free_one_pgd(page_dir + i);
+ pgd_free(page_dir);
+
+ /* keep the page table cache within bounds */
+ check_pgt_cache();
+out:
+ return;
+
+out_bad:
+ printk(KERN_ERR
+ "free_page_tables: Trying to free kernel pgd\n");
+ return;
}
int new_page_tables(struct task_struct * tsk)
{
- pgd_t * page_dir, * new_pg;
+ pgd_t * new_pg;
if (!(new_pg = pgd_alloc()))
return -ENOMEM;
- page_dir = pgd_offset(&init_mm, 0);
- memcpy(new_pg + USER_PTRS_PER_PGD, page_dir + USER_PTRS_PER_PGD,
- (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof (pgd_t));
SET_PAGE_DIR(tsk, new_pg);
tsk->mm->pgd = new_pg;
return 0;
@@ -898,6 +921,9 @@ static inline void handle_pte_fault(struct task_struct *tsk,
do_wp_page(tsk, vma, address, write_access, pte);
}
+/*
+ * By the time we get here, we already hold the mm semaphore
+ */
void handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
unsigned long address, int write_access)
{
@@ -912,9 +938,27 @@ void handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
pte = pte_alloc(pmd, address);
if (!pte)
goto no_memory;
+ lock_kernel();
handle_pte_fault(tsk, vma, address, write_access, pte);
+ unlock_kernel();
update_mmu_cache(vma, address, *pte);
return;
no_memory:
oom(tsk);
}
+
+/*
+ * Simplistic page force-in..
+ */
+void make_pages_present(unsigned long addr, unsigned long end)
+{
+ int write;
+ struct vm_area_struct * vma;
+
+ vma = find_vma(current->mm, addr);
+ write = (vma->vm_flags & VM_WRITE) != 0;
+ while (addr < end) {
+ handle_mm_fault(current, vma, addr, write);
+ addr += PAGE_SIZE;
+ }
+}
diff --git a/mm/mlock.c b/mm/mlock.c
index 3a322f8a5..527443946 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -126,14 +126,7 @@ static int mlock_fixup(struct vm_area_struct * vma,
if (!(newflags & VM_LOCKED))
pages = -pages;
vma->vm_mm->locked_vm += pages;
-
- if (newflags & VM_LOCKED)
- while (start < end) {
- char c;
- get_user(c,(char *) start);
- __asm__ __volatile__("": :"r" (c));
- start += PAGE_SIZE;
- }
+ make_pages_present(start, end);
}
return retval;
}
@@ -192,6 +185,7 @@ asmlinkage int sys_mlock(unsigned long start, size_t len)
unsigned long lock_limit;
int error = -ENOMEM;
+ down(&current->mm->mmap_sem);
lock_kernel();
len = (len + (start & ~PAGE_MASK) + ~PAGE_MASK) & PAGE_MASK;
start &= PAGE_MASK;
@@ -214,6 +208,7 @@ asmlinkage int sys_mlock(unsigned long start, size_t len)
error = do_mlock(start, len, 1);
out:
unlock_kernel();
+ up(&current->mm->mmap_sem);
return error;
}
@@ -221,11 +216,13 @@ asmlinkage int sys_munlock(unsigned long start, size_t len)
{
int ret;
+ down(&current->mm->mmap_sem);
lock_kernel();
len = (len + (start & ~PAGE_MASK) + ~PAGE_MASK) & PAGE_MASK;
start &= PAGE_MASK;
ret = do_mlock(start, len, 0);
unlock_kernel();
+ up(&current->mm->mmap_sem);
return ret;
}
@@ -263,6 +260,7 @@ asmlinkage int sys_mlockall(int flags)
unsigned long lock_limit;
int ret = -EINVAL;
+ down(&current->mm->mmap_sem);
lock_kernel();
if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
goto out;
@@ -282,6 +280,7 @@ asmlinkage int sys_mlockall(int flags)
ret = do_mlockall(flags);
out:
unlock_kernel();
+ up(&current->mm->mmap_sem);
return ret;
}
@@ -289,8 +288,10 @@ asmlinkage int sys_munlockall(void)
{
int ret;
+ down(&current->mm->mmap_sem);
lock_kernel();
ret = do_mlockall(0);
unlock_kernel();
+ up(&current->mm->mmap_sem);
return ret;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 52c185e85..172bcd8f1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -57,19 +57,19 @@ int vm_enough_memory(long pages)
* simple, it hopefully works in most obvious cases.. Easy to
* fool it, but this should catch most mistakes.
*/
- long freepages;
+ long free;
/* Sometimes we want to use more memory than we have. */
if (sysctl_overcommit_memory)
return 1;
- freepages = buffermem >> PAGE_SHIFT;
- freepages += page_cache_size;
- freepages >>= 1;
- freepages += nr_free_pages;
- freepages += nr_swap_pages;
- freepages -= num_physpages >> 4;
- return freepages > pages;
+ free = buffermem >> PAGE_SHIFT;
+ free += page_cache_size;
+ free >>= 1;
+ free += nr_free_pages;
+ free += nr_swap_pages;
+ free -= num_physpages >> 4;
+ return free > pages;
}
/* Remove one vm structure from the inode's i_mmap ring. */
@@ -92,6 +92,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
+ down(&mm->mmap_sem);
lock_kernel();
if (brk < mm->end_code)
goto out;
@@ -109,9 +110,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
/* Check against rlimit and stack.. */
rlim = current->rlim[RLIMIT_DATA].rlim_cur;
- if (rlim >= RLIM_INFINITY)
- rlim = ~0;
- if (brk - mm->end_code > rlim)
+ if (rlim < RLIM_INFINITY && brk - mm->end_code > rlim)
goto out;
/* Check against existing mmap mappings. */
@@ -132,6 +131,7 @@ set_brk:
out:
retval = mm->brk;
unlock_kernel();
+ up(&mm->mmap_sem);
return retval;
}
@@ -196,9 +196,14 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
if ((prot & PROT_WRITE) && !(file->f_mode & 2))
return -EACCES;
+ /* Make sure we don't allow writing to an append-only file.. */
+ if (IS_APPEND(file->f_dentry->d_inode) && (file->f_mode & 2))
+ return -EACCES;
+
/* make sure there are no mandatory locks on the file. */
if (locks_verify_locked(file->f_dentry->d_inode))
return -EAGAIN;
+
/* fall through */
case MAP_PRIVATE:
if (!(file->f_mode & 1))
@@ -316,16 +321,9 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
merge_segments(mm, vma->vm_start, vma->vm_end);
mm->total_vm += len >> PAGE_SHIFT;
- if ((flags & VM_LOCKED) && !(flags & VM_IO)) {
- unsigned long start = addr;
+ if (flags & VM_LOCKED) {
mm->locked_vm += len >> PAGE_SHIFT;
- do {
- char c;
- get_user(c,(char *) start);
- len -= PAGE_SIZE;
- start += PAGE_SIZE;
- __asm__ __volatile__("": :"r" (c));
- } while (len > 0);
+ make_pages_present(addr, addr + len);
}
return addr;
@@ -428,30 +426,10 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr,
insert_vm_struct(current->mm, mpnt);
}
- /* Close the current area ... */
- if (area->vm_ops && area->vm_ops->close) {
- end = area->vm_end; /* save new end */
- area->vm_end = area->vm_start;
- area->vm_ops->close(area);
- area->vm_end = end;
- }
- /* ... then reopen and reinsert. */
- if (area->vm_ops && area->vm_ops->open)
- area->vm_ops->open(area);
insert_vm_struct(current->mm, area);
return 1;
}
-asmlinkage int sys_munmap(unsigned long addr, size_t len)
-{
- int ret;
-
- lock_kernel();
- ret = do_munmap(addr, len);
- unlock_kernel();
- return ret;
-}
-
/* Munmap is split into 2 main parts -- this part which finds
* what needs doing, and the areas themselves, which do the
* work. This now handles partial unmappings.
@@ -460,7 +438,7 @@ asmlinkage int sys_munmap(unsigned long addr, size_t len)
int do_munmap(unsigned long addr, size_t len)
{
struct mm_struct * mm;
- struct vm_area_struct *mpnt, *next, *free, *extra;
+ struct vm_area_struct *mpnt, *free, *extra;
int freed;
if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr)
@@ -481,6 +459,11 @@ int do_munmap(unsigned long addr, size_t len)
if (!mpnt)
return 0;
+ /* If we'll make "hole", check the vm areas limit */
+ if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) &&
+ mm->map_count > MAX_MAP_COUNT)
+ return -ENOMEM;
+
/*
* We may need one additional vma to fix up the mappings ...
* and this is the last chance for an easy error exit.
@@ -489,9 +472,7 @@ int do_munmap(unsigned long addr, size_t len)
if (!extra)
return -ENOMEM;
- next = mpnt->vm_next;
-
- /* we have mpnt->vm_next = next and addr < mpnt->vm_end */
+ /* we have addr < mpnt->vm_end */
free = NULL;
for ( ; mpnt && mpnt->vm_start < addr+len; ) {
struct vm_area_struct *next = mpnt->vm_next;
@@ -505,13 +486,6 @@ int do_munmap(unsigned long addr, size_t len)
mpnt = next;
}
- if (free && (free->vm_start < addr) && (free->vm_end > addr+len)) {
- if (mm->map_count > MAX_MAP_COUNT) {
- kmem_cache_free(vm_area_cachep, extra);
- return -ENOMEM;
- }
- }
-
/* Ok - we have the memory areas we should free on the 'free' list,
* so release them, and unmap the page range..
* If the one of the segments is only being partially unmapped,
@@ -555,6 +529,18 @@ int do_munmap(unsigned long addr, size_t len)
return 0;
}
+asmlinkage int sys_munmap(unsigned long addr, size_t len)
+{
+ int ret;
+
+ down(&current->mm->mmap_sem);
+ lock_kernel();
+ ret = do_munmap(addr, len);
+ unlock_kernel();
+ up(&current->mm->mmap_sem);
+ return ret;
+}
+
/* Release all mmaps. */
void exit_mmap(struct mm_struct * mm)
{
@@ -630,13 +616,13 @@ void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp)
* This assumes that the list is ordered by address.
* We don't need to traverse the entire list, only those segments
* which intersect or are adjacent to a given interval.
+ *
+ * We must already hold the mm semaphore when we get here..
*/
void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
{
struct vm_area_struct *prev, *mpnt, *next;
- down(&mm->mmap_sem);
-
prev = NULL;
mpnt = mm->mmap;
while(mpnt && mpnt->vm_end <= start_addr) {
@@ -644,7 +630,7 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l
mpnt = mpnt->vm_next;
}
if (!mpnt)
- goto no_vma;
+ return;
next = mpnt->vm_next;
@@ -700,8 +686,6 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l
mpnt = prev;
}
mm->mmap_cache = NULL; /* Kill the cache. */
-no_vma:
- up(&mm->mmap_sem);
}
__initfunc(void vma_init(void))
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 0c5dac4cd..cc78e10ab 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -208,18 +208,20 @@ asmlinkage int sys_mprotect(unsigned long start, size_t len, unsigned long prot)
struct vm_area_struct * vma, * next;
int error = -EINVAL;
- lock_kernel();
if (start & ~PAGE_MASK)
- goto out;
+ return -EINVAL;
len = (len + ~PAGE_MASK) & PAGE_MASK;
end = start + len;
if (end < start)
- goto out;
+ return -EINVAL;
if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC))
- goto out;
- error = 0;
+ return -EINVAL;
if (end == start)
- goto out;
+ return 0;
+
+ down(&current->mm->mmap_sem);
+ lock_kernel();
+
vma = find_vma(current->mm, start);
error = -EFAULT;
if (!vma || vma->vm_start > start)
@@ -256,5 +258,6 @@ asmlinkage int sys_mprotect(unsigned long start, size_t len, unsigned long prot)
merge_segments(current->mm, start, end);
out:
unlock_kernel();
+ up(&current->mm->mmap_sem);
return error;
}
diff --git a/mm/mremap.c b/mm/mremap.c
index a31a0ae14..cd7a7eb4a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -21,6 +21,8 @@
#include <asm/system.h>
#include <asm/pgtable.h>
+extern int vm_enough_memory(long pages);
+
static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr)
{
pgd_t * pgd;
@@ -167,6 +169,7 @@ asmlinkage unsigned long sys_mremap(unsigned long addr,
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
+ down(&current->mm->mmap_sem);
lock_kernel();
if (addr & ~PAGE_MASK)
goto out;
@@ -178,7 +181,7 @@ asmlinkage unsigned long sys_mremap(unsigned long addr,
* the unnecessary pages..
*/
ret = addr;
- if (old_len > new_len) {
+ if (old_len >= new_len) {
do_munmap(addr+new_len, old_len - new_len);
goto out;
}
@@ -204,6 +207,11 @@ asmlinkage unsigned long sys_mremap(unsigned long addr,
if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len)
> current->rlim[RLIMIT_AS].rlim_cur)
goto out;
+ /* Private writable mapping? Check memory availability.. */
+ if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
+ !(flags & MAP_NORESERVE) &&
+ !vm_enough_memory((new_len - old_len) >> PAGE_SHIFT))
+ goto out;
/* old_len exactly to the end of the area.. */
if (old_len == vma->vm_end - addr &&
@@ -233,5 +241,6 @@ asmlinkage unsigned long sys_mremap(unsigned long addr,
ret = -ENOMEM;
out:
unlock_kernel();
+ up(&current->mm->mmap_sem);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d61d74f44..c51db59d9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -98,53 +98,33 @@ static inline void remove_mem_queue(struct page * entry)
*
* Hint: -mask = 1+~mask
*/
-static spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;
/*
- * This routine is used by the kernel swap deamon to determine
+ * This routine is used by the kernel swap daemon to determine
* whether we have "enough" free pages. It is fairly arbitrary,
- * but this had better return false if any reasonable "get_free_page()"
- * allocation could currently fail..
+ * having a low-water and high-water mark.
*
- * This will return zero if no list was found, non-zero
- * if there was memory (the bigger, the better).
+ * This returns:
+ * 0 - urgent need for memory
+ * 1 - need some memory, but do it slowly in the background
+ * 2 - no need to even think about it.
*/
-int free_memory_available(int nr)
+int free_memory_available(void)
{
- int retval = 0;
- unsigned long flags;
- struct free_area_struct * list;
+ static int available = 1;
- /*
- * If we have more than about 3% to 5% of all memory free,
- * consider it to be good enough for anything.
- * It may not be, due to fragmentation, but we
- * don't want to keep on forever trying to find
- * free unfragmented memory.
- * Added low/high water marks to avoid thrashing -- Rik.
- */
- if (nr_free_pages > (nr ? freepages.low : freepages.high))
- return nr+1;
+ if (nr_free_pages < freepages.low) {
+ available = 0;
+ return 0;
+ }
- list = free_area + NR_MEM_LISTS;
- spin_lock_irqsave(&page_alloc_lock, flags);
- /* We fall through the loop if the list contains one
- * item. -- thanks to Colin Plumb <colin@nyx.net>
- */
- do {
- list--;
- /* Empty list? Bad - we need more memory */
- if (list->next == memory_head(list))
- break;
- /* One item on the list? Look further */
- if (list->next->next == memory_head(list))
- continue;
- /* More than one item? We're ok */
- retval = nr + 1;
- break;
- } while (--nr >= 0);
- spin_unlock_irqrestore(&page_alloc_lock, flags);
- return retval;
+ if (nr_free_pages > freepages.high) {
+ available = 1;
+ return 2;
+ }
+
+ return available;
}
static inline void free_pages_ok(unsigned long map_nr, unsigned long order)
@@ -182,9 +162,11 @@ void __free_page(struct page *page)
if (PageSwapCache(page))
panic ("Freeing swap cache page");
free_pages_ok(page->map_nr, 0);
+ return;
}
if (PageSwapCache(page) && atomic_read(&page->count) == 1)
- panic ("Releasing swap cache page");
+ printk(KERN_WARNING "VM: Releasing swap cache page at %p",
+ __builtin_return_address(0));
}
void free_pages(unsigned long addr, unsigned long order)
@@ -202,8 +184,9 @@ void free_pages(unsigned long addr, unsigned long order)
return;
}
if (PageSwapCache(map) && atomic_read(&map->count) == 1)
- panic ("Releasing swap cache pages at %p",
- __builtin_return_address(0));
+ printk(KERN_WARNING
+ "VM: Releasing swap cache pages at %p",
+ __builtin_return_address(0));
}
}
@@ -214,13 +197,11 @@ void free_pages(unsigned long addr, unsigned long order)
change_bit((index) >> (1+(order)), (area)->map)
#define CAN_DMA(x) (PageDMA(x))
#define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT))
-#define RMQUEUE(order, maxorder, dma) \
+#define RMQUEUE(order, dma) \
do { struct free_area_struct * area = free_area+order; \
unsigned long new_order = order; \
do { struct page *prev = memory_head(area), *ret = prev->next; \
while (memory_head(area) != ret) { \
- if (new_order >= maxorder && ret->next == prev) \
- break; \
if (!dma || CAN_DMA(ret)) { \
unsigned long map_nr = ret->map_nr; \
(prev->next = ret->next)->prev = prev; \
@@ -252,39 +233,46 @@ do { unsigned long size = 1 << high; \
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
- unsigned long flags, maxorder;
+ unsigned long flags;
if (order >= NR_MEM_LISTS)
goto nopage;
- /*
- * "maxorder" is the highest order number that we're allowed
- * to empty in order to find a free page..
- */
- maxorder = NR_MEM_LISTS-1;
- if (gfp_mask & __GFP_HIGH)
- maxorder = NR_MEM_LISTS;
-
- if (in_interrupt() && (gfp_mask & __GFP_WAIT)) {
- static int count = 0;
- if (++count < 5) {
- printk("gfp called nonatomically from interrupt %p\n",
- return_address());
- gfp_mask &= ~__GFP_WAIT;
+ if (gfp_mask & __GFP_WAIT) {
+ if (in_interrupt()) {
+ static int count = 0;
+ if (++count < 5) {
+ printk("gfp called nonatomically from interrupt %p\n",
+ __builtin_return_address(0));
+ }
+ goto nopage;
}
- }
- for (;;) {
- spin_lock_irqsave(&page_alloc_lock, flags);
- RMQUEUE(order, maxorder, (gfp_mask & GFP_DMA));
- spin_unlock_irqrestore(&page_alloc_lock, flags);
- if (!(gfp_mask & __GFP_WAIT))
- break;
- if (!try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX))
- break;
- gfp_mask &= ~__GFP_WAIT; /* go through this only once */
- maxorder = NR_MEM_LISTS; /* Allow anything this time */
+ if (freepages.min > nr_free_pages) {
+ int freed;
+ freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);
+ /*
+ * Low priority (user) allocations must not
+ * succeed if we didn't have enough memory
+ * and we couldn't get more..
+ */
+ if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
+ goto nopage;
+ }
}
+ spin_lock_irqsave(&page_alloc_lock, flags);
+ RMQUEUE(order, (gfp_mask & GFP_DMA));
+ spin_unlock_irqrestore(&page_alloc_lock, flags);
+
+ /*
+ * If we failed to find anything, we'll return NULL, but we'll
+ * wake up kswapd _now_ ad even wait for it synchronously if
+ * we can.. This way we'll at least make some forward progress
+ * over time.
+ */
+ wake_up(&kswapd_wait);
+ if (gfp_mask & __GFP_WAIT)
+ schedule();
nopage:
return 0;
}
@@ -300,6 +288,11 @@ void show_free_areas(void)
unsigned long total = 0;
printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
+ printk("Free: %d (%d %d %d)\n",
+ nr_free_pages,
+ freepages.min,
+ freepages.low,
+ freepages.high);
spin_lock_irqsave(&page_alloc_lock, flags);
for (order=0 ; order < NR_MEM_LISTS; order++) {
struct page * tmp;
@@ -329,22 +322,23 @@ __initfunc(unsigned long free_area_init(unsigned long start_mem, unsigned long e
{
mem_map_t * p;
unsigned long mask = PAGE_MASK;
- int i;
+ unsigned long i;
/*
* Select nr of pages we try to keep free for important stuff
- * with a minimum of 48 pages and a maximum of 256 pages, so
+ * with a minimum of 10 pages and a maximum of 256 pages, so
* that we don't waste too much memory on large systems.
- * This is totally arbitrary.
+ * This is fairly arbitrary, but based on some behaviour
+ * analysis.
*/
i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
- if (i < 48)
- i = 48;
+ if (i < 10)
+ i = 10;
if (i > 256)
i = 256;
freepages.min = i;
- freepages.low = i << 1;
- freepages.high = freepages.low + i;
+ freepages.low = i * 2;
+ freepages.high = i * 3;
mem_map = (mem_map_t *) LONG_ALIGN(start_mem);
p = mem_map + MAP_NR(end_mem);
start_mem = LONG_ALIGN((unsigned long) p);
diff --git a/mm/page_io.c b/mm/page_io.c
index eb436f7b7..7e5a35186 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -74,18 +74,19 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
return;
}
if (p->swap_map && !p->swap_map[offset]) {
- printk("Hmm.. Trying to %s unallocated swap (%08lx)\n",
- (rw == READ) ? "read" : "write",
- entry);
+ printk(KERN_ERR "rw_swap_page: "
+ "Trying to %s unallocated swap (%08lx)\n",
+ (rw == READ) ? "read" : "write", entry);
return;
}
if (!(p->flags & SWP_USED)) {
- printk("Trying to swap to unused swap-device\n");
+ printk(KERN_ERR "rw_swap_page: "
+ "Trying to swap to unused swap-device\n");
return;
}
if (!PageLocked(page)) {
- printk("VM: swap page is unlocked\n");
+ printk(KERN_ERR "VM: swap page is unlocked\n");
return;
}
@@ -111,11 +112,11 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
* hashing for locked pages.
*/
if (!PageSwapCache(page)) {
- printk("VM: swap page is not in swap cache\n");
+ printk(KERN_ERR "VM: swap page is not in swap cache\n");
return;
}
if (page->offset != entry) {
- printk ("swap entry mismatch");
+ printk (KERN_ERR "VM: swap entry mismatch\n");
return;
}
@@ -142,7 +143,7 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
if (swapf->i_op->bmap == NULL
&& swapf->i_op->smap != NULL){
/*
- With MsDOS, we use msdos_smap which return
+ With MS-DOS, we use msdos_smap which return
a sector number (not a cluster or block number).
It is a patch to enable the UMSDOS project.
Other people are working on better solution.
@@ -179,11 +180,14 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
clear_bit(PG_locked, &page->flags);
wake_up(&page->wait);
} else
- printk("rw_swap_page: no swap file or device\n");
+ printk(KERN_ERR "rw_swap_page: no swap file or device\n");
+ /* This shouldn't happen, but check to be sure. */
+ if (atomic_read(&page->count) == 1)
+ printk(KERN_ERR "rw_swap_page: page unused while waiting!\n");
atomic_dec(&page->count);
if (offset && !test_and_clear_bit(offset,p->swap_lockmap))
- printk("rw_swap_page: lock already cleared\n");
+ printk(KERN_ERR "rw_swap_page: lock already cleared\n");
wake_up(&lock_queue);
#ifdef DEBUG_SWAP
printk ("DebugVM: %s_swap_page finished on page %p (count %d)\n",
diff --git a/mm/simp.c b/mm/simp.c
deleted file mode 100644
index 581cde3d7..000000000
--- a/mm/simp.c
+++ /dev/null
@@ -1,435 +0,0 @@
-#define NULL 0
-/*
- * mm/simp.c -- simple allocator for cached objects
- *
- * (C) 1997 Thomas Schoebel-Theuer
- */
-
-#include <linux/simp.h>
-#include <linux/tasks.h>
-#include <linux/smp.h>
-#include <linux/mm.h>
-#include <asm/spinlock.h>
-
-/* The next two defines can be independently enabled for debugging */
-/*#define DEBUG*/
-/*#define DEAD_BEEF*/
-
-#ifdef DEAD_BEEF
-#define DEBUG_BEEF 1
-#else
-#define DEBUG_BEEF 0
-#endif
-
-#ifdef __SMP__
-#define NR_PROCESSORS NR_CPUS
-#define GLOBAL_SIZE CHUNK_SIZE
-#else
-#define NR_PROCESSORS 1
-#define GLOBAL_SIZE PAGE_SIZE
-#endif
-
-#define POSTBUFFER_SIZE 63
-#define ORDER 2
-#define CHUNK_SIZE (PAGE_SIZE*(1<<ORDER))
-#define CHUNK_BASE(ptr) (struct header*)(((unsigned long)(ptr)) & ~(CHUNK_SIZE-1))
-#define CHUNK_END(hdr) (void**)((char*)(hdr) + CHUNK_SIZE)
-
-#define COLOR_INCREMENT (8*sizeof(void*)) /* should be 1 cache line */
-#define ALIGN_CACHE(adr) ((((((unsigned long)adr) - 1) / COLOR_INCREMENT) + 1) * COLOR_INCREMENT)
-#define HEADER_SIZE ALIGN_CACHE(sizeof(struct header))
-#define ELEM_SIZE ALIGN_CACHE(sizeof(struct elem))
-#define FILL_TYPE(name,wrongsize) char name[ALIGN_CACHE(wrongsize)-(wrongsize)]
-
-#define MAX_SIMPS ((GLOBAL_SIZE / sizeof(struct simp)) - 1)
-
-struct header { /* this is at the beginning of each memory region */
- /* 1st cache line */
- void ** index;
- void ** fresh;
- struct simp * father;
- void ** emptypos;
- struct header * next;
- structor again_ctor;
- structor first_ctor;
- void * fill[1];
-#ifdef DEBUG
- /* 2nd cache line */
- char magic[32];
-#endif
-};
-
-struct per_processor {
- void ** buffer_pos;
- void * postbuffer[POSTBUFFER_SIZE];
-};
-
-struct simp {
- /* 1st cache lines */
- struct per_processor private[NR_PROCESSORS];
- /* next cache line */
- struct header * usable_list;
- spinlock_t lock;
- /* This value is negative on Alpha SMP. */
- /* char fill[sizeof(void*) - sizeof(spinlock_t)]; */
- long real_size;
- long max_elems;
- structor again_ctor;
- structor first_ctor;
- structor dtor;
- long fill2;
- /* next cache line */
- long create_offset;
- long color;
- long max_color;
- long size;
- long fill3[4];
- /* next cache line */
- char name[32];
-};
-
-struct global_data {
- /* 1st cache line */
- long changed_flag;
- long nr_simps;
- spinlock_t lock;
- char fill[(6+8)*sizeof(void*)+sizeof(void*)-sizeof(spinlock_t)];
- /* rest */
- struct simp simps[MAX_SIMPS];
-};
-
-static struct global_data * global = NULL;
-
-#ifdef DEBUG
-static char global_magic[32] = "SIMP header SdC581oi9rY20051962\n";
-#endif
-
-struct simp * simp_create(char * name, long size,
- structor first_ctor,
- structor again_ctor,
- structor dtor)
-{
- struct simp * simp;
- long fraction;
- long real_size;
- int cpu;
-
- if(!global) {
-#ifdef __SMP__
- global = (struct global_data*)__get_free_pages(GFP_KERNEL, ORDER);
- memset(global, 0, CHUNK_SIZE);
-#else
- global = (struct global_data*)get_free_page(GFP_KERNEL);
-#endif
- spin_lock_init(&global->lock);
- }
-
- spin_lock(&global->lock);
- simp = &global->simps[global->nr_simps++];
- spin_unlock(&global->lock);
-
- if(global->nr_simps >= MAX_SIMPS) {
- printk("SIMP: too many simps allocated\n");
- return NULL;
- }
- memset(simp, 0, sizeof(struct simp));
- spin_lock_init(&simp->lock);
- strncpy(simp->name, name, 15);
- simp->size = size;
- simp->real_size = real_size = ALIGN_CACHE(size);
- /* allow aggregation of very small objects in 2-power fractions of
- * cachelines */
- fraction = COLOR_INCREMENT / 2;
- while(size <= fraction && fraction >= sizeof(void*)) {
- simp->real_size = fraction;
- fraction >>= 1;
- }
- simp->first_ctor = first_ctor;
- simp->again_ctor = again_ctor;
- simp->dtor = dtor;
-
- real_size += sizeof(void*);
- simp->max_elems = (CHUNK_SIZE - HEADER_SIZE) / real_size;
- simp->max_color = (CHUNK_SIZE - HEADER_SIZE) % real_size;
- for(cpu = 0; cpu < NR_PROCESSORS; cpu++) {
- struct per_processor * private = &simp->private[cpu];
- private->buffer_pos = private->postbuffer;
- }
- return simp;
-}
-
-/* Do *not* inline this, it clobbers too many registers... */
-static void alloc_header(struct simp * simp)
-{
- struct header * hdr;
- char * ptr;
- void ** index;
- long count;
-
- spin_unlock(&simp->lock);
- for(;;) {
- hdr = (struct header*)__get_free_pages(GFP_KERNEL, ORDER);
- if(hdr)
- break;
- if(!simp_garbage())
- return;
- }
-#ifdef DEBUG
- if(CHUNK_BASE(hdr) != hdr)
- panic("simp: bad kernel page alignment");
-#endif
-
- memset(hdr, 0, HEADER_SIZE);
-#ifdef DEBUG
- memcpy(hdr->magic, global_magic, sizeof(global_magic));
-#endif
- hdr->father = simp;
- hdr->again_ctor = simp->again_ctor;
- hdr->first_ctor = simp->first_ctor;
-
- /* note: races on simp->color don't produce any error :-) */
- ptr = ((char*)hdr) + HEADER_SIZE + simp->color;
- index = CHUNK_END(hdr);
- for(count = 0; count < simp->max_elems; count++) {
- *--index = ptr;
- ptr += simp->real_size;
- /* note: constructors are not called here in bunch but
- * instead at each single simp_alloc(), in order
- * to maximize chances that the cache will be
- * polluted after a simp_alloc() anyway,
- * and not here. */
- }
- hdr->index = hdr->fresh = hdr->emptypos = index;
-
- spin_lock(&simp->lock);
- simp->color += COLOR_INCREMENT;
- if(simp->color >= simp->max_color)
- simp->color = 0;
- hdr->next = simp->usable_list;
- simp->usable_list = hdr;
-}
-
-/* current x86 memcpy() is horribly moving around registers for nothing,
- * is doing unnecessary work if the size is dividable by a power-of-two,
- * and it clobbers way too many registers.
- * This results in nearly any other register being transfered to stack.
- * Fixing this would be a major win for the whole kernel!
- */
-static void ** bunch_alloc(struct simp * simp, void ** buffer)
-{
- struct header * hdr;
- void ** index;
- void ** to;
- void ** end;
- structor todo;
- long length;
-
- spin_lock(&simp->lock);
- hdr = simp->usable_list;
- if(!hdr) {
- alloc_header(simp);
- hdr = simp->usable_list;
- if(!hdr) {
- spin_unlock(&simp->lock);
- *buffer = NULL;
- return buffer+1;
- }
- }
-
- index = hdr->index;
- end = hdr->fresh;
- todo = hdr->again_ctor;
- if(index == end) {
- end = CHUNK_END(hdr);
- todo = hdr->first_ctor;
- }
- to = index + POSTBUFFER_SIZE/2;
- if(to >= end) {
- to = end;
- if(to == CHUNK_END(hdr)) {
- simp->usable_list = hdr->next;
- hdr->next = NULL;
- }
- }
- if(to > hdr->fresh)
- hdr->fresh = to;
- hdr->index = to;
- length = ((unsigned long)to) - (unsigned long)index;
- to = buffer + (length/sizeof(void**));
-
- memcpy(buffer, index, length);
-
- spin_unlock(&simp->lock);
-
- if(todo) {
- do {
- todo(*buffer++);
- } while(buffer < to);
- }
- return to;
-}
-
-void * simp_alloc(struct simp * simp)
-{
-#ifdef __SMP__
- const long cpu = smp_processor_id();
- struct per_processor * priv = &simp->private[cpu];
-#else
-#define priv (&simp->private[0]) /*fool gcc to use no extra register*/
-#endif
- void ** buffer_pos = priv->buffer_pos;
- void * res;
-
- if(buffer_pos == priv->postbuffer) {
- buffer_pos = bunch_alloc(simp, buffer_pos);
- }
- buffer_pos--;
- res = *buffer_pos;
- priv->buffer_pos = buffer_pos;
- return res;
-}
-
-#ifdef DEBUG
-long check_header(struct header * hdr, void * ptr)
-{
- void ** test;
-
- if(!hdr) {
- printk("SIMP: simp_free() with NULL pointer\n");
- return 1;
- }
- if(strncmp(hdr->magic, global_magic, 32)) {
- printk("SIMP: simpe_free() with bad ptr %p, or header corruption\n", ptr);
- return 1;
- }
- /* This is brute force, but I don't want to pay for any
- * overhead if debugging is not enabled, in particular
- * no space overhead for keeping hashtables etc. */
- test = hdr->index;
- while(test < CHUNK_END(hdr)) {
- if(*test++ == ptr) {
- printk("SIMP: trying to simp_free(%p) again\n", ptr);
- return 1;
- }
- }
- return 0;
-}
-#endif
-
-static void ** bunch_free(struct simp * simp, void ** buffer)
-{
- void ** stop;
-
- stop = buffer - POSTBUFFER_SIZE/3;
-
- spin_lock(&simp->lock);
- while(buffer > stop) {
- void * elem = buffer[-1];
- struct header * hdr = CHUNK_BASE(elem);
- void ** index = hdr->index;
- index--;
- hdr->index = index;
- *index = elem;
- if(!hdr->next) {
- hdr->next = simp->usable_list;
- simp->usable_list = hdr;
- }
-
- buffer -= 2;
- elem = *buffer;
- hdr = CHUNK_BASE(elem);
- index = hdr->index;
- index--;
- hdr->index = index;
- *index = elem;
- if(!hdr->next) {
- hdr->next = simp->usable_list;
- simp->usable_list = hdr;
- }
- }
- spin_unlock(&simp->lock);
- global->changed_flag = 1;
- return buffer;
-}
-
-void simp_free(void * objp)
-{
- struct header * hdr;
- void ** buffer_pos;
- struct per_processor * private;
-#ifdef __SMP__
- const long cpu = smp_processor_id();
-#else
- const long cpu = 0;
-#endif
-
- hdr = CHUNK_BASE(objp);
-#ifdef DEBUG
- if(check_header(hdr, objp))
- return;
-#endif
-
- private = &hdr->father->private[cpu];
- buffer_pos = private->buffer_pos;
- if(buffer_pos >= private->postbuffer+POSTBUFFER_SIZE) {
- buffer_pos = bunch_free(hdr->father, buffer_pos);
- }
-
- *buffer_pos++ = objp;
- private->buffer_pos = buffer_pos;
-
-#ifdef DEAD_BEEF
- {
- unsigned int * ptr = (unsigned int*)objp;
- int count = (hdr->father->real_size - ELEM_SIZE) / sizeof(unsigned int);
- while(count--)
- *ptr++ = 0xdeadbeef;
- }
-#endif
-}
-
-long simp_garbage(void)
-{
- int i;
- int res;
-
- if(!global->changed_flag)
- return 0; /* shortcut */
- /* Note: costs do not matter here. Any heavy thrashing of
- * simp chunks that could be caused by pools stealing each
- * other's memory has to be considered a BUG :-)
- * Simply avoid memory shortages by conservative allocating
- * policies.
- */
- global->changed_flag = 0;
- res = 0;
- for(i = 0; i < global->nr_simps; i++) {
- struct simp * simp = &global->simps[i];
- struct header ** base = &simp->usable_list;
- struct header * del;
-
- spin_lock(&simp->lock);
- del = *base;
- while(del) {
- if(del->index == del->emptypos) {
- if(simp->dtor) {
- void ** ptr = del->index;
- while(ptr < CHUNK_END(del)) {
- simp->dtor(*ptr++);
- }
- }
- *base = del->next;
-#ifdef DEBUG
- memset(del, 0, CHUNK_SIZE);
-#endif
- free_pages((unsigned long)del, ORDER);
- res++;
- } else
- base = &del->next;
- del = *base;
- }
- spin_unlock(&simp->lock);
- }
- return res;
-}
-
diff --git a/mm/slab.c b/mm/slab.c
index a2ed8c1c5..dc9dc05d2 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -70,7 +70,7 @@
*
* Calls to printk() are not 100% safe (the function is not threaded). However,
* printk() is only used under an error condition, and the risk is v. small (not
- * sure if the console write functions 'enjoy' executing multiple contextes in
+ * sure if the console write functions 'enjoy' executing multiple contexts in
* parallel. I guess they don't...).
* Note, for most calls to printk() any held cache-lock is dropped. This is not
* always done for text size reasons - having *_unlock() everywhere is bloat.
@@ -92,11 +92,11 @@
* index to hold the bufctls. This allows the bufctl structure to
* be small (one word), but limits the number of objects a slab (not
* a cache) can contain when off-slab bufctls are used. The limit is the
- * size of the largest general-cache that does not use off-slab bufctls,
+ * size of the largest general cache that does not use off-slab bufctls,
* divided by the size of a bufctl. For 32bit archs, is this 256/4 = 64.
* This is not serious, as it is only for large objects, when it is unwise
* to have too many per slab.
- * Note: This limit can be raised by introducing a general-cache whose size
+ * Note: This limit can be raised by introducing a general cache whose size
* is less than 512 (PAGE_SIZE<<3), but greater than 256.
*/
@@ -109,7 +109,6 @@
#include <asm/system.h>
#include <asm/atomic.h>
-#include <asm/smp_lock.h>
#include <asm/spinlock.h>
#ifdef __mips__
#include <asm/pgtable.h>
@@ -128,12 +127,12 @@
*
* SLAB_DEBUG_SUPPORT - 1 for kmem_cache_create() to honour; SLAB_DEBUG_FREE,
* SLAB_DEBUG_INITIAL, SLAB_RED_ZONE & SLAB_POISON.
- * 0 for faster, smaller, code (espically in the critical paths).
+ * 0 for faster, smaller, code (especially in the critical paths).
*
* SLAB_STATS - 1 to collect stats for /proc/slabinfo.
- * 0 for faster, smaller, code (espically in the critical paths).
+ * 0 for faster, smaller, code (especially in the critical paths).
*
- * SLAB_SELFTEST - 1 to perform a few tests, mainly for developement.
+ * SLAB_SELFTEST - 1 to perform a few tests, mainly for development.
*/
#define SLAB_MGMT_CHECKS 1
#define SLAB_DEBUG_SUPPORT 0
@@ -184,7 +183,7 @@ typedef struct kmem_slab_s {
s_dma:1;
} kmem_slab_t;
-/* When the slab mgmt is on-slab, this gives the size to use. */
+/* When the slab management is on-slab, this gives the size to use. */
#define slab_align_size (L1_CACHE_ALIGN(sizeof(kmem_slab_t)))
/* Test for end of slab chain. */
@@ -192,7 +191,7 @@ typedef struct kmem_slab_s {
/* s_magic */
#define SLAB_MAGIC_ALLOC 0xA5C32F2BUL /* slab is alive */
-#define SLAB_MAGIC_DESTROYED 0xB2F23C5AUL /* slab has been destoryed */
+#define SLAB_MAGIC_DESTROYED 0xB2F23C5AUL /* slab has been destroyed */
/* Bufctl's are used for linking objs within a slab, identifying what slab an obj
* is in, and the address of the associated obj (for sanity checking with off-slab
@@ -264,9 +263,9 @@ struct kmem_cache_s {
};
/* internal c_flags */
-#define SLAB_CFLGS_OFF_SLAB 0x010000UL /* slab mgmt in own cache */
+#define SLAB_CFLGS_OFF_SLAB 0x010000UL /* slab management in own cache */
#define SLAB_CFLGS_BUFCTL 0x020000UL /* bufctls in own cache */
-#define SLAB_CFLGS_GENERAL 0x080000UL /* a general-cache */
+#define SLAB_CFLGS_GENERAL 0x080000UL /* a general cache */
/* c_dflags (dynamic flags). Need to hold the spinlock to access this member */
#define SLAB_CFLGS_GROWN 0x000002UL /* don't reap a recently grown */
@@ -311,13 +310,15 @@ static void kmem_self_test(void);
/* maximum num of pages for a slab (prevents large requests to the VM layer) */
#define SLAB_MAX_GFP_ORDER 5 /* 32 pages */
-/* the 'prefered' minimum num of objs per slab - maybe less for large objs */
+/* the 'preferred' minimum num of objs per slab - maybe less for large objs */
#define SLAB_MIN_OBJS_PER_SLAB 4
/* If the num of objs per slab is <= SLAB_MIN_OBJS_PER_SLAB,
* then the page order must be less than this before trying the next order.
*/
-#define SLAB_BREAK_GFP_ORDER 2
+#define SLAB_BREAK_GFP_ORDER_HI 2
+#define SLAB_BREAK_GFP_ORDER_LO 1
+static int slab_break_gfp_order = SLAB_BREAK_GFP_ORDER_LO;
/* Macros for storing/retrieving the cachep and or slab from the
* global 'mem_map'. With off-slab bufctls, these are used to find the
@@ -329,7 +330,7 @@ static void kmem_self_test(void);
#define SLAB_SET_PAGE_SLAB(pg, x) ((pg)->prev = (struct page *)(x))
#define SLAB_GET_PAGE_SLAB(pg) ((kmem_slab_t *)(pg)->prev)
-/* Size description struct for general-caches. */
+/* Size description struct for general caches. */
typedef struct cache_sizes {
size_t cs_size;
kmem_cache_t *cs_cachep;
@@ -354,7 +355,7 @@ static cache_sizes_t cache_sizes[] = {
{0, NULL}
};
-/* Names for the general-caches. Not placed into the sizes struct for
+/* Names for the general caches. Not placed into the sizes struct for
* a good reason; the string ptr is not needed while searching in kmalloc(),
* and would 'get-in-the-way' in the h/w cache.
*/
@@ -400,7 +401,7 @@ static struct semaphore cache_chain_sem;
/* Place maintainer for reaping. */
static kmem_cache_t *clock_searchp = &cache_cache;
-/* Internal slab mgmt cache, for when slab mgmt is off-slab. */
+/* Internal slab management cache, for when slab management is off-slab. */
static kmem_cache_t *cache_slabp = NULL;
/* Max number of objs-per-slab for caches which use bufctl's.
@@ -451,6 +452,12 @@ __initfunc(long kmem_cache_init(long start, long end))
cache_cache.c_colour = (i-(cache_cache.c_num*size))/L1_CACHE_BYTES;
cache_cache.c_colour_next = cache_cache.c_colour;
+ /*
+ * Fragmentation resistance on low memory - only use bigger
+ * page orders on machines with more than 32MB of memory.
+ */
+ if (num_physpages > (32 << 20) >> PAGE_SHIFT)
+ slab_break_gfp_order = SLAB_BREAK_GFP_ORDER_HI;
return start;
}
@@ -467,9 +474,9 @@ __initfunc(void kmem_cache_sizes_init(void))
char **names = cache_sizes_name;
cache_sizes_t *sizes = cache_sizes;
do {
- /* For performance, all the general-caches are L1 aligned.
+ /* For performance, all the general caches are L1 aligned.
* This should be particularly beneficial on SMP boxes, as it
- * elimantes "false sharing".
+ * eliminates "false sharing".
* Note for systems short on memory removing the alignment will
* allow tighter packing of the smaller caches. */
if (!(sizes->cs_cachep =
@@ -566,7 +573,7 @@ kmem_check_poison_obj(kmem_cache_t *cachep, void *addr)
}
#endif /* SLAB_DEBUG_SUPPORT */
-/* Three slab chain funcs - all called with ints disabled and the appropiate
+/* Three slab chain funcs - all called with ints disabled and the appropriate
* cache-lock held.
*/
static inline void
@@ -608,7 +615,7 @@ kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp)
{
if (cachep->c_dtor
#if SLAB_DEBUG_SUPPORT
- || cachep->c_flags & (SLAB_POISON || SLAB_RED_ZONE)
+ || cachep->c_flags & (SLAB_POISON | SLAB_RED_ZONE)
#endif /*SLAB_DEBUG_SUPPORT*/
) {
/* Doesn't use the bufctl ptrs to find objs. */
@@ -634,7 +641,7 @@ kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp)
#if SLAB_DEBUG_SUPPORT
else if (cachep->c_flags & SLAB_POISON) {
if (kmem_check_poison_obj(cachep, objp))
- printk(KERN_ERR "kmem_slab_destory: "
+ printk(KERN_ERR "kmem_slab_destroy: "
"Bad poison - %s\n", cachep->c_name);
}
if (cachep->c_flags & SLAB_RED_ZONE)
@@ -718,7 +725,7 @@ kmem_cache_create(const char *name, size_t size, size_t offset,
}
if (offset < 0 || offset > size) {
- printk("%sOffset weired %d - %s\n", func_nm, (int) offset, name);
+ printk("%sOffset weird %d - %s\n", func_nm, (int) offset, name);
offset = 0;
}
@@ -785,11 +792,11 @@ kmem_cache_create(const char *name, size_t size, size_t offset,
if (flags & SLAB_HWCACHE_ALIGN)
align = L1_CACHE_BYTES;
- /* Determine if the slab mgmt and/or bufclts are 'on' or 'off' slab. */
+ /* Determine if the slab management and/or bufclts are 'on' or 'off' slab. */
extra = sizeof(kmem_bufctl_t);
if (size < (PAGE_SIZE>>3)) {
/* Size is small(ish). Use packing where bufctl size per
- * obj is low, and slab mngmnt is on-slab.
+ * obj is low, and slab management is on-slab.
*/
#if 0
if ((flags & SLAB_HIGH_PACK)) {
@@ -806,7 +813,7 @@ kmem_cache_create(const char *name, size_t size, size_t offset,
}
#endif
} else {
- /* Size is large, assume best to place the slab mngmnt obj
+ /* Size is large, assume best to place the slab management obj
* off-slab (should allow better packing of objs).
*/
flags |= SLAB_CFLGS_OFF_SLAB;
@@ -815,7 +822,7 @@ kmem_cache_create(const char *name, size_t size, size_t offset,
/* To avoid waste the bufctls are off-slab... */
flags |= SLAB_CFLGS_BUFCTL;
extra = 0;
- } /* else slab mngmnt is off-slab, but freelist ptrs are on. */
+ } /* else slab management is off-slab, but freelist pointers are on. */
}
size += extra;
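The rule of thumb in the hunk above: objects smaller than an eighth of a page keep their management data (and a per-object bufctl) inside the slab, while larger objects push the management structure off-slab so the pages hold nothing but objects. A compact sketch of that threshold decision; the flag value is invented, only the PAGE_SIZE>>3 test mirrors the patch.

#include <stdio.h>

#define PAGE_SIZE           4096UL
#define SLAB_CFLGS_OFF_SLAB 0x010UL   /* invented flag value for the example */

/* Decide where the slab management data lives for a given object size. */
static unsigned long place_slab_mgmt(unsigned long obj_size)
{
	unsigned long flags = 0;

	if (obj_size < (PAGE_SIZE >> 3)) {
		/* Small(ish) objects: keep management on-slab; the per-object
		 * bufctl overhead is low relative to the object size. */
	} else {
		/* Large objects: management goes off-slab so the slab's pages
		 * can be packed with objects only. */
		flags |= SLAB_CFLGS_OFF_SLAB;
	}
	return flags;
}

int main(void)
{
	printf("128-byte objects:  %s\n",
	       place_slab_mgmt(128) & SLAB_CFLGS_OFF_SLAB ? "off-slab" : "on-slab");
	printf("2048-byte objects: %s\n",
	       place_slab_mgmt(2048) & SLAB_CFLGS_OFF_SLAB ? "off-slab" : "on-slab");
	return 0;
}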
@@ -873,7 +880,7 @@ cal_wastage:
* bad for the gfp()s.
*/
if (cachep->c_num <= SLAB_MIN_OBJS_PER_SLAB) {
- if (cachep->c_gfporder < SLAB_BREAK_GFP_ORDER)
+ if (cachep->c_gfporder < slab_break_gfp_order)
goto next;
}
@@ -1022,8 +1029,8 @@ kmem_cache_shrink(kmem_cache_t *cachep)
printk(KERN_ERR "kmem_shrink: Invalid cache addr %p\n", cachep);
return 2;
found:
- /* Relase the sempahore before getting the cache-lock. This could
- * mean multiple engines are shrinking the cache, but so what...
+ /* Release the semaphore before getting the cache-lock. This could
+ * mean multiple engines are shrinking the cache, but so what.
*/
up(&cache_chain_sem);
spin_lock_irq(&cachep->c_spinlock);
@@ -1045,17 +1052,17 @@ found:
return ret;
}
-/* Get the mem for a slab mgmt obj. */
+/* Get the memory for a slab management obj. */
static inline kmem_slab_t *
kmem_cache_slabmgmt(kmem_cache_t *cachep, void *objp, int local_flags)
{
kmem_slab_t *slabp;
if (SLAB_OFF_SLAB(cachep->c_flags)) {
- /* Slab mgmt obj is off-slab. */
+ /* Slab management obj is off-slab. */
slabp = kmem_cache_alloc(cache_slabp, local_flags);
} else {
- /* Slab mgmnt at end of slab mem, placed so that
+ /* Slab management at end of slab memory, placed so that
* the position is 'coloured'.
*/
void *end;
@@ -1203,7 +1210,7 @@ re_try:
if (!(objp = kmem_getpages(cachep, flags, &dma)))
goto failed;
- /* Get slab mgmt. */
+ /* Get slab management. */
if (!(slabp = kmem_cache_slabmgmt(cachep, objp+offset, local_flags)))
goto opps1;
if (dma)
@@ -1257,7 +1264,7 @@ failed:
if (local_flags != SLAB_ATOMIC && cachep->c_gfporder) {
/* For large order (>0) slabs, we try again.
* Needed because the gfp() functions are not good at giving
- * out contigious pages unless pushed (but do not push too hard).
+ * out contiguous pages unless pushed (but do not push too hard).
*/
if (cachep->c_failures++ < 4 && cachep->c_freep == kmem_slab_end(cachep))
goto re_try;
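The retry in the hunk above exists because a higher-order (contiguous, multi-page) allocation can fail transiently, so the cache tries again a bounded number of times instead of hammering the page allocator. A toy version of that bounded-retry loop; try_alloc_contiguous() is a made-up stand-in that fails a couple of times before succeeding.

#include <stdio.h>

static int fail_first;   /* how many times the fake allocator still fails */

/* Made-up stand-in for a contiguous multi-page allocation. */
static void *try_alloc_contiguous(void)
{
	static char buffer[8192];

	if (fail_first > 0) {
		fail_first--;
		return NULL;            /* pretend no contiguous run was found */
	}
	return buffer;
}

int main(void)
{
	int failures = 0;
	void *objp;

	fail_first = 2;
	for (;;) {
		objp = try_alloc_contiguous();
		if (objp)
			break;
		/* Retry a handful of times, but do not push too hard. */
		if (failures++ >= 4) {
			printf("giving up after %d failures\n", failures);
			return 1;
		}
		printf("retrying (failure %d)\n", failures);
	}
	printf("got contiguous memory after %d failures\n", failures);
	return 0;
}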
@@ -1648,19 +1655,19 @@ kfree(const void *objp)
goto bad_ptr;
/* Assume we own the page structure - hence no locking.
- * If someone is misbehaving (eg. someone calling us with a bad
+ * If someone is misbehaving (for example, calling us with a bad
* address), then access to the page structure can race with the
- * kmem_slab_destory() code. Need to add a spin_lock to each page
+ * kmem_slab_destroy() code. Need to add a spin_lock to each page
* structure, which would be useful in threading the gfp() functions....
*/
page = &mem_map[nr];
if (PageSlab(page)) {
kmem_cache_t *cachep;
- /* Here, we (again) assume the obj address is good.
+ /* Here, we again assume the obj address is good.
* If it isn't, and happens to map onto another
- * general-cache page which has no active objs, then
- * we race....
+ * general cache page which has no active objs, then
+ * we race.
*/
cachep = SLAB_GET_PAGE_CACHE(page);
if (cachep && (cachep->c_flags & SLAB_CFLGS_GENERAL)) {
@@ -1714,9 +1721,9 @@ kmem_find_general_cachep(size_t size)
{
cache_sizes_t *csizep = cache_sizes;
- /* This function could be moved to the header-file, and
+ /* This function could be moved to the header file, and
* made inline so consumers can quickly determine what
- * cache-ptr they require.
+ * cache pointer they require.
*/
for (; csizep->cs_size; csizep++) {
if (size > csizep->cs_size)
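kmem_find_general_cachep() is a linear scan of the ascending cache_sizes[] table: skip every class smaller than the request and return the first one big enough. A user-space sketch of the same lookup; the size classes and names here are chosen for illustration and stand in for the real cache pointers.

#include <stdio.h>
#include <stddef.h>

struct size_class {
	size_t      cs_size;
	const char *cs_name;        /* stands in for the cache pointer */
};

/* Ascending table terminated by a zero size - same shape as cache_sizes[]. */
static struct size_class classes[] = {
	{   32, "size-32"   },
	{   64, "size-64"   },
	{  128, "size-128"  },
	{  256, "size-256"  },
	{  512, "size-512"  },
	{ 1024, "size-1024" },
	{    0, NULL        },
};

/* Return the first class whose size is >= the request, or NULL. */
static const char *find_general_cache(size_t size)
{
	struct size_class *csizep;

	for (csizep = classes; csizep->cs_size; csizep++) {
		if (size > csizep->cs_size)
			continue;
		return csizep->cs_name;
	}
	return NULL;                /* too big for any general cache */
}

int main(void)
{
	printf("100 bytes -> %s\n", find_general_cache(100));
	printf("600 bytes -> %s\n", find_general_cache(600));
	return 0;
}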
@@ -1745,7 +1752,7 @@ kmem_cache_reap(int gfp_mask)
return;
}
- /* We really need a test semphore op so we can avoid sleeping when
+ /* We really need a test semaphore op so we can avoid sleeping when
* !wait is true.
*/
down(&cache_chain_sem);
@@ -1778,8 +1785,8 @@ kmem_cache_reap(int gfp_mask)
dma_flag = 0;
full_free = 0;
- /* Count num of fully free slabs. Hopefully there are not many,
- * we are holding the cache lock....
+ /* Count the fully free slabs. There should not be many,
+ * since we are holding the cache lock.
*/
slabp = searchp->c_lastp;
while (!slabp->s_inuse && slabp != kmem_slab_end(searchp)) {
@@ -1819,7 +1826,7 @@ next:
up(&cache_chain_sem);
if (!best_cachep) {
- /* couldn't find anthying to reap */
+ /* couldn't find anything to reap */
return;
}
diff --git a/mm/swap.c b/mm/swap.c
index c760208da..3cedb215c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -6,7 +6,7 @@
/*
* This file contains the default values for the operation of the
- * Linux VM subsystem. Finetuning documentation can be found in
+ * Linux VM subsystem. Fine-tuning documentation can be found in
* linux/Documentation/sysctl/vm.txt.
* Started 18.12.91
* Swap aging added 23.2.95, Stephen Tweedie.
@@ -67,9 +67,9 @@ swap_control_t swap_control = {
swapstat_t swapstats = {0};
buffer_mem_t buffer_mem = {
- 3, /* minimum percent buffer */
- 10, /* borrow percent buffer */
- 30 /* maximum percent buffer */
+ 5, /* minimum percent buffer */
+ 25, /* borrow percent buffer */
+ 50 /* maximum percent buffer */
};
buffer_mem_t page_cache = {
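The three fields are percentages of physical memory that balance the buffer cache against other memory users: roughly, buffers below min_percent are left alone, growth past borrow_percent triggers extra shrinking, and max_percent acts as a ceiling. A stand-alone sketch of how such a threshold test is written without division; the page counts and total memory are made-up values, not kernel state.

#include <stdio.h>

struct buffer_mem_t {
	int min_percent;
	int borrow_percent;
	int max_percent;
};

int main(void)
{
	struct buffer_mem_t buffer_mem = { 5, 25, 50 };
	unsigned long num_physpages = 16384;   /* pretend 64MB of 4KB pages */
	unsigned long buffer_pages  = 5000;    /* pretend buffer-cache size in pages */

	/* Same shape as the kernel tests: compare a page count scaled by 100
	 * against a percentage of total memory, avoiding any division. */
	if (buffer_pages * 100 > (unsigned long) buffer_mem.borrow_percent * num_physpages)
		printf("over borrow_percent: shrink the buffer cache harder\n");
	else if (buffer_pages * 100 < (unsigned long) buffer_mem.min_percent * num_physpages)
		printf("under min_percent: leave buffer pages alone\n");
	else
		printf("between min and borrow: normal aging\n");
	return 0;
}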
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b91583340..401c7a1fc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -146,42 +146,23 @@ void remove_from_swap_cache(struct page *page)
"on page %08lx\n", page_address(page));
}
/*
- * This will be a legal case once we have a more mature swap cache.
+ * This is a legal case, but warn about it.
*/
if (atomic_read(&page->count) == 1) {
- printk ("VM: Removing page cache on unshared page %08lx\n",
+ printk (KERN_WARNING
+ "VM: Removing page cache on unshared page %08lx\n",
page_address(page));
- return;
}
-
#ifdef DEBUG_SWAP
printk("DebugVM: remove_from_swap_cache(%08lx count %d)\n",
page_address(page), atomic_read(&page->count));
#endif
- remove_page_from_hash_queue (page);
- remove_page_from_inode_queue (page);
PageClearSwapCache (page);
- __free_page (page);
+ remove_inode_page(page);
}
-long find_in_swap_cache(struct page *page)
-{
-#ifdef SWAP_CACHE_INFO
- swap_cache_find_total++;
-#endif
- if (PageSwapCache (page)) {
- long entry = page->offset;
-#ifdef SWAP_CACHE_INFO
- swap_cache_find_success++;
-#endif
- remove_from_swap_cache (page);
- return entry;
- }
- return 0;
-}
-
int delete_from_swap_cache(struct page *page)
{
#ifdef SWAP_CACHE_INFO
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d935433bb..45f73de02 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -28,10 +28,7 @@
unsigned int nr_swapfiles = 0;
-static struct {
- int head; /* head of priority-ordered swapfile list */
- int next; /* swapfile to be used next */
-} swap_list = {-1, -1};
+struct swap_list_t swap_list = {-1, -1};
struct swap_info_struct swap_info[MAX_SWAPFILES];
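As the removed comments note, swap_list keeps two indices into swap_info[]: head is the highest-priority swap area and next is the area to allocate from next, with each area chaining to the next lower-priority one through an index field and -1 ending the list. An illustrative user-space walk of such a structure; the swap_info_struct here is a reduced, hypothetical version, not the kernel's full definition.

#include <stdio.h>

#define MAX_SWAPFILES 8

struct swap_list_t {
	int head;   /* index of the highest-priority swap area */
	int next;   /* index of the area to allocate from next */
};

struct swap_info_struct {
	int prio;   /* higher value = preferred */
	int next;   /* index of the next area in priority order, -1 ends the list */
	int pages;  /* free pages left (toy value) */
};

static struct swap_list_t swap_list = { -1, -1 };
static struct swap_info_struct swap_info[MAX_SWAPFILES];

int main(void)
{
	int i;

	/* Two swap areas: slot 0 at priority 1, slot 1 at priority 5. */
	swap_info[0].prio = 1;  swap_info[0].next = -1;  swap_info[0].pages = 100;
	swap_info[1].prio = 5;  swap_info[1].next = 0;   swap_info[1].pages = 200;
	swap_list.head = swap_list.next = 1;    /* highest priority goes first */

	/* Walk the areas in priority order, the way the allocator would. */
	for (i = swap_list.head; i >= 0; i = swap_info[i].next)
		printf("swap area %d: prio %d, %d pages free\n",
		       i, swap_info[i].prio, swap_info[i].pages);
	return 0;
}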
@@ -180,7 +177,7 @@ bad_free:
* that the page has been used or is no longer needed.
*
* Always set the resulting pte to be nowrite (the same as COW pages
- * after one process has exited). We don't know just how many ptes will
+ * after one process has exited). We don't know just how many PTEs will
* share this swap entry, so be cautious and let do_wp_page work out
* what to do if a write is requested later.
*/
@@ -535,6 +532,7 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
error = blkdev_open(swap_dentry->d_inode, &filp);
if (error)
goto bad_swap_2;
+ set_blocksize(p->swap_device, PAGE_SIZE);
error = -ENODEV;
if (!p->swap_device ||
(blk_size[MAJOR(p->swap_device)] &&
@@ -595,7 +593,7 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
p->flags = SWP_WRITEOK;
p->pages = j;
nr_swap_pages += j;
- printk("Adding Swap: %dk swap-space (priority %d)\n",
+ printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
j<<(PAGE_SHIFT-10), p->prio);
/* insert swap space into swap_list: */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6b87beaa2..e7711c23c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -135,12 +135,16 @@ int vmalloc_area_pages(unsigned long address, unsigned long size, pgprot_t prot)
dir = pgd_offset_k(address);
flush_cache_all();
while (address < end) {
- pmd_t *pmd = pmd_alloc_kernel(dir, address);
+ pmd_t *pmd;
+ pgd_t olddir = *dir;
+
+ pmd = pmd_alloc_kernel(dir, address);
if (!pmd)
return -ENOMEM;
if (alloc_area_pmd(pmd, address, end - address, prot))
return -ENOMEM;
- set_pgdir(address, *dir);
+ if (pgd_val(olddir) != pgd_val(*dir))
+ set_pgdir(address, *dir);
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
}
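The change above records the pgd entry before pmd_alloc_kernel() and calls set_pgdir() only when the entry actually changed, i.e. when a new pmd page was installed. A tiny sketch of that "save, maybe update, propagate only on change" pattern with plain integers; update_all_copies() and maybe_allocate() are hypothetical stand-ins for set_pgdir() and pmd_alloc_kernel().

#include <stdio.h>

static unsigned long dir_entry;   /* stands in for a pgd entry */

/* Hypothetical stand-in for set_pgdir(): would copy the entry
 * into every other page directory in the system. */
static void update_all_copies(unsigned long val)
{
	printf("propagating new entry %#lx\n", val);
}

/* Stands in for pmd_alloc_kernel(): fills the entry only if it is empty. */
static void maybe_allocate(unsigned long *entry)
{
	if (*entry == 0)
		*entry = 0x1000;      /* pretend a new pmd page was installed */
}

int main(void)
{
	int i;

	for (i = 0; i < 2; i++) {
		unsigned long old = dir_entry;    /* like: pgd_t olddir = *dir; */

		maybe_allocate(&dir_entry);
		if (old != dir_entry)             /* only propagate real changes */
			update_all_copies(dir_entry);
		else
			printf("entry unchanged, skipping propagation\n");
	}
	return 0;
}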
@@ -150,21 +154,22 @@ int vmalloc_area_pages(unsigned long address, unsigned long size, pgprot_t prot)
struct vm_struct * get_vm_area(unsigned long size)
{
- void *addr;
+ unsigned long addr;
struct vm_struct **p, *tmp, *area;
area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
if (!area)
return NULL;
- addr = (void *) VMALLOC_START;
- area->size = size + PAGE_SIZE;
- area->next = NULL;
+ addr = VMALLOC_START;
for (p = &vmlist; (tmp = *p) ; p = &tmp->next) {
- if (size + (unsigned long) addr < (unsigned long) tmp->addr)
+ if (size + addr < (unsigned long) tmp->addr)
break;
- addr = (void *) (tmp->size + (unsigned long) tmp->addr);
+ addr = tmp->size + (unsigned long) tmp->addr;
+ if (addr > VMALLOC_END-size)
+ return NULL;
}
- area->addr = addr;
+ area->addr = (void *)addr;
+ area->size = size + PAGE_SIZE;
area->next = *p;
*p = area;
return area;
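get_vm_area() now does a first-fit scan of the sorted vmlist starting at VMALLOC_START, stopping at the first hole large enough and failing if the candidate address would run past VMALLOC_END; the stored size also includes one guard page. The following stand-alone sketch applies the same first-fit scan to a small list of fake regions; the bounds, the region struct, and the sizes are invented for illustration.

#include <stdio.h>

#define PAGE_SIZE     4096UL
#define VMALLOC_START 0x10000000UL   /* invented bounds for the example */
#define VMALLOC_END   0x10020000UL

struct region {
	unsigned long addr;
	unsigned long size;          /* includes the guard page */
	struct region *next;
};

/* First fit: walk the sorted list and return the first gap that holds
 * 'size' bytes, or 0 if the search would run past VMALLOC_END. */
static unsigned long find_gap(struct region *list, unsigned long size)
{
	unsigned long addr = VMALLOC_START;
	struct region *tmp;

	for (tmp = list; tmp; tmp = tmp->next) {
		if (addr + size < tmp->addr)
			break;                        /* hole before this region */
		addr = tmp->addr + tmp->size;         /* skip past it */
		if (addr > VMALLOC_END - size)
			return 0;                     /* out of address space */
	}
	return addr;
}

int main(void)
{
	struct region b = { VMALLOC_START + 0x8000, 0x3000, NULL };
	struct region a = { VMALLOC_START,          0x2000, &b };
	unsigned long got = find_gap(&a, 0x1000 + PAGE_SIZE);

	if (got)
		printf("allocating at %#lx\n", got);
	else
		printf("no room left\n");
	return 0;
}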
@@ -217,16 +222,18 @@ void * vmalloc(unsigned long size)
long vread(char *buf, char *addr, unsigned long count)
{
- struct vm_struct **p, *tmp;
+ struct vm_struct *tmp;
char *vaddr, *buf_start = buf;
- int n;
+ unsigned long n;
/* Don't allow overflow */
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
- for (p = &vmlist; (tmp = *p) ; p = &tmp->next) {
+ for (tmp = vmlist; tmp; tmp = tmp->next) {
vaddr = (char *) tmp->addr;
+ if (addr >= vaddr + tmp->size - PAGE_SIZE)
+ continue;
while (addr < vaddr) {
if (count == 0)
goto finished;
@@ -235,17 +242,15 @@ long vread(char *buf, char *addr, unsigned long count)
addr++;
count--;
}
- n = tmp->size - PAGE_SIZE;
- if (addr > vaddr)
- n -= addr - vaddr;
- while (--n >= 0) {
+ n = vaddr + tmp->size - PAGE_SIZE - addr;
+ do {
if (count == 0)
goto finished;
put_user(*addr, buf);
buf++;
addr++;
count--;
- }
+ } while (--n > 0);
}
finished:
return buf - buf_start;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 919b97244..b586bce72 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -42,7 +42,7 @@ int swapout_interval = HZ / 4;
/*
* The wait queue for waking up the pageout daemon:
*/
-static struct wait_queue * kswapd_wait = NULL;
+struct wait_queue * kswapd_wait = NULL;
static void init_swap_timer(void);
@@ -88,7 +88,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
* pages, then delete the swap cache. We can only do this if
* the swap page's reference count is one: ie. there are no
* other references to it beyond the swap cache (as there must
- * still be pte's pointing to it if count > 1).
+ * still be PTEs pointing to it if count > 1).
*
* If the page has NOT been touched, and its age reaches zero,
* then we are swapping it out:
@@ -107,7 +107,17 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
if (PageSwapCache(page_map)) {
if (pte_write(pte)) {
+ struct page *found;
printk ("VM: Found a writable swap-cached page!\n");
+ /* Try to diagnose the problem ... */
+ found = find_page(&swapper_inode, page_map->offset);
+ if (found) {
+ printk("page=%p@%08lx, found=%p, count=%d\n",
+ page_map, page_map->offset,
+ found, atomic_read(&found->count));
+ __free_page(found);
+ } else
+ printk ("Spurious, page not in cache\n");
return 0;
}
}
@@ -144,9 +154,8 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
* we have the swap cache set up to associate the
* page with that swap entry.
*/
- if (PageSwapCache(page_map)) {
- entry = page_map->offset;
- } else {
+ entry = in_swap_cache(page_map);
+ if (!entry) {
entry = get_swap_page();
if (!entry)
return 0; /* No swap space left */
@@ -219,8 +228,8 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
flush_cache_page(vma, address);
pte_clear(page_table);
flush_tlb_page(vma, address);
- entry = page_unuse(page);
- free_page(page);
+ entry = page_unuse(page_map);
+ __free_page(page_map);
return entry;
}
@@ -435,7 +444,7 @@ out:
* to be. This works out OK, because we now do proper aging on page
* contents.
*/
-static inline int do_try_to_free_page(int gfp_mask)
+static int do_try_to_free_page(int gfp_mask)
{
static int state = 0;
int i=6;
@@ -448,9 +457,10 @@ static inline int do_try_to_free_page(int gfp_mask)
stop = 3;
if (gfp_mask & __GFP_WAIT)
stop = 0;
+
if (((buffermem >> PAGE_SHIFT) * 100 > buffer_mem.borrow_percent * num_physpages)
|| (page_cache_size * 100 > page_cache.borrow_percent * num_physpages))
- state = 0;
+ shrink_mmap(i, gfp_mask);
switch (state) {
do {
@@ -459,7 +469,7 @@ static inline int do_try_to_free_page(int gfp_mask)
return 1;
state = 1;
case 1:
- if ((gfp_mask & __GFP_IO) && shm_swap(i, gfp_mask))
+ if (shm_swap(i, gfp_mask))
return 1;
state = 2;
case 2:
@@ -476,23 +486,6 @@ static inline int do_try_to_free_page(int gfp_mask)
}
/*
- * This is REALLY ugly.
- *
- * We need to make the locks finer granularity, but right
- * now we need this so that we can do page allocations
- * without holding the kernel lock etc.
- */
-int try_to_free_page(int gfp_mask)
-{
- int retval;
-
- lock_kernel();
- retval = do_try_to_free_page(gfp_mask);
- unlock_kernel();
- return retval;
-}
-
-/*
* Before we start the kernel thread, print out the
* kswapd initialization message (otherwise the init message
* may be printed in the middle of another driver's init
@@ -532,7 +525,7 @@ int kswapd(void *unused)
/* Give kswapd a realtime priority. */
current->policy = SCHED_FIFO;
- current->priority = 32; /* Fixme --- we need to standardise our
+ current->rt_priority = 32; /* Fixme --- we need to standardise our
namings for POSIX.4 realtime scheduling
priorities. */
@@ -540,7 +533,6 @@ int kswapd(void *unused)
add_wait_queue(&kswapd_wait, &wait);
while (1) {
int tries;
- int tried = 0;
current->state = TASK_INTERRUPTIBLE;
flush_signals(current);
@@ -564,29 +556,56 @@ int kswapd(void *unused)
* woken up more often and the rate will be even
* higher).
*/
- tries = pager_daemon.tries_base >> free_memory_available(3);
-
- while (tries--) {
- int gfp_mask;
+ tries = pager_daemon.tries_base;
+ tries >>= 4*free_memory_available();
- if (++tried > pager_daemon.tries_min && free_memory_available(0))
- break;
- gfp_mask = __GFP_IO;
- try_to_free_page(gfp_mask);
+ do {
+ do_try_to_free_page(0);
/*
* Syncing large chunks is faster than swapping
* synchronously (less head movement). -- Rik.
*/
if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
run_task_queue(&tq_disk);
-
- }
+ if (free_memory_available() > 1)
+ break;
+ } while (--tries > 0);
}
/* As if we could ever get here - maybe we want to make this killable */
remove_wait_queue(&kswapd_wait, &wait);
+ unlock_kernel();
return 0;
}
+/*
+ * We need to make the locks finer granularity, but right
+ * now we need this so that we can do page allocations
+ * without holding the kernel lock etc.
+ *
+ * The "PF_MEMALLOC" flag protects us against recursion:
+ * if we need more memory as part of a swap-out effort we
+ * will just silently return "success" to tell the page
+ * allocator to accept the allocation.
+ */
+int try_to_free_pages(unsigned int gfp_mask, int count)
+{
+ int retval = 1;
+
+ lock_kernel();
+ if (!(current->flags & PF_MEMALLOC)) {
+ current->flags |= PF_MEMALLOC;
+ do {
+ retval = do_try_to_free_page(gfp_mask);
+ if (!retval)
+ break;
+ count--;
+ } while (count > 0);
+ current->flags &= ~PF_MEMALLOC;
+ }
+ unlock_kernel();
+ return retval;
+}
+
/*
* The swap_tick function gets called on every clock tick.
*/
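The PF_MEMALLOC comment above describes a recursion guard for try_to_free_pages(): if the flag is already set, the caller is in the middle of its own swap-out work, so the function reports success without recursing; otherwise it sets the flag, frees up to count pages, and clears the flag again. A user-space sketch of that guard; the flag value and free_one_page() are stand-ins, not kernel objects.

#include <stdio.h>

#define PF_MEMALLOC 0x00000800   /* stand-in flag value for the example */

static unsigned long flags;      /* stands in for current->flags */

/* Pretend to free one page; in the kernel this work might itself
 * need memory and call back into the same function. */
static int free_one_page(void)
{
	return 1;
}

static int try_to_free_pages(int count)
{
	int retval = 1;

	if (!(flags & PF_MEMALLOC)) {
		flags |= PF_MEMALLOC;        /* mark: we are the reclaimer */
		do {
			retval = free_one_page();
			if (!retval)
				break;
			count--;
		} while (count > 0);
		flags &= ~PF_MEMALLOC;       /* done, allow reclaim again */
	}
	/* If the flag was already set we fall through and return success,
	 * so a recursive caller is not blocked by its own reclaim. */
	return retval;
}

int main(void)
{
	printf("first call:  %d\n", try_to_free_pages(4));
	flags |= PF_MEMALLOC;                /* simulate a recursive call */
	printf("nested call: %d\n", try_to_free_pages(4));
	return 0;
}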
@@ -606,11 +625,11 @@ void swap_tick(void)
* Schedule for wakeup if there isn't lots
* of free memory.
*/
- switch (free_memory_available(3)) {
+ switch (free_memory_available()) {
case 0:
want = now;
/* Fall through */
- case 1 ... 3:
+ case 1:
want_wakeup = 1;
default:
}