author | Ralf Baechle <ralf@linux-mips.org> | 1997-06-01 03:16:17 +0000
---|---|---
committer | Ralf Baechle <ralf@linux-mips.org> | 1997-06-01 03:16:17 +0000
commit | d8d9b8f76f22b7a16a83e261e64f89ee611f49df (patch)
tree | 3067bc130b80d52808e6390c9fc7fc087ec1e33c /mm
parent | 19c9bba94152148523ba0f7ef7cffe3d45656b11 (diff)
Initial revision
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Makefile | 2
-rw-r--r-- | mm/filemap.c | 4
-rw-r--r-- | mm/kmalloc.c | 453
-rw-r--r-- | mm/memory.c | 81
-rw-r--r-- | mm/mmap.c | 690
-rw-r--r-- | mm/page_alloc.c | 33
-rw-r--r-- | mm/page_io.c | 8
-rw-r--r-- | mm/slab.c | 2338
-rw-r--r-- | mm/swap.c | 5
-rw-r--r-- | mm/swap_state.c | 5
-rw-r--r-- | mm/swapfile.c | 25
-rw-r--r-- | mm/vmscan.c | 76
12 files changed, 1609 insertions, 2111 deletions
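
A change that recurs throughout the diff below is the switch from the old value-returning set_bit()/clear_bit()/change_bit() calls to the explicitly named test_and_set_bit()/test_and_clear_bit()/test_and_change_bit() variants (see the hunks in mm/filemap.c, mm/page_io.c and mm/page_alloc.c). The callers were already relying on getting the previous bit value back; only the name changes to make the test-and-set semantics explicit. The following is a minimal, non-atomic userspace sketch of those semantics, not the kernel implementation: the real primitives are per-architecture and atomic (<asm/bitops.h>), and the PG_locked/PG_referenced bit numbers used here are placeholders. It mimics the page-lock loop in generic_file_write() and the PG_referenced test in shrink_mmap().

/* Minimal userspace model of the test-and-set bit semantics the patch
 * relies on: each primitive returns the PREVIOUS value of the bit.
 * Illustrative only - the real kernel primitives are atomic, and the
 * bit numbers below are placeholders.
 */
#include <stdio.h>

static int test_and_set_bit(int nr, unsigned long *addr)
{
	unsigned long mask = 1UL << nr;
	int old = (*addr & mask) != 0;

	*addr |= mask;
	return old;
}

static int test_and_clear_bit(int nr, unsigned long *addr)
{
	unsigned long mask = 1UL << nr;
	int old = (*addr & mask) != 0;

	*addr &= ~mask;
	return old;
}

#define PG_locked	0
#define PG_referenced	1

int main(void)
{
	/* Pretend the page was referenced a while ago. */
	unsigned long flags = 1UL << PG_referenced;

	/* Lock acquisition as in generic_file_write(): keep waiting until
	 * the previous value of PG_locked was 0, i.e. we took the lock.
	 */
	while (test_and_set_bit(PG_locked, &flags))
		; /* wait_on_page(page) in the kernel */

	printf("PG_locked taken, flags = %#lx\n", flags);

	/* shrink_mmap(): don't free a page that was recently referenced,
	 * and clear the reference bit so it ages out next time round.
	 */
	if (test_and_clear_bit(PG_referenced, &flags))
		printf("page was recently referenced, not freed\n");
	else
		printf("page not referenced, candidate for freeing\n");
	return 0;
}

The same rename also shows up in the swap lockmap handling in mm/page_io.c and in the buddy-bitmap test in free_pages_ok() in mm/page_alloc.c (change_bit becoming test_and_change_bit).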
diff --git a/mm/Makefile b/mm/Makefile index 5f5156049..c64eefbd2 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -9,7 +9,7 @@ O_TARGET := mm.o O_OBJS := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ - kmalloc.o vmalloc.o slab.o \ + vmalloc.o slab.o \ swap.o vmscan.o page_io.o page_alloc.o swap_state.o swapfile.o include $(TOPDIR)/Rules.make diff --git a/mm/filemap.c b/mm/filemap.c index 6f58da546..88c2fd49d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -171,7 +171,7 @@ int shrink_mmap(int priority, int dma) switch (atomic_read(&page->count)) { case 1: /* If it has been referenced recently, don't free it */ - if (clear_bit(PG_referenced, &page->flags)) + if (test_and_clear_bit(PG_referenced, &page->flags)) break; /* is it a page cache page? */ @@ -1342,7 +1342,7 @@ generic_file_write(struct inode *inode, struct file *file, const char *buf, unsi } lockit: - while (set_bit(PG_locked, &page->flags)) + while (test_and_set_bit(PG_locked, &page->flags)) wait_on_page(page); /* diff --git a/mm/kmalloc.c b/mm/kmalloc.c deleted file mode 100644 index 9de1bff51..000000000 --- a/mm/kmalloc.c +++ /dev/null @@ -1,453 +0,0 @@ -/* - * linux/mm/kmalloc.c - * - * Copyright (C) 1991, 1992 Linus Torvalds & Roger Wolff. - * - * Written by R.E. Wolff Sept/Oct '93. - * - */ - -/* - * Modified by Alex Bligh (alex@cconcepts.co.uk) 4 Apr 1994 to use multiple - * pages. So for 'page' throughout, read 'area'. - * - * Largely rewritten.. Linus - */ - -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/interrupt.h> - -#include <asm/system.h> -#include <asm/dma.h> - -/* Define this if you want slow routines that try to trip errors */ -#undef SADISTIC_KMALLOC - -/* Private flags. */ - -#define MF_USED 0xffaa0055 -#define MF_DMA 0xff00aa55 -#define MF_FREE 0x0055ffaa - - -/* - * Much care has gone into making these routines in this file reentrant. - * - * The fancy bookkeeping of nbytesmalloced and the like are only used to - * report them to the user (oooohhhhh, aaaaahhhhh....) are not - * protected by cli(). (If that goes wrong. So what?) - * - * These routines restore the interrupt status to allow calling with ints - * off. - */ - -/* - * A block header. This is in front of every malloc-block, whether free or not. - */ -struct block_header { - unsigned long bh_flags; - union { - unsigned long ubh_length; - struct block_header *fbh_next; - } vp; -}; - - -#define bh_length vp.ubh_length -#define bh_next vp.fbh_next -#define BH(p) ((struct block_header *)(p)) - - -/* - * The page descriptor is at the front of every page that malloc has in use. - */ -struct page_descriptor { - struct page_descriptor *next; - struct block_header *firstfree; - int order; - int nfree; -}; - - -#define PAGE_DESC(p) ((struct page_descriptor *)(((unsigned long)(p)) & PAGE_MASK)) - - -/* - * A size descriptor describes a specific class of malloc sizes. - * Each class of sizes has its own freelist. - */ -struct size_descriptor { - struct page_descriptor *firstfree; - struct page_descriptor *dmafree; /* DMA-able memory */ - int nblocks; - - int nmallocs; - int nfrees; - int nbytesmalloced; - int npages; - unsigned long gfporder; /* number of pages in the area required */ -}; - -/* - * For now it is unsafe to allocate bucket sizes between n and - * n-sizeof(page_descriptor) where n is PAGE_SIZE * any power of two - * - * The blocksize and sizes arrays _must_ match! 
- */ -#if PAGE_SIZE == 4096 -static const unsigned int blocksize[] = { - 32, - 64, - 128, - 252, - 508, - 1020, - 2040, - 4096 - 16, - 8192 - 16, - 16384 - 16, - 32768 - 16, - 65536 - 16, - 131072 - 16, - 0 -}; - -static struct size_descriptor sizes[] = -{ - {NULL, NULL, 127, 0, 0, 0, 0, 0}, - {NULL, NULL, 63, 0, 0, 0, 0, 0}, - {NULL, NULL, 31, 0, 0, 0, 0, 0}, - {NULL, NULL, 16, 0, 0, 0, 0, 0}, - {NULL, NULL, 8, 0, 0, 0, 0, 0}, - {NULL, NULL, 4, 0, 0, 0, 0, 0}, - {NULL, NULL, 2, 0, 0, 0, 0, 0}, - {NULL, NULL, 1, 0, 0, 0, 0, 0}, - {NULL, NULL, 1, 0, 0, 0, 0, 1}, - {NULL, NULL, 1, 0, 0, 0, 0, 2}, - {NULL, NULL, 1, 0, 0, 0, 0, 3}, - {NULL, NULL, 1, 0, 0, 0, 0, 4}, - {NULL, NULL, 1, 0, 0, 0, 0, 5}, - {NULL, NULL, 0, 0, 0, 0, 0, 0} -}; -#elif PAGE_SIZE == 8192 -static const unsigned int blocksize[] = { - 64, - 128, - 248, - 504, - 1016, - 2040, - 4080, - 8192 - 32, - 16384 - 32, - 32768 - 32, - 65536 - 32, - 131072 - 32, - 262144 - 32, - 0 -}; - -struct size_descriptor sizes[] = -{ - {NULL, NULL, 127, 0, 0, 0, 0, 0}, - {NULL, NULL, 63, 0, 0, 0, 0, 0}, - {NULL, NULL, 31, 0, 0, 0, 0, 0}, - {NULL, NULL, 16, 0, 0, 0, 0, 0}, - {NULL, NULL, 8, 0, 0, 0, 0, 0}, - {NULL, NULL, 4, 0, 0, 0, 0, 0}, - {NULL, NULL, 2, 0, 0, 0, 0, 0}, - {NULL, NULL, 1, 0, 0, 0, 0, 0}, - {NULL, NULL, 1, 0, 0, 0, 0, 1}, - {NULL, NULL, 1, 0, 0, 0, 0, 2}, - {NULL, NULL, 1, 0, 0, 0, 0, 3}, - {NULL, NULL, 1, 0, 0, 0, 0, 4}, - {NULL, NULL, 1, 0, 0, 0, 0, 5}, - {NULL, NULL, 0, 0, 0, 0, 0, 0} -}; -#else -#error you need to make a version for your pagesize -#endif - -#define NBLOCKS(order) (sizes[order].nblocks) -#define BLOCKSIZE(order) (blocksize[order]) -#define AREASIZE(order) (PAGE_SIZE<<(sizes[order].gfporder)) - -/* - * Create a small cache of page allocations: this helps a bit with - * those pesky 8kB+ allocations for NFS when we're temporarily - * out of memory.. - * - * This is a _truly_ small cache, we just cache one single page - * order (for orders 0, 1 and 2, that is 4, 8 and 16kB on x86). - */ -#define MAX_CACHE_ORDER 3 -struct page_descriptor * kmalloc_cache[MAX_CACHE_ORDER]; - -static inline struct page_descriptor * get_kmalloc_pages(unsigned long priority, - unsigned long order, int dma) -{ - return (struct page_descriptor *) __get_free_pages(priority, order, dma); -} - -static inline void free_kmalloc_pages(struct page_descriptor * page, - unsigned long order, int dma) -{ - if (!dma && order < MAX_CACHE_ORDER) { - page = xchg(kmalloc_cache+order, page); - if (!page) - return; - } - free_pages((unsigned long) page, order); -} - -long kmalloc_init(long start_mem, long end_mem) -{ - int order; - -/* - * Check the static info array. Things will blow up terribly if it's - * incorrect. This is a late "compile time" check..... 
- */ - for (order = 0; BLOCKSIZE(order); order++) { - if ((NBLOCKS(order) * BLOCKSIZE(order) + sizeof(struct page_descriptor)) > - AREASIZE(order)) { - printk("Cannot use %d bytes out of %d in order = %d block mallocs\n", - (int) (NBLOCKS(order) * BLOCKSIZE(order) + - sizeof(struct page_descriptor)), - (int) AREASIZE(order), - BLOCKSIZE(order)); - panic("This only happens if someone messes with kmalloc"); - } - } - return start_mem; -} - - -/* - * Ugh, this is ugly, but we want the default case to run - * straight through, which is why we have the ugly goto's - */ -void *kmalloc(size_t size, int priority) -{ - unsigned long flags; - unsigned long type; - int order, dma; - struct block_header *p; - struct page_descriptor *page, **pg; - struct size_descriptor *bucket = sizes; - - /* Get order */ - order = 0; - { - unsigned int realsize = size + sizeof(struct block_header); - for (;;) { - int ordersize = BLOCKSIZE(order); - if (realsize <= ordersize) - break; - order++; - bucket++; - if (ordersize) - continue; - printk("kmalloc of too large a block (%d bytes).\n", (int) size); - return NULL; - } - } - - dma = 0; - type = MF_USED; - pg = &bucket->firstfree; - if (priority & GFP_DMA) { - dma = 1; - type = MF_DMA; - pg = &bucket->dmafree; - } - - priority &= GFP_LEVEL_MASK; - -/* Sanity check... */ - - if (in_interrupt() && priority != GFP_ATOMIC) { - static int count = 0; - if (++count < 5) { - printk("kmalloc called nonatomically from interrupt %p\n", - return_address()); - priority = GFP_ATOMIC; - } - } - - save_flags(flags); - cli(); - page = *pg; - if (!page) - goto no_bucket_page; - - p = page->firstfree; - if (p->bh_flags != MF_FREE) - goto not_free_on_freelist; - -found_it: - page->firstfree = p->bh_next; - page->nfree--; - if (!page->nfree) - *pg = page->next; - restore_flags(flags); - bucket->nmallocs++; - bucket->nbytesmalloced += size; - p->bh_flags = type; /* As of now this block is officially in use */ - p->bh_length = size; -#ifdef SADISTIC_KMALLOC - memset(p+1, 0xf0, size); -#endif - return p + 1; /* Pointer arithmetic: increments past header */ - - -no_bucket_page: - /* - * If we didn't find a page already allocated for this - * bucket size, we need to get one.. 
- * - * This can be done with ints on: it is private to this invocation - */ - restore_flags(flags); - - { - int i, sz; - - /* sz is the size of the blocks we're dealing with */ - sz = BLOCKSIZE(order); - - page = get_kmalloc_pages(priority, bucket->gfporder, dma); - if (!page) - goto no_free_page; -found_cached_page: - - bucket->npages++; - - page->order = order; - /* Loop for all but last block: */ - i = (page->nfree = bucket->nblocks) - 1; - p = BH(page + 1); - while (i > 0) { - i--; - p->bh_flags = MF_FREE; - p->bh_next = BH(((long) p) + sz); - p = p->bh_next; - } - /* Last block: */ - p->bh_flags = MF_FREE; - p->bh_next = NULL; - - p = BH(page+1); - } - - /* - * Now we're going to muck with the "global" freelist - * for this size: this should be uninterruptible - */ - cli(); - page->next = *pg; - *pg = page; - goto found_it; - - -no_free_page: - /* - * No free pages, check the kmalloc cache of - * pages to see if maybe we have something available - */ - if (!dma && order < MAX_CACHE_ORDER) { - page = xchg(kmalloc_cache+order, page); - if (page) - goto found_cached_page; - } - { - static unsigned long last = 0; - if (priority != GFP_BUFFER && (last + 10 * HZ < jiffies)) { - last = jiffies; - printk("Couldn't get a free page.....\n"); - } - return NULL; - } - -not_free_on_freelist: - restore_flags(flags); - printk("Problem: block on freelist at %08lx isn't free.\n", (long) p); - return NULL; -} - -void kfree(void *__ptr) -{ - int dma; - unsigned long flags; - unsigned int order; - struct page_descriptor *page, **pg; - struct size_descriptor *bucket; - - if (!__ptr) - goto null_kfree; -#define ptr ((struct block_header *) __ptr) - page = PAGE_DESC(ptr); - __ptr = ptr - 1; - if (~PAGE_MASK & (unsigned long)page->next) - goto bad_order; - order = page->order; - if (order >= sizeof(sizes) / sizeof(sizes[0])) - goto bad_order; - bucket = sizes + order; - dma = 0; - pg = &bucket->firstfree; - if (ptr->bh_flags == MF_DMA) { - dma = 1; - ptr->bh_flags = MF_USED; - pg = &bucket->dmafree; - } - if (ptr->bh_flags != MF_USED) - goto bad_order; - ptr->bh_flags = MF_FREE; /* As of now this block is officially free */ -#ifdef SADISTIC_KMALLOC - memset(ptr+1, 0x0e, ptr->bh_length); -#endif - save_flags(flags); - cli(); - - bucket->nfrees++; - bucket->nbytesmalloced -= ptr->bh_length; - - ptr->bh_next = page->firstfree; - page->firstfree = ptr; - if (!page->nfree++) { -/* Page went from full to one free block: put it on the freelist. */ - if (bucket->nblocks == 1) - goto free_page; - page->next = *pg; - *pg = page; - } -/* If page is completely free, free it */ - if (page->nfree == bucket->nblocks) { - for (;;) { - struct page_descriptor *tmp = *pg; - if (!tmp) - goto not_on_freelist; - if (tmp == page) - break; - pg = &tmp->next; - } - *pg = page->next; -free_page: - bucket->npages--; - free_kmalloc_pages(page, bucket->gfporder, dma); - } - restore_flags(flags); -null_kfree: - return; - -bad_order: - printk("kfree of non-kmalloced memory: %p, next= %p, order=%d\n", - ptr+1, page->next, page->order); - return; - -not_on_freelist: - printk("Ooops. page %p doesn't show on freelist.\n", page); - restore_flags(flags); -} diff --git a/mm/memory.c b/mm/memory.c index 27dc33efe..530a65ca9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -589,26 +589,13 @@ unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsig * change only once the write actually happens. This avoids a few races, * and potentially makes it more efficient. 
*/ -void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, - unsigned long address, int write_access) +static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, + unsigned long address, int write_access, pte_t *page_table) { - pgd_t *page_dir; - pmd_t *page_middle; - pte_t *page_table, pte; + pte_t pte; unsigned long old_page, new_page; new_page = __get_free_page(GFP_KERNEL); - page_dir = pgd_offset(vma->vm_mm, address); - if (pgd_none(*page_dir)) - goto end_wp_page; - if (pgd_bad(*page_dir)) - goto bad_wp_pagedir; - page_middle = pmd_offset(page_dir, address); - if (pmd_none(*page_middle)) - goto end_wp_page; - if (pmd_bad(*page_middle)) - goto bad_wp_pagemiddle; - page_table = pte_offset(page_middle, address); pte = *page_table; if (!pte_present(pte)) goto end_wp_page; @@ -650,14 +637,6 @@ void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, bad_wp_page: printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page); send_sig(SIGKILL, tsk, 1); - goto end_wp_page; -bad_wp_pagemiddle: - printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle)); - send_sig(SIGKILL, tsk, 1); - goto end_wp_page; -bad_wp_pagedir: - printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir)); - send_sig(SIGKILL, tsk, 1); end_wp_page: if (new_page) free_page(new_page); @@ -746,7 +725,7 @@ void vmtruncate(struct inode * inode, unsigned long offset) flush_cache_range(mm, start, end); zap_page_range(mm, start, len); flush_tlb_range(mm, start, end); - } while ((mpnt = mpnt->vm_next_share) != inode->i_mmap); + } while ((mpnt = mpnt->vm_next_share) != NULL); } @@ -785,25 +764,11 @@ static inline void do_swap_page(struct task_struct * tsk, * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. */ -void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, - unsigned long address, int write_access) +static void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, + unsigned long address, int write_access, pte_t *page_table, pte_t entry) { - pgd_t * pgd; - pmd_t * pmd; - pte_t * page_table; - pte_t entry; unsigned long page; - pgd = pgd_offset(tsk->mm, address); - pmd = pmd_alloc(pgd, address); - if (!pmd) - goto no_memory; - page_table = pte_alloc(pmd, address); - if (!page_table) - goto no_memory; - entry = *page_table; - if (pte_present(entry)) - goto is_present; if (!pte_none(entry)) goto swap_page; address &= PAGE_MASK; @@ -865,18 +830,9 @@ sigbus: swap_page: do_swap_page(tsk, vma, address, page_table, entry, write_access); return; - -no_memory: - oom(tsk); -is_present: - return; } /* - * The above separate functions for the no-page and wp-page - * cases will go away (they mostly do the same thing anyway), - * and we'll instead use only a general "handle_mm_fault()". - * * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most * RISC architectures). The early dirtying is also good on the i386. @@ -885,27 +841,30 @@ is_present: * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). 
*/ -static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address, +static inline void handle_pte_fault(struct task_struct *tsk, + struct vm_area_struct * vma, unsigned long address, int write_access, pte_t * pte) { - if (!pte_present(*pte)) { - do_no_page(current, vma, address, write_access); + pte_t entry = *pte; + + if (!pte_present(entry)) { + do_no_page(tsk, vma, address, write_access, pte, entry); return; } - set_pte(pte, pte_mkyoung(*pte)); + set_pte(pte, pte_mkyoung(entry)); flush_tlb_page(vma, address); if (!write_access) return; - if (pte_write(*pte)) { - set_pte(pte, pte_mkdirty(*pte)); + if (pte_write(entry)) { + set_pte(pte, pte_mkdirty(entry)); flush_tlb_page(vma, address); return; } - do_wp_page(current, vma, address, write_access); + do_wp_page(tsk, vma, address, write_access, pte); } -void handle_mm_fault(struct vm_area_struct * vma, unsigned long address, - int write_access) +void handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma, + unsigned long address, int write_access) { pgd_t *pgd; pmd_t *pmd; @@ -918,9 +877,9 @@ void handle_mm_fault(struct vm_area_struct * vma, unsigned long address, pte = pte_alloc(pmd, address); if (!pte) goto no_memory; - handle_pte_fault(vma, address, write_access, pte); + handle_pte_fault(tsk, vma, address, write_access, pte); update_mmu_cache(vma, address, *pte); return; no_memory: - oom(current); + oom(tsk); } @@ -16,13 +16,13 @@ #include <linux/swap.h> #include <linux/smp.h> #include <linux/smp_lock.h> +#include <linux/init.h> #include <asm/uaccess.h> #include <asm/system.h> #include <asm/pgtable.h> -/* - * description of effects of mapping type and prot in current implementation. +/* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: * @@ -37,7 +37,6 @@ * x: (no) no x: (no) yes x: (no) yes x: (yes) yes * */ - pgprot_t protection_map[16] = { __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 @@ -48,20 +47,18 @@ kmem_cache_t *vm_area_cachep; int sysctl_overcommit_memory; -/* - * Check that a process has enough memory to allocate a +/* Check that a process has enough memory to allocate a * new virtual mapping. */ int vm_enough_memory(long pages) { - /* - * stupid algorithm to decide if we have enough memory: while + /* Stupid algorithm to decide if we have enough memory: while * simple, it hopefully works in most obvious cases.. Easy to * fool it, but this should catch most mistakes. */ long freepages; - /* sometimes we want to use more memory than we have. */ + /* Sometimes we want to use more memory than we have. */ if (sysctl_overcommit_memory) return 1; @@ -74,6 +71,20 @@ int vm_enough_memory(long pages) return freepages > pages; } +/* Remove one vm structure from the inode's i_mmap ring. */ +static inline void remove_shared_vm_struct(struct vm_area_struct *vma) +{ + struct inode * inode = vma->vm_inode; + + if (inode) { + if (vma->vm_flags & VM_DENYWRITE) + inode->i_writecount++; + if(vma->vm_next_share) + vma->vm_next_share->vm_pprev_share = vma->vm_pprev_share; + *vma->vm_pprev_share = vma->vm_next_share; + } +} + asmlinkage unsigned long sys_brk(unsigned long brk) { unsigned long rlim, retval; @@ -91,17 +102,14 @@ asmlinkage unsigned long sys_brk(unsigned long brk) goto out; } - /* - * Always allow shrinking brk - */ + /* Always allow shrinking brk. 
*/ if (brk <= mm->brk) { retval = mm->brk = brk; do_munmap(newbrk, oldbrk-newbrk); goto out; } - /* - * Check against rlimit and stack.. - */ + + /* Check against rlimit and stack.. */ retval = mm->brk; rlim = current->rlim[RLIMIT_DATA].rlim_cur; if (rlim >= RLIM_INFINITY) @@ -109,21 +117,15 @@ asmlinkage unsigned long sys_brk(unsigned long brk) if (brk - mm->end_code > rlim) goto out; - /* - * Check against existing mmap mappings. - */ + /* Check against existing mmap mappings. */ if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) goto out; - /* - * Check if we have enough memory.. - */ + /* Check if we have enough memory.. */ if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT)) goto out; - /* - * Ok, looks good - let it rip. - */ + /* Ok, looks good - let it rip. */ if(do_mmap(NULL, oldbrk, newbrk-oldbrk, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, 0) == oldbrk) @@ -134,8 +136,7 @@ out: return retval; } -/* - * Combine the mmap "prot" and "flags" argument into one "vm_flags" used +/* Combine the mmap "prot" and "flags" argument into one "vm_flags" used * internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits * into "VM_xxx". */ @@ -162,6 +163,7 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, { struct mm_struct * mm = current->mm; struct vm_area_struct * vma; + int correct_wcount = 0; if ((len = PAGE_ALIGN(len)) == 0) return addr; @@ -181,20 +183,17 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, return -EAGAIN; } - /* - * do simple checking here so the lower-level routines won't have + /* Do simple checking here so the lower-level routines won't have * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ - if (file != NULL) { switch (flags & MAP_TYPE) { case MAP_SHARED: if ((prot & PROT_WRITE) && !(file->f_mode & 2)) return -EACCES; - /* - * make sure there are no mandatory locks on the file. - */ + + /* make sure there are no mandatory locks on the file. */ if (locks_verify_locked(file->f_inode)) return -EAGAIN; /* fall through */ @@ -206,18 +205,12 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, default: return -EINVAL; } - if (flags & MAP_DENYWRITE) { - if (file->f_inode->i_writecount > 0) - return -ETXTBSY; - } } else if ((flags & MAP_TYPE) != MAP_PRIVATE) return -EINVAL; - /* - * obtain the address to map to. we verify (or select) it and ensure + /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ - if (flags & MAP_FIXED) { if (addr & ~PAGE_MASK) return -EINVAL; @@ -227,8 +220,7 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, return -ENOMEM; } - /* - * determine the object being mapped and call the appropriate + /* Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ @@ -249,8 +241,8 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_SHARED) { vma->vm_flags |= VM_SHARED | VM_MAYSHARE; - /* - * This looks strange, but when we don't have the file open + + /* This looks strange, but when we don't have the file open * for writing, we can demote the shared mapping to a simpler * private mapping. 
That also takes care of a security hole * with ptrace() writing to a shared mapping without write @@ -289,9 +281,26 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, } if (file) { - int error = file->f_op->mmap(file->f_inode, file, vma); + int error = 0; + if (vma->vm_flags & VM_DENYWRITE) { + if (file->f_inode->i_writecount > 0) + error = -ETXTBSY; + else { + /* f_op->mmap might possibly sleep + * (generic_file_mmap doesn't, but other code + * might). In any case, this takes care of any + * race that this might cause. + */ + file->f_inode->i_writecount--; + correct_wcount = 1; + } + } + if (!error) + error = file->f_op->mmap(file->f_inode, file, vma); if (error) { + if (correct_wcount) + file->f_inode->i_writecount++; kmem_cache_free(vm_area_cachep, vma); return error; } @@ -299,6 +308,8 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, flags = vma->vm_flags; insert_vm_struct(mm, vma); + if (correct_wcount) + file->f_inode->i_writecount++; merge_segments(mm, vma->vm_start, vma->vm_end); /* merge_segments might have merged our vma, so we can't use it any more */ @@ -317,8 +328,7 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, return addr; } -/* - * Get an address range which is currently unmapped. +/* Get an address range which is currently unmapped. * For mmap() without MAP_FIXED and shmat() with addr=0. * Return value 0 means ENOMEM. */ @@ -342,376 +352,7 @@ unsigned long get_unmapped_area(unsigned long addr, unsigned long len) } } -/* - * Searching a VMA in the linear list task->mm->mmap is horribly slow. - * Use an AVL (Adelson-Velskii and Landis) tree to speed up this search - * from O(n) to O(log n), where n is the number of VMAs of the task - * (typically around 6, but may reach 3000 in some cases). - * Written by Bruno Haible <haible@ma2s2.mathematik.uni-karlsruhe.de>. - */ - -/* We keep the list and tree sorted by address. */ -#define vm_avl_key vm_end -#define vm_avl_key_t unsigned long /* typeof(vma->avl_key) */ - -/* - * task->mm->mmap_avl is the AVL tree corresponding to task->mm->mmap - * or, more exactly, its root. - * A vm_area_struct has the following fields: - * vm_avl_left left son of a tree node - * vm_avl_right right son of a tree node - * vm_avl_height 1+max(heightof(left),heightof(right)) - * The empty tree is represented as NULL. - */ - -/* Since the trees are balanced, their height will never be large. */ -#define avl_maxheight 41 /* why this? a small exercise */ -#define heightof(tree) ((tree) == avl_empty ? 0 : (tree)->vm_avl_height) -/* - * Consistency and balancing rules: - * 1. tree->vm_avl_height == 1+max(heightof(tree->vm_avl_left),heightof(tree->vm_avl_right)) - * 2. abs( heightof(tree->vm_avl_left) - heightof(tree->vm_avl_right) ) <= 1 - * 3. foreach node in tree->vm_avl_left: node->vm_avl_key <= tree->vm_avl_key, - * foreach node in tree->vm_avl_right: node->vm_avl_key >= tree->vm_avl_key. - */ - -/* Look up the nodes at the left and at the right of a given node. 
*/ -static inline void avl_neighbours (struct vm_area_struct * node, struct vm_area_struct * tree, struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right) -{ - vm_avl_key_t key = node->vm_avl_key; - - *to_the_left = *to_the_right = NULL; - for (;;) { - if (tree == avl_empty) { - printk("avl_neighbours: node not found in the tree\n"); - return; - } - if (key == tree->vm_avl_key) - break; - if (key < tree->vm_avl_key) { - *to_the_right = tree; - tree = tree->vm_avl_left; - } else { - *to_the_left = tree; - tree = tree->vm_avl_right; - } - } - if (tree != node) { - printk("avl_neighbours: node not exactly found in the tree\n"); - return; - } - if (tree->vm_avl_left != avl_empty) { - struct vm_area_struct * node; - for (node = tree->vm_avl_left; node->vm_avl_right != avl_empty; node = node->vm_avl_right) - continue; - *to_the_left = node; - } - if (tree->vm_avl_right != avl_empty) { - struct vm_area_struct * node; - for (node = tree->vm_avl_right; node->vm_avl_left != avl_empty; node = node->vm_avl_left) - continue; - *to_the_right = node; - } - if ((*to_the_left && ((*to_the_left)->vm_next != node)) || (node->vm_next != *to_the_right)) - printk("avl_neighbours: tree inconsistent with list\n"); -} - -/* - * Rebalance a tree. - * After inserting or deleting a node of a tree we have a sequence of subtrees - * nodes[0]..nodes[k-1] such that - * nodes[0] is the root and nodes[i+1] = nodes[i]->{vm_avl_left|vm_avl_right}. - */ -static inline void avl_rebalance (struct vm_area_struct *** nodeplaces_ptr, int count) -{ - for ( ; count > 0 ; count--) { - struct vm_area_struct ** nodeplace = *--nodeplaces_ptr; - struct vm_area_struct * node = *nodeplace; - struct vm_area_struct * nodeleft = node->vm_avl_left; - struct vm_area_struct * noderight = node->vm_avl_right; - int heightleft = heightof(nodeleft); - int heightright = heightof(noderight); - if (heightright + 1 < heightleft) { - /* */ - /* * */ - /* / \ */ - /* n+2 n */ - /* */ - struct vm_area_struct * nodeleftleft = nodeleft->vm_avl_left; - struct vm_area_struct * nodeleftright = nodeleft->vm_avl_right; - int heightleftright = heightof(nodeleftright); - if (heightof(nodeleftleft) >= heightleftright) { - /* */ - /* * n+2|n+3 */ - /* / \ / \ */ - /* n+2 n --> / n+1|n+2 */ - /* / \ | / \ */ - /* n+1 n|n+1 n+1 n|n+1 n */ - /* */ - node->vm_avl_left = nodeleftright; nodeleft->vm_avl_right = node; - nodeleft->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightleftright); - *nodeplace = nodeleft; - } else { - /* */ - /* * n+2 */ - /* / \ / \ */ - /* n+2 n --> n+1 n+1 */ - /* / \ / \ / \ */ - /* n n+1 n L R n */ - /* / \ */ - /* L R */ - /* */ - nodeleft->vm_avl_right = nodeleftright->vm_avl_left; - node->vm_avl_left = nodeleftright->vm_avl_right; - nodeleftright->vm_avl_left = nodeleft; - nodeleftright->vm_avl_right = node; - nodeleft->vm_avl_height = node->vm_avl_height = heightleftright; - nodeleftright->vm_avl_height = heightleft; - *nodeplace = nodeleftright; - } - } - else if (heightleft + 1 < heightright) { - /* similar to the above, just interchange 'left' <--> 'right' */ - struct vm_area_struct * noderightright = noderight->vm_avl_right; - struct vm_area_struct * noderightleft = noderight->vm_avl_left; - int heightrightleft = heightof(noderightleft); - if (heightof(noderightright) >= heightrightleft) { - node->vm_avl_right = noderightleft; noderight->vm_avl_left = node; - noderight->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightrightleft); - *nodeplace = noderight; - } else { - noderight->vm_avl_left = 
noderightleft->vm_avl_right; - node->vm_avl_right = noderightleft->vm_avl_left; - noderightleft->vm_avl_right = noderight; - noderightleft->vm_avl_left = node; - noderight->vm_avl_height = node->vm_avl_height = heightrightleft; - noderightleft->vm_avl_height = heightright; - *nodeplace = noderightleft; - } - } - else { - int height = (heightleft<heightright ? heightright : heightleft) + 1; - if (height == node->vm_avl_height) - break; - node->vm_avl_height = height; - } - } -} - -/* Insert a node into a tree. */ -static inline void avl_insert (struct vm_area_struct * new_node, struct vm_area_struct ** ptree) -{ - vm_avl_key_t key = new_node->vm_avl_key; - struct vm_area_struct ** nodeplace = ptree; - struct vm_area_struct ** stack[avl_maxheight]; - int stack_count = 0; - struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */ - for (;;) { - struct vm_area_struct * node = *nodeplace; - if (node == avl_empty) - break; - *stack_ptr++ = nodeplace; stack_count++; - if (key < node->vm_avl_key) - nodeplace = &node->vm_avl_left; - else - nodeplace = &node->vm_avl_right; - } - new_node->vm_avl_left = avl_empty; - new_node->vm_avl_right = avl_empty; - new_node->vm_avl_height = 1; - *nodeplace = new_node; - avl_rebalance(stack_ptr,stack_count); -} - -/* Insert a node into a tree, and - * return the node to the left of it and the node to the right of it. - */ -static inline void avl_insert_neighbours (struct vm_area_struct * new_node, struct vm_area_struct ** ptree, - struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right) -{ - vm_avl_key_t key = new_node->vm_avl_key; - struct vm_area_struct ** nodeplace = ptree; - struct vm_area_struct ** stack[avl_maxheight]; - int stack_count = 0; - struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */ - *to_the_left = *to_the_right = NULL; - for (;;) { - struct vm_area_struct * node = *nodeplace; - if (node == avl_empty) - break; - *stack_ptr++ = nodeplace; stack_count++; - if (key < node->vm_avl_key) { - *to_the_right = node; - nodeplace = &node->vm_avl_left; - } else { - *to_the_left = node; - nodeplace = &node->vm_avl_right; - } - } - new_node->vm_avl_left = avl_empty; - new_node->vm_avl_right = avl_empty; - new_node->vm_avl_height = 1; - *nodeplace = new_node; - avl_rebalance(stack_ptr,stack_count); -} - -/* Removes a node out of a tree. */ -static inline void avl_remove (struct vm_area_struct * node_to_delete, struct vm_area_struct ** ptree) -{ - vm_avl_key_t key = node_to_delete->vm_avl_key; - struct vm_area_struct ** nodeplace = ptree; - struct vm_area_struct ** stack[avl_maxheight]; - int stack_count = 0; - struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */ - struct vm_area_struct ** nodeplace_to_delete; - for (;;) { - struct vm_area_struct * node = *nodeplace; - if (node == avl_empty) { - /* what? node_to_delete not found in tree? */ - printk("avl_remove: node to delete not found in tree\n"); - return; - } - *stack_ptr++ = nodeplace; stack_count++; - if (key == node->vm_avl_key) - break; - if (key < node->vm_avl_key) - nodeplace = &node->vm_avl_left; - else - nodeplace = &node->vm_avl_right; - } - nodeplace_to_delete = nodeplace; - /* Have to remove node_to_delete = *nodeplace_to_delete. 
*/ - if (node_to_delete->vm_avl_left == avl_empty) { - *nodeplace_to_delete = node_to_delete->vm_avl_right; - stack_ptr--; stack_count--; - } else { - struct vm_area_struct *** stack_ptr_to_delete = stack_ptr; - struct vm_area_struct ** nodeplace = &node_to_delete->vm_avl_left; - struct vm_area_struct * node; - for (;;) { - node = *nodeplace; - if (node->vm_avl_right == avl_empty) - break; - *stack_ptr++ = nodeplace; stack_count++; - nodeplace = &node->vm_avl_right; - } - *nodeplace = node->vm_avl_left; - /* node replaces node_to_delete */ - node->vm_avl_left = node_to_delete->vm_avl_left; - node->vm_avl_right = node_to_delete->vm_avl_right; - node->vm_avl_height = node_to_delete->vm_avl_height; - *nodeplace_to_delete = node; /* replace node_to_delete */ - *stack_ptr_to_delete = &node->vm_avl_left; /* replace &node_to_delete->vm_avl_left */ - } - avl_rebalance(stack_ptr,stack_count); -} - -#ifdef DEBUG_AVL - -/* print a list */ -static void printk_list (struct vm_area_struct * vma) -{ - printk("["); - while (vma) { - printk("%08lX-%08lX", vma->vm_start, vma->vm_end); - vma = vma->vm_next; - if (!vma) - break; - printk(" "); - } - printk("]"); -} - -/* print a tree */ -static void printk_avl (struct vm_area_struct * tree) -{ - if (tree != avl_empty) { - printk("("); - if (tree->vm_avl_left != avl_empty) { - printk_avl(tree->vm_avl_left); - printk("<"); - } - printk("%08lX-%08lX", tree->vm_start, tree->vm_end); - if (tree->vm_avl_right != avl_empty) { - printk(">"); - printk_avl(tree->vm_avl_right); - } - printk(")"); - } -} - -static char *avl_check_point = "somewhere"; - -/* check a tree's consistency and balancing */ -static void avl_checkheights (struct vm_area_struct * tree) -{ - int h, hl, hr; - - if (tree == avl_empty) - return; - avl_checkheights(tree->vm_avl_left); - avl_checkheights(tree->vm_avl_right); - h = tree->vm_avl_height; - hl = heightof(tree->vm_avl_left); - hr = heightof(tree->vm_avl_right); - if ((h == hl+1) && (hr <= hl) && (hl <= hr+1)) - return; - if ((h == hr+1) && (hl <= hr) && (hr <= hl+1)) - return; - printk("%s: avl_checkheights: heights inconsistent\n",avl_check_point); -} - -/* check that all values stored in a tree are < key */ -static void avl_checkleft (struct vm_area_struct * tree, vm_avl_key_t key) -{ - if (tree == avl_empty) - return; - avl_checkleft(tree->vm_avl_left,key); - avl_checkleft(tree->vm_avl_right,key); - if (tree->vm_avl_key < key) - return; - printk("%s: avl_checkleft: left key %lu >= top key %lu\n",avl_check_point,tree->vm_avl_key,key); -} - -/* check that all values stored in a tree are > key */ -static void avl_checkright (struct vm_area_struct * tree, vm_avl_key_t key) -{ - if (tree == avl_empty) - return; - avl_checkright(tree->vm_avl_left,key); - avl_checkright(tree->vm_avl_right,key); - if (tree->vm_avl_key > key) - return; - printk("%s: avl_checkright: right key %lu <= top key %lu\n",avl_check_point,tree->vm_avl_key,key); -} - -/* check that all values are properly increasing */ -static void avl_checkorder (struct vm_area_struct * tree) -{ - if (tree == avl_empty) - return; - avl_checkorder(tree->vm_avl_left); - avl_checkorder(tree->vm_avl_right); - avl_checkleft(tree->vm_avl_left,tree->vm_avl_key); - avl_checkright(tree->vm_avl_right,tree->vm_avl_key); -} - -/* all checks */ -static void avl_check (struct task_struct * task, char *caller) -{ - avl_check_point = caller; -/* printk("task \"%s\", %s\n",task->comm,caller); */ -/* printk("task \"%s\" list: ",task->comm); printk_list(task->mm->mmap); printk("\n"); */ -/* printk("task 
\"%s\" tree: ",task->comm); printk_avl(task->mm->mmap_avl); printk("\n"); */ - avl_checkheights(task->mm->mmap_avl); - avl_checkorder(task->mm->mmap_avl); -} - -#endif - - -/* - * Normal function to fix up a mapping +/* Normal function to fix up a mapping * This function is the default for when an area has no specific * function. This may be used as part of a more specific routine. * This function works out what part of an area is affected and @@ -738,19 +379,11 @@ static void unmap_fixup(struct vm_area_struct *area, struct vm_area_struct *mpnt; unsigned long end = addr + len; - if (addr < area->vm_start || addr >= area->vm_end || - end <= area->vm_start || end > area->vm_end || - end < addr) - { - printk("unmap_fixup: area=%lx-%lx, unmap %lx-%lx!!\n", - area->vm_start, area->vm_end, addr, end); - return; - } area->vm_mm->total_vm -= len >> PAGE_SHIFT; if (area->vm_flags & VM_LOCKED) area->vm_mm->locked_vm -= len >> PAGE_SHIFT; - /* Unmapping the whole area */ + /* Unmapping the whole area. */ if (addr == area->vm_start && end == area->vm_end) { if (area->vm_ops && area->vm_ops->close) area->vm_ops->close(area); @@ -759,15 +392,13 @@ static void unmap_fixup(struct vm_area_struct *area, return; } - /* Work out to one of the ends */ + /* Work out to one of the ends. */ if (end == area->vm_end) area->vm_end = addr; - else - if (addr == area->vm_start) { + else if (addr == area->vm_start) { area->vm_offset += (end - area->vm_start); area->vm_start = end; - } - else { + } else { /* Unmapping a hole: area->vm_start < addr <= end < area->vm_end */ /* Add end mapping -- leave beginning for below */ mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); @@ -785,7 +416,7 @@ static void unmap_fixup(struct vm_area_struct *area, insert_vm_struct(current->mm, mpnt); } - /* construct whatever mapping is needed */ + /* Construct whatever mapping is needed. */ mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!mpnt) return; @@ -809,15 +440,14 @@ asmlinkage int sys_munmap(unsigned long addr, size_t len) return ret; } -/* - * Munmap is split into 2 main parts -- this part which finds +/* Munmap is split into 2 main parts -- this part which finds * what needs doing, and the areas themselves, which do the * work. This now handles partial unmappings. * Jeremy Fitzhardine <jeremy@sw.oz.au> */ int do_munmap(unsigned long addr, size_t len) { - struct vm_area_struct *mpnt, *prev, *next, **npp, *free; + struct vm_area_struct *mpnt, *next, *free; if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr) return -EINVAL; @@ -825,33 +455,36 @@ int do_munmap(unsigned long addr, size_t len) if ((len = PAGE_ALIGN(len)) == 0) return 0; - /* - * Check if this memory area is ok - put it on the temporary + /* Check if this memory area is ok - put it on the temporary * list if so.. The checks here are pretty simple -- * every area affected in some way (by any overlap) is put * on the list. If nothing is put on, nothing is affected. */ - mpnt = find_vma(current->mm, addr); + mpnt = current->mm->mmap; + while(mpnt && mpnt->vm_end <= addr) + mpnt = mpnt->vm_next; if (!mpnt) return 0; - avl_neighbours(mpnt, current->mm->mmap_avl, &prev, &next); - /* we have prev->vm_next == mpnt && mpnt->vm_next = next */ - /* and addr < mpnt->vm_end */ - npp = (prev ? 
&prev->vm_next : ¤t->mm->mmap); + next = mpnt->vm_next; + + /* we have mpnt->vm_next = next and addr < mpnt->vm_end */ free = NULL; - for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) { - *npp = mpnt->vm_next; + for ( ; mpnt && mpnt->vm_start < addr+len; ) { + struct vm_area_struct *next = mpnt->vm_next; + + if(mpnt->vm_next) + mpnt->vm_next->vm_pprev = mpnt->vm_pprev; + *mpnt->vm_pprev = mpnt->vm_next; + mpnt->vm_next = free; free = mpnt; - avl_remove(mpnt, ¤t->mm->mmap_avl); + mpnt = next; } - if (free == NULL) return 0; - /* - * Ok - we have the memory areas we should free on the 'free' list, + /* Ok - we have the memory areas we should free on the 'free' list, * so release them, and unmap the page range.. * If the one of the segments is only being partially unmapped, * it will put new vm_area_struct(s) into the address space. @@ -871,36 +504,27 @@ int do_munmap(unsigned long addr, size_t len) if (mpnt->vm_ops && mpnt->vm_ops->unmap) mpnt->vm_ops->unmap(mpnt, st, size); + flush_cache_range(current->mm, st, end); zap_page_range(current->mm, st, size); flush_tlb_range(current->mm, st, end); + unmap_fixup(mpnt, st, size); + kmem_cache_free(vm_area_cachep, mpnt); } while (free); - /* we could zap the page tables here too.. */ - + current->mm->mmap_cache = NULL; /* Kill the cache. */ return 0; } -/* Build the AVL tree corresponding to the VMA list. */ -void build_mmap_avl(struct mm_struct * mm) -{ - struct vm_area_struct * vma; - - mm->mmap_avl = NULL; - for (vma = mm->mmap; vma; vma = vma->vm_next) - avl_insert(vma, &mm->mmap_avl); -} - /* Release all mmaps. */ void exit_mmap(struct mm_struct * mm) { struct vm_area_struct * mpnt; mpnt = mm->mmap; - mm->mmap = NULL; - mm->mmap_avl = NULL; + mm->mmap = mm->mmap_cache = NULL; mm->rss = 0; mm->total_vm = 0; mm->locked_vm = 0; @@ -925,81 +549,38 @@ void exit_mmap(struct mm_struct * mm) } } -/* - * Insert vm structure into process list sorted by address +/* Insert vm structure into process list sorted by address * and into the inode's i_mmap ring. */ void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp) { - struct vm_area_struct *share; + struct vm_area_struct **pprev = &mm->mmap; struct inode * inode; -#if 0 /* equivalent, but slow */ - struct vm_area_struct **p, *mpnt; + /* Find where to link it in. */ + while(*pprev && (*pprev)->vm_start <= vmp->vm_start) + pprev = &(*pprev)->vm_next; - p = &mm->mmap; - while ((mpnt = *p) != NULL) { - if (mpnt->vm_start > vmp->vm_start) - break; - if (mpnt->vm_end > vmp->vm_start) - printk("insert_vm_struct: overlapping memory areas\n"); - p = &mpnt->vm_next; - } - vmp->vm_next = mpnt; - *p = vmp; -#else - struct vm_area_struct * prev, * next; - - avl_insert_neighbours(vmp, &mm->mmap_avl, &prev, &next); - if ((prev ? prev->vm_next : mm->mmap) != next) - printk("insert_vm_struct: tree inconsistent with list\n"); - if (prev) - prev->vm_next = vmp; - else - mm->mmap = vmp; - vmp->vm_next = next; -#endif + /* Insert it. */ + if((vmp->vm_next = *pprev) != NULL) + (*pprev)->vm_pprev = &vmp->vm_next; + *pprev = vmp; + vmp->vm_pprev = pprev; inode = vmp->vm_inode; - if (!inode) - return; - - /* insert vmp into inode's circular share list */ - if ((share = inode->i_mmap)) { - vmp->vm_next_share = share->vm_next_share; - vmp->vm_next_share->vm_prev_share = vmp; - share->vm_next_share = vmp; - vmp->vm_prev_share = share; - } else - inode->i_mmap = vmp->vm_next_share = vmp->vm_prev_share = vmp; -} - -/* - * Remove one vm structure from the inode's i_mmap ring. 
- */ -void remove_shared_vm_struct(struct vm_area_struct *mpnt) -{ - struct inode * inode = mpnt->vm_inode; - - if (!inode) - return; - - if (mpnt->vm_next_share == mpnt) { - if (inode->i_mmap != mpnt) - printk("Inode i_mmap ring corrupted\n"); - inode->i_mmap = NULL; - return; + if (inode) { + if (vmp->vm_flags & VM_DENYWRITE) + inode->i_writecount--; + + /* insert vmp into inode's share list */ + if((vmp->vm_next_share = inode->i_mmap) != NULL) + inode->i_mmap->vm_pprev_share = &vmp->vm_next_share; + inode->i_mmap = vmp; + vmp->vm_pprev_share = &inode->i_mmap; } - - if (inode->i_mmap == mpnt) - inode->i_mmap = mpnt->vm_next_share; - - mpnt->vm_prev_share->vm_next_share = mpnt->vm_next_share; - mpnt->vm_next_share->vm_prev_share = mpnt->vm_prev_share; } -/* - * Merge the list of memory segments if possible. +/* Merge the list of memory segments if possible. * Redundant vm_area_structs are freed. * This assumes that the list is ordered by address. * We don't need to traverse the entire list, only those segments @@ -1010,13 +591,19 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l struct vm_area_struct *prev, *mpnt, *next; down(&mm->mmap_sem); - mpnt = find_vma(mm, start_addr); + + prev = NULL; + mpnt = mm->mmap; + while(mpnt && mpnt->vm_end <= start_addr) { + prev = mpnt; + mpnt = mpnt->vm_next; + } if (!mpnt) goto no_vma; - avl_neighbours(mpnt, mm->mmap_avl, &prev, &next); - /* we have prev->vm_next == mpnt && mpnt->vm_next = next */ + next = mpnt->vm_next; + /* we have prev->vm_next == mpnt && mpnt->vm_next = next */ if (!prev) { prev = mpnt; mpnt = next; @@ -1026,41 +613,32 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l * start_addr < mpnt->vm_end && prev->vm_start < end_addr */ for ( ; mpnt && prev->vm_start < end_addr ; prev = mpnt, mpnt = next) { -#if 0 - printk("looping in merge_segments, mpnt=0x%lX\n", (unsigned long) mpnt); -#endif - next = mpnt->vm_next; - /* - * To share, we must have the same inode, operations.. - */ - if (mpnt->vm_inode != prev->vm_inode) - continue; - if (mpnt->vm_pte != prev->vm_pte) - continue; - if (mpnt->vm_ops != prev->vm_ops) - continue; - if (mpnt->vm_flags != prev->vm_flags) + /* To share, we must have the same inode, operations.. */ + if ((mpnt->vm_inode != prev->vm_inode) || + (mpnt->vm_pte != prev->vm_pte) || + (mpnt->vm_ops != prev->vm_ops) || + (mpnt->vm_flags != prev->vm_flags) || + (prev->vm_end != mpnt->vm_start)) continue; - if (prev->vm_end != mpnt->vm_start) - continue; - /* - * and if we have an inode, the offsets must be contiguous.. - */ + + /* and if we have an inode, the offsets must be contiguous.. */ if ((mpnt->vm_inode != NULL) || (mpnt->vm_flags & VM_SHM)) { - if (prev->vm_offset + prev->vm_end - prev->vm_start != mpnt->vm_offset) + unsigned long off = prev->vm_offset+prev->vm_end-prev->vm_start; + if (off != mpnt->vm_offset) continue; } - /* - * merge prev with mpnt and set up pointers so the new + /* merge prev with mpnt and set up pointers so the new * big segment can possibly merge with the next one. * The old unused mpnt is freed. 
*/ - avl_remove(mpnt, &mm->mmap_avl); + if(mpnt->vm_next) + mpnt->vm_next->vm_pprev = mpnt->vm_pprev; + *mpnt->vm_pprev = mpnt->vm_next; + prev->vm_end = mpnt->vm_end; - prev->vm_next = mpnt->vm_next; if (mpnt->vm_ops && mpnt->vm_ops->close) { mpnt->vm_offset += mpnt->vm_end - mpnt->vm_start; mpnt->vm_start = mpnt->vm_end; @@ -1072,16 +650,24 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l kmem_cache_free(vm_area_cachep, mpnt); mpnt = prev; } + mm->mmap_cache = NULL; /* Kill the cache. */ no_vma: up(&mm->mmap_sem); } -void vma_init(void) +__initfunc(void vma_init(void)) { vm_area_cachep = kmem_cache_create("vm_area_struct", sizeof(struct vm_area_struct), - sizeof(long)*8, SLAB_HWCACHE_ALIGN, + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); if(!vm_area_cachep) panic("vma_init: Cannot alloc vm_area_struct cache."); + + mm_cachep = kmem_cache_create("mm_struct", + sizeof(struct mm_struct), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if(!mm_cachep) + panic("vma_init: Cannot alloc mm_struct cache."); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7b71a1ec7..19b3aa125 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -18,12 +18,14 @@ #include <linux/fs.h> #include <linux/swapctl.h> #include <linux/interrupt.h> +#include <linux/init.h> #include <asm/dma.h> #include <asm/system.h> /* for cli()/sti() */ #include <asm/uaccess.h> /* for copy_to/from_user */ #include <asm/bitops.h> #include <asm/pgtable.h> +#include <asm/spinlock.h> int nr_swap_pages = 0; int nr_free_pages = 0; @@ -88,10 +90,6 @@ static inline void remove_mem_queue(struct page * entry) * * With the above two rules, you get a straight-line execution path * for the normal case, giving better asm-code. - * - * free_page() may sleep since the page being freed may be a buffer - * page or present in the swap cache. It will not sleep, however, - * for a freshly allocated page (get_free_page()). 
*/ /* @@ -99,6 +97,8 @@ static inline void remove_mem_queue(struct page * entry) * * Hint: -mask = 1+~mask */ +static spinlock_t page_alloc_lock; + static inline void free_pages_ok(unsigned long map_nr, unsigned long order) { struct free_area_struct *area = free_area + order; @@ -106,15 +106,14 @@ static inline void free_pages_ok(unsigned long map_nr, unsigned long order) unsigned long mask = (~0UL) << order; unsigned long flags; - save_flags(flags); - cli(); + spin_lock_irqsave(&page_alloc_lock, flags); #define list(x) (mem_map+(x)) map_nr &= mask; nr_free_pages -= mask; while (mask + (1 << (NR_MEM_LISTS-1))) { - if (!change_bit(index, area->map)) + if (!test_and_change_bit(index, area->map)) break; remove_mem_queue(list(map_nr ^ -mask)); mask <<= 1; @@ -126,7 +125,7 @@ static inline void free_pages_ok(unsigned long map_nr, unsigned long order) #undef list - restore_flags(flags); + spin_unlock_irqrestore(&page_alloc_lock, flags); } void __free_page(struct page *page) @@ -172,7 +171,7 @@ do { struct free_area_struct * area = free_area+order; \ MARK_USED(map_nr, new_order, area); \ nr_free_pages -= 1 << order; \ EXPAND(ret, map_nr, order, new_order, area); \ - restore_flags(flags); \ + spin_unlock_irqrestore(&page_alloc_lock, flags); \ return ADDRESS(map_nr); \ } \ prev = ret; \ @@ -214,15 +213,14 @@ unsigned long __get_free_pages(int priority, unsigned long order, int dma) reserved_pages = 5; if (priority != GFP_NFS) reserved_pages = min_free_pages; - save_flags(flags); repeat: - cli(); + spin_lock_irqsave(&page_alloc_lock, flags); if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) { RMQUEUE(order, dma); - restore_flags(flags); + spin_unlock_irqrestore(&page_alloc_lock, flags); return 0; } - restore_flags(flags); + spin_unlock_irqrestore(&page_alloc_lock, flags); if (priority != GFP_BUFFER && try_to_free_page(priority, dma, 1)) goto repeat; return 0; @@ -239,8 +237,7 @@ void show_free_areas(void) unsigned long total = 0; printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10)); - save_flags(flags); - cli(); + spin_lock_irqsave(&page_alloc_lock, flags); for (order=0 ; order < NR_MEM_LISTS; order++) { struct page * tmp; unsigned long nr = 0; @@ -250,7 +247,7 @@ void show_free_areas(void) total += nr * ((PAGE_SIZE>>10) << order); printk("%lu*%lukB ", nr, (unsigned long)((PAGE_SIZE>>10) << order)); } - restore_flags(flags); + spin_unlock_irqrestore(&page_alloc_lock, flags); printk("= %lukB)\n", total); #ifdef SWAP_CACHE_INFO show_swap_cache_info(); @@ -265,7 +262,7 @@ void show_free_areas(void) * - mark all memory queues empty * - clear the memory bitmaps */ -unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem) +__initfunc(unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem)) { mem_map_t * p; unsigned long mask = PAGE_MASK; @@ -273,7 +270,7 @@ unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem) /* * select nr of pages we try to keep free for important stuff - * with a minimum of 16 pages. This is totally arbitrary + * with a minimum of 48 pages. This is totally arbitrary */ i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7); if (i < 48) diff --git a/mm/page_io.c b/mm/page_io.c index 9980c52b7..6a16ccee8 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -67,7 +67,7 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) return; } /* Make sure we are the only process doing I/O with this swap page. 
*/ - while (set_bit(offset,p->swap_lockmap)) { + while (test_and_set_bit(offset,p->swap_lockmap)) { run_task_queue(&tq_disk); sleep_on(&lock_queue); } @@ -136,7 +136,7 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) } else printk("rw_swap_page: no swap file or device\n"); atomic_dec(&page->count); - if (offset && !clear_bit(offset,p->swap_lockmap)) + if (offset && !test_and_clear_bit(offset,p->swap_lockmap)) printk("rw_swap_page: lock already cleared\n"); wake_up(&lock_queue); } @@ -158,7 +158,7 @@ void swap_after_unlock_page (unsigned long entry) printk("swap_after_unlock_page: weirdness\n"); return; } - if (!clear_bit(offset,p->swap_lockmap)) + if (!test_and_clear_bit(offset,p->swap_lockmap)) printk("swap_after_unlock_page: lock already cleared\n"); wake_up(&lock_queue); } @@ -187,7 +187,7 @@ void ll_rw_page(int rw, kdev_t dev, unsigned long offset, char * buffer) panic("ll_rw_page: bad block dev cmd, must be R/W"); } page = mem_map + MAP_NR(buffer); - if (set_bit(PG_locked, &page->flags)) + if (test_and_set_bit(PG_locked, &page->flags)) panic ("ll_rw_page: page already locked"); brw_page(rw, page, dev, &block, PAGE_SIZE, 0); } @@ -1,8 +1,81 @@ /* * linux/mm/slab.c - * Written by Mark Hemment, 1996. + * Written by Mark Hemment, 1996/97. * (markhe@nextd.demon.co.uk) + * + * 11 April '97. Started multi-threading - markhe + * The global cache-chain is protected by the semaphore 'cache_chain_sem'. + * The sem is only needed when accessing/extending the cache-chain, which + * can never happen inside an interrupt (kmem_cache_create(), + * kmem_cache_shrink() and kmem_cache_reap()). + * This is a medium-term exclusion lock. + * + * Each cache has its own lock; 'c_spinlock'. This lock is needed only + * when accessing non-constant members of a cache-struct. + * Note: 'constant members' are assigned a value in kmem_cache_create() before + * the cache is linked into the cache-chain. The values never change, so not + * even a multi-reader lock is needed for these members. + * The c_spinlock is only ever held for a few cycles. + * + * To prevent kmem_cache_shrink() trying to shrink a 'growing' cache (which + * maybe be sleeping and therefore not holding the semaphore/lock), the + * c_growing field is used. This also prevents reaping from a cache. + * + * Note, caches can _never_ be destroyed. When a sub-system (eg module) has + * finished with a cache, it can only be shrunk. This leaves the cache empty, + * but already enabled for re-use, eg. during a module re-load. + * + * Notes: + * o Constructors/deconstructors are called while the cache-lock + * is _not_ held. Therefore they _must_ be threaded. + * o Constructors must not attempt to allocate memory from the + * same cache that they are a constructor for - infinite loop! + * (There is no easy way to trap this.) + * o The per-cache locks must be obtained with local-interrupts disabled. + * o When compiled with debug support, and an object-verify (upon release) + * is request for a cache, the verify-function is called with the cache + * lock held. This helps debugging. + * o The functions called from try_to_free_page() must not attempt + * to allocate memory from a cache which is being grown. + * The buffer sub-system might try to allocate memory, via buffer_cachep. + * As this pri is passed to the SLAB, and then (if necessary) onto the + * gfp() funcs (which avoid calling try_to_free_page()), no deadlock + * should happen. + * + * The positioning of the per-cache lock is tricky. 
If the lock is + * placed on the same h/w cache line as commonly accessed members + * the number of L1 cache-line faults is reduced. However, this can + * lead to the cache-line ping-ponging between processors when the + * lock is in contention (and the common members are being accessed). + * Decided to keep it away from common members. + * + * More fine-graining is possible, with per-slab locks...but this might be + * taking fine graining too far, but would have the advantage; + * During most allocs/frees no writes occur to the cache-struct. + * Therefore a multi-reader/one writer lock could be used (the writer + * needed when the slab chain is being link/unlinked). + * As we would not have an exclusion lock for the cache-structure, one + * would be needed per-slab (for updating s_free ptr, and/or the contents + * of s_index). + * The above locking would allow parallel operations to different slabs within + * the same cache with reduced spinning. + * + * Per-engine slab caches, backed by a global cache (as in Mach's Zone allocator), + * would allow most allocations from the same cache to execute in parallel. + * + * At present, each engine can be growing a cache. This should be blocked. + * + * It is not currently 100% safe to examine the page_struct outside of a kernel + * or global cli lock. The risk is v. small, and non-fatal. + * + * Calls to printk() are not 100% safe (the function is not threaded). However, + * printk() is only used under an error condition, and the risk is v. small (not + * sure if the console write functions 'enjoy' executing multiple contextes in + * parallel. I guess they don't...). + * Note, for most calls to printk() any held cache-lock is dropped. This is not + * always done for text size reasons - having *_unlock() everywhere is bloat. */ + /* * An implementation of the Slab Allocator as described in outline in; * UNIX Internals: The New Frontiers by Uresh Vahalia @@ -10,156 +83,251 @@ * or with a little more detail in; * The Slab Allocator: An Object-Caching Kernel Memory Allocator * Jeff Bonwick (Sun Microsystems). - * Presented at: USENIX Summer 1994 Technical Conference + * Presented at: USENIX Summer 1994 Technical Conference + */ + +/* + * This implementation deviates from Bonwick's paper as it + * does not use a hash-table for large objects, but rather a per slab + * index to hold the bufctls. This allows the bufctl structure to + * be small (one word), but limits the number of objects a slab (not + * a cache) can contain when off-slab bufctls are used. The limit is the + * size of the largest general-cache that does not use off-slab bufctls, + * divided by the size of a bufctl. For 32bit archs, is this 256/4 = 64. + * This is not serious, as it is only for large objects, when it is unwise + * to have too many per slab. + * Note: This limit can be raised by introducing a general-cache whose size + * is less than 512 (PAGE_SIZE<<3), but greater than 256. */ -#include <linux/slab.h> #include <linux/mm.h> +#include <linux/slab.h> #include <linux/interrupt.h> +#include <linux/config.h> +#include <linux/init.h> +#include <linux/smp.h> + #include <asm/system.h> -#include <asm/cache.h> - -/* SLAB_MGMT_CHECKS - define to enable extra checks in - * kmem_cache_[create|destroy|shrink]. - * If you're not messing around with these funcs, then undef this. - * SLAB_HIGH_PACK - define to allow 'bufctl's to be stored within objs that do not - * have a state. 
This allows more objs per slab, but removes the - * ability to sanity check an addr on release (if the addr is - * within any slab, anywhere, kmem_cache_free() will accept it!). - * SLAB_DEBUG_SUPPORT - when defined, kmem_cache_create() will honour; SLAB_DEBUG_FREE, - * SLAB_DEBUG_INITIAL and SLAB_RED_ZONE. +#include <asm/atomic.h> +#include <asm/smp_lock.h> +#include <asm/spinlock.h> + +/* If there is a different PAGE_SIZE around, and it works with this allocator, + * then change the following. */ -#define SLAB_MGMT_CHECKS -#undef SLAB_HIGH_PACK -#define SLAB_DEBUG_SUPPORT /* undef this when your cache is stable */ +#if (PAGE_SIZE != 8192 && PAGE_SIZE != 4096) +#error Your page size is probably not correctly supported - please check +#endif + +/* SLAB_MGMT_CHECKS - 1 to enable extra checks in kmem_cache_create(). + * 0 if you wish to reduce memory usage. + * + * SLAB_DEBUG_SUPPORT - 1 for kmem_cache_create() to honour; SLAB_DEBUG_FREE, + * SLAB_DEBUG_INITIAL, SLAB_RED_ZONE & SLAB_POISION. + * 0 for faster, smaller, code (espically in the critical paths). + * + * SLAB_STATS - 1 to collect stats for /proc/slabinfo. + * 0 for faster, smaller, code (espically in the critical paths). + * + * SLAB_SELFTEST - 1 to perform a few tests, mainly for developement. + */ +#define SLAB_MGMT_CHECKS 1 +#define SLAB_DEBUG_SUPPORT 0 +#define SLAB_STATS 0 +#define SLAB_SELFTEST 0 -#define BYTES_PER_WORD sizeof(void *) +/* Shouldn't this be in a header file somewhere? */ +#define BYTES_PER_WORD sizeof(void *) -/* legal flag mask for kmem_cache_create() */ -#if defined(SLAB_DEBUG_SUPPORT) -#define SLAB_C_MASK (SLAB_DEBUG_FREE|SLAB_DEBUG_INITIAL|SLAB_HWCACHE_ALIGN|SLAB_RED_ZONE) +/* Legal flag mask for kmem_cache_create(). */ +#if SLAB_DEBUG_SUPPORT +#if 0 +#define SLAB_C_MASK (SLAB_DEBUG_FREE|SLAB_DEBUG_INITIAL|SLAB_RED_ZONE| \ + SLAB_POISION|SLAB_HWCACHE_ALIGN|SLAB_NO_REAP| \ + SLAB_HIGH_PACK) +#endif +#define SLAB_C_MASK (SLAB_DEBUG_FREE|SLAB_DEBUG_INITIAL|SLAB_RED_ZONE| \ + SLAB_POISION|SLAB_HWCACHE_ALIGN|SLAB_NO_REAP) #else -#define SLAB_C_MASK (SLAB_HWCACHE_ALIGN) +#if 0 +#define SLAB_C_MASK (SLAB_HWCACHE_ALIGN|SLAB_NO_REAP|SLAB_HIGH_PACK) +#endif +#define SLAB_C_MASK (SLAB_HWCACHE_ALIGN|SLAB_NO_REAP) #endif /* SLAB_DEBUG_SUPPORT */ -/* Magic num for red zoning. - * Placed in the first word after the end of an obj - */ -#define SLAB_RED_MAGIC1 0x5A2CF071UL /* when obj is active */ -#define SLAB_RED_MAGIC2 0x170FC2A5UL /* when obj is inactive */ +/* Slab management struct. + * Manages the objs in a slab. Placed either at the end of mem allocated + * for a slab, or from an internal obj cache (cache_slabp). + * Slabs are chained into a partially ordered list; fully used first, partial + * next, and then fully free slabs. + * The first 4 members are referenced during an alloc/free operation, and + * should always appear on the same cache line. + * Note: The offset between some members _must_ match offsets within + * the kmem_cache_t - see kmem_cache_init() for the checks. */ + +#define SLAB_OFFSET_BITS 16 /* could make this larger for 64bit archs */ + +typedef struct kmem_slab_s { + struct kmem_bufctl_s *s_freep; /* ptr to first inactive obj in slab */ + struct kmem_bufctl_s *s_index; + unsigned long s_magic; + unsigned long s_inuse; /* num of objs active in slab */ + + struct kmem_slab_s *s_nextp; + struct kmem_slab_s *s_prevp; + void *s_mem; /* addr of first obj in slab */ + unsigned long s_offset:SLAB_OFFSET_BITS, + s_dma:1; +} kmem_slab_t; -/* Used for linking objs within a slab. 
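Since a bufctl is just the union above, the on-slab case keeps it immediately after the object it describes, c_offset bytes from the object's start, so the two addresses convert with simple arithmetic. An illustrative sketch, not part of the patch, written as if inside mm/slab.c:

/* On-slab case only: free objs use buf_nextp for the freelist, active objs
 * hold buf_slabp; off-slab bufctls live in s_index and hold buf_objp instead.
 */
static inline kmem_bufctl_t *example_obj_to_bufctl(kmem_cache_t *cachep, void *objp)
{
	return (kmem_bufctl_t *)(objp + cachep->c_offset);
}

static inline void *example_bufctl_to_obj(kmem_cache_t *cachep, kmem_bufctl_t *bufp)
{
	return ((void *)bufp) - cachep->c_offset;
}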
How much of the struct is - * used, and where its placed, depends on the packing used in a cache. - * Don't mess with the order! +/* When the slab mgmt is on-slab, this gives the size to use. */ +#define slab_align_size (L1_CACHE_ALIGN(sizeof(kmem_slab_t))) + +/* Test for end of slab chain. */ +#define kmem_slab_end(x) ((kmem_slab_t*)&((x)->c_offset)) + +/* s_magic */ +#define SLAB_MAGIC_ALLOC 0xA5C32F2BUL /* slab is alive */ +#define SLAB_MAGIC_DESTROYED 0xB2F23C5AUL /* slab has been destoryed */ + +/* Bufctl's are used for linking objs within a slab, identifying what slab an obj + * is in, and the address of the associated obj (for sanity checking with off-slab + * bufctls). What a bufctl contains depends upon the state of the obj and + * the organisation of the cache. */ typedef struct kmem_bufctl_s { - struct kmem_bufctl_s *buf_nextp; - struct kmem_slab_s *buf_slabp; - void *buf_objp; /* start of obj */ - struct kmem_bufctl_s *buf_hnextp; - struct kmem_bufctl_s **buf_hashp; + union { + struct kmem_bufctl_s *buf_nextp; + kmem_slab_t *buf_slabp; /* slab for obj */ + void * buf_objp; + } u; } kmem_bufctl_t; -/* different portions of the bufctl are used - so need some macros */ -#define kmem_bufctl_offset(x) ((unsigned long)&((kmem_bufctl_t *)0)->x) -#define kmem_bufctl_short_size (kmem_bufctl_offset(buf_objp)) -#define kmem_bufctl_very_short_size (kmem_bufctl_offset(buf_slabp)) +/* ...shorthand... */ +#define buf_nextp u.buf_nextp +#define buf_slabp u.buf_slabp +#define buf_objp u.buf_objp -/* Slab management struct. - * Manages the objs in a slab. Placed either at the end of mem allocated - * for the slab, or from an internal obj cache (SLAB_CFLGS_OFF_SLAB). - * Slabs are chain into a partially ordered list. The linking ptrs must - * be first in the struct! - * The size of the struct is important(ish); it should align well on - * cache line(s) +#if SLAB_DEBUG_SUPPORT +/* Magic nums for obj red zoning. + * Placed in the first word before and the first word after an obj. */ -typedef struct kmem_slab_s { - struct kmem_slab_s *s_nextp; - struct kmem_slab_s *s_prevp; - void *s_mem; /* addr of mem allocated for slab */ - unsigned long s_jiffies; - kmem_bufctl_t *s_freep; /* ptr to first inactive obj in slab */ - unsigned long s_flags; - unsigned long s_magic; - unsigned long s_inuse; /* num of objs active in slab */ -} kmem_slab_t; - -/* to test for end of slab chain */ -#define kmem_slab_end(x) ((kmem_slab_t*)&((x)->c_firstp)) +#define SLAB_RED_MAGIC1 0x5A2CF071UL /* when obj is active */ +#define SLAB_RED_MAGIC2 0x170FC2A5UL /* when obj is inactive */ -/* s_magic */ -#define SLAB_MAGIC_ALLOC 0xA5C32F2BUL -#define SLAB_MAGIC_UNALLOC 0xB2F23C5AUL +/* ...and for poisioning */ +#define SLAB_POISION_BYTE 0x5a /* byte value for poisioning */ +#define SLAB_POISION_END 0xa5 /* end-byte of poisioning */ -/* s_flags */ -#define SLAB_SFLGS_DMA 0x000001UL /* slab's mem can do DMA */ +#endif /* SLAB_DEBUG_SUPPORT */ -/* cache struct - manages a cache. - * c_lastp must appear immediately after c_firstp! +/* Cache struct - manages a cache. + * First four members are commonly referenced during an alloc/free operation. */ struct kmem_cache_s { kmem_slab_t *c_freep; /* first slab w. 
free objs */ - unsigned long c_flags; + unsigned long c_flags; /* constant flags */ unsigned long c_offset; - struct kmem_bufctl_s **c_hashp; /* ptr for off-slab bufctls */ - kmem_slab_t *c_firstp; /* first slab in chain */ - kmem_slab_t *c_lastp; /* last slab in chain */ - unsigned long c_hashbits; unsigned long c_num; /* # of objs per slab */ - unsigned long c_gfporder; /* order of pgs per slab (2^n) */ - unsigned long c_org_size; + unsigned long c_magic; unsigned long c_inuse; /* kept at zero */ - void (*c_ctor)(void *, int, unsigned long); /* constructor func */ - void (*c_dtor)(void *, int, unsigned long); /* de-constructor func */ + kmem_slab_t *c_firstp; /* first slab in chain */ + kmem_slab_t *c_lastp; /* last slab in chain */ + + spinlock_t c_spinlock; + unsigned long c_growing; + unsigned long c_dflags; /* dynamic flags */ + size_t c_org_size; + unsigned long c_gfporder; /* order of pgs per slab (2^n) */ + void (*c_ctor)(void *, kmem_cache_t *, unsigned long); /* constructor func */ + void (*c_dtor)(void *, kmem_cache_t *, unsigned long); /* de-constructor func */ unsigned long c_align; /* alignment of objs */ - unsigned long c_colour; /* cache colouring range */ - unsigned long c_colour_next;/* cache colouring */ + size_t c_colour; /* cache colouring range */ + size_t c_colour_next;/* cache colouring */ + unsigned long c_failures; const char *c_name; struct kmem_cache_s *c_nextp; + kmem_cache_t *c_index_cachep; +#if SLAB_STATS + unsigned long c_num_active; + unsigned long c_num_allocations; + unsigned long c_high_mark; + unsigned long c_grown; + unsigned long c_reaped; + atomic_t c_errors; +#endif /* SLAB_STATS */ }; -/* magic # for c_magic - used to detect out-of-slabs in __kmem_cache_alloc() */ -#define SLAB_C_MAGIC 0x4F17A36DUL - /* internal c_flags */ #define SLAB_CFLGS_OFF_SLAB 0x010000UL /* slab mgmt in own cache */ #define SLAB_CFLGS_BUFCTL 0x020000UL /* bufctls in own cache */ -#define SLAB_CFLGS_RELEASED 0x040000UL /* cache is/being destroyed */ +#define SLAB_CFLGS_GENERAL 0x080000UL /* a general-cache */ -#if defined(SLAB_HIGH_PACK) -#define SLAB_CFLGS_PTR_IN_OBJ 0x080000UL /* free ptr in obj */ -#endif +/* c_dflags (dynamic flags). 
Need to hold the spinlock to access this member */ +#define SLAB_CFLGS_GROWN 0x000002UL /* don't reap a recently grown */ #define SLAB_OFF_SLAB(x) ((x) & SLAB_CFLGS_OFF_SLAB) #define SLAB_BUFCTL(x) ((x) & SLAB_CFLGS_BUFCTL) -#define SLAB_RELEASED(x) ((x) & SLAB_CFLGS_RELEASED) -#if defined(SLAB_HIGH_PACK) -#define SLAB_PTR_IN_OBJ(x) ((x) & SLAB_CFLGS_PTR_IN_OBJ) +#define SLAB_GROWN(x) ((x) & SLAB_CFLGS_GROWN) + +#if SLAB_STATS +#define SLAB_STATS_INC_ACTIVE(x) ((x)->c_num_active++) +#define SLAB_STATS_DEC_ACTIVE(x) ((x)->c_num_active--) +#define SLAB_STATS_INC_ALLOCED(x) ((x)->c_num_allocations++) +#define SLAB_STATS_INC_GROWN(x) ((x)->c_grown++) +#define SLAB_STATS_INC_REAPED(x) ((x)->c_reaped++) +#define SLAB_STATS_SET_HIGH(x) do { if ((x)->c_num_active > (x)->c_high_mark) \ + (x)->c_high_mark = (x)->c_num_active; \ + } while (0) +#define SLAB_STATS_INC_ERR(x) (atomic_inc(&(x)->c_errors)) #else -#define SLAB_PTR_IN_OBJ(x) (0) +#define SLAB_STATS_INC_ACTIVE(x) +#define SLAB_STATS_DEC_ACTIVE(x) +#define SLAB_STATS_INC_ALLOCED(x) +#define SLAB_STATS_INC_GROWN(x) +#define SLAB_STATS_INC_REAPED(x) +#define SLAB_STATS_SET_HIGH(x) +#define SLAB_STATS_INC_ERR(x) +#endif /* SLAB_STATS */ + +#if SLAB_SELFTEST +#if !SLAB_DEBUG_SUPPORT +#error Debug support needed for self-test #endif +static void kmem_self_test(void); +#endif /* SLAB_SELFTEST */ + +/* c_magic - used to detect 'out of slabs' in __kmem_cache_alloc() */ +#define SLAB_C_MAGIC 0x4F17A36DUL /* maximum size of an obj (in 2^order pages) */ #define SLAB_OBJ_MAX_ORDER 5 /* 32 pages */ -/* maximum num of pages for a slab (avoids trying to ask for too may contigious pages) */ +/* maximum num of pages for a slab (prevents large requests to the VM layer) */ #define SLAB_MAX_GFP_ORDER 5 /* 32 pages */ /* the 'prefered' minimum num of objs per slab - maybe less for large objs */ #define SLAB_MIN_OBJS_PER_SLAB 4 -/* if the num of objs per slab is <= SLAB_MIN_OBJS_PER_SLAB, - * then the page order must be less than this before trying the next order +/* If the num of objs per slab is <= SLAB_MIN_OBJS_PER_SLAB, + * then the page order must be less than this before trying the next order. */ #define SLAB_BREAK_GFP_ORDER 2 -/* size of hash tables for caches which use off-slab bufctls (SLAB_CFLGS_BUFCTL) */ -#define KMEM_HASH_SIZE 128 +/* Macros for storing/retrieving the cachep and or slab from the + * global 'mem_map'. With off-slab bufctls, these are used to find the + * slab an obj belongs to. With kmalloc(), and kfree(), these are used + * to find the cache which an obj belongs to. + */ +#define SLAB_SET_PAGE_CACHE(pg, x) ((pg)->next = (struct page *)(x)) +#define SLAB_GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->next) +#define SLAB_SET_PAGE_SLAB(pg, x) ((pg)->prev = (struct page *)(x)) +#define SLAB_GET_PAGE_SLAB(pg) ((kmem_slab_t *)(pg)->prev) -/* size description struct for general-caches */ +/* Size description struct for general-caches. */ typedef struct cache_sizes { - unsigned long cs_size; + size_t cs_size; kmem_cache_t *cs_cachep; } cache_sizes_t; @@ -175,177 +343,177 @@ static cache_sizes_t cache_sizes[] = { {2048, NULL}, {4096, NULL}, {8192, NULL}, -#if PAGE_SIZE == 8192 {16384, NULL}, -#endif + {32768, NULL}, + {65536, NULL}, + {131072, NULL}, {0, NULL} }; -/* Names for the general-caches. - * Not placed into the sizes struct for a good reason; the - * string ptr is not needed while searching in kmem_alloc()/ - * kmem_free(), and would 'get-in-the-way' - think about it. +/* Names for the general-caches. 
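The mem_map macros above are what let kmem_cache_free() (and kfree()) work without any hash table: every page backing a slab records its cache and slab in the otherwise unused next/prev fields of its struct page, set up when the slab is grown. An illustrative lookup, not part of the patch, written as if inside mm/slab.c:

static void example_identify(void *objp, kmem_cache_t **cachepp, kmem_slab_t **slabpp)
{
	struct page *page = &mem_map[MAP_NR(objp)];

	*cachepp = SLAB_GET_PAGE_CACHE(page);	/* stashed in page->next */
	*slabpp  = SLAB_GET_PAGE_SLAB(page);	/* stashed in page->prev */
}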
Not placed into the sizes struct for + * a good reason; the string ptr is not needed while searching in kmalloc(), + * and would 'get-in-the-way' in the h/w cache. */ static char *cache_sizes_name[] = { #if PAGE_SIZE == 4096 - "cache-32", + "size-32", #endif - "cache-64", - "cache-128", - "cache-256", - "cache-512", - "cache-1024", - "cache-2048", - "cache-4096", -#if PAGE_SIZE == 4096 - "cache-8192" -#elif PAGE_SIZE == 8192 - "cache-8192", - "cache-16384" -#else -#error Your page size is not supported for the general-caches - please fix -#endif -}; - -static void kmem_hash_ctor(void *ptr, int , unsigned long); /* fwd ref */ -extern kmem_cache_t cache_cache; /* fwd ref */ - -/* internal cache of hash objs, only used when bufctls are off-slab */ -static kmem_cache_t cache_hash = { -/* freep, flags */ kmem_slab_end(&cache_hash), 0, -/* offset, hashp */ sizeof(kmem_bufctl_t*)*KMEM_HASH_SIZE, NULL, -/* firstp, lastp */ kmem_slab_end(&cache_hash), kmem_slab_end(&cache_hash), -/* hashbits, num, gfporder */ 0, 0, 0, -/* org_size, magic */ sizeof(kmem_bufctl_t*)*KMEM_HASH_SIZE, SLAB_C_MAGIC, -/* inuse, ctor, dtor, align */ 0, kmem_hash_ctor, NULL, L1_CACHE_BYTES, -/* colour, colour_next */ 0, 0, -/* name, nextp */ "hash_cache", &cache_cache -}; - -/* internal cache of freelist mgmnt objs, only use when bufctls are off-slab */ -static kmem_cache_t cache_bufctl = { -/* freep, flags */ kmem_slab_end(&cache_bufctl), 0, -/* offset, hashp */ sizeof(kmem_bufctl_t), NULL, -/* firstp, lastp */ kmem_slab_end(&cache_bufctl), kmem_slab_end(&cache_bufctl), -/* hashbits, num, gfporder */ 0, 0, 0, -/* org_size, magic */ sizeof(kmem_bufctl_t), SLAB_C_MAGIC, -/* inuse, ctor, dtor, align */ 0, NULL, NULL, BYTES_PER_WORD*2, -/* colour, colour_next */ 0, 0, -/* name, nextp */ "bufctl_cache", &cache_hash -}; - -/* internal cache of slab mngmnt objs, only used when slab mgmt is off-slab */ -static kmem_cache_t cache_slab = { -/* freep, flags */ kmem_slab_end(&cache_slab), 0, -/* offset, hashp */ sizeof(kmem_slab_t), NULL, -/* firstp, lastp */ kmem_slab_end(&cache_slab), kmem_slab_end(&cache_slab), -/* hashbits, num, gfporder */ 0, 0, 0, -/* org_size, magic */ sizeof(kmem_slab_t), SLAB_C_MAGIC, -/* inuse, ctor, dtor, align */ 0, NULL, NULL, L1_CACHE_BYTES, -/* colour, colour_next */ 0, 0, -/* name, nextp */ "slab_cache", &cache_bufctl + "size-64", + "size-128", + "size-256", + "size-512", + "size-1024", + "size-2048", + "size-4096", + "size-8192", + "size-16384", + "size-32768", + "size-65536", + "size-131072" }; /* internal cache of cache description objs */ static kmem_cache_t cache_cache = { -/* freep, flags */ kmem_slab_end(&cache_cache), 0, -/* offset, hashp */ sizeof(kmem_cache_t), NULL, +/* freep, flags */ kmem_slab_end(&cache_cache), SLAB_NO_REAP, +/* offset, num */ sizeof(kmem_cache_t), 0, +/* c_magic, c_inuse */ SLAB_C_MAGIC, 0, /* firstp, lastp */ kmem_slab_end(&cache_cache), kmem_slab_end(&cache_cache), -/* hashbits, num, gfporder */ 0, 0, 0, -/* org_size, magic */ sizeof(kmem_cache_t), SLAB_C_MAGIC, -/* inuse, ctor, dtor, align */ 0, NULL, NULL, L1_CACHE_BYTES, +/* spinlock */ SPIN_LOCK_UNLOCKED, +/* growing */ 0, +/* dflags */ 0, +/* org_size, gfp */ 0, 0, +/* ctor, dtor, align */ NULL, NULL, L1_CACHE_BYTES, /* colour, colour_next */ 0, 0, +/* failures */ 0, /* name */ "kmem_cache", -/* nextp */ &cache_slab +/* nextp */ &cache_cache, +/* index */ NULL, }; -/* constructor for hash tables */ -static void kmem_hash_ctor(void *ptr, int size, unsigned long flags) -{ - memset(ptr, 0, 
sizeof(kmem_bufctl_t*)*KMEM_HASH_SIZE); -} +/* Guard access to the cache-chain. */ +static struct semaphore cache_chain_sem; -/* place maintainer for reaping */ +/* Place maintainer for reaping. */ static kmem_cache_t *clock_searchp = &cache_cache; -/* Init an internal cache */ -static void -kmem_own_cache_init(kmem_cache_t *cachep) -{ - unsigned long size, i; +/* Internal slab mgmt cache, for when slab mgmt is off-slab. */ +static kmem_cache_t *cache_slabp = NULL; - if (cachep->c_inuse || cachep->c_magic != SLAB_C_MAGIC) { - panic("Bad init of internal cache %s", cachep->c_name); - /* NOTREACHED */ - } - size = cachep->c_offset + kmem_bufctl_short_size; - i = size % cachep->c_align; - if (i) - size += (cachep->c_align-i); - cachep->c_offset = size-kmem_bufctl_short_size; - - i = ((PAGE_SIZE<<cachep->c_gfporder)-sizeof(kmem_slab_t)); - cachep->c_num = i / size; /* num of objs per slab */ - - /* cache colouring */ - cachep->c_colour = 1 + (i-(cachep->c_num*size))/cachep->c_align; - cachep->c_colour_next = cachep->c_colour; -} +/* Max number of objs-per-slab for caches which use bufctl's. + * Needed to avoid a possible looping condition in kmem_cache_grow(). + */ +static unsigned long bufctl_limit = 0; -/* Initialisation - setup all internal caches */ -long -kmem_cache_init(long start, long end) +/* Initialisation - setup the `cache' cache. */ +__initfunc(long kmem_cache_init(long start, long end)) { - /* sanity */ + size_t size, i; + +#define kmem_slab_offset(x) ((unsigned long)&((kmem_slab_t *)0)->x) +#define kmem_slab_diff(a,b) (kmem_slab_offset(a) - kmem_slab_offset(b)) #define kmem_cache_offset(x) ((unsigned long)&((kmem_cache_t *)0)->x) -#define kmem_slab_offset(x) ((unsigned long)&((kmem_slab_t *)0)->x) - if (((kmem_cache_offset(c_magic)-kmem_cache_offset(c_firstp)) != kmem_slab_offset(s_magic)) || - ((kmem_cache_offset(c_inuse)-kmem_cache_offset(c_firstp)) != kmem_slab_offset(s_inuse))) { +#define kmem_cache_diff(a,b) (kmem_cache_offset(a) - kmem_cache_offset(b)) + + /* Sanity checks... */ + if (kmem_cache_diff(c_firstp, c_magic) != kmem_slab_diff(s_nextp, s_magic) || + kmem_cache_diff(c_firstp, c_inuse) != kmem_slab_diff(s_nextp, s_inuse) || + ((kmem_cache_offset(c_lastp) - + ((unsigned long) kmem_slab_end((kmem_cache_t*)NULL))) != + kmem_slab_offset(s_prevp)) || + kmem_cache_diff(c_lastp, c_firstp) != kmem_slab_diff(s_prevp, s_nextp)) { /* Offsets to the magic are incorrect, either the structures have * been incorrectly changed, or adjustments are needed for your * architecture. */ - panic("kmem_cache_init(): Offsets are different - been messed with!\n"); + panic("kmem_cache_init(): Offsets are wrong - I've been messed with!"); /* NOTREACHED */ } #undef kmem_cache_offset +#undef kmem_cache_diff #undef kmem_slab_offset +#undef kmem_slab_diff + + cache_chain_sem = MUTEX; + + size = cache_cache.c_offset + sizeof(kmem_bufctl_t); + size += (L1_CACHE_BYTES-1); + size &= ~(L1_CACHE_BYTES-1); + cache_cache.c_offset = size-sizeof(kmem_bufctl_t); + + i = (PAGE_SIZE<<cache_cache.c_gfporder)-slab_align_size; + cache_cache.c_num = i / size; /* num of objs per slab */ + + /* Cache colouring. 
*/ + cache_cache.c_colour = (i-(cache_cache.c_num*size))/L1_CACHE_BYTES; + cache_cache.c_colour_next = cache_cache.c_colour; - kmem_own_cache_init(&cache_cache); - kmem_own_cache_init(&cache_slab); - kmem_own_cache_init(&cache_bufctl); - kmem_own_cache_init(&cache_hash); return start; } -/* Initialisation - setup general caches */ -void -kmem_cache_sizes_init(void) +/* Initialisation - setup remaining internal and general caches. + * Called after the gfp() functions have been enabled, and before smp_init(). + */ +__initfunc(void kmem_cache_sizes_init(void)) { - unsigned long i; - - i = sizeof(cache_sizes)/sizeof(cache_sizes[0])-1; - while (i--) - cache_sizes[i].cs_cachep = kmem_cache_create(cache_sizes_name[i], - cache_sizes[i].cs_size, - 0, 0, NULL, NULL); + unsigned int found = 0; + + cache_slabp = kmem_cache_create("slab_cache", sizeof(kmem_slab_t), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (cache_slabp) { + char **names = cache_sizes_name; + cache_sizes_t *sizes = cache_sizes; + do { + /* For performance, all the general-caches are L1 aligned. + * This should be particularly beneficial on SMP boxes, as it + * elimantes "false sharing". + * Note for systems short on memory removing the alignment will + * allow tighter packing of the smaller caches. */ + if (!(sizes->cs_cachep = + kmem_cache_create(*names++, sizes->cs_size, + 0, SLAB_HWCACHE_ALIGN, NULL, NULL))) + goto panic_time; + if (!found) { + /* Inc off-slab bufctl limit until the ceiling is hit. */ + if (SLAB_BUFCTL(sizes->cs_cachep->c_flags)) + found++; + else + bufctl_limit = + (sizes->cs_size/sizeof(kmem_bufctl_t)); + } + sizes->cs_cachep->c_flags |= SLAB_CFLGS_GENERAL; + sizes++; + } while (sizes->cs_size); +#if SLAB_SELFTEST + kmem_self_test(); +#endif /* SLAB_SELFTEST */ + return; + } +panic_time: + panic("kmem_cache_sizes_init: Error creating caches"); + /* NOTREACHED */ } -/* Interface to system's page allocator. - * dma pts to non-zero if all of the mem is suitable for DMA +/* Interface to system's page allocator. Dma pts to non-zero if all + * of memory is DMAable. No need to hold the cache-lock. */ static inline void * -kmem_getpages(const kmem_cache_t *cachep, unsigned long flags, unsigned int *dma) +kmem_getpages(kmem_cache_t *cachep, unsigned long flags, unsigned int *dma) { - struct page *page; void *addr; - addr = (void*) __get_free_pages(flags & SLAB_LEVEL_MASK, \ - cachep->c_gfporder, flags & SLAB_DMA); - *dma = 1<<cachep->c_gfporder; - if (!(flags & SLAB_DMA) && addr) { - /* need to check if can dma */ - page = mem_map + MAP_NR(addr); + *dma = flags & SLAB_DMA; + addr = (void*) __get_free_pages(flags & SLAB_LEVEL_MASK, + cachep->c_gfporder, *dma); + /* Assume that now we have the pages no one else can legally + * messes with the 'struct page's. + * However vm_scan() might try to test the structure to see if + * it is a named-page or buffer-page. The members it tests are + * of no interest here..... + */ + if (!*dma && addr) { + /* Need to check if can dma. */ + struct page *page = mem_map + MAP_NR(addr); + *dma = 1<<cachep->c_gfporder; while ((*dma)--) { if (!PageDMA(page)) { *dma = 0; @@ -357,58 +525,52 @@ kmem_getpages(const kmem_cache_t *cachep, unsigned long flags, unsigned int *dma return addr; } -/* Interface to system's page release */ +/* Interface to system's page release. */ static inline void kmem_freepages(kmem_cache_t *cachep, void *addr) { + unsigned long i = (1<<cachep->c_gfporder); + struct page *page = &mem_map[MAP_NR(addr)]; + + /* free_pages() does not clear the type bit - we do that. 
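As the macro comment earlier notes, kmalloc() and kfree() locate the owning cache through the mem_map fields, so these general caches are what actually service kmalloc() requests. A small illustrative usage; the header name reflects the kernels of this era and is an assumption here:

#include <linux/malloc.h>

static void example_kmalloc_use(void)
{
	/* A 200 byte request falls into the "size-256" general cache. */
	void *p = kmalloc(200, GFP_KERNEL);

	if (p)
		kfree(p);	/* owning cache recovered via the mem_map macros */
}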
+ * The pages have been unlinked from their cache-slab, + * but their 'struct page's might be accessed in + * vm_scan(). Shouldn't be a worry. + */ + while (i--) { + PageClearSlab(page); + page++; + } free_pages((unsigned long)addr, cachep->c_gfporder); } -/* Hashing function - used for caches with off-slab bufctls */ -static inline int -kmem_hash(const kmem_cache_t *cachep, const void *objp) +#if SLAB_DEBUG_SUPPORT +static inline void +kmem_poision_obj(kmem_cache_t *cachep, void *addr) { - return (((unsigned long)objp >> cachep->c_hashbits) & (KMEM_HASH_SIZE-1)); + memset(addr, SLAB_POISION_BYTE, cachep->c_org_size); + *(unsigned char *)(addr+cachep->c_org_size-1) = SLAB_POISION_END; } -/* Link bufctl into a hash table - used for caches with off-slab bufctls - * - called with ints disabled - */ -static inline void * -kmem_add_to_hash(kmem_cache_t *cachep, kmem_bufctl_t *bufp) +static inline int +kmem_check_poision_obj(kmem_cache_t *cachep, void *addr) { - kmem_bufctl_t **bufpp = bufp->buf_hashp; - - bufp->buf_hnextp = *bufpp; - return (*bufpp = bufp)->buf_objp; + void *end; + end = memchr(addr, SLAB_POISION_END, cachep->c_org_size); + if (end != (addr+cachep->c_org_size-1)) + return 1; + return 0; } +#endif /* SLAB_DEBUG_SUPPORT */ -/* Find bufcntl for given obj addr, and unlink. - * - called with ints disabled +/* Three slab chain funcs - all called with ints disabled and the appropiate + * cache-lock held. */ -static inline kmem_bufctl_t * -kmem_remove_from_hash(kmem_cache_t *cachep, const void *objp) -{ - kmem_bufctl_t *bufp; - kmem_bufctl_t **bufpp = &cachep->c_hashp[kmem_hash(cachep, objp)]; - - for (;*bufpp; bufpp = &(*bufpp)->buf_hnextp) { - if ((*bufpp)->buf_objp != objp) - continue; - bufp = *bufpp; - *bufpp = bufp->buf_hnextp; - return bufp; - } - return NULL; -} - -/* Three slab chain funcs - all called with ints disabled */ static inline void kmem_slab_unlink(kmem_slab_t *slabp) { kmem_slab_t *prevp = slabp->s_prevp; kmem_slab_t *nextp = slabp->s_nextp; - prevp->s_nextp = nextp; nextp->s_prevp = prevp; } @@ -416,781 +578,881 @@ kmem_slab_unlink(kmem_slab_t *slabp) static inline void kmem_slab_link_end(kmem_cache_t *cachep, kmem_slab_t *slabp) { + kmem_slab_t *lastp = cachep->c_lastp; slabp->s_nextp = kmem_slab_end(cachep); - slabp->s_prevp = cachep->c_lastp; - kmem_slab_end(cachep)->s_prevp = slabp; - slabp->s_prevp->s_nextp = slabp; + slabp->s_prevp = lastp; + cachep->c_lastp = slabp; + lastp->s_nextp = slabp; } static inline void kmem_slab_link_free(kmem_cache_t *cachep, kmem_slab_t *slabp) { kmem_slab_t *nextp = cachep->c_freep; - + kmem_slab_t *prevp = nextp->s_prevp; slabp->s_nextp = nextp; - cachep->c_freep = slabp; - slabp->s_prevp = nextp->s_prevp; + slabp->s_prevp = prevp; nextp->s_prevp = slabp; slabp->s_prevp->s_nextp = slabp; } -/* Cal the num objs, wastage, and bytes left over for a given slab size */ -static int -kmem_cache_cal_waste(unsigned long gfporder, unsigned long size, - unsigned long extra, unsigned long flags, - unsigned long *left_over, unsigned long *num) +/* Destroy all the objs in a slab, and release the mem back to the system. + * Before calling the slab must have been unlinked from the cache. + * The cache-lock is not held/needed. + */ +static void +kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp) { - unsigned long wastage; + if (cachep->c_dtor +#if SLAB_DEBUG_SUPPORT + || cachep->c_flags & (SLAB_POISION || SLAB_RED_ZONE) +#endif /*SLAB_DEBUG_SUPPORT*/ + ) { + /* Doesn't use the bufctl ptrs to find objs. 
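Because free and active objects are interleaved on a slab, the object walk in kmem_slab_destroy() cannot follow freelist pointers; it steps through the slab with a fixed stride instead. A sketch of that stride (illustrative, red-zone words omitted, written as if inside mm/slab.c):

/*
 * On-slab layout, repeated c_num times from s_mem:
 *
 *   | object (c_offset bytes) | kmem_bufctl_t | object | bufctl | ...
 */
static inline size_t example_obj_stride(kmem_cache_t *cachep, kmem_slab_t *slabp)
{
	size_t stride = cachep->c_offset;

	if (!slabp->s_index)		/* bufctls are on-slab */
		stride += sizeof(kmem_bufctl_t);
	return stride;
}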
*/ + unsigned long num = cachep->c_num; + void *objp = slabp->s_mem; + do { +#if SLAB_DEBUG_SUPPORT + if (cachep->c_flags & SLAB_RED_ZONE) { + if (*((unsigned long*)(objp)) != SLAB_RED_MAGIC1) + printk(KERN_ERR "kmem_slab_destroy: " + "Bad front redzone - %s\n", + cachep->c_name); + objp += BYTES_PER_WORD; + if (*((unsigned long*)(objp+cachep->c_org_size)) != + SLAB_RED_MAGIC1) + printk(KERN_ERR "kmem_slab_destroy: " + "Bad rear redzone - %s\n", + cachep->c_name); + } + if (cachep->c_dtor) +#endif /*SLAB_DEBUG_SUPPORT*/ + (cachep->c_dtor)(objp, cachep, 0); +#if SLAB_DEBUG_SUPPORT + else if (cachep->c_flags & SLAB_POISION) { + if (kmem_check_poision_obj(cachep, objp)) + printk(KERN_ERR "kmem_slab_destory: " + "Bad poision - %s\n", cachep->c_name); + } + if (cachep->c_flags & SLAB_RED_ZONE) + objp -= BYTES_PER_WORD; +#endif /* SLAB_DEBUG_SUPPORT */ + objp += cachep->c_offset; + if (!slabp->s_index) + objp += sizeof(kmem_bufctl_t); + } while (--num); + } - wastage = PAGE_SIZE << gfporder; - gfporder = 0; - if (!SLAB_OFF_SLAB(flags)) - gfporder = sizeof(kmem_slab_t); + slabp->s_magic = SLAB_MAGIC_DESTROYED; + kmem_freepages(cachep, slabp->s_mem-slabp->s_offset); + if (slabp->s_index) + kmem_cache_free(cachep->c_index_cachep, slabp->s_index); + if (SLAB_OFF_SLAB(cachep->c_flags)) + kmem_cache_free(cache_slabp, slabp); +} + +/* Cal the num objs, wastage, and bytes left over for a given slab size. */ +static inline size_t +kmem_cache_cal_waste(unsigned long gfporder, size_t size, size_t extra, + unsigned long flags, size_t *left_over, unsigned long *num) +{ + size_t wastage = PAGE_SIZE<<gfporder; + + if (SLAB_OFF_SLAB(flags)) + gfporder = 0; + else + gfporder = slab_align_size; wastage -= gfporder; *num = wastage / size; wastage -= (*num * size); *left_over = wastage; - wastage += (extra * *num); - wastage += gfporder; - - return wastage; + return (wastage + gfporder + (extra * *num)); } -/* Create a cache +/* Create a cache: * Returns a ptr to the cache on success, NULL on failure. * Cannot be called within a int, but can be interrupted. * NOTE: The 'name' is assumed to be memory that is _not_ going to disappear. */ kmem_cache_t * -kmem_cache_create(const char *name, unsigned long size, unsigned long align, - unsigned long flags, void (*ctor)(void*, int, unsigned long), - void (*dtor)(void*, int, unsigned long)) +kmem_cache_create(const char *name, size_t size, size_t offset, + unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long), + void (*dtor)(void*, kmem_cache_t *, unsigned long)) { - const char *func_nm="kmem_create: "; - kmem_cache_t *searchp, *cachep; - unsigned long words, i; - unsigned long num, left_over; + const char *func_nm= KERN_ERR "kmem_create: "; + kmem_cache_t *searchp; + kmem_cache_t *cachep=NULL; + size_t extra; + size_t left_over; + size_t align; - /* sanity checks */ -#if defined(SLAB_MGMT_CHECKS) + /* Sanity checks... 
*/ +#if SLAB_MGMT_CHECKS if (!name) { - printk(KERN_ERR "%sNULL ptr\n", func_nm); - return NULL; + printk("%sNULL ptr\n", func_nm); + goto opps; } if (in_interrupt()) { - printk(KERN_ERR "%sCalled during int - %s\n", func_nm, name); - return NULL; + printk("%sCalled during int - %s\n", func_nm, name); + goto opps; } - if (size < kmem_bufctl_very_short_size) { - printk(KERN_WARNING "%sSize too small %lu - %s\n", func_nm, size, name); - size = kmem_bufctl_very_short_size; + if (size < BYTES_PER_WORD) { + printk("%sSize too small %d - %s\n", func_nm, (int) size, name); + size = BYTES_PER_WORD; } if (size > ((1<<SLAB_OBJ_MAX_ORDER)*PAGE_SIZE)) { - printk(KERN_ERR "%sSize too large %lu - %s\n", func_nm, size, name); - return NULL; - } -#endif /* SLAB_MGMT_CHECKS */ - - /* always checks flags, a caller might be expecting debug support which - * isn't available - */ - if (flags & ~SLAB_C_MASK) { - /* Illegal flags */ - printk(KERN_WARNING "%sIllgl flg %lX - %s\n", func_nm, flags, name); - flags &= SLAB_C_MASK; + printk("%sSize too large %d - %s\n", func_nm, (int) size, name); + goto opps; } -#if defined(SLAB_MGMT_CHECKS) - if (align < 0 || align >= size) { - printk(KERN_WARNING "%sAlign weired %lu - %s\n", func_nm, align, name); - align = 0; + if (dtor && !ctor) { + /* Decon, but no con - doesn't make sense */ + printk("%sDecon but no con - %s\n", func_nm, name); + goto opps; } - if (dtor && !ctor) { - /* Descon, but no con - doesn't make sense */ - printk(KERN_ERR "%sDecon but no con - %s\n", func_nm, name); - return NULL; + if (offset < 0 || offset > size) { + printk("%sOffset weired %d - %s\n", func_nm, (int) offset, name); + offset = 0; } +#if SLAB_DEBUG_SUPPORT if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { /* No constructor, but inital state check requested */ - printk(KERN_WARNING "%sNo con, but init state check requested - %s\n", - func_nm, name); + printk("%sNo con, but init state check requested - %s\n", func_nm, name); flags &= ~SLAB_DEBUG_INITIAL; } + + if ((flags & SLAB_POISION) && ctor) { + /* request for poisioning, but we can't do that with a constructor */ + printk("%sPoisioning requested, but con given - %s\n", func_nm, name); + flags &= ~SLAB_POISION; + } +#if 0 + if ((flags & SLAB_HIGH_PACK) && ctor) { + printk("%sHigh pack requested, but con given - %s\n", func_nm, name); + flags &= ~SLAB_HIGH_PACK; + } + if ((flags & SLAB_HIGH_PACK) && (flags & (SLAB_POISION|SLAB_RED_ZONE))) { + printk("%sHigh pack requested, but with poisioning/red-zoning - %s\n", + func_nm, name); + flags &= ~SLAB_HIGH_PACK; + } +#endif +#endif /* SLAB_DEBUG_SUPPORT */ #endif /* SLAB_MGMT_CHECKS */ - /* get cache's description obj */ + /* Always checks flags, a caller might be expecting debug + * support which isn't available. + */ + if (flags & ~SLAB_C_MASK) { + printk("%sIllgl flg %lX - %s\n", func_nm, flags, name); + flags &= SLAB_C_MASK; + } + + /* Get cache's description obj. */ cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL); if (!cachep) goto opps; + memset(cachep, 0, sizeof(kmem_cache_t)); - /* remember original size, so can be passed to a constructor or decon. - * Allows the same con/decon to be used for caches of similar objs - * that have a different size data buffer assoicated with them + /* Check that size is in terms of words. This is needed to avoid + * unaligned accesses for some archs when redzoning is used, and makes + * sure any on-slab bufctl's are also correctly aligned. 
*/ - cachep->c_org_size = size; + if (size & (BYTES_PER_WORD-1)) { + size += (BYTES_PER_WORD-1); + size &= ~(BYTES_PER_WORD-1); + printk("%sForcing size word alignment - %s\n", func_nm, name); + } -#if defined(SLAB_DEBUG_SUPPORT) - if (flags & SLAB_RED_ZONE) - size += BYTES_PER_WORD; /* word for redzone */ +#if SLAB_DEBUG_SUPPORT + if (flags & SLAB_RED_ZONE) { + /* There is no point trying to honour cache alignment when redzoning. */ + flags &= ~SLAB_HWCACHE_ALIGN; + size += 2*BYTES_PER_WORD; /* words for redzone */ + } #endif /* SLAB_DEBUG_SUPPORT */ + cachep->c_org_size = size; - /* Make a guess if slab mngmnt obj and/or bufctls are 'on' or 'off' slab */ - i = kmem_bufctl_short_size; + align = BYTES_PER_WORD; + if (flags & SLAB_HWCACHE_ALIGN) + align = L1_CACHE_BYTES; + + /* Determine if the slab mgmt and/or bufclts are 'on' or 'off' slab. */ + extra = sizeof(kmem_bufctl_t); if (size < (PAGE_SIZE>>3)) { - /* Size is small(ish). Use format where bufctl size per - * obj is low, and slab mngmnt is on-slab + /* Size is small(ish). Use packing where bufctl size per + * obj is low, and slab mngmnt is on-slab. */ - if (!ctor && !dtor && !(flags & SLAB_RED_ZONE)) { - /* the objs in this cache have no state - can store - * store freelist ptr within obj. (redzoning is a state) +#if 0 + if ((flags & SLAB_HIGH_PACK)) { + /* Special high packing for small objects + * (mainly for vm_mapping structs, but + * others can use it). */ -#if defined(SLAB_HIGH_PACK) - i=0; - flags |= SLAB_CFLGS_PTR_IN_OBJ; -#else - i = kmem_bufctl_very_short_size; -#endif + if (size == (L1_CACHE_BYTES/4) || size == (L1_CACHE_BYTES/2) || + size == L1_CACHE_BYTES) { + /* The bufctl is stored with the object. */ + extra = 0; + } else + flags &= ~SLAB_HIGH_PACK; } +#endif } else { /* Size is large, assume best to place the slab mngmnt obj - * off-slab (should allow better packing of objs) + * off-slab (should allow better packing of objs). */ flags |= SLAB_CFLGS_OFF_SLAB; - if (!(size & ~PAGE_MASK) || - size == (PAGE_SIZE+PAGE_SIZE/2) || - size == (PAGE_SIZE/2) || - size == (PAGE_SIZE/4) || - size == (PAGE_SIZE/8)) { - /* to avoid waste the bufctls are off-slab */ + if (!(size & ~PAGE_MASK) || size == (PAGE_SIZE/2) + || size == (PAGE_SIZE/4) || size == (PAGE_SIZE/8)) { + /* To avoid waste the bufctls are off-slab... */ flags |= SLAB_CFLGS_BUFCTL; - /* get hash table for cache */ - cachep->c_hashp = kmem_cache_alloc(&cache_hash, SLAB_KERNEL); - if (cachep->c_hashp == NULL) { - kmem_cache_free(&cache_cache, cachep); - goto opps; - } - i = 0; - cachep->c_hashbits = PAGE_SHIFT; - if (size <= (PAGE_SIZE/2)) { - cachep->c_hashbits--; - if (size <= (PAGE_SIZE/4)) cachep->c_hashbits--; - if (size <= (PAGE_SIZE/8)) cachep->c_hashbits -= 2; + extra = 0; + } /* else slab mngmnt is off-slab, but freelist ptrs are on. */ + } + size += extra; + + if (flags & SLAB_HWCACHE_ALIGN) { + /* Need to adjust size so that objs are cache aligned. */ + if (size > (L1_CACHE_BYTES/2)) { + size_t words = size % L1_CACHE_BYTES; + if (words) + size += (L1_CACHE_BYTES-words); + } else { + /* Small obj size, can get at least two per cache line. */ + int num_per_line = L1_CACHE_BYTES/size; + left_over = L1_CACHE_BYTES - (num_per_line*size); + if (left_over) { + /* Need to adjust size so objs cache align. */ + if (left_over%num_per_line) { + /* Odd num of objs per line - fixup. 
*/ + num_per_line--; + left_over += size; + } + size += (left_over/num_per_line); } - } /* else slab mngmnt is off-slab, but freelist ptrs are on */ + } + } else if (!(size%L1_CACHE_BYTES)) { + /* Size happens to cache align... */ + flags |= SLAB_HWCACHE_ALIGN; + align = L1_CACHE_BYTES; } - size += i; - - /* Adjust the mem used for objs so they will align correctly. - * Force objs to start on word boundaries, but caller may specify - * h/w cache line boundaries. This 'alignment' is slightly different - * to the 'align' argument. Objs may be requested to start on h/w - * lines (as that is how the members of the obj have been organised), - * but the 'align' may be quite high (say 64) as the first 64 bytes - * are commonly accessed/modified within a loop (stops h/w line - * thrashing). The 'align' is the slab colouring. - */ - words = BYTES_PER_WORD; - if (flags & SLAB_HWCACHE_ALIGN) - words = L1_CACHE_BYTES; - words--; - size += words; - size = size & ~words; - /* alignment might not be a factor of the boundary alignment - fix-up */ - align += words; - align = align & ~words; - /* Cal size (in pages) of slabs, and the num of objs per slab. - * This could be made much more intelligent. */ - cachep->c_gfporder=0; + * This could be made much more intelligent. For now, try to avoid + * using high page-orders for slabs. When the gfp() funcs are more + * friendly towards high-order requests, this should be changed. + */ do { - unsigned long wastage; - wastage = kmem_cache_cal_waste(cachep->c_gfporder, size, i, - flags, &left_over, &num); - if (!num) + size_t wastage; + unsigned int break_flag = 0; +cal_wastage: + wastage = kmem_cache_cal_waste(cachep->c_gfporder, size, extra, + flags, &left_over, &cachep->c_num); + if (!cachep->c_num) goto next; - if (SLAB_PTR_IN_OBJ(flags)) + if (break_flag) break; + if (SLAB_BUFCTL(flags) && cachep->c_num > bufctl_limit) { + /* Oops, this num of objs will cause problems. */ + cachep->c_gfporder--; + break_flag++; + goto cal_wastage; + } if (cachep->c_gfporder == SLAB_MAX_GFP_ORDER) break; - /* large num of objs is good, but v. large slabs are bad for the - * VM sub-system + + /* Large num of objs is good, but v. large slabs are currently + * bad for the gfp()s. */ - if (num <= SLAB_MIN_OBJS_PER_SLAB) { + if (cachep->c_num <= SLAB_MIN_OBJS_PER_SLAB) { if (cachep->c_gfporder < SLAB_BREAK_GFP_ORDER) goto next; } - /* stop caches with small objs having a large num of pages */ - if (left_over <= sizeof(kmem_slab_t)) + + /* Stop caches with small objs having a large num of pages. */ + if (left_over <= slab_align_size) break; if ((wastage*8) <= (PAGE_SIZE<<cachep->c_gfporder)) - break; /* acceptable wastage */ + break; /* Acceptable internal fragmentation. */ next: cachep->c_gfporder++; } while (1); - cachep->c_num = num; - /* try with requested alignment, but reduce it if that will - * allow at least some alignment words + /* If the slab has been placed off-slab, and we have enough space then + * move it on-slab. This is at the expense of any extra colouring. 
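To make the colouring being computed here concrete, a worked example with made-up but representative numbers (not part of the patch):

/*
 * Suppose a slab ends up with left_over = 96 bytes once the objects and
 * any on-slab mgmt have been packed, and the colour granularity settles
 * at 32 bytes.  Then:
 *
 *     c_colour      = 96 / 32 = 3
 *     c_colour_next = 3
 *
 * and kmem_cache_grow() starts the first object of successive slabs at
 * offsets 3*32, 2*32, 1*32, 0 and then cycles, so the same object index
 * in different slabs lands on different L1 cache lines.
 */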
*/ - words++; - if (left_over < align) - align = (left_over / words) * words; - else if (!align && words <= left_over) { - /* no alignment given, but space enough - give one */ - align = words; - if (words == BYTES_PER_WORD) { - if (BYTES_PER_WORD*4 <= left_over) - align += align; - if (BYTES_PER_WORD*8 <= left_over) - align += align; + if ((flags & SLAB_CFLGS_OFF_SLAB) && !SLAB_BUFCTL(flags) && + left_over >= slab_align_size) { + flags &= ~SLAB_CFLGS_OFF_SLAB; + left_over -= slab_align_size; + } + + /* Offset must be a factor of the alignment. */ + offset += (align-1); + offset &= ~(align-1); + + /* Mess around with the offset alignment. */ + if (!left_over) { + offset = 0; + } else if (left_over < offset) { + offset = align; + if (flags & SLAB_HWCACHE_ALIGN) { + if (left_over < offset) + offset = 0; + } else { + /* Offset is BYTES_PER_WORD, and left_over is at + * least BYTES_PER_WORD. + */ + if (left_over >= (BYTES_PER_WORD*2)) { + offset >>= 1; + if (left_over >= (BYTES_PER_WORD*4)) + offset >>= 1; + } + } + } else if (!offset) { + /* No offset requested, but space enough - give one. */ + offset = left_over/align; + if (flags & SLAB_HWCACHE_ALIGN) { + if (offset >= 8) { + /* A large number of colours - use a larger alignment. */ + align <<= 1; + } + } else { + if (offset >= 10) { + align <<= 1; + if (offset >= 16) + align <<= 1; + } } + offset = align; } - cachep->c_align = align; #if 0 - printk("Size:%lu Orig:%lu Left:%lu Align %lu Pages:%d - %s\n", - size, cachep->c_org_size, left_over, align, 1<<cachep->c_gfporder, name); - if (SLAB_OFF_SLAB(flags)) printk("OFF SLAB\n"); - if (SLAB_BUFCTL(flags)) printk("BUFCTL PTRS\n"); +printk("%s: Left_over:%d Align:%d Size:%d\n", name, left_over, offset, size); #endif - /* if the bufctl's are on-slab, c_offset does not inc the size of the bufctl */ + if ((cachep->c_align = (unsigned long) offset)) + cachep->c_colour = (left_over/offset); + cachep->c_colour_next = cachep->c_colour; + + /* If the bufctl's are on-slab, c_offset does not include the size of bufctl. */ if (!SLAB_BUFCTL(flags)) - size -= kmem_bufctl_short_size; + size -= sizeof(kmem_bufctl_t); + else + cachep->c_index_cachep = + kmem_find_general_cachep(cachep->c_num*sizeof(kmem_bufctl_t)); + cachep->c_offset = (unsigned long) size; cachep->c_freep = kmem_slab_end(cachep); - cachep->c_flags = flags; - cachep->c_offset = size; cachep->c_firstp = kmem_slab_end(cachep); cachep->c_lastp = kmem_slab_end(cachep); + cachep->c_flags = flags; cachep->c_ctor = ctor; cachep->c_dtor = dtor; cachep->c_magic = SLAB_C_MAGIC; - cachep->c_inuse = 0; /* always zero */ - cachep->c_name = name; /* simply point to the name */ + cachep->c_name = name; /* Simply point to the name. */ + spin_lock_init(&cachep->c_spinlock); - cachep->c_colour = 1; - if (align) - cachep->c_colour += (left_over/align); - cachep->c_colour_next = cachep->c_colour; - - /* warn on dup cache names */ + /* Need the semaphore to access the chain. */ + down(&cache_chain_sem); searchp = &cache_cache; do { + /* The name field is constant - no lock needed. */ if (!strcmp(searchp->c_name, name)) { - printk(KERN_WARNING "%sDup name - %s\n", func_nm, name); + printk("%sDup name - %s\n", func_nm, name); break; } searchp = searchp->c_nextp; } while (searchp != &cache_cache); + + /* There is no reason to lock our new cache before we + * link it in - no one knows about it yet... 
+ */ cachep->c_nextp = cache_cache.c_nextp; cache_cache.c_nextp = cachep; - return cachep; + up(&cache_chain_sem); opps: - printk(KERN_WARNING "%sOut of mem creating cache %s\n", func_nm, name); - return NULL; -} - -/* Destroy all the objs in a slab, and release the mem back to the system. - * Before calling the slab must have been unlinked - */ -static void -kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp, unsigned long flags) -{ - if (cachep->c_dtor || SLAB_BUFCTL(cachep->c_flags)) { - kmem_bufctl_t *bufp = slabp->s_freep; - - /* for each obj in slab... */ - while (bufp) { - kmem_bufctl_t *freep; - if (cachep->c_dtor) { - void *objp = ((void*)bufp)-cachep->c_offset; - if (SLAB_BUFCTL(cachep->c_flags)) - objp = bufp->buf_objp; - (cachep->c_dtor)(objp, cachep->c_org_size, flags); - } - freep = bufp; - bufp = bufp->buf_nextp; - if (SLAB_BUFCTL(cachep->c_flags)) - kmem_cache_free(&cache_bufctl, freep); - } - } - - slabp->s_magic = SLAB_MAGIC_UNALLOC; - kmem_freepages(cachep, slabp->s_mem); - if (SLAB_OFF_SLAB(cachep->c_flags)) - kmem_cache_free(&cache_slab, slabp); -} - -/* Destroy (remove) a cache. - * All objs in the cache should be inactive - */ -int -kmem_cache_destroy(kmem_cache_t *cachep) -{ - kmem_cache_t **searchp; - kmem_slab_t *slabp; - unsigned long save_flags; - -#if defined(SLAB_MGMT_CHECKS) - if (!cachep) { - printk(KERN_ERR "kmem_dest: NULL ptr\n"); - goto err_end; - } - - if (in_interrupt()) { - printk(KERN_ERR "kmem_dest: Called during int - %s\n", cachep->c_name); -err_end: - return 1; - } -#endif /* SLAB_MGMT_CHECKS */ - - /* unlink the cache from the chain of active caches. - * Note: the chain is never modified during an int - */ - searchp = &(cache_cache.c_nextp); - for (;*searchp != &cache_cache; searchp = &((*searchp)->c_nextp)) { - if (*searchp != cachep) - continue; - goto good_cache; - } - printk(KERN_ERR "kmem_dest: Invalid cache addr %p\n", cachep); - return 1; -good_cache: - /* disable cache so attempts to allocated from an int can - * be caught. - */ - save_flags(save_flags); - cli(); - if (cachep->c_freep != kmem_slab_end(cachep)) { - restore_flags(save_flags); - printk(KERN_ERR "kmem_dest: active cache - %s\n", cachep->c_name); - return 2; - } - *searchp = cachep->c_nextp; /* remove from cache chain */ - cachep->c_flags |= SLAB_CFLGS_RELEASED; - cachep->c_freep = kmem_slab_end(cachep); - if (cachep == clock_searchp) - clock_searchp = cachep->c_nextp; - restore_flags(save_flags); - - while ((slabp = cachep->c_firstp) != kmem_slab_end(cachep)) { - kmem_slab_unlink(slabp); - kmem_slab_destroy(cachep, slabp, 0); - } - - if (SLAB_BUFCTL(cachep->c_flags)) - kmem_cache_free(&cache_hash, cachep->c_hashp); - kmem_cache_free(&cache_cache, cachep); - return 0; + return cachep; } -/* Shrink a cache, ie. remove _all_ inactive slabs. - * Can be called when a user of a cache knows they are not going to be - * needing any new objs for a while. - * NOTE: This func is probably going to disappear - let me know if you - * are using it! +/* Shrink a cache. Releases as many slabs as possible for a cache. + * It is expected this function will be called by a module when it is + * unloaded. The cache is _not_ removed, this creates too many problems and + * the cache-structure does not take up much room. A module should keep its + * cache pointer(s) in unloaded memory, so when reloaded it knows the cache + * is available. To help debugging, a zero exit status indicates all slabs + * were released. 
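A minimal usage sketch of the interface as it now stands; note the constructor/destructor prototype takes the kmem_cache_t rather than the raw object size. The cache name, struct and helpers below are purely illustrative, not part of the patch:

#include <linux/kernel.h>
#include <linux/slab.h>

struct foo {
	int state;
};

static kmem_cache_t *foo_cachep;

static void foo_ctor(void *objp, kmem_cache_t *cachep, unsigned long flags)
{
	struct foo *fp = objp;

	/* Must be threaded, and must not allocate from foo_cachep itself. */
	fp->state = 0;
}

static int foo_init(void)
{
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN, foo_ctor, NULL);
	return foo_cachep ? 0 : 1;
}

static void foo_use(void)
{
	struct foo *fp = kmem_cache_alloc(foo_cachep, SLAB_KERNEL);

	if (fp)
		kmem_cache_free(foo_cachep, fp);
}

static void foo_exit(void)
{
	/* Caches cannot be destroyed; an unloading module only empties its cache. */
	if (kmem_cache_shrink(foo_cachep))
		printk(KERN_ERR "foo: cache could not be fully shrunk\n");
}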
*/ int -kmem_cache_shrink(kmem_cache_t *cachep, int wait) +kmem_cache_shrink(kmem_cache_t *cachep) { + kmem_cache_t *searchp; kmem_slab_t *slabp; - unsigned long dtor_flags; - unsigned long save_flags, num_freed=0; + int ret; -#if defined(SLAB_MGMT_CHECKS) if (!cachep) { printk(KERN_ERR "kmem_shrink: NULL ptr\n"); - goto end; + return 2; } - if (in_interrupt()) { printk(KERN_ERR "kmem_shrink: Called during int - %s\n", cachep->c_name); - goto end; + return 2; } -#endif /* SLAB_MGMT_CHECKS */ - dtor_flags = 0; - if (!wait) /* not allowed to wait */ - dtor_flags = SLAB_DTOR_ATOMIC; + /* Find the cache in the chain of caches. */ + down(&cache_chain_sem); /* Semaphore is needed. */ + searchp = &cache_cache; + for (;searchp->c_nextp != &cache_cache; searchp = searchp->c_nextp) { + if (searchp->c_nextp != cachep) + continue; - save_flags(save_flags); - while (0) { - cli(); - slabp = cachep->c_lastp; - if (slabp == kmem_slab_end(cachep) || slabp->s_inuse) { - restore_flags(save_flags); - goto end; - } - kmem_slab_unlink(slabp); - if (cachep->c_freep == slabp) - cachep->c_freep = kmem_slab_end(cachep); - restore_flags(save_flags); - num_freed++; - kmem_slab_destroy(cachep, slabp, dtor_flags); + /* Accessing clock_searchp is safe - we hold the mutex. */ + if (cachep == clock_searchp) + clock_searchp = cachep->c_nextp; + goto found; } -end: - return num_freed; -} - -/* Search for a slab whose objs are suitable for DMA. - * Note: since testing the first free slab (in __kmem_cache_alloc()), - * ints must not have been enabled! - */ -static inline kmem_slab_t * -kmem_cache_search_dma(kmem_cache_t *cachep) -{ - kmem_slab_t *slabp = cachep->c_freep->s_nextp; + up(&cache_chain_sem); + printk(KERN_ERR "kmem_shrink: Invalid cache addr %p\n", cachep); + return 2; +found: + /* Relase the sempahore before getting the cache-lock. This could + * mean multiple engines are shrinking the cache, but so what... + */ + up(&cache_chain_sem); + spin_lock_irq(&cachep->c_spinlock); - for (; slabp != kmem_slab_end(cachep); slabp = slabp->s_nextp) { - if (!(slabp->s_flags & SLAB_SFLGS_DMA)) - continue; + /* If the cache is growing, stop shrinking. */ + while (!cachep->c_growing) { + slabp = cachep->c_lastp; + if (slabp->s_inuse || slabp == kmem_slab_end(cachep)) + break; kmem_slab_unlink(slabp); - kmem_slab_link_free(cachep, slabp); - return slabp; + spin_unlock_irq(&cachep->c_spinlock); + kmem_slab_destroy(cachep, slabp); + spin_lock_irq(&cachep->c_spinlock); } - return NULL; + ret = 1; + if (cachep->c_lastp == kmem_slab_end(cachep)) + ret--; /* Cache is empty. */ + spin_unlock_irq(&cachep->c_spinlock); + return ret; } -/* get the mem for a slab mgmt obj */ +/* Get the mem for a slab mgmt obj. */ static inline kmem_slab_t * -kmem_cache_slabmgmt(kmem_cache_t *cachep, void *objp, unsigned long local_flags, unsigned long offset) +kmem_cache_slabmgmt(kmem_cache_t *cachep, void *objp, int local_flags) { kmem_slab_t *slabp; if (SLAB_OFF_SLAB(cachep->c_flags)) { - /* slab mngmnt obj is off-slab */ - if (!(slabp = kmem_cache_alloc(&cache_slab, local_flags))) - return NULL; + /* Slab mgmt obj is off-slab. */ + slabp = kmem_cache_alloc(cache_slabp, local_flags); } else { - /* slab mngmnt at end of slab mem */ - slabp = objp + (PAGE_SIZE << cachep->c_gfporder); - slabp--; - if (!SLAB_PTR_IN_OBJ(cachep->c_flags)) { - /* A bit of extra help for the L1 cache; try to position the slab - * mgmnt struct at different offsets within the gap at the end - * of a slab. 
This helps avoid thrashing the h/w cache lines, - * that map to the end of a page, too much... - */ - unsigned long gap = cachep->c_offset; - if (!SLAB_BUFCTL(cachep->c_flags)) - gap += kmem_bufctl_short_size; - gap = (PAGE_SIZE << cachep->c_gfporder)-((gap*cachep->c_num)+offset+sizeof(*slabp)); - gap /= (sizeof(*slabp)/2); - gap *= (sizeof(*slabp)/2); - slabp = (((void*)slabp)-gap); - } + /* Slab mgmnt at end of slab mem, placed so that + * the position is 'coloured'. + */ + void *end; + end = objp + (cachep->c_num * cachep->c_offset); + if (!SLAB_BUFCTL(cachep->c_flags)) + end += (cachep->c_num * sizeof(kmem_bufctl_t)); + slabp = (kmem_slab_t *) L1_CACHE_ALIGN((unsigned long)end); } - slabp->s_flags = slabp->s_inuse = slabp->s_jiffies = 0; + if (slabp) { + slabp->s_inuse = 0; + slabp->s_dma = 0; + slabp->s_index = NULL; + } return slabp; } -static inline int -kmem_cache_init_objs(kmem_cache_t *cachep, kmem_slab_t *slabp, void *objp, - unsigned long local_flags, unsigned long ctor_flags) +static inline void +kmem_cache_init_objs(kmem_cache_t * cachep, kmem_slab_t * slabp, void *objp, + unsigned long ctor_flags) { kmem_bufctl_t **bufpp = &slabp->s_freep; - unsigned long num = cachep->c_num; + unsigned long num = cachep->c_num-1; do { - if (SLAB_BUFCTL(cachep->c_flags)) { - if (!(*bufpp = kmem_cache_alloc(&cache_bufctl, local_flags))) { - kmem_slab_destroy(cachep, slabp, 0); - return 1; - } - (*bufpp)->buf_objp = objp; - (*bufpp)->buf_hashp = &cachep->c_hashp[kmem_hash(cachep, objp)]; +#if SLAB_DEBUG_SUPPORT + if (cachep->c_flags & SLAB_RED_ZONE) { + *((unsigned long*)(objp)) = SLAB_RED_MAGIC1; + objp += BYTES_PER_WORD; + *((unsigned long*)(objp+cachep->c_org_size)) = SLAB_RED_MAGIC1; } +#endif /* SLAB_DEBUG_SUPPORT */ + /* Constructors are not allowed to allocate memory from the same cache + * which they are a constructor for. Otherwise, deadlock. + * They must also be threaded. + */ if (cachep->c_ctor) - cachep->c_ctor(objp, cachep->c_org_size, ctor_flags); + cachep->c_ctor(objp, cachep, ctor_flags); +#if SLAB_DEBUG_SUPPORT + else if (cachep->c_flags & SLAB_POISION) { + /* need to poision the objs */ + kmem_poision_obj(cachep, objp); + } -#if defined(SLAB_DEBUG_SUPPORT) - if (cachep->c_flags & SLAB_RED_ZONE) - *((unsigned long*)(objp+cachep->c_org_size)) = SLAB_RED_MAGIC1; + if (cachep->c_flags & SLAB_RED_ZONE) { + if (*((unsigned long*)(objp+cachep->c_org_size)) != + SLAB_RED_MAGIC1) { + *((unsigned long*)(objp+cachep->c_org_size)) = + SLAB_RED_MAGIC1; + printk(KERN_ERR "kmem_init_obj: Bad rear redzone " + "after constructor - %s\n", cachep->c_name); + } + objp -= BYTES_PER_WORD; + if (*((unsigned long*)(objp)) != SLAB_RED_MAGIC1) { + *((unsigned long*)(objp)) = SLAB_RED_MAGIC1; + printk(KERN_ERR "kmem_init_obj: Bad front redzone " + "after constructor - %s\n", cachep->c_name); + } + } #endif /* SLAB_DEBUG_SUPPORT */ objp += cachep->c_offset; - if (!SLAB_BUFCTL(cachep->c_flags)) { + if (!slabp->s_index) { *bufpp = objp; - objp += kmem_bufctl_short_size; - } - if (!SLAB_PTR_IN_OBJ(cachep->c_flags)) - (*bufpp)->buf_slabp = slabp; + objp += sizeof(kmem_bufctl_t); + } else + *bufpp = &slabp->s_index[num]; bufpp = &(*bufpp)->buf_nextp; - } while (--num); + } while (num--); + *bufpp = NULL; - return 0; } -/* Grow (by 1) the number of slabs within a cache. - * This is called by kmem_cache_alloc() when there are no - * inactive objs left in a cache +/* Grow (by 1) the number of slabs within a cache. This is called by + * kmem_cache_alloc() when there are no active objs left in a cache. 
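With SLAB_DEBUG_SUPPORT and SLAB_RED_ZONE, kmem_cache_init_objs() above brackets every object with a magic word on either side and the allocator hands out the address just past the front word. A sketch of the layout and the check (illustrative, written as if inside mm/slab.c):

/*
 * Red-zoned object layout:
 *
 *   | red-zone word | client object (c_org_size bytes) | red-zone word |
 *                   ^ pointer returned to the caller
 *
 * SLAB_RED_MAGIC1 marks an inactive obj, SLAB_RED_MAGIC2 an active one;
 * alloc and free flip the words and complain when the expected value is
 * missing.
 */
static int example_obj_is_inactive(kmem_cache_t *cachep, void *objp)
{
	unsigned long *front = (unsigned long *)(objp - BYTES_PER_WORD);
	unsigned long *rear  = (unsigned long *)(objp + cachep->c_org_size);

	return *front == SLAB_RED_MAGIC1 && *rear == SLAB_RED_MAGIC1;
}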
*/ -static void -kmem_cache_grow(kmem_cache_t *cachep, unsigned long flags) +static int +kmem_cache_grow(kmem_cache_t * cachep, int flags) { kmem_slab_t *slabp; + struct page *page; void *objp; - unsigned int offset, dma; - unsigned long ctor_flags, local_flags, save_flags; + size_t offset; + unsigned int dma, local_flags; + unsigned long ctor_flags; + unsigned long save_flags; + + /* Be lazy and only check for valid flags here, + * keeping it out of the critical path in kmem_cache_alloc(). + */ + if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) { + printk(KERN_WARNING "kmem_grow: Illegal flgs %X (correcting) - %s\n", + flags, cachep->c_name); + flags &= (SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW); + } if (flags & SLAB_NO_GROW) - return; /* caller doesn't want us to grow */ + return 0; - save_flags(save_flags); /* The test for missing atomic flag is performed here, rather than * the more obvious place, simply to reduce the critical path length - * in kmem_cache_alloc(). If a caller is slightly mis-behaving, - * will eventually be caught here (where it matters) + * in kmem_cache_alloc(). If a caller is slightly mis-behaving they + * will eventually be caught here (where it matters). */ if (in_interrupt() && (flags & SLAB_LEVEL_MASK) != SLAB_ATOMIC) { - static int count = 0; - if (count < 8) { - printk(KERN_ERR "kmem_grow: Called nonatomically from " - "int - %s\n", cachep->c_name); - count++; - } + printk(KERN_ERR "kmem_grow: Called nonatomically from int - %s\n", + cachep->c_name); flags &= ~SLAB_LEVEL_MASK; flags |= SLAB_ATOMIC; } - local_flags = (flags & SLAB_LEVEL_MASK); ctor_flags = SLAB_CTOR_CONSTRUCTOR; - if ((flags & SLAB_LEVEL_MASK) == SLAB_ATOMIC) { - /* Not allowed to sleep. - * Need to tell a constructor about this - it - * might need to know.... + local_flags = (flags & SLAB_LEVEL_MASK); + if (local_flags == SLAB_ATOMIC) { + /* Not allowed to sleep. Need to tell a constructor about + * this - it might need to know... */ ctor_flags |= SLAB_CTOR_ATOMIC; } - slabp = NULL; - /* get mem for the objs */ - if (!(objp = kmem_getpages(cachep, flags, &dma))) - goto opps1; + /* About to mess with non-constant members - lock. */ + spin_lock_irqsave(&cachep->c_spinlock, save_flags); - /* get colour for the slab, and cal the next value */ - cli(); - if (!(offset = --(cachep->c_colour_next))) + /* Get colour for the slab, and cal the next value. */ + if (!(offset = cachep->c_colour_next--)) cachep->c_colour_next = cachep->c_colour; - restore_flags(save_flags); offset *= cachep->c_align; + cachep->c_dflags = SLAB_CFLGS_GROWN; + + cachep->c_growing++; +re_try: + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + + /* A series of memory allocations for a new slab. + * Neither the cache-chain semaphore, or cache-lock, are + * held, but the incrementing c_growing prevents this + * this cache from being reaped or shrunk. + * Note: The cache could be selected in for reaping in + * kmem_cache_reap(), but when the final test is made the + * growing value will be seen. + */ + + /* Get mem for the objs. */ + if (!(objp = kmem_getpages(cachep, flags, &dma))) + goto failed; - /* get slab mgmt */ - if (!(slabp = kmem_cache_slabmgmt(cachep, objp, local_flags, offset))) - goto opps2; + /* Get slab mgmt. 
*/ + if (!(slabp = kmem_cache_slabmgmt(cachep, objp+offset, local_flags))) + goto opps1; if (dma) - slabp->s_flags = SLAB_SFLGS_DMA; - + slabp->s_dma = 1; + if (SLAB_BUFCTL(cachep->c_flags)) { + slabp->s_index = kmem_cache_alloc(cachep->c_index_cachep, local_flags); + if (!slabp->s_index) + goto opps2; + } + + /* Nasty!!!!!! I hope this is OK. */ + dma = 1 << cachep->c_gfporder; + page = &mem_map[MAP_NR(objp)]; + do { + SLAB_SET_PAGE_CACHE(page, cachep); + SLAB_SET_PAGE_SLAB(page, slabp); + PageSetSlab(page); + page++; + } while (--dma); + + slabp->s_offset = offset; /* It will fit... */ + objp += offset; /* Address of first object. */ slabp->s_mem = objp; - objp += offset; /* address of first object */ /* For on-slab bufctls, c_offset is the distance between the start of * an obj and its related bufctl. For off-slab bufctls, c_offset is * the distance between objs in the slab. - * Reason for bufctl at end of obj (when on slab), as opposed to the front; - * if stored within the obj (has no state), and the obj is 'used' after being - * freed then (normally) most activity occurs at the beginning of the obj. - * By keeping the bufctl ptr away from the front, should reduce the chance of - * corruption. Also, allows easier alignment of objs onto cache lines when - * bufctl is not stored with the objs. - * Downsize; if, while an obj is active, a write is made past its end, then the - * bufctl will be corrupted :( */ - if (kmem_cache_init_objs(cachep, slabp, objp, local_flags, ctor_flags)) - goto no_objs; + kmem_cache_init_objs(cachep, slabp, objp, ctor_flags); + + spin_lock_irq(&cachep->c_spinlock); - cli(); - /* make slab active */ + /* Make slab active. */ slabp->s_magic = SLAB_MAGIC_ALLOC; kmem_slab_link_end(cachep, slabp); if (cachep->c_freep == kmem_slab_end(cachep)) cachep->c_freep = slabp; - restore_flags(save_flags); - return; -no_objs: - kmem_freepages(cachep, slabp->s_mem); + SLAB_STATS_INC_GROWN(cachep); + cachep->c_failures = 0; + cachep->c_growing--; + + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + return 1; opps2: - kmem_freepages(cachep, objp); + if (SLAB_OFF_SLAB(cachep->c_flags)) + kmem_cache_free(cache_slabp, slabp); opps1: - if (slabp && SLAB_OFF_SLAB(cachep->c_flags)) - kmem_cache_free(&cache_slab, slabp); - /* printk("kmem_alloc: Out of mem - %s\n", cachep->c_name); */ - return; + kmem_freepages(cachep, objp); +failed: + if (local_flags != SLAB_ATOMIC && cachep->c_gfporder) { + /* For large order (>0) slabs, we try again. + * Needed because the gfp() functions are not good at giving + * out contigious pages unless pushed (but do not push too hard). + */ + spin_lock_irq(&cachep->c_spinlock); + if (cachep->c_failures++ < 4 && cachep->c_freep == kmem_slab_end(cachep)) + goto re_try; + cachep->c_failures = 1; /* Memory is low, don't try as hard next time. */ + cachep->c_growing--; + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + } + return 0; +} + +static void +kmem_report_alloc_err(const char *str, kmem_cache_t * cachep) +{ + if (cachep) + SLAB_STATS_INC_ERR(cachep); /* this is atomic */ + printk(KERN_ERR "kmem_alloc: %s (name=%s)\n", + str, cachep ? cachep->c_name : "unknown"); +} + +static void +kmem_report_free_err(const char *str, void *objp, kmem_cache_t * cachep) +{ + if (cachep) + SLAB_STATS_INC_ERR(cachep); + printk(KERN_ERR "kmem_free: %s (objp=%p, name=%s)\n", + str, objp, cachep ? cachep->c_name : "unknown"); +} + +/* Search for a slab whose objs are suitable for DMA. 
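Callers that need DMA-capable memory simply pass SLAB_DMA; the allocator then prefers slabs whose pages were verified as DMAable (s_dma, set by kmem_getpages()), and otherwise grows a new slab from DMA pages. A caller-side sketch, illustrative only, reusing the hypothetical foo_cachep from the earlier usage sketch:

#include <linux/slab.h>

static kmem_cache_t *foo_cachep;	/* created as in the earlier usage sketch */

static void *foo_alloc_dma_buffer(void)
{
	/* May sleep; from interrupt context SLAB_ATOMIC would be needed instead. */
	return kmem_cache_alloc(foo_cachep, SLAB_KERNEL | SLAB_DMA);
}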
+ * Note: since testing the first free slab (in __kmem_cache_alloc()), + * ints must not have been enabled, or the cache-lock released! + */ +static inline kmem_slab_t * +kmem_cache_search_dma(kmem_cache_t * cachep) +{ + kmem_slab_t *slabp = cachep->c_freep->s_nextp; + + for (; slabp != kmem_slab_end(cachep); slabp = slabp->s_nextp) { + if (!(slabp->s_dma)) + continue; + kmem_slab_unlink(slabp); + kmem_slab_link_free(cachep, slabp); + cachep->c_freep = slabp; + break; + } + return slabp; } -#if defined(SLAB_DEBUG_SUPPORT) -/* Perform extra freeing checks. - * Currently, this check is only for caches that use bufctl structures - * within the slab. Those which use bufctl's from the internal cache - * have a reasonable check when the address is searched for. +#if SLAB_DEBUG_SUPPORT +/* Perform extra freeing checks. Currently, this check is only for caches + * that use bufctl structures within the slab. Those which use bufctl's + * from the internal cache have a reasonable check when the address is + * searched for. Called with the cache-lock held. */ static void * -kmem_extra_free_checks(const kmem_cache_t *cachep, kmem_bufctl_t *search_bufp, - const kmem_bufctl_t *bufp, void * objp) +kmem_extra_free_checks(kmem_cache_t * cachep, kmem_bufctl_t *search_bufp, + kmem_bufctl_t *bufp, void * objp) { if (SLAB_BUFCTL(cachep->c_flags)) - goto end; + return objp; - /* check slab's freelist to see if this obj is there */ + /* Check slab's freelist to see if this obj is there. */ for (; search_bufp; search_bufp = search_bufp->buf_nextp) { if (search_bufp != bufp) continue; - printk(KERN_ERR "kmem_free: Double free detected during checking " - "%p - %s\n", objp, cachep->c_name); return NULL; } -end: return objp; } #endif /* SLAB_DEBUG_SUPPORT */ +/* Called with cache lock held. */ static inline void kmem_cache_full_free(kmem_cache_t *cachep, kmem_slab_t *slabp) { - if (!slabp->s_nextp->s_inuse) - return; /* at correct position */ - slabp->s_jiffies = jiffies; /* set release time */ - if (cachep->c_freep == slabp) - cachep->c_freep = slabp->s_nextp; - kmem_slab_unlink(slabp); - kmem_slab_link_end(cachep, slabp); - - return; + if (slabp->s_nextp->s_inuse) { + /* Not at correct position. */ + if (cachep->c_freep == slabp) + cachep->c_freep = slabp->s_nextp; + kmem_slab_unlink(slabp); + kmem_slab_link_end(cachep, slabp); + } } +/* Called with cache lock held. */ static inline void kmem_cache_one_free(kmem_cache_t *cachep, kmem_slab_t *slabp) { - if (slabp->s_nextp->s_inuse != cachep->c_num) { - cachep->c_freep = slabp; - return; + if (slabp->s_nextp->s_inuse == cachep->c_num) { + kmem_slab_unlink(slabp); + kmem_slab_link_free(cachep, slabp); } - kmem_slab_unlink(slabp); - kmem_slab_link_free(cachep, slabp); - return; + cachep->c_freep = slabp; } -/* Returns a ptr to an obj in the given cache. - * The obj is in the initial state (if there is one) - */ +/* Returns a ptr to an obj in the given cache. */ static inline void * -__kmem_cache_alloc(kmem_cache_t *cachep, unsigned long flags) +__kmem_cache_alloc(kmem_cache_t *cachep, int flags) { kmem_slab_t *slabp; kmem_bufctl_t *bufp; void *objp; unsigned long save_flags; - /* sanity check */ + /* Sanity check. */ if (!cachep) goto nul_ptr; - save_flags(save_flags); - cli(); - /* get slab alloc is to come from */ + spin_lock_irqsave(&cachep->c_spinlock, save_flags); +try_again: + /* Get slab alloc is to come from. 
*/ slabp = cachep->c_freep; - /* magic is a sanity check _and_ says if we need a new slab */ + /* Magic is a sanity check _and_ says if we need a new slab. */ if (slabp->s_magic != SLAB_MAGIC_ALLOC) goto alloc_new_slab; -try_again: - /* DMA allocations are 'rare' - keep out of critical path */ + /* DMA requests are 'rare' - keep out of the critical path. */ if (flags & SLAB_DMA) goto search_dma; try_again_dma: + SLAB_STATS_INC_ALLOCED(cachep); + SLAB_STATS_INC_ACTIVE(cachep); + SLAB_STATS_SET_HIGH(cachep); slabp->s_inuse++; bufp = slabp->s_freep; slabp->s_freep = bufp->buf_nextp; - if (!SLAB_BUFCTL(cachep->c_flags)) { - /* Nasty - we want the 'if' to be taken in the common case */ - if (slabp->s_freep) { -short_finished: + if (slabp->s_freep) { +ret_obj: + if (!slabp->s_index) { + bufp->buf_slabp = slabp; objp = ((void*)bufp) - cachep->c_offset; - restore_flags(save_flags); -#if defined(SLAB_DEBUG_SUPPORT) +finished: + /* The lock is not needed by the red-zone or poision ops, and the + * obj has been removed from the slab. Should be safe to drop + * the lock here. + */ + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); +#if SLAB_DEBUG_SUPPORT if (cachep->c_flags & SLAB_RED_ZONE) goto red_zone; +ret_red: + if ((cachep->c_flags & SLAB_POISION) && kmem_check_poision_obj(cachep, objp)) + kmem_report_alloc_err("Bad poision", cachep); #endif /* SLAB_DEBUG_SUPPORT */ return objp; - } else { - cachep->c_freep = slabp->s_nextp; - goto short_finished; } + /* Update index ptr. */ + objp = ((bufp-slabp->s_index)*cachep->c_offset) + slabp->s_mem; + bufp->buf_objp = objp; + goto finished; } + cachep->c_freep = slabp->s_nextp; + goto ret_obj; - if (!slabp->s_freep) - cachep->c_freep = slabp->s_nextp; - - /* link into hash chain */ - objp = kmem_add_to_hash(cachep, bufp); - restore_flags(save_flags); -#if defined(SLAB_DEBUG_SUPPORT) - if (!(cachep->c_flags & SLAB_RED_ZONE)) -#endif /* SLAB_DEBUG_SUPPORT */ - return objp; - -#if defined(SLAB_DEBUG_SUPPORT) +#if SLAB_DEBUG_SUPPORT red_zone: - /* set alloc red-zone, and check old one */ + /* Set alloc red-zone, and check old one. */ + if (xchg((unsigned long *)objp, SLAB_RED_MAGIC2) != SLAB_RED_MAGIC1) + kmem_report_alloc_err("Bad front redzone", cachep); + objp += BYTES_PER_WORD; if (xchg((unsigned long *)(objp+cachep->c_org_size), SLAB_RED_MAGIC2) != SLAB_RED_MAGIC1) - printk(KERN_ERR "kmem_alloc: Bad redzone %p - %s\n", - objp, cachep->c_name); - return objp; + kmem_report_alloc_err("Bad rear redzone", cachep); + goto ret_red; #endif /* SLAB_DEBUG_SUPPORT */ search_dma: - if (slabp->s_flags & SLAB_SFLGS_DMA) - goto try_again_dma; - /* need to search... */ - if ((slabp = kmem_cache_search_dma(cachep))) + if (slabp->s_dma || (slabp = kmem_cache_search_dma(cachep))!=kmem_slab_end(cachep)) goto try_again_dma; alloc_new_slab: - /* Either out of slabs, or magic number corruption */ - if (slabp != kmem_slab_end(cachep)) - goto bad_slab; - /* need a new slab */ - restore_flags(save_flags); - if (SLAB_RELEASED(cachep->c_flags)) { - printk(KERN_ERR "kmem_alloc: destroyed cache\n"); - goto end; - } - - /* Be lazy and only check for valid flags - * here (keeping it out of the critical path above) - */ - if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) { - printk(KERN_ERR "kmem_alloc: Illegal flgs %lX (correcting) - %s\n", - flags, cachep->c_name); - flags &= (SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW); + /* Either out of slabs, or magic number corruption. */ + if (slabp == kmem_slab_end(cachep)) { + /* Need a new slab. 
Release the lock before calling kmem_cache_grow(). + * This allows objs to be released back into the cache while growing. + */ + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + if (kmem_cache_grow(cachep, flags)) { + /* Someone may have stolen our objs. Doesn't matter, we'll + * just come back here again. + */ + goto try_again; + } + /* Couldn't grow, but some objs may have been freed. */ + spin_lock_irq(&cachep->c_spinlock); + if (cachep->c_freep != kmem_slab_end(cachep)) + goto try_again; + } else { + /* Very serious error - maybe panic() here? */ + kmem_report_alloc_err("Bad slab magic (corrupt)", cachep); } - - kmem_cache_grow(cachep, flags); - cli(); - if ((slabp=cachep->c_freep) != kmem_slab_end(cachep)) - goto try_again; - restore_flags(save_flags); -end: + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); +err_exit: return NULL; -bad_slab: - /* v. serious error - maybe panic() here? */ - printk(KERN_ERR "kmem_alloc: Bad slab magic (corruption) - %s\n", - cachep->c_name); - goto end; nul_ptr: - printk(KERN_ERR "kmem_alloc: NULL ptr\n"); - goto end; + kmem_report_alloc_err("NULL ptr", NULL); + goto err_exit; } -/* Release an obj back to its cache. - * If the obj has a constructed state, it should be - * in this state _before_ it is released. +/* Release an obj back to its cache. If the obj has a constructed state, + * it should be in this state _before_ it is released. */ static inline void __kmem_cache_free(kmem_cache_t *cachep, void *objp) @@ -1199,128 +1461,137 @@ __kmem_cache_free(kmem_cache_t *cachep, void *objp) kmem_bufctl_t *bufp; unsigned long save_flags; - /* basic sanity checks */ - if (!cachep) - goto nul_cache; - if (!objp) - goto nul_obj; + /* Basic sanity checks. */ + if (!cachep || !objp) + goto null_addr; - save_flags(save_flags); -#if defined(SLAB_DEBUG_SUPPORT) +#if SLAB_DEBUG_SUPPORT + if (cachep->c_flags & SLAB_RED_ZONE) + objp -= BYTES_PER_WORD; +#endif /* SLAB_DEBUG_SUPPORT */ + + +#if SLAB_DEBUG_SUPPORT + /* A verify func is called without the cache-lock held. */ if (cachep->c_flags & SLAB_DEBUG_INITIAL) goto init_state_check; finished_initial: #endif /* SLAB_DEBUG_SUPPORT */ + spin_lock_irqsave(&cachep->c_spinlock, save_flags); + if (SLAB_BUFCTL(cachep->c_flags)) goto bufctl; - bufp = (kmem_bufctl_t *)(objp+cachep->c_offset); - /* get slab for the obj */ - if (SLAB_PTR_IN_OBJ(cachep->c_flags)) { - /* if SLAB_HIGH_PACK is undef, the below is optimised away */ - slabp = (kmem_slab_t *)((((unsigned long)objp)&PAGE_MASK)+PAGE_SIZE); - slabp--; - } else - slabp = (kmem_slab_t *) bufp->buf_slabp; + /* Get slab for the object. */ +#if 0 + /* _NASTY_IF/ELSE_, but avoids a 'distant' memory ref for some objects. + * Is this worth while? XXX + */ + if (cachep->c_flags & SLAB_HIGH_PACK) + slabp = SLAB_GET_PAGE_SLAB(&mem_map[MAP_NR(bufp)]); + else +#endif + slabp = bufp->buf_slabp; - if (slabp->s_magic != SLAB_MAGIC_ALLOC) /* sanity check */ - goto bad_obj; - cli(); +check_magic: + if (slabp->s_magic != SLAB_MAGIC_ALLOC) /* Sanity check. */ + goto bad_slab; -#if defined(SLAB_DEBUG_SUPPORT) - if (cachep->c_flags & (SLAB_DEBUG_FREE|SLAB_RED_ZONE)) +#if SLAB_DEBUG_SUPPORT + if (cachep->c_flags & SLAB_DEBUG_FREE) goto extra_checks; +passed_extra: #endif /* SLAB_DEBUG_SUPPORT */ -passed_extra: - if (!slabp->s_inuse) /* sanity check */ - goto too_many; - bufp->buf_nextp = slabp->s_freep; - slabp->s_freep = bufp; - if (--(slabp->s_inuse)) { - if (bufp->buf_nextp) { - restore_flags(save_flags); - return; + if (slabp->s_inuse) { /* Sanity check. 
*/ + SLAB_STATS_DEC_ACTIVE(cachep); + slabp->s_inuse--; + bufp->buf_nextp = slabp->s_freep; + slabp->s_freep = bufp; + if (slabp->s_inuse) { + if (bufp->buf_nextp) { + /* (hopefully) The most common case. */ +finished: +#if SLAB_DEBUG_SUPPORT + /* Need to poision the obj while holding the lock. */ + if (cachep->c_flags & SLAB_POISION) + kmem_poision_obj(cachep, objp); + if (cachep->c_flags & SLAB_RED_ZONE) + goto red_zone; +return_red: +#endif /* SLAB_DEBUG_SUPPORT */ + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + return; + } + kmem_cache_one_free(cachep, slabp); + goto finished; } - kmem_cache_one_free(cachep, slabp); - restore_flags(save_flags); - return; + kmem_cache_full_free(cachep, slabp); + goto finished; } - kmem_cache_full_free(cachep, slabp); - restore_flags(save_flags); + + /* Don't add to freelist. */ + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + kmem_report_free_err("free with no active objs", objp, cachep); return; bufctl: - /* Off-slab bufctls. Need to search hash for bufctl, and hence the slab. - * No 'extra' checks are performed for objs stored this way, finding - * the obj a check enough + /* No 'extra' checks are performed for objs stored this way, finding + * the obj is check enough. */ - cli(); - if ((bufp = kmem_remove_from_hash(cachep, objp))) { - slabp = (kmem_slab_t *) bufp->buf_slabp; -#if defined(SLAB_DEBUG_SUPPORT) - if (cachep->c_flags & SLAB_RED_ZONE) - goto red_zone; -#endif /* SLAB_DEBUG_SUPPORT */ - goto passed_extra; - } - restore_flags(save_flags); - printk(KERN_ERR "kmem_free: Either bad obj addr or double free: %p - %s\n", - objp, cachep->c_name); + slabp = SLAB_GET_PAGE_SLAB(&mem_map[MAP_NR(objp)]); + bufp = &slabp->s_index[(objp - slabp->s_mem)/cachep->c_offset]; + if (bufp->buf_objp == objp) + goto check_magic; + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + kmem_report_free_err("Either bad obj addr or double free", objp, cachep); return; -#if defined(SLAB_DEBUG_SUPPORT) -red_zone: - if (xchg((unsigned long *)(objp+cachep->c_org_size), SLAB_RED_MAGIC1) != SLAB_RED_MAGIC2) { - /* Either write past end of the object, or a double free */ - printk(KERN_ERR "kmem_free: Bad redzone %p - %s\n", - objp, cachep->c_name); - } - goto passed_extra; +#if SLAB_DEBUG_SUPPORT init_state_check: - /* Need to call the slab's constructor so that - * the caller can perform a verify of its state (debugging) + /* Need to call the slab's constructor so the + * caller can perform a verify of its state (debugging). */ - cachep->c_ctor(objp, cachep->c_org_size, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); + cachep->c_ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); goto finished_initial; extra_checks: - if ((cachep->c_flags & SLAB_DEBUG_FREE) && - (objp != kmem_extra_free_checks(cachep, slabp->s_freep, bufp, objp))) { - restore_flags(save_flags); + if (!kmem_extra_free_checks(cachep, slabp->s_freep, bufp, objp)) { + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + kmem_report_free_err("Double free detected during checks", objp, cachep); return; } - if (cachep->c_flags & SLAB_RED_ZONE) - goto red_zone; goto passed_extra; -#endif /* SLAB_DEBUG_SUPPORT */ -bad_obj: - /* The addr of the slab doesn't contain the correct - * magic num +red_zone: + /* We hold the cache-lock while checking the red-zone, just incase + * some tries to take this obj from us... 
*/ - if (slabp->s_magic == SLAB_MAGIC_UNALLOC) { - /* magic num says this is an unalloc slab */ - printk(KERN_ERR "kmem_free: obj %p from destroyed slab - %s\n", - objp, cachep->c_name); - return; + if (xchg((unsigned long *)objp, SLAB_RED_MAGIC1) != SLAB_RED_MAGIC2) { + /* Either write before start of obj, or a double free. */ + kmem_report_free_err("Bad front redzone", objp, cachep); } - printk(KERN_ERR "kmem_free: Bad obj %p - %s\n", objp, cachep->c_name); - return; -too_many: - /* don't add to freelist */ - restore_flags(save_flags); - printk(KERN_ERR "kmem_free: obj free for slab with no active objs - %s\n", - cachep->c_name); - return; -nul_obj: - printk(KERN_ERR "kmem_free: NULL obj - %s\n", cachep->c_name); + objp += BYTES_PER_WORD; + if (xchg((unsigned long *)(objp+cachep->c_org_size), SLAB_RED_MAGIC1) != SLAB_RED_MAGIC2) { + /* Either write past end of obj, or a double free. */ + kmem_report_free_err("Bad rear redzone", objp, cachep); + } + goto return_red; +#endif /* SLAB_DEBUG_SUPPORT */ +bad_slab: + /* Slab doesn't contain the correct magic num. */ + if (slabp->s_magic == SLAB_MAGIC_DESTROYED) { + /* Magic num says this is a destroyed slab. */ + kmem_report_free_err("free from inactive slab", objp, cachep); + } else + kmem_report_free_err("Bad obj addr", objp, cachep); + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); return; -nul_cache: - printk(KERN_ERR "kmem_free: NULL cache ptr\n"); +null_addr: + kmem_report_free_err("NULL ptr", objp, cachep); return; } void * -kmem_cache_alloc(kmem_cache_t *cachep, unsigned long flags) +kmem_cache_alloc(kmem_cache_t *cachep, int flags) { return __kmem_cache_alloc(cachep, flags); } @@ -1332,163 +1603,249 @@ kmem_cache_free(kmem_cache_t *cachep, void *objp) } void * -kmem_alloc(unsigned long size, unsigned long flags) +kmalloc(size_t size, int flags) { - cache_sizes_t *cachep = cache_sizes; + cache_sizes_t *csizep = cache_sizes; - for (; cachep->cs_size; cachep++) { - if (size > cachep->cs_size) + for (; csizep->cs_size; csizep++) { + if (size > csizep->cs_size) continue; - /* should the inline version be used here? */ - return kmem_cache_alloc(cachep->cs_cachep, flags); + return __kmem_cache_alloc(csizep->cs_cachep, flags); } - printk(KERN_ERR "kmem_alloc: Size (%lu) too large\n", size); + printk(KERN_ERR "kmalloc: Size (%lu) too large\n", (unsigned long) size); return NULL; } void -kmem_free(void *objp, unsigned long size) +kfree(void *objp) { - cache_sizes_t *cachep = cache_sizes; + struct page *page; + int nr; - for (; cachep->cs_size; cachep++) { - if (size > cachep->cs_size) - continue; - /* should the inline version be used here? */ - kmem_cache_free(cachep->cs_cachep, objp); - return; + if (!objp) + goto null_ptr; + nr = MAP_NR(objp); + if (nr >= max_mapnr) + goto null_ptr; + + /* Assume we own the page structure - hence no locking. + * If someone is misbehaving (eg. someone calling us with a bad + * address), then access to the page structure can race with the + * kmem_slab_destory() code. Need to add a spin_lock to each page + * structure, which would be useful in threading the gfp() functions.... + */ + page = &mem_map[nr]; + if (PageSlab(page)) { + kmem_cache_t *cachep; + + /* Here, we (again) assume the obj address is good. + * If it isn't, and happens to map onto another + * general-cache page which has no active objs, then + * we race.... 
+ */ + cachep = SLAB_GET_PAGE_CACHE(page); + if (cachep && (cachep->c_flags & SLAB_CFLGS_GENERAL)) { + __kmem_cache_free(cachep, objp); + return; + } + } +null_ptr: + printk(KERN_ERR "kfree: Bad obj %p\n", objp); +while(1); + return; +} + +void +kfree_s(void *objp, size_t size) +{ + struct page *page; + int nr; + + if (!objp) + goto null_ptr; + nr = MAP_NR(objp); + if (nr >= max_mapnr) + goto null_ptr; + /* See comment in kfree() */ + page = &mem_map[nr]; + if (PageSlab(page)) { + kmem_cache_t *cachep; + /* See comment in kfree() */ + cachep = SLAB_GET_PAGE_CACHE(page); + if (cachep && cachep->c_flags & SLAB_CFLGS_GENERAL) { + if (size <= cachep->c_org_size) { /* XXX better check */ + __kmem_cache_free(cachep, objp); + return; + } + } } - printk(KERN_ERR "kmem_free: Size (%lu) too large - strange\n", size); +null_ptr: + printk(KERN_ERR "kfree_s: Bad obj %p\n", objp); + return; } +kmem_cache_t * +kmem_find_general_cachep(size_t size) +{ + cache_sizes_t *csizep = cache_sizes; + + /* This function could be moved to the header-file, and + * made inline so consumers can quickly determine what + * cache-ptr they require. + */ + for (; csizep->cs_size; csizep++) { + if (size > csizep->cs_size) + continue; + break; + } + return csizep->cs_cachep; +} /* Called from try_to_free_page(). - * Ideal solution would have a weight for each cache, based on; - * o num of fully free slabs - * o if the objs have a constructor/deconstructor - * o length of time slabs have been fully free (ie. ageing) * This function _cannot_ be called within a int, but it * can be interrupted. */ int kmem_cache_reap(int pri, int dma, int wait) { - unsigned long dtor_flags = 0; - unsigned long best_jiffie; - unsigned long now; - int count = 8; - kmem_slab_t *best_slabp = NULL; - kmem_cache_t *best_cachep = NULL; kmem_slab_t *slabp; kmem_cache_t *searchp; - unsigned long save_flags; + kmem_cache_t *best_cachep; + unsigned long scan; + unsigned long reap_level; - /* 'pri' maps to the number of caches to examine, not the number of slabs. - * This avoids only checking the jiffies for slabs in one cache at the - * expensive spending more cycles + if (in_interrupt()) { + printk("kmem_cache_reap() called within int!\n"); + return 0; + } + scan = 9-pri; + reap_level = pri >> 1; + + /* We really need a test semphore op so we can avoid sleeping when + * !wait is true. */ - pri = (9 - pri); - if (!wait) /* not allowed to wait */ - dtor_flags = SLAB_DTOR_ATOMIC; + down(&cache_chain_sem); + best_cachep = NULL; searchp = clock_searchp; - save_flags(save_flags); - now = jiffies; - best_jiffie = now - (2*HZ); /* 2secs - avoid heavy thrashing */ - while (pri--) { - kmem_slab_t *local_slabp; - unsigned long local_jiffie; - if (searchp == &cache_cache) + do { + unsigned long full_free; + /* It's safe to test this without holding the cache-lock. */ + if (searchp->c_flags & SLAB_NO_REAP) goto next; - - /* sanity check for corruption */ + spin_lock_irq(&searchp->c_spinlock); + if (searchp->c_growing) + goto next_unlock; + if (searchp->c_dflags & SLAB_CFLGS_GROWN) { + searchp->c_dflags &= ~SLAB_CFLGS_GROWN; + goto next_unlock; + } + /* Sanity check for corruption of static values. 
*/ if (searchp->c_inuse || searchp->c_magic != SLAB_C_MAGIC) { - printk(KERN_ERR "kmem_reap: Corrupted cache struct for %s\n", - searchp->c_name); + spin_unlock_irq(&searchp->c_spinlock); + printk(KERN_ERR "kmem_reap: Corrupted cache struct for %s\n", searchp->c_name); goto next; } + full_free = 0; - local_slabp = NULL; - local_jiffie = now - (2*HZ); - cli(); - /* As the fully free slabs, within a cache, have no particular - * order, we need to test them all. Infact, we only check 'count' - * slabs. + /* Count num of fully free slabs. Hopefully there are not many, + * we are holding the cache lock.... */ slabp = searchp->c_lastp; - for (;count && slabp != kmem_slab_end(searchp) && !slabp->s_inuse; slabp = slabp->s_prevp, count--) { - if (slabp->s_jiffies >= local_jiffie) - continue; + while (!slabp->s_inuse && slabp != kmem_slab_end(searchp)) { + slabp = slabp->s_prevp; + full_free++; + } + spin_unlock_irq(&searchp->c_spinlock); - /* weight caches with a con/decon */ - if ((searchp->c_ctor || searchp->c_dtor) && slabp->s_jiffies >= (local_jiffie - (2*HZ))) - continue; + if (full_free) { + if (full_free >= 10) { + best_cachep = searchp; + break; + } - /* weight caches with high page orders. Avoids stressing the - * VM sub-system by reducing the frequency requests for a large - * num of contigious pages + /* Try to avoid slabs with constructors and/or + * more than one page per slab (as it can be difficult + * to get high orders from gfp()). */ - if (searchp->c_gfporder > 1 && slabp->s_jiffies >= (local_jiffie - (4*HZ))) - continue; - - local_jiffie = slabp->s_jiffies; - local_slabp = slabp; - if (!searchp->c_gfporder && (now-local_jiffie) >= (300*HZ)) { - /* an old, one page slab. Make a quick get away... */ - pri = 0; - break; + if (pri == 6) { /* magic '6' from try_to_free_page() */ + if (searchp->c_ctor) + full_free--; + if (full_free && searchp->c_gfporder) + full_free--; } - } - if (local_slabp) { - if (!count || local_jiffie < best_jiffie) { - best_slabp = local_slabp; - best_jiffie = local_jiffie; + if (full_free >= reap_level) { + reap_level = full_free; best_cachep = searchp; - if (!count) - break; } } - restore_flags(save_flags); + goto next; +next_unlock: + spin_unlock_irq(&searchp->c_spinlock); next: searchp = searchp->c_nextp; - if (searchp == clock_searchp) - break; - count = 8; /* # of slabs at which we force a reap */ - } - - /* only move along with we didn't find an over allocated cache */ - if (count) - clock_searchp = clock_searchp->c_nextp; + } while (--scan && searchp != clock_searchp); - if (!best_slabp) - return 0; + clock_searchp = searchp; + up(&cache_chain_sem); - cli(); - if (best_slabp->s_inuse) { - /* an object in our selected slab has been - * allocated. This souldn't happen v. often, so we - * simply fail - which isn't ideal but will do. - * NOTE: No test for the case where an obj has been - * allocated from the slab, and then freed. While - * this would change our idea of the best slab to - * reap, it's not worth the re-calculation effort. 
- */
-	restore_flags(save_flags);
+	if (!best_cachep) {
+		/* couldn't find anything to reap */
 		return 0;
 	}
-	if (best_cachep->c_freep == best_slabp)
-		best_cachep->c_freep = best_slabp->s_nextp;
-	kmem_slab_unlink(best_slabp);
+	spin_lock_irq(&best_cachep->c_spinlock);
+	if (!best_cachep->c_growing && !(slabp = best_cachep->c_lastp)->s_inuse && slabp != kmem_slab_end(best_cachep)) {
+		if (slabp == best_cachep->c_freep)
+			best_cachep->c_freep = kmem_slab_end(best_cachep);
+		kmem_slab_unlink(slabp);
+		SLAB_STATS_INC_REAPED(best_cachep);
-	restore_flags(save_flags);
-	kmem_slab_destroy(best_cachep, best_slabp, dtor_flags);
+		/* Safe to drop the lock. The slab is no longer linked to the
+		 * cache.
+		 */
+		spin_unlock_irq(&best_cachep->c_spinlock);
+		kmem_slab_destroy(best_cachep, slabp);
+		return 1;
+	}
+	spin_unlock_irq(&best_cachep->c_spinlock);
+	return 0;
+}
-	return 1;
+#if SLAB_SELFTEST
+/* A few very simple tests */
+static void
+kmem_self_test(void)
+{
+	kmem_cache_t *test_cachep;
+
+	printk(KERN_INFO "kmem_test() - start\n");
+	test_cachep = kmem_cache_create("test-cachep", 16, 0, SLAB_RED_ZONE|SLAB_POISION, NULL, NULL);
+	if (test_cachep) {
+		char *objp = kmem_cache_alloc(test_cachep, SLAB_KERNEL);
+		if (objp) {
+			/* Write in front and past end, red-zone test. */
+			*(objp-1) = 1;
+			*(objp+16) = 1;
+			kmem_cache_free(test_cachep, objp);
+
+			/* Mess up poisoning. */
+			*objp = 10;
+			objp = kmem_cache_alloc(test_cachep, SLAB_KERNEL);
+			kmem_cache_free(test_cachep, objp);
+
+			/* Mess up poisoning (again). */
+			*objp = 10;
+			kmem_cache_shrink(test_cachep);
+		}
+	}
+	printk(KERN_INFO "kmem_test() - finished\n");
+}
+#endif	/* SLAB_SELFTEST */
+#if defined(CONFIG_PROC_FS)
 /* /proc/slabinfo
- *	cache-name num-active-objs total-objs num-active-slabs total-slabs num-pages-per-slab
+ *	cache-name num-active-objs total-objs num-active-slabs total-slabs num-pages-per-slab
  */
 int
 get_slabinfo(char *buf)
@@ -1496,31 +1853,62 @@ get_slabinfo(char *buf)
 	kmem_cache_t	*cachep;
 	kmem_slab_t	*slabp;
 	unsigned long	active_objs;
-	unsigned long	num_slabs, active_slabs;
 	unsigned long	save_flags;
+	unsigned long	num_slabs;
+	unsigned long	num_objs;
 	int		len=0;
+#if SLAB_STATS
+	unsigned long	active_slabs;
+#endif	/* SLAB_STATS */
-	/* output format version, so at least we can change it without _too_
-	 * many complaints
+	__save_flags(save_flags);
+
+	/* Output format version, so at least we can change it without _too_
+	 * many complaints.
*/ +#if SLAB_STATS + len = sprintf(buf, "slabinfo - version: 1.0 (statistics)\n"); +#else len = sprintf(buf, "slabinfo - version: 1.0\n"); - save_flags(save_flags); +#endif /* SLAB_STATS */ + down(&cache_chain_sem); cachep = &cache_cache; do { - active_slabs = num_slabs = active_objs = 0; - cli(); - for (slabp = cachep->c_firstp; - slabp != kmem_slab_end(cachep); - slabp = slabp->s_nextp) { - num_slabs++; +#if SLAB_STATS + active_slabs = 0; +#endif /* SLAB_STATS */ + num_slabs = active_objs = 0; + spin_lock_irq(&cachep->c_spinlock); + for (slabp = cachep->c_firstp; slabp != kmem_slab_end(cachep); slabp = slabp->s_nextp) { active_objs += slabp->s_inuse; + num_slabs++; +#if SLAB_STATS if (slabp->s_inuse) active_slabs++; +#endif /* SLAB_STATS */ } - restore_flags(save_flags); - len += sprintf(buf+len, "%-20s%lu %lu %lu %lu %d\n", cachep->c_name, - active_objs, cachep->c_num*num_slabs, - active_slabs, num_slabs, 1<<cachep->c_gfporder); + num_objs = cachep->c_num*num_slabs; +#if SLAB_STATS + { + unsigned long errors; + unsigned long high = cachep->c_high_mark; + unsigned long grown = cachep->c_grown; + unsigned long reaped = cachep->c_reaped; + unsigned long allocs = cachep->c_num_allocations; + errors = (unsigned long) atomic_read(&cachep->c_errors); + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + len += sprintf(buf+len, "%-16s %6lu %6lu %4lu %4lu %4lu %6lu %7lu %5lu %4lu %4lu\n", + cachep->c_name, active_objs, num_objs, active_slabs, num_slabs, + (1<<cachep->c_gfporder)*num_slabs, + high, allocs, grown, reaped, errors); + } +#else + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + len += sprintf(buf+len, "%-17s %6lu %6lu\n", cachep->c_name, active_objs, num_objs); +#endif /* SLAB_STATS */ } while ((cachep = cachep->c_nextp) != &cache_cache); + up(&cache_chain_sem); + return len; } +#endif /* CONFIG_PROC_FS */ @@ -23,6 +23,7 @@ #include <linux/fs.h> #include <linux/swapctl.h> #include <linux/pagemap.h> +#include <linux/init.h> #include <asm/dma.h> #include <asm/system.h> /* for cli()/sti() */ @@ -67,7 +68,7 @@ swapstat_t swapstats = {0}; /* General swap control */ /* Parse the kernel command line "swap=" option at load time: */ -void swap_setup(char *str, int *ints) +__initfunc(void swap_setup(char *str, int *ints)) { int * swap_vars[8] = { &MAX_PAGE_AGE, @@ -87,7 +88,7 @@ void swap_setup(char *str, int *ints) } /* Parse the kernel command line "buff=" option at load time: */ -void buff_setup(char *str, int *ints) +__initfunc(void buff_setup(char *str, int *ints)) { int * buff_vars[6] = { &MAX_BUFF_AGE, diff --git a/mm/swap_state.c b/mm/swap_state.c index 044180721..f3ffa46d5 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -16,6 +16,7 @@ #include <linux/swap.h> #include <linux/fs.h> #include <linux/swapctl.h> +#include <linux/init.h> #include <asm/dma.h> #include <asm/system.h> /* for cli()/sti() */ @@ -69,8 +70,8 @@ int add_to_swap_cache(unsigned long index, unsigned long entry) return 0; } -unsigned long init_swap_cache(unsigned long mem_start, - unsigned long mem_end) +__initfunc(unsigned long init_swap_cache(unsigned long mem_start, + unsigned long mem_end)) { unsigned long swap_cache_size; diff --git a/mm/swapfile.c b/mm/swapfile.c index 91221a415..32a5ed8b0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -298,24 +298,25 @@ static int unuse_process(struct mm_struct * mm, unsigned int type, unsigned long */ static int try_to_unuse(unsigned int type) { - int nr; unsigned long page = get_free_page(GFP_KERNEL); + struct task_struct *p; if (!page) return -ENOMEM; 
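The try_to_unuse() rework that follows walks every task with for_each_task() under tasklist_lock, drops the lock around unuse_process() (which can block), and then restarts the whole walk, since nothing about the list can be trusted once the lock has been released. A minimal user-space sketch of that drop-lock-and-restart idiom; struct task, task_head and do_blocking_work() are illustrative stand-ins, not kernel code:

```c
#include <pthread.h>
#include <stdio.h>

struct task {
	int pid;
	int needs_work;
	struct task *next;
};

/* A tiny fixed "task list"; in the kernel this is the circular list
 * rooted at init_task that for_each_task() walks. */
static struct task tasks[3] = {
	{ 1, 1, &tasks[1] },
	{ 2, 0, &tasks[2] },
	{ 3, 1, NULL },
};
static struct task *task_head = &tasks[0];
static pthread_rwlock_t tasklist_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Stands in for unuse_process(): it may block, so it must run unlocked.
 * Returns non-zero when it did real work, meaning the list may have
 * changed and the caller should restart the walk from the head. */
static int do_blocking_work(struct task *p)
{
	if (!p->needs_work)
		return 0;
	p->needs_work = 0;
	printf("processed pid %d\n", p->pid);
	return 1;
}

int main(void)
{
	struct task *p;

again:
	pthread_rwlock_rdlock(&tasklist_lock);
	for (p = task_head; p; p = p->next) {
		pthread_rwlock_unlock(&tasklist_lock);
		if (do_blocking_work(p))
			goto again;	/* walk again from the head */
		pthread_rwlock_rdlock(&tasklist_lock);
	}
	pthread_rwlock_unlock(&tasklist_lock);
	return 0;
}
```

The restart is the price of dropping the lock around blocking work: once other writers may have touched the list, the saved iterator is worthless.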
- nr = 0; - while (nr < NR_TASKS) { - struct task_struct * p = task[nr]; - if (p) { - if (unuse_process(p->mm, type, page)) { - page = get_free_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - continue; - } +again: + read_lock(&tasklist_lock); + for_each_task(p) { + read_unlock(&tasklist_lock); + if(unuse_process(p->mm, type, page)) { + page = get_free_page(GFP_KERNEL); + if(!page) + return -ENOMEM; + goto again; } - nr++; + read_lock(&tasklist_lock); } + read_unlock(&tasklist_lock); + free_page(page); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index b25c0a0ac..d890be5df 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -271,54 +271,72 @@ static int swap_out_process(struct task_struct * p, int dma, int wait) static int swap_out(unsigned int priority, int dma, int wait) { - static int swap_task; - int loop, counter; + static int skip_factor = 0; + int limit = nr_tasks - 1; + int loop, counter, i; struct task_struct *p; counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority; + if(skip_factor > nr_tasks) + skip_factor = 0; + + read_lock(&tasklist_lock); + p = init_task.next_task; + i = skip_factor; + while(i--) + p = p->next_task; for(; counter >= 0; counter--) { - /* - * Check that swap_task is suitable for swapping. If not, look for - * the next suitable process. - */ + /* Check if task is suitable for swapping. */ loop = 0; while(1) { - if (swap_task >= NR_TASKS) { - swap_task = 1; + if(!--limit) { + limit = nr_tasks - 1; + /* See if all processes are unswappable or + * already swapped out. + */ if (loop) - /* all processes are unswappable or already swapped out */ - return 0; + goto out; loop = 1; } - - p = task[swap_task]; - if (p && p->swappable && p->mm->rss) + if (p->swappable && p->mm->rss) break; - - swap_task++; + if((p = p->next_task) == &init_task) + p = p->next_task; } + skip_factor++; - /* - * Determine the number of pages to swap from this process. - */ + /* Determine the number of pages to swap from this process. */ if (!p->swap_cnt) { - /* Normalise the number of pages swapped by + /* Normalise the number of pages swapped by multiplying by (RSS / 1MB) */ p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss); } if (!--p->swap_cnt) - swap_task++; + skip_factor++; + read_unlock(&tasklist_lock); + switch (swap_out_process(p, dma, wait)) { - case 0: - if (p->swap_cnt) - swap_task++; - break; - case 1: - return 1; - default: - break; - } + case 0: + if (p->swap_cnt) + skip_factor++; + break; + case 1: + return 1; + default: + break; + }; + + /* Whoever we swapped may not even exist now, in fact we cannot + * assume anything about the list we were searching previously. + */ + read_lock(&tasklist_lock); + p = init_task.next_task; + i = skip_factor; + while(i--) + p = p->next_task; } +out: + read_unlock(&tasklist_lock); return 0; } |
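The slab changes above lean on two debugging aids the old allocator lacked: red-zoning (SLAB_RED_ZONE) brackets each object with guard words that are xchg()'d between a "free" and an "allocated" magic on every transition, and poisoning (SLAB_POISION) fills freed objects with a known pattern that is verified on the next allocation. A stand-alone sketch of the red-zone idea, using made-up magic values and helper names rather than the kernel's:

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define RED_MAGIC_FREE   0x5A2CF071UL	/* guard value while the object is free (made up) */
#define RED_MAGIC_ALLOC  0x170FC2A5UL	/* guard value while the object is allocated (made up) */
#define OBJ_SIZE         16

/* Single-threaded stand-in for the kernel's xchg(). */
static unsigned long xchg_word(unsigned long *p, unsigned long v)
{
	unsigned long old = *p;
	*p = v;
	return old;
}

/* Layout: [front guard][OBJ_SIZE bytes][rear guard]. */
static void *redzone_alloc(void)
{
	unsigned char *raw = malloc(OBJ_SIZE + 2 * sizeof(unsigned long));
	unsigned long *front, *rear;

	if (!raw)
		abort();
	front = (unsigned long *)raw;
	rear = (unsigned long *)(raw + sizeof(unsigned long) + OBJ_SIZE);
	*front = RED_MAGIC_FREE;	/* as a previous free would have left them */
	*rear = RED_MAGIC_FREE;

	/* Allocation flips both guards and complains if they were not "free". */
	if (xchg_word(front, RED_MAGIC_ALLOC) != RED_MAGIC_FREE)
		fprintf(stderr, "bad front redzone at alloc\n");
	if (xchg_word(rear, RED_MAGIC_ALLOC) != RED_MAGIC_FREE)
		fprintf(stderr, "bad rear redzone at alloc\n");
	return raw + sizeof(unsigned long);
}

static void redzone_free(void *objp)
{
	unsigned char *raw = (unsigned char *)objp - sizeof(unsigned long);
	unsigned long *front = (unsigned long *)raw;
	unsigned long *rear = (unsigned long *)((unsigned char *)objp + OBJ_SIZE);

	/* Free flips the guards back; anything but the "allocated" magic means
	 * an overrun/underrun while the object was live, or a double free. */
	if (xchg_word(front, RED_MAGIC_FREE) != RED_MAGIC_ALLOC)
		fprintf(stderr, "bad front redzone at free\n");
	if (xchg_word(rear, RED_MAGIC_FREE) != RED_MAGIC_ALLOC)
		fprintf(stderr, "bad rear redzone at free\n");
	free(raw);
}

int main(void)
{
	char *obj = redzone_alloc();

	memset(obj, 0, OBJ_SIZE);
	obj[OBJ_SIZE] = 1;	/* one byte past the end: clobbers the rear guard */
	redzone_free(obj);	/* reports "bad rear redzone at free" */
	return 0;
}
```

The same pairing is exercised by kmem_self_test() in the diff, which deliberately writes one byte before and one byte past a 16-byte object to trip the guards, then scribbles on a freed object to trip the poison check.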