author | Ralf Baechle <ralf@linux-mips.org> | 1997-06-01 03:16:17 +0000
---|---|---
committer | Ralf Baechle <ralf@linux-mips.org> | 1997-06-01 03:16:17 +0000
commit | d8d9b8f76f22b7a16a83e261e64f89ee611f49df (patch)
tree | 3067bc130b80d52808e6390c9fc7fc087ec1e33c /mm
parent | 19c9bba94152148523ba0f7ef7cffe3d45656b11 (diff)
Initial revision
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Makefile | 2
-rw-r--r-- | mm/filemap.c | 4
-rw-r--r-- | mm/kmalloc.c | 453
-rw-r--r-- | mm/memory.c | 81
-rw-r--r-- | mm/mmap.c | 690
-rw-r--r-- | mm/page_alloc.c | 33
-rw-r--r-- | mm/page_io.c | 8
-rw-r--r-- | mm/slab.c | 2338
-rw-r--r-- | mm/swap.c | 5
-rw-r--r-- | mm/swap_state.c | 5
-rw-r--r-- | mm/swapfile.c | 25
-rw-r--r-- | mm/vmscan.c | 76
12 files changed, 1609 insertions, 2111 deletions
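
A change that recurs throughout the diff below is the switch from the old value-returning set_bit()/clear_bit()/change_bit() calls to the explicitly named test_and_set_bit()/test_and_clear_bit()/test_and_change_bit() variants (see the hunks in mm/filemap.c, mm/page_io.c and mm/page_alloc.c). The callers were already relying on getting the previous bit value back; only the name changes to make the test-and-set semantics explicit. The following is a minimal, non-atomic userspace sketch of those semantics, not the kernel implementation: the real primitives are per-architecture and atomic (<asm/bitops.h>), and the PG_locked/PG_referenced bit numbers used here are placeholders. It mimics the page-lock loop in generic_file_write() and the PG_referenced test in shrink_mmap().

/* Minimal userspace model of the test-and-set bit semantics the patch
 * relies on: each primitive returns the PREVIOUS value of the bit.
 * Illustrative only - the real kernel primitives are atomic, and the
 * bit numbers below are placeholders.
 */
#include <stdio.h>

static int test_and_set_bit(int nr, unsigned long *addr)
{
	unsigned long mask = 1UL << nr;
	int old = (*addr & mask) != 0;

	*addr |= mask;
	return old;
}

static int test_and_clear_bit(int nr, unsigned long *addr)
{
	unsigned long mask = 1UL << nr;
	int old = (*addr & mask) != 0;

	*addr &= ~mask;
	return old;
}

#define PG_locked	0
#define PG_referenced	1

int main(void)
{
	/* Pretend the page was referenced a while ago. */
	unsigned long flags = 1UL << PG_referenced;

	/* Lock acquisition as in generic_file_write(): keep waiting until
	 * the previous value of PG_locked was 0, i.e. we took the lock.
	 */
	while (test_and_set_bit(PG_locked, &flags))
		; /* wait_on_page(page) in the kernel */

	printf("PG_locked taken, flags = %#lx\n", flags);

	/* shrink_mmap(): don't free a page that was recently referenced,
	 * and clear the reference bit so it ages out next time round.
	 */
	if (test_and_clear_bit(PG_referenced, &flags))
		printf("page was recently referenced, not freed\n");
	else
		printf("page not referenced, candidate for freeing\n");
	return 0;
}

The same rename also shows up in the swap lockmap handling in mm/page_io.c and in the buddy-bitmap test in free_pages_ok() in mm/page_alloc.c (change_bit becoming test_and_change_bit).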
diff --git a/mm/Makefile b/mm/Makefile index 5f5156049..c64eefbd2 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -9,7 +9,7 @@ O_TARGET := mm.o O_OBJS := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ - kmalloc.o vmalloc.o slab.o \ + vmalloc.o slab.o \ swap.o vmscan.o page_io.o page_alloc.o swap_state.o swapfile.o include $(TOPDIR)/Rules.make diff --git a/mm/filemap.c b/mm/filemap.c index 6f58da546..88c2fd49d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -171,7 +171,7 @@ int shrink_mmap(int priority, int dma) switch (atomic_read(&page->count)) { case 1: /* If it has been referenced recently, don't free it */ - if (clear_bit(PG_referenced, &page->flags)) + if (test_and_clear_bit(PG_referenced, &page->flags)) break; /* is it a page cache page? */ @@ -1342,7 +1342,7 @@ generic_file_write(struct inode *inode, struct file *file, const char *buf, unsi } lockit: - while (set_bit(PG_locked, &page->flags)) + while (test_and_set_bit(PG_locked, &page->flags)) wait_on_page(page); /* diff --git a/mm/kmalloc.c b/mm/kmalloc.c deleted file mode 100644 index 9de1bff51..000000000 --- a/mm/kmalloc.c +++ /dev/null @@ -1,453 +0,0 @@ -/* - * linux/mm/kmalloc.c - * - * Copyright (C) 1991, 1992 Linus Torvalds & Roger Wolff. - * - * Written by R.E. Wolff Sept/Oct '93. - * - */ - -/* - * Modified by Alex Bligh (alex@cconcepts.co.uk) 4 Apr 1994 to use multiple - * pages. So for 'page' throughout, read 'area'. - * - * Largely rewritten.. Linus - */ - -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/interrupt.h> - -#include <asm/system.h> -#include <asm/dma.h> - -/* Define this if you want slow routines that try to trip errors */ -#undef SADISTIC_KMALLOC - -/* Private flags. */ - -#define MF_USED 0xffaa0055 -#define MF_DMA 0xff00aa55 -#define MF_FREE 0x0055ffaa - - -/* - * Much care has gone into making these routines in this file reentrant. - * - * The fancy bookkeeping of nbytesmalloced and the like are only used to - * report them to the user (oooohhhhh, aaaaahhhhh....) are not - * protected by cli(). (If that goes wrong. So what?) - * - * These routines restore the interrupt status to allow calling with ints - * off. - */ - -/* - * A block header. This is in front of every malloc-block, whether free or not. - */ -struct block_header { - unsigned long bh_flags; - union { - unsigned long ubh_length; - struct block_header *fbh_next; - } vp; -}; - - -#define bh_length vp.ubh_length -#define bh_next vp.fbh_next -#define BH(p) ((struct block_header *)(p)) - - -/* - * The page descriptor is at the front of every page that malloc has in use. - */ -struct page_descriptor { - struct page_descriptor *next; - struct block_header *firstfree; - int order; - int nfree; -}; - - -#define PAGE_DESC(p) ((struct page_descriptor *)(((unsigned long)(p)) & PAGE_MASK)) - - -/* - * A size descriptor describes a specific class of malloc sizes. - * Each class of sizes has its own freelist. - */ -struct size_descriptor { - struct page_descriptor *firstfree; - struct page_descriptor *dmafree; /* DMA-able memory */ - int nblocks; - - int nmallocs; - int nfrees; - int nbytesmalloced; - int npages; - unsigned long gfporder; /* number of pages in the area required */ -}; - -/* - * For now it is unsafe to allocate bucket sizes between n and - * n-sizeof(page_descriptor) where n is PAGE_SIZE * any power of two - * - * The blocksize and sizes arrays _must_ match! 
- */ -#if PAGE_SIZE == 4096 -static const unsigned int blocksize[] = { - 32, - 64, - 128, - 252, - 508, - 1020, - 2040, - 4096 - 16, - 8192 - 16, - 16384 - 16, - 32768 - 16, - 65536 - 16, - 131072 - 16, - 0 -}; - -static struct size_descriptor sizes[] = -{ - {NULL, NULL, 127, 0, 0, 0, 0, 0}, - {NULL, NULL, 63, 0, 0, 0, 0, 0}, - {NULL, NULL, 31, 0, 0, 0, 0, 0}, - {NULL, NULL, 16, 0, 0, 0, 0, 0}, - {NULL, NULL, 8, 0, 0, 0, 0, 0}, - {NULL, NULL, 4, 0, 0, 0, 0, 0}, - {NULL, NULL, 2, 0, 0, 0, 0, 0}, - {NULL, NULL, 1, 0, 0, 0, 0, 0}, - {NULL, NULL, 1, 0, 0, 0, 0, 1}, - {NULL, NULL, 1, 0, 0, 0, 0, 2}, - {NULL, NULL, 1, 0, 0, 0, 0, 3}, - {NULL, NULL, 1, 0, 0, 0, 0, 4}, - {NULL, NULL, 1, 0, 0, 0, 0, 5}, - {NULL, NULL, 0, 0, 0, 0, 0, 0} -}; -#elif PAGE_SIZE == 8192 -static const unsigned int blocksize[] = { - 64, - 128, - 248, - 504, - 1016, - 2040, - 4080, - 8192 - 32, - 16384 - 32, - 32768 - 32, - 65536 - 32, - 131072 - 32, - 262144 - 32, - 0 -}; - -struct size_descriptor sizes[] = -{ - {NULL, NULL, 127, 0, 0, 0, 0, 0}, - {NULL, NULL, 63, 0, 0, 0, 0, 0}, - {NULL, NULL, 31, 0, 0, 0, 0, 0}, - {NULL, NULL, 16, 0, 0, 0, 0, 0}, - {NULL, NULL, 8, 0, 0, 0, 0, 0}, - {NULL, NULL, 4, 0, 0, 0, 0, 0}, - {NULL, NULL, 2, 0, 0, 0, 0, 0}, - {NULL, NULL, 1, 0, 0, 0, 0, 0}, - {NULL, NULL, 1, 0, 0, 0, 0, 1}, - {NULL, NULL, 1, 0, 0, 0, 0, 2}, - {NULL, NULL, 1, 0, 0, 0, 0, 3}, - {NULL, NULL, 1, 0, 0, 0, 0, 4}, - {NULL, NULL, 1, 0, 0, 0, 0, 5}, - {NULL, NULL, 0, 0, 0, 0, 0, 0} -}; -#else -#error you need to make a version for your pagesize -#endif - -#define NBLOCKS(order) (sizes[order].nblocks) -#define BLOCKSIZE(order) (blocksize[order]) -#define AREASIZE(order) (PAGE_SIZE<<(sizes[order].gfporder)) - -/* - * Create a small cache of page allocations: this helps a bit with - * those pesky 8kB+ allocations for NFS when we're temporarily - * out of memory.. - * - * This is a _truly_ small cache, we just cache one single page - * order (for orders 0, 1 and 2, that is 4, 8 and 16kB on x86). - */ -#define MAX_CACHE_ORDER 3 -struct page_descriptor * kmalloc_cache[MAX_CACHE_ORDER]; - -static inline struct page_descriptor * get_kmalloc_pages(unsigned long priority, - unsigned long order, int dma) -{ - return (struct page_descriptor *) __get_free_pages(priority, order, dma); -} - -static inline void free_kmalloc_pages(struct page_descriptor * page, - unsigned long order, int dma) -{ - if (!dma && order < MAX_CACHE_ORDER) { - page = xchg(kmalloc_cache+order, page); - if (!page) - return; - } - free_pages((unsigned long) page, order); -} - -long kmalloc_init(long start_mem, long end_mem) -{ - int order; - -/* - * Check the static info array. Things will blow up terribly if it's - * incorrect. This is a late "compile time" check..... 
- */ - for (order = 0; BLOCKSIZE(order); order++) { - if ((NBLOCKS(order) * BLOCKSIZE(order) + sizeof(struct page_descriptor)) > - AREASIZE(order)) { - printk("Cannot use %d bytes out of %d in order = %d block mallocs\n", - (int) (NBLOCKS(order) * BLOCKSIZE(order) + - sizeof(struct page_descriptor)), - (int) AREASIZE(order), - BLOCKSIZE(order)); - panic("This only happens if someone messes with kmalloc"); - } - } - return start_mem; -} - - -/* - * Ugh, this is ugly, but we want the default case to run - * straight through, which is why we have the ugly goto's - */ -void *kmalloc(size_t size, int priority) -{ - unsigned long flags; - unsigned long type; - int order, dma; - struct block_header *p; - struct page_descriptor *page, **pg; - struct size_descriptor *bucket = sizes; - - /* Get order */ - order = 0; - { - unsigned int realsize = size + sizeof(struct block_header); - for (;;) { - int ordersize = BLOCKSIZE(order); - if (realsize <= ordersize) - break; - order++; - bucket++; - if (ordersize) - continue; - printk("kmalloc of too large a block (%d bytes).\n", (int) size); - return NULL; - } - } - - dma = 0; - type = MF_USED; - pg = &bucket->firstfree; - if (priority & GFP_DMA) { - dma = 1; - type = MF_DMA; - pg = &bucket->dmafree; - } - - priority &= GFP_LEVEL_MASK; - -/* Sanity check... */ - - if (in_interrupt() && priority != GFP_ATOMIC) { - static int count = 0; - if (++count < 5) { - printk("kmalloc called nonatomically from interrupt %p\n", - return_address()); - priority = GFP_ATOMIC; - } - } - - save_flags(flags); - cli(); - page = *pg; - if (!page) - goto no_bucket_page; - - p = page->firstfree; - if (p->bh_flags != MF_FREE) - goto not_free_on_freelist; - -found_it: - page->firstfree = p->bh_next; - page->nfree--; - if (!page->nfree) - *pg = page->next; - restore_flags(flags); - bucket->nmallocs++; - bucket->nbytesmalloced += size; - p->bh_flags = type; /* As of now this block is officially in use */ - p->bh_length = size; -#ifdef SADISTIC_KMALLOC - memset(p+1, 0xf0, size); -#endif - return p + 1; /* Pointer arithmetic: increments past header */ - - -no_bucket_page: - /* - * If we didn't find a page already allocated for this - * bucket size, we need to get one.. 
- * - * This can be done with ints on: it is private to this invocation - */ - restore_flags(flags); - - { - int i, sz; - - /* sz is the size of the blocks we're dealing with */ - sz = BLOCKSIZE(order); - - page = get_kmalloc_pages(priority, bucket->gfporder, dma); - if (!page) - goto no_free_page; -found_cached_page: - - bucket->npages++; - - page->order = order; - /* Loop for all but last block: */ - i = (page->nfree = bucket->nblocks) - 1; - p = BH(page + 1); - while (i > 0) { - i--; - p->bh_flags = MF_FREE; - p->bh_next = BH(((long) p) + sz); - p = p->bh_next; - } - /* Last block: */ - p->bh_flags = MF_FREE; - p->bh_next = NULL; - - p = BH(page+1); - } - - /* - * Now we're going to muck with the "global" freelist - * for this size: this should be uninterruptible - */ - cli(); - page->next = *pg; - *pg = page; - goto found_it; - - -no_free_page: - /* - * No free pages, check the kmalloc cache of - * pages to see if maybe we have something available - */ - if (!dma && order < MAX_CACHE_ORDER) { - page = xchg(kmalloc_cache+order, page); - if (page) - goto found_cached_page; - } - { - static unsigned long last = 0; - if (priority != GFP_BUFFER && (last + 10 * HZ < jiffies)) { - last = jiffies; - printk("Couldn't get a free page.....\n"); - } - return NULL; - } - -not_free_on_freelist: - restore_flags(flags); - printk("Problem: block on freelist at %08lx isn't free.\n", (long) p); - return NULL; -} - -void kfree(void *__ptr) -{ - int dma; - unsigned long flags; - unsigned int order; - struct page_descriptor *page, **pg; - struct size_descriptor *bucket; - - if (!__ptr) - goto null_kfree; -#define ptr ((struct block_header *) __ptr) - page = PAGE_DESC(ptr); - __ptr = ptr - 1; - if (~PAGE_MASK & (unsigned long)page->next) - goto bad_order; - order = page->order; - if (order >= sizeof(sizes) / sizeof(sizes[0])) - goto bad_order; - bucket = sizes + order; - dma = 0; - pg = &bucket->firstfree; - if (ptr->bh_flags == MF_DMA) { - dma = 1; - ptr->bh_flags = MF_USED; - pg = &bucket->dmafree; - } - if (ptr->bh_flags != MF_USED) - goto bad_order; - ptr->bh_flags = MF_FREE; /* As of now this block is officially free */ -#ifdef SADISTIC_KMALLOC - memset(ptr+1, 0x0e, ptr->bh_length); -#endif - save_flags(flags); - cli(); - - bucket->nfrees++; - bucket->nbytesmalloced -= ptr->bh_length; - - ptr->bh_next = page->firstfree; - page->firstfree = ptr; - if (!page->nfree++) { -/* Page went from full to one free block: put it on the freelist. */ - if (bucket->nblocks == 1) - goto free_page; - page->next = *pg; - *pg = page; - } -/* If page is completely free, free it */ - if (page->nfree == bucket->nblocks) { - for (;;) { - struct page_descriptor *tmp = *pg; - if (!tmp) - goto not_on_freelist; - if (tmp == page) - break; - pg = &tmp->next; - } - *pg = page->next; -free_page: - bucket->npages--; - free_kmalloc_pages(page, bucket->gfporder, dma); - } - restore_flags(flags); -null_kfree: - return; - -bad_order: - printk("kfree of non-kmalloced memory: %p, next= %p, order=%d\n", - ptr+1, page->next, page->order); - return; - -not_on_freelist: - printk("Ooops. page %p doesn't show on freelist.\n", page); - restore_flags(flags); -} diff --git a/mm/memory.c b/mm/memory.c index 27dc33efe..530a65ca9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -589,26 +589,13 @@ unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsig * change only once the write actually happens. This avoids a few races, * and potentially makes it more efficient. 
*/ -void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, - unsigned long address, int write_access) +static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, + unsigned long address, int write_access, pte_t *page_table) { - pgd_t *page_dir; - pmd_t *page_middle; - pte_t *page_table, pte; + pte_t pte; unsigned long old_page, new_page; new_page = __get_free_page(GFP_KERNEL); - page_dir = pgd_offset(vma->vm_mm, address); - if (pgd_none(*page_dir)) - goto end_wp_page; - if (pgd_bad(*page_dir)) - goto bad_wp_pagedir; - page_middle = pmd_offset(page_dir, address); - if (pmd_none(*page_middle)) - goto end_wp_page; - if (pmd_bad(*page_middle)) - goto bad_wp_pagemiddle; - page_table = pte_offset(page_middle, address); pte = *page_table; if (!pte_present(pte)) goto end_wp_page; @@ -650,14 +637,6 @@ void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, bad_wp_page: printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page); send_sig(SIGKILL, tsk, 1); - goto end_wp_page; -bad_wp_pagemiddle: - printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle)); - send_sig(SIGKILL, tsk, 1); - goto end_wp_page; -bad_wp_pagedir: - printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir)); - send_sig(SIGKILL, tsk, 1); end_wp_page: if (new_page) free_page(new_page); @@ -746,7 +725,7 @@ void vmtruncate(struct inode * inode, unsigned long offset) flush_cache_range(mm, start, end); zap_page_range(mm, start, len); flush_tlb_range(mm, start, end); - } while ((mpnt = mpnt->vm_next_share) != inode->i_mmap); + } while ((mpnt = mpnt->vm_next_share) != NULL); } @@ -785,25 +764,11 @@ static inline void do_swap_page(struct task_struct * tsk, * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. */ -void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, - unsigned long address, int write_access) +static void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, + unsigned long address, int write_access, pte_t *page_table, pte_t entry) { - pgd_t * pgd; - pmd_t * pmd; - pte_t * page_table; - pte_t entry; unsigned long page; - pgd = pgd_offset(tsk->mm, address); - pmd = pmd_alloc(pgd, address); - if (!pmd) - goto no_memory; - page_table = pte_alloc(pmd, address); - if (!page_table) - goto no_memory; - entry = *page_table; - if (pte_present(entry)) - goto is_present; if (!pte_none(entry)) goto swap_page; address &= PAGE_MASK; @@ -865,18 +830,9 @@ sigbus: swap_page: do_swap_page(tsk, vma, address, page_table, entry, write_access); return; - -no_memory: - oom(tsk); -is_present: - return; } /* - * The above separate functions for the no-page and wp-page - * cases will go away (they mostly do the same thing anyway), - * and we'll instead use only a general "handle_mm_fault()". - * * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most * RISC architectures). The early dirtying is also good on the i386. @@ -885,27 +841,30 @@ is_present: * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). 
*/ -static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address, +static inline void handle_pte_fault(struct task_struct *tsk, + struct vm_area_struct * vma, unsigned long address, int write_access, pte_t * pte) { - if (!pte_present(*pte)) { - do_no_page(current, vma, address, write_access); + pte_t entry = *pte; + + if (!pte_present(entry)) { + do_no_page(tsk, vma, address, write_access, pte, entry); return; } - set_pte(pte, pte_mkyoung(*pte)); + set_pte(pte, pte_mkyoung(entry)); flush_tlb_page(vma, address); if (!write_access) return; - if (pte_write(*pte)) { - set_pte(pte, pte_mkdirty(*pte)); + if (pte_write(entry)) { + set_pte(pte, pte_mkdirty(entry)); flush_tlb_page(vma, address); return; } - do_wp_page(current, vma, address, write_access); + do_wp_page(tsk, vma, address, write_access, pte); } -void handle_mm_fault(struct vm_area_struct * vma, unsigned long address, - int write_access) +void handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma, + unsigned long address, int write_access) { pgd_t *pgd; pmd_t *pmd; @@ -918,9 +877,9 @@ void handle_mm_fault(struct vm_area_struct * vma, unsigned long address, pte = pte_alloc(pmd, address); if (!pte) goto no_memory; - handle_pte_fault(vma, address, write_access, pte); + handle_pte_fault(tsk, vma, address, write_access, pte); update_mmu_cache(vma, address, *pte); return; no_memory: - oom(current); + oom(tsk); } @@ -16,13 +16,13 @@ #include <linux/swap.h> #include <linux/smp.h> #include <linux/smp_lock.h> +#include <linux/init.h> #include <asm/uaccess.h> #include <asm/system.h> #include <asm/pgtable.h> -/* - * description of effects of mapping type and prot in current implementation. +/* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: * @@ -37,7 +37,6 @@ * x: (no) no x: (no) yes x: (no) yes x: (yes) yes * */ - pgprot_t protection_map[16] = { __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 @@ -48,20 +47,18 @@ kmem_cache_t *vm_area_cachep; int sysctl_overcommit_memory; -/* - * Check that a process has enough memory to allocate a +/* Check that a process has enough memory to allocate a * new virtual mapping. */ int vm_enough_memory(long pages) { - /* - * stupid algorithm to decide if we have enough memory: while + /* Stupid algorithm to decide if we have enough memory: while * simple, it hopefully works in most obvious cases.. Easy to * fool it, but this should catch most mistakes. */ long freepages; - /* sometimes we want to use more memory than we have. */ + /* Sometimes we want to use more memory than we have. */ if (sysctl_overcommit_memory) return 1; @@ -74,6 +71,20 @@ int vm_enough_memory(long pages) return freepages > pages; } +/* Remove one vm structure from the inode's i_mmap ring. */ +static inline void remove_shared_vm_struct(struct vm_area_struct *vma) +{ + struct inode * inode = vma->vm_inode; + + if (inode) { + if (vma->vm_flags & VM_DENYWRITE) + inode->i_writecount++; + if(vma->vm_next_share) + vma->vm_next_share->vm_pprev_share = vma->vm_pprev_share; + *vma->vm_pprev_share = vma->vm_next_share; + } +} + asmlinkage unsigned long sys_brk(unsigned long brk) { unsigned long rlim, retval; @@ -91,17 +102,14 @@ asmlinkage unsigned long sys_brk(unsigned long brk) goto out; } - /* - * Always allow shrinking brk - */ + /* Always allow shrinking brk. 
*/ if (brk <= mm->brk) { retval = mm->brk = brk; do_munmap(newbrk, oldbrk-newbrk); goto out; } - /* - * Check against rlimit and stack.. - */ + + /* Check against rlimit and stack.. */ retval = mm->brk; rlim = current->rlim[RLIMIT_DATA].rlim_cur; if (rlim >= RLIM_INFINITY) @@ -109,21 +117,15 @@ asmlinkage unsigned long sys_brk(unsigned long brk) if (brk - mm->end_code > rlim) goto out; - /* - * Check against existing mmap mappings. - */ + /* Check against existing mmap mappings. */ if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) goto out; - /* - * Check if we have enough memory.. - */ + /* Check if we have enough memory.. */ if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT)) goto out; - /* - * Ok, looks good - let it rip. - */ + /* Ok, looks good - let it rip. */ if(do_mmap(NULL, oldbrk, newbrk-oldbrk, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, 0) == oldbrk) @@ -134,8 +136,7 @@ out: return retval; } -/* - * Combine the mmap "prot" and "flags" argument into one "vm_flags" used +/* Combine the mmap "prot" and "flags" argument into one "vm_flags" used * internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits * into "VM_xxx". */ @@ -162,6 +163,7 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, { struct mm_struct * mm = current->mm; struct vm_area_struct * vma; + int correct_wcount = 0; if ((len = PAGE_ALIGN(len)) == 0) return addr; @@ -181,20 +183,17 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, return -EAGAIN; } - /* - * do simple checking here so the lower-level routines won't have + /* Do simple checking here so the lower-level routines won't have * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ - if (file != NULL) { switch (flags & MAP_TYPE) { case MAP_SHARED: if ((prot & PROT_WRITE) && !(file->f_mode & 2)) return -EACCES; - /* - * make sure there are no mandatory locks on the file. - */ + + /* make sure there are no mandatory locks on the file. */ if (locks_verify_locked(file->f_inode)) return -EAGAIN; /* fall through */ @@ -206,18 +205,12 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, default: return -EINVAL; } - if (flags & MAP_DENYWRITE) { - if (file->f_inode->i_writecount > 0) - return -ETXTBSY; - } } else if ((flags & MAP_TYPE) != MAP_PRIVATE) return -EINVAL; - /* - * obtain the address to map to. we verify (or select) it and ensure + /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ - if (flags & MAP_FIXED) { if (addr & ~PAGE_MASK) return -EINVAL; @@ -227,8 +220,7 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, return -ENOMEM; } - /* - * determine the object being mapped and call the appropriate + /* Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ @@ -249,8 +241,8 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_SHARED) { vma->vm_flags |= VM_SHARED | VM_MAYSHARE; - /* - * This looks strange, but when we don't have the file open + + /* This looks strange, but when we don't have the file open * for writing, we can demote the shared mapping to a simpler * private mapping. 
That also takes care of a security hole * with ptrace() writing to a shared mapping without write @@ -289,9 +281,26 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, } if (file) { - int error = file->f_op->mmap(file->f_inode, file, vma); + int error = 0; + if (vma->vm_flags & VM_DENYWRITE) { + if (file->f_inode->i_writecount > 0) + error = -ETXTBSY; + else { + /* f_op->mmap might possibly sleep + * (generic_file_mmap doesn't, but other code + * might). In any case, this takes care of any + * race that this might cause. + */ + file->f_inode->i_writecount--; + correct_wcount = 1; + } + } + if (!error) + error = file->f_op->mmap(file->f_inode, file, vma); if (error) { + if (correct_wcount) + file->f_inode->i_writecount++; kmem_cache_free(vm_area_cachep, vma); return error; } @@ -299,6 +308,8 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, flags = vma->vm_flags; insert_vm_struct(mm, vma); + if (correct_wcount) + file->f_inode->i_writecount++; merge_segments(mm, vma->vm_start, vma->vm_end); /* merge_segments might have merged our vma, so we can't use it any more */ @@ -317,8 +328,7 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, return addr; } -/* - * Get an address range which is currently unmapped. +/* Get an address range which is currently unmapped. * For mmap() without MAP_FIXED and shmat() with addr=0. * Return value 0 means ENOMEM. */ @@ -342,376 +352,7 @@ unsigned long get_unmapped_area(unsigned long addr, unsigned long len) } } -/* - * Searching a VMA in the linear list task->mm->mmap is horribly slow. - * Use an AVL (Adelson-Velskii and Landis) tree to speed up this search - * from O(n) to O(log n), where n is the number of VMAs of the task - * (typically around 6, but may reach 3000 in some cases). - * Written by Bruno Haible <haible@ma2s2.mathematik.uni-karlsruhe.de>. - */ - -/* We keep the list and tree sorted by address. */ -#define vm_avl_key vm_end -#define vm_avl_key_t unsigned long /* typeof(vma->avl_key) */ - -/* - * task->mm->mmap_avl is the AVL tree corresponding to task->mm->mmap - * or, more exactly, its root. - * A vm_area_struct has the following fields: - * vm_avl_left left son of a tree node - * vm_avl_right right son of a tree node - * vm_avl_height 1+max(heightof(left),heightof(right)) - * The empty tree is represented as NULL. - */ - -/* Since the trees are balanced, their height will never be large. */ -#define avl_maxheight 41 /* why this? a small exercise */ -#define heightof(tree) ((tree) == avl_empty ? 0 : (tree)->vm_avl_height) -/* - * Consistency and balancing rules: - * 1. tree->vm_avl_height == 1+max(heightof(tree->vm_avl_left),heightof(tree->vm_avl_right)) - * 2. abs( heightof(tree->vm_avl_left) - heightof(tree->vm_avl_right) ) <= 1 - * 3. foreach node in tree->vm_avl_left: node->vm_avl_key <= tree->vm_avl_key, - * foreach node in tree->vm_avl_right: node->vm_avl_key >= tree->vm_avl_key. - */ - -/* Look up the nodes at the left and at the right of a given node. 
*/ -static inline void avl_neighbours (struct vm_area_struct * node, struct vm_area_struct * tree, struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right) -{ - vm_avl_key_t key = node->vm_avl_key; - - *to_the_left = *to_the_right = NULL; - for (;;) { - if (tree == avl_empty) { - printk("avl_neighbours: node not found in the tree\n"); - return; - } - if (key == tree->vm_avl_key) - break; - if (key < tree->vm_avl_key) { - *to_the_right = tree; - tree = tree->vm_avl_left; - } else { - *to_the_left = tree; - tree = tree->vm_avl_right; - } - } - if (tree != node) { - printk("avl_neighbours: node not exactly found in the tree\n"); - return; - } - if (tree->vm_avl_left != avl_empty) { - struct vm_area_struct * node; - for (node = tree->vm_avl_left; node->vm_avl_right != avl_empty; node = node->vm_avl_right) - continue; - *to_the_left = node; - } - if (tree->vm_avl_right != avl_empty) { - struct vm_area_struct * node; - for (node = tree->vm_avl_right; node->vm_avl_left != avl_empty; node = node->vm_avl_left) - continue; - *to_the_right = node; - } - if ((*to_the_left && ((*to_the_left)->vm_next != node)) || (node->vm_next != *to_the_right)) - printk("avl_neighbours: tree inconsistent with list\n"); -} - -/* - * Rebalance a tree. - * After inserting or deleting a node of a tree we have a sequence of subtrees - * nodes[0]..nodes[k-1] such that - * nodes[0] is the root and nodes[i+1] = nodes[i]->{vm_avl_left|vm_avl_right}. - */ -static inline void avl_rebalance (struct vm_area_struct *** nodeplaces_ptr, int count) -{ - for ( ; count > 0 ; count--) { - struct vm_area_struct ** nodeplace = *--nodeplaces_ptr; - struct vm_area_struct * node = *nodeplace; - struct vm_area_struct * nodeleft = node->vm_avl_left; - struct vm_area_struct * noderight = node->vm_avl_right; - int heightleft = heightof(nodeleft); - int heightright = heightof(noderight); - if (heightright + 1 < heightleft) { - /* */ - /* * */ - /* / \ */ - /* n+2 n */ - /* */ - struct vm_area_struct * nodeleftleft = nodeleft->vm_avl_left; - struct vm_area_struct * nodeleftright = nodeleft->vm_avl_right; - int heightleftright = heightof(nodeleftright); - if (heightof(nodeleftleft) >= heightleftright) { - /* */ - /* * n+2|n+3 */ - /* / \ / \ */ - /* n+2 n --> / n+1|n+2 */ - /* / \ | / \ */ - /* n+1 n|n+1 n+1 n|n+1 n */ - /* */ - node->vm_avl_left = nodeleftright; nodeleft->vm_avl_right = node; - nodeleft->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightleftright); - *nodeplace = nodeleft; - } else { - /* */ - /* * n+2 */ - /* / \ / \ */ - /* n+2 n --> n+1 n+1 */ - /* / \ / \ / \ */ - /* n n+1 n L R n */ - /* / \ */ - /* L R */ - /* */ - nodeleft->vm_avl_right = nodeleftright->vm_avl_left; - node->vm_avl_left = nodeleftright->vm_avl_right; - nodeleftright->vm_avl_left = nodeleft; - nodeleftright->vm_avl_right = node; - nodeleft->vm_avl_height = node->vm_avl_height = heightleftright; - nodeleftright->vm_avl_height = heightleft; - *nodeplace = nodeleftright; - } - } - else if (heightleft + 1 < heightright) { - /* similar to the above, just interchange 'left' <--> 'right' */ - struct vm_area_struct * noderightright = noderight->vm_avl_right; - struct vm_area_struct * noderightleft = noderight->vm_avl_left; - int heightrightleft = heightof(noderightleft); - if (heightof(noderightright) >= heightrightleft) { - node->vm_avl_right = noderightleft; noderight->vm_avl_left = node; - noderight->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightrightleft); - *nodeplace = noderight; - } else { - noderight->vm_avl_left = 
noderightleft->vm_avl_right; - node->vm_avl_right = noderightleft->vm_avl_left; - noderightleft->vm_avl_right = noderight; - noderightleft->vm_avl_left = node; - noderight->vm_avl_height = node->vm_avl_height = heightrightleft; - noderightleft->vm_avl_height = heightright; - *nodeplace = noderightleft; - } - } - else { - int height = (heightleft<heightright ? heightright : heightleft) + 1; - if (height == node->vm_avl_height) - break; - node->vm_avl_height = height; - } - } -} - -/* Insert a node into a tree. */ -static inline void avl_insert (struct vm_area_struct * new_node, struct vm_area_struct ** ptree) -{ - vm_avl_key_t key = new_node->vm_avl_key; - struct vm_area_struct ** nodeplace = ptree; - struct vm_area_struct ** stack[avl_maxheight]; - int stack_count = 0; - struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */ - for (;;) { - struct vm_area_struct * node = *nodeplace; - if (node == avl_empty) - break; - *stack_ptr++ = nodeplace; stack_count++; - if (key < node->vm_avl_key) - nodeplace = &node->vm_avl_left; - else - nodeplace = &node->vm_avl_right; - } - new_node->vm_avl_left = avl_empty; - new_node->vm_avl_right = avl_empty; - new_node->vm_avl_height = 1; - *nodeplace = new_node; - avl_rebalance(stack_ptr,stack_count); -} - -/* Insert a node into a tree, and - * return the node to the left of it and the node to the right of it. - */ -static inline void avl_insert_neighbours (struct vm_area_struct * new_node, struct vm_area_struct ** ptree, - struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right) -{ - vm_avl_key_t key = new_node->vm_avl_key; - struct vm_area_struct ** nodeplace = ptree; - struct vm_area_struct ** stack[avl_maxheight]; - int stack_count = 0; - struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */ - *to_the_left = *to_the_right = NULL; - for (;;) { - struct vm_area_struct * node = *nodeplace; - if (node == avl_empty) - break; - *stack_ptr++ = nodeplace; stack_count++; - if (key < node->vm_avl_key) { - *to_the_right = node; - nodeplace = &node->vm_avl_left; - } else { - *to_the_left = node; - nodeplace = &node->vm_avl_right; - } - } - new_node->vm_avl_left = avl_empty; - new_node->vm_avl_right = avl_empty; - new_node->vm_avl_height = 1; - *nodeplace = new_node; - avl_rebalance(stack_ptr,stack_count); -} - -/* Removes a node out of a tree. */ -static inline void avl_remove (struct vm_area_struct * node_to_delete, struct vm_area_struct ** ptree) -{ - vm_avl_key_t key = node_to_delete->vm_avl_key; - struct vm_area_struct ** nodeplace = ptree; - struct vm_area_struct ** stack[avl_maxheight]; - int stack_count = 0; - struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */ - struct vm_area_struct ** nodeplace_to_delete; - for (;;) { - struct vm_area_struct * node = *nodeplace; - if (node == avl_empty) { - /* what? node_to_delete not found in tree? */ - printk("avl_remove: node to delete not found in tree\n"); - return; - } - *stack_ptr++ = nodeplace; stack_count++; - if (key == node->vm_avl_key) - break; - if (key < node->vm_avl_key) - nodeplace = &node->vm_avl_left; - else - nodeplace = &node->vm_avl_right; - } - nodeplace_to_delete = nodeplace; - /* Have to remove node_to_delete = *nodeplace_to_delete. 
*/ - if (node_to_delete->vm_avl_left == avl_empty) { - *nodeplace_to_delete = node_to_delete->vm_avl_right; - stack_ptr--; stack_count--; - } else { - struct vm_area_struct *** stack_ptr_to_delete = stack_ptr; - struct vm_area_struct ** nodeplace = &node_to_delete->vm_avl_left; - struct vm_area_struct * node; - for (;;) { - node = *nodeplace; - if (node->vm_avl_right == avl_empty) - break; - *stack_ptr++ = nodeplace; stack_count++; - nodeplace = &node->vm_avl_right; - } - *nodeplace = node->vm_avl_left; - /* node replaces node_to_delete */ - node->vm_avl_left = node_to_delete->vm_avl_left; - node->vm_avl_right = node_to_delete->vm_avl_right; - node->vm_avl_height = node_to_delete->vm_avl_height; - *nodeplace_to_delete = node; /* replace node_to_delete */ - *stack_ptr_to_delete = &node->vm_avl_left; /* replace &node_to_delete->vm_avl_left */ - } - avl_rebalance(stack_ptr,stack_count); -} - -#ifdef DEBUG_AVL - -/* print a list */ -static void printk_list (struct vm_area_struct * vma) -{ - printk("["); - while (vma) { - printk("%08lX-%08lX", vma->vm_start, vma->vm_end); - vma = vma->vm_next; - if (!vma) - break; - printk(" "); - } - printk("]"); -} - -/* print a tree */ -static void printk_avl (struct vm_area_struct * tree) -{ - if (tree != avl_empty) { - printk("("); - if (tree->vm_avl_left != avl_empty) { - printk_avl(tree->vm_avl_left); - printk("<"); - } - printk("%08lX-%08lX", tree->vm_start, tree->vm_end); - if (tree->vm_avl_right != avl_empty) { - printk(">"); - printk_avl(tree->vm_avl_right); - } - printk(")"); - } -} - -static char *avl_check_point = "somewhere"; - -/* check a tree's consistency and balancing */ -static void avl_checkheights (struct vm_area_struct * tree) -{ - int h, hl, hr; - - if (tree == avl_empty) - return; - avl_checkheights(tree->vm_avl_left); - avl_checkheights(tree->vm_avl_right); - h = tree->vm_avl_height; - hl = heightof(tree->vm_avl_left); - hr = heightof(tree->vm_avl_right); - if ((h == hl+1) && (hr <= hl) && (hl <= hr+1)) - return; - if ((h == hr+1) && (hl <= hr) && (hr <= hl+1)) - return; - printk("%s: avl_checkheights: heights inconsistent\n",avl_check_point); -} - -/* check that all values stored in a tree are < key */ -static void avl_checkleft (struct vm_area_struct * tree, vm_avl_key_t key) -{ - if (tree == avl_empty) - return; - avl_checkleft(tree->vm_avl_left,key); - avl_checkleft(tree->vm_avl_right,key); - if (tree->vm_avl_key < key) - return; - printk("%s: avl_checkleft: left key %lu >= top key %lu\n",avl_check_point,tree->vm_avl_key,key); -} - -/* check that all values stored in a tree are > key */ -static void avl_checkright (struct vm_area_struct * tree, vm_avl_key_t key) -{ - if (tree == avl_empty) - return; - avl_checkright(tree->vm_avl_left,key); - avl_checkright(tree->vm_avl_right,key); - if (tree->vm_avl_key > key) - return; - printk("%s: avl_checkright: right key %lu <= top key %lu\n",avl_check_point,tree->vm_avl_key,key); -} - -/* check that all values are properly increasing */ -static void avl_checkorder (struct vm_area_struct * tree) -{ - if (tree == avl_empty) - return; - avl_checkorder(tree->vm_avl_left); - avl_checkorder(tree->vm_avl_right); - avl_checkleft(tree->vm_avl_left,tree->vm_avl_key); - avl_checkright(tree->vm_avl_right,tree->vm_avl_key); -} - -/* all checks */ -static void avl_check (struct task_struct * task, char *caller) -{ - avl_check_point = caller; -/* printk("task \"%s\", %s\n",task->comm,caller); */ -/* printk("task \"%s\" list: ",task->comm); printk_list(task->mm->mmap); printk("\n"); */ -/* printk("task 
\"%s\" tree: ",task->comm); printk_avl(task->mm->mmap_avl); printk("\n"); */ - avl_checkheights(task->mm->mmap_avl); - avl_checkorder(task->mm->mmap_avl); -} - -#endif - - -/* - * Normal function to fix up a mapping +/* Normal function to fix up a mapping * This function is the default for when an area has no specific * function. This may be used as part of a more specific routine. * This function works out what part of an area is affected and @@ -738,19 +379,11 @@ static void unmap_fixup(struct vm_area_struct *area, struct vm_area_struct *mpnt; unsigned long end = addr + len; - if (addr < area->vm_start || addr >= area->vm_end || - end <= area->vm_start || end > area->vm_end || - end < addr) - { - printk("unmap_fixup: area=%lx-%lx, unmap %lx-%lx!!\n", - area->vm_start, area->vm_end, addr, end); - return; - } area->vm_mm->total_vm -= len >> PAGE_SHIFT; if (area->vm_flags & VM_LOCKED) area->vm_mm->locked_vm -= len >> PAGE_SHIFT; - /* Unmapping the whole area */ + /* Unmapping the whole area. */ if (addr == area->vm_start && end == area->vm_end) { if (area->vm_ops && area->vm_ops->close) area->vm_ops->close(area); @@ -759,15 +392,13 @@ static void unmap_fixup(struct vm_area_struct *area, return; } - /* Work out to one of the ends */ + /* Work out to one of the ends. */ if (end == area->vm_end) area->vm_end = addr; - else - if (addr == area->vm_start) { + else if (addr == area->vm_start) { area->vm_offset += (end - area->vm_start); area->vm_start = end; - } - else { + } else { /* Unmapping a hole: area->vm_start < addr <= end < area->vm_end */ /* Add end mapping -- leave beginning for below */ mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); @@ -785,7 +416,7 @@ static void unmap_fixup(struct vm_area_struct *area, insert_vm_struct(current->mm, mpnt); } - /* construct whatever mapping is needed */ + /* Construct whatever mapping is needed. */ mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!mpnt) return; @@ -809,15 +440,14 @@ asmlinkage int sys_munmap(unsigned long addr, size_t len) return ret; } -/* - * Munmap is split into 2 main parts -- this part which finds +/* Munmap is split into 2 main parts -- this part which finds * what needs doing, and the areas themselves, which do the * work. This now handles partial unmappings. * Jeremy Fitzhardine <jeremy@sw.oz.au> */ int do_munmap(unsigned long addr, size_t len) { - struct vm_area_struct *mpnt, *prev, *next, **npp, *free; + struct vm_area_struct *mpnt, *next, *free; if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr) return -EINVAL; @@ -825,33 +455,36 @@ int do_munmap(unsigned long addr, size_t len) if ((len = PAGE_ALIGN(len)) == 0) return 0; - /* - * Check if this memory area is ok - put it on the temporary + /* Check if this memory area is ok - put it on the temporary * list if so.. The checks here are pretty simple -- * every area affected in some way (by any overlap) is put * on the list. If nothing is put on, nothing is affected. */ - mpnt = find_vma(current->mm, addr); + mpnt = current->mm->mmap; + while(mpnt && mpnt->vm_end <= addr) + mpnt = mpnt->vm_next; if (!mpnt) return 0; - avl_neighbours(mpnt, current->mm->mmap_avl, &prev, &next); - /* we have prev->vm_next == mpnt && mpnt->vm_next = next */ - /* and addr < mpnt->vm_end */ - npp = (prev ? 
&prev->vm_next : ¤t->mm->mmap); + next = mpnt->vm_next; + + /* we have mpnt->vm_next = next and addr < mpnt->vm_end */ free = NULL; - for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) { - *npp = mpnt->vm_next; + for ( ; mpnt && mpnt->vm_start < addr+len; ) { + struct vm_area_struct *next = mpnt->vm_next; + + if(mpnt->vm_next) + mpnt->vm_next->vm_pprev = mpnt->vm_pprev; + *mpnt->vm_pprev = mpnt->vm_next; + mpnt->vm_next = free; free = mpnt; - avl_remove(mpnt, ¤t->mm->mmap_avl); + mpnt = next; } - if (free == NULL) return 0; - /* - * Ok - we have the memory areas we should free on the 'free' list, + /* Ok - we have the memory areas we should free on the 'free' list, * so release them, and unmap the page range.. * If the one of the segments is only being partially unmapped, * it will put new vm_area_struct(s) into the address space. @@ -871,36 +504,27 @@ int do_munmap(unsigned long addr, size_t len) if (mpnt->vm_ops && mpnt->vm_ops->unmap) mpnt->vm_ops->unmap(mpnt, st, size); + flush_cache_range(current->mm, st, end); zap_page_range(current->mm, st, size); flush_tlb_range(current->mm, st, end); + unmap_fixup(mpnt, st, size); + kmem_cache_free(vm_area_cachep, mpnt); } while (free); - /* we could zap the page tables here too.. */ - + current->mm->mmap_cache = NULL; /* Kill the cache. */ return 0; } -/* Build the AVL tree corresponding to the VMA list. */ -void build_mmap_avl(struct mm_struct * mm) -{ - struct vm_area_struct * vma; - - mm->mmap_avl = NULL; - for (vma = mm->mmap; vma; vma = vma->vm_next) - avl_insert(vma, &mm->mmap_avl); -} - /* Release all mmaps. */ void exit_mmap(struct mm_struct * mm) { struct vm_area_struct * mpnt; mpnt = mm->mmap; - mm->mmap = NULL; - mm->mmap_avl = NULL; + mm->mmap = mm->mmap_cache = NULL; mm->rss = 0; mm->total_vm = 0; mm->locked_vm = 0; @@ -925,81 +549,38 @@ void exit_mmap(struct mm_struct * mm) } } -/* - * Insert vm structure into process list sorted by address +/* Insert vm structure into process list sorted by address * and into the inode's i_mmap ring. */ void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp) { - struct vm_area_struct *share; + struct vm_area_struct **pprev = &mm->mmap; struct inode * inode; -#if 0 /* equivalent, but slow */ - struct vm_area_struct **p, *mpnt; + /* Find where to link it in. */ + while(*pprev && (*pprev)->vm_start <= vmp->vm_start) + pprev = &(*pprev)->vm_next; - p = &mm->mmap; - while ((mpnt = *p) != NULL) { - if (mpnt->vm_start > vmp->vm_start) - break; - if (mpnt->vm_end > vmp->vm_start) - printk("insert_vm_struct: overlapping memory areas\n"); - p = &mpnt->vm_next; - } - vmp->vm_next = mpnt; - *p = vmp; -#else - struct vm_area_struct * prev, * next; - - avl_insert_neighbours(vmp, &mm->mmap_avl, &prev, &next); - if ((prev ? prev->vm_next : mm->mmap) != next) - printk("insert_vm_struct: tree inconsistent with list\n"); - if (prev) - prev->vm_next = vmp; - else - mm->mmap = vmp; - vmp->vm_next = next; -#endif + /* Insert it. */ + if((vmp->vm_next = *pprev) != NULL) + (*pprev)->vm_pprev = &vmp->vm_next; + *pprev = vmp; + vmp->vm_pprev = pprev; inode = vmp->vm_inode; - if (!inode) - return; - - /* insert vmp into inode's circular share list */ - if ((share = inode->i_mmap)) { - vmp->vm_next_share = share->vm_next_share; - vmp->vm_next_share->vm_prev_share = vmp; - share->vm_next_share = vmp; - vmp->vm_prev_share = share; - } else - inode->i_mmap = vmp->vm_next_share = vmp->vm_prev_share = vmp; -} - -/* - * Remove one vm structure from the inode's i_mmap ring. 
- */ -void remove_shared_vm_struct(struct vm_area_struct *mpnt) -{ - struct inode * inode = mpnt->vm_inode; - - if (!inode) - return; - - if (mpnt->vm_next_share == mpnt) { - if (inode->i_mmap != mpnt) - printk("Inode i_mmap ring corrupted\n"); - inode->i_mmap = NULL; - return; + if (inode) { + if (vmp->vm_flags & VM_DENYWRITE) + inode->i_writecount--; + + /* insert vmp into inode's share list */ + if((vmp->vm_next_share = inode->i_mmap) != NULL) + inode->i_mmap->vm_pprev_share = &vmp->vm_next_share; + inode->i_mmap = vmp; + vmp->vm_pprev_share = &inode->i_mmap; } - - if (inode->i_mmap == mpnt) - inode->i_mmap = mpnt->vm_next_share; - - mpnt->vm_prev_share->vm_next_share = mpnt->vm_next_share; - mpnt->vm_next_share->vm_prev_share = mpnt->vm_prev_share; } -/* - * Merge the list of memory segments if possible. +/* Merge the list of memory segments if possible. * Redundant vm_area_structs are freed. * This assumes that the list is ordered by address. * We don't need to traverse the entire list, only those segments @@ -1010,13 +591,19 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l struct vm_area_struct *prev, *mpnt, *next; down(&mm->mmap_sem); - mpnt = find_vma(mm, start_addr); + + prev = NULL; + mpnt = mm->mmap; + while(mpnt && mpnt->vm_end <= start_addr) { + prev = mpnt; + mpnt = mpnt->vm_next; + } if (!mpnt) goto no_vma; - avl_neighbours(mpnt, mm->mmap_avl, &prev, &next); - /* we have prev->vm_next == mpnt && mpnt->vm_next = next */ + next = mpnt->vm_next; + /* we have prev->vm_next == mpnt && mpnt->vm_next = next */ if (!prev) { prev = mpnt; mpnt = next; @@ -1026,41 +613,32 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l * start_addr < mpnt->vm_end && prev->vm_start < end_addr */ for ( ; mpnt && prev->vm_start < end_addr ; prev = mpnt, mpnt = next) { -#if 0 - printk("looping in merge_segments, mpnt=0x%lX\n", (unsigned long) mpnt); -#endif - next = mpnt->vm_next; - /* - * To share, we must have the same inode, operations.. - */ - if (mpnt->vm_inode != prev->vm_inode) - continue; - if (mpnt->vm_pte != prev->vm_pte) - continue; - if (mpnt->vm_ops != prev->vm_ops) - continue; - if (mpnt->vm_flags != prev->vm_flags) + /* To share, we must have the same inode, operations.. */ + if ((mpnt->vm_inode != prev->vm_inode) || + (mpnt->vm_pte != prev->vm_pte) || + (mpnt->vm_ops != prev->vm_ops) || + (mpnt->vm_flags != prev->vm_flags) || + (prev->vm_end != mpnt->vm_start)) continue; - if (prev->vm_end != mpnt->vm_start) - continue; - /* - * and if we have an inode, the offsets must be contiguous.. - */ + + /* and if we have an inode, the offsets must be contiguous.. */ if ((mpnt->vm_inode != NULL) || (mpnt->vm_flags & VM_SHM)) { - if (prev->vm_offset + prev->vm_end - prev->vm_start != mpnt->vm_offset) + unsigned long off = prev->vm_offset+prev->vm_end-prev->vm_start; + if (off != mpnt->vm_offset) continue; } - /* - * merge prev with mpnt and set up pointers so the new + /* merge prev with mpnt and set up pointers so the new * big segment can possibly merge with the next one. * The old unused mpnt is freed. 
*/ - avl_remove(mpnt, &mm->mmap_avl); + if(mpnt->vm_next) + mpnt->vm_next->vm_pprev = mpnt->vm_pprev; + *mpnt->vm_pprev = mpnt->vm_next; + prev->vm_end = mpnt->vm_end; - prev->vm_next = mpnt->vm_next; if (mpnt->vm_ops && mpnt->vm_ops->close) { mpnt->vm_offset += mpnt->vm_end - mpnt->vm_start; mpnt->vm_start = mpnt->vm_end; @@ -1072,16 +650,24 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l kmem_cache_free(vm_area_cachep, mpnt); mpnt = prev; } + mm->mmap_cache = NULL; /* Kill the cache. */ no_vma: up(&mm->mmap_sem); } -void vma_init(void) +__initfunc(void vma_init(void)) { vm_area_cachep = kmem_cache_create("vm_area_struct", sizeof(struct vm_area_struct), - sizeof(long)*8, SLAB_HWCACHE_ALIGN, + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); if(!vm_area_cachep) panic("vma_init: Cannot alloc vm_area_struct cache."); + + mm_cachep = kmem_cache_create("mm_struct", + sizeof(struct mm_struct), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if(!mm_cachep) + panic("vma_init: Cannot alloc mm_struct cache."); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7b71a1ec7..19b3aa125 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -18,12 +18,14 @@ #include <linux/fs.h> #include <linux/swapctl.h> #include <linux/interrupt.h> +#include <linux/init.h> #include <asm/dma.h> #include <asm/system.h> /* for cli()/sti() */ #include <asm/uaccess.h> /* for copy_to/from_user */ #include <asm/bitops.h> #include <asm/pgtable.h> +#include <asm/spinlock.h> int nr_swap_pages = 0; int nr_free_pages = 0; @@ -88,10 +90,6 @@ static inline void remove_mem_queue(struct page * entry) * * With the above two rules, you get a straight-line execution path * for the normal case, giving better asm-code. - * - * free_page() may sleep since the page being freed may be a buffer - * page or present in the swap cache. It will not sleep, however, - * for a freshly allocated page (get_free_page()). 
*/ /* @@ -99,6 +97,8 @@ static inline void remove_mem_queue(struct page * entry) * * Hint: -mask = 1+~mask */ +static spinlock_t page_alloc_lock; + static inline void free_pages_ok(unsigned long map_nr, unsigned long order) { struct free_area_struct *area = free_area + order; @@ -106,15 +106,14 @@ static inline void free_pages_ok(unsigned long map_nr, unsigned long order) unsigned long mask = (~0UL) << order; unsigned long flags; - save_flags(flags); - cli(); + spin_lock_irqsave(&page_alloc_lock, flags); #define list(x) (mem_map+(x)) map_nr &= mask; nr_free_pages -= mask; while (mask + (1 << (NR_MEM_LISTS-1))) { - if (!change_bit(index, area->map)) + if (!test_and_change_bit(index, area->map)) break; remove_mem_queue(list(map_nr ^ -mask)); mask <<= 1; @@ -126,7 +125,7 @@ static inline void free_pages_ok(unsigned long map_nr, unsigned long order) #undef list - restore_flags(flags); + spin_unlock_irqrestore(&page_alloc_lock, flags); } void __free_page(struct page *page) @@ -172,7 +171,7 @@ do { struct free_area_struct * area = free_area+order; \ MARK_USED(map_nr, new_order, area); \ nr_free_pages -= 1 << order; \ EXPAND(ret, map_nr, order, new_order, area); \ - restore_flags(flags); \ + spin_unlock_irqrestore(&page_alloc_lock, flags); \ return ADDRESS(map_nr); \ } \ prev = ret; \ @@ -214,15 +213,14 @@ unsigned long __get_free_pages(int priority, unsigned long order, int dma) reserved_pages = 5; if (priority != GFP_NFS) reserved_pages = min_free_pages; - save_flags(flags); repeat: - cli(); + spin_lock_irqsave(&page_alloc_lock, flags); if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) { RMQUEUE(order, dma); - restore_flags(flags); + spin_unlock_irqrestore(&page_alloc_lock, flags); return 0; } - restore_flags(flags); + spin_unlock_irqrestore(&page_alloc_lock, flags); if (priority != GFP_BUFFER && try_to_free_page(priority, dma, 1)) goto repeat; return 0; @@ -239,8 +237,7 @@ void show_free_areas(void) unsigned long total = 0; printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10)); - save_flags(flags); - cli(); + spin_lock_irqsave(&page_alloc_lock, flags); for (order=0 ; order < NR_MEM_LISTS; order++) { struct page * tmp; unsigned long nr = 0; @@ -250,7 +247,7 @@ void show_free_areas(void) total += nr * ((PAGE_SIZE>>10) << order); printk("%lu*%lukB ", nr, (unsigned long)((PAGE_SIZE>>10) << order)); } - restore_flags(flags); + spin_unlock_irqrestore(&page_alloc_lock, flags); printk("= %lukB)\n", total); #ifdef SWAP_CACHE_INFO show_swap_cache_info(); @@ -265,7 +262,7 @@ void show_free_areas(void) * - mark all memory queues empty * - clear the memory bitmaps */ -unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem) +__initfunc(unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem)) { mem_map_t * p; unsigned long mask = PAGE_MASK; @@ -273,7 +270,7 @@ unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem) /* * select nr of pages we try to keep free for important stuff - * with a minimum of 16 pages. This is totally arbitrary + * with a minimum of 48 pages. This is totally arbitrary */ i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7); if (i < 48) diff --git a/mm/page_io.c b/mm/page_io.c index 9980c52b7..6a16ccee8 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -67,7 +67,7 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) return; } /* Make sure we are the only process doing I/O with this swap page. 
*/ - while (set_bit(offset,p->swap_lockmap)) { + while (test_and_set_bit(offset,p->swap_lockmap)) { run_task_queue(&tq_disk); sleep_on(&lock_queue); } @@ -136,7 +136,7 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) } else printk("rw_swap_page: no swap file or device\n"); atomic_dec(&page->count); - if (offset && !clear_bit(offset,p->swap_lockmap)) + if (offset && !test_and_clear_bit(offset,p->swap_lockmap)) printk("rw_swap_page: lock already cleared\n"); wake_up(&lock_queue); } @@ -158,7 +158,7 @@ void swap_after_unlock_page (unsigned long entry) printk("swap_after_unlock_page: weirdness\n"); return; } - if (!clear_bit(offset,p->swap_lockmap)) + if (!test_and_clear_bit(offset,p->swap_lockmap)) printk("swap_after_unlock_page: lock already cleared\n"); wake_up(&lock_queue); } @@ -187,7 +187,7 @@ void ll_rw_page(int rw, kdev_t dev, unsigned long offset, char * buffer) panic("ll_rw_page: bad block dev cmd, must be R/W"); } page = mem_map + MAP_NR(buffer); - if (set_bit(PG_locked, &page->flags)) + if (test_and_set_bit(PG_locked, &page->flags)) panic ("ll_rw_page: page already locked"); brw_page(rw, page, dev, &block, PAGE_SIZE, 0); } @@ -1,8 +1,81 @@ /* * linux/mm/slab.c - * Written by Mark Hemment, 1996. + * Written by Mark Hemment, 1996/97. * (markhe@nextd.demon.co.uk) + * + * 11 April '97. Started multi-threading - markhe + * The global cache-chain is protected by the semaphore 'cache_chain_sem'. + * The sem is only needed when accessing/extending the cache-chain, which + * can never happen inside an interrupt (kmem_cache_create(), + * kmem_cache_shrink() and kmem_cache_reap()). + * This is a medium-term exclusion lock. + * + * Each cache has its own lock; 'c_spinlock'. This lock is needed only + * when accessing non-constant members of a cache-struct. + * Note: 'constant members' are assigned a value in kmem_cache_create() before + * the cache is linked into the cache-chain. The values never change, so not + * even a multi-reader lock is needed for these members. + * The c_spinlock is only ever held for a few cycles. + * + * To prevent kmem_cache_shrink() trying to shrink a 'growing' cache (which + * maybe be sleeping and therefore not holding the semaphore/lock), the + * c_growing field is used. This also prevents reaping from a cache. + * + * Note, caches can _never_ be destroyed. When a sub-system (eg module) has + * finished with a cache, it can only be shrunk. This leaves the cache empty, + * but already enabled for re-use, eg. during a module re-load. + * + * Notes: + * o Constructors/deconstructors are called while the cache-lock + * is _not_ held. Therefore they _must_ be threaded. + * o Constructors must not attempt to allocate memory from the + * same cache that they are a constructor for - infinite loop! + * (There is no easy way to trap this.) + * o The per-cache locks must be obtained with local-interrupts disabled. + * o When compiled with debug support, and an object-verify (upon release) + * is request for a cache, the verify-function is called with the cache + * lock held. This helps debugging. + * o The functions called from try_to_free_page() must not attempt + * to allocate memory from a cache which is being grown. + * The buffer sub-system might try to allocate memory, via buffer_cachep. + * As this pri is passed to the SLAB, and then (if necessary) onto the + * gfp() funcs (which avoid calling try_to_free_page()), no deadlock + * should happen. + * + * The positioning of the per-cache lock is tricky. 
If the lock is + * placed on the same h/w cache line as commonly accessed members + * the number of L1 cache-line faults is reduced. However, this can + * lead to the cache-line ping-ponging between processors when the + * lock is in contention (and the common members are being accessed). + * Decided to keep it away from common members. + * + * More fine-graining is possible, with per-slab locks...but this might be + * taking fine graining too far, but would have the advantage; + * During most allocs/frees no writes occur to the cache-struct. + * Therefore a multi-reader/one writer lock could be used (the writer + * needed when the slab chain is being link/unlinked). + * As we would not have an exclusion lock for the cache-structure, one + * would be needed per-slab (for updating s_free ptr, and/or the contents + * of s_index). + * The above locking would allow parallel operations to different slabs within + * the same cache with reduced spinning. + * + * Per-engine slab caches, backed by a global cache (as in Mach's Zone allocator), + * would allow most allocations from the same cache to execute in parallel. + * + * At present, each engine can be growing a cache. This should be blocked. + * + * It is not currently 100% safe to examine the page_struct outside of a kernel + * or global cli lock. The risk is v. small, and non-fatal. + * + * Calls to printk() are not 100% safe (the function is not threaded). However, + * printk() is only used under an error condition, and the risk is v. small (not + * sure if the console write functions 'enjoy' executing multiple contextes in + * parallel. I guess they don't...). + * Note, for most calls to printk() any held cache-lock is dropped. This is not + * always done for text size reasons - having *_unlock() everywhere is bloat. */ + /* * An implementation of the Slab Allocator as described in outline in; * UNIX Internals: The New Frontiers by Uresh Vahalia @@ -10,156 +83,251 @@ * or with a little more detail in; * The Slab Allocator: An Object-Caching Kernel Memory Allocator * Jeff Bonwick (Sun Microsystems). - * Presented at: USENIX Summer 1994 Technical Conference + * Presented at: USENIX Summer 1994 Technical Conference + */ + +/* + * This implementation deviates from Bonwick's paper as it + * does not use a hash-table for large objects, but rather a per slab + * index to hold the bufctls. This allows the bufctl structure to + * be small (one word), but limits the number of objects a slab (not + * a cache) can contain when off-slab bufctls are used. The limit is the + * size of the largest general-cache that does not use off-slab bufctls, + * divided by the size of a bufctl. For 32bit archs, is this 256/4 = 64. + * This is not serious, as it is only for large objects, when it is unwise + * to have too many per slab. + * Note: This limit can be raised by introducing a general-cache whose size + * is less than 512 (PAGE_SIZE<<3), but greater than 256. */ -#include <linux/slab.h> #include <linux/mm.h> +#include <linux/slab.h> #include <linux/interrupt.h> +#include <linux/config.h> +#include <linux/init.h> +#include <linux/smp.h> + #include <asm/system.h> -#include <asm/cache.h> - -/* SLAB_MGMT_CHECKS - define to enable extra checks in - * kmem_cache_[create|destroy|shrink]. - * If you're not messing around with these funcs, then undef this. - * SLAB_HIGH_PACK - define to allow 'bufctl's to be stored within objs that do not - * have a state. 
This allows more objs per slab, but removes the - * ability to sanity check an addr on release (if the addr is - * within any slab, anywhere, kmem_cache_free() will accept it!). - * SLAB_DEBUG_SUPPORT - when defined, kmem_cache_create() will honour; SLAB_DEBUG_FREE, - * SLAB_DEBUG_INITIAL and SLAB_RED_ZONE. +#include <asm/atomic.h> +#include <asm/smp_lock.h> +#include <asm/spinlock.h> + +/* If there is a different PAGE_SIZE around, and it works with this allocator, + * then change the following. */ -#define SLAB_MGMT_CHECKS -#undef SLAB_HIGH_PACK -#define SLAB_DEBUG_SUPPORT /* undef this when your cache is stable */ +#if (PAGE_SIZE != 8192 && PAGE_SIZE != 4096) +#error Your page size is probably not correctly supported - please check +#endif + +/* SLAB_MGMT_CHECKS - 1 to enable extra checks in kmem_cache_create(). + * 0 if you wish to reduce memory usage. + * + * SLAB_DEBUG_SUPPORT - 1 for kmem_cache_create() to honour; SLAB_DEBUG_FREE, + * SLAB_DEBUG_INITIAL, SLAB_RED_ZONE & SLAB_POISION. + * 0 for faster, smaller, code (espically in the critical paths). + * + * SLAB_STATS - 1 to collect stats for /proc/slabinfo. + * 0 for faster, smaller, code (espically in the critical paths). + * + * SLAB_SELFTEST - 1 to perform a few tests, mainly for developement. + */ +#define SLAB_MGMT_CHECKS 1 +#define SLAB_DEBUG_SUPPORT 0 +#define SLAB_STATS 0 +#define SLAB_SELFTEST 0 -#define BYTES_PER_WORD sizeof(void *) +/* Shouldn't this be in a header file somewhere? */ +#define BYTES_PER_WORD sizeof(void *) -/* legal flag mask for kmem_cache_create() */ -#if defined(SLAB_DEBUG_SUPPORT) -#define SLAB_C_MASK (SLAB_DEBUG_FREE|SLAB_DEBUG_INITIAL|SLAB_HWCACHE_ALIGN|SLAB_RED_ZONE) +/* Legal flag mask for kmem_cache_create(). */ +#if SLAB_DEBUG_SUPPORT +#if 0 +#define SLAB_C_MASK (SLAB_DEBUG_FREE|SLAB_DEBUG_INITIAL|SLAB_RED_ZONE| \ + SLAB_POISION|SLAB_HWCACHE_ALIGN|SLAB_NO_REAP| \ + SLAB_HIGH_PACK) +#endif +#define SLAB_C_MASK (SLAB_DEBUG_FREE|SLAB_DEBUG_INITIAL|SLAB_RED_ZONE| \ + SLAB_POISION|SLAB_HWCACHE_ALIGN|SLAB_NO_REAP) #else -#define SLAB_C_MASK (SLAB_HWCACHE_ALIGN) +#if 0 +#define SLAB_C_MASK (SLAB_HWCACHE_ALIGN|SLAB_NO_REAP|SLAB_HIGH_PACK) +#endif +#define SLAB_C_MASK (SLAB_HWCACHE_ALIGN|SLAB_NO_REAP) #endif /* SLAB_DEBUG_SUPPORT */ -/* Magic num for red zoning. - * Placed in the first word after the end of an obj - */ -#define SLAB_RED_MAGIC1 0x5A2CF071UL /* when obj is active */ -#define SLAB_RED_MAGIC2 0x170FC2A5UL /* when obj is inactive */ +/* Slab management struct. + * Manages the objs in a slab. Placed either at the end of mem allocated + * for a slab, or from an internal obj cache (cache_slabp). + * Slabs are chained into a partially ordered list; fully used first, partial + * next, and then fully free slabs. + * The first 4 members are referenced during an alloc/free operation, and + * should always appear on the same cache line. + * Note: The offset between some members _must_ match offsets within + * the kmem_cache_t - see kmem_cache_init() for the checks. */ + +#define SLAB_OFFSET_BITS 16 /* could make this larger for 64bit archs */ + +typedef struct kmem_slab_s { + struct kmem_bufctl_s *s_freep; /* ptr to first inactive obj in slab */ + struct kmem_bufctl_s *s_index; + unsigned long s_magic; + unsigned long s_inuse; /* num of objs active in slab */ + + struct kmem_slab_s *s_nextp; + struct kmem_slab_s *s_prevp; + void *s_mem; /* addr of first obj in slab */ + unsigned long s_offset:SLAB_OFFSET_BITS, + s_dma:1; +} kmem_slab_t; -/* Used for linking objs within a slab. 
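Since a bufctl is just the union above, the on-slab case keeps it immediately after the object it describes, c_offset bytes from the object's start, so the two addresses convert with simple arithmetic. An illustrative sketch, not part of the patch, written as if inside mm/slab.c:

/* On-slab case only: free objs use buf_nextp for the freelist, active objs
 * hold buf_slabp; off-slab bufctls live in s_index and hold buf_objp instead.
 */
static inline kmem_bufctl_t *example_obj_to_bufctl(kmem_cache_t *cachep, void *objp)
{
	return (kmem_bufctl_t *)(objp + cachep->c_offset);
}

static inline void *example_bufctl_to_obj(kmem_cache_t *cachep, kmem_bufctl_t *bufp)
{
	return ((void *)bufp) - cachep->c_offset;
}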
How much of the struct is - * used, and where its placed, depends on the packing used in a cache. - * Don't mess with the order! +/* When the slab mgmt is on-slab, this gives the size to use. */ +#define slab_align_size (L1_CACHE_ALIGN(sizeof(kmem_slab_t))) + +/* Test for end of slab chain. */ +#define kmem_slab_end(x) ((kmem_slab_t*)&((x)->c_offset)) + +/* s_magic */ +#define SLAB_MAGIC_ALLOC 0xA5C32F2BUL /* slab is alive */ +#define SLAB_MAGIC_DESTROYED 0xB2F23C5AUL /* slab has been destoryed */ + +/* Bufctl's are used for linking objs within a slab, identifying what slab an obj + * is in, and the address of the associated obj (for sanity checking with off-slab + * bufctls). What a bufctl contains depends upon the state of the obj and + * the organisation of the cache. */ typedef struct kmem_bufctl_s { - struct kmem_bufctl_s *buf_nextp; - struct kmem_slab_s *buf_slabp; - void *buf_objp; /* start of obj */ - struct kmem_bufctl_s *buf_hnextp; - struct kmem_bufctl_s **buf_hashp; + union { + struct kmem_bufctl_s *buf_nextp; + kmem_slab_t *buf_slabp; /* slab for obj */ + void * buf_objp; + } u; } kmem_bufctl_t; -/* different portions of the bufctl are used - so need some macros */ -#define kmem_bufctl_offset(x) ((unsigned long)&((kmem_bufctl_t *)0)->x) -#define kmem_bufctl_short_size (kmem_bufctl_offset(buf_objp)) -#define kmem_bufctl_very_short_size (kmem_bufctl_offset(buf_slabp)) +/* ...shorthand... */ +#define buf_nextp u.buf_nextp +#define buf_slabp u.buf_slabp +#define buf_objp u.buf_objp -/* Slab management struct. - * Manages the objs in a slab. Placed either at the end of mem allocated - * for the slab, or from an internal obj cache (SLAB_CFLGS_OFF_SLAB). - * Slabs are chain into a partially ordered list. The linking ptrs must - * be first in the struct! - * The size of the struct is important(ish); it should align well on - * cache line(s) +#if SLAB_DEBUG_SUPPORT +/* Magic nums for obj red zoning. + * Placed in the first word before and the first word after an obj. */ -typedef struct kmem_slab_s { - struct kmem_slab_s *s_nextp; - struct kmem_slab_s *s_prevp; - void *s_mem; /* addr of mem allocated for slab */ - unsigned long s_jiffies; - kmem_bufctl_t *s_freep; /* ptr to first inactive obj in slab */ - unsigned long s_flags; - unsigned long s_magic; - unsigned long s_inuse; /* num of objs active in slab */ -} kmem_slab_t; - -/* to test for end of slab chain */ -#define kmem_slab_end(x) ((kmem_slab_t*)&((x)->c_firstp)) +#define SLAB_RED_MAGIC1 0x5A2CF071UL /* when obj is active */ +#define SLAB_RED_MAGIC2 0x170FC2A5UL /* when obj is inactive */ -/* s_magic */ -#define SLAB_MAGIC_ALLOC 0xA5C32F2BUL -#define SLAB_MAGIC_UNALLOC 0xB2F23C5AUL +/* ...and for poisioning */ +#define SLAB_POISION_BYTE 0x5a /* byte value for poisioning */ +#define SLAB_POISION_END 0xa5 /* end-byte of poisioning */ -/* s_flags */ -#define SLAB_SFLGS_DMA 0x000001UL /* slab's mem can do DMA */ +#endif /* SLAB_DEBUG_SUPPORT */ -/* cache struct - manages a cache. - * c_lastp must appear immediately after c_firstp! +/* Cache struct - manages a cache. + * First four members are commonly referenced during an alloc/free operation. */ struct kmem_cache_s { kmem_slab_t *c_freep; /* first slab w. 
free objs */ - unsigned long c_flags; + unsigned long c_flags; /* constant flags */ unsigned long c_offset; - struct kmem_bufctl_s **c_hashp; /* ptr for off-slab bufctls */ - kmem_slab_t *c_firstp; /* first slab in chain */ - kmem_slab_t *c_lastp; /* last slab in chain */ - unsigned long c_hashbits; unsigned long c_num; /* # of objs per slab */ - unsigned long c_gfporder; /* order of pgs per slab (2^n) */ - unsigned long c_org_size; + unsigned long c_magic; unsigned long c_inuse; /* kept at zero */ - void (*c_ctor)(void *, int, unsigned long); /* constructor func */ - void (*c_dtor)(void *, int, unsigned long); /* de-constructor func */ + kmem_slab_t *c_firstp; /* first slab in chain */ + kmem_slab_t *c_lastp; /* last slab in chain */ + + spinlock_t c_spinlock; + unsigned long c_growing; + unsigned long c_dflags; /* dynamic flags */ + size_t c_org_size; + unsigned long c_gfporder; /* order of pgs per slab (2^n) */ + void (*c_ctor)(void *, kmem_cache_t *, unsigned long); /* constructor func */ + void (*c_dtor)(void *, kmem_cache_t *, unsigned long); /* de-constructor func */ unsigned long c_align; /* alignment of objs */ - unsigned long c_colour; /* cache colouring range */ - unsigned long c_colour_next;/* cache colouring */ + size_t c_colour; /* cache colouring range */ + size_t c_colour_next;/* cache colouring */ + unsigned long c_failures; const char *c_name; struct kmem_cache_s *c_nextp; + kmem_cache_t *c_index_cachep; +#if SLAB_STATS + unsigned long c_num_active; + unsigned long c_num_allocations; + unsigned long c_high_mark; + unsigned long c_grown; + unsigned long c_reaped; + atomic_t c_errors; +#endif /* SLAB_STATS */ }; -/* magic # for c_magic - used to detect out-of-slabs in __kmem_cache_alloc() */ -#define SLAB_C_MAGIC 0x4F17A36DUL - /* internal c_flags */ #define SLAB_CFLGS_OFF_SLAB 0x010000UL /* slab mgmt in own cache */ #define SLAB_CFLGS_BUFCTL 0x020000UL /* bufctls in own cache */ -#define SLAB_CFLGS_RELEASED 0x040000UL /* cache is/being destroyed */ +#define SLAB_CFLGS_GENERAL 0x080000UL /* a general-cache */ -#if defined(SLAB_HIGH_PACK) -#define SLAB_CFLGS_PTR_IN_OBJ 0x080000UL /* free ptr in obj */ -#endif +/* c_dflags (dynamic flags). 
Need to hold the spinlock to access this member */ +#define SLAB_CFLGS_GROWN 0x000002UL /* don't reap a recently grown */ #define SLAB_OFF_SLAB(x) ((x) & SLAB_CFLGS_OFF_SLAB) #define SLAB_BUFCTL(x) ((x) & SLAB_CFLGS_BUFCTL) -#define SLAB_RELEASED(x) ((x) & SLAB_CFLGS_RELEASED) -#if defined(SLAB_HIGH_PACK) -#define SLAB_PTR_IN_OBJ(x) ((x) & SLAB_CFLGS_PTR_IN_OBJ) +#define SLAB_GROWN(x) ((x) & SLAB_CFLGS_GROWN) + +#if SLAB_STATS +#define SLAB_STATS_INC_ACTIVE(x) ((x)->c_num_active++) +#define SLAB_STATS_DEC_ACTIVE(x) ((x)->c_num_active--) +#define SLAB_STATS_INC_ALLOCED(x) ((x)->c_num_allocations++) +#define SLAB_STATS_INC_GROWN(x) ((x)->c_grown++) +#define SLAB_STATS_INC_REAPED(x) ((x)->c_reaped++) +#define SLAB_STATS_SET_HIGH(x) do { if ((x)->c_num_active > (x)->c_high_mark) \ + (x)->c_high_mark = (x)->c_num_active; \ + } while (0) +#define SLAB_STATS_INC_ERR(x) (atomic_inc(&(x)->c_errors)) #else -#define SLAB_PTR_IN_OBJ(x) (0) +#define SLAB_STATS_INC_ACTIVE(x) +#define SLAB_STATS_DEC_ACTIVE(x) +#define SLAB_STATS_INC_ALLOCED(x) +#define SLAB_STATS_INC_GROWN(x) +#define SLAB_STATS_INC_REAPED(x) +#define SLAB_STATS_SET_HIGH(x) +#define SLAB_STATS_INC_ERR(x) +#endif /* SLAB_STATS */ + +#if SLAB_SELFTEST +#if !SLAB_DEBUG_SUPPORT +#error Debug support needed for self-test #endif +static void kmem_self_test(void); +#endif /* SLAB_SELFTEST */ + +/* c_magic - used to detect 'out of slabs' in __kmem_cache_alloc() */ +#define SLAB_C_MAGIC 0x4F17A36DUL /* maximum size of an obj (in 2^order pages) */ #define SLAB_OBJ_MAX_ORDER 5 /* 32 pages */ -/* maximum num of pages for a slab (avoids trying to ask for too may contigious pages) */ +/* maximum num of pages for a slab (prevents large requests to the VM layer) */ #define SLAB_MAX_GFP_ORDER 5 /* 32 pages */ /* the 'prefered' minimum num of objs per slab - maybe less for large objs */ #define SLAB_MIN_OBJS_PER_SLAB 4 -/* if the num of objs per slab is <= SLAB_MIN_OBJS_PER_SLAB, - * then the page order must be less than this before trying the next order +/* If the num of objs per slab is <= SLAB_MIN_OBJS_PER_SLAB, + * then the page order must be less than this before trying the next order. */ #define SLAB_BREAK_GFP_ORDER 2 -/* size of hash tables for caches which use off-slab bufctls (SLAB_CFLGS_BUFCTL) */ -#define KMEM_HASH_SIZE 128 +/* Macros for storing/retrieving the cachep and or slab from the + * global 'mem_map'. With off-slab bufctls, these are used to find the + * slab an obj belongs to. With kmalloc(), and kfree(), these are used + * to find the cache which an obj belongs to. + */ +#define SLAB_SET_PAGE_CACHE(pg, x) ((pg)->next = (struct page *)(x)) +#define SLAB_GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->next) +#define SLAB_SET_PAGE_SLAB(pg, x) ((pg)->prev = (struct page *)(x)) +#define SLAB_GET_PAGE_SLAB(pg) ((kmem_slab_t *)(pg)->prev) -/* size description struct for general-caches */ +/* Size description struct for general-caches. */ typedef struct cache_sizes { - unsigned long cs_size; + size_t cs_size; kmem_cache_t *cs_cachep; } cache_sizes_t; @@ -175,177 +343,177 @@ static cache_sizes_t cache_sizes[] = { {2048, NULL}, {4096, NULL}, {8192, NULL}, -#if PAGE_SIZE == 8192 {16384, NULL}, -#endif + {32768, NULL}, + {65536, NULL}, + {131072, NULL}, {0, NULL} }; -/* Names for the general-caches. - * Not placed into the sizes struct for a good reason; the - * string ptr is not needed while searching in kmem_alloc()/ - * kmem_free(), and would 'get-in-the-way' - think about it. +/* Names for the general-caches. 
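The mem_map macros above are what let kmem_cache_free() (and kfree()) work without any hash table: every page backing a slab records its cache and slab in the otherwise unused next/prev fields of its struct page, set up when the slab is grown. An illustrative lookup, not part of the patch, written as if inside mm/slab.c:

static void example_identify(void *objp, kmem_cache_t **cachepp, kmem_slab_t **slabpp)
{
	struct page *page = &mem_map[MAP_NR(objp)];

	*cachepp = SLAB_GET_PAGE_CACHE(page);	/* stashed in page->next */
	*slabpp  = SLAB_GET_PAGE_SLAB(page);	/* stashed in page->prev */
}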
Not placed into the sizes struct for + * a good reason; the string ptr is not needed while searching in kmalloc(), + * and would 'get-in-the-way' in the h/w cache. */ static char *cache_sizes_name[] = { #if PAGE_SIZE == 4096 - "cache-32", + "size-32", #endif - "cache-64", - "cache-128", - "cache-256", - "cache-512", - "cache-1024", - "cache-2048", - "cache-4096", -#if PAGE_SIZE == 4096 - "cache-8192" -#elif PAGE_SIZE == 8192 - "cache-8192", - "cache-16384" -#else -#error Your page size is not supported for the general-caches - please fix -#endif -}; - -static void kmem_hash_ctor(void *ptr, int , unsigned long); /* fwd ref */ -extern kmem_cache_t cache_cache; /* fwd ref */ - -/* internal cache of hash objs, only used when bufctls are off-slab */ -static kmem_cache_t cache_hash = { -/* freep, flags */ kmem_slab_end(&cache_hash), 0, -/* offset, hashp */ sizeof(kmem_bufctl_t*)*KMEM_HASH_SIZE, NULL, -/* firstp, lastp */ kmem_slab_end(&cache_hash), kmem_slab_end(&cache_hash), -/* hashbits, num, gfporder */ 0, 0, 0, -/* org_size, magic */ sizeof(kmem_bufctl_t*)*KMEM_HASH_SIZE, SLAB_C_MAGIC, -/* inuse, ctor, dtor, align */ 0, kmem_hash_ctor, NULL, L1_CACHE_BYTES, -/* colour, colour_next */ 0, 0, -/* name, nextp */ "hash_cache", &cache_cache -}; - -/* internal cache of freelist mgmnt objs, only use when bufctls are off-slab */ -static kmem_cache_t cache_bufctl = { -/* freep, flags */ kmem_slab_end(&cache_bufctl), 0, -/* offset, hashp */ sizeof(kmem_bufctl_t), NULL, -/* firstp, lastp */ kmem_slab_end(&cache_bufctl), kmem_slab_end(&cache_bufctl), -/* hashbits, num, gfporder */ 0, 0, 0, -/* org_size, magic */ sizeof(kmem_bufctl_t), SLAB_C_MAGIC, -/* inuse, ctor, dtor, align */ 0, NULL, NULL, BYTES_PER_WORD*2, -/* colour, colour_next */ 0, 0, -/* name, nextp */ "bufctl_cache", &cache_hash -}; - -/* internal cache of slab mngmnt objs, only used when slab mgmt is off-slab */ -static kmem_cache_t cache_slab = { -/* freep, flags */ kmem_slab_end(&cache_slab), 0, -/* offset, hashp */ sizeof(kmem_slab_t), NULL, -/* firstp, lastp */ kmem_slab_end(&cache_slab), kmem_slab_end(&cache_slab), -/* hashbits, num, gfporder */ 0, 0, 0, -/* org_size, magic */ sizeof(kmem_slab_t), SLAB_C_MAGIC, -/* inuse, ctor, dtor, align */ 0, NULL, NULL, L1_CACHE_BYTES, -/* colour, colour_next */ 0, 0, -/* name, nextp */ "slab_cache", &cache_bufctl + "size-64", + "size-128", + "size-256", + "size-512", + "size-1024", + "size-2048", + "size-4096", + "size-8192", + "size-16384", + "size-32768", + "size-65536", + "size-131072" }; /* internal cache of cache description objs */ static kmem_cache_t cache_cache = { -/* freep, flags */ kmem_slab_end(&cache_cache), 0, -/* offset, hashp */ sizeof(kmem_cache_t), NULL, +/* freep, flags */ kmem_slab_end(&cache_cache), SLAB_NO_REAP, +/* offset, num */ sizeof(kmem_cache_t), 0, +/* c_magic, c_inuse */ SLAB_C_MAGIC, 0, /* firstp, lastp */ kmem_slab_end(&cache_cache), kmem_slab_end(&cache_cache), -/* hashbits, num, gfporder */ 0, 0, 0, -/* org_size, magic */ sizeof(kmem_cache_t), SLAB_C_MAGIC, -/* inuse, ctor, dtor, align */ 0, NULL, NULL, L1_CACHE_BYTES, +/* spinlock */ SPIN_LOCK_UNLOCKED, +/* growing */ 0, +/* dflags */ 0, +/* org_size, gfp */ 0, 0, +/* ctor, dtor, align */ NULL, NULL, L1_CACHE_BYTES, /* colour, colour_next */ 0, 0, +/* failures */ 0, /* name */ "kmem_cache", -/* nextp */ &cache_slab +/* nextp */ &cache_cache, +/* index */ NULL, }; -/* constructor for hash tables */ -static void kmem_hash_ctor(void *ptr, int size, unsigned long flags) -{ - memset(ptr, 0, 
sizeof(kmem_bufctl_t*)*KMEM_HASH_SIZE); -} +/* Guard access to the cache-chain. */ +static struct semaphore cache_chain_sem; -/* place maintainer for reaping */ +/* Place maintainer for reaping. */ static kmem_cache_t *clock_searchp = &cache_cache; -/* Init an internal cache */ -static void -kmem_own_cache_init(kmem_cache_t *cachep) -{ - unsigned long size, i; +/* Internal slab mgmt cache, for when slab mgmt is off-slab. */ +static kmem_cache_t *cache_slabp = NULL; - if (cachep->c_inuse || cachep->c_magic != SLAB_C_MAGIC) { - panic("Bad init of internal cache %s", cachep->c_name); - /* NOTREACHED */ - } - size = cachep->c_offset + kmem_bufctl_short_size; - i = size % cachep->c_align; - if (i) - size += (cachep->c_align-i); - cachep->c_offset = size-kmem_bufctl_short_size; - - i = ((PAGE_SIZE<<cachep->c_gfporder)-sizeof(kmem_slab_t)); - cachep->c_num = i / size; /* num of objs per slab */ - - /* cache colouring */ - cachep->c_colour = 1 + (i-(cachep->c_num*size))/cachep->c_align; - cachep->c_colour_next = cachep->c_colour; -} +/* Max number of objs-per-slab for caches which use bufctl's. + * Needed to avoid a possible looping condition in kmem_cache_grow(). + */ +static unsigned long bufctl_limit = 0; -/* Initialisation - setup all internal caches */ -long -kmem_cache_init(long start, long end) +/* Initialisation - setup the `cache' cache. */ +__initfunc(long kmem_cache_init(long start, long end)) { - /* sanity */ + size_t size, i; + +#define kmem_slab_offset(x) ((unsigned long)&((kmem_slab_t *)0)->x) +#define kmem_slab_diff(a,b) (kmem_slab_offset(a) - kmem_slab_offset(b)) #define kmem_cache_offset(x) ((unsigned long)&((kmem_cache_t *)0)->x) -#define kmem_slab_offset(x) ((unsigned long)&((kmem_slab_t *)0)->x) - if (((kmem_cache_offset(c_magic)-kmem_cache_offset(c_firstp)) != kmem_slab_offset(s_magic)) || - ((kmem_cache_offset(c_inuse)-kmem_cache_offset(c_firstp)) != kmem_slab_offset(s_inuse))) { +#define kmem_cache_diff(a,b) (kmem_cache_offset(a) - kmem_cache_offset(b)) + + /* Sanity checks... */ + if (kmem_cache_diff(c_firstp, c_magic) != kmem_slab_diff(s_nextp, s_magic) || + kmem_cache_diff(c_firstp, c_inuse) != kmem_slab_diff(s_nextp, s_inuse) || + ((kmem_cache_offset(c_lastp) - + ((unsigned long) kmem_slab_end((kmem_cache_t*)NULL))) != + kmem_slab_offset(s_prevp)) || + kmem_cache_diff(c_lastp, c_firstp) != kmem_slab_diff(s_prevp, s_nextp)) { /* Offsets to the magic are incorrect, either the structures have * been incorrectly changed, or adjustments are needed for your * architecture. */ - panic("kmem_cache_init(): Offsets are different - been messed with!\n"); + panic("kmem_cache_init(): Offsets are wrong - I've been messed with!"); /* NOTREACHED */ } #undef kmem_cache_offset +#undef kmem_cache_diff #undef kmem_slab_offset +#undef kmem_slab_diff + + cache_chain_sem = MUTEX; + + size = cache_cache.c_offset + sizeof(kmem_bufctl_t); + size += (L1_CACHE_BYTES-1); + size &= ~(L1_CACHE_BYTES-1); + cache_cache.c_offset = size-sizeof(kmem_bufctl_t); + + i = (PAGE_SIZE<<cache_cache.c_gfporder)-slab_align_size; + cache_cache.c_num = i / size; /* num of objs per slab */ + + /* Cache colouring. 
*/ + cache_cache.c_colour = (i-(cache_cache.c_num*size))/L1_CACHE_BYTES; + cache_cache.c_colour_next = cache_cache.c_colour; - kmem_own_cache_init(&cache_cache); - kmem_own_cache_init(&cache_slab); - kmem_own_cache_init(&cache_bufctl); - kmem_own_cache_init(&cache_hash); return start; } -/* Initialisation - setup general caches */ -void -kmem_cache_sizes_init(void) +/* Initialisation - setup remaining internal and general caches. + * Called after the gfp() functions have been enabled, and before smp_init(). + */ +__initfunc(void kmem_cache_sizes_init(void)) { - unsigned long i; - - i = sizeof(cache_sizes)/sizeof(cache_sizes[0])-1; - while (i--) - cache_sizes[i].cs_cachep = kmem_cache_create(cache_sizes_name[i], - cache_sizes[i].cs_size, - 0, 0, NULL, NULL); + unsigned int found = 0; + + cache_slabp = kmem_cache_create("slab_cache", sizeof(kmem_slab_t), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (cache_slabp) { + char **names = cache_sizes_name; + cache_sizes_t *sizes = cache_sizes; + do { + /* For performance, all the general-caches are L1 aligned. + * This should be particularly beneficial on SMP boxes, as it + * elimantes "false sharing". + * Note for systems short on memory removing the alignment will + * allow tighter packing of the smaller caches. */ + if (!(sizes->cs_cachep = + kmem_cache_create(*names++, sizes->cs_size, + 0, SLAB_HWCACHE_ALIGN, NULL, NULL))) + goto panic_time; + if (!found) { + /* Inc off-slab bufctl limit until the ceiling is hit. */ + if (SLAB_BUFCTL(sizes->cs_cachep->c_flags)) + found++; + else + bufctl_limit = + (sizes->cs_size/sizeof(kmem_bufctl_t)); + } + sizes->cs_cachep->c_flags |= SLAB_CFLGS_GENERAL; + sizes++; + } while (sizes->cs_size); +#if SLAB_SELFTEST + kmem_self_test(); +#endif /* SLAB_SELFTEST */ + return; + } +panic_time: + panic("kmem_cache_sizes_init: Error creating caches"); + /* NOTREACHED */ } -/* Interface to system's page allocator. - * dma pts to non-zero if all of the mem is suitable for DMA +/* Interface to system's page allocator. Dma pts to non-zero if all + * of memory is DMAable. No need to hold the cache-lock. */ static inline void * -kmem_getpages(const kmem_cache_t *cachep, unsigned long flags, unsigned int *dma) +kmem_getpages(kmem_cache_t *cachep, unsigned long flags, unsigned int *dma) { - struct page *page; void *addr; - addr = (void*) __get_free_pages(flags & SLAB_LEVEL_MASK, \ - cachep->c_gfporder, flags & SLAB_DMA); - *dma = 1<<cachep->c_gfporder; - if (!(flags & SLAB_DMA) && addr) { - /* need to check if can dma */ - page = mem_map + MAP_NR(addr); + *dma = flags & SLAB_DMA; + addr = (void*) __get_free_pages(flags & SLAB_LEVEL_MASK, + cachep->c_gfporder, *dma); + /* Assume that now we have the pages no one else can legally + * messes with the 'struct page's. + * However vm_scan() might try to test the structure to see if + * it is a named-page or buffer-page. The members it tests are + * of no interest here..... + */ + if (!*dma && addr) { + /* Need to check if can dma. */ + struct page *page = mem_map + MAP_NR(addr); + *dma = 1<<cachep->c_gfporder; while ((*dma)--) { if (!PageDMA(page)) { *dma = 0; @@ -357,58 +525,52 @@ kmem_getpages(const kmem_cache_t *cachep, unsigned long flags, unsigned int *dma return addr; } -/* Interface to system's page release */ +/* Interface to system's page release. */ static inline void kmem_freepages(kmem_cache_t *cachep, void *addr) { + unsigned long i = (1<<cachep->c_gfporder); + struct page *page = &mem_map[MAP_NR(addr)]; + + /* free_pages() does not clear the type bit - we do that. 
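As the macro comment earlier notes, kmalloc() and kfree() locate the owning cache through the mem_map fields, so these general caches are what actually service kmalloc() requests. A small illustrative usage; the header name reflects the kernels of this era and is an assumption here:

#include <linux/malloc.h>

static void example_kmalloc_use(void)
{
	/* A 200 byte request falls into the "size-256" general cache. */
	void *p = kmalloc(200, GFP_KERNEL);

	if (p)
		kfree(p);	/* owning cache recovered via the mem_map macros */
}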
+ * The pages have been unlinked from their cache-slab, + * but their 'struct page's might be accessed in + * vm_scan(). Shouldn't be a worry. + */ + while (i--) { + PageClearSlab(page); + page++; + } free_pages((unsigned long)addr, cachep->c_gfporder); } -/* Hashing function - used for caches with off-slab bufctls */ -static inline int -kmem_hash(const kmem_cache_t *cachep, const void *objp) +#if SLAB_DEBUG_SUPPORT +static inline void +kmem_poision_obj(kmem_cache_t *cachep, void *addr) { - return (((unsigned long)objp >> cachep->c_hashbits) & (KMEM_HASH_SIZE-1)); + memset(addr, SLAB_POISION_BYTE, cachep->c_org_size); + *(unsigned char *)(addr+cachep->c_org_size-1) = SLAB_POISION_END; } -/* Link bufctl into a hash table - used for caches with off-slab bufctls - * - called with ints disabled - */ -static inline void * -kmem_add_to_hash(kmem_cache_t *cachep, kmem_bufctl_t *bufp) +static inline int +kmem_check_poision_obj(kmem_cache_t *cachep, void *addr) { - kmem_bufctl_t **bufpp = bufp->buf_hashp; - - bufp->buf_hnextp = *bufpp; - return (*bufpp = bufp)->buf_objp; + void *end; + end = memchr(addr, SLAB_POISION_END, cachep->c_org_size); + if (end != (addr+cachep->c_org_size-1)) + return 1; + return 0; } +#endif /* SLAB_DEBUG_SUPPORT */ -/* Find bufcntl for given obj addr, and unlink. - * - called with ints disabled +/* Three slab chain funcs - all called with ints disabled and the appropiate + * cache-lock held. */ -static inline kmem_bufctl_t * -kmem_remove_from_hash(kmem_cache_t *cachep, const void *objp) -{ - kmem_bufctl_t *bufp; - kmem_bufctl_t **bufpp = &cachep->c_hashp[kmem_hash(cachep, objp)]; - - for (;*bufpp; bufpp = &(*bufpp)->buf_hnextp) { - if ((*bufpp)->buf_objp != objp) - continue; - bufp = *bufpp; - *bufpp = bufp->buf_hnextp; - return bufp; - } - return NULL; -} - -/* Three slab chain funcs - all called with ints disabled */ static inline void kmem_slab_unlink(kmem_slab_t *slabp) { kmem_slab_t *prevp = slabp->s_prevp; kmem_slab_t *nextp = slabp->s_nextp; - prevp->s_nextp = nextp; nextp->s_prevp = prevp; } @@ -416,781 +578,881 @@ kmem_slab_unlink(kmem_slab_t *slabp) static inline void kmem_slab_link_end(kmem_cache_t *cachep, kmem_slab_t *slabp) { + kmem_slab_t *lastp = cachep->c_lastp; slabp->s_nextp = kmem_slab_end(cachep); - slabp->s_prevp = cachep->c_lastp; - kmem_slab_end(cachep)->s_prevp = slabp; - slabp->s_prevp->s_nextp = slabp; + slabp->s_prevp = lastp; + cachep->c_lastp = slabp; + lastp->s_nextp = slabp; } static inline void kmem_slab_link_free(kmem_cache_t *cachep, kmem_slab_t *slabp) { kmem_slab_t *nextp = cachep->c_freep; - + kmem_slab_t *prevp = nextp->s_prevp; slabp->s_nextp = nextp; - cachep->c_freep = slabp; - slabp->s_prevp = nextp->s_prevp; + slabp->s_prevp = prevp; nextp->s_prevp = slabp; slabp->s_prevp->s_nextp = slabp; } -/* Cal the num objs, wastage, and bytes left over for a given slab size */ -static int -kmem_cache_cal_waste(unsigned long gfporder, unsigned long size, - unsigned long extra, unsigned long flags, - unsigned long *left_over, unsigned long *num) +/* Destroy all the objs in a slab, and release the mem back to the system. + * Before calling the slab must have been unlinked from the cache. + * The cache-lock is not held/needed. + */ +static void +kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp) { - unsigned long wastage; + if (cachep->c_dtor +#if SLAB_DEBUG_SUPPORT + || cachep->c_flags & (SLAB_POISION || SLAB_RED_ZONE) +#endif /*SLAB_DEBUG_SUPPORT*/ + ) { + /* Doesn't use the bufctl ptrs to find objs. 
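Because free and active objects are interleaved on a slab, the object walk in kmem_slab_destroy() cannot follow freelist pointers; it steps through the slab with a fixed stride instead. A sketch of that stride (illustrative, red-zone words omitted, written as if inside mm/slab.c):

/*
 * On-slab layout, repeated c_num times from s_mem:
 *
 *   | object (c_offset bytes) | kmem_bufctl_t | object | bufctl | ...
 */
static inline size_t example_obj_stride(kmem_cache_t *cachep, kmem_slab_t *slabp)
{
	size_t stride = cachep->c_offset;

	if (!slabp->s_index)		/* bufctls are on-slab */
		stride += sizeof(kmem_bufctl_t);
	return stride;
}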
*/ + unsigned long num = cachep->c_num; + void *objp = slabp->s_mem; + do { +#if SLAB_DEBUG_SUPPORT + if (cachep->c_flags & SLAB_RED_ZONE) { + if (*((unsigned long*)(objp)) != SLAB_RED_MAGIC1) + printk(KERN_ERR "kmem_slab_destroy: " + "Bad front redzone - %s\n", + cachep->c_name); + objp += BYTES_PER_WORD; + if (*((unsigned long*)(objp+cachep->c_org_size)) != + SLAB_RED_MAGIC1) + printk(KERN_ERR "kmem_slab_destroy: " + "Bad rear redzone - %s\n", + cachep->c_name); + } + if (cachep->c_dtor) +#endif /*SLAB_DEBUG_SUPPORT*/ + (cachep->c_dtor)(objp, cachep, 0); +#if SLAB_DEBUG_SUPPORT + else if (cachep->c_flags & SLAB_POISION) { + if (kmem_check_poision_obj(cachep, objp)) + printk(KERN_ERR "kmem_slab_destory: " + "Bad poision - %s\n", cachep->c_name); + } + if (cachep->c_flags & SLAB_RED_ZONE) + objp -= BYTES_PER_WORD; +#endif /* SLAB_DEBUG_SUPPORT */ + objp += cachep->c_offset; + if (!slabp->s_index) + objp += sizeof(kmem_bufctl_t); + } while (--num); + } - wastage = PAGE_SIZE << gfporder; - gfporder = 0; - if (!SLAB_OFF_SLAB(flags)) - gfporder = sizeof(kmem_slab_t); + slabp->s_magic = SLAB_MAGIC_DESTROYED; + kmem_freepages(cachep, slabp->s_mem-slabp->s_offset); + if (slabp->s_index) + kmem_cache_free(cachep->c_index_cachep, slabp->s_index); + if (SLAB_OFF_SLAB(cachep->c_flags)) + kmem_cache_free(cache_slabp, slabp); +} + +/* Cal the num objs, wastage, and bytes left over for a given slab size. */ +static inline size_t +kmem_cache_cal_waste(unsigned long gfporder, size_t size, size_t extra, + unsigned long flags, size_t *left_over, unsigned long *num) +{ + size_t wastage = PAGE_SIZE<<gfporder; + + if (SLAB_OFF_SLAB(flags)) + gfporder = 0; + else + gfporder = slab_align_size; wastage -= gfporder; *num = wastage / size; wastage -= (*num * size); *left_over = wastage; - wastage += (extra * *num); - wastage += gfporder; - - return wastage; + return (wastage + gfporder + (extra * *num)); } -/* Create a cache +/* Create a cache: * Returns a ptr to the cache on success, NULL on failure. * Cannot be called within a int, but can be interrupted. * NOTE: The 'name' is assumed to be memory that is _not_ going to disappear. */ kmem_cache_t * -kmem_cache_create(const char *name, unsigned long size, unsigned long align, - unsigned long flags, void (*ctor)(void*, int, unsigned long), - void (*dtor)(void*, int, unsigned long)) +kmem_cache_create(const char *name, size_t size, size_t offset, + unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long), + void (*dtor)(void*, kmem_cache_t *, unsigned long)) { - const char *func_nm="kmem_create: "; - kmem_cache_t *searchp, *cachep; - unsigned long words, i; - unsigned long num, left_over; + const char *func_nm= KERN_ERR "kmem_create: "; + kmem_cache_t *searchp; + kmem_cache_t *cachep=NULL; + size_t extra; + size_t left_over; + size_t align; - /* sanity checks */ -#if defined(SLAB_MGMT_CHECKS) + /* Sanity checks... 
*/ +#if SLAB_MGMT_CHECKS if (!name) { - printk(KERN_ERR "%sNULL ptr\n", func_nm); - return NULL; + printk("%sNULL ptr\n", func_nm); + goto opps; } if (in_interrupt()) { - printk(KERN_ERR "%sCalled during int - %s\n", func_nm, name); - return NULL; + printk("%sCalled during int - %s\n", func_nm, name); + goto opps; } - if (size < kmem_bufctl_very_short_size) { - printk(KERN_WARNING "%sSize too small %lu - %s\n", func_nm, size, name); - size = kmem_bufctl_very_short_size; + if (size < BYTES_PER_WORD) { + printk("%sSize too small %d - %s\n", func_nm, (int) size, name); + size = BYTES_PER_WORD; } if (size > ((1<<SLAB_OBJ_MAX_ORDER)*PAGE_SIZE)) { - printk(KERN_ERR "%sSize too large %lu - %s\n", func_nm, size, name); - return NULL; - } -#endif /* SLAB_MGMT_CHECKS */ - - /* always checks flags, a caller might be expecting debug support which - * isn't available - */ - if (flags & ~SLAB_C_MASK) { - /* Illegal flags */ - printk(KERN_WARNING "%sIllgl flg %lX - %s\n", func_nm, flags, name); - flags &= SLAB_C_MASK; + printk("%sSize too large %d - %s\n", func_nm, (int) size, name); + goto opps; } -#if defined(SLAB_MGMT_CHECKS) - if (align < 0 || align >= size) { - printk(KERN_WARNING "%sAlign weired %lu - %s\n", func_nm, align, name); - align = 0; + if (dtor && !ctor) { + /* Decon, but no con - doesn't make sense */ + printk("%sDecon but no con - %s\n", func_nm, name); + goto opps; } - if (dtor && !ctor) { - /* Descon, but no con - doesn't make sense */ - printk(KERN_ERR "%sDecon but no con - %s\n", func_nm, name); - return NULL; + if (offset < 0 || offset > size) { + printk("%sOffset weired %d - %s\n", func_nm, (int) offset, name); + offset = 0; } +#if SLAB_DEBUG_SUPPORT if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { /* No constructor, but inital state check requested */ - printk(KERN_WARNING "%sNo con, but init state check requested - %s\n", - func_nm, name); + printk("%sNo con, but init state check requested - %s\n", func_nm, name); flags &= ~SLAB_DEBUG_INITIAL; } + + if ((flags & SLAB_POISION) && ctor) { + /* request for poisioning, but we can't do that with a constructor */ + printk("%sPoisioning requested, but con given - %s\n", func_nm, name); + flags &= ~SLAB_POISION; + } +#if 0 + if ((flags & SLAB_HIGH_PACK) && ctor) { + printk("%sHigh pack requested, but con given - %s\n", func_nm, name); + flags &= ~SLAB_HIGH_PACK; + } + if ((flags & SLAB_HIGH_PACK) && (flags & (SLAB_POISION|SLAB_RED_ZONE))) { + printk("%sHigh pack requested, but with poisioning/red-zoning - %s\n", + func_nm, name); + flags &= ~SLAB_HIGH_PACK; + } +#endif +#endif /* SLAB_DEBUG_SUPPORT */ #endif /* SLAB_MGMT_CHECKS */ - /* get cache's description obj */ + /* Always checks flags, a caller might be expecting debug + * support which isn't available. + */ + if (flags & ~SLAB_C_MASK) { + printk("%sIllgl flg %lX - %s\n", func_nm, flags, name); + flags &= SLAB_C_MASK; + } + + /* Get cache's description obj. */ cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL); if (!cachep) goto opps; + memset(cachep, 0, sizeof(kmem_cache_t)); - /* remember original size, so can be passed to a constructor or decon. - * Allows the same con/decon to be used for caches of similar objs - * that have a different size data buffer assoicated with them + /* Check that size is in terms of words. This is needed to avoid + * unaligned accesses for some archs when redzoning is used, and makes + * sure any on-slab bufctl's are also correctly aligned. 
*/ - cachep->c_org_size = size; + if (size & (BYTES_PER_WORD-1)) { + size += (BYTES_PER_WORD-1); + size &= ~(BYTES_PER_WORD-1); + printk("%sForcing size word alignment - %s\n", func_nm, name); + } -#if defined(SLAB_DEBUG_SUPPORT) - if (flags & SLAB_RED_ZONE) - size += BYTES_PER_WORD; /* word for redzone */ +#if SLAB_DEBUG_SUPPORT + if (flags & SLAB_RED_ZONE) { + /* There is no point trying to honour cache alignment when redzoning. */ + flags &= ~SLAB_HWCACHE_ALIGN; + size += 2*BYTES_PER_WORD; /* words for redzone */ + } #endif /* SLAB_DEBUG_SUPPORT */ + cachep->c_org_size = size; - /* Make a guess if slab mngmnt obj and/or bufctls are 'on' or 'off' slab */ - i = kmem_bufctl_short_size; + align = BYTES_PER_WORD; + if (flags & SLAB_HWCACHE_ALIGN) + align = L1_CACHE_BYTES; + + /* Determine if the slab mgmt and/or bufclts are 'on' or 'off' slab. */ + extra = sizeof(kmem_bufctl_t); if (size < (PAGE_SIZE>>3)) { - /* Size is small(ish). Use format where bufctl size per - * obj is low, and slab mngmnt is on-slab + /* Size is small(ish). Use packing where bufctl size per + * obj is low, and slab mngmnt is on-slab. */ - if (!ctor && !dtor && !(flags & SLAB_RED_ZONE)) { - /* the objs in this cache have no state - can store - * store freelist ptr within obj. (redzoning is a state) +#if 0 + if ((flags & SLAB_HIGH_PACK)) { + /* Special high packing for small objects + * (mainly for vm_mapping structs, but + * others can use it). */ -#if defined(SLAB_HIGH_PACK) - i=0; - flags |= SLAB_CFLGS_PTR_IN_OBJ; -#else - i = kmem_bufctl_very_short_size; -#endif + if (size == (L1_CACHE_BYTES/4) || size == (L1_CACHE_BYTES/2) || + size == L1_CACHE_BYTES) { + /* The bufctl is stored with the object. */ + extra = 0; + } else + flags &= ~SLAB_HIGH_PACK; } +#endif } else { /* Size is large, assume best to place the slab mngmnt obj - * off-slab (should allow better packing of objs) + * off-slab (should allow better packing of objs). */ flags |= SLAB_CFLGS_OFF_SLAB; - if (!(size & ~PAGE_MASK) || - size == (PAGE_SIZE+PAGE_SIZE/2) || - size == (PAGE_SIZE/2) || - size == (PAGE_SIZE/4) || - size == (PAGE_SIZE/8)) { - /* to avoid waste the bufctls are off-slab */ + if (!(size & ~PAGE_MASK) || size == (PAGE_SIZE/2) + || size == (PAGE_SIZE/4) || size == (PAGE_SIZE/8)) { + /* To avoid waste the bufctls are off-slab... */ flags |= SLAB_CFLGS_BUFCTL; - /* get hash table for cache */ - cachep->c_hashp = kmem_cache_alloc(&cache_hash, SLAB_KERNEL); - if (cachep->c_hashp == NULL) { - kmem_cache_free(&cache_cache, cachep); - goto opps; - } - i = 0; - cachep->c_hashbits = PAGE_SHIFT; - if (size <= (PAGE_SIZE/2)) { - cachep->c_hashbits--; - if (size <= (PAGE_SIZE/4)) cachep->c_hashbits--; - if (size <= (PAGE_SIZE/8)) cachep->c_hashbits -= 2; + extra = 0; + } /* else slab mngmnt is off-slab, but freelist ptrs are on. */ + } + size += extra; + + if (flags & SLAB_HWCACHE_ALIGN) { + /* Need to adjust size so that objs are cache aligned. */ + if (size > (L1_CACHE_BYTES/2)) { + size_t words = size % L1_CACHE_BYTES; + if (words) + size += (L1_CACHE_BYTES-words); + } else { + /* Small obj size, can get at least two per cache line. */ + int num_per_line = L1_CACHE_BYTES/size; + left_over = L1_CACHE_BYTES - (num_per_line*size); + if (left_over) { + /* Need to adjust size so objs cache align. */ + if (left_over%num_per_line) { + /* Odd num of objs per line - fixup. 
*/ + num_per_line--; + left_over += size; + } + size += (left_over/num_per_line); } - } /* else slab mngmnt is off-slab, but freelist ptrs are on */ + } + } else if (!(size%L1_CACHE_BYTES)) { + /* Size happens to cache align... */ + flags |= SLAB_HWCACHE_ALIGN; + align = L1_CACHE_BYTES; } - size += i; - - /* Adjust the mem used for objs so they will align correctly. - * Force objs to start on word boundaries, but caller may specify - * h/w cache line boundaries. This 'alignment' is slightly different - * to the 'align' argument. Objs may be requested to start on h/w - * lines (as that is how the members of the obj have been organised), - * but the 'align' may be quite high (say 64) as the first 64 bytes - * are commonly accessed/modified within a loop (stops h/w line - * thrashing). The 'align' is the slab colouring. - */ - words = BYTES_PER_WORD; - if (flags & SLAB_HWCACHE_ALIGN) - words = L1_CACHE_BYTES; - words--; - size += words; - size = size & ~words; - /* alignment might not be a factor of the boundary alignment - fix-up */ - align += words; - align = align & ~words; - /* Cal size (in pages) of slabs, and the num of objs per slab. - * This could be made much more intelligent. */ - cachep->c_gfporder=0; + * This could be made much more intelligent. For now, try to avoid + * using high page-orders for slabs. When the gfp() funcs are more + * friendly towards high-order requests, this should be changed. + */ do { - unsigned long wastage; - wastage = kmem_cache_cal_waste(cachep->c_gfporder, size, i, - flags, &left_over, &num); - if (!num) + size_t wastage; + unsigned int break_flag = 0; +cal_wastage: + wastage = kmem_cache_cal_waste(cachep->c_gfporder, size, extra, + flags, &left_over, &cachep->c_num); + if (!cachep->c_num) goto next; - if (SLAB_PTR_IN_OBJ(flags)) + if (break_flag) break; + if (SLAB_BUFCTL(flags) && cachep->c_num > bufctl_limit) { + /* Oops, this num of objs will cause problems. */ + cachep->c_gfporder--; + break_flag++; + goto cal_wastage; + } if (cachep->c_gfporder == SLAB_MAX_GFP_ORDER) break; - /* large num of objs is good, but v. large slabs are bad for the - * VM sub-system + + /* Large num of objs is good, but v. large slabs are currently + * bad for the gfp()s. */ - if (num <= SLAB_MIN_OBJS_PER_SLAB) { + if (cachep->c_num <= SLAB_MIN_OBJS_PER_SLAB) { if (cachep->c_gfporder < SLAB_BREAK_GFP_ORDER) goto next; } - /* stop caches with small objs having a large num of pages */ - if (left_over <= sizeof(kmem_slab_t)) + + /* Stop caches with small objs having a large num of pages. */ + if (left_over <= slab_align_size) break; if ((wastage*8) <= (PAGE_SIZE<<cachep->c_gfporder)) - break; /* acceptable wastage */ + break; /* Acceptable internal fragmentation. */ next: cachep->c_gfporder++; } while (1); - cachep->c_num = num; - /* try with requested alignment, but reduce it if that will - * allow at least some alignment words + /* If the slab has been placed off-slab, and we have enough space then + * move it on-slab. This is at the expense of any extra colouring. 
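To make the colouring being computed here concrete, a worked example with made-up but representative numbers (not part of the patch):

/*
 * Suppose a slab ends up with left_over = 96 bytes once the objects and
 * any on-slab mgmt have been packed, and the colour granularity settles
 * at 32 bytes.  Then:
 *
 *     c_colour      = 96 / 32 = 3
 *     c_colour_next = 3
 *
 * and kmem_cache_grow() starts the first object of successive slabs at
 * offsets 3*32, 2*32, 1*32, 0 and then cycles, so the same object index
 * in different slabs lands on different L1 cache lines.
 */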
*/ - words++; - if (left_over < align) - align = (left_over / words) * words; - else if (!align && words <= left_over) { - /* no alignment given, but space enough - give one */ - align = words; - if (words == BYTES_PER_WORD) { - if (BYTES_PER_WORD*4 <= left_over) - align += align; - if (BYTES_PER_WORD*8 <= left_over) - align += align; + if ((flags & SLAB_CFLGS_OFF_SLAB) && !SLAB_BUFCTL(flags) && + left_over >= slab_align_size) { + flags &= ~SLAB_CFLGS_OFF_SLAB; + left_over -= slab_align_size; + } + + /* Offset must be a factor of the alignment. */ + offset += (align-1); + offset &= ~(align-1); + + /* Mess around with the offset alignment. */ + if (!left_over) { + offset = 0; + } else if (left_over < offset) { + offset = align; + if (flags & SLAB_HWCACHE_ALIGN) { + if (left_over < offset) + offset = 0; + } else { + /* Offset is BYTES_PER_WORD, and left_over is at + * least BYTES_PER_WORD. + */ + if (left_over >= (BYTES_PER_WORD*2)) { + offset >>= 1; + if (left_over >= (BYTES_PER_WORD*4)) + offset >>= 1; + } + } + } else if (!offset) { + /* No offset requested, but space enough - give one. */ + offset = left_over/align; + if (flags & SLAB_HWCACHE_ALIGN) { + if (offset >= 8) { + /* A large number of colours - use a larger alignment. */ + align <<= 1; + } + } else { + if (offset >= 10) { + align <<= 1; + if (offset >= 16) + align <<= 1; + } } + offset = align; } - cachep->c_align = align; #if 0 - printk("Size:%lu Orig:%lu Left:%lu Align %lu Pages:%d - %s\n", - size, cachep->c_org_size, left_over, align, 1<<cachep->c_gfporder, name); - if (SLAB_OFF_SLAB(flags)) printk("OFF SLAB\n"); - if (SLAB_BUFCTL(flags)) printk("BUFCTL PTRS\n"); +printk("%s: Left_over:%d Align:%d Size:%d\n", name, left_over, offset, size); #endif - /* if the bufctl's are on-slab, c_offset does not inc the size of the bufctl */ + if ((cachep->c_align = (unsigned long) offset)) + cachep->c_colour = (left_over/offset); + cachep->c_colour_next = cachep->c_colour; + + /* If the bufctl's are on-slab, c_offset does not include the size of bufctl. */ if (!SLAB_BUFCTL(flags)) - size -= kmem_bufctl_short_size; + size -= sizeof(kmem_bufctl_t); + else + cachep->c_index_cachep = + kmem_find_general_cachep(cachep->c_num*sizeof(kmem_bufctl_t)); + cachep->c_offset = (unsigned long) size; cachep->c_freep = kmem_slab_end(cachep); - cachep->c_flags = flags; - cachep->c_offset = size; cachep->c_firstp = kmem_slab_end(cachep); cachep->c_lastp = kmem_slab_end(cachep); + cachep->c_flags = flags; cachep->c_ctor = ctor; cachep->c_dtor = dtor; cachep->c_magic = SLAB_C_MAGIC; - cachep->c_inuse = 0; /* always zero */ - cachep->c_name = name; /* simply point to the name */ + cachep->c_name = name; /* Simply point to the name. */ + spin_lock_init(&cachep->c_spinlock); - cachep->c_colour = 1; - if (align) - cachep->c_colour += (left_over/align); - cachep->c_colour_next = cachep->c_colour; - - /* warn on dup cache names */ + /* Need the semaphore to access the chain. */ + down(&cache_chain_sem); searchp = &cache_cache; do { + /* The name field is constant - no lock needed. */ if (!strcmp(searchp->c_name, name)) { - printk(KERN_WARNING "%sDup name - %s\n", func_nm, name); + printk("%sDup name - %s\n", func_nm, name); break; } searchp = searchp->c_nextp; } while (searchp != &cache_cache); + + /* There is no reason to lock our new cache before we + * link it in - no one knows about it yet... 
+ */ cachep->c_nextp = cache_cache.c_nextp; cache_cache.c_nextp = cachep; - return cachep; + up(&cache_chain_sem); opps: - printk(KERN_WARNING "%sOut of mem creating cache %s\n", func_nm, name); - return NULL; -} - -/* Destroy all the objs in a slab, and release the mem back to the system. - * Before calling the slab must have been unlinked - */ -static void -kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp, unsigned long flags) -{ - if (cachep->c_dtor || SLAB_BUFCTL(cachep->c_flags)) { - kmem_bufctl_t *bufp = slabp->s_freep; - - /* for each obj in slab... */ - while (bufp) { - kmem_bufctl_t *freep; - if (cachep->c_dtor) { - void *objp = ((void*)bufp)-cachep->c_offset; - if (SLAB_BUFCTL(cachep->c_flags)) - objp = bufp->buf_objp; - (cachep->c_dtor)(objp, cachep->c_org_size, flags); - } - freep = bufp; - bufp = bufp->buf_nextp; - if (SLAB_BUFCTL(cachep->c_flags)) - kmem_cache_free(&cache_bufctl, freep); - } - } - - slabp->s_magic = SLAB_MAGIC_UNALLOC; - kmem_freepages(cachep, slabp->s_mem); - if (SLAB_OFF_SLAB(cachep->c_flags)) - kmem_cache_free(&cache_slab, slabp); -} - -/* Destroy (remove) a cache. - * All objs in the cache should be inactive - */ -int -kmem_cache_destroy(kmem_cache_t *cachep) -{ - kmem_cache_t **searchp; - kmem_slab_t *slabp; - unsigned long save_flags; - -#if defined(SLAB_MGMT_CHECKS) - if (!cachep) { - printk(KERN_ERR "kmem_dest: NULL ptr\n"); - goto err_end; - } - - if (in_interrupt()) { - printk(KERN_ERR "kmem_dest: Called during int - %s\n", cachep->c_name); -err_end: - return 1; - } -#endif /* SLAB_MGMT_CHECKS */ - - /* unlink the cache from the chain of active caches. - * Note: the chain is never modified during an int - */ - searchp = &(cache_cache.c_nextp); - for (;*searchp != &cache_cache; searchp = &((*searchp)->c_nextp)) { - if (*searchp != cachep) - continue; - goto good_cache; - } - printk(KERN_ERR "kmem_dest: Invalid cache addr %p\n", cachep); - return 1; -good_cache: - /* disable cache so attempts to allocated from an int can - * be caught. - */ - save_flags(save_flags); - cli(); - if (cachep->c_freep != kmem_slab_end(cachep)) { - restore_flags(save_flags); - printk(KERN_ERR "kmem_dest: active cache - %s\n", cachep->c_name); - return 2; - } - *searchp = cachep->c_nextp; /* remove from cache chain */ - cachep->c_flags |= SLAB_CFLGS_RELEASED; - cachep->c_freep = kmem_slab_end(cachep); - if (cachep == clock_searchp) - clock_searchp = cachep->c_nextp; - restore_flags(save_flags); - - while ((slabp = cachep->c_firstp) != kmem_slab_end(cachep)) { - kmem_slab_unlink(slabp); - kmem_slab_destroy(cachep, slabp, 0); - } - - if (SLAB_BUFCTL(cachep->c_flags)) - kmem_cache_free(&cache_hash, cachep->c_hashp); - kmem_cache_free(&cache_cache, cachep); - return 0; + return cachep; } -/* Shrink a cache, ie. remove _all_ inactive slabs. - * Can be called when a user of a cache knows they are not going to be - * needing any new objs for a while. - * NOTE: This func is probably going to disappear - let me know if you - * are using it! +/* Shrink a cache. Releases as many slabs as possible for a cache. + * It is expected this function will be called by a module when it is + * unloaded. The cache is _not_ removed, this creates too many problems and + * the cache-structure does not take up much room. A module should keep its + * cache pointer(s) in unloaded memory, so when reloaded it knows the cache + * is available. To help debugging, a zero exit status indicates all slabs + * were released. 
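A minimal usage sketch of the interface as it now stands; note the constructor/destructor prototype takes the kmem_cache_t rather than the raw object size. The cache name, struct and helpers below are purely illustrative, not part of the patch:

#include <linux/kernel.h>
#include <linux/slab.h>

struct foo {
	int state;
};

static kmem_cache_t *foo_cachep;

static void foo_ctor(void *objp, kmem_cache_t *cachep, unsigned long flags)
{
	struct foo *fp = objp;

	/* Must be threaded, and must not allocate from foo_cachep itself. */
	fp->state = 0;
}

static int foo_init(void)
{
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN, foo_ctor, NULL);
	return foo_cachep ? 0 : 1;
}

static void foo_use(void)
{
	struct foo *fp = kmem_cache_alloc(foo_cachep, SLAB_KERNEL);

	if (fp)
		kmem_cache_free(foo_cachep, fp);
}

static void foo_exit(void)
{
	/* Caches cannot be destroyed; an unloading module only empties its cache. */
	if (kmem_cache_shrink(foo_cachep))
		printk(KERN_ERR "foo: cache could not be fully shrunk\n");
}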
*/ int -kmem_cache_shrink(kmem_cache_t *cachep, int wait) +kmem_cache_shrink(kmem_cache_t *cachep) { + kmem_cache_t *searchp; kmem_slab_t *slabp; - unsigned long dtor_flags; - unsigned long save_flags, num_freed=0; + int ret; -#if defined(SLAB_MGMT_CHECKS) if (!cachep) { printk(KERN_ERR "kmem_shrink: NULL ptr\n"); - goto end; + return 2; } - if (in_interrupt()) { printk(KERN_ERR "kmem_shrink: Called during int - %s\n", cachep->c_name); - goto end; + return 2; } -#endif /* SLAB_MGMT_CHECKS */ - dtor_flags = 0; - if (!wait) /* not allowed to wait */ - dtor_flags = SLAB_DTOR_ATOMIC; + /* Find the cache in the chain of caches. */ + down(&cache_chain_sem); /* Semaphore is needed. */ + searchp = &cache_cache; + for (;searchp->c_nextp != &cache_cache; searchp = searchp->c_nextp) { + if (searchp->c_nextp != cachep) + continue; - save_flags(save_flags); - while (0) { - cli(); - slabp = cachep->c_lastp; - if (slabp == kmem_slab_end(cachep) || slabp->s_inuse) { - restore_flags(save_flags); - goto end; - } - kmem_slab_unlink(slabp); - if (cachep->c_freep == slabp) - cachep->c_freep = kmem_slab_end(cachep); - restore_flags(save_flags); - num_freed++; - kmem_slab_destroy(cachep, slabp, dtor_flags); + /* Accessing clock_searchp is safe - we hold the mutex. */ + if (cachep == clock_searchp) + clock_searchp = cachep->c_nextp; + goto found; } -end: - return num_freed; -} - -/* Search for a slab whose objs are suitable for DMA. - * Note: since testing the first free slab (in __kmem_cache_alloc()), - * ints must not have been enabled! - */ -static inline kmem_slab_t * -kmem_cache_search_dma(kmem_cache_t *cachep) -{ - kmem_slab_t *slabp = cachep->c_freep->s_nextp; + up(&cache_chain_sem); + printk(KERN_ERR "kmem_shrink: Invalid cache addr %p\n", cachep); + return 2; +found: + /* Relase the sempahore before getting the cache-lock. This could + * mean multiple engines are shrinking the cache, but so what... + */ + up(&cache_chain_sem); + spin_lock_irq(&cachep->c_spinlock); - for (; slabp != kmem_slab_end(cachep); slabp = slabp->s_nextp) { - if (!(slabp->s_flags & SLAB_SFLGS_DMA)) - continue; + /* If the cache is growing, stop shrinking. */ + while (!cachep->c_growing) { + slabp = cachep->c_lastp; + if (slabp->s_inuse || slabp == kmem_slab_end(cachep)) + break; kmem_slab_unlink(slabp); - kmem_slab_link_free(cachep, slabp); - return slabp; + spin_unlock_irq(&cachep->c_spinlock); + kmem_slab_destroy(cachep, slabp); + spin_lock_irq(&cachep->c_spinlock); } - return NULL; + ret = 1; + if (cachep->c_lastp == kmem_slab_end(cachep)) + ret--; /* Cache is empty. */ + spin_unlock_irq(&cachep->c_spinlock); + return ret; } -/* get the mem for a slab mgmt obj */ +/* Get the mem for a slab mgmt obj. */ static inline kmem_slab_t * -kmem_cache_slabmgmt(kmem_cache_t *cachep, void *objp, unsigned long local_flags, unsigned long offset) +kmem_cache_slabmgmt(kmem_cache_t *cachep, void *objp, int local_flags) { kmem_slab_t *slabp; if (SLAB_OFF_SLAB(cachep->c_flags)) { - /* slab mngmnt obj is off-slab */ - if (!(slabp = kmem_cache_alloc(&cache_slab, local_flags))) - return NULL; + /* Slab mgmt obj is off-slab. */ + slabp = kmem_cache_alloc(cache_slabp, local_flags); } else { - /* slab mngmnt at end of slab mem */ - slabp = objp + (PAGE_SIZE << cachep->c_gfporder); - slabp--; - if (!SLAB_PTR_IN_OBJ(cachep->c_flags)) { - /* A bit of extra help for the L1 cache; try to position the slab - * mgmnt struct at different offsets within the gap at the end - * of a slab. 
This helps avoid thrashing the h/w cache lines, - * that map to the end of a page, too much... - */ - unsigned long gap = cachep->c_offset; - if (!SLAB_BUFCTL(cachep->c_flags)) - gap += kmem_bufctl_short_size; - gap = (PAGE_SIZE << cachep->c_gfporder)-((gap*cachep->c_num)+offset+sizeof(*slabp)); - gap /= (sizeof(*slabp)/2); - gap *= (sizeof(*slabp)/2); - slabp = (((void*)slabp)-gap); - } + /* Slab mgmnt at end of slab mem, placed so that + * the position is 'coloured'. + */ + void *end; + end = objp + (cachep->c_num * cachep->c_offset); + if (!SLAB_BUFCTL(cachep->c_flags)) + end += (cachep->c_num * sizeof(kmem_bufctl_t)); + slabp = (kmem_slab_t *) L1_CACHE_ALIGN((unsigned long)end); } - slabp->s_flags = slabp->s_inuse = slabp->s_jiffies = 0; + if (slabp) { + slabp->s_inuse = 0; + slabp->s_dma = 0; + slabp->s_index = NULL; + } return slabp; } -static inline int -kmem_cache_init_objs(kmem_cache_t *cachep, kmem_slab_t *slabp, void *objp, - unsigned long local_flags, unsigned long ctor_flags) +static inline void +kmem_cache_init_objs(kmem_cache_t * cachep, kmem_slab_t * slabp, void *objp, + unsigned long ctor_flags) { kmem_bufctl_t **bufpp = &slabp->s_freep; - unsigned long num = cachep->c_num; + unsigned long num = cachep->c_num-1; do { - if (SLAB_BUFCTL(cachep->c_flags)) { - if (!(*bufpp = kmem_cache_alloc(&cache_bufctl, local_flags))) { - kmem_slab_destroy(cachep, slabp, 0); - return 1; - } - (*bufpp)->buf_objp = objp; - (*bufpp)->buf_hashp = &cachep->c_hashp[kmem_hash(cachep, objp)]; +#if SLAB_DEBUG_SUPPORT + if (cachep->c_flags & SLAB_RED_ZONE) { + *((unsigned long*)(objp)) = SLAB_RED_MAGIC1; + objp += BYTES_PER_WORD; + *((unsigned long*)(objp+cachep->c_org_size)) = SLAB_RED_MAGIC1; } +#endif /* SLAB_DEBUG_SUPPORT */ + /* Constructors are not allowed to allocate memory from the same cache + * which they are a constructor for. Otherwise, deadlock. + * They must also be threaded. + */ if (cachep->c_ctor) - cachep->c_ctor(objp, cachep->c_org_size, ctor_flags); + cachep->c_ctor(objp, cachep, ctor_flags); +#if SLAB_DEBUG_SUPPORT + else if (cachep->c_flags & SLAB_POISION) { + /* need to poision the objs */ + kmem_poision_obj(cachep, objp); + } -#if defined(SLAB_DEBUG_SUPPORT) - if (cachep->c_flags & SLAB_RED_ZONE) - *((unsigned long*)(objp+cachep->c_org_size)) = SLAB_RED_MAGIC1; + if (cachep->c_flags & SLAB_RED_ZONE) { + if (*((unsigned long*)(objp+cachep->c_org_size)) != + SLAB_RED_MAGIC1) { + *((unsigned long*)(objp+cachep->c_org_size)) = + SLAB_RED_MAGIC1; + printk(KERN_ERR "kmem_init_obj: Bad rear redzone " + "after constructor - %s\n", cachep->c_name); + } + objp -= BYTES_PER_WORD; + if (*((unsigned long*)(objp)) != SLAB_RED_MAGIC1) { + *((unsigned long*)(objp)) = SLAB_RED_MAGIC1; + printk(KERN_ERR "kmem_init_obj: Bad front redzone " + "after constructor - %s\n", cachep->c_name); + } + } #endif /* SLAB_DEBUG_SUPPORT */ objp += cachep->c_offset; - if (!SLAB_BUFCTL(cachep->c_flags)) { + if (!slabp->s_index) { *bufpp = objp; - objp += kmem_bufctl_short_size; - } - if (!SLAB_PTR_IN_OBJ(cachep->c_flags)) - (*bufpp)->buf_slabp = slabp; + objp += sizeof(kmem_bufctl_t); + } else + *bufpp = &slabp->s_index[num]; bufpp = &(*bufpp)->buf_nextp; - } while (--num); + } while (num--); + *bufpp = NULL; - return 0; } -/* Grow (by 1) the number of slabs within a cache. - * This is called by kmem_cache_alloc() when there are no - * inactive objs left in a cache +/* Grow (by 1) the number of slabs within a cache. This is called by + * kmem_cache_alloc() when there are no active objs left in a cache. 
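With SLAB_DEBUG_SUPPORT and SLAB_RED_ZONE, kmem_cache_init_objs() above brackets every object with a magic word on either side and the allocator hands out the address just past the front word. A sketch of the layout and the check (illustrative, written as if inside mm/slab.c):

/*
 * Red-zoned object layout:
 *
 *   | red-zone word | client object (c_org_size bytes) | red-zone word |
 *                   ^ pointer returned to the caller
 *
 * SLAB_RED_MAGIC1 marks an inactive obj, SLAB_RED_MAGIC2 an active one;
 * alloc and free flip the words and complain when the expected value is
 * missing.
 */
static int example_obj_is_inactive(kmem_cache_t *cachep, void *objp)
{
	unsigned long *front = (unsigned long *)(objp - BYTES_PER_WORD);
	unsigned long *rear  = (unsigned long *)(objp + cachep->c_org_size);

	return *front == SLAB_RED_MAGIC1 && *rear == SLAB_RED_MAGIC1;
}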
*/ -static void -kmem_cache_grow(kmem_cache_t *cachep, unsigned long flags) +static int +kmem_cache_grow(kmem_cache_t * cachep, int flags) { kmem_slab_t *slabp; + struct page *page; void *objp; - unsigned int offset, dma; - unsigned long ctor_flags, local_flags, save_flags; + size_t offset; + unsigned int dma, local_flags; + unsigned long ctor_flags; + unsigned long save_flags; + + /* Be lazy and only check for valid flags here, + * keeping it out of the critical path in kmem_cache_alloc(). + */ + if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) { + printk(KERN_WARNING "kmem_grow: Illegal flgs %X (correcting) - %s\n", + flags, cachep->c_name); + flags &= (SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW); + } if (flags & SLAB_NO_GROW) - return; /* caller doesn't want us to grow */ + return 0; - save_flags(save_flags); /* The test for missing atomic flag is performed here, rather than * the more obvious place, simply to reduce the critical path length - * in kmem_cache_alloc(). If a caller is slightly mis-behaving, - * will eventually be caught here (where it matters) + * in kmem_cache_alloc(). If a caller is slightly mis-behaving they + * will eventually be caught here (where it matters). */ if (in_interrupt() && (flags & SLAB_LEVEL_MASK) != SLAB_ATOMIC) { - static int count = 0; - if (count < 8) { - printk(KERN_ERR "kmem_grow: Called nonatomically from " - "int - %s\n", cachep->c_name); - count++; - } + printk(KERN_ERR "kmem_grow: Called nonatomically from int - %s\n", + cachep->c_name); flags &= ~SLAB_LEVEL_MASK; flags |= SLAB_ATOMIC; } - local_flags = (flags & SLAB_LEVEL_MASK); ctor_flags = SLAB_CTOR_CONSTRUCTOR; - if ((flags & SLAB_LEVEL_MASK) == SLAB_ATOMIC) { - /* Not allowed to sleep. - * Need to tell a constructor about this - it - * might need to know.... + local_flags = (flags & SLAB_LEVEL_MASK); + if (local_flags == SLAB_ATOMIC) { + /* Not allowed to sleep. Need to tell a constructor about + * this - it might need to know... */ ctor_flags |= SLAB_CTOR_ATOMIC; } - slabp = NULL; - /* get mem for the objs */ - if (!(objp = kmem_getpages(cachep, flags, &dma))) - goto opps1; + /* About to mess with non-constant members - lock. */ + spin_lock_irqsave(&cachep->c_spinlock, save_flags); - /* get colour for the slab, and cal the next value */ - cli(); - if (!(offset = --(cachep->c_colour_next))) + /* Get colour for the slab, and cal the next value. */ + if (!(offset = cachep->c_colour_next--)) cachep->c_colour_next = cachep->c_colour; - restore_flags(save_flags); offset *= cachep->c_align; + cachep->c_dflags = SLAB_CFLGS_GROWN; + + cachep->c_growing++; +re_try: + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + + /* A series of memory allocations for a new slab. + * Neither the cache-chain semaphore, or cache-lock, are + * held, but the incrementing c_growing prevents this + * this cache from being reaped or shrunk. + * Note: The cache could be selected in for reaping in + * kmem_cache_reap(), but when the final test is made the + * growing value will be seen. + */ + + /* Get mem for the objs. */ + if (!(objp = kmem_getpages(cachep, flags, &dma))) + goto failed; - /* get slab mgmt */ - if (!(slabp = kmem_cache_slabmgmt(cachep, objp, local_flags, offset))) - goto opps2; + /* Get slab mgmt. 
*/ + if (!(slabp = kmem_cache_slabmgmt(cachep, objp+offset, local_flags))) + goto opps1; if (dma) - slabp->s_flags = SLAB_SFLGS_DMA; - + slabp->s_dma = 1; + if (SLAB_BUFCTL(cachep->c_flags)) { + slabp->s_index = kmem_cache_alloc(cachep->c_index_cachep, local_flags); + if (!slabp->s_index) + goto opps2; + } + + /* Nasty!!!!!! I hope this is OK. */ + dma = 1 << cachep->c_gfporder; + page = &mem_map[MAP_NR(objp)]; + do { + SLAB_SET_PAGE_CACHE(page, cachep); + SLAB_SET_PAGE_SLAB(page, slabp); + PageSetSlab(page); + page++; + } while (--dma); + + slabp->s_offset = offset; /* It will fit... */ + objp += offset; /* Address of first object. */ slabp->s_mem = objp; - objp += offset; /* address of first object */ /* For on-slab bufctls, c_offset is the distance between the start of * an obj and its related bufctl. For off-slab bufctls, c_offset is * the distance between objs in the slab. - * Reason for bufctl at end of obj (when on slab), as opposed to the front; - * if stored within the obj (has no state), and the obj is 'used' after being - * freed then (normally) most activity occurs at the beginning of the obj. - * By keeping the bufctl ptr away from the front, should reduce the chance of - * corruption. Also, allows easier alignment of objs onto cache lines when - * bufctl is not stored with the objs. - * Downsize; if, while an obj is active, a write is made past its end, then the - * bufctl will be corrupted :( */ - if (kmem_cache_init_objs(cachep, slabp, objp, local_flags, ctor_flags)) - goto no_objs; + kmem_cache_init_objs(cachep, slabp, objp, ctor_flags); + + spin_lock_irq(&cachep->c_spinlock); - cli(); - /* make slab active */ + /* Make slab active. */ slabp->s_magic = SLAB_MAGIC_ALLOC; kmem_slab_link_end(cachep, slabp); if (cachep->c_freep == kmem_slab_end(cachep)) cachep->c_freep = slabp; - restore_flags(save_flags); - return; -no_objs: - kmem_freepages(cachep, slabp->s_mem); + SLAB_STATS_INC_GROWN(cachep); + cachep->c_failures = 0; + cachep->c_growing--; + + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + return 1; opps2: - kmem_freepages(cachep, objp); + if (SLAB_OFF_SLAB(cachep->c_flags)) + kmem_cache_free(cache_slabp, slabp); opps1: - if (slabp && SLAB_OFF_SLAB(cachep->c_flags)) - kmem_cache_free(&cache_slab, slabp); - /* printk("kmem_alloc: Out of mem - %s\n", cachep->c_name); */ - return; + kmem_freepages(cachep, objp); +failed: + if (local_flags != SLAB_ATOMIC && cachep->c_gfporder) { + /* For large order (>0) slabs, we try again. + * Needed because the gfp() functions are not good at giving + * out contigious pages unless pushed (but do not push too hard). + */ + spin_lock_irq(&cachep->c_spinlock); + if (cachep->c_failures++ < 4 && cachep->c_freep == kmem_slab_end(cachep)) + goto re_try; + cachep->c_failures = 1; /* Memory is low, don't try as hard next time. */ + cachep->c_growing--; + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + } + return 0; +} + +static void +kmem_report_alloc_err(const char *str, kmem_cache_t * cachep) +{ + if (cachep) + SLAB_STATS_INC_ERR(cachep); /* this is atomic */ + printk(KERN_ERR "kmem_alloc: %s (name=%s)\n", + str, cachep ? cachep->c_name : "unknown"); +} + +static void +kmem_report_free_err(const char *str, void *objp, kmem_cache_t * cachep) +{ + if (cachep) + SLAB_STATS_INC_ERR(cachep); + printk(KERN_ERR "kmem_free: %s (objp=%p, name=%s)\n", + str, objp, cachep ? cachep->c_name : "unknown"); +} + +/* Search for a slab whose objs are suitable for DMA. 
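Callers that need DMA-capable memory simply pass SLAB_DMA; the allocator then prefers slabs whose pages were verified as DMAable (s_dma, set by kmem_getpages()), and otherwise grows a new slab from DMA pages. A caller-side sketch, illustrative only, reusing the hypothetical foo_cachep from the earlier usage sketch:

#include <linux/slab.h>

static kmem_cache_t *foo_cachep;	/* created as in the earlier usage sketch */

static void *foo_alloc_dma_buffer(void)
{
	/* May sleep; from interrupt context SLAB_ATOMIC would be needed instead. */
	return kmem_cache_alloc(foo_cachep, SLAB_KERNEL | SLAB_DMA);
}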
+ * Note: since testing the first free slab (in __kmem_cache_alloc()), + * ints must not have been enabled, or the cache-lock released! + */ +static inline kmem_slab_t * +kmem_cache_search_dma(kmem_cache_t * cachep) +{ + kmem_slab_t *slabp = cachep->c_freep->s_nextp; + + for (; slabp != kmem_slab_end(cachep); slabp = slabp->s_nextp) { + if (!(slabp->s_dma)) + continue; + kmem_slab_unlink(slabp); + kmem_slab_link_free(cachep, slabp); + cachep->c_freep = slabp; + break; + } + return slabp; } -#if defined(SLAB_DEBUG_SUPPORT) -/* Perform extra freeing checks. - * Currently, this check is only for caches that use bufctl structures - * within the slab. Those which use bufctl's from the internal cache - * have a reasonable check when the address is searched for. +#if SLAB_DEBUG_SUPPORT +/* Perform extra freeing checks. Currently, this check is only for caches + * that use bufctl structures within the slab. Those which use bufctl's + * from the internal cache have a reasonable check when the address is + * searched for. Called with the cache-lock held. */ static void * -kmem_extra_free_checks(const kmem_cache_t *cachep, kmem_bufctl_t *search_bufp, - const kmem_bufctl_t *bufp, void * objp) +kmem_extra_free_checks(kmem_cache_t * cachep, kmem_bufctl_t *search_bufp, + kmem_bufctl_t *bufp, void * objp) { if (SLAB_BUFCTL(cachep->c_flags)) - goto end; + return objp; - /* check slab's freelist to see if this obj is there */ + /* Check slab's freelist to see if this obj is there. */ for (; search_bufp; search_bufp = search_bufp->buf_nextp) { if (search_bufp != bufp) continue; - printk(KERN_ERR "kmem_free: Double free detected during checking " - "%p - %s\n", objp, cachep->c_name); return NULL; } -end: return objp; } #endif /* SLAB_DEBUG_SUPPORT */ +/* Called with cache lock held. */ static inline void kmem_cache_full_free(kmem_cache_t *cachep, kmem_slab_t *slabp) { - if (!slabp->s_nextp->s_inuse) - return; /* at correct position */ - slabp->s_jiffies = jiffies; /* set release time */ - if (cachep->c_freep == slabp) - cachep->c_freep = slabp->s_nextp; - kmem_slab_unlink(slabp); - kmem_slab_link_end(cachep, slabp); - - return; + if (slabp->s_nextp->s_inuse) { + /* Not at correct position. */ + if (cachep->c_freep == slabp) + cachep->c_freep = slabp->s_nextp; + kmem_slab_unlink(slabp); + kmem_slab_link_end(cachep, slabp); + } } +/* Called with cache lock held. */ static inline void kmem_cache_one_free(kmem_cache_t *cachep, kmem_slab_t *slabp) { - if (slabp->s_nextp->s_inuse != cachep->c_num) { - cachep->c_freep = slabp; - return; + if (slabp->s_nextp->s_inuse == cachep->c_num) { + kmem_slab_unlink(slabp); + kmem_slab_link_free(cachep, slabp); } - kmem_slab_unlink(slabp); - kmem_slab_link_free(cachep, slabp); - return; + cachep->c_freep = slabp; } -/* Returns a ptr to an obj in the given cache. - * The obj is in the initial state (if there is one) - */ +/* Returns a ptr to an obj in the given cache. */ static inline void * -__kmem_cache_alloc(kmem_cache_t *cachep, unsigned long flags) +__kmem_cache_alloc(kmem_cache_t *cachep, int flags) { kmem_slab_t *slabp; kmem_bufctl_t *bufp; void *objp; unsigned long save_flags; - /* sanity check */ + /* Sanity check. */ if (!cachep) goto nul_ptr; - save_flags(save_flags); - cli(); - /* get slab alloc is to come from */ + spin_lock_irqsave(&cachep->c_spinlock, save_flags); +try_again: + /* Get slab alloc is to come from. 
*/ slabp = cachep->c_freep; - /* magic is a sanity check _and_ says if we need a new slab */ + /* Magic is a sanity check _and_ says if we need a new slab. */ if (slabp->s_magic != SLAB_MAGIC_ALLOC) goto alloc_new_slab; -try_again: - /* DMA allocations are 'rare' - keep out of critical path */ + /* DMA requests are 'rare' - keep out of the critical path. */ if (flags & SLAB_DMA) goto search_dma; try_again_dma: + SLAB_STATS_INC_ALLOCED(cachep); + SLAB_STATS_INC_ACTIVE(cachep); + SLAB_STATS_SET_HIGH(cachep); slabp->s_inuse++; bufp = slabp->s_freep; slabp->s_freep = bufp->buf_nextp; - if (!SLAB_BUFCTL(cachep->c_flags)) { - /* Nasty - we want the 'if' to be taken in the common case */ - if (slabp->s_freep) { -short_finished: + if (slabp->s_freep) { +ret_obj: + if (!slabp->s_index) { + bufp->buf_slabp = slabp; objp = ((void*)bufp) - cachep->c_offset; - restore_flags(save_flags); -#if defined(SLAB_DEBUG_SUPPORT) +finished: + /* The lock is not needed by the red-zone or poision ops, and the + * obj has been removed from the slab. Should be safe to drop + * the lock here. + */ + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); +#if SLAB_DEBUG_SUPPORT if (cachep->c_flags & SLAB_RED_ZONE) goto red_zone; +ret_red: + if ((cachep->c_flags & SLAB_POISION) && kmem_check_poision_obj(cachep, objp)) + kmem_report_alloc_err("Bad poision", cachep); #endif /* SLAB_DEBUG_SUPPORT */ return objp; - } else { - cachep->c_freep = slabp->s_nextp; - goto short_finished; } + /* Update index ptr. */ + objp = ((bufp-slabp->s_index)*cachep->c_offset) + slabp->s_mem; + bufp->buf_objp = objp; + goto finished; } + cachep->c_freep = slabp->s_nextp; + goto ret_obj; - if (!slabp->s_freep) - cachep->c_freep = slabp->s_nextp; - - /* link into hash chain */ - objp = kmem_add_to_hash(cachep, bufp); - restore_flags(save_flags); -#if defined(SLAB_DEBUG_SUPPORT) - if (!(cachep->c_flags & SLAB_RED_ZONE)) -#endif /* SLAB_DEBUG_SUPPORT */ - return objp; - -#if defined(SLAB_DEBUG_SUPPORT) +#if SLAB_DEBUG_SUPPORT red_zone: - /* set alloc red-zone, and check old one */ + /* Set alloc red-zone, and check old one. */ + if (xchg((unsigned long *)objp, SLAB_RED_MAGIC2) != SLAB_RED_MAGIC1) + kmem_report_alloc_err("Bad front redzone", cachep); + objp += BYTES_PER_WORD; if (xchg((unsigned long *)(objp+cachep->c_org_size), SLAB_RED_MAGIC2) != SLAB_RED_MAGIC1) - printk(KERN_ERR "kmem_alloc: Bad redzone %p - %s\n", - objp, cachep->c_name); - return objp; + kmem_report_alloc_err("Bad rear redzone", cachep); + goto ret_red; #endif /* SLAB_DEBUG_SUPPORT */ search_dma: - if (slabp->s_flags & SLAB_SFLGS_DMA) - goto try_again_dma; - /* need to search... */ - if ((slabp = kmem_cache_search_dma(cachep))) + if (slabp->s_dma || (slabp = kmem_cache_search_dma(cachep))!=kmem_slab_end(cachep)) goto try_again_dma; alloc_new_slab: - /* Either out of slabs, or magic number corruption */ - if (slabp != kmem_slab_end(cachep)) - goto bad_slab; - /* need a new slab */ - restore_flags(save_flags); - if (SLAB_RELEASED(cachep->c_flags)) { - printk(KERN_ERR "kmem_alloc: destroyed cache\n"); - goto end; - } - - /* Be lazy and only check for valid flags - * here (keeping it out of the critical path above) - */ - if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) { - printk(KERN_ERR "kmem_alloc: Illegal flgs %lX (correcting) - %s\n", - flags, cachep->c_name); - flags &= (SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW); + /* Either out of slabs, or magic number corruption. */ + if (slabp == kmem_slab_end(cachep)) { + /* Need a new slab. 
Release the lock before calling kmem_cache_grow(). + * This allows objs to be released back into the cache while growing. + */ + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + if (kmem_cache_grow(cachep, flags)) { + /* Someone may have stolen our objs. Doesn't matter, we'll + * just come back here again. + */ + goto try_again; + } + /* Couldn't grow, but some objs may have been freed. */ + spin_lock_irq(&cachep->c_spinlock); + if (cachep->c_freep != kmem_slab_end(cachep)) + goto try_again; + } else { + /* Very serious error - maybe panic() here? */ + kmem_report_alloc_err("Bad slab magic (corrupt)", cachep); } - - kmem_cache_grow(cachep, flags); - cli(); - if ((slabp=cachep->c_freep) != kmem_slab_end(cachep)) - goto try_again; - restore_flags(save_flags); -end: + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); +err_exit: return NULL; -bad_slab: - /* v. serious error - maybe panic() here? */ - printk(KERN_ERR "kmem_alloc: Bad slab magic (corruption) - %s\n", - cachep->c_name); - goto end; nul_ptr: - printk(KERN_ERR "kmem_alloc: NULL ptr\n"); - goto end; + kmem_report_alloc_err("NULL ptr", NULL); + goto err_exit; } -/* Release an obj back to its cache. - * If the obj has a constructed state, it should be - * in this state _before_ it is released. +/* Release an obj back to its cache. If the obj has a constructed state, + * it should be in this state _before_ it is released. */ static inline void __kmem_cache_free(kmem_cache_t *cachep, void *objp) @@ -1199,128 +1461,137 @@ __kmem_cache_free(kmem_cache_t *cachep, void *objp) kmem_bufctl_t *bufp; unsigned long save_flags; - /* basic sanity checks */ - if (!cachep) - goto nul_cache; - if (!objp) - goto nul_obj; + /* Basic sanity checks. */ + if (!cachep || !objp) + goto null_addr; - save_flags(save_flags); -#if defined(SLAB_DEBUG_SUPPORT) +#if SLAB_DEBUG_SUPPORT + if (cachep->c_flags & SLAB_RED_ZONE) + objp -= BYTES_PER_WORD; +#endif /* SLAB_DEBUG_SUPPORT */ + + +#if SLAB_DEBUG_SUPPORT + /* A verify func is called without the cache-lock held. */ if (cachep->c_flags & SLAB_DEBUG_INITIAL) goto init_state_check; finished_initial: #endif /* SLAB_DEBUG_SUPPORT */ + spin_lock_irqsave(&cachep->c_spinlock, save_flags); + if (SLAB_BUFCTL(cachep->c_flags)) goto bufctl; - bufp = (kmem_bufctl_t *)(objp+cachep->c_offset); - /* get slab for the obj */ - if (SLAB_PTR_IN_OBJ(cachep->c_flags)) { - /* if SLAB_HIGH_PACK is undef, the below is optimised away */ - slabp = (kmem_slab_t *)((((unsigned long)objp)&PAGE_MASK)+PAGE_SIZE); - slabp--; - } else - slabp = (kmem_slab_t *) bufp->buf_slabp; + /* Get slab for the object. */ +#if 0 + /* _NASTY_IF/ELSE_, but avoids a 'distant' memory ref for some objects. + * Is this worth while? XXX + */ + if (cachep->c_flags & SLAB_HIGH_PACK) + slabp = SLAB_GET_PAGE_SLAB(&mem_map[MAP_NR(bufp)]); + else +#endif + slabp = bufp->buf_slabp; - if (slabp->s_magic != SLAB_MAGIC_ALLOC) /* sanity check */ - goto bad_obj; - cli(); +check_magic: + if (slabp->s_magic != SLAB_MAGIC_ALLOC) /* Sanity check. */ + goto bad_slab; -#if defined(SLAB_DEBUG_SUPPORT) - if (cachep->c_flags & (SLAB_DEBUG_FREE|SLAB_RED_ZONE)) +#if SLAB_DEBUG_SUPPORT + if (cachep->c_flags & SLAB_DEBUG_FREE) goto extra_checks; +passed_extra: #endif /* SLAB_DEBUG_SUPPORT */ -passed_extra: - if (!slabp->s_inuse) /* sanity check */ - goto too_many; - bufp->buf_nextp = slabp->s_freep; - slabp->s_freep = bufp; - if (--(slabp->s_inuse)) { - if (bufp->buf_nextp) { - restore_flags(save_flags); - return; + if (slabp->s_inuse) { /* Sanity check. 
*/ + SLAB_STATS_DEC_ACTIVE(cachep); + slabp->s_inuse--; + bufp->buf_nextp = slabp->s_freep; + slabp->s_freep = bufp; + if (slabp->s_inuse) { + if (bufp->buf_nextp) { + /* (hopefully) The most common case. */ +finished: +#if SLAB_DEBUG_SUPPORT + /* Need to poision the obj while holding the lock. */ + if (cachep->c_flags & SLAB_POISION) + kmem_poision_obj(cachep, objp); + if (cachep->c_flags & SLAB_RED_ZONE) + goto red_zone; +return_red: +#endif /* SLAB_DEBUG_SUPPORT */ + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + return; + } + kmem_cache_one_free(cachep, slabp); + goto finished; } - kmem_cache_one_free(cachep, slabp); - restore_flags(save_flags); - return; + kmem_cache_full_free(cachep, slabp); + goto finished; } - kmem_cache_full_free(cachep, slabp); - restore_flags(save_flags); + + /* Don't add to freelist. */ + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + kmem_report_free_err("free with no active objs", objp, cachep); return; bufctl: - /* Off-slab bufctls. Need to search hash for bufctl, and hence the slab. - * No 'extra' checks are performed for objs stored this way, finding - * the obj a check enough + /* No 'extra' checks are performed for objs stored this way, finding + * the obj is check enough. */ - cli(); - if ((bufp = kmem_remove_from_hash(cachep, objp))) { - slabp = (kmem_slab_t *) bufp->buf_slabp; -#if defined(SLAB_DEBUG_SUPPORT) - if (cachep->c_flags & SLAB_RED_ZONE) - goto red_zone; -#endif /* SLAB_DEBUG_SUPPORT */ - goto passed_extra; - } - restore_flags(save_flags); - printk(KERN_ERR "kmem_free: Either bad obj addr or double free: %p - %s\n", - objp, cachep->c_name); + slabp = SLAB_GET_PAGE_SLAB(&mem_map[MAP_NR(objp)]); + bufp = &slabp->s_index[(objp - slabp->s_mem)/cachep->c_offset]; + if (bufp->buf_objp == objp) + goto check_magic; + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + kmem_report_free_err("Either bad obj addr or double free", objp, cachep); return; -#if defined(SLAB_DEBUG_SUPPORT) -red_zone: - if (xchg((unsigned long *)(objp+cachep->c_org_size), SLAB_RED_MAGIC1) != SLAB_RED_MAGIC2) { - /* Either write past end of the object, or a double free */ - printk(KERN_ERR "kmem_free: Bad redzone %p - %s\n", - objp, cachep->c_name); - } - goto passed_extra; +#if SLAB_DEBUG_SUPPORT init_state_check: - /* Need to call the slab's constructor so that - * the caller can perform a verify of its state (debugging) + /* Need to call the slab's constructor so the + * caller can perform a verify of its state (debugging). */ - cachep->c_ctor(objp, cachep->c_org_size, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); + cachep->c_ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); goto finished_initial; extra_checks: - if ((cachep->c_flags & SLAB_DEBUG_FREE) && - (objp != kmem_extra_free_checks(cachep, slabp->s_freep, bufp, objp))) { - restore_flags(save_flags); + if (!kmem_extra_free_checks(cachep, slabp->s_freep, bufp, objp)) { + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + kmem_report_free_err("Double free detected during checks", objp, cachep); return; } - if (cachep->c_flags & SLAB_RED_ZONE) - goto red_zone; goto passed_extra; -#endif /* SLAB_DEBUG_SUPPORT */ -bad_obj: - /* The addr of the slab doesn't contain the correct - * magic num +red_zone: + /* We hold the cache-lock while checking the red-zone, just incase + * some tries to take this obj from us... 
*/ - if (slabp->s_magic == SLAB_MAGIC_UNALLOC) { - /* magic num says this is an unalloc slab */ - printk(KERN_ERR "kmem_free: obj %p from destroyed slab - %s\n", - objp, cachep->c_name); - return; + if (xchg((unsigned long *)objp, SLAB_RED_MAGIC1) != SLAB_RED_MAGIC2) { + /* Either write before start of obj, or a double free. */ + kmem_report_free_err("Bad front redzone", objp, cachep); } - printk(KERN_ERR "kmem_free: Bad obj %p - %s\n", objp, cachep->c_name); - return; -too_many: - /* don't add to freelist */ - restore_flags(save_flags); - printk(KERN_ERR "kmem_free: obj free for slab with no active objs - %s\n", - cachep->c_name); - return; -nul_obj: - printk(KERN_ERR "kmem_free: NULL obj - %s\n", cachep->c_name); + objp += BYTES_PER_WORD; + if (xchg((unsigned long *)(objp+cachep->c_org_size), SLAB_RED_MAGIC1) != SLAB_RED_MAGIC2) { + /* Either write past end of obj, or a double free. */ + kmem_report_free_err("Bad rear redzone", objp, cachep); + } + goto return_red; +#endif /* SLAB_DEBUG_SUPPORT */ +bad_slab: + /* Slab doesn't contain the correct magic num. */ + if (slabp->s_magic == SLAB_MAGIC_DESTROYED) { + /* Magic num says this is a destroyed slab. */ + kmem_report_free_err("free from inactive slab", objp, cachep); + } else + kmem_report_free_err("Bad obj addr", objp, cachep); + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); return; -nul_cache: - printk(KERN_ERR "kmem_free: NULL cache ptr\n"); +null_addr: + kmem_report_free_err("NULL ptr", objp, cachep); return; } void * -kmem_cache_alloc(kmem_cache_t *cachep, unsigned long flags) +kmem_cache_alloc(kmem_cache_t *cachep, int flags) { return __kmem_cache_alloc(cachep, flags); } @@ -1332,163 +1603,249 @@ kmem_cache_free(kmem_cache_t *cachep, void *objp) } void * -kmem_alloc(unsigned long size, unsigned long flags) +kmalloc(size_t size, int flags) { - cache_sizes_t *cachep = cache_sizes; + cache_sizes_t *csizep = cache_sizes; - for (; cachep->cs_size; cachep++) { - if (size > cachep->cs_size) + for (; csizep->cs_size; csizep++) { + if (size > csizep->cs_size) continue; - /* should the inline version be used here? */ - return kmem_cache_alloc(cachep->cs_cachep, flags); + return __kmem_cache_alloc(csizep->cs_cachep, flags); } - printk(KERN_ERR "kmem_alloc: Size (%lu) too large\n", size); + printk(KERN_ERR "kmalloc: Size (%lu) too large\n", (unsigned long) size); return NULL; } void -kmem_free(void *objp, unsigned long size) +kfree(void *objp) { - cache_sizes_t *cachep = cache_sizes; + struct page *page; + int nr; - for (; cachep->cs_size; cachep++) { - if (size > cachep->cs_size) - continue; - /* should the inline version be used here? */ - kmem_cache_free(cachep->cs_cachep, objp); - return; + if (!objp) + goto null_ptr; + nr = MAP_NR(objp); + if (nr >= max_mapnr) + goto null_ptr; + + /* Assume we own the page structure - hence no locking. + * If someone is misbehaving (eg. someone calling us with a bad + * address), then access to the page structure can race with the + * kmem_slab_destory() code. Need to add a spin_lock to each page + * structure, which would be useful in threading the gfp() functions.... + */ + page = &mem_map[nr]; + if (PageSlab(page)) { + kmem_cache_t *cachep; + + /* Here, we (again) assume the obj address is good. + * If it isn't, and happens to map onto another + * general-cache page which has no active objs, then + * we race.... 
+ */ + cachep = SLAB_GET_PAGE_CACHE(page); + if (cachep && (cachep->c_flags & SLAB_CFLGS_GENERAL)) { + __kmem_cache_free(cachep, objp); + return; + } + } +null_ptr: + printk(KERN_ERR "kfree: Bad obj %p\n", objp); +while(1); + return; +} + +void +kfree_s(void *objp, size_t size) +{ + struct page *page; + int nr; + + if (!objp) + goto null_ptr; + nr = MAP_NR(objp); + if (nr >= max_mapnr) + goto null_ptr; + /* See comment in kfree() */ + page = &mem_map[nr]; + if (PageSlab(page)) { + kmem_cache_t *cachep; + /* See comment in kfree() */ + cachep = SLAB_GET_PAGE_CACHE(page); + if (cachep && cachep->c_flags & SLAB_CFLGS_GENERAL) { + if (size <= cachep->c_org_size) { /* XXX better check */ + __kmem_cache_free(cachep, objp); + return; + } + } } - printk(KERN_ERR "kmem_free: Size (%lu) too large - strange\n", size); +null_ptr: + printk(KERN_ERR "kfree_s: Bad obj %p\n", objp); + return; } +kmem_cache_t * +kmem_find_general_cachep(size_t size) +{ + cache_sizes_t *csizep = cache_sizes; + + /* This function could be moved to the header-file, and + * made inline so consumers can quickly determine what + * cache-ptr they require. + */ + for (; csizep->cs_size; csizep++) { + if (size > csizep->cs_size) + continue; + break; + } + return csizep->cs_cachep; +} /* Called from try_to_free_page(). - * Ideal solution would have a weight for each cache, based on; - * o num of fully free slabs - * o if the objs have a constructor/deconstructor - * o length of time slabs have been fully free (ie. ageing) * This function _cannot_ be called within a int, but it * can be interrupted. */ int kmem_cache_reap(int pri, int dma, int wait) { - unsigned long dtor_flags = 0; - unsigned long best_jiffie; - unsigned long now; - int count = 8; - kmem_slab_t *best_slabp = NULL; - kmem_cache_t *best_cachep = NULL; kmem_slab_t *slabp; kmem_cache_t *searchp; - unsigned long save_flags; + kmem_cache_t *best_cachep; + unsigned long scan; + unsigned long reap_level; - /* 'pri' maps to the number of caches to examine, not the number of slabs. - * This avoids only checking the jiffies for slabs in one cache at the - * expensive spending more cycles + if (in_interrupt()) { + printk("kmem_cache_reap() called within int!\n"); + return 0; + } + scan = 9-pri; + reap_level = pri >> 1; + + /* We really need a test semphore op so we can avoid sleeping when + * !wait is true. */ - pri = (9 - pri); - if (!wait) /* not allowed to wait */ - dtor_flags = SLAB_DTOR_ATOMIC; + down(&cache_chain_sem); + best_cachep = NULL; searchp = clock_searchp; - save_flags(save_flags); - now = jiffies; - best_jiffie = now - (2*HZ); /* 2secs - avoid heavy thrashing */ - while (pri--) { - kmem_slab_t *local_slabp; - unsigned long local_jiffie; - if (searchp == &cache_cache) + do { + unsigned long full_free; + /* It's safe to test this without holding the cache-lock. */ + if (searchp->c_flags & SLAB_NO_REAP) goto next; - - /* sanity check for corruption */ + spin_lock_irq(&searchp->c_spinlock); + if (searchp->c_growing) + goto next_unlock; + if (searchp->c_dflags & SLAB_CFLGS_GROWN) { + searchp->c_dflags &= ~SLAB_CFLGS_GROWN; + goto next_unlock; + } + /* Sanity check for corruption of static values. 
*/ if (searchp->c_inuse || searchp->c_magic != SLAB_C_MAGIC) { - printk(KERN_ERR "kmem_reap: Corrupted cache struct for %s\n", - searchp->c_name); + spin_unlock_irq(&searchp->c_spinlock); + printk(KERN_ERR "kmem_reap: Corrupted cache struct for %s\n", searchp->c_name); goto next; } + full_free = 0; - local_slabp = NULL; - local_jiffie = now - (2*HZ); - cli(); - /* As the fully free slabs, within a cache, have no particular - * order, we need to test them all. Infact, we only check 'count' - * slabs. + /* Count num of fully free slabs. Hopefully there are not many, + * we are holding the cache lock.... */ slabp = searchp->c_lastp; - for (;count && slabp != kmem_slab_end(searchp) && !slabp->s_inuse; slabp = slabp->s_prevp, count--) { - if (slabp->s_jiffies >= local_jiffie) - continue; + while (!slabp->s_inuse && slabp != kmem_slab_end(searchp)) { + slabp = slabp->s_prevp; + full_free++; + } + spin_unlock_irq(&searchp->c_spinlock); - /* weight caches with a con/decon */ - if ((searchp->c_ctor || searchp->c_dtor) && slabp->s_jiffies >= (local_jiffie - (2*HZ))) - continue; + if (full_free) { + if (full_free >= 10) { + best_cachep = searchp; + break; + } - /* weight caches with high page orders. Avoids stressing the - * VM sub-system by reducing the frequency requests for a large - * num of contigious pages + /* Try to avoid slabs with constructors and/or + * more than one page per slab (as it can be difficult + * to get high orders from gfp()). */ - if (searchp->c_gfporder > 1 && slabp->s_jiffies >= (local_jiffie - (4*HZ))) - continue; - - local_jiffie = slabp->s_jiffies; - local_slabp = slabp; - if (!searchp->c_gfporder && (now-local_jiffie) >= (300*HZ)) { - /* an old, one page slab. Make a quick get away... */ - pri = 0; - break; + if (pri == 6) { /* magic '6' from try_to_free_page() */ + if (searchp->c_ctor) + full_free--; + if (full_free && searchp->c_gfporder) + full_free--; } - } - if (local_slabp) { - if (!count || local_jiffie < best_jiffie) { - best_slabp = local_slabp; - best_jiffie = local_jiffie; + if (full_free >= reap_level) { + reap_level = full_free; best_cachep = searchp; - if (!count) - break; } } - restore_flags(save_flags); + goto next; +next_unlock: + spin_unlock_irq(&searchp->c_spinlock); next: searchp = searchp->c_nextp; - if (searchp == clock_searchp) - break; - count = 8; /* # of slabs at which we force a reap */ - } - - /* only move along with we didn't find an over allocated cache */ - if (count) - clock_searchp = clock_searchp->c_nextp; + } while (--scan && searchp != clock_searchp); - if (!best_slabp) - return 0; + clock_searchp = searchp; + up(&cache_chain_sem); - cli(); - if (best_slabp->s_inuse) { - /* an object in our selected slab has been - * allocated. This souldn't happen v. often, so we - * simply fail - which isn't ideal but will do. - * NOTE: No test for the case where an obj has been - * allocated from the slab, and then freed. While - * this would change our idea of the best slab to - * reap, it's not worth the re-calculation effort. 
- */
-	restore_flags(save_flags);
+	if (!best_cachep) {
+		/* couldn't find anything to reap */
 		return 0;
 	}
-	if (best_cachep->c_freep == best_slabp)
-		best_cachep->c_freep = best_slabp->s_nextp;
-	kmem_slab_unlink(best_slabp);
+	spin_lock_irq(&best_cachep->c_spinlock);
+	if (!best_cachep->c_growing && !(slabp = best_cachep->c_lastp)->s_inuse && slabp != kmem_slab_end(best_cachep)) {
+		if (slabp == best_cachep->c_freep)
+			best_cachep->c_freep = kmem_slab_end(best_cachep);
+		kmem_slab_unlink(slabp);
+		SLAB_STATS_INC_REAPED(best_cachep);
-	restore_flags(save_flags);
-	kmem_slab_destroy(best_cachep, best_slabp, dtor_flags);
+		/* Safe to drop the lock. The slab is no longer linked to the
+		 * cache.
+		 */
+		spin_unlock_irq(&best_cachep->c_spinlock);
+		kmem_slab_destroy(best_cachep, slabp);
+		return 1;
+	}
+	spin_unlock_irq(&best_cachep->c_spinlock);
+	return 0;
+}
-	return 1;
+#if SLAB_SELFTEST
+/* A few very simple tests */
+static void
+kmem_self_test(void)
+{
+	kmem_cache_t *test_cachep;
+
+	printk(KERN_INFO "kmem_test() - start\n");
+	test_cachep = kmem_cache_create("test-cachep", 16, 0, SLAB_RED_ZONE|SLAB_POISION, NULL, NULL);
+	if (test_cachep) {
+		char *objp = kmem_cache_alloc(test_cachep, SLAB_KERNEL);
+		if (objp) {
+			/* Write in front and past end, red-zone test. */
+			*(objp-1) = 1;
+			*(objp+16) = 1;
+			kmem_cache_free(test_cachep, objp);
+
+			/* Mess up poisoning. */
+			*objp = 10;
+			objp = kmem_cache_alloc(test_cachep, SLAB_KERNEL);
+			kmem_cache_free(test_cachep, objp);
+
+			/* Mess up poisoning (again). */
+			*objp = 10;
+			kmem_cache_shrink(test_cachep);
+		}
+	}
+	printk(KERN_INFO "kmem_test() - finished\n");
+}
+#endif	/* SLAB_SELFTEST */
+#if defined(CONFIG_PROC_FS)
 /* /proc/slabinfo
- *	cache-name num-active-objs total-objs num-active-slabs total-slabs num-pages-per-slab
+ *	cache-name num-active-objs total-objs num-active-slabs total-slabs num-pages-per-slab
  */
 int
 get_slabinfo(char *buf)
@@ -1496,31 +1853,62 @@ get_slabinfo(char *buf)
 	kmem_cache_t	*cachep;
 	kmem_slab_t	*slabp;
 	unsigned long	active_objs;
-	unsigned long	num_slabs, active_slabs;
 	unsigned long	save_flags;
+	unsigned long	num_slabs;
+	unsigned long	num_objs;
 	int		len=0;
+#if SLAB_STATS
+	unsigned long	active_slabs;
+#endif	/* SLAB_STATS */
-	/* output format version, so at least we can change it without _too_
-	 * many complaints
+	__save_flags(save_flags);
+
+	/* Output format version, so at least we can change it without _too_
+	 * many complaints.
*/ +#if SLAB_STATS + len = sprintf(buf, "slabinfo - version: 1.0 (statistics)\n"); +#else len = sprintf(buf, "slabinfo - version: 1.0\n"); - save_flags(save_flags); +#endif /* SLAB_STATS */ + down(&cache_chain_sem); cachep = &cache_cache; do { - active_slabs = num_slabs = active_objs = 0; - cli(); - for (slabp = cachep->c_firstp; - slabp != kmem_slab_end(cachep); - slabp = slabp->s_nextp) { - num_slabs++; +#if SLAB_STATS + active_slabs = 0; +#endif /* SLAB_STATS */ + num_slabs = active_objs = 0; + spin_lock_irq(&cachep->c_spinlock); + for (slabp = cachep->c_firstp; slabp != kmem_slab_end(cachep); slabp = slabp->s_nextp) { active_objs += slabp->s_inuse; + num_slabs++; +#if SLAB_STATS if (slabp->s_inuse) active_slabs++; +#endif /* SLAB_STATS */ } - restore_flags(save_flags); - len += sprintf(buf+len, "%-20s%lu %lu %lu %lu %d\n", cachep->c_name, - active_objs, cachep->c_num*num_slabs, - active_slabs, num_slabs, 1<<cachep->c_gfporder); + num_objs = cachep->c_num*num_slabs; +#if SLAB_STATS + { + unsigned long errors; + unsigned long high = cachep->c_high_mark; + unsigned long grown = cachep->c_grown; + unsigned long reaped = cachep->c_reaped; + unsigned long allocs = cachep->c_num_allocations; + errors = (unsigned long) atomic_read(&cachep->c_errors); + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + len += sprintf(buf+len, "%-16s %6lu %6lu %4lu %4lu %4lu %6lu %7lu %5lu %4lu %4lu\n", + cachep->c_name, active_objs, num_objs, active_slabs, num_slabs, + (1<<cachep->c_gfporder)*num_slabs, + high, allocs, grown, reaped, errors); + } +#else + spin_unlock_irqrestore(&cachep->c_spinlock, save_flags); + len += sprintf(buf+len, "%-17s %6lu %6lu\n", cachep->c_name, active_objs, num_objs); +#endif /* SLAB_STATS */ } while ((cachep = cachep->c_nextp) != &cache_cache); + up(&cache_chain_sem); + return len; } +#endif /* CONFIG_PROC_FS */ @@ -23,6 +23,7 @@ #include <linux/fs.h> #include <linux/swapctl.h> #include <linux/pagemap.h> +#include <linux/init.h> #include <asm/dma.h> #include <asm/system.h> /* for cli()/sti() */ @@ -67,7 +68,7 @@ swapstat_t swapstats = {0}; /* General swap control */ /* Parse the kernel command line "swap=" option at load time: */ -void swap_setup(char *str, int *ints) +__initfunc(void swap_setup(char *str, int *ints)) { int * swap_vars[8] = { &MAX_PAGE_AGE, @@ -87,7 +88,7 @@ void swap_setup(char *str, int *ints) } /* Parse the kernel command line "buff=" option at load time: */ -void buff_setup(char *str, int *ints) +__initfunc(void buff_setup(char *str, int *ints)) { int * buff_vars[6] = { &MAX_BUFF_AGE, diff --git a/mm/swap_state.c b/mm/swap_state.c index 044180721..f3ffa46d5 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -16,6 +16,7 @@ #include <linux/swap.h> #include <linux/fs.h> #include <linux/swapctl.h> +#include <linux/init.h> #include <asm/dma.h> #include <asm/system.h> /* for cli()/sti() */ @@ -69,8 +70,8 @@ int add_to_swap_cache(unsigned long index, unsigned long entry) return 0; } -unsigned long init_swap_cache(unsigned long mem_start, - unsigned long mem_end) +__initfunc(unsigned long init_swap_cache(unsigned long mem_start, + unsigned long mem_end)) { unsigned long swap_cache_size; diff --git a/mm/swapfile.c b/mm/swapfile.c index 91221a415..32a5ed8b0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -298,24 +298,25 @@ static int unuse_process(struct mm_struct * mm, unsigned int type, unsigned long */ static int try_to_unuse(unsigned int type) { - int nr; unsigned long page = get_free_page(GFP_KERNEL); + struct task_struct *p; if (!page) return -ENOMEM; 
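The try_to_unuse() rework that follows walks every task with for_each_task() under tasklist_lock, drops the lock around unuse_process() (which can block), and then restarts the whole walk, since nothing about the list can be trusted once the lock has been released. A minimal user-space sketch of that drop-lock-and-restart idiom; struct task, task_head and do_blocking_work() are illustrative stand-ins, not kernel code:

```c
#include <pthread.h>
#include <stdio.h>

struct task {
	int pid;
	int needs_work;
	struct task *next;
};

/* A tiny fixed "task list"; in the kernel this is the circular list
 * rooted at init_task that for_each_task() walks. */
static struct task tasks[3] = {
	{ 1, 1, &tasks[1] },
	{ 2, 0, &tasks[2] },
	{ 3, 1, NULL },
};
static struct task *task_head = &tasks[0];
static pthread_rwlock_t tasklist_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Stands in for unuse_process(): it may block, so it must run unlocked.
 * Returns non-zero when it did real work, meaning the list may have
 * changed and the caller should restart the walk from the head. */
static int do_blocking_work(struct task *p)
{
	if (!p->needs_work)
		return 0;
	p->needs_work = 0;
	printf("processed pid %d\n", p->pid);
	return 1;
}

int main(void)
{
	struct task *p;

again:
	pthread_rwlock_rdlock(&tasklist_lock);
	for (p = task_head; p; p = p->next) {
		pthread_rwlock_unlock(&tasklist_lock);
		if (do_blocking_work(p))
			goto again;	/* walk again from the head */
		pthread_rwlock_rdlock(&tasklist_lock);
	}
	pthread_rwlock_unlock(&tasklist_lock);
	return 0;
}
```

The restart is the price of dropping the lock around blocking work: once other writers may have touched the list, the saved iterator is worthless.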
- nr = 0; - while (nr < NR_TASKS) { - struct task_struct * p = task[nr]; - if (p) { - if (unuse_process(p->mm, type, page)) { - page = get_free_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - continue; - } +again: + read_lock(&tasklist_lock); + for_each_task(p) { + read_unlock(&tasklist_lock); + if(unuse_process(p->mm, type, page)) { + page = get_free_page(GFP_KERNEL); + if(!page) + return -ENOMEM; + goto again; } - nr++; + read_lock(&tasklist_lock); } + read_unlock(&tasklist_lock); + free_page(page); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index b25c0a0ac..d890be5df 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -271,54 +271,72 @@ static int swap_out_process(struct task_struct * p, int dma, int wait) static int swap_out(unsigned int priority, int dma, int wait) { - static int swap_task; - int loop, counter; + static int skip_factor = 0; + int limit = nr_tasks - 1; + int loop, counter, i; struct task_struct *p; counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority; + if(skip_factor > nr_tasks) + skip_factor = 0; + + read_lock(&tasklist_lock); + p = init_task.next_task; + i = skip_factor; + while(i--) + p = p->next_task; for(; counter >= 0; counter--) { - /* - * Check that swap_task is suitable for swapping. If not, look for - * the next suitable process. - */ + /* Check if task is suitable for swapping. */ loop = 0; while(1) { - if (swap_task >= NR_TASKS) { - swap_task = 1; + if(!--limit) { + limit = nr_tasks - 1; + /* See if all processes are unswappable or + * already swapped out. + */ if (loop) - /* all processes are unswappable or already swapped out */ - return 0; + goto out; loop = 1; } - - p = task[swap_task]; - if (p && p->swappable && p->mm->rss) + if (p->swappable && p->mm->rss) break; - - swap_task++; + if((p = p->next_task) == &init_task) + p = p->next_task; } + skip_factor++; - /* - * Determine the number of pages to swap from this process. - */ + /* Determine the number of pages to swap from this process. */ if (!p->swap_cnt) { - /* Normalise the number of pages swapped by + /* Normalise the number of pages swapped by multiplying by (RSS / 1MB) */ p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss); } if (!--p->swap_cnt) - swap_task++; + skip_factor++; + read_unlock(&tasklist_lock); + switch (swap_out_process(p, dma, wait)) { - case 0: - if (p->swap_cnt) - swap_task++; - break; - case 1: - return 1; - default: - break; - } + case 0: + if (p->swap_cnt) + skip_factor++; + break; + case 1: + return 1; + default: + break; + }; + + /* Whoever we swapped may not even exist now, in fact we cannot + * assume anything about the list we were searching previously. + */ + read_lock(&tasklist_lock); + p = init_task.next_task; + i = skip_factor; + while(i--) + p = p->next_task; } +out: + read_unlock(&tasklist_lock); return 0; } |
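The slab changes above lean on two debugging aids the old allocator lacked: red-zoning (SLAB_RED_ZONE) brackets each object with guard words that are xchg()'d between a "free" and an "allocated" magic on every transition, and poisoning (SLAB_POISION) fills freed objects with a known pattern that is verified on the next allocation. A stand-alone sketch of the red-zone idea, using made-up magic values and helper names rather than the kernel's:

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define RED_MAGIC_FREE   0x5A2CF071UL	/* guard value while the object is free (made up) */
#define RED_MAGIC_ALLOC  0x170FC2A5UL	/* guard value while the object is allocated (made up) */
#define OBJ_SIZE         16

/* Single-threaded stand-in for the kernel's xchg(). */
static unsigned long xchg_word(unsigned long *p, unsigned long v)
{
	unsigned long old = *p;
	*p = v;
	return old;
}

/* Layout: [front guard][OBJ_SIZE bytes][rear guard]. */
static void *redzone_alloc(void)
{
	unsigned char *raw = malloc(OBJ_SIZE + 2 * sizeof(unsigned long));
	unsigned long *front, *rear;

	if (!raw)
		abort();
	front = (unsigned long *)raw;
	rear = (unsigned long *)(raw + sizeof(unsigned long) + OBJ_SIZE);
	*front = RED_MAGIC_FREE;	/* as a previous free would have left them */
	*rear = RED_MAGIC_FREE;

	/* Allocation flips both guards and complains if they were not "free". */
	if (xchg_word(front, RED_MAGIC_ALLOC) != RED_MAGIC_FREE)
		fprintf(stderr, "bad front redzone at alloc\n");
	if (xchg_word(rear, RED_MAGIC_ALLOC) != RED_MAGIC_FREE)
		fprintf(stderr, "bad rear redzone at alloc\n");
	return raw + sizeof(unsigned long);
}

static void redzone_free(void *objp)
{
	unsigned char *raw = (unsigned char *)objp - sizeof(unsigned long);
	unsigned long *front = (unsigned long *)raw;
	unsigned long *rear = (unsigned long *)((unsigned char *)objp + OBJ_SIZE);

	/* Free flips the guards back; anything but the "allocated" magic means
	 * an overrun/underrun while the object was live, or a double free. */
	if (xchg_word(front, RED_MAGIC_FREE) != RED_MAGIC_ALLOC)
		fprintf(stderr, "bad front redzone at free\n");
	if (xchg_word(rear, RED_MAGIC_FREE) != RED_MAGIC_ALLOC)
		fprintf(stderr, "bad rear redzone at free\n");
	free(raw);
}

int main(void)
{
	char *obj = redzone_alloc();

	memset(obj, 0, OBJ_SIZE);
	obj[OBJ_SIZE] = 1;	/* one byte past the end: clobbers the rear guard */
	redzone_free(obj);	/* reports "bad rear redzone at free" */
	return 0;
}
```

The same pairing is exercised by kmem_self_test() in the diff, which deliberately writes one byte before and one byte past a 16-byte object to trip the guards, then scribbles on a freed object to trip the poison check.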