From e7c2a72e2680827d6a733931273a93461c0d8d1b Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 14 Nov 1995 08:00:00 +0000 Subject: Import of Linux/MIPS 1.3.0 --- mm/Makefile | 32 ++ mm/filemap.c | 274 +++++++++++++ mm/kmalloc.c | 407 +++++++++++++++++++ mm/memory.c | 1137 ++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/mmap.c | 980 +++++++++++++++++++++++++++++++++++++++++++++ mm/mprotect.c | 251 ++++++++++++ mm/swap.c | 1231 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/vmalloc.c | 244 ++++++++++++ 8 files changed, 4556 insertions(+) create mode 100644 mm/Makefile create mode 100644 mm/filemap.c create mode 100644 mm/kmalloc.c create mode 100644 mm/memory.c create mode 100644 mm/mmap.c create mode 100644 mm/mprotect.c create mode 100644 mm/swap.c create mode 100644 mm/vmalloc.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile new file mode 100644 index 000000000..35f51d45f --- /dev/null +++ b/mm/Makefile @@ -0,0 +1,32 @@ +# +# Makefile for the linux memory manager. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... + +.c.o: + $(CC) $(CFLAGS) -c $< +.s.o: + $(AS) -o $*.o $< +.c.s: + $(CC) $(CFLAGS) -S $< + +OBJS = memory.o swap.o mmap.o filemap.o mprotect.o kmalloc.o vmalloc.o + +mm.o: $(OBJS) + $(LD) -r -o mm.o $(OBJS) + +modules: + +dep: + $(CPP) -M *.c > .depend + +# +# include a dependency file if one exists +# +ifeq (.depend,$(wildcard .depend)) +include .depend +endif diff --git a/mm/filemap.c b/mm/filemap.c new file mode 100644 index 000000000..5a1e99142 --- /dev/null +++ b/mm/filemap.c @@ -0,0 +1,274 @@ +/* + * linux/mm/filemmap.c + * + * Copyright (C) 1994 Linus Torvalds + */ + +/* + * This file handles the generic file mmap semantics used by + * most "normal" filesystems (but you don't /have/ to use this: + * the NFS filesystem does this differently, for example) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Shared mappings implemented 30.11.1994. It's not fully working yet, + * though. + */ + +static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, + unsigned long page, int no_share) +{ + struct inode * inode = area->vm_inode; + unsigned int block; + int nr[8]; + int i, *p; + + address &= PAGE_MASK; + block = address - area->vm_start + area->vm_offset; + block >>= inode->i_sb->s_blocksize_bits; + i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; + p = nr; + do { + *p = bmap(inode,block); + i--; + block++; + p++; + } while (i > 0); + return bread_page(page, inode->i_dev, nr, inode->i_sb->s_blocksize, no_share); +} + +/* + * NOTE! mmap sync doesn't really work yet. This is mainly a stub for it, + * which only works if the buffers and the page were already sharing the + * same physical page (that's actually pretty common, especially if the + * file has been mmap'ed before being read the normal way). + * + * Todo: + * - non-shared pages also need to be synced with the buffers. + * - the "swapout()" function needs to swap out the page to + * the shared file instead of using the swap device. 
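+ *
+ * How the shared-page case below works, roughly: buffer_pages[MAP_NR(page)]
+ * gives the circular b_this_page ring of buffer heads backing that physical
+ * page (the page-cache/buffer-cache sharing case), and marking each of them
+ * dirty lets the normal buffer writeback push the data to disk.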
+ */ +static void filemap_sync_page(struct vm_area_struct * vma, + unsigned long offset, + unsigned long page) +{ + struct buffer_head * bh; + + printk("msync: %ld: [%08lx]\n", offset, page); + bh = buffer_pages[MAP_NR(page)]; + if (bh) { + /* whee.. just mark the buffer heads dirty */ + struct buffer_head * tmp = bh; + do { + mark_buffer_dirty(tmp, 0); + tmp = tmp->b_this_page; + } while (tmp != bh); + return; + } + /* we'll need to go fetch the buffer heads etc.. RSN */ + printk("Can't handle non-shared page yet\n"); + return; +} + +static inline void filemap_sync_pte(pte_t * pte, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + pte_t page = *pte; + + if (!pte_present(page)) + return; + if (!pte_dirty(page)) + return; + if (flags & MS_INVALIDATE) { + pte_clear(pte); + } else { + mem_map[MAP_NR(pte_page(page))]++; + *pte = pte_mkclean(page); + } + filemap_sync_page(vma, address - vma->vm_start, pte_page(page)); + free_page(pte_page(page)); +} + +static inline void filemap_sync_pte_range(pmd_t * pmd, + unsigned long address, unsigned long size, + struct vm_area_struct *vma, unsigned long offset, unsigned int flags) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*pmd)) + return; + if (pmd_bad(*pmd)) { + printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd)); + pmd_clear(pmd); + return; + } + pte = pte_offset(pmd, address); + offset += address & PMD_MASK; + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + filemap_sync_pte(pte, vma, address + offset, flags); + address += PAGE_SIZE; + pte++; + } while (address < end); +} + +static inline void filemap_sync_pmd_range(pgd_t * pgd, + unsigned long address, unsigned long size, + struct vm_area_struct *vma, unsigned int flags) +{ + pmd_t * pmd; + unsigned long offset, end; + + if (pgd_none(*pgd)) + return; + if (pgd_bad(*pgd)) { + printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd)); + pgd_clear(pgd); + return; + } + pmd = pmd_offset(pgd, address); + offset = address & PMD_MASK; + address &= ~PMD_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); +} + +static void filemap_sync(struct vm_area_struct * vma, unsigned long address, + size_t size, unsigned int flags) +{ + pgd_t * dir; + unsigned long end = address + size; + + dir = pgd_offset(current, address); + while (address < end) { + filemap_sync_pmd_range(dir, address, end - address, vma, flags); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } + invalidate(); + return; +} + +/* + * This handles area unmaps.. + */ +static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len) +{ + filemap_sync(vma, start, len, MS_ASYNC); +} + +/* + * This handles complete area closes.. + */ +static void filemap_close(struct vm_area_struct * vma) +{ + filemap_sync(vma, vma->vm_start, vma->vm_end - vma->vm_start, MS_ASYNC); +} + +/* + * This isn't implemented yet: you'll get a warning and incorrect behaviour. + * + * Note that the page is free'd by the higher-level after return, + * so we have to either write it out or just forget it. We currently + * forget it.. 
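+ *
+ * "Incorrect behaviour" here means: the pte is simply cleared, so the
+ * modified data is dropped and the next fault on that address goes back
+ * through filemap_nopage() and re-reads the old contents from the file.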
+ */ +void filemap_swapout(struct vm_area_struct * vma, + unsigned long offset, + pte_t *page_table) +{ + printk("swapout not implemented on shared files..\n"); + pte_clear(page_table); +} + +/* + * Shared mappings need to be able to do the right thing at + * close/unmap/sync. They will also use the private file as + * backing-store for swapping.. + */ +static struct vm_operations_struct file_shared_mmap = { + NULL, /* open */ + filemap_close, /* close */ + filemap_unmap, /* unmap */ + NULL, /* protect */ + filemap_sync, /* sync */ + NULL, /* advise */ + filemap_nopage, /* nopage */ + NULL, /* wppage */ + filemap_swapout, /* swapout */ + NULL, /* swapin */ +}; + +/* + * Private mappings just need to be able to load in the map + * + * (this is actually used for shared mappings as well, if we + * know they can't ever get write permissions..) + */ +static struct vm_operations_struct file_private_mmap = { + NULL, /* open */ + NULL, /* close */ + NULL, /* unmap */ + NULL, /* protect */ + NULL, /* sync */ + NULL, /* advise */ + filemap_nopage, /* nopage */ + NULL, /* wppage */ + NULL, /* swapout */ + NULL, /* swapin */ +}; + +/* This is used for a general mmap of a disk file */ +int generic_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma) +{ + struct vm_operations_struct * ops; + + if (vma->vm_offset & (inode->i_sb->s_blocksize - 1)) + return -EINVAL; + if (!inode->i_sb || !S_ISREG(inode->i_mode)) + return -EACCES; + if (!inode->i_op || !inode->i_op->bmap) + return -ENOEXEC; + ops = &file_private_mmap; + if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) { + static int nr = 0; + ops = &file_shared_mmap; +#ifndef SHARED_MMAP_REALLY_WORKS /* it doesn't, yet */ + if (nr++ < 5) + printk("%s tried to do a shared writeable mapping\n", current->comm); + return -EINVAL; +#endif + } + } + if (!IS_RDONLY(inode)) { + inode->i_atime = CURRENT_TIME; + inode->i_dirt = 1; + } + vma->vm_inode = inode; + inode->i_count++; + vma->vm_ops = ops; + return 0; +} diff --git a/mm/kmalloc.c b/mm/kmalloc.c new file mode 100644 index 000000000..e288ecf2f --- /dev/null +++ b/mm/kmalloc.c @@ -0,0 +1,407 @@ +/* + * linux/mm/kmalloc.c + * + * Copyright (C) 1991, 1992 Linus Torvalds & Roger Wolff. + * + * Written by R.E. Wolff Sept/Oct '93. + * + */ + +/* + * Modified by Alex Bligh (alex@cconcepts.co.uk) 4 Apr 1994 to use multiple + * pages. So for 'page' throughout, read 'area'. + */ + +#include +#include +#include + +#define GFP_LEVEL_MASK 0xf + +/* I want this low enough for a while to catch errors. + I want this number to be increased in the near future: + loadable device drivers should use this function to get memory */ + +#define MAX_KMALLOC_K ((PAGE_SIZE<<(NUM_AREA_ORDERS-1))>>10) + + +/* This defines how many times we should try to allocate a free page before + giving up. Normally this shouldn't happen at all. */ +#define MAX_GET_FREE_PAGE_TRIES 4 + + +/* Private flags. */ + +#define MF_USED 0xffaa0055 +#define MF_FREE 0x0055ffaa + + +/* + * Much care has gone into making these routines in this file reentrant. + * + * The fancy bookkeeping of nbytesmalloced and the like are only used to + * report them to the user (oooohhhhh, aaaaahhhhh....) are not + * protected by cli(). (If that goes wrong. So what?) + * + * These routines restore the interrupt status to allow calling with ints + * off. + */ + +/* + * A block header. This is in front of every malloc-block, whether free or not. 
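+ *
+ * The union keeps the header small: a block in use records its length
+ * (bh_length), a free block records the next free block on the per-page
+ * freelist (bh_next).  Either way the header costs
+ * sizeof(struct block_header) bytes, which is why get_order() adds it to
+ * the requested size before picking a bucket.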
+ */ +struct block_header { + unsigned long bh_flags; + union { + unsigned long ubh_length; + struct block_header *fbh_next; + } vp; +}; + + +#define bh_length vp.ubh_length +#define bh_next vp.fbh_next +#define BH(p) ((struct block_header *)(p)) + + +/* + * The page descriptor is at the front of every page that malloc has in use. + */ +struct page_descriptor { + struct page_descriptor *next; + struct block_header *firstfree; + int order; + int nfree; +}; + + +#define PAGE_DESC(p) ((struct page_descriptor *)(((unsigned long)(p)) & PAGE_MASK)) + + +/* + * A size descriptor describes a specific class of malloc sizes. + * Each class of sizes has its own freelist. + */ +struct size_descriptor { + struct page_descriptor *firstfree; + struct page_descriptor *dmafree; /* DMA-able memory */ + int size; + int nblocks; + + int nmallocs; + int nfrees; + int nbytesmalloced; + int npages; + unsigned long gfporder; /* number of pages in the area required */ +}; + +/* + * For now it is unsafe to allocate bucket sizes between n & n=16 where n is + * 4096 * any power of two + */ +#if PAGE_SIZE == 4096 +struct size_descriptor sizes[] = { + { NULL, NULL, 32,127, 0,0,0,0, 0}, + { NULL, NULL, 64, 63, 0,0,0,0, 0 }, + { NULL, NULL, 128, 31, 0,0,0,0, 0 }, + { NULL, NULL, 252, 16, 0,0,0,0, 0 }, + { NULL, NULL, 508, 8, 0,0,0,0, 0 }, + { NULL, NULL,1020, 4, 0,0,0,0, 0 }, + { NULL, NULL,2040, 2, 0,0,0,0, 0 }, + { NULL, NULL,4096-16, 1, 0,0,0,0, 0 }, + { NULL, NULL,8192-16, 1, 0,0,0,0, 1 }, + { NULL, NULL,16384-16, 1, 0,0,0,0, 2 }, + { NULL, NULL,32768-16, 1, 0,0,0,0, 3 }, + { NULL, NULL,65536-16, 1, 0,0,0,0, 4 }, + { NULL, NULL,131072-16, 1, 0,0,0,0, 5 }, + { NULL, NULL, 0, 0, 0,0,0,0, 0 } +}; +#elif PAGE_SIZE == 8192 +struct size_descriptor sizes[] = { + { NULL, NULL, 64,127, 0,0,0,0, 0}, + { NULL, NULL, 128, 63, 0,0,0,0, 0 }, + { NULL, NULL, 248, 31, 0,0,0,0, 0 }, + { NULL, NULL, 504, 16, 0,0,0,0, 0 }, + { NULL, NULL,1016, 8, 0,0,0,0, 0 }, + { NULL, NULL,2040, 4, 0,0,0,0, 0 }, + { NULL, NULL,4080, 2, 0,0,0,0, 0 }, + { NULL, NULL,8192-32, 1, 0,0,0,0, 0 }, + { NULL, NULL,16384-32, 1, 0,0,0,0, 1 }, + { NULL, NULL,32768-32, 1, 0,0,0,0, 2 }, + { NULL, NULL,65536-32, 1, 0,0,0,0, 3 }, + { NULL, NULL,131072-32, 1, 0,0,0,0, 4 }, + { NULL, NULL,262144-32, 1, 0,0,0,0, 5 }, + { NULL, NULL, 0, 0, 0,0,0,0, 0 } +}; +#else +#error you need to make a version for your pagesize +#endif + +#define NBLOCKS(order) (sizes[order].nblocks) +#define BLOCKSIZE(order) (sizes[order].size) +#define AREASIZE(order) (PAGE_SIZE<<(sizes[order].gfporder)) + + +long kmalloc_init (long start_mem,long end_mem) +{ + int order; + +/* + * Check the static info array. Things will blow up terribly if it's + * incorrect. This is a late "compile time" check..... 
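+ *
+ * The invariant being checked is
+ *	NBLOCKS(order)*BLOCKSIZE(order) + sizeof(struct page_descriptor)
+ *		<= AREASIZE(order)
+ * e.g. for the first 4K bucket, assuming a 16-byte descriptor on a
+ * 32-bit build: 127*32 + 16 = 4080 <= 4096.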
+ */ +for (order = 0;BLOCKSIZE(order);order++) + { + if ((NBLOCKS (order)*BLOCKSIZE(order) + sizeof (struct page_descriptor)) > + AREASIZE(order)) + { + printk ("Cannot use %d bytes out of %d in order = %d block mallocs\n", + (int) (NBLOCKS (order) * BLOCKSIZE(order) + + sizeof (struct page_descriptor)), + (int) AREASIZE(order), + BLOCKSIZE (order)); + panic ("This only happens if someone messes with kmalloc"); + } + } +return start_mem; +} + + + +int get_order (int size) +{ + int order; + + /* Add the size of the header */ + size += sizeof (struct block_header); + for (order = 0;BLOCKSIZE(order);order++) + if (size <= BLOCKSIZE (order)) + return order; + return -1; +} + +void * kmalloc (size_t size, int priority) +{ + unsigned long flags; + int order,tries,i,sz; + int dma_flag; + struct block_header *p; + struct page_descriptor *page; + + dma_flag = (priority & GFP_DMA); + priority &= GFP_LEVEL_MASK; + +/* Sanity check... */ + if (intr_count && priority != GFP_ATOMIC) { + static int count = 0; + if (++count < 5) { + printk("kmalloc called nonatomically from interrupt %p\n", + __builtin_return_address(0)); + priority = GFP_ATOMIC; + } + } + +order = get_order (size); +if (order < 0) + { + printk ("kmalloc of too large a block (%d bytes).\n",(int) size); + return (NULL); + } + +save_flags(flags); + +/* It seems VERY unlikely to me that it would be possible that this + loop will get executed more than once. */ +tries = MAX_GET_FREE_PAGE_TRIES; +while (tries --) + { + /* Try to allocate a "recently" freed memory block */ + cli (); + if ((page = (dma_flag ? sizes[order].dmafree : sizes[order].firstfree)) && + (p = page->firstfree)) + { + if (p->bh_flags == MF_FREE) + { + page->firstfree = p->bh_next; + page->nfree--; + if (!page->nfree) + { + if(dma_flag) + sizes[order].dmafree = page->next; + else + sizes[order].firstfree = page->next; + page->next = NULL; + } + restore_flags(flags); + + sizes [order].nmallocs++; + sizes [order].nbytesmalloced += size; + p->bh_flags = MF_USED; /* As of now this block is officially in use */ + p->bh_length = size; + return p+1; /* Pointer arithmetic: increments past header */ + } + printk ("Problem: block on freelist at %08lx isn't free.\n",(long)p); + return (NULL); + } + restore_flags(flags); + + + /* Now we're in trouble: We need to get a new free page..... 
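+     The plan: grab a fresh area of AREASIZE(order) bytes, put a
+     page_descriptor at its start, chain the NBLOCKS(order) blocks of
+     BLOCKSIZE(order) bytes behind it into a freelist, hook the new page
+     onto the right size bucket, and go round the loop once more.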
*/ + + sz = BLOCKSIZE(order); /* sz is the size of the blocks we're dealing with */ + + /* This can be done with ints on: This is private to this invocation */ + if (dma_flag) + page = (struct page_descriptor *) __get_dma_pages (priority & GFP_LEVEL_MASK, sizes[order].gfporder); + else + page = (struct page_descriptor *) __get_free_pages (priority & GFP_LEVEL_MASK, sizes[order].gfporder); + + if (!page) { + static unsigned long last = 0; + if (last + 10*HZ < jiffies) { + last = jiffies; + printk ("Couldn't get a free page.....\n"); + } + return NULL; + } +#if 0 + printk ("Got page %08x to use for %d byte mallocs....",(long)page,sz); +#endif + sizes[order].npages++; + + /* Loop for all but last block: */ + for (i=NBLOCKS(order),p=BH (page+1);i > 1;i--,p=p->bh_next) + { + p->bh_flags = MF_FREE; + p->bh_next = BH ( ((long)p)+sz); + } + /* Last block: */ + p->bh_flags = MF_FREE; + p->bh_next = NULL; + + page->order = order; + page->nfree = NBLOCKS(order); + page->firstfree = BH(page+1); +#if 0 + printk ("%d blocks per page\n",page->nfree); +#endif + /* Now we're going to muck with the "global" freelist for this size: + this should be uninterruptible */ + cli (); + /* + * sizes[order].firstfree used to be NULL, otherwise we wouldn't be + * here, but you never know.... + */ + if (dma_flag) { + page->next = sizes[order].dmafree; + sizes[order].dmafree = page; + } else { + page->next = sizes[order].firstfree; + sizes[order].firstfree = page; + } + restore_flags(flags); + } + +/* Pray that printk won't cause this to happen again :-) */ + +printk ("Hey. This is very funny. I tried %d times to allocate a whole\n" + "new page for an object only %d bytes long, but some other process\n" + "beat me to actually allocating it. Also note that this 'error'\n" + "message is soooo very long to catch your attention. I'd appreciate\n" + "it if you'd be so kind as to report what conditions caused this to\n" + "the author of this kmalloc: wolff@dutecai.et.tudelft.nl.\n" + "(Executive summary: This can't happen)\n", + MAX_GET_FREE_PAGE_TRIES, + (int) size); +return NULL; +} + +void kfree_s (void *ptr,int size) +{ +unsigned long flags; +int order; +register struct block_header *p=((struct block_header *)ptr) -1; +struct page_descriptor *page,*pg2; + +page = PAGE_DESC (p); +order = page->order; +if ((order < 0) || + (order > sizeof (sizes)/sizeof (sizes[0])) || + (((long)(page->next)) & ~PAGE_MASK) || + (p->bh_flags != MF_USED)) + { + printk ("kfree of non-kmalloced memory: %p, next= %p, order=%d\n", + p, page->next, page->order); + return; + } +if (size && + size != p->bh_length) + { + printk ("Trying to free pointer at %p with wrong size: %d instead of %lu.\n", + p,size,p->bh_length); + return; + } +size = p->bh_length; +p->bh_flags = MF_FREE; /* As of now this block is officially free */ +save_flags(flags); +cli (); +p->bh_next = page->firstfree; +page->firstfree = p; +page->nfree ++; + +if (page->nfree == 1) + { /* Page went from full to one free block: put it on the freelist. Do not bother + trying to put it on the DMA list. 
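+	      A DMA page therefore re-enters the regular freelist here; when
+	      it later becomes completely free, the unlink code below searches
+	      both firstfree and dmafree, so it is still found and released.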
*/ + if (page->next) + { + printk ("Page %p already on freelist dazed and confused....\n", page); + } + else + { + page->next = sizes[order].firstfree; + sizes[order].firstfree = page; + } + } + +/* If page is completely free, free it */ +if (page->nfree == NBLOCKS (page->order)) + { +#if 0 + printk ("Freeing page %08x.\n", (long)page); +#endif + if (sizes[order].firstfree == page) + { + sizes[order].firstfree = page->next; + } + else if (sizes[order].dmafree == page) + { + sizes[order].dmafree = page->next; + } + else + { + for (pg2=sizes[order].firstfree; + (pg2 != NULL) && (pg2->next != page); + pg2=pg2->next) + /* Nothing */; + if (!pg2) + for (pg2=sizes[order].dmafree; + (pg2 != NULL) && (pg2->next != page); + pg2=pg2->next) + /* Nothing */; + if (pg2 != NULL) + pg2->next = page->next; + else + printk ("Ooops. page %p doesn't show on freelist.\n", page); + } +/* FIXME: I'm sure we should do something with npages here (like npages--) */ + free_pages ((long)page, sizes[order].gfporder); + } +restore_flags(flags); + +/* FIXME: ?? Are these increment & decrement operations guaranteed to be + * atomic? Could an IRQ not occur between the read & the write? + * Maybe yes on a x86 with GCC...?? + */ +sizes[order].nfrees++; /* Noncritical (monitoring) admin stuff */ +sizes[order].nbytesmalloced -= size; +} diff --git a/mm/memory.c b/mm/memory.c new file mode 100644 index 000000000..4fba3a4c4 --- /dev/null +++ b/mm/memory.c @@ -0,0 +1,1137 @@ +/* + * linux/mm/memory.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * demand-loading started 01.12.91 - seems it is high on the list of + * things wanted, and it should be easy to implement. - Linus + */ + +/* + * Ok, demand-loading was easy, shared pages a little bit tricker. Shared + * pages started 02.12.91, seems to work. - Linus. + * + * Tested sharing by executing about 30 /bin/sh: under the old kernel it + * would have taken more than the 6M I have free, but it worked well as + * far as I could see. + * + * Also corrected some "invalidate()"s - I wasn't doing enough of them. + */ + +/* + * Real VM (paging to/from disk) started 18.12.91. Much more work and + * thought has to go into this. Oh, well.. + * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. + * Found it. Everything seems to work now. + * 20.12.91 - Ok, making the swap-device changeable like the root. + */ + +/* + * 05.04.94 - Multi-page memory management added for v1.1. + * Idea by Alex Bligh (alex@cconcepts.co.uk) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +unsigned long high_memory = 0; + +/* + * The free_area_list arrays point to the queue heads of the free areas + * of different sizes + */ +int nr_swap_pages = 0; +int nr_free_pages = 0; +struct mem_list free_area_list[NR_MEM_LISTS]; +unsigned char * free_area_map[NR_MEM_LISTS]; + +#if 0 +/* + * This now resides in include/asm/page.h + */ +#define copy_page(from,to) memcpy((void *) to, (void *) from, PAGE_SIZE) +#endif + +#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) + +mem_map_t * mem_map = NULL; + +/* + * oom() prints a message (so that the user knows why the process died), + * and gives the process an untrappable SIGKILL. 
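+ * "Untrappable" is arranged by clearing the task's SIGKILL handler and
+ * removing SIGKILL from its blocked mask before send_sig(), so no handler
+ * or signal mask the process has set up can save it.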
+ */ +void oom(struct task_struct * task) +{ + printk("\nOut of memory for %s.\n", current->comm); + task->sigaction[SIGKILL-1].sa_handler = NULL; + task->blocked &= ~(1<<(SIGKILL-1)); + send_sig(SIGKILL,task,1); +} + +static inline void free_one_pte(pte_t * page_table) +{ + pte_t page = *page_table; + + if (pte_none(page)) + return; + pte_clear(page_table); + if (!pte_present(page)) { + swap_free(pte_val(page)); + return; + } + free_page(pte_page(page)); + return; +} + +static inline void free_one_pmd(pmd_t * dir) +{ + int j; + pte_t * pte; + + if (pmd_none(*dir)) + return; + if (pmd_bad(*dir)) { + printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir)); + pmd_clear(dir); + return; + } + pte = pte_offset(dir, 0); + pmd_clear(dir); + if (pte_inuse(pte)) { + pte_free(pte); + return; + } + for (j = 0; j < PTRS_PER_PTE ; j++) + free_one_pte(pte+j); + pte_free(pte); +} + +static inline void free_one_pgd(pgd_t * dir) +{ + int j; + pmd_t * pmd; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir)); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, 0); + pgd_clear(dir); + if (pmd_inuse(pmd)) { + pmd_free(pmd); + return; + } + for (j = 0; j < PTRS_PER_PMD ; j++) + free_one_pmd(pmd+j); + pmd_free(pmd); +} + + +/* + * This function clears all user-level page tables of a process - this + * is needed by execve(), so that old pages aren't in the way. Note that + * unlike 'free_page_tables()', this function still leaves a valid + * page-table-tree in memory: it just removes the user pages. The two + * functions are similar, but there is a fundamental difference. + */ +void clear_page_tables(struct task_struct * tsk) +{ + int i; + pgd_t * page_dir; + + if (!tsk) + return; + if (tsk == task[0]) + panic("task[0] (swapper) doesn't support exec()\n"); + page_dir = pgd_offset(tsk, 0); + if (!page_dir || page_dir == swapper_pg_dir) { + printk("%s trying to clear kernel page-directory: not good\n", tsk->comm); + return; + } + if (pgd_inuse(page_dir)) { + pgd_t * new_pg; + + if (!(new_pg = pgd_alloc())) { + oom(tsk); + return; + } + for (i = USER_PTRS_PER_PGD ; i < PTRS_PER_PGD ; i++) + new_pg[i] = page_dir[i]; + SET_PAGE_DIR(tsk, new_pg); + pgd_free(page_dir); + return; + } + for (i = 0 ; i < USER_PTRS_PER_PGD ; i++) + free_one_pgd(page_dir + i); + invalidate(); + return; +} + +/* + * This function frees up all page tables of a process when it exits. + */ +void free_page_tables(struct task_struct * tsk) +{ + int i; + pgd_t * page_dir; + + if (!tsk) + return; + if (tsk == task[0]) { + printk("task[0] (swapper) killed: unable to recover\n"); + panic("Trying to free up swapper memory space"); + } + page_dir = pgd_offset(tsk, 0); + if (!page_dir || page_dir == swapper_pg_dir) { + printk("%s trying to free kernel page-directory: not good\n", tsk->comm); + return; + } + SET_PAGE_DIR(tsk, swapper_pg_dir); + if (pgd_inuse(page_dir)) { + pgd_free(page_dir); + return; + } + for (i = 0 ; i < PTRS_PER_PGD ; i++) + free_one_pgd(page_dir + i); + pgd_free(page_dir); + invalidate(); +} + +/* + * clone_page_tables() clones the page table for a process - both + * processes will have the exact same pages in memory. There are + * probably races in the memory management with cloning, but we'll + * see.. 
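+ *
+ * Note that nothing is copied here at all: the child is simply pointed at
+ * the parent's page directory, with pgd_reuse() recording the extra user.
+ * Every later pte update by one task is therefore immediately visible to
+ * the other - this is the "share the whole VM" case of clone().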
+ */ +int clone_page_tables(struct task_struct * tsk) +{ + pgd_t * pg_dir; + + pg_dir = pgd_offset(current, 0); + pgd_reuse(pg_dir); + SET_PAGE_DIR(tsk, pg_dir); + return 0; +} + +static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte) +{ + pte_t pte = *old_pte; + + if (pte_none(pte)) + return; + if (!pte_present(pte)) { + swap_duplicate(pte_val(pte)); + *new_pte = pte; + return; + } + if (pte_page(pte) > high_memory || (mem_map[MAP_NR(pte_page(pte))] & MAP_PAGE_RESERVED)) { + *new_pte = pte; + return; + } + if (pte_cow(pte)) + pte = pte_wrprotect(pte); + if (delete_from_swap_cache(pte_page(pte))) + pte = pte_mkdirty(pte); + *new_pte = pte_mkold(pte); + *old_pte = pte; + mem_map[MAP_NR(pte_page(pte))]++; +} + +static inline int copy_one_pmd(pmd_t * old_pmd, pmd_t * new_pmd) +{ + int j; + pte_t *old_pte, *new_pte; + + if (pmd_none(*old_pmd)) + return 0; + if (pmd_bad(*old_pmd)) { + printk("copy_one_pmd: bad page table: probable memory corruption\n"); + pmd_clear(old_pmd); + return 0; + } + old_pte = pte_offset(old_pmd, 0); + if (pte_inuse(old_pte)) { + pte_reuse(old_pte); + *new_pmd = *old_pmd; + return 0; + } + new_pte = pte_alloc(new_pmd, 0); + if (!new_pte) + return -ENOMEM; + for (j = 0 ; j < PTRS_PER_PTE ; j++) { + copy_one_pte(old_pte, new_pte); + old_pte++; + new_pte++; + } + return 0; +} + +static inline int copy_one_pgd(pgd_t * old_pgd, pgd_t * new_pgd) +{ + int j; + pmd_t *old_pmd, *new_pmd; + + if (pgd_none(*old_pgd)) + return 0; + if (pgd_bad(*old_pgd)) { + printk("copy_one_pgd: bad page table (%p: %08lx): probable memory corruption\n", old_pgd, pgd_val(*old_pgd)); + pgd_clear(old_pgd); + return 0; + } + old_pmd = pmd_offset(old_pgd, 0); + if (pmd_inuse(old_pmd)) { + pmd_reuse(old_pmd); + *new_pgd = *old_pgd; + return 0; + } + new_pmd = pmd_alloc(new_pgd, 0); + if (!new_pmd) + return -ENOMEM; + for (j = 0 ; j < PTRS_PER_PMD ; j++) { + int error = copy_one_pmd(old_pmd, new_pmd); + if (error) + return error; + old_pmd++; + new_pmd++; + } + return 0; +} + +/* + * copy_page_tables() just copies the whole process memory range: + * note the special handling of RESERVED (ie kernel) pages, which + * means that they are always shared by all processes. 
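+ *
+ * The actual copy-on-write setup happens in copy_one_pte() above: a
+ * present, non-reserved COW page is write-protected in *both* parent and
+ * child and its mem_map count is bumped, so the first write by either
+ * side faults into do_wp_page() and gets a private copy.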
+ */ +int copy_page_tables(struct task_struct * tsk) +{ + int i; + pgd_t *old_pgd; + pgd_t *new_pgd; + + new_pgd = pgd_alloc(); + if (!new_pgd) + return -ENOMEM; + SET_PAGE_DIR(tsk, new_pgd); + old_pgd = pgd_offset(current, 0); + for (i = 0 ; i < PTRS_PER_PGD ; i++) { + int errno = copy_one_pgd(old_pgd, new_pgd); + if (errno) { + free_page_tables(tsk); + invalidate(); + return errno; + } + old_pgd++; + new_pgd++; + } + invalidate(); + return 0; +} + +static inline void forget_pte(pte_t page) +{ + if (pte_none(page)) + return; + if (pte_present(page)) { + free_page(pte_page(page)); + if (mem_map[MAP_NR(pte_page(page))] & MAP_PAGE_RESERVED) + return; + if (current->mm->rss <= 0) + return; + current->mm->rss--; + return; + } + swap_free(pte_val(page)); +} + +static inline void unmap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*pmd)) + return; + if (pmd_bad(*pmd)) { + printk("unmap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd)); + pmd_clear(pmd); + return; + } + pte = pte_offset(pmd, address); + address &= ~PMD_MASK; + end = address + size; + if (end >= PMD_SIZE) + end = PMD_SIZE; + do { + pte_t page = *pte; + pte_clear(pte); + forget_pte(page); + address += PAGE_SIZE; + pte++; + } while (address < end); +} + +static inline void unmap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size) +{ + pmd_t * pmd; + unsigned long end; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + printk("unmap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir)); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, address); + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + unmap_pte_range(pmd, address, end - address); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); +} + +/* + * a more complete version of free_page_tables which performs with page + * granularity. 
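+ *
+ * It walks pgd -> pmd -> pte over [address, address+size), clears each
+ * pte and hands the old value to forget_pte(), which drops the page (or
+ * the swap entry) and keeps rss in sync, then flushes the TLB with a
+ * single invalidate() at the end.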
+ */ +int unmap_page_range(unsigned long address, unsigned long size) +{ + pgd_t * dir; + unsigned long end = address + size; + + dir = pgd_offset(current, address); + while (address < end) { + unmap_pmd_range(dir, address, end - address); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } + invalidate(); + return 0; +} + +static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte) +{ + unsigned long end; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + pte_t oldpage = *pte; + *pte = zero_pte; + forget_pte(oldpage); + address += PAGE_SIZE; + pte++; + } while (address < end); +} + +static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte) +{ + unsigned long end; + + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + pte_t * pte = pte_alloc(pmd, address); + if (!pte) + return -ENOMEM; + zeromap_pte_range(pte, address, end - address, zero_pte); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); + return 0; +} + +int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot) +{ + int error = 0; + pgd_t * dir; + unsigned long end = address + size; + pte_t zero_pte; + + zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot)); + dir = pgd_offset(current, address); + while (address < end) { + pmd_t *pmd = pmd_alloc(dir, address); + error = -ENOMEM; + if (!pmd) + break; + error = zeromap_pmd_range(pmd, address, end - address, zero_pte); + if (error) + break; + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } + invalidate(); + return error; +} + +/* + * maps a range of physical memory into the requested pages. the old + * mappings are removed. 
any references to nonexistent pages results + * in null mappings (currently treated as "copy-on-access") + */ +static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, + unsigned long offset, pgprot_t prot) +{ + unsigned long end; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + pte_t oldpage = *pte; + pte_clear(pte); + if (offset >= high_memory || (mem_map[MAP_NR(offset)] & MAP_PAGE_RESERVED)) + *pte = mk_pte(offset, prot); + else if (mem_map[MAP_NR(offset)]) { + mem_map[MAP_NR(offset)]++; + *pte = mk_pte(offset, prot); + } + forget_pte(oldpage); + address += PAGE_SIZE; + offset += PAGE_SIZE; + pte++; + } while (address < end); +} + +static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, + unsigned long offset, pgprot_t prot) +{ + unsigned long end; + + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + offset -= address; + do { + pte_t * pte = pte_alloc(pmd, address); + if (!pte) + return -ENOMEM; + remap_pte_range(pte, address, end - address, address + offset, prot); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); + return 0; +} + +int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot) +{ + int error = 0; + pgd_t * dir; + unsigned long end = from + size; + + offset -= from; + dir = pgd_offset(current, from); + while (from < end) { + pmd_t *pmd = pmd_alloc(dir, from); + error = -ENOMEM; + if (!pmd) + break; + error = remap_pmd_range(pmd, from, end - from, offset + from, prot); + if (error) + break; + from = (from + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } + invalidate(); + return error; +} + +/* + * sanity-check function.. + */ +static void put_page(pte_t * page_table, pte_t pte) +{ + if (!pte_none(*page_table)) { + printk("put_page: page already exists %08lx\n", pte_val(*page_table)); + free_page(pte_page(pte)); + return; + } +/* no need for invalidate */ + *page_table = pte; +} + +/* + * This routine is used to map in a page into an address space: needed by + * execve() for the initial stack and environment pages. + */ +unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address) +{ + pgd_t * pgd; + pmd_t * pmd; + pte_t * pte; + + if (page >= high_memory) + printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address); + if (mem_map[MAP_NR(page)] != 1) + printk("mem_map disagrees with %08lx at %08lx\n",page,address); + pgd = pgd_offset(tsk,address); + pmd = pmd_alloc(pgd, address); + if (!pmd) { + free_page(page); + oom(tsk); + return 0; + } + pte = pte_alloc(pmd, address); + if (!pte) { + free_page(page); + oom(tsk); + return 0; + } + if (!pte_none(*pte)) { + printk("put_dirty_page: page already exists\n"); + pte_clear(pte); + invalidate(); + } + *pte = pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))); +/* no need for invalidate */ + return page; +} + +/* + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address + * and decrementing the shared-page counter for the old page. + * + * Goto-purists beware: the only reason for goto's here is that it results + * in better assembly code.. The "default" path will see no jumps at all. + * + * Note that this routine assumes that the protection checks have been + * done by the caller (the low-level page fault routine in most cases). 
+ * Thus we can safely just mark it writable once we've done any necessary + * COW. + * + * We also mark the page dirty at this point even though the page will + * change only once the write actually happens. This avoids a few races, + * and potentially makes it more efficient. + */ +void do_wp_page(struct vm_area_struct * vma, unsigned long address, + int write_access) +{ + pgd_t *page_dir; + pmd_t *page_middle; + pte_t *page_table, pte; + unsigned long old_page, new_page; + + new_page = __get_free_page(GFP_KERNEL); + page_dir = pgd_offset(vma->vm_task,address); + if (pgd_none(*page_dir)) + goto end_wp_page; + if (pgd_bad(*page_dir)) + goto bad_wp_pagedir; + page_middle = pmd_offset(page_dir, address); + if (pmd_none(*page_middle)) + goto end_wp_page; + if (pmd_bad(*page_middle)) + goto bad_wp_pagemiddle; + page_table = pte_offset(page_middle, address); + pte = *page_table; + if (!pte_present(pte)) + goto end_wp_page; + if (pte_write(pte)) + goto end_wp_page; + old_page = pte_page(pte); + if (old_page >= high_memory) + goto bad_wp_page; + vma->vm_task->mm->min_flt++; + /* + * Do we need to copy? + */ + if (mem_map[MAP_NR(old_page)] != 1) { + if (new_page) { + if (mem_map[MAP_NR(old_page)] & MAP_PAGE_RESERVED) + ++vma->vm_task->mm->rss; + copy_page(old_page,new_page); + *page_table = pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))); + free_page(old_page); + invalidate(); + return; + } + *page_table = BAD_PAGE; + free_page(old_page); + oom(vma->vm_task); + invalidate(); + return; + } + *page_table = pte_mkdirty(pte_mkwrite(pte)); + invalidate(); + if (new_page) + free_page(new_page); + return; +bad_wp_page: + printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page); + send_sig(SIGKILL, vma->vm_task, 1); + goto end_wp_page; +bad_wp_pagemiddle: + printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle)); + send_sig(SIGKILL, vma->vm_task, 1); + goto end_wp_page; +bad_wp_pagedir: + printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir)); + send_sig(SIGKILL, vma->vm_task, 1); +end_wp_page: + if (new_page) + free_page(new_page); + return; +} + +/* + * Ugly, ugly, but the goto's result in better assembly.. + */ +int verify_area(int type, const void * addr, unsigned long size) +{ + struct vm_area_struct * vma; + unsigned long start = (unsigned long) addr; + + /* If the current user space is mapped to kernel space (for the + * case where we use a fake user buffer with get_fs/set_fs()) we + * don't expect to find the address in the user vm map. 
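+ *
+ * (That is the usual set_fs(get_ds()) trick: kernel code pointing "user"
+ * accesses at kernel buffers.)  On CPUs where supervisor-mode writes
+ * ignore the WP bit (!wp_works_ok, i.e. the early i386) the write check
+ * below additionally simulates the faults by hand via do_wp_page().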
+ */ + if (get_fs() == get_ds()) + return 0; + + vma = find_vma(current, start); + if (!vma) + goto bad_area; + if (vma->vm_start <= start) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (vma->vm_end - start > current->rlim[RLIMIT_STACK].rlim_cur) + goto bad_area; + +good_area: + if (type == VERIFY_WRITE) + goto check_write; + for (;;) { + struct vm_area_struct * next; + if (!(vma->vm_flags & VM_READ)) + goto bad_area; + if (vma->vm_end - start >= size) + return 0; + next = vma->vm_next; + if (!next || vma->vm_end != next->vm_start) + goto bad_area; + vma = next; + } + +check_write: + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; + if (!wp_works_ok) + goto check_wp_fault_by_hand; + for (;;) { + if (vma->vm_end - start >= size) + break; + if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start) + goto bad_area; + vma = vma->vm_next; + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; + } + return 0; + +check_wp_fault_by_hand: + size--; + size += start & ~PAGE_MASK; + size >>= PAGE_SHIFT; + start &= PAGE_MASK; + + for (;;) { + do_wp_page(vma, start, 1); + if (!size) + break; + size--; + start += PAGE_SIZE; + if (start < vma->vm_end) + continue; + vma = vma->vm_next; + if (!vma || vma->vm_start != start) + goto bad_area; + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area;; + } + return 0; + +bad_area: + return -EFAULT; +} + +static inline void get_empty_page(struct vm_area_struct * vma, pte_t * page_table) +{ + unsigned long tmp; + + if (!(tmp = get_free_page(GFP_KERNEL))) { + oom(vma->vm_task); + put_page(page_table, BAD_PAGE); + return; + } + put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot))); +} + +/* + * try_to_share() checks the page at address "address" in the task "p", + * to see if it exists, and if it is clean. If so, share it with the current + * task. + * + * NOTE! This assumes we have checked that p != current, and that they + * share the same inode and can generally otherwise be shared. + */ +static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area, + unsigned long from_address, struct vm_area_struct * from_area, + unsigned long newpage) +{ + pgd_t * from_dir, * to_dir; + pmd_t * from_middle, * to_middle; + pte_t * from_table, * to_table; + pte_t from, to; + + from_dir = pgd_offset(from_area->vm_task,from_address); +/* is there a page-directory at from? */ + if (pgd_none(*from_dir)) + return 0; + if (pgd_bad(*from_dir)) { + printk("try_to_share: bad page directory %08lx\n", pgd_val(*from_dir)); + pgd_clear(from_dir); + return 0; + } + from_middle = pmd_offset(from_dir, from_address); +/* is there a mid-directory at from? */ + if (pmd_none(*from_middle)) + return 0; + if (pmd_bad(*from_middle)) { + printk("try_to_share: bad mid directory %08lx\n", pmd_val(*from_middle)); + pmd_clear(from_middle); + return 0; + } + from_table = pte_offset(from_middle, from_address); + from = *from_table; +/* is the page present? */ + if (!pte_present(from)) + return 0; +/* if it is dirty it must be from a shared mapping to be shared */ + if (pte_dirty(from)) { + if (!(from_area->vm_flags & VM_SHARED)) + return 0; + if (pte_write(from)) { + printk("nonwritable, but dirty, shared page\n"); + return 0; + } + } +/* is the page reasonable at all? */ + if (pte_page(from) >= high_memory) + return 0; + if (mem_map[MAP_NR(pte_page(from))] & MAP_PAGE_RESERVED) + return 0; +/* is the destination ok? */ + to_dir = pgd_offset(to_area->vm_task,to_address); +/* is there a page-directory at to? 
*/ + if (pgd_none(*to_dir)) + return 0; + if (pgd_bad(*to_dir)) { + printk("try_to_share: bad page directory %08lx\n", pgd_val(*to_dir)); + return 0; + } + to_middle = pmd_offset(to_dir, to_address); +/* is there a mid-directory at to? */ + if (pmd_none(*to_middle)) + return 0; + if (pmd_bad(*to_middle)) { + printk("try_to_share: bad mid directory %08lx\n", pmd_val(*to_middle)); + return 0; + } + to_table = pte_offset(to_middle, to_address); + to = *to_table; + if (!pte_none(to)) + return 0; +/* do we copy? */ + if (newpage) { + /* if it's in the swap cache, it's dirty by implication */ + /* so we can't use it if it's not from a shared mapping */ + if (in_swap_cache(pte_page(from))) { + if (!(from_area->vm_flags & VM_SHARED)) + return 0; + if (!pte_write(from)) { + printk("nonwritable, but dirty, shared page\n"); + return 0; + } + } + copy_page(pte_page(from), newpage); + *to_table = mk_pte(newpage, to_area->vm_page_prot); + return 1; + } +/* + * do a final swap-cache test before sharing them: if it's in the swap + * cache, we have to remove it now, as we get two pointers to the same + * physical page and the cache can't handle it. Mark the original dirty. + * + * NOTE! Even if "from" is dirty, "to" will be clean: if we get here + * with a dirty "from", the from-mapping is a shared map, so we can trust + * the page contents to be up-to-date + */ + if (in_swap_cache(pte_page(from))) { + if (!(from_area->vm_flags & VM_SHARED)) + return 0; + *from_table = pte_mkdirty(from); + delete_from_swap_cache(pte_page(from)); + } + mem_map[MAP_NR(pte_page(from))]++; + *to_table = mk_pte(pte_page(from), to_area->vm_page_prot); +/* Check if we need to do anything at all to the 'from' field */ + if (!pte_write(from)) + return 1; + if (from_area->vm_flags & VM_SHARED) + return 1; +/* ok, need to mark it read-only, so invalidate any possible old TB entry */ + *from_table = pte_wrprotect(from); + invalidate(); + return 1; +} + +/* + * share_page() tries to find a process that could share a page with + * the current one. + * + * We first check if it is at all feasible by checking inode->i_count. + * It should be >1 if there are other tasks sharing this inode. + */ +static int share_page(struct vm_area_struct * area, unsigned long address, + int write_access, unsigned long newpage) +{ + struct inode * inode; + unsigned long offset; + unsigned long from_address; + unsigned long give_page; + struct vm_area_struct * mpnt; + + if (!area || !(inode = area->vm_inode) || inode->i_count < 2) + return 0; + /* do we need to copy or can we just share? */ + give_page = 0; + if (write_access && !(area->vm_flags & VM_SHARED)) { + if (!newpage) + return 0; + give_page = newpage; + } + offset = address - area->vm_start + area->vm_offset; + /* See if there is something in the VM we can share pages with. */ + /* Traverse the entire circular i_mmap list, except `area' itself. */ + for (mpnt = area->vm_next_share; mpnt != area; mpnt = mpnt->vm_next_share) { + /* must be same inode */ + if (mpnt->vm_inode != inode) { + printk("Aiee! Corrupt vm_area_struct i_mmap ring\n"); + break; + } + /* offsets must be mutually page-aligned */ + if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK) + continue; + /* the other area must actually cover the wanted page.. */ + from_address = offset + mpnt->vm_start - mpnt->vm_offset; + if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end) + continue; + /* .. 
NOW we can actually try to use the same physical page */ + if (!try_to_share(address, area, from_address, mpnt, give_page)) + continue; + /* free newpage if we never used it.. */ + if (give_page || !newpage) + return 1; + free_page(newpage); + return 1; + } + return 0; +} + +/* + * fill in an empty page-table if none exists. + */ +static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(tsk, address); + pmd = pmd_alloc(pgd, address); + if (!pmd) { + oom(tsk); + return NULL; + } + pte = pte_alloc(pmd, address); + if (!pte) { + oom(tsk); + return NULL; + } + return pte; +} + +static inline void do_swap_page(struct vm_area_struct * vma, unsigned long address, + pte_t * page_table, pte_t entry, int write_access) +{ + pte_t page; + + if (!vma->vm_ops || !vma->vm_ops->swapin) { + swap_in(vma, page_table, pte_val(entry), write_access); + return; + } + page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry)); + if (pte_val(*page_table) != pte_val(entry)) { + free_page(pte_page(page)); + return; + } + if (mem_map[MAP_NR(pte_page(page))] > 1 && !(vma->vm_flags & VM_SHARED)) + page = pte_wrprotect(page); + ++vma->vm_task->mm->rss; + ++vma->vm_task->mm->maj_flt; + *page_table = page; + return; +} + +/* + * do_no_page() tries to create a new page mapping. It aggressively + * tries to share with existing pages, but makes a separate copy if + * the "write_access" parameter is true in order to avoid the next + * page fault. + */ +void do_no_page(struct vm_area_struct * vma, unsigned long address, + int write_access) +{ + pte_t * page_table; + pte_t entry; + unsigned long page; + + page_table = get_empty_pgtable(vma->vm_task,address); + if (!page_table) + return; + entry = *page_table; + if (pte_present(entry)) + return; + if (!pte_none(entry)) { + do_swap_page(vma, address, page_table, entry, write_access); + return; + } + address &= PAGE_MASK; + if (!vma->vm_ops || !vma->vm_ops->nopage) { + ++vma->vm_task->mm->rss; + ++vma->vm_task->mm->min_flt; + get_empty_page(vma, page_table); + return; + } + page = __get_free_page(GFP_KERNEL); + if (share_page(vma, address, write_access, page)) { + ++vma->vm_task->mm->min_flt; + ++vma->vm_task->mm->rss; + return; + } + if (!page) { + oom(current); + put_page(page_table, BAD_PAGE); + return; + } + ++vma->vm_task->mm->maj_flt; + ++vma->vm_task->mm->rss; + /* + * The fourth argument is "no_share", which tells the low-level code + * to copy, not share the page even if sharing is possible. It's + * essentially an early COW detection + */ + page = vma->vm_ops->nopage(vma, address, page, + write_access && !(vma->vm_flags & VM_SHARED)); + if (share_page(vma, address, write_access, 0)) { + free_page(page); + return; + } + /* + * This silly early PAGE_DIRTY setting removes a race + * due to the bad i386 page protection. But it's valid + * for other architectures too. + * + * Note that if write_access is true, we either now have + * a exclusive copy of the page, or this is a shared mapping, + * so we can make it writable and dirty to avoid having to + * handle that later. 
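+	 *
+	 * Otherwise, if the physical page is shared (mem_map count above 1)
+	 * and the mapping is private, the pte is write-protected instead, so
+	 * a later write still faults into do_wp_page().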
+ */ + entry = mk_pte(page, vma->vm_page_prot); + if (write_access) { + entry = pte_mkwrite(pte_mkdirty(entry)); + } else if (mem_map[MAP_NR(page)] > 1 && !(vma->vm_flags & VM_SHARED)) + entry = pte_wrprotect(entry); + put_page(page_table, entry); +} + +/* + * The above separate functions for the no-page and wp-page + * cases will go away (they mostly do the same thing anyway), + * and we'll instead use only a general "handle_mm_fault()". + * + * These routines also need to handle stuff like marking pages dirty + * and/or accessed for architectures that don't do it in hardware (most + * RISC architectures). The early dirtying is also good on the i386. + * + * There is also a hook called "update_mmu_cache()" that architectures + * with external mmu caches can use to update those (ie the Sparc or + * PowerPC hashed page tables that act as extended TLBs). + */ +static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address, + int write_access, pte_t * pte) +{ + if (!pte_present(*pte)) { + do_no_page(vma, address, write_access); + return; + } + *pte = pte_mkyoung(*pte); + if (!write_access) + return; + if (pte_write(*pte)) { + *pte = pte_mkdirty(*pte); + return; + } + do_wp_page(vma, address, write_access); +} + +void handle_mm_fault(struct vm_area_struct * vma, unsigned long address, + int write_access) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(vma->vm_task, address); + pmd = pmd_alloc(pgd, address); + if (!pmd) + goto no_memory; + pte = pte_alloc(pmd, address); + if (!pte) + goto no_memory; + handle_pte_fault(vma, address, write_access, pte); + update_mmu_cache(vma, address, *pte); + return; +no_memory: + oom(vma->vm_task); +} diff --git a/mm/mmap.c b/mm/mmap.c new file mode 100644 index 000000000..3253a06c0 --- /dev/null +++ b/mm/mmap.c @@ -0,0 +1,980 @@ +/* + * linux/mm/mmap.c + * + * Written by obz. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static int anon_map(struct inode *, struct file *, struct vm_area_struct *); + +/* + * description of effects of mapping type and prot in current implementation. + * this is due to the limited x86 page protection hardware. The expected + * behavior is in parens: + * + * map_type prot + * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (yes) yes w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (copy) copy w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + */ + +pgprot_t protection_map[16] = { + __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, + __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 +}; + +unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, unsigned long off) +{ + int error; + struct vm_area_struct * vma; + + if ((len = PAGE_ALIGN(len)) == 0) + return addr; + + if (addr > TASK_SIZE || len > TASK_SIZE || addr > TASK_SIZE-len) + return -EINVAL; + + /* offset overflow? */ + if (off + len < off) + return -EINVAL; + + /* + * do simple checking here so the lower-level routines won't have + * to. we assume access permissions have been handled by the open + * of the memory object, so we don't do any here. 
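+	 *
+	 * The checks below only look at how the descriptor was opened
+	 * (f_mode bit 0: open for reading, bit 1: open for writing):
+	 * MAP_SHARED with PROT_WRITE needs a writable descriptor, any file
+	 * mapping needs a readable one, and MAP_DENYWRITE fails with
+	 * ETXTBSY if the file is already open for writing somewhere.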
+ */ + + if (file != NULL) { + switch (flags & MAP_TYPE) { + case MAP_SHARED: + if ((prot & PROT_WRITE) && !(file->f_mode & 2)) + return -EACCES; + /* fall through */ + case MAP_PRIVATE: + if (!(file->f_mode & 1)) + return -EACCES; + break; + + default: + return -EINVAL; + } + if ((flags & MAP_DENYWRITE) && (file->f_inode->i_wcount > 0)) + return -ETXTBSY; + } else if ((flags & MAP_TYPE) != MAP_PRIVATE) + return -EINVAL; + + /* + * obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ + + if (flags & MAP_FIXED) { + if (addr & ~PAGE_MASK) + return -EINVAL; + if (len > TASK_SIZE || addr > TASK_SIZE - len) + return -EINVAL; + } else { + addr = get_unmapped_area(addr, len); + if (!addr) + return -ENOMEM; + } + + /* + * determine the object being mapped and call the appropriate + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. + */ + if (file && (!file->f_op || !file->f_op->mmap)) + return -ENODEV; + + vma = (struct vm_area_struct *)kmalloc(sizeof(struct vm_area_struct), + GFP_KERNEL); + if (!vma) + return -ENOMEM; + + vma->vm_task = current; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_flags = prot & (VM_READ | VM_WRITE | VM_EXEC); + vma->vm_flags |= flags & (VM_GROWSDOWN | VM_DENYWRITE | VM_EXECUTABLE); + + if (file) { + if (file->f_mode & 1) + vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + if (flags & MAP_SHARED) { + vma->vm_flags |= VM_SHARED | VM_MAYSHARE; + /* + * This looks strange, but when we don't have the file open + * for writing, we can demote the shared mapping to a simpler + * private mapping. That also takes care of a security hole + * with ptrace() writing to a shared mapping without write + * permissions. + * + * We leave the VM_MAYSHARE bit on, just to get correct output + * from /proc/xxx/maps.. + */ + if (!(file->f_mode & 2)) + vma->vm_flags &= ~(VM_MAYWRITE | VM_SHARED); + } + } else + vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f]; + vma->vm_ops = NULL; + vma->vm_offset = off; + vma->vm_inode = NULL; + vma->vm_pte = 0; + + do_munmap(addr, len); /* Clear old maps */ + + if (file) + error = file->f_op->mmap(file->f_inode, file, vma); + else + error = anon_map(NULL, NULL, vma); + + if (error) { + kfree(vma); + return error; + } + insert_vm_struct(current, vma); + merge_segments(current, vma->vm_start, vma->vm_end); + return addr; +} + +/* + * Get an address range which is currently unmapped. + * For mmap() without MAP_FIXED and shmat() with addr=0. + * Return value 0 means ENOMEM. 
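+ *
+ * The search is a simple first fit: with no hint it starts at
+ * TASK_SIZE/3, then walks the address-sorted vma list and returns the
+ * first hole of at least len bytes below TASK_SIZE.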
+ */ +unsigned long get_unmapped_area(unsigned long addr, unsigned long len) +{ + struct vm_area_struct * vmm; + + if (len > TASK_SIZE) + return 0; + if (!addr) + addr = TASK_SIZE / 3; + addr = PAGE_ALIGN(addr); + + for (vmm = current->mm->mmap; ; vmm = vmm->vm_next) { + if (TASK_SIZE - len < addr) + return 0; + if (!vmm) + return addr; + if (addr > vmm->vm_end) + continue; + if (addr + len > vmm->vm_start) { + addr = vmm->vm_end; + continue; + } + return addr; + } +} + +asmlinkage int sys_mmap(unsigned long *buffer) +{ + int error; + unsigned long flags; + struct file * file = NULL; + + error = verify_area(VERIFY_READ, buffer, 6*sizeof(long)); + if (error) + return error; + flags = get_fs_long(buffer+3); + if (!(flags & MAP_ANONYMOUS)) { + unsigned long fd = get_fs_long(buffer+4); + if (fd >= NR_OPEN || !(file = current->files->fd[fd])) + return -EBADF; + } + return do_mmap(file, get_fs_long(buffer), get_fs_long(buffer+1), + get_fs_long(buffer+2), flags, get_fs_long(buffer+5)); +} + + +/* + * Searching a VMA in the linear list task->mm->mmap is horribly slow. + * Use an AVL (Adelson-Velskii and Landis) tree to speed up this search + * from O(n) to O(log n), where n is the number of VMAs of the task + * (typically around 6, but may reach 3000 in some cases). + * Written by Bruno Haible . + */ + +/* We keep the list and tree sorted by address. */ +#define vm_avl_key vm_end +#define vm_avl_key_t unsigned long /* typeof(vma->avl_key) */ + +/* + * task->mm->mmap_avl is the AVL tree corresponding to task->mm->mmap + * or, more exactly, its root. + * A vm_area_struct has the following fields: + * vm_avl_left left son of a tree node + * vm_avl_right right son of a tree node + * vm_avl_height 1+max(heightof(left),heightof(right)) + * The empty tree is represented as NULL. + */ +#define avl_empty (struct vm_area_struct *) NULL + +/* Since the trees are balanced, their height will never be large. */ +#define avl_maxheight 41 /* why this? a small exercise */ +#define heightof(tree) ((tree) == avl_empty ? 0 : (tree)->vm_avl_height) +/* + * Consistency and balancing rules: + * 1. tree->vm_avl_height == 1+max(heightof(tree->vm_avl_left),heightof(tree->vm_avl_right)) + * 2. abs( heightof(tree->vm_avl_left) - heightof(tree->vm_avl_right) ) <= 1 + * 3. foreach node in tree->vm_avl_left: node->vm_avl_key <= tree->vm_avl_key, + * foreach node in tree->vm_avl_right: node->vm_avl_key >= tree->vm_avl_key. + */ + +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ +struct vm_area_struct * find_vma (struct task_struct * task, unsigned long addr) +{ +#if 0 /* equivalent, but slow */ + struct vm_area_struct * vma; + + for (vma = task->mm->mmap ; ; vma = vma->vm_next) { + if (!vma) + return NULL; + if (vma->vm_end > addr) + return vma; + } +#else + struct vm_area_struct * result = NULL; + struct vm_area_struct * tree; + + for (tree = task->mm->mmap_avl ; ; ) { + if (tree == avl_empty) + return result; + if (tree->vm_end > addr) { + if (tree->vm_start <= addr) + return tree; + result = tree; + tree = tree->vm_avl_left; + } else + tree = tree->vm_avl_right; + } +#endif +} + +/* Look up the first VMA which intersects the interval start_addr..end_addr-1, + NULL if none. Assume start_addr < end_addr. 
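+   Implemented via find_vma(): the first vma with vm_end > start_addr
+   intersects the interval exactly when its vm_start < end_addr.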
*/ +struct vm_area_struct * find_vma_intersection (struct task_struct * task, unsigned long start_addr, unsigned long end_addr) +{ + struct vm_area_struct * vma; + +#if 0 /* equivalent, but slow */ + for (vma = task->mm->mmap; vma; vma = vma->vm_next) { + if (end_addr <= vma->vm_start) + break; + if (start_addr < vma->vm_end) + return vma; + } + return NULL; +#else + vma = find_vma(task,start_addr); + if (!vma || end_addr <= vma->vm_start) + return NULL; + return vma; +#endif +} + +/* Look up the nodes at the left and at the right of a given node. */ +static void avl_neighbours (struct vm_area_struct * node, struct vm_area_struct * tree, struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right) +{ + vm_avl_key_t key = node->vm_avl_key; + + *to_the_left = *to_the_right = NULL; + for (;;) { + if (tree == avl_empty) { + printk("avl_neighbours: node not found in the tree\n"); + return; + } + if (key == tree->vm_avl_key) + break; + if (key < tree->vm_avl_key) { + *to_the_right = tree; + tree = tree->vm_avl_left; + } else { + *to_the_left = tree; + tree = tree->vm_avl_right; + } + } + if (tree != node) { + printk("avl_neighbours: node not exactly found in the tree\n"); + return; + } + if (tree->vm_avl_left != avl_empty) { + struct vm_area_struct * node; + for (node = tree->vm_avl_left; node->vm_avl_right != avl_empty; node = node->vm_avl_right) + continue; + *to_the_left = node; + } + if (tree->vm_avl_right != avl_empty) { + struct vm_area_struct * node; + for (node = tree->vm_avl_right; node->vm_avl_left != avl_empty; node = node->vm_avl_left) + continue; + *to_the_right = node; + } + if ((*to_the_left && ((*to_the_left)->vm_next != node)) || (node->vm_next != *to_the_right)) + printk("avl_neighbours: tree inconsistent with list\n"); +} + +/* + * Rebalance a tree. + * After inserting or deleting a node of a tree we have a sequence of subtrees + * nodes[0]..nodes[k-1] such that + * nodes[0] is the root and nodes[i+1] = nodes[i]->{vm_avl_left|vm_avl_right}. 
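+ * The function walks this path bottom-up; wherever the two subtree
+ * heights differ by more than one it applies the single or double
+ * rotation drawn below, and it stops as soon as a node's height comes
+ * out unchanged.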
+ */
+static void avl_rebalance (struct vm_area_struct *** nodeplaces_ptr, int count)
+{
+	for ( ; count > 0 ; count--) {
+		struct vm_area_struct ** nodeplace = *--nodeplaces_ptr;
+		struct vm_area_struct * node = *nodeplace;
+		struct vm_area_struct * nodeleft = node->vm_avl_left;
+		struct vm_area_struct * noderight = node->vm_avl_right;
+		int heightleft = heightof(nodeleft);
+		int heightright = heightof(noderight);
+		if (heightright + 1 < heightleft) {
+			/*                                                      */
+			/*                            *                         */
+			/*                          /   \                       */
+			/*                       n+2      n                     */
+			/*                                                      */
+			struct vm_area_struct * nodeleftleft = nodeleft->vm_avl_left;
+			struct vm_area_struct * nodeleftright = nodeleft->vm_avl_right;
+			int heightleftright = heightof(nodeleftright);
+			if (heightof(nodeleftleft) >= heightleftright) {
+				/*                                                        */
+				/*                *                      n+2|n+3          */
+				/*              /   \                    /     \          */
+				/*           n+2     n       -->        /    n+1|n+2      */
+				/*           / \                        |     /    \      */
+				/*         n+1  n|n+1                  n+1  n|n+1   n     */
+				/*                                                        */
+				node->vm_avl_left = nodeleftright; nodeleft->vm_avl_right = node;
+				nodeleft->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightleftright);
+				*nodeplace = nodeleft;
+			} else {
+				/*                                                        */
+				/*                *                         n+2           */
+				/*              /   \                      /   \          */
+				/*           n+2     n       -->         n+1     n+1      */
+				/*           / \                         / \     / \      */
+				/*          n   n+1                     n   L   R   n     */
+				/*              / \                                       */
+				/*             L   R                                      */
+				/*                                                        */
+				nodeleft->vm_avl_right = nodeleftright->vm_avl_left;
+				node->vm_avl_left = nodeleftright->vm_avl_right;
+				nodeleftright->vm_avl_left = nodeleft;
+				nodeleftright->vm_avl_right = node;
+				nodeleft->vm_avl_height = node->vm_avl_height = heightleftright;
+				nodeleftright->vm_avl_height = heightleft;
+				*nodeplace = nodeleftright;
+			}
+		}
+		else if (heightleft + 1 < heightright) {
+			/* similar to the above, just interchange 'left' <--> 'right' */
+			struct vm_area_struct * noderightright = noderight->vm_avl_right;
+			struct vm_area_struct * noderightleft = noderight->vm_avl_left;
+			int heightrightleft = heightof(noderightleft);
+			if (heightof(noderightright) >= heightrightleft) {
+				node->vm_avl_right = noderightleft; noderight->vm_avl_left = node;
+				noderight->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightrightleft);
+				*nodeplace = noderight;
+			} else {
+				noderight->vm_avl_left = noderightleft->vm_avl_right;
+				node->vm_avl_right = noderightleft->vm_avl_left;
+				noderightleft->vm_avl_right = noderight;
+				noderightleft->vm_avl_left = node;
+				noderight->vm_avl_height = node->vm_avl_height = heightrightleft;
+				noderightleft->vm_avl_height = heightright;
+				*nodeplace = noderightleft;
+			}
+		}
+		else {
+			int height = (heightleft<heightright ? heightright : heightleft) + 1;
+			if (height == node->vm_avl_height)
+				break;
+			node->vm_avl_height = height;
+		}
+	}
+}
+
+/* Insert a node into a tree. */
+static void avl_insert (struct vm_area_struct * new_node, struct vm_area_struct ** ptree)
+{
+	vm_avl_key_t key = new_node->vm_avl_key;
+	struct vm_area_struct ** nodeplace = ptree;
+	struct vm_area_struct ** stack[avl_maxheight];
+	int stack_count = 0;
+	struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */
+	for (;;) {
+		struct vm_area_struct * node = *nodeplace;
+		if (node == avl_empty)
+			break;
+		*stack_ptr++ = nodeplace; stack_count++;
+		if (key < node->vm_avl_key)
+			nodeplace = &node->vm_avl_left;
+		else
+			nodeplace = &node->vm_avl_right;
+	}
+	new_node->vm_avl_left = avl_empty;
+	new_node->vm_avl_right = avl_empty;
+	new_node->vm_avl_height = 1;
+	*nodeplace = new_node;
+	avl_rebalance(stack_ptr,stack_count);
+}
+
+/* Insert a node into a tree, and
+ * return the node to the left of it and the node to the right of it.
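avl_insert() and avl_rebalance() above never store parent pointers; each step of the descent remembers the address of the child pointer that was followed, so a later rotation can overwrite exactly that slot. A minimal standalone sketch of the same idea, with a hypothetical struct node and descend() that are not part of this patch:

struct node {
	struct node *left, *right;
	int key;
};

struct node **descend(struct node **slot, int key,
		      struct node ***stack, int *count)
{
	while (*slot) {
		stack[(*count)++] = slot;	/* remember the slot we came through */
		slot = key < (*slot)->key ? &(*slot)->left : &(*slot)->right;
	}
	return slot;	/* empty slot where the new node belongs */
}

avl_rebalance() then walks that stack backwards and writes the new subtree root through *nodeplace, which is why vm_area_struct needs no back pointer at all.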
+ */ +static void avl_insert_neighbours (struct vm_area_struct * new_node, struct vm_area_struct ** ptree, + struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right) +{ + vm_avl_key_t key = new_node->vm_avl_key; + struct vm_area_struct ** nodeplace = ptree; + struct vm_area_struct ** stack[avl_maxheight]; + int stack_count = 0; + struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */ + *to_the_left = *to_the_right = NULL; + for (;;) { + struct vm_area_struct * node = *nodeplace; + if (node == avl_empty) + break; + *stack_ptr++ = nodeplace; stack_count++; + if (key < node->vm_avl_key) { + *to_the_right = node; + nodeplace = &node->vm_avl_left; + } else { + *to_the_left = node; + nodeplace = &node->vm_avl_right; + } + } + new_node->vm_avl_left = avl_empty; + new_node->vm_avl_right = avl_empty; + new_node->vm_avl_height = 1; + *nodeplace = new_node; + avl_rebalance(stack_ptr,stack_count); +} + +/* Removes a node out of a tree. */ +static void avl_remove (struct vm_area_struct * node_to_delete, struct vm_area_struct ** ptree) +{ + vm_avl_key_t key = node_to_delete->vm_avl_key; + struct vm_area_struct ** nodeplace = ptree; + struct vm_area_struct ** stack[avl_maxheight]; + int stack_count = 0; + struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */ + struct vm_area_struct ** nodeplace_to_delete; + for (;;) { + struct vm_area_struct * node = *nodeplace; + if (node == avl_empty) { + /* what? node_to_delete not found in tree? */ + printk("avl_remove: node to delete not found in tree\n"); + return; + } + *stack_ptr++ = nodeplace; stack_count++; + if (key == node->vm_avl_key) + break; + if (key < node->vm_avl_key) + nodeplace = &node->vm_avl_left; + else + nodeplace = &node->vm_avl_right; + } + nodeplace_to_delete = nodeplace; + /* Have to remove node_to_delete = *nodeplace_to_delete. 
*/ + if (node_to_delete->vm_avl_left == avl_empty) { + *nodeplace_to_delete = node_to_delete->vm_avl_right; + stack_ptr--; stack_count--; + } else { + struct vm_area_struct *** stack_ptr_to_delete = stack_ptr; + struct vm_area_struct ** nodeplace = &node_to_delete->vm_avl_left; + struct vm_area_struct * node; + for (;;) { + node = *nodeplace; + if (node->vm_avl_right == avl_empty) + break; + *stack_ptr++ = nodeplace; stack_count++; + nodeplace = &node->vm_avl_right; + } + *nodeplace = node->vm_avl_left; + /* node replaces node_to_delete */ + node->vm_avl_left = node_to_delete->vm_avl_left; + node->vm_avl_right = node_to_delete->vm_avl_right; + node->vm_avl_height = node_to_delete->vm_avl_height; + *nodeplace_to_delete = node; /* replace node_to_delete */ + *stack_ptr_to_delete = &node->vm_avl_left; /* replace &node_to_delete->vm_avl_left */ + } + avl_rebalance(stack_ptr,stack_count); +} + +#ifdef DEBUG_AVL + +/* print a list */ +static void printk_list (struct vm_area_struct * vma) +{ + printk("["); + while (vma) { + printk("%08lX-%08lX", vma->vm_start, vma->vm_end); + vma = vma->vm_next; + if (!vma) + break; + printk(" "); + } + printk("]"); +} + +/* print a tree */ +static void printk_avl (struct vm_area_struct * tree) +{ + if (tree != avl_empty) { + printk("("); + if (tree->vm_avl_left != avl_empty) { + printk_avl(tree->vm_avl_left); + printk("<"); + } + printk("%08lX-%08lX", tree->vm_start, tree->vm_end); + if (tree->vm_avl_right != avl_empty) { + printk(">"); + printk_avl(tree->vm_avl_right); + } + printk(")"); + } +} + +static char *avl_check_point = "somewhere"; + +/* check a tree's consistency and balancing */ +static void avl_checkheights (struct vm_area_struct * tree) +{ + int h, hl, hr; + + if (tree == avl_empty) + return; + avl_checkheights(tree->vm_avl_left); + avl_checkheights(tree->vm_avl_right); + h = tree->vm_avl_height; + hl = heightof(tree->vm_avl_left); + hr = heightof(tree->vm_avl_right); + if ((h == hl+1) && (hr <= hl) && (hl <= hr+1)) + return; + if ((h == hr+1) && (hl <= hr) && (hr <= hl+1)) + return; + printk("%s: avl_checkheights: heights inconsistent\n",avl_check_point); +} + +/* check that all values stored in a tree are < key */ +static void avl_checkleft (struct vm_area_struct * tree, vm_avl_key_t key) +{ + if (tree == avl_empty) + return; + avl_checkleft(tree->vm_avl_left,key); + avl_checkleft(tree->vm_avl_right,key); + if (tree->vm_avl_key < key) + return; + printk("%s: avl_checkleft: left key %lu >= top key %lu\n",avl_check_point,tree->vm_avl_key,key); +} + +/* check that all values stored in a tree are > key */ +static void avl_checkright (struct vm_area_struct * tree, vm_avl_key_t key) +{ + if (tree == avl_empty) + return; + avl_checkright(tree->vm_avl_left,key); + avl_checkright(tree->vm_avl_right,key); + if (tree->vm_avl_key > key) + return; + printk("%s: avl_checkright: right key %lu <= top key %lu\n",avl_check_point,tree->vm_avl_key,key); +} + +/* check that all values are properly increasing */ +static void avl_checkorder (struct vm_area_struct * tree) +{ + if (tree == avl_empty) + return; + avl_checkorder(tree->vm_avl_left); + avl_checkorder(tree->vm_avl_right); + avl_checkleft(tree->vm_avl_left,tree->vm_avl_key); + avl_checkright(tree->vm_avl_right,tree->vm_avl_key); +} + +/* all checks */ +static void avl_check (struct task_struct * task, char *caller) +{ + avl_check_point = caller; +/* printk("task \"%s\", %s\n",task->comm,caller); */ +/* printk("task \"%s\" list: ",task->comm); printk_list(task->mm->mmap); printk("\n"); */ +/* printk("task 
\"%s\" tree: ",task->comm); printk_avl(task->mm->mmap_avl); printk("\n"); */ + avl_checkheights(task->mm->mmap_avl); + avl_checkorder(task->mm->mmap_avl); +} + +#endif + + +/* + * Normal function to fix up a mapping + * This function is the default for when an area has no specific + * function. This may be used as part of a more specific routine. + * This function works out what part of an area is affected and + * adjusts the mapping information. Since the actual page + * manipulation is done in do_mmap(), none need be done here, + * though it would probably be more appropriate. + * + * By the time this function is called, the area struct has been + * removed from the process mapping list, so it needs to be + * reinserted if necessary. + * + * The 4 main cases are: + * Unmapping the whole area + * Unmapping from the start of the segment to a point in it + * Unmapping from an intermediate point to the end + * Unmapping between to intermediate points, making a hole. + * + * Case 4 involves the creation of 2 new areas, for each side of + * the hole. + */ +void unmap_fixup(struct vm_area_struct *area, + unsigned long addr, size_t len) +{ + struct vm_area_struct *mpnt; + unsigned long end = addr + len; + + if (addr < area->vm_start || addr >= area->vm_end || + end <= area->vm_start || end > area->vm_end || + end < addr) + { + printk("unmap_fixup: area=%lx-%lx, unmap %lx-%lx!!\n", + area->vm_start, area->vm_end, addr, end); + return; + } + + /* Unmapping the whole area */ + if (addr == area->vm_start && end == area->vm_end) { + if (area->vm_ops && area->vm_ops->close) + area->vm_ops->close(area); + if (area->vm_inode) + iput(area->vm_inode); + return; + } + + /* Work out to one of the ends */ + if (end == area->vm_end) + area->vm_end = addr; + else + if (addr == area->vm_start) { + area->vm_offset += (end - area->vm_start); + area->vm_start = end; + } + else { + /* Unmapping a hole: area->vm_start < addr <= end < area->vm_end */ + /* Add end mapping -- leave beginning for below */ + mpnt = (struct vm_area_struct *)kmalloc(sizeof(*mpnt), GFP_KERNEL); + + if (!mpnt) + return; + *mpnt = *area; + mpnt->vm_offset += (end - area->vm_start); + mpnt->vm_start = end; + if (mpnt->vm_inode) + mpnt->vm_inode->i_count++; + if (mpnt->vm_ops && mpnt->vm_ops->open) + mpnt->vm_ops->open(mpnt); + area->vm_end = addr; /* Truncate area */ + insert_vm_struct(current, mpnt); + } + + /* construct whatever mapping is needed */ + mpnt = (struct vm_area_struct *)kmalloc(sizeof(*mpnt), GFP_KERNEL); + if (!mpnt) + return; + *mpnt = *area; + if (mpnt->vm_ops && mpnt->vm_ops->open) + mpnt->vm_ops->open(mpnt); + if (area->vm_ops && area->vm_ops->close) { + area->vm_end = area->vm_start; + area->vm_ops->close(area); + } + insert_vm_struct(current, mpnt); +} + +asmlinkage int sys_munmap(unsigned long addr, size_t len) +{ + return do_munmap(addr, len); +} + +/* + * Munmap is split into 2 main parts -- this part which finds + * what needs doing, and the areas themselves, which do the + * work. This now handles partial unmappings. + * Jeremy Fitzhardine + */ +int do_munmap(unsigned long addr, size_t len) +{ + struct vm_area_struct *mpnt, *prev, *next, **npp, *free; + + if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr) + return -EINVAL; + + if ((len = PAGE_ALIGN(len)) == 0) + return 0; + + /* + * Check if this memory area is ok - put it on the temporary + * list if so.. The checks here are pretty simple -- + * every area affected in some way (by any overlap) is put + * on the list. 
If nothing is put on, nothing is affected.
+	 */
+	mpnt = find_vma(current, addr);
+	if (!mpnt)
+		return 0;
+	avl_neighbours(mpnt, current->mm->mmap_avl, &prev, &next);
+	/* we have prev->vm_next == mpnt && mpnt->vm_next = next */
+	/* and addr < mpnt->vm_end */
+
+	npp = (prev ? &prev->vm_next : &current->mm->mmap);
+	free = NULL;
+	for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) {
+		*npp = mpnt->vm_next;
+		mpnt->vm_next = free;
+		free = mpnt;
+		avl_remove(mpnt, &current->mm->mmap_avl);
+	}
+
+	if (free == NULL)
+		return 0;
+
+	/*
+	 * Ok - we have the memory areas we should free on the 'free' list,
+	 * so release them, and unmap the page range..
+	 * If the one of the segments is only being partially unmapped,
+	 * it will put new vm_area_struct(s) into the address space.
+	 */
+	while (free) {
+		unsigned long st, end;
+
+		mpnt = free;
+		free = free->vm_next;
+
+		remove_shared_vm_struct(mpnt);
+
+		st = addr < mpnt->vm_start ? mpnt->vm_start : addr;
+		end = addr+len;
+		end = end > mpnt->vm_end ? mpnt->vm_end : end;
+
+		if (mpnt->vm_ops && mpnt->vm_ops->unmap)
+			mpnt->vm_ops->unmap(mpnt, st, end-st);
+
+		unmap_fixup(mpnt, st, end-st);
+		kfree(mpnt);
+	}
+
+	unmap_page_range(addr, len);
+	return 0;
+}
+
+/* Build the AVL tree corresponding to the VMA list. */
+void build_mmap_avl(struct task_struct * task)
+{
+	struct vm_area_struct * vma;
+
+	task->mm->mmap_avl = NULL;
+	for (vma = task->mm->mmap; vma; vma = vma->vm_next)
+		avl_insert(vma, &task->mm->mmap_avl);
+}
+
+/* Release all mmaps. */
+void exit_mmap(struct task_struct * task)
+{
+	struct vm_area_struct * mpnt;
+
+	mpnt = task->mm->mmap;
+	task->mm->mmap = NULL;
+	task->mm->mmap_avl = NULL;
+	while (mpnt) {
+		struct vm_area_struct * next = mpnt->vm_next;
+		if (mpnt->vm_ops && mpnt->vm_ops->close)
+			mpnt->vm_ops->close(mpnt);
+		remove_shared_vm_struct(mpnt);
+		if (mpnt->vm_inode)
+			iput(mpnt->vm_inode);
+		kfree(mpnt);
+		mpnt = next;
+	}
+}
+
+/*
+ * Insert vm structure into process list sorted by address
+ * and into the inode's i_mmap ring.
+ */
+void insert_vm_struct(struct task_struct *t, struct vm_area_struct *vmp)
+{
+	struct vm_area_struct *share;
+	struct inode * inode;
+
+#if 0 /* equivalent, but slow */
+	struct vm_area_struct **p, *mpnt;
+
+	p = &t->mm->mmap;
+	while ((mpnt = *p) != NULL) {
+		if (mpnt->vm_start > vmp->vm_start)
+			break;
+		if (mpnt->vm_end > vmp->vm_start)
+			printk("insert_vm_struct: overlapping memory areas\n");
+		p = &mpnt->vm_next;
+	}
+	vmp->vm_next = mpnt;
+	*p = vmp;
+#else
+	struct vm_area_struct * prev, * next;
+
+	avl_insert_neighbours(vmp, &t->mm->mmap_avl, &prev, &next);
+	if ((prev ? prev->vm_next : t->mm->mmap) != next)
+		printk("insert_vm_struct: tree inconsistent with list\n");
+	if (prev)
+		prev->vm_next = vmp;
+	else
+		t->mm->mmap = vmp;
+	vmp->vm_next = next;
+#endif
+
+	inode = vmp->vm_inode;
+	if (!inode)
+		return;
+
+	/* insert vmp into inode's circular share list */
+	if ((share = inode->i_mmap)) {
+		vmp->vm_next_share = share->vm_next_share;
+		vmp->vm_next_share->vm_prev_share = vmp;
+		share->vm_next_share = vmp;
+		vmp->vm_prev_share = share;
+	} else
+		inode->i_mmap = vmp->vm_next_share = vmp->vm_prev_share = vmp;
+}
+
+/*
+ * Remove one vm structure from the inode's i_mmap ring.
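The four cases handled by unmap_fixup() above come down to how the unmapped interval [addr, addr+len) overlaps the existing area [vm_start, vm_end). A small sketch of that classification; the enum and classify() are invented names, not part of this patch:

enum unmap_case { WHOLE_AREA, TRIM_END, TRIM_START, PUNCH_HOLE };

enum unmap_case classify(unsigned long vm_start, unsigned long vm_end,
			 unsigned long addr, unsigned long end)
{
	if (addr == vm_start && end == vm_end)
		return WHOLE_AREA;	/* the whole area goes away */
	if (end == vm_end)
		return TRIM_END;	/* cut the tail: vm_end becomes addr */
	if (addr == vm_start)
		return TRIM_START;	/* cut the head: vm_start becomes end, offset grows */
	return PUNCH_HOLE;		/* split into two areas around the hole */
}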
+ */ +void remove_shared_vm_struct(struct vm_area_struct *mpnt) +{ + struct inode * inode = mpnt->vm_inode; + + if (!inode) + return; + + if (mpnt->vm_next_share == mpnt) { + if (inode->i_mmap != mpnt) + printk("Inode i_mmap ring corrupted\n"); + inode->i_mmap = NULL; + return; + } + + if (inode->i_mmap == mpnt) + inode->i_mmap = mpnt->vm_next_share; + + mpnt->vm_prev_share->vm_next_share = mpnt->vm_next_share; + mpnt->vm_next_share->vm_prev_share = mpnt->vm_prev_share; +} + +/* + * Merge the list of memory segments if possible. + * Redundant vm_area_structs are freed. + * This assumes that the list is ordered by address. + * We don't need to traverse the entire list, only those segments + * which intersect or are adjacent to a given interval. + */ +void merge_segments (struct task_struct * task, unsigned long start_addr, unsigned long end_addr) +{ + struct vm_area_struct *prev, *mpnt, *next; + + mpnt = find_vma(task, start_addr); + if (!mpnt) + return; + avl_neighbours(mpnt, task->mm->mmap_avl, &prev, &next); + /* we have prev->vm_next == mpnt && mpnt->vm_next = next */ + + if (!prev) { + prev = mpnt; + mpnt = next; + } + + /* prev and mpnt cycle through the list, as long as + * start_addr < mpnt->vm_end && prev->vm_start < end_addr + */ + for ( ; mpnt && prev->vm_start < end_addr ; prev = mpnt, mpnt = next) { +#if 0 + printk("looping in merge_segments, mpnt=0x%lX\n", (unsigned long) mpnt); +#endif + + next = mpnt->vm_next; + + /* + * To share, we must have the same inode, operations.. + */ + if (mpnt->vm_inode != prev->vm_inode) + continue; + if (mpnt->vm_pte != prev->vm_pte) + continue; + if (mpnt->vm_ops != prev->vm_ops) + continue; + if (mpnt->vm_flags != prev->vm_flags) + continue; + if (prev->vm_end != mpnt->vm_start) + continue; + /* + * and if we have an inode, the offsets must be contiguous.. + */ + if ((mpnt->vm_inode != NULL) || (mpnt->vm_flags & VM_SHM)) { + if (prev->vm_offset + prev->vm_end - prev->vm_start != mpnt->vm_offset) + continue; + } + + /* + * merge prev with mpnt and set up pointers so the new + * big segment can possibly merge with the next one. + * The old unused mpnt is freed. + */ + avl_remove(mpnt, &task->mm->mmap_avl); + prev->vm_end = mpnt->vm_end; + prev->vm_next = mpnt->vm_next; + if (mpnt->vm_ops && mpnt->vm_ops->close) { + mpnt->vm_offset += mpnt->vm_end - mpnt->vm_start; + mpnt->vm_start = mpnt->vm_end; + mpnt->vm_ops->close(mpnt); + } + remove_shared_vm_struct(mpnt); + if (mpnt->vm_inode) + mpnt->vm_inode->i_count--; + kfree_s(mpnt, sizeof(*mpnt)); + mpnt = prev; + } +} + +/* + * Map memory not associated with any file into a process + * address space. Adjacent memory is merged. 
+ */ +static int anon_map(struct inode *ino, struct file * file, struct vm_area_struct * vma) +{ + if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot)) + return -ENOMEM; + return 0; +} diff --git a/mm/mprotect.c b/mm/mprotect.c new file mode 100644 index 000000000..ecf73730c --- /dev/null +++ b/mm/mprotect.c @@ -0,0 +1,251 @@ +/* + * linux/mm/mprotect.c + * + * (C) Copyright 1994 Linus Torvalds + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static inline void change_pte_range(pmd_t * pmd, unsigned long address, + unsigned long size, pgprot_t newprot) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*pmd)) + return; + if (pmd_bad(*pmd)) { + printk("change_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd)); + pmd_clear(pmd); + return; + } + pte = pte_offset(pmd, address); + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + pte_t entry = *pte; + if (pte_present(entry)) + *pte = pte_modify(entry, newprot); + address += PAGE_SIZE; + pte++; + } while (address < end); +} + +static inline void change_pmd_range(pgd_t * pgd, unsigned long address, + unsigned long size, pgprot_t newprot) +{ + pmd_t * pmd; + unsigned long end; + + if (pgd_none(*pgd)) + return; + if (pgd_bad(*pgd)) { + printk("change_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd)); + pgd_clear(pgd); + return; + } + pmd = pmd_offset(pgd, address); + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + change_pte_range(pmd, address, end - address, newprot); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); +} + +static void change_protection(unsigned long start, unsigned long end, pgprot_t newprot) +{ + pgd_t *dir; + + dir = pgd_offset(current, start); + while (start < end) { + change_pmd_range(dir, start, end - start, newprot); + start = (start + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } + invalidate(); + return; +} + +static inline int mprotect_fixup_all(struct vm_area_struct * vma, + int newflags, pgprot_t prot) +{ + vma->vm_flags = newflags; + vma->vm_page_prot = prot; + return 0; +} + +static inline int mprotect_fixup_start(struct vm_area_struct * vma, + unsigned long end, + int newflags, pgprot_t prot) +{ + struct vm_area_struct * n; + + n = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!n) + return -ENOMEM; + *n = *vma; + vma->vm_start = end; + n->vm_end = end; + vma->vm_offset += vma->vm_start - n->vm_start; + n->vm_flags = newflags; + n->vm_page_prot = prot; + if (n->vm_inode) + n->vm_inode->i_count++; + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + insert_vm_struct(current, n); + return 0; +} + +static inline int mprotect_fixup_end(struct vm_area_struct * vma, + unsigned long start, + int newflags, pgprot_t prot) +{ + struct vm_area_struct * n; + + n = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!n) + return -ENOMEM; + *n = *vma; + vma->vm_end = start; + n->vm_start = start; + n->vm_offset += n->vm_start - vma->vm_start; + n->vm_flags = newflags; + n->vm_page_prot = prot; + if (n->vm_inode) + n->vm_inode->i_count++; + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + insert_vm_struct(current, n); + return 0; +} + +static inline int mprotect_fixup_middle(struct vm_area_struct * vma, + unsigned long start, unsigned long end, + int newflags, pgprot_t prot) +{ + struct vm_area_struct * left, * right; + 
+ left = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!left) + return -ENOMEM; + right = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!right) { + kfree(left); + return -ENOMEM; + } + *left = *vma; + *right = *vma; + left->vm_end = start; + vma->vm_start = start; + vma->vm_end = end; + right->vm_start = end; + vma->vm_offset += vma->vm_start - left->vm_start; + right->vm_offset += right->vm_start - left->vm_start; + vma->vm_flags = newflags; + vma->vm_page_prot = prot; + if (vma->vm_inode) + vma->vm_inode->i_count += 2; + if (vma->vm_ops && vma->vm_ops->open) { + vma->vm_ops->open(left); + vma->vm_ops->open(right); + } + insert_vm_struct(current, left); + insert_vm_struct(current, right); + return 0; +} + +static int mprotect_fixup(struct vm_area_struct * vma, + unsigned long start, unsigned long end, unsigned int newflags) +{ + pgprot_t newprot; + int error; + + if (newflags == vma->vm_flags) + return 0; + newprot = protection_map[newflags & 0xf]; + if (start == vma->vm_start) + if (end == vma->vm_end) + error = mprotect_fixup_all(vma, newflags, newprot); + else + error = mprotect_fixup_start(vma, end, newflags, newprot); + else if (end == vma->vm_end) + error = mprotect_fixup_end(vma, start, newflags, newprot); + else + error = mprotect_fixup_middle(vma, start, end, newflags, newprot); + + if (error) + return error; + + change_protection(start, end, newprot); + return 0; +} + +asmlinkage int sys_mprotect(unsigned long start, size_t len, unsigned long prot) +{ + unsigned long nstart, end, tmp; + struct vm_area_struct * vma, * next; + int error; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len + ~PAGE_MASK) & PAGE_MASK; + end = start + len; + if (end < start) + return -EINVAL; + if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) + return -EINVAL; + if (end == start) + return 0; + vma = find_vma(current, start); + if (!vma || vma->vm_start > start) + return -EFAULT; + + for (nstart = start ; ; ) { + unsigned int newflags; + + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + + newflags = prot | (vma->vm_flags & ~(PROT_READ | PROT_WRITE | PROT_EXEC)); + if ((newflags & ~(newflags >> 4)) & 0xf) { + error = -EACCES; + break; + } + + if (vma->vm_end >= end) { + error = mprotect_fixup(vma, nstart, end, newflags); + break; + } + + tmp = vma->vm_end; + next = vma->vm_next; + error = mprotect_fixup(vma, nstart, tmp, newflags); + if (error) + break; + nstart = tmp; + vma = next; + if (!vma || vma->vm_start != nstart) { + error = -EFAULT; + break; + } + } + merge_segments(current, start, end); + return error; +} diff --git a/mm/swap.c b/mm/swap.c new file mode 100644 index 000000000..2906df9c2 --- /dev/null +++ b/mm/swap.c @@ -0,0 +1,1231 @@ +/* + * linux/mm/swap.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * This file should contain most things doing the swapping from/to disk. 
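Back in sys_mprotect() above, the test ((newflags & ~(newflags >> 4)) & 0xf) is the permission check: the low four bits of vm_flags are the VM_READ/VM_WRITE/VM_EXEC/VM_SHARED bits being requested, and the next four are the matching VM_MAY* bits recording what the mapping may ever be granted, so shifting right by four lines the two groups up. A sketch of the same test with an invented helper name, not part of this patch:

int prot_allowed(unsigned int newflags)
{
	unsigned int wanted  = newflags & 0xf;		/* VM_READ/WRITE/EXEC/SHARED asked for */
	unsigned int allowed = (newflags >> 4) & 0xf;	/* corresponding VM_MAY* bits of the vma */

	return (wanted & ~allowed) == 0;	/* nonzero iff every wanted bit is also allowed */
}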
+ * Started 18.12.91 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include /* for cli()/sti() */ +#include +#include + +#define MAX_SWAPFILES 8 + +#define SWP_USED 1 +#define SWP_WRITEOK 3 + +int min_free_pages = 20; + +static int nr_swapfiles = 0; +static struct wait_queue * lock_queue = NULL; + +static struct swap_info_struct { + unsigned long flags; + struct inode * swap_file; + unsigned int swap_device; + unsigned char * swap_map; + unsigned char * swap_lockmap; + int pages; + int lowest_bit; + int highest_bit; + unsigned long max; +} swap_info[MAX_SWAPFILES]; + +extern int shm_swap (int); + +unsigned long *swap_cache; + +#ifdef SWAP_CACHE_INFO +unsigned long swap_cache_add_total = 0; +unsigned long swap_cache_add_success = 0; +unsigned long swap_cache_del_total = 0; +unsigned long swap_cache_del_success = 0; +unsigned long swap_cache_find_total = 0; +unsigned long swap_cache_find_success = 0; + +extern inline void show_swap_cache_info(void) +{ + printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n", + swap_cache_add_total, swap_cache_add_success, + swap_cache_del_total, swap_cache_del_success, + swap_cache_find_total, swap_cache_find_success); +} +#endif + +static int add_to_swap_cache(unsigned long addr, unsigned long entry) +{ + struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)]; + +#ifdef SWAP_CACHE_INFO + swap_cache_add_total++; +#endif + if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { + entry = (unsigned long) xchg_ptr(swap_cache + MAP_NR(addr), (void *) entry); + if (entry) { + printk("swap_cache: replacing non-NULL entry\n"); + } +#ifdef SWAP_CACHE_INFO + swap_cache_add_success++; +#endif + return 1; + } + return 0; +} + +static unsigned long init_swap_cache(unsigned long mem_start, + unsigned long mem_end) +{ + unsigned long swap_cache_size; + + mem_start = (mem_start + 15) & ~15; + swap_cache = (unsigned long *) mem_start; + swap_cache_size = MAP_NR(mem_end); + memset(swap_cache, 0, swap_cache_size * sizeof (unsigned long)); + return (unsigned long) (swap_cache + swap_cache_size); +} + +void rw_swap_page(int rw, unsigned long entry, char * buf) +{ + unsigned long type, offset; + struct swap_info_struct * p; + + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) { + printk("Internal error: bad swap-device\n"); + return; + } + p = &swap_info[type]; + offset = SWP_OFFSET(entry); + if (offset >= p->max) { + printk("rw_swap_page: weirdness\n"); + return; + } + if (p->swap_map && !p->swap_map[offset]) { + printk("Hmm.. Trying to use unallocated swap (%08lx)\n", entry); + return; + } + if (!(p->flags & SWP_USED)) { + printk("Trying to swap to unused swap-device\n"); + return; + } + while (set_bit(offset,p->swap_lockmap)) + sleep_on(&lock_queue); + if (rw == READ) + kstat.pswpin++; + else + kstat.pswpout++; + if (p->swap_device) { + ll_rw_page(rw,p->swap_device,offset,buf); + } else if (p->swap_file) { + struct inode *swapf = p->swap_file; + unsigned int zones[PAGE_SIZE/512]; + int i; + if (swapf->i_op->bmap == NULL + && swapf->i_op->smap != NULL){ + /* + With MsDOS, we use msdos_smap which return + a sector number (not a cluster or block number). + It is a patch to enable the UMSDOS project. + Other people are working on better solution. + + It sounds like ll_rw_swap_file defined + it operation size (sector size) based on + PAGE_SIZE and the number of block to read. + So using bmap or smap should work even if + smap will require more blocks. 
+ */ + int j; + unsigned int block = offset << 3; + + for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){ + if (!(zones[i] = swapf->i_op->smap(swapf,block++))) { + printk("rw_swap_page: bad swap file\n"); + return; + } + } + }else{ + int j; + unsigned int block = offset + << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits); + + for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize) + if (!(zones[i] = bmap(swapf,block++))) { + printk("rw_swap_page: bad swap file\n"); + return; + } + } + ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf); + } else + printk("re_swap_page: no swap file or device\n"); + if (offset && !clear_bit(offset,p->swap_lockmap)) + printk("rw_swap_page: lock already cleared\n"); + wake_up(&lock_queue); +} + +unsigned long get_swap_page(void) +{ + struct swap_info_struct * p; + unsigned long offset, type; + + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) { + if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK) + continue; + for (offset = p->lowest_bit; offset <= p->highest_bit ; offset++) { + if (p->swap_map[offset]) + continue; + if (test_bit(offset, p->swap_lockmap)) + continue; + p->swap_map[offset] = 1; + nr_swap_pages--; + if (offset == p->highest_bit) + p->highest_bit--; + p->lowest_bit = offset; + return SWP_ENTRY(type,offset); + } + } + return 0; +} + +void swap_duplicate(unsigned long entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + + if (!entry) + return; + offset = SWP_OFFSET(entry); + type = SWP_TYPE(entry); + if (type == SHM_SWP_TYPE) + return; + if (type >= nr_swapfiles) { + printk("Trying to duplicate nonexistent swap-page\n"); + return; + } + p = type + swap_info; + if (offset >= p->max) { + printk("swap_duplicate: weirdness\n"); + return; + } + if (!p->swap_map[offset]) { + printk("swap_duplicate: trying to duplicate unused page\n"); + return; + } + p->swap_map[offset]++; + return; +} + +void swap_free(unsigned long entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + + if (!entry) + return; + type = SWP_TYPE(entry); + if (type == SHM_SWP_TYPE) + return; + if (type >= nr_swapfiles) { + printk("Trying to free nonexistent swap-page\n"); + return; + } + p = & swap_info[type]; + offset = SWP_OFFSET(entry); + if (offset >= p->max) { + printk("swap_free: weirdness\n"); + return; + } + if (!(p->flags & SWP_USED)) { + printk("Trying to free swap from unused swap-device\n"); + return; + } + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + if (!p->swap_map[offset]) + printk("swap_free: swap-space map bad (entry %08lx)\n",entry); + else + if (!--p->swap_map[offset]) + nr_swap_pages++; +} + +/* + * The tests may look silly, but it essentially makes sure that + * no other process did a swap-in on us just as we were waiting. + * + * Also, don't bother to add to the swap cache if this page-in + * was due to a write access. 
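The swap entries handed around by get_swap_page(), swap_duplicate() and swap_free() above pack a swap file index (SWP_TYPE) and a page offset within it (SWP_OFFSET) into one unsigned long, keeping bit 0 clear so a swapped-out entry can never be mistaken for a present pte. The exact layout is architecture specific; the sketch below only assumes the illustrative split used on i386-like configurations, and mk_entry() is an invented name, not part of this patch:

#include <stdio.h>

/* assumed layout: bit 0 clear, bits 1-7 name the swap file, bits 8+ the offset */
unsigned long mk_entry(unsigned long type, unsigned long offset)
{
	return (type << 1) | (offset << 8);
}

int main(void)
{
	unsigned long entry = mk_entry(2, 1234);

	printf("type=%lu offset=%lu\n",
	       (entry >> 1) & 0x7f,	/* what SWP_TYPE() would extract */
	       entry >> 8);		/* what SWP_OFFSET() would extract */
	return 0;
}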
+ */ +void swap_in(struct vm_area_struct * vma, pte_t * page_table, + unsigned long entry, int write_access) +{ + unsigned long page = get_free_page(GFP_KERNEL); + + if (pte_val(*page_table) != entry) { + free_page(page); + return; + } + if (!page) { + *page_table = BAD_PAGE; + swap_free(entry); + oom(current); + return; + } + read_swap_page(entry, (char *) page); + if (pte_val(*page_table) != entry) { + free_page(page); + return; + } + vma->vm_task->mm->rss++; + vma->vm_task->mm->maj_flt++; + if (!write_access && add_to_swap_cache(page, entry)) { + *page_table = mk_pte(page, vma->vm_page_prot); + return; + } + *page_table = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + swap_free(entry); + return; +} + +/* + * The swap-out functions return 1 if they successfully + * threw something out, and we got a free page. It returns + * zero if it couldn't do anything, and any other value + * indicates it decreased rss, but the page was shared. + * + * NOTE! If it sleeps, it *must* return 1 to make sure we + * don't continue with the swap-out. Otherwise we may be + * using a process that no longer actually exists (it might + * have died while we slept). + */ +static inline int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table) +{ + pte_t pte; + unsigned long entry; + unsigned long page; + + pte = *page_table; + if (!pte_present(pte)) + return 0; + page = pte_page(pte); + if (page >= high_memory) + return 0; + if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED) + return 0; + if ((pte_dirty(pte) && delete_from_swap_cache(page)) || pte_young(pte)) { + *page_table = pte_mkold(pte); + return 0; + } + if (pte_dirty(pte)) { + if (mem_map[MAP_NR(page)] != 1) + return 0; + if (vma->vm_ops && vma->vm_ops->swapout) { + vma->vm_task->mm->rss--; + vma->vm_ops->swapout(vma, address-vma->vm_start, page_table); + } else { + if (!(entry = get_swap_page())) + return 0; + vma->vm_task->mm->rss--; + pte_val(*page_table) = entry; + invalidate(); + write_swap_page(entry, (char *) page); + } + free_page(page); + return 1; /* we slept: the process may not exist any more */ + } + if ((entry = find_in_swap_cache(page))) { + if (mem_map[MAP_NR(page)] != 1) { + *page_table = pte_mkdirty(pte); + printk("Aiee.. duplicated cached swap-cache entry\n"); + return 0; + } + vma->vm_task->mm->rss--; + pte_val(*page_table) = entry; + invalidate(); + free_page(page); + return 1; + } + vma->vm_task->mm->rss--; + pte_clear(page_table); + invalidate(); + entry = mem_map[MAP_NR(page)]; + free_page(page); + return entry; +} + +/* + * A new implementation of swap_out(). We do not swap complete processes, + * but only a small number of blocks, before we continue with the next + * process. The number of blocks actually swapped is determined on the + * number of page faults, that this process actually had in the last time, + * so we won't swap heavily used processes all the time ... + * + * Note: the priority argument is a hint on much CPU to waste with the + * swap block search, not a hint, of how much blocks to swap with + * each process. 
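The number of pages taken from one process per pass is derived below from its recent major-fault rate: frequent faulters lose only SWAP_MIN pages at a time, quiet processes up to SWAP_MAX, and everything in between gets SWAP_RATIO divided by the decayed fault count. A simplified sketch of that quota; swap_quota() and recent_faults are invented names, and the real code keeps the decayed count in p->mm->dec_flt:

#define SKETCH_SWAP_MIN		4
#define SKETCH_SWAP_MAX		32
#define SKETCH_SWAP_RATIO	128

int swap_quota(unsigned long recent_faults)	/* decayed major-fault count */
{
	if (recent_faults >= SKETCH_SWAP_RATIO / SKETCH_SWAP_MIN)
		return SKETCH_SWAP_MIN;		/* heavy faulter: take as little as possible */
	if (recent_faults <= SKETCH_SWAP_RATIO / SKETCH_SWAP_MAX)
		return SKETCH_SWAP_MAX;		/* quiet process: take the full quota */
	return SKETCH_SWAP_RATIO / recent_faults;
}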
+ * + * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de + */ + +/* + * These are the minimum and maximum number of pages to swap from one process, + * before proceeding to the next: + */ +#define SWAP_MIN 4 +#define SWAP_MAX 32 + +/* + * The actual number of pages to swap is determined as: + * SWAP_RATIO / (number of recent major page faults) + */ +#define SWAP_RATIO 128 + +static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, + unsigned long address, unsigned long end) +{ + pte_t * pte; + unsigned long pmd_end; + + if (pmd_none(*dir)) + return 0; + if (pmd_bad(*dir)) { + printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir)); + pmd_clear(dir); + return 0; + } + + pte = pte_offset(dir, address); + + pmd_end = (address + PMD_SIZE) & PMD_MASK; + if (end > pmd_end) + end = pmd_end; + + do { + int result; + vma->vm_task->mm->swap_address = address + PAGE_SIZE; + result = try_to_swap_out(vma, address, pte); + if (result) + return result; + address += PAGE_SIZE; + pte++; + } while (address < end); + return 0; +} + +static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, + unsigned long address, unsigned long end) +{ + pmd_t * pmd; + unsigned long pgd_end; + + if (pgd_none(*dir)) + return 0; + if (pgd_bad(*dir)) { + printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir)); + pgd_clear(dir); + return 0; + } + + pmd = pmd_offset(dir, address); + + pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK; + if (end > pgd_end) + end = pgd_end; + + do { + int result = swap_out_pmd(vma, pmd, address, end); + if (result) + return result; + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); + return 0; +} + +static int swap_out_vma(struct vm_area_struct * vma, pgd_t *pgdir, + unsigned long start) +{ + unsigned long end; + + /* Don't swap out areas like shared memory which have their + own separate swapping mechanism. */ + if (vma->vm_flags & VM_SHM) + return 0; + + end = vma->vm_end; + while (start < end) { + int result = swap_out_pgd(vma, pgdir, start, end); + if (result) + return result; + start = (start + PGDIR_SIZE) & PGDIR_MASK; + pgdir++; + } + return 0; +} + +static int swap_out_process(struct task_struct * p) +{ + unsigned long address; + struct vm_area_struct* vma; + + /* + * Go through process' page directory. + */ + address = p->mm->swap_address; + p->mm->swap_address = 0; + + /* + * Find the proper vm-area + */ + vma = find_vma(p, address); + if (!vma) + return 0; + if (address < vma->vm_start) + address = vma->vm_start; + + for (;;) { + int result = swap_out_vma(vma, pgd_offset(p, address), address); + if (result) + return result; + vma = vma->vm_next; + if (!vma) + break; + address = vma->vm_start; + } + p->mm->swap_address = 0; + return 0; +} + +static int swap_out(unsigned int priority) +{ + static int swap_task; + int loop, counter; + struct task_struct *p; + + counter = 6*nr_tasks >> priority; + for(; counter >= 0; counter--) { + /* + * Check that swap_task is suitable for swapping. If not, look for + * the next suitable process. + */ + loop = 0; + while(1) { + if (swap_task >= NR_TASKS) { + swap_task = 1; + if (loop) + /* all processes are unswappable or already swapped out */ + return 0; + loop = 1; + } + + p = task[swap_task]; + if (p && p->mm->swappable && p->mm->rss) + break; + + swap_task++; + } + + /* + * Determine the number of pages to swap from this process. 
+ */ + if (!p->mm->swap_cnt) { + p->mm->dec_flt = (p->mm->dec_flt * 3) / 4 + p->mm->maj_flt - p->mm->old_maj_flt; + p->mm->old_maj_flt = p->mm->maj_flt; + + if (p->mm->dec_flt >= SWAP_RATIO / SWAP_MIN) { + p->mm->dec_flt = SWAP_RATIO / SWAP_MIN; + p->mm->swap_cnt = SWAP_MIN; + } else if (p->mm->dec_flt <= SWAP_RATIO / SWAP_MAX) + p->mm->swap_cnt = SWAP_MAX; + else + p->mm->swap_cnt = SWAP_RATIO / p->mm->dec_flt; + } + if (!--p->mm->swap_cnt) + swap_task++; + switch (swap_out_process(p)) { + case 0: + if (p->mm->swap_cnt) + swap_task++; + break; + case 1: + return 1; + default: + break; + } + } + return 0; +} + +/* + * we keep on shrinking one resource until it's considered "too hard", + * and then switch to the next one (priority being an indication on how + * hard we should try with the resource). + * + * This should automatically find the resource that can most easily be + * free'd, so hopefully we'll get reasonable behaviour even under very + * different circumstances. + */ +static int try_to_free_page(int priority) +{ + static int state = 0; + int i=6; + + switch (state) { + do { + case 0: + if (priority != GFP_NOBUFFER && shrink_buffers(i)) + return 1; + state = 1; + case 1: + if (shm_swap(i)) + return 1; + state = 2; + default: + if (swap_out(i)) + return 1; + state = 0; + } while(i--); + } + return 0; +} + +static inline void add_mem_queue(struct mem_list * head, struct mem_list * entry) +{ + entry->prev = head; + (entry->next = head->next)->prev = entry; + head->next = entry; +} + +static inline void remove_mem_queue(struct mem_list * head, struct mem_list * entry) +{ + entry->next->prev = entry->prev; + entry->prev->next = entry->next; +} + +/* + * Free_page() adds the page to the free lists. This is optimized for + * fast normal cases (no error jumps taken normally). + * + * The way to optimize jumps for gcc-2.2.2 is to: + * - select the "normal" case and put it inside the if () { XXX } + * - no else-statements if you can avoid them + * + * With the above two rules, you get a straight-line execution path + * for the normal case, giving better asm-code. + * + * free_page() may sleep since the page being freed may be a buffer + * page or present in the swap cache. It will not sleep, however, + * for a freshly allocated page (get_free_page()). + */ + +/* + * Buddy system. Hairy. 
You really aren't expected to understand this + */ +static inline void free_pages_ok(unsigned long addr, unsigned long order) +{ + unsigned long index = MAP_NR(addr) >> (1 + order); + unsigned long mask = PAGE_MASK << order; + + addr &= mask; + nr_free_pages += 1 << order; + while (order < NR_MEM_LISTS-1) { + if (!change_bit(index, free_area_map[order])) + break; + remove_mem_queue(free_area_list+order, (struct mem_list *) (addr ^ (1+~mask))); + order++; + index >>= 1; + mask <<= 1; + addr &= mask; + } + add_mem_queue(free_area_list+order, (struct mem_list *) addr); +} + +static inline void check_free_buffers(unsigned long addr) +{ + struct buffer_head * bh; + + bh = buffer_pages[MAP_NR(addr)]; + if (bh) { + struct buffer_head *tmp = bh; + do { + if (tmp->b_list == BUF_SHARED && tmp->b_dev != 0xffff) + refile_buffer(tmp); + tmp = tmp->b_this_page; + } while (tmp != bh); + } +} + +void free_pages(unsigned long addr, unsigned long order) +{ + if (addr < high_memory) { + unsigned long flag; + mem_map_t * map = mem_map + MAP_NR(addr); + if (*map) { + if (!(*map & MAP_PAGE_RESERVED)) { + save_flags(flag); + cli(); + if (!--*map) { + free_pages_ok(addr, order); + delete_from_swap_cache(addr); + } + restore_flags(flag); + if (*map == 1) + check_free_buffers(addr); + } + return; + } + printk("Trying to free free memory (%08lx): memory probably corrupted\n",addr); + printk("PC = %p\n", __builtin_return_address(0)); + return; + } +} + +/* + * Some ugly macros to speed up __get_free_pages().. + */ +#define RMQUEUE(order) \ +do { struct mem_list * queue = free_area_list+order; \ + unsigned long new_order = order; \ + do { struct mem_list *next = queue->next; \ + if (queue != next) { \ + (queue->next = next->next)->prev = queue; \ + mark_used((unsigned long) next, new_order); \ + nr_free_pages -= 1 << order; \ + restore_flags(flags); \ + EXPAND(next, order, new_order); \ + return (unsigned long) next; \ + } new_order++; queue++; \ + } while (new_order < NR_MEM_LISTS); \ +} while (0) + +static inline int mark_used(unsigned long addr, unsigned long order) +{ + return change_bit(MAP_NR(addr) >> (1+order), free_area_map[order]); +} + +#define EXPAND(addr,low,high) \ +do { unsigned long size = PAGE_SIZE << high; \ + while (high > low) { \ + high--; size >>= 1; cli(); \ + add_mem_queue(free_area_list+high, addr); \ + mark_used((unsigned long) addr, high); \ + restore_flags(flags); \ + addr = (struct mem_list *) (size + (unsigned long) addr); \ + } mem_map[MAP_NR((unsigned long) addr)] = 1; \ +} while (0) + +unsigned long __get_free_pages(int priority, unsigned long order) +{ + unsigned long flags; + int reserved_pages; + + if (intr_count && priority != GFP_ATOMIC) { + static int count = 0; + if (++count < 5) { + printk("gfp called nonatomically from interrupt %p\n", + __builtin_return_address(0)); + priority = GFP_ATOMIC; + } + } + reserved_pages = 5; + if (priority != GFP_NFS) + reserved_pages = min_free_pages; + save_flags(flags); +repeat: + cli(); + if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) { + RMQUEUE(order); + restore_flags(flags); + return 0; + } + restore_flags(flags); + if (priority != GFP_BUFFER && try_to_free_page(priority)) + goto repeat; + return 0; +} + +/* + * Yes, I know this is ugly. Don't tell me. 
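The expression (addr ^ (1+~mask)) in free_pages_ok() above is the buddy computation: 1+~mask is -mask, and for mask = PAGE_MASK << order that equals PAGE_SIZE << order, so the buddy of a 2^order-page block is found by flipping the one address bit that corresponds to the block size, while the toggled bit in free_area_map records whether exactly one of the two buddies is currently free. A standalone sketch with an assumed 4096-byte page, not part of this patch:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;		/* assumed PAGE_SIZE */
	unsigned long order = 2;		/* a block of 2^2 = 4 pages */
	unsigned long block = 0x00028000UL;	/* some block-aligned address */
	unsigned long buddy = block ^ (page_size << order);

	printf("buddy of %#lx at order %lu is %#lx\n", block, order, buddy);
	return 0;
}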
+ */ +unsigned long __get_dma_pages(int priority, unsigned long order) +{ + unsigned long list = 0; + unsigned long result; + unsigned long limit = MAX_DMA_ADDRESS; + + /* if (EISA_bus) limit = ~0UL; */ + if (priority != GFP_ATOMIC) + priority = GFP_BUFFER; + for (;;) { + result = __get_free_pages(priority, order); + if (result < limit) /* covers failure as well */ + break; + *(unsigned long *) result = list; + list = result; + } + while (list) { + unsigned long tmp = list; + list = *(unsigned long *) list; + free_pages(tmp, order); + } + return result; +} + +/* + * Show free area list (used inside shift_scroll-lock stuff) + * We also calculate the percentage fragmentation. We do this by counting the + * memory on each free list with the exception of the first item on the list. + */ +void show_free_areas(void) +{ + unsigned long order, flags; + unsigned long total = 0; + + printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10)); + save_flags(flags); + cli(); + for (order=0 ; order < NR_MEM_LISTS; order++) { + struct mem_list * tmp; + unsigned long nr = 0; + for (tmp = free_area_list[order].next ; tmp != free_area_list + order ; tmp = tmp->next) { + nr ++; + } + total += nr * ((PAGE_SIZE>>10) << order); + printk("%lu*%lukB ", nr, (PAGE_SIZE>>10) << order); + } + restore_flags(flags); + printk("= %lukB)\n", total); +#ifdef SWAP_CACHE_INFO + show_swap_cache_info(); +#endif +} + +/* + * Trying to stop swapping from a file is fraught with races, so + * we repeat quite a bit here when we have to pause. swapoff() + * isn't exactly timing-critical, so who cares (but this is /really/ + * inefficient, ugh). + * + * We return 1 after having slept, which makes the process start over + * from the beginning for this process.. + */ +static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address, + pte_t *dir, unsigned int type, unsigned long page) +{ + pte_t pte = *dir; + + if (pte_none(pte)) + return 0; + if (pte_present(pte)) { + unsigned long page = pte_page(pte); + if (page >= high_memory) + return 0; + if (!in_swap_cache(page)) + return 0; + if (SWP_TYPE(in_swap_cache(page)) != type) + return 0; + delete_from_swap_cache(page); + *dir = pte_mkdirty(pte); + return 0; + } + if (SWP_TYPE(pte_val(pte)) != type) + return 0; + read_swap_page(pte_val(pte), (char *) page); + if (pte_val(*dir) != pte_val(pte)) { + free_page(page); + return 1; + } + *dir = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + ++vma->vm_task->mm->rss; + swap_free(pte_val(pte)); + return 1; +} + +static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, + unsigned long address, unsigned long size, unsigned long offset, + unsigned int type, unsigned long page) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*dir)) + return 0; + if (pmd_bad(*dir)) { + printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir)); + pmd_clear(dir); + return 0; + } + pte = pte_offset(dir, address); + offset += address & PMD_MASK; + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + if (unuse_pte(vma, offset+address-vma->vm_start, pte, type, page)) + return 1; + address += PAGE_SIZE; + pte++; + } while (address < end); + return 0; +} + +static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, + unsigned long address, unsigned long size, + unsigned int type, unsigned long page) +{ + pmd_t * pmd; + unsigned long offset, end; + + if (pgd_none(*dir)) + return 0; + if (pgd_bad(*dir)) { + printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir)); + pgd_clear(dir); 
+ return 0; + } + pmd = pmd_offset(dir, address); + offset = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + if (unuse_pmd(vma, pmd, address, end - address, offset, type, page)) + return 1; + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); + return 0; +} + +static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, + unsigned long start, unsigned long end, + unsigned int type, unsigned long page) +{ + while (start < end) { + if (unuse_pgd(vma, pgdir, start, end - start, type, page)) + return 1; + start = (start + PGDIR_SIZE) & PGDIR_MASK; + pgdir++; + } + return 0; +} + +static int unuse_process(struct task_struct * p, unsigned int type, unsigned long page) +{ + struct vm_area_struct* vma; + + /* + * Go through process' page directory. + */ + vma = p->mm->mmap; + while (vma) { + pgd_t * pgd = pgd_offset(p, vma->vm_start); + if (unuse_vma(vma, pgd, vma->vm_start, vma->vm_end, type, page)) + return 1; + vma = vma->vm_next; + } + return 0; +} + +/* + * To avoid races, we repeat for each process after having + * swapped something in. That gets rid of a few pesky races, + * and "swapoff" isn't exactly timing critical. + */ +static int try_to_unuse(unsigned int type) +{ + int nr; + unsigned long page = get_free_page(GFP_KERNEL); + + if (!page) + return -ENOMEM; + nr = 0; + while (nr < NR_TASKS) { + if (task[nr]) { + if (unuse_process(task[nr], type, page)) { + page = get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + continue; + } + } + nr++; + } + free_page(page); + return 0; +} + +asmlinkage int sys_swapoff(const char * specialfile) +{ + struct swap_info_struct * p; + struct inode * inode; + unsigned int type; + struct file filp; + int i; + + if (!suser()) + return -EPERM; + i = namei(specialfile,&inode); + if (i) + return i; + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) { + if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK) + continue; + if (p->swap_file) { + if (p->swap_file == inode) + break; + } else { + if (!S_ISBLK(inode->i_mode)) + continue; + if (p->swap_device == inode->i_rdev) + break; + } + } + + if (type >= nr_swapfiles){ + iput(inode); + return -EINVAL; + } + p->flags = SWP_USED; + i = try_to_unuse(type); + if (i) { + iput(inode); + p->flags = SWP_WRITEOK; + return i; + } + + if(p->swap_device){ + memset(&filp, 0, sizeof(filp)); + filp.f_inode = inode; + filp.f_mode = 3; /* read write */ + /* open it again to get fops */ + if( !blkdev_open(inode, &filp) && + filp.f_op && filp.f_op->release){ + filp.f_op->release(inode,&filp); + filp.f_op->release(inode,&filp); + } + } + iput(inode); + + nr_swap_pages -= p->pages; + iput(p->swap_file); + p->swap_file = NULL; + p->swap_device = 0; + vfree(p->swap_map); + p->swap_map = NULL; + free_page((long) p->swap_lockmap); + p->swap_lockmap = NULL; + p->flags = 0; + return 0; +} + +/* + * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 
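sys_swapon() below expects the old mkswap layout: the first page of the swap area doubles as a bitmap of usable pages, with the string "SWAP-SPACE" stored in its last ten bytes as a signature, which is why the code reads page 0 into swap_lockmap, checks and wipes the signature, and then counts the set bits. A sketch of that layout assuming a 4096-byte page; the struct and helper names are invented, not part of this patch:

#include <string.h>

#define SKETCH_PAGE_SIZE 4096

struct swap_header_model {
	unsigned char bitmap[SKETCH_PAGE_SIZE - 10];	/* bit i set => page i is usable */
	char signature[10];				/* "SWAP-SPACE", not 0-terminated */
};

int looks_like_swap(const struct swap_header_model *h)
{
	return memcmp(h->signature, "SWAP-SPACE", 10) == 0;
}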
+ * + * The swapon system call + */ +asmlinkage int sys_swapon(const char * specialfile) +{ + struct swap_info_struct * p; + struct inode * swap_inode; + unsigned int type; + int i,j; + int error; + struct file filp; + + memset(&filp, 0, sizeof(filp)); + if (!suser()) + return -EPERM; + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) + if (!(p->flags & SWP_USED)) + break; + if (type >= MAX_SWAPFILES) + return -EPERM; + if (type >= nr_swapfiles) + nr_swapfiles = type+1; + p->flags = SWP_USED; + p->swap_file = NULL; + p->swap_device = 0; + p->swap_map = NULL; + p->swap_lockmap = NULL; + p->lowest_bit = 0; + p->highest_bit = 0; + p->max = 1; + error = namei(specialfile,&swap_inode); + if (error) + goto bad_swap_2; + p->swap_file = swap_inode; + error = -EBUSY; + if (swap_inode->i_count != 1) + goto bad_swap_2; + error = -EINVAL; + + if (S_ISBLK(swap_inode->i_mode)) { + p->swap_device = swap_inode->i_rdev; + + filp.f_inode = swap_inode; + filp.f_mode = 3; /* read write */ + error = blkdev_open(swap_inode, &filp); + p->swap_file = NULL; + iput(swap_inode); + if(error) + goto bad_swap_2; + error = -ENODEV; + if (!p->swap_device) + goto bad_swap; + error = -EBUSY; + for (i = 0 ; i < nr_swapfiles ; i++) { + if (i == type) + continue; + if (p->swap_device == swap_info[i].swap_device) + goto bad_swap; + } + } else if (!S_ISREG(swap_inode->i_mode)) + goto bad_swap; + p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER); + if (!p->swap_lockmap) { + printk("Unable to start swapping: out of memory :-)\n"); + error = -ENOMEM; + goto bad_swap; + } + read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap); + if (memcmp("SWAP-SPACE",p->swap_lockmap+PAGE_SIZE-10,10)) { + printk("Unable to find swap-space signature\n"); + error = -EINVAL; + goto bad_swap; + } + memset(p->swap_lockmap+PAGE_SIZE-10,0,10); + j = 0; + p->lowest_bit = 0; + p->highest_bit = 0; + for (i = 1 ; i < 8*PAGE_SIZE ; i++) { + if (test_bit(i,p->swap_lockmap)) { + if (!p->lowest_bit) + p->lowest_bit = i; + p->highest_bit = i; + p->max = i+1; + j++; + } + } + if (!j) { + printk("Empty swap-file\n"); + error = -EINVAL; + goto bad_swap; + } + p->swap_map = (unsigned char *) vmalloc(p->max); + if (!p->swap_map) { + error = -ENOMEM; + goto bad_swap; + } + for (i = 1 ; i < p->max ; i++) { + if (test_bit(i,p->swap_lockmap)) + p->swap_map[i] = 0; + else + p->swap_map[i] = 0x80; + } + p->swap_map[0] = 0x80; + memset(p->swap_lockmap,0,PAGE_SIZE); + p->flags = SWP_WRITEOK; + p->pages = j; + nr_swap_pages += j; + printk("Adding Swap: %dk swap-space\n",j<<(PAGE_SHIFT-10)); + return 0; +bad_swap: + if(filp.f_op && filp.f_op->release) + filp.f_op->release(filp.f_inode,&filp); +bad_swap_2: + free_page((long) p->swap_lockmap); + vfree(p->swap_map); + iput(p->swap_file); + p->swap_device = 0; + p->swap_file = NULL; + p->swap_map = NULL; + p->swap_lockmap = NULL; + p->flags = 0; + return error; +} + +void si_swapinfo(struct sysinfo *val) +{ + unsigned int i, j; + + val->freeswap = val->totalswap = 0; + for (i = 0; i < nr_swapfiles; i++) { + if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK) + continue; + for (j = 0; j < swap_info[i].max; ++j) + switch (swap_info[i].swap_map[j]) { + case 128: + continue; + case 0: + ++val->freeswap; + default: + ++val->totalswap; + } + } + val->freeswap <<= PAGE_SHIFT; + val->totalswap <<= PAGE_SHIFT; + return; +} + +/* + * set up the free-area data structures: + * - mark all pages MAP_PAGE_RESERVED + * - mark all memory queues empty + * - clear the memory bitmaps + */ +unsigned long 
free_area_init(unsigned long start_mem, unsigned long end_mem) +{ + mem_map_t * p; + unsigned long mask = PAGE_MASK; + int i; + + /* + * select nr of pages we try to keep free for important stuff + * with a minimum of 16 pages. This is totally arbitrary + */ + i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+6); + if (i < 16) + i = 16; + min_free_pages = i; + start_mem = init_swap_cache(start_mem, end_mem); + mem_map = (mem_map_t *) start_mem; + p = mem_map + MAP_NR(end_mem); + start_mem = (unsigned long) p; + while (p > mem_map) + *--p = MAP_PAGE_RESERVED; + + for (i = 0 ; i < NR_MEM_LISTS ; i++) { + unsigned long bitmap_size; + free_area_list[i].prev = free_area_list[i].next = &free_area_list[i]; + mask += mask; + end_mem = (end_mem + ~mask) & mask; + bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i); + bitmap_size = (bitmap_size + 7) >> 3; + bitmap_size = (bitmap_size + sizeof(unsigned long) - 1) & ~(sizeof(unsigned long)-1); + free_area_map[i] = (unsigned char *) start_mem; + memset((void *) start_mem, 0, bitmap_size); + start_mem += bitmap_size; + } + return start_mem; +} diff --git a/mm/vmalloc.c b/mm/vmalloc.c new file mode 100644 index 000000000..107be5546 --- /dev/null +++ b/mm/vmalloc.c @@ -0,0 +1,244 @@ +/* + * linux/mm/vmalloc.c + * + * Copyright (C) 1993 Linus Torvalds + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct vm_struct { + unsigned long flags; + void * addr; + unsigned long size; + struct vm_struct * next; +}; + +static struct vm_struct * vmlist = NULL; + +static inline void set_pgdir(unsigned long address, pgd_t entry) +{ + struct task_struct * p; + + for_each_task(p) + *pgd_offset(p,address) = entry; +} + +static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*pmd)) + return; + if (pmd_bad(*pmd)) { + printk("free_area_pte: bad pmd (%08lx)\n", pmd_val(*pmd)); + pmd_clear(pmd); + return; + } + pte = pte_offset(pmd, address); + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + while (address < end) { + pte_t page = *pte; + pte_clear(pte); + address += PAGE_SIZE; + pte++; + if (pte_none(page)) + continue; + if (pte_present(page)) { + free_page(pte_page(page)); + continue; + } + printk("Whee.. 
Swapped out page in kernel page table\n"); + } +} + +static inline void free_area_pmd(pgd_t * dir, unsigned long address, unsigned long size) +{ + pmd_t * pmd; + unsigned long end; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + printk("free_area_pmd: bad pgd (%08lx)\n", pgd_val(*dir)); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, address); + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + while (address < end) { + free_area_pte(pmd, address, end - address); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } +} + +static void free_area_pages(unsigned long address, unsigned long size) +{ + pgd_t * dir; + unsigned long end = address + size; + + dir = pgd_offset(&init_task, address); + while (address < end) { + free_area_pmd(dir, address, end - address); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } + invalidate(); +} + +static inline int alloc_area_pte(pte_t * pte, unsigned long address, unsigned long size) +{ + unsigned long end; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + while (address < end) { + unsigned long page; + if (!pte_none(*pte)) + printk("alloc_area_pte: page already exists\n"); + page = __get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + *pte = mk_pte(page, PAGE_KERNEL); + address += PAGE_SIZE; + pte++; + } + return 0; +} + +static inline int alloc_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size) +{ + unsigned long end; + + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + while (address < end) { + pte_t * pte = pte_alloc_kernel(pmd, address); + if (!pte) + return -ENOMEM; + if (alloc_area_pte(pte, address, end - address)) + return -ENOMEM; + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } + return 0; +} + +static int alloc_area_pages(unsigned long address, unsigned long size) +{ + pgd_t * dir; + unsigned long end = address + size; + + dir = pgd_offset(&init_task, address); + while (address < end) { + pmd_t *pmd = pmd_alloc_kernel(dir, address); + if (!pmd) + return -ENOMEM; + if (alloc_area_pmd(pmd, address, end - address)) + return -ENOMEM; + set_pgdir(address, *dir); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } + invalidate(); + return 0; +} + +void vfree(void * addr) +{ + struct vm_struct **p, *tmp; + + if (!addr) + return; + if ((PAGE_SIZE-1) & (unsigned long) addr) { + printk("Trying to vfree() bad address (%p)\n", addr); + return; + } + for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) { + if (tmp->addr == addr) { + *p = tmp->next; + free_area_pages(VMALLOC_VMADDR(tmp->addr), tmp->size); + kfree(tmp); + return; + } + } + printk("Trying to vfree() nonexistent vm area (%p)\n", addr); +} + +void * vmalloc(unsigned long size) +{ + void * addr; + struct vm_struct **p, *tmp, *area; + + size = PAGE_ALIGN(size); + if (!size || size > high_memory) + return NULL; + area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL); + if (!area) + return NULL; + addr = (void *) VMALLOC_START; + area->size = size + PAGE_SIZE; + area->next = NULL; + for (p = &vmlist; (tmp = *p) ; p = &tmp->next) { + if (size + (unsigned long) addr < (unsigned long) tmp->addr) + break; + addr = (void *) (tmp->size + (unsigned long) tmp->addr); + } + area->addr = addr; + area->next = *p; + *p = area; + if (alloc_area_pages(VMALLOC_VMADDR(addr), size)) { + vfree(addr); + return NULL; + } + return addr; +} + +int vread(char *buf, char *addr, int count) +{ + struct vm_struct **p, 
*tmp; + char *vaddr, *buf_start = buf; + int n; + + for (p = &vmlist; (tmp = *p) ; p = &tmp->next) { + vaddr = (char *) tmp->addr; + while (addr < vaddr) { + if (count == 0) + goto finished; + put_fs_byte('\0', buf++), addr++, count--; + } + n = tmp->size - PAGE_SIZE; + if (addr > vaddr) + n -= addr - vaddr; + while (--n >= 0) { + if (count == 0) + goto finished; + put_fs_byte(*addr++, buf++), count--; + } + } +finished: + return buf - buf_start; +} -- cgit v1.2.3
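vmalloc() above records area->size as size + PAGE_SIZE while alloc_area_pages() only maps size bytes, so every allocation is followed by one unmapped page and a stray access just past the end faults instead of silently running into the next area. A standalone sketch of the resulting address arithmetic; the VMALLOC_START value is made up for illustration and is not part of this patch:

#include <stdio.h>

int main(void)
{
	unsigned long page = 4096;			/* assumed PAGE_SIZE */
	unsigned long vmalloc_start = 0xc8000000UL;	/* hypothetical VMALLOC_START */
	unsigned long want = 3 * page;			/* a three-page vmalloc() */
	unsigned long recorded = want + page;		/* what ends up in area->size */

	printf("mapped:     %#lx-%#lx\n", vmalloc_start, vmalloc_start + want);
	printf("guard page: %#lx-%#lx (never mapped)\n",
	       vmalloc_start + want, vmalloc_start + recorded);
	printf("next area would start at %#lx\n", vmalloc_start + recorded);
	return 0;
}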