Diffstat (limited to 'arch/i386/mm')
-rw-r--r--   arch/i386/mm/Makefile   |   30
-rw-r--r--   arch/i386/mm/kmalloc.c  |  362
-rw-r--r--   arch/i386/mm/memory.c   | 1320
-rw-r--r--   arch/i386/mm/mmap.c     |  470
-rw-r--r--   arch/i386/mm/mprotect.c |  230
-rw-r--r--   arch/i386/mm/swap.c     | 1017
-rw-r--r--   arch/i386/mm/vmalloc.c  |  202
7 files changed, 3631 insertions, 0 deletions
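Before the diff itself: the kmalloc.c added below keeps a static table of size classes (sizes[]) and picks the smallest class whose block size fits the request plus its block_header. The following standalone sketch (not part of the commit) mirrors that lookup so it can be compiled and tried outside the kernel; the blocksize[] array copies the block sizes from sizes[] in the diff, while main() and the sample request sizes are illustrative only. Note that sizeof(struct block_header) is 8 bytes on the original i386 but 16 on a 64-bit host, so the exact class chosen for borderline sizes may differ from the 1994 kernel.

#include <stdio.h>

struct block_header {                   /* same layout idea as kmalloc.c */
	unsigned long bh_flags;
	union {
		unsigned long ubh_length;
		struct block_header *fbh_next;
	} vp;
};

/* block sizes of the size classes, copied from sizes[]; 0 ends the list */
static const int blocksize[] = {
	32, 64, 128, 252, 508, 1020, 2040,
	4096 - 16, 8192 - 16, 16384 - 16, 32768 - 16,
	65536 - 16, 131072 - 16, 0
};

/* index of the smallest class that fits size + header, or -1 if too large */
static int get_order(int size)
{
	int order;

	size += sizeof(struct block_header);    /* header lives inside the block */
	for (order = 0; blocksize[order]; order++)
		if (size <= blocksize[order])
			return order;
	return -1;
}

int main(void)
{
	int request[] = { 24, 100, 2040, 4000, 200000 };   /* illustrative sizes */
	int i;

	for (i = 0; i < (int)(sizeof(request) / sizeof(request[0])); i++)
		printf("kmalloc(%d) -> order %d\n", request[i], get_order(request[i]));
	return 0;
}

Running it shows, for example, that a 200000-byte request returns order -1, which is exactly the "kmalloc of too large a block" case handled in the diff.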
diff --git a/arch/i386/mm/Makefile b/arch/i386/mm/Makefile new file mode 100644 index 000000000..5063d60c2 --- /dev/null +++ b/arch/i386/mm/Makefile @@ -0,0 +1,30 @@ +# +# Makefile for the linux memory manager. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... + +.c.o: + $(CC) $(CFLAGS) -c $< +.s.o: + $(AS) -o $*.o $< +.c.s: + $(CC) $(CFLAGS) -S $< + +OBJS = memory.o swap.o mmap.o mprotect.o kmalloc.o vmalloc.o + +mm.o: $(OBJS) + $(LD) -r -o mm.o $(OBJS) + +dep: + $(CPP) -M *.c > .depend + +# +# include a dependency file if one exists +# +ifeq (.depend,$(wildcard .depend)) +include .depend +endif diff --git a/arch/i386/mm/kmalloc.c b/arch/i386/mm/kmalloc.c new file mode 100644 index 000000000..018f8db8f --- /dev/null +++ b/arch/i386/mm/kmalloc.c @@ -0,0 +1,362 @@ +/* + * linux/mm/kmalloc.c + * + * Copyright (C) 1991, 1992 Linus Torvalds & Roger Wolff. + * + * Written by R.E. Wolff Sept/Oct '93. + * + */ + +/* + * Modified by Alex Bligh (alex@cconcepts.co.uk) 4 Apr 1994 to use multiple + * pages. So for 'page' throughout, read 'area'. + */ + +#include <linux/mm.h> +#include <asm/system.h> +#include <linux/delay.h> + +#define GFP_LEVEL_MASK 0xf + +/* I want this low enough for a while to catch errors. + I want this number to be increased in the near future: + loadable device drivers should use this function to get memory */ + +#define MAX_KMALLOC_K ((PAGE_SIZE<<(NUM_AREA_ORDERS-1))>>10) + + +/* This defines how many times we should try to allocate a free page before + giving up. Normally this shouldn't happen at all. */ +#define MAX_GET_FREE_PAGE_TRIES 4 + + +/* Private flags. */ + +#define MF_USED 0xffaa0055 +#define MF_FREE 0x0055ffaa + + +/* + * Much care has gone into making these routines in this file reentrant. + * + * The fancy bookkeeping of nbytesmalloced and the like are only used to + * report them to the user (oooohhhhh, aaaaahhhhh....) are not + * protected by cli(). (If that goes wrong. So what?) + * + * These routines restore the interrupt status to allow calling with ints + * off. + */ + +/* + * A block header. This is in front of every malloc-block, whether free or not. + */ +struct block_header { + unsigned long bh_flags; + union { + unsigned long ubh_length; + struct block_header *fbh_next; + } vp; +}; + + +#define bh_length vp.ubh_length +#define bh_next vp.fbh_next +#define BH(p) ((struct block_header *)(p)) + + +/* + * The page descriptor is at the front of every page that malloc has in use. + */ +struct page_descriptor { + struct page_descriptor *next; + struct block_header *firstfree; + int order; + int nfree; +}; + + +#define PAGE_DESC(p) ((struct page_descriptor *)(((unsigned long)(p)) & PAGE_MASK)) + + +/* + * A size descriptor describes a specific class of malloc sizes. + * Each class of sizes has its own freelist. 
+ */ +struct size_descriptor { + struct page_descriptor *firstfree; + int size; + int nblocks; + + int nmallocs; + int nfrees; + int nbytesmalloced; + int npages; + unsigned long gfporder; /* number of pages in the area required */ +}; + +/* + * For now it is unsafe to allocate bucket sizes between n & n=16 where n is + * 4096 * any power of two + */ + +struct size_descriptor sizes[] = { + { NULL, 32,127, 0,0,0,0, 0}, + { NULL, 64, 63, 0,0,0,0, 0 }, + { NULL, 128, 31, 0,0,0,0, 0 }, + { NULL, 252, 16, 0,0,0,0, 0 }, + { NULL, 508, 8, 0,0,0,0, 0 }, + { NULL,1020, 4, 0,0,0,0, 0 }, + { NULL,2040, 2, 0,0,0,0, 0 }, + { NULL,4096-16, 1, 0,0,0,0, 0 }, + { NULL,8192-16, 1, 0,0,0,0, 1 }, + { NULL,16384-16, 1, 0,0,0,0, 2 }, + { NULL,32768-16, 1, 0,0,0,0, 3 }, + { NULL,65536-16, 1, 0,0,0,0, 4 }, + { NULL,131072-16, 1, 0,0,0,0, 5 }, + { NULL, 0, 0, 0,0,0,0, 0 } +}; + + +#define NBLOCKS(order) (sizes[order].nblocks) +#define BLOCKSIZE(order) (sizes[order].size) +#define AREASIZE(order) (PAGE_SIZE<<(sizes[order].gfporder)) + + +long kmalloc_init (long start_mem,long end_mem) +{ + int order; + +/* + * Check the static info array. Things will blow up terribly if it's + * incorrect. This is a late "compile time" check..... + */ +for (order = 0;BLOCKSIZE(order);order++) + { + if ((NBLOCKS (order)*BLOCKSIZE(order) + sizeof (struct page_descriptor)) > + AREASIZE(order)) + { + printk ("Cannot use %d bytes out of %d in order = %d block mallocs\n", + NBLOCKS (order) * BLOCKSIZE(order) + + sizeof (struct page_descriptor), + (int) AREASIZE(order), + BLOCKSIZE (order)); + panic ("This only happens if someone messes with kmalloc"); + } + } +return start_mem; +} + + + +int get_order (int size) +{ + int order; + + /* Add the size of the header */ + size += sizeof (struct block_header); + for (order = 0;BLOCKSIZE(order);order++) + if (size <= BLOCKSIZE (order)) + return order; + return -1; +} + +void * kmalloc (size_t size, int priority) +{ + unsigned long flags; + int order,tries,i,sz; + struct block_header *p; + struct page_descriptor *page; + +/* Sanity check... */ + if (intr_count && priority != GFP_ATOMIC) { + static int count = 0; + if (++count < 5) { + printk("kmalloc called nonatomically from interrupt %p\n", + __builtin_return_address(0)); + priority = GFP_ATOMIC; + } + } + +order = get_order (size); +if (order < 0) + { + printk ("kmalloc of too large a block (%d bytes).\n",size); + return (NULL); + } + +save_flags(flags); + +/* It seems VERY unlikely to me that it would be possible that this + loop will get executed more than once. */ +tries = MAX_GET_FREE_PAGE_TRIES; +while (tries --) + { + /* Try to allocate a "recently" freed memory block */ + cli (); + if ((page = sizes[order].firstfree) && + (p = page->firstfree)) + { + if (p->bh_flags == MF_FREE) + { + page->firstfree = p->bh_next; + page->nfree--; + if (!page->nfree) + { + sizes[order].firstfree = page->next; + page->next = NULL; + } + restore_flags(flags); + + sizes [order].nmallocs++; + sizes [order].nbytesmalloced += size; + p->bh_flags = MF_USED; /* As of now this block is officially in use */ + p->bh_length = size; + return p+1; /* Pointer arithmetic: increments past header */ + } + printk ("Problem: block on freelist at %08lx isn't free.\n",(long)p); + return (NULL); + } + restore_flags(flags); + + + /* Now we're in trouble: We need to get a new free page..... 
*/ + + sz = BLOCKSIZE(order); /* sz is the size of the blocks we're dealing with */ + + /* This can be done with ints on: This is private to this invocation */ + page = (struct page_descriptor *) __get_free_pages (priority & GFP_LEVEL_MASK, sizes[order].gfporder); + if (!page) { + static unsigned long last = 0; + if (last + 10*HZ < jiffies) { + last = jiffies; + printk ("Couldn't get a free page.....\n"); + } + return NULL; + } +#if 0 + printk ("Got page %08x to use for %d byte mallocs....",(long)page,sz); +#endif + sizes[order].npages++; + + /* Loop for all but last block: */ + for (i=NBLOCKS(order),p=BH (page+1);i > 1;i--,p=p->bh_next) + { + p->bh_flags = MF_FREE; + p->bh_next = BH ( ((long)p)+sz); + } + /* Last block: */ + p->bh_flags = MF_FREE; + p->bh_next = NULL; + + page->order = order; + page->nfree = NBLOCKS(order); + page->firstfree = BH(page+1); +#if 0 + printk ("%d blocks per page\n",page->nfree); +#endif + /* Now we're going to muck with the "global" freelist for this size: + this should be uninterruptible */ + cli (); + /* + * sizes[order].firstfree used to be NULL, otherwise we wouldn't be + * here, but you never know.... + */ + page->next = sizes[order].firstfree; + sizes[order].firstfree = page; + restore_flags(flags); + } + +/* Pray that printk won't cause this to happen again :-) */ + +printk ("Hey. This is very funny. I tried %d times to allocate a whole\n" + "new page for an object only %d bytes long, but some other process\n" + "beat me to actually allocating it. Also note that this 'error'\n" + "message is soooo very long to catch your attention. I'd appreciate\n" + "it if you'd be so kind as to report what conditions caused this to\n" + "the author of this kmalloc: wolff@dutecai.et.tudelft.nl.\n" + "(Executive summary: This can't happen)\n", + MAX_GET_FREE_PAGE_TRIES, + size); +return NULL; +} + + +void kfree_s (void *ptr,int size) +{ +unsigned long flags; +int order; +register struct block_header *p=((struct block_header *)ptr) -1; +struct page_descriptor *page,*pg2; + +page = PAGE_DESC (p); +order = page->order; +if ((order < 0) || + (order > sizeof (sizes)/sizeof (sizes[0])) || + (((long)(page->next)) & ~PAGE_MASK) || + (p->bh_flags != MF_USED)) + { + printk ("kfree of non-kmalloced memory: %p, next= %p, order=%d\n", + p, page->next, page->order); + return; + } +if (size && + size != p->bh_length) + { + printk ("Trying to free pointer at %p with wrong size: %d instead of %lu.\n", + p,size,p->bh_length); + return; + } +size = p->bh_length; +p->bh_flags = MF_FREE; /* As of now this block is officially free */ +save_flags(flags); +cli (); +p->bh_next = page->firstfree; +page->firstfree = p; +page->nfree ++; + +if (page->nfree == 1) + { /* Page went from full to one free block: put it on the freelist */ + if (page->next) + { + printk ("Page %p already on freelist dazed and confused....\n", page); + } + else + { + page->next = sizes[order].firstfree; + sizes[order].firstfree = page; + } + } + +/* If page is completely free, free it */ +if (page->nfree == NBLOCKS (page->order)) + { +#if 0 + printk ("Freeing page %08x.\n", (long)page); +#endif + if (sizes[order].firstfree == page) + { + sizes[order].firstfree = page->next; + } + else + { + for (pg2=sizes[order].firstfree; + (pg2 != NULL) && (pg2->next != page); + pg2=pg2->next) + /* Nothing */; + if (pg2 != NULL) + pg2->next = page->next; + else + printk ("Ooops. 
page %p doesn't show on freelist.\n", page); + } +/* FIXME: I'm sure we should do something with npages here (like npages--) */ + free_pages ((long)page, sizes[order].gfporder); + } +restore_flags(flags); + +/* FIXME: ?? Are these increment & decrement operations guaranteed to be + * atomic? Could an IRQ not occur between the read & the write? + * Maybe yes on a x86 with GCC...?? + */ +sizes[order].nfrees++; /* Noncritical (monitoring) admin stuff */ +sizes[order].nbytesmalloced -= size; +} diff --git a/arch/i386/mm/memory.c b/arch/i386/mm/memory.c new file mode 100644 index 000000000..3e5a67041 --- /dev/null +++ b/arch/i386/mm/memory.c @@ -0,0 +1,1320 @@ +/* + * linux/mm/memory.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * demand-loading started 01.12.91 - seems it is high on the list of + * things wanted, and it should be easy to implement. - Linus + */ + +/* + * Ok, demand-loading was easy, shared pages a little bit tricker. Shared + * pages started 02.12.91, seems to work. - Linus. + * + * Tested sharing by executing about 30 /bin/sh: under the old kernel it + * would have taken more than the 6M I have free, but it worked well as + * far as I could see. + * + * Also corrected some "invalidate()"s - I wasn't doing enough of them. + */ + +/* + * Real VM (paging to/from disk) started 18.12.91. Much more work and + * thought has to go into this. Oh, well.. + * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. + * Found it. Everything seems to work now. + * 20.12.91 - Ok, making the swap-device changeable like the root. + */ + +/* + * 05.04.94 - Multi-page memory management added for v1.1. + * Idea by Alex Bligh (alex@cconcepts.co.uk) + */ + +#include <linux/config.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/head.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> + +#include <asm/system.h> +#include <asm/segment.h> + +/* + * Define this if things work differently on a i386 and a i486: + * it will (on a i486) warn about kernel memory accesses that are + * done without a 'verify_area(VERIFY_WRITE,..)' + */ +#undef CONFIG_TEST_VERIFY_AREA + +unsigned long high_memory = 0; + +extern unsigned long pg0[1024]; /* page table for 0-4MB for everybody */ + +extern void sound_mem_init(void); +extern void die_if_kernel(char *,struct pt_regs *,long); +extern void show_net_buffers(void); + +/* + * The free_area_list arrays point to the queue heads of the free areas + * of different sizes + */ +int nr_swap_pages = 0; +int nr_free_pages = 0; +struct mem_list free_area_list[NR_MEM_LISTS]; +unsigned char * free_area_map[NR_MEM_LISTS]; + +#define copy_page(from,to) \ +__asm__("cld ; rep ; movsl": :"S" (from),"D" (to),"c" (1024):"cx","di","si") + +unsigned short * mem_map = NULL; + +#define CODE_SPACE(addr,p) ((addr) < (p)->end_code) + +/* + * oom() prints a message (so that the user knows why the process died), + * and gives the process an untrappable SIGKILL. 
+ */ +void oom(struct task_struct * task) +{ + printk("\nOut of memory.\n"); + task->sigaction[SIGKILL-1].sa_handler = NULL; + task->blocked &= ~(1<<(SIGKILL-1)); + send_sig(SIGKILL,task,1); +} + +static void free_one_table(unsigned long * page_dir) +{ + int j; + unsigned long pg_table = *page_dir; + unsigned long * page_table; + + if (!pg_table) + return; + *page_dir = 0; + if (pg_table >= high_memory || !(pg_table & PAGE_PRESENT)) { + printk("Bad page table: [%p]=%08lx\n",page_dir,pg_table); + return; + } + if (mem_map[MAP_NR(pg_table)] & MAP_PAGE_RESERVED) + return; + page_table = (unsigned long *) (pg_table & PAGE_MASK); + for (j = 0 ; j < PTRS_PER_PAGE ; j++,page_table++) { + unsigned long pg = *page_table; + + if (!pg) + continue; + *page_table = 0; + if (pg & PAGE_PRESENT) + free_page(PAGE_MASK & pg); + else + swap_free(pg); + } + free_page(PAGE_MASK & pg_table); +} + +/* + * This function clears all user-level page tables of a process - this + * is needed by execve(), so that old pages aren't in the way. Note that + * unlike 'free_page_tables()', this function still leaves a valid + * page-table-tree in memory: it just removes the user pages. The two + * functions are similar, but there is a fundamental difference. + */ +void clear_page_tables(struct task_struct * tsk) +{ + int i; + unsigned long pg_dir; + unsigned long * page_dir; + + if (!tsk) + return; + if (tsk == task[0]) + panic("task[0] (swapper) doesn't support exec()\n"); + pg_dir = tsk->tss.cr3; + page_dir = (unsigned long *) pg_dir; + if (!page_dir || page_dir == swapper_pg_dir) { + printk("Trying to clear kernel page-directory: not good\n"); + return; + } + if (mem_map[MAP_NR(pg_dir)] > 1) { + unsigned long * new_pg; + + if (!(new_pg = (unsigned long*) get_free_page(GFP_KERNEL))) { + oom(tsk); + return; + } + for (i = 768 ; i < 1024 ; i++) + new_pg[i] = page_dir[i]; + free_page(pg_dir); + tsk->tss.cr3 = (unsigned long) new_pg; + return; + } + for (i = 0 ; i < 768 ; i++,page_dir++) + free_one_table(page_dir); + invalidate(); + return; +} + +/* + * This function frees up all page tables of a process when it exits. + */ +void free_page_tables(struct task_struct * tsk) +{ + int i; + unsigned long pg_dir; + unsigned long * page_dir; + + if (!tsk) + return; + if (tsk == task[0]) { + printk("task[0] (swapper) killed: unable to recover\n"); + panic("Trying to free up swapper memory space"); + } + pg_dir = tsk->tss.cr3; + if (!pg_dir || pg_dir == (unsigned long) swapper_pg_dir) { + printk("Trying to free kernel page-directory: not good\n"); + return; + } + tsk->tss.cr3 = (unsigned long) swapper_pg_dir; + if (tsk == current) + __asm__ __volatile__("movl %0,%%cr3": :"a" (tsk->tss.cr3)); + if (mem_map[MAP_NR(pg_dir)] > 1) { + free_page(pg_dir); + return; + } + page_dir = (unsigned long *) pg_dir; + for (i = 0 ; i < PTRS_PER_PAGE ; i++,page_dir++) + free_one_table(page_dir); + free_page(pg_dir); + invalidate(); +} + +/* + * clone_page_tables() clones the page table for a process - both + * processes will have the exact same pages in memory. There are + * probably races in the memory management with cloning, but we'll + * see.. + */ +int clone_page_tables(struct task_struct * tsk) +{ + unsigned long pg_dir; + + pg_dir = current->tss.cr3; + mem_map[MAP_NR(pg_dir)]++; + tsk->tss.cr3 = pg_dir; + return 0; +} + +/* + * copy_page_tables() just copies the whole process memory range: + * note the special handling of RESERVED (ie kernel) pages, which + * means that they are always shared by all processes. 
+ */ +int copy_page_tables(struct task_struct * tsk) +{ + int i; + unsigned long old_pg_dir, *old_page_dir; + unsigned long new_pg_dir, *new_page_dir; + + if (!(new_pg_dir = get_free_page(GFP_KERNEL))) + return -ENOMEM; + old_pg_dir = current->tss.cr3; + tsk->tss.cr3 = new_pg_dir; + old_page_dir = (unsigned long *) old_pg_dir; + new_page_dir = (unsigned long *) new_pg_dir; + for (i = 0 ; i < PTRS_PER_PAGE ; i++,old_page_dir++,new_page_dir++) { + int j; + unsigned long old_pg_table, *old_page_table; + unsigned long new_pg_table, *new_page_table; + + old_pg_table = *old_page_dir; + if (!old_pg_table) + continue; + if (old_pg_table >= high_memory || !(old_pg_table & PAGE_PRESENT)) { + printk("copy_page_tables: bad page table: " + "probable memory corruption\n"); + *old_page_dir = 0; + continue; + } + if (mem_map[MAP_NR(old_pg_table)] & MAP_PAGE_RESERVED) { + *new_page_dir = old_pg_table; + continue; + } + if (!(new_pg_table = get_free_page(GFP_KERNEL))) { + free_page_tables(tsk); + return -ENOMEM; + } + old_page_table = (unsigned long *) (PAGE_MASK & old_pg_table); + new_page_table = (unsigned long *) (PAGE_MASK & new_pg_table); + for (j = 0 ; j < PTRS_PER_PAGE ; j++,old_page_table++,new_page_table++) { + unsigned long pg; + pg = *old_page_table; + if (!pg) + continue; + if (!(pg & PAGE_PRESENT)) { + *new_page_table = swap_duplicate(pg); + continue; + } + if (pg > high_memory || (mem_map[MAP_NR(pg)] & MAP_PAGE_RESERVED)) { + *new_page_table = pg; + continue; + } + if (pg & PAGE_COW) + pg &= ~PAGE_RW; + if (delete_from_swap_cache(pg)) + pg |= PAGE_DIRTY; + *new_page_table = pg; + *old_page_table = pg; + mem_map[MAP_NR(pg)]++; + } + *new_page_dir = new_pg_table | PAGE_TABLE; + } + invalidate(); + return 0; +} + +/* + * a more complete version of free_page_tables which performs with page + * granularity. + */ +int unmap_page_range(unsigned long from, unsigned long size) +{ + unsigned long page, page_dir; + unsigned long *page_table, *dir; + unsigned long poff, pcnt, pc; + + if (from & ~PAGE_MASK) { + printk("unmap_page_range called with wrong alignment\n"); + return -EINVAL; + } + size = (size + ~PAGE_MASK) >> PAGE_SHIFT; + dir = PAGE_DIR_OFFSET(current->tss.cr3,from); + poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + if ((pcnt = PTRS_PER_PAGE - poff) > size) + pcnt = size; + + for ( ; size > 0; ++dir, size -= pcnt, + pcnt = (size > PTRS_PER_PAGE ? 
PTRS_PER_PAGE : size)) { + if (!(page_dir = *dir)) { + poff = 0; + continue; + } + if (!(page_dir & PAGE_PRESENT)) { + printk("unmap_page_range: bad page directory."); + continue; + } + page_table = (unsigned long *)(PAGE_MASK & page_dir); + if (poff) { + page_table += poff; + poff = 0; + } + for (pc = pcnt; pc--; page_table++) { + if ((page = *page_table) != 0) { + *page_table = 0; + if (PAGE_PRESENT & page) { + if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)) + if (current->mm->rss > 0) + --current->mm->rss; + free_page(PAGE_MASK & page); + } else + swap_free(page); + } + } + if (pcnt == PTRS_PER_PAGE) { + *dir = 0; + free_page(PAGE_MASK & page_dir); + } + } + invalidate(); + return 0; +} + +int zeromap_page_range(unsigned long from, unsigned long size, int mask) +{ + unsigned long *page_table, *dir; + unsigned long poff, pcnt; + unsigned long page; + + if (mask) { + if ((mask & (PAGE_MASK|PAGE_PRESENT)) != PAGE_PRESENT) { + printk("zeromap_page_range: mask = %08x\n",mask); + return -EINVAL; + } + mask |= ZERO_PAGE; + } + if (from & ~PAGE_MASK) { + printk("zeromap_page_range: from = %08lx\n",from); + return -EINVAL; + } + dir = PAGE_DIR_OFFSET(current->tss.cr3,from); + size = (size + ~PAGE_MASK) >> PAGE_SHIFT; + poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + if ((pcnt = PTRS_PER_PAGE - poff) > size) + pcnt = size; + + while (size > 0) { + if (!(PAGE_PRESENT & *dir)) { + /* clear page needed here? SRB. */ + if (!(page_table = (unsigned long*) get_free_page(GFP_KERNEL))) { + invalidate(); + return -ENOMEM; + } + if (PAGE_PRESENT & *dir) { + free_page((unsigned long) page_table); + page_table = (unsigned long *)(PAGE_MASK & *dir++); + } else + *dir++ = ((unsigned long) page_table) | PAGE_TABLE; + } else + page_table = (unsigned long *)(PAGE_MASK & *dir++); + page_table += poff; + poff = 0; + for (size -= pcnt; pcnt-- ;) { + if ((page = *page_table) != 0) { + *page_table = 0; + if (page & PAGE_PRESENT) { + if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)) + if (current->mm->rss > 0) + --current->mm->rss; + free_page(PAGE_MASK & page); + } else + swap_free(page); + } + *page_table++ = mask; + } + pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size); + } + invalidate(); + return 0; +} + +/* + * maps a range of physical memory into the requested pages. the old + * mappings are removed. any references to nonexistent pages results + * in null mappings (currently treated as "copy-on-access") + */ +int remap_page_range(unsigned long from, unsigned long to, unsigned long size, int mask) +{ + unsigned long *page_table, *dir; + unsigned long poff, pcnt; + unsigned long page; + + if (mask) { + if ((mask & (PAGE_MASK|PAGE_PRESENT)) != PAGE_PRESENT) { + printk("remap_page_range: mask = %08x\n",mask); + return -EINVAL; + } + } + if ((from & ~PAGE_MASK) || (to & ~PAGE_MASK)) { + printk("remap_page_range: from = %08lx, to=%08lx\n",from,to); + return -EINVAL; + } + dir = PAGE_DIR_OFFSET(current->tss.cr3,from); + size = (size + ~PAGE_MASK) >> PAGE_SHIFT; + poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + if ((pcnt = PTRS_PER_PAGE - poff) > size) + pcnt = size; + + while (size > 0) { + if (!(PAGE_PRESENT & *dir)) { + /* clearing page here, needed? SRB. 
*/ + if (!(page_table = (unsigned long*) get_free_page(GFP_KERNEL))) { + invalidate(); + return -1; + } + *dir++ = ((unsigned long) page_table) | PAGE_TABLE; + } + else + page_table = (unsigned long *)(PAGE_MASK & *dir++); + if (poff) { + page_table += poff; + poff = 0; + } + + for (size -= pcnt; pcnt-- ;) { + if ((page = *page_table) != 0) { + *page_table = 0; + if (PAGE_PRESENT & page) { + if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)) + if (current->mm->rss > 0) + --current->mm->rss; + free_page(PAGE_MASK & page); + } else + swap_free(page); + } + + /* + * the first condition should return an invalid access + * when the page is referenced. current assumptions + * cause it to be treated as demand allocation in some + * cases. + */ + if (!mask) + *page_table++ = 0; /* not present */ + else if (to >= high_memory) + *page_table++ = (to | mask); + else if (!mem_map[MAP_NR(to)]) + *page_table++ = 0; /* not present */ + else { + *page_table++ = (to | mask); + if (!(mem_map[MAP_NR(to)] & MAP_PAGE_RESERVED)) { + ++current->mm->rss; + mem_map[MAP_NR(to)]++; + } + } + to += PAGE_SIZE; + } + pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size); + } + invalidate(); + return 0; +} + +/* + * This function puts a page in memory at the wanted address. + * It returns the physical address of the page gotten, 0 if + * out of memory (either when trying to access page-table or + * page.) + */ +unsigned long put_page(struct task_struct * tsk,unsigned long page, + unsigned long address,int prot) +{ + unsigned long *page_table; + + if ((prot & (PAGE_MASK|PAGE_PRESENT)) != PAGE_PRESENT) + printk("put_page: prot = %08x\n",prot); + if (page >= high_memory) { + printk("put_page: trying to put page %08lx at %08lx\n",page,address); + return 0; + } + page_table = PAGE_DIR_OFFSET(tsk->tss.cr3,address); + if ((*page_table) & PAGE_PRESENT) + page_table = (unsigned long *) (PAGE_MASK & *page_table); + else { + printk("put_page: bad page directory entry\n"); + oom(tsk); + *page_table = BAD_PAGETABLE | PAGE_TABLE; + return 0; + } + page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + if (*page_table) { + printk("put_page: page already exists\n"); + *page_table = 0; + invalidate(); + } + *page_table = page | prot; +/* no need for invalidate */ + return page; +} + +/* + * The previous function doesn't work very well if you also want to mark + * the page dirty: exec.c wants this, as it has earlier changed the page, + * and we want the dirty-status to be correct (for VM). Thus the same + * routine, but this time we mark it dirty too. 
+ */ +unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address) +{ + unsigned long tmp, *page_table; + + if (page >= high_memory) + printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address); + if (mem_map[MAP_NR(page)] != 1) + printk("mem_map disagrees with %08lx at %08lx\n",page,address); + page_table = PAGE_DIR_OFFSET(tsk->tss.cr3,address); + if (PAGE_PRESENT & *page_table) + page_table = (unsigned long *) (PAGE_MASK & *page_table); + else { + if (!(tmp = get_free_page(GFP_KERNEL))) + return 0; + if (PAGE_PRESENT & *page_table) { + free_page(tmp); + page_table = (unsigned long *) (PAGE_MASK & *page_table); + } else { + *page_table = tmp | PAGE_TABLE; + page_table = (unsigned long *) tmp; + } + } + page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + if (*page_table) { + printk("put_dirty_page: page already exists\n"); + *page_table = 0; + invalidate(); + } + *page_table = page | (PAGE_DIRTY | PAGE_PRIVATE); +/* no need for invalidate */ + return page; +} + +/* + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address + * and decrementing the shared-page counter for the old page. + * + * Goto-purists beware: the only reason for goto's here is that it results + * in better assembly code.. The "default" path will see no jumps at all. + */ +void do_wp_page(struct vm_area_struct * vma, unsigned long address, + unsigned long error_code) +{ + unsigned long *pde, pte, old_page, prot; + unsigned long new_page; + + new_page = __get_free_page(GFP_KERNEL); + pde = PAGE_DIR_OFFSET(vma->vm_task->tss.cr3,address); + pte = *pde; + if (!(pte & PAGE_PRESENT)) + goto end_wp_page; + if ((pte & PAGE_TABLE) != PAGE_TABLE || pte >= high_memory) + goto bad_wp_pagetable; + pte &= PAGE_MASK; + pte += PAGE_PTR(address); + old_page = *(unsigned long *) pte; + if (!(old_page & PAGE_PRESENT)) + goto end_wp_page; + if (old_page >= high_memory) + goto bad_wp_page; + if (old_page & PAGE_RW) + goto end_wp_page; + vma->vm_task->mm->min_flt++; + prot = (old_page & ~PAGE_MASK) | PAGE_RW | PAGE_DIRTY; + old_page &= PAGE_MASK; + if (mem_map[MAP_NR(old_page)] != 1) { + if (new_page) { + if (mem_map[MAP_NR(old_page)] & MAP_PAGE_RESERVED) + ++vma->vm_task->mm->rss; + copy_page(old_page,new_page); + *(unsigned long *) pte = new_page | prot; + free_page(old_page); + invalidate(); + return; + } + free_page(old_page); + oom(vma->vm_task); + *(unsigned long *) pte = BAD_PAGE | prot; + invalidate(); + return; + } + *(unsigned long *) pte |= PAGE_RW | PAGE_DIRTY; + invalidate(); + if (new_page) + free_page(new_page); + return; +bad_wp_page: + printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page); + *(unsigned long *) pte = BAD_PAGE | PAGE_SHARED; + send_sig(SIGKILL, vma->vm_task, 1); + goto end_wp_page; +bad_wp_pagetable: + printk("do_wp_page: bogus page-table at address %08lx (%08lx)\n",address,pte); + *pde = BAD_PAGETABLE | PAGE_TABLE; + send_sig(SIGKILL, vma->vm_task, 1); +end_wp_page: + if (new_page) + free_page(new_page); + return; +} + +/* + * Ugly, ugly, but the goto's result in better assembly.. + */ +int verify_area(int type, const void * addr, unsigned long size) +{ + struct vm_area_struct * vma; + unsigned long start = (unsigned long) addr; + + /* If the current user space is mapped to kernel space (for the + * case where we use a fake user buffer with get_fs/set_fs()) we + * don't expect to find the address in the user vm map. 
+ */ + if (get_fs() == get_ds()) + return 0; + + for (vma = current->mm->mmap ; ; vma = vma->vm_next) { + if (!vma) + goto bad_area; + if (vma->vm_end > start) + break; + } + if (vma->vm_start <= start) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (vma->vm_end - start > current->rlim[RLIMIT_STACK].rlim_cur) + goto bad_area; + +good_area: + if (!wp_works_ok && type == VERIFY_WRITE) + goto check_wp_fault_by_hand; + for (;;) { + struct vm_area_struct * next; + if (!(vma->vm_page_prot & PAGE_USER)) + goto bad_area; + if (type != VERIFY_READ && !(vma->vm_page_prot & (PAGE_COW | PAGE_RW))) + goto bad_area; + if (vma->vm_end - start >= size) + return 0; + next = vma->vm_next; + if (!next || vma->vm_end != next->vm_start) + goto bad_area; + vma = next; + } + +check_wp_fault_by_hand: + size--; + size += start & ~PAGE_MASK; + size >>= PAGE_SHIFT; + start &= PAGE_MASK; + + for (;;) { + if (!(vma->vm_page_prot & (PAGE_COW | PAGE_RW))) + goto bad_area; + do_wp_page(vma, start, PAGE_PRESENT); + if (!size) + return 0; + size--; + start += PAGE_SIZE; + if (start < vma->vm_end) + continue; + vma = vma->vm_next; + if (!vma || vma->vm_start != start) + break; + } + +bad_area: + return -EFAULT; +} + +static inline void get_empty_page(struct task_struct * tsk, unsigned long address) +{ + unsigned long tmp; + + if (!(tmp = get_free_page(GFP_KERNEL))) { + oom(tsk); + tmp = BAD_PAGE; + } + if (!put_page(tsk,tmp,address,PAGE_PRIVATE)) + free_page(tmp); +} + +/* + * try_to_share() checks the page at address "address" in the task "p", + * to see if it exists, and if it is clean. If so, share it with the current + * task. + * + * NOTE! This assumes we have checked that p != current, and that they + * share the same inode and can generally otherwise be shared. + */ +static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area, + unsigned long from_address, struct vm_area_struct * from_area, + unsigned long newpage) +{ + unsigned long from; + unsigned long to; + unsigned long from_page; + unsigned long to_page; + + from_page = (unsigned long)PAGE_DIR_OFFSET(from_area->vm_task->tss.cr3,from_address); + to_page = (unsigned long)PAGE_DIR_OFFSET(to_area->vm_task->tss.cr3,to_address); +/* is there a page-directory at from? */ + from = *(unsigned long *) from_page; + if (!(from & PAGE_PRESENT)) + return 0; + from &= PAGE_MASK; + from_page = from + PAGE_PTR(from_address); + from = *(unsigned long *) from_page; +/* is the page present? */ + if (!(from & PAGE_PRESENT)) + return 0; +/* if it is private, it must be clean to be shared */ + if (from & PAGE_DIRTY) { + if (from_area->vm_page_prot & PAGE_COW) + return 0; + if (!(from_area->vm_page_prot & PAGE_RW)) + return 0; + } +/* is the page reasonable at all? */ + if (from >= high_memory) + return 0; + if (mem_map[MAP_NR(from)] & MAP_PAGE_RESERVED) + return 0; +/* is the destination ok? */ + to = *(unsigned long *) to_page; + if (!(to & PAGE_PRESENT)) + return 0; + to &= PAGE_MASK; + to_page = to + PAGE_PTR(to_address); + if (*(unsigned long *) to_page) + return 0; +/* do we copy? */ + if (newpage) { + if (in_swap_cache(from)) { /* implies PAGE_DIRTY */ + if (from_area->vm_page_prot & PAGE_COW) + return 0; + if (!(from_area->vm_page_prot & PAGE_RW)) + return 0; + } + copy_page((from & PAGE_MASK), newpage); + *(unsigned long *) to_page = newpage | to_area->vm_page_prot; + return 1; + } +/* do a final swap-cache test before sharing them.. 
*/ + if (in_swap_cache(from)) { + if (from_area->vm_page_prot & PAGE_COW) + return 0; + if (!(from_area->vm_page_prot & PAGE_RW)) + return 0; + from |= PAGE_DIRTY; + *(unsigned long *) from_page = from; + delete_from_swap_cache(from); + invalidate(); + } + mem_map[MAP_NR(from)]++; +/* fill in the 'to' field, checking for COW-stuff */ + to = (from & (PAGE_MASK | PAGE_DIRTY)) | to_area->vm_page_prot; + if (to & PAGE_COW) + to &= ~PAGE_RW; + *(unsigned long *) to_page = to; +/* Check if we need to do anything at all to the 'from' field */ + if (!(from & PAGE_RW)) + return 1; + if (!(from_area->vm_page_prot & PAGE_COW)) + return 1; +/* ok, need to mark it read-only, so invalidate any possible old TB entry */ + from &= ~PAGE_RW; + *(unsigned long *) from_page = from; + invalidate(); + return 1; +} + +/* + * share_page() tries to find a process that could share a page with + * the current one. + * + * We first check if it is at all feasible by checking inode->i_count. + * It should be >1 if there are other tasks sharing this inode. + */ +static int share_page(struct vm_area_struct * area, unsigned long address, + unsigned long error_code, unsigned long newpage) +{ + struct inode * inode; + struct task_struct ** p; + unsigned long offset; + unsigned long from_address; + unsigned long give_page; + + if (!area || !(inode = area->vm_inode) || inode->i_count < 2) + return 0; + /* do we need to copy or can we just share? */ + give_page = 0; + if ((area->vm_page_prot & PAGE_COW) && (error_code & PAGE_RW)) { + if (!newpage) + return 0; + give_page = newpage; + } + offset = address - area->vm_start + area->vm_offset; + for (p = &LAST_TASK ; p > &FIRST_TASK ; --p) { + struct vm_area_struct * mpnt; + if (!*p) + continue; + if (area->vm_task == *p) + continue; + /* Now see if there is something in the VMM that + we can share pages with */ + for (mpnt = (*p)->mm->mmap; mpnt; mpnt = mpnt->vm_next) { + /* must be same inode */ + if (mpnt->vm_inode != inode) + continue; + /* offsets must be mutually page-aligned */ + if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK) + continue; + /* the other area must actually cover the wanted page.. */ + from_address = offset + mpnt->vm_start - mpnt->vm_offset; + if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end) + continue; + /* .. NOW we can actually try to use the same physical page */ + if (!try_to_share(address, area, from_address, mpnt, give_page)) + continue; + /* free newpage if we never used it.. */ + if (give_page || !newpage) + return 1; + free_page(newpage); + return 1; + } + } + return 0; +} + +/* + * fill in an empty page-table if none exists. 
+ */ +static inline unsigned long get_empty_pgtable(struct task_struct * tsk,unsigned long address) +{ + unsigned long page; + unsigned long *p; + + p = PAGE_DIR_OFFSET(tsk->tss.cr3,address); + if (PAGE_PRESENT & *p) + return *p; + if (*p) { + printk("get_empty_pgtable: bad page-directory entry \n"); + *p = 0; + } + page = get_free_page(GFP_KERNEL); + p = PAGE_DIR_OFFSET(tsk->tss.cr3,address); + if (PAGE_PRESENT & *p) { + free_page(page); + return *p; + } + if (*p) { + printk("get_empty_pgtable: bad page-directory entry \n"); + *p = 0; + } + if (page) { + *p = page | PAGE_TABLE; + return *p; + } + oom(current); + *p = BAD_PAGETABLE | PAGE_TABLE; + return 0; +} + +static inline void do_swap_page(struct vm_area_struct * vma, + unsigned long address, unsigned long * pge, unsigned long entry) +{ + unsigned long page; + + if (vma->vm_ops && vma->vm_ops->swapin) + page = vma->vm_ops->swapin(vma, entry); + else + page = swap_in(entry); + if (*pge != entry) { + free_page(page); + return; + } + page = page | vma->vm_page_prot; + if (mem_map[MAP_NR(page)] > 1 && (page & PAGE_COW)) + page &= ~PAGE_RW; + ++vma->vm_task->mm->rss; + ++vma->vm_task->mm->maj_flt; + *pge = page; + return; +} + +void do_no_page(struct vm_area_struct * vma, unsigned long address, + unsigned long error_code) +{ + unsigned long page, entry, prot; + + page = get_empty_pgtable(vma->vm_task,address); + if (!page) + return; + page &= PAGE_MASK; + page += PAGE_PTR(address); + entry = *(unsigned long *) page; + if (entry & PAGE_PRESENT) + return; + if (entry) { + do_swap_page(vma, address, (unsigned long *) page, entry); + return; + } + address &= PAGE_MASK; + + if (!vma->vm_ops || !vma->vm_ops->nopage) { + ++vma->vm_task->mm->rss; + ++vma->vm_task->mm->min_flt; + get_empty_page(vma->vm_task,address); + return; + } + page = get_free_page(GFP_KERNEL); + if (share_page(vma, address, error_code, page)) { + ++vma->vm_task->mm->min_flt; + ++vma->vm_task->mm->rss; + return; + } + if (!page) { + oom(current); + put_page(vma->vm_task, BAD_PAGE, address, PAGE_PRIVATE); + return; + } + ++vma->vm_task->mm->maj_flt; + ++vma->vm_task->mm->rss; + prot = vma->vm_page_prot; + /* + * The fourth argument is "no_share", which tells the low-level code + * to copy, not share the page even if sharing is possible. It's + * essentially an early COW detection ("moo at 5 AM"). + */ + page = vma->vm_ops->nopage(vma, address, page, (error_code & PAGE_RW) && (prot & PAGE_COW)); + if (share_page(vma, address, error_code, 0)) { + free_page(page); + return; + } + /* + * This silly early PAGE_DIRTY setting removes a race + * due to the bad i386 page protection. + */ + if (error_code & PAGE_RW) { + prot |= PAGE_DIRTY; /* can't be COW-shared: see "no_share" above */ + } else if ((prot & PAGE_COW) && mem_map[MAP_NR(page)] > 1) + prot &= ~PAGE_RW; + if (put_page(vma->vm_task, page, address, prot)) + return; + free_page(page); + oom(current); +} + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. 
+ */ +asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + struct vm_area_struct * vma; + unsigned long address; + unsigned long page; + + /* get the address */ + __asm__("movl %%cr2,%0":"=r" (address)); + for (vma = current->mm->mmap ; ; vma = vma->vm_next) { + if (!vma) + goto bad_area; + if (vma->vm_end > address) + break; + } + if (vma->vm_start <= address) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur) + goto bad_area; + vma->vm_offset -= vma->vm_start - (address & PAGE_MASK); + vma->vm_start = (address & PAGE_MASK); +/* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +good_area: + if (regs->eflags & VM_MASK) { + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + current->screen_bitmap |= 1 << bit; + } + if (!(vma->vm_page_prot & PAGE_USER)) + goto bad_area; + if (error_code & PAGE_PRESENT) { + if (!(vma->vm_page_prot & (PAGE_RW | PAGE_COW))) + goto bad_area; +#ifdef CONFIG_TEST_VERIFY_AREA + if (regs->cs == KERNEL_CS) + printk("WP fault at %08x\n", regs->eip); +#endif + do_wp_page(vma, address, error_code); + return; + } + do_no_page(vma, address, error_code); + return; + +/* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ +bad_area: + if (error_code & PAGE_USER) { + current->tss.cr2 = address; + current->tss.error_code = error_code; + current->tss.trap_no = 14; + send_sig(SIGSEGV, current, 1); + return; + } +/* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ + if (wp_works_ok < 0 && address == TASK_SIZE && (error_code & PAGE_PRESENT)) { + wp_works_ok = 1; + pg0[0] = PAGE_SHARED; + invalidate(); + printk("This processor honours the WP bit even when in supervisor mode. Good.\n"); + return; + } + if ((unsigned long) (address-TASK_SIZE) < PAGE_SIZE) { + printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); + pg0[0] = PAGE_SHARED; + } else + printk(KERN_ALERT "Unable to handle kernel paging request"); + printk(" at virtual address %08lx\n",address); + __asm__("movl %%cr3,%0" : "=r" (page)); + printk(KERN_ALERT "current->tss.cr3 = %08lx, %%cr3 = %08lx\n", + current->tss.cr3, page); + page = ((unsigned long *) page)[address >> 22]; + printk(KERN_ALERT "*pde = %08lx\n", page); + if (page & PAGE_PRESENT) { + page &= PAGE_MASK; + address &= 0x003ff000; + page = ((unsigned long *) page)[address >> PAGE_SHIFT]; + printk(KERN_ALERT "*pte = %08lx\n", page); + } + die_if_kernel("Oops", regs, error_code); + do_exit(SIGKILL); +} + +/* + * BAD_PAGE is the page that is used for page faults when linux + * is out-of-memory. Older versions of linux just did a + * do_exit(), but using this instead means there is less risk + * for a process dying in kernel mode, possibly leaving a inode + * unused etc.. + * + * BAD_PAGETABLE is the accompanying page-table: it is initialized + * to point to BAD_PAGE entries. + * + * ZERO_PAGE is a special page that is used for zero-initialized + * data and COW. 
+ */ +unsigned long __bad_pagetable(void) +{ + extern char empty_bad_page_table[PAGE_SIZE]; + + __asm__ __volatile__("cld ; rep ; stosl": + :"a" (BAD_PAGE + PAGE_TABLE), + "D" ((long) empty_bad_page_table), + "c" (PTRS_PER_PAGE) + :"di","cx"); + return (unsigned long) empty_bad_page_table; +} + +unsigned long __bad_page(void) +{ + extern char empty_bad_page[PAGE_SIZE]; + + __asm__ __volatile__("cld ; rep ; stosl": + :"a" (0), + "D" ((long) empty_bad_page), + "c" (PTRS_PER_PAGE) + :"di","cx"); + return (unsigned long) empty_bad_page; +} + +unsigned long __zero_page(void) +{ + extern char empty_zero_page[PAGE_SIZE]; + + __asm__ __volatile__("cld ; rep ; stosl": + :"a" (0), + "D" ((long) empty_zero_page), + "c" (PTRS_PER_PAGE) + :"di","cx"); + return (unsigned long) empty_zero_page; +} + +void show_mem(void) +{ + int i,free = 0,total = 0,reserved = 0; + int shared = 0; + + printk("Mem-info:\n"); + show_free_areas(); + printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); + i = high_memory >> PAGE_SHIFT; + while (i-- > 0) { + total++; + if (mem_map[i] & MAP_PAGE_RESERVED) + reserved++; + else if (!mem_map[i]) + free++; + else + shared += mem_map[i]-1; + } + printk("%d pages of RAM\n",total); + printk("%d free pages\n",free); + printk("%d reserved pages\n",reserved); + printk("%d pages shared\n",shared); + show_buffers(); +#ifdef CONFIG_NET + show_net_buffers(); +#endif +} + +extern unsigned long free_area_init(unsigned long, unsigned long); + +/* + * paging_init() sets up the page tables - note that the first 4MB are + * already mapped by head.S. + * + * This routines also unmaps the page at virtual kernel address 0, so + * that we can trap those pesky NULL-reference errors in the kernel. + */ +unsigned long paging_init(unsigned long start_mem, unsigned long end_mem) +{ + unsigned long * pg_dir; + unsigned long * pg_table; + unsigned long tmp; + unsigned long address; + +/* + * Physical page 0 is special; it's not touched by Linux since BIOS + * and SMM (for laptops with [34]86/SL chips) may need it. It is read + * and write protected to detect null pointer references in the + * kernel. + */ +#if 0 + memset((void *) 0, 0, PAGE_SIZE); +#endif + start_mem = PAGE_ALIGN(start_mem); + address = 0; + pg_dir = swapper_pg_dir; + while (address < end_mem) { + tmp = *(pg_dir + 768); /* at virtual addr 0xC0000000 */ + if (!tmp) { + tmp = start_mem | PAGE_TABLE; + *(pg_dir + 768) = tmp; + start_mem += PAGE_SIZE; + } + *pg_dir = tmp; /* also map it in at 0x0000000 for init */ + pg_dir++; + pg_table = (unsigned long *) (tmp & PAGE_MASK); + for (tmp = 0 ; tmp < PTRS_PER_PAGE ; tmp++,pg_table++) { + if (address < end_mem) + *pg_table = address | PAGE_SHARED; + else + *pg_table = 0; + address += PAGE_SIZE; + } + } + invalidate(); + return free_area_init(start_mem, end_mem); +} + +void mem_init(unsigned long start_low_mem, + unsigned long start_mem, unsigned long end_mem) +{ + int codepages = 0; + int reservedpages = 0; + int datapages = 0; + unsigned long tmp; + extern int etext; + + cli(); + end_mem &= PAGE_MASK; + high_memory = end_mem; + + /* mark usable pages in the mem_map[] */ + start_low_mem = PAGE_ALIGN(start_low_mem); + start_mem = PAGE_ALIGN(start_mem); + + /* + * IBM messed up *AGAIN* in their thinkpad: 0xA0000 -> 0x9F000. + * They seem to have done something stupid with the floppy + * controller as well.. 
+ */ + while (start_low_mem < 0x9f000) { + mem_map[MAP_NR(start_low_mem)] = 0; + start_low_mem += PAGE_SIZE; + } + + while (start_mem < high_memory) { + mem_map[MAP_NR(start_mem)] = 0; + start_mem += PAGE_SIZE; + } +#ifdef CONFIG_SOUND + sound_mem_init(); +#endif + for (tmp = 0 ; tmp < high_memory ; tmp += PAGE_SIZE) { + if (mem_map[MAP_NR(tmp)]) { + if (tmp >= 0xA0000 && tmp < 0x100000) + reservedpages++; + else if (tmp < (unsigned long) &etext) + codepages++; + else + datapages++; + continue; + } + mem_map[MAP_NR(tmp)] = 1; + free_page(tmp); + } + tmp = nr_free_pages << PAGE_SHIFT; + printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data)\n", + tmp >> 10, + high_memory >> 10, + codepages << (PAGE_SHIFT-10), + reservedpages << (PAGE_SHIFT-10), + datapages << (PAGE_SHIFT-10)); +/* test if the WP bit is honoured in supervisor mode */ + wp_works_ok = -1; + pg0[0] = PAGE_READONLY; + invalidate(); + __asm__ __volatile__("movb 0,%%al ; movb %%al,0": : :"ax", "memory"); + pg0[0] = 0; + invalidate(); + if (wp_works_ok < 0) + wp_works_ok = 0; +#ifdef CONFIG_TEST_VERIFY_AREA + wp_works_ok = 0; +#endif + return; +} + +void si_meminfo(struct sysinfo *val) +{ + int i; + + i = high_memory >> PAGE_SHIFT; + val->totalram = 0; + val->sharedram = 0; + val->freeram = nr_free_pages << PAGE_SHIFT; + val->bufferram = buffermem; + while (i-- > 0) { + if (mem_map[i] & MAP_PAGE_RESERVED) + continue; + val->totalram++; + if (!mem_map[i]) + continue; + val->sharedram += mem_map[i]-1; + } + val->totalram <<= PAGE_SHIFT; + val->sharedram <<= PAGE_SHIFT; + return; +} + + +/* + * This handles a generic mmap of a disk file. + */ +static unsigned long file_mmap_nopage(struct vm_area_struct * area, unsigned long address, + unsigned long page, int no_share) +{ + struct inode * inode = area->vm_inode; + unsigned int block; + int nr[8]; + int i, *p; + + address &= PAGE_MASK; + block = address - area->vm_start + area->vm_offset; + block >>= inode->i_sb->s_blocksize_bits; + i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; + p = nr; + do { + *p = bmap(inode,block); + i--; + block++; + p++; + } while (i > 0); + return bread_page(page, inode->i_dev, nr, inode->i_sb->s_blocksize, no_share); +} + +struct vm_operations_struct file_mmap = { + NULL, /* open */ + NULL, /* close */ + file_mmap_nopage, /* nopage */ + NULL, /* wppage */ + NULL, /* share */ + NULL, /* unmap */ +}; diff --git a/arch/i386/mm/mmap.c b/arch/i386/mm/mmap.c new file mode 100644 index 000000000..fbbea985c --- /dev/null +++ b/arch/i386/mm/mmap.c @@ -0,0 +1,470 @@ +/* + * linux/mm/mmap.c + * + * Written by obz. + */ +#include <linux/stat.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/shm.h> +#include <linux/errno.h> +#include <linux/mman.h> +#include <linux/string.h> +#include <linux/malloc.h> + +#include <asm/segment.h> +#include <asm/system.h> + +static int anon_map(struct inode *, struct file *, struct vm_area_struct *); + +/* + * description of effects of mapping type and prot in current implementation. + * this is due to the limited x86 page protection hardware. 
The expected + * behavior is in parens: + * + * map_type prot + * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (yes) yes w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (copy) copy w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + */ + +int do_mmap(struct file * file, unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, unsigned long off) +{ + int mask, error; + struct vm_area_struct * vma; + + if ((len = PAGE_ALIGN(len)) == 0) + return addr; + + if (addr > TASK_SIZE || len > TASK_SIZE || addr > TASK_SIZE-len) + return -EINVAL; + + /* offset overflow? */ + if (off + len < off) + return -EINVAL; + + /* + * do simple checking here so the lower-level routines won't have + * to. we assume access permissions have been handled by the open + * of the memory object, so we don't do any here. + */ + + if (file != NULL) { + switch (flags & MAP_TYPE) { + case MAP_SHARED: + if ((prot & PROT_WRITE) && !(file->f_mode & 2)) + return -EACCES; + /* fall through */ + case MAP_PRIVATE: + if (!(file->f_mode & 1)) + return -EACCES; + break; + + default: + return -EINVAL; + } + if ((flags & MAP_DENYWRITE) && (file->f_inode->i_wcount > 0)) + return -ETXTBSY; + } else if ((flags & MAP_TYPE) == MAP_SHARED) + return -EINVAL; + + /* + * obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ + + if (flags & MAP_FIXED) { + if (addr & ~PAGE_MASK) + return -EINVAL; + if (len > TASK_SIZE || addr > TASK_SIZE - len) + return -EINVAL; + } else { + addr = get_unmapped_area(len); + if (!addr) + return -ENOMEM; + } + + /* + * determine the object being mapped and call the appropriate + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. + */ + if (file && (!file->f_op || !file->f_op->mmap)) + return -ENODEV; + mask = PAGE_PRESENT; + if (prot & (PROT_READ | PROT_EXEC)) + mask |= PAGE_READONLY; + if (prot & PROT_WRITE) + if ((flags & MAP_TYPE) == MAP_PRIVATE) + mask |= PAGE_COPY; + else + mask |= PAGE_SHARED; + + vma = (struct vm_area_struct *)kmalloc(sizeof(struct vm_area_struct), + GFP_KERNEL); + if (!vma) + return -ENOMEM; + + vma->vm_task = current; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_page_prot = mask; + vma->vm_flags = prot & (VM_READ | VM_WRITE | VM_EXEC); + vma->vm_flags |= flags & (VM_GROWSDOWN | VM_DENYWRITE | VM_EXECUTABLE); + + if (file) { + if (file->f_mode & 1) + vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + if (flags & MAP_SHARED) { + vma->vm_flags |= VM_SHARED | VM_MAYSHARE; + if (!(file->f_mode & 2)) + vma->vm_flags &= ~VM_MAYWRITE; + } + } else + vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + vma->vm_ops = NULL; + vma->vm_offset = off; + vma->vm_inode = NULL; + vma->vm_pte = 0; + + do_munmap(addr, len); /* Clear old maps */ + + if (file) + error = file->f_op->mmap(file->f_inode, file, vma); + else + error = anon_map(NULL, NULL, vma); + + if (error) { + kfree(vma); + return error; + } + insert_vm_struct(current, vma); + merge_segments(current->mm->mmap); + return addr; +} + +/* + * Get an address range which is currently unmapped. + * For mmap() without MAP_FIXED and shmat() with addr=0. + * Return value 0 means ENOMEM. 
+ */ +unsigned long get_unmapped_area(unsigned long len) +{ + struct vm_area_struct * vmm; + unsigned long gap_start = 0, gap_end; + + for (vmm = current->mm->mmap; ; vmm = vmm->vm_next) { + if (gap_start < SHM_RANGE_START) + gap_start = SHM_RANGE_START; + if (!vmm || ((gap_end = vmm->vm_start) > SHM_RANGE_END)) + gap_end = SHM_RANGE_END; + gap_start = PAGE_ALIGN(gap_start); + gap_end &= PAGE_MASK; + if ((gap_start <= gap_end) && (gap_end - gap_start >= len)) + return gap_start; + if (!vmm) + return 0; + gap_start = vmm->vm_end; + } +} + +asmlinkage int sys_mmap(unsigned long *buffer) +{ + int error; + unsigned long flags; + struct file * file = NULL; + + error = verify_area(VERIFY_READ, buffer, 6*sizeof(long)); + if (error) + return error; + flags = get_fs_long(buffer+3); + if (!(flags & MAP_ANONYMOUS)) { + unsigned long fd = get_fs_long(buffer+4); + if (fd >= NR_OPEN || !(file = current->files->fd[fd])) + return -EBADF; + } + return do_mmap(file, get_fs_long(buffer), get_fs_long(buffer+1), + get_fs_long(buffer+2), flags, get_fs_long(buffer+5)); +} + +/* + * Normal function to fix up a mapping + * This function is the default for when an area has no specific + * function. This may be used as part of a more specific routine. + * This function works out what part of an area is affected and + * adjusts the mapping information. Since the actual page + * manipulation is done in do_mmap(), none need be done here, + * though it would probably be more appropriate. + * + * By the time this function is called, the area struct has been + * removed from the process mapping list, so it needs to be + * reinserted if necessary. + * + * The 4 main cases are: + * Unmapping the whole area + * Unmapping from the start of the segment to a point in it + * Unmapping from an intermediate point to the end + * Unmapping between to intermediate points, making a hole. + * + * Case 4 involves the creation of 2 new areas, for each side of + * the hole. 
+ */ +void unmap_fixup(struct vm_area_struct *area, + unsigned long addr, size_t len) +{ + struct vm_area_struct *mpnt; + unsigned long end = addr + len; + + if (addr < area->vm_start || addr >= area->vm_end || + end <= area->vm_start || end > area->vm_end || + end < addr) + { + printk("unmap_fixup: area=%lx-%lx, unmap %lx-%lx!!\n", + area->vm_start, area->vm_end, addr, end); + return; + } + + /* Unmapping the whole area */ + if (addr == area->vm_start && end == area->vm_end) { + if (area->vm_ops && area->vm_ops->close) + area->vm_ops->close(area); + if (area->vm_inode) + iput(area->vm_inode); + return; + } + + /* Work out to one of the ends */ + if (addr >= area->vm_start && end == area->vm_end) + area->vm_end = addr; + if (addr == area->vm_start && end <= area->vm_end) { + area->vm_offset += (end - area->vm_start); + area->vm_start = end; + } + + /* Unmapping a hole */ + if (addr > area->vm_start && end < area->vm_end) + { + /* Add end mapping -- leave beginning for below */ + mpnt = (struct vm_area_struct *)kmalloc(sizeof(*mpnt), GFP_KERNEL); + + if (!mpnt) + return; + *mpnt = *area; + mpnt->vm_offset += (end - area->vm_start); + mpnt->vm_start = end; + if (mpnt->vm_inode) + mpnt->vm_inode->i_count++; + if (mpnt->vm_ops && mpnt->vm_ops->open) + mpnt->vm_ops->open(mpnt); + area->vm_end = addr; /* Truncate area */ + insert_vm_struct(current, mpnt); + } + + /* construct whatever mapping is needed */ + mpnt = (struct vm_area_struct *)kmalloc(sizeof(*mpnt), GFP_KERNEL); + if (!mpnt) + return; + *mpnt = *area; + if (mpnt->vm_ops && mpnt->vm_ops->open) + mpnt->vm_ops->open(mpnt); + if (area->vm_ops && area->vm_ops->close) { + area->vm_end = area->vm_start; + area->vm_ops->close(area); + } + insert_vm_struct(current, mpnt); +} + +asmlinkage int sys_munmap(unsigned long addr, size_t len) +{ + return do_munmap(addr, len); +} + +/* + * Munmap is split into 2 main parts -- this part which finds + * what needs doing, and the areas themselves, which do the + * work. This now handles partial unmappings. + * Jeremy Fitzhardine <jeremy@sw.oz.au> + */ +int do_munmap(unsigned long addr, size_t len) +{ + struct vm_area_struct *mpnt, **npp, *free; + + if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr) + return -EINVAL; + + if ((len = PAGE_ALIGN(len)) == 0) + return 0; + + /* + * Check if this memory area is ok - put it on the temporary + * list if so.. The checks here are pretty simple -- + * every area affected in some way (by any overlap) is put + * on the list. If nothing is put on, nothing is affected. + */ + npp = ¤t->mm->mmap; + free = NULL; + for (mpnt = *npp; mpnt != NULL; mpnt = *npp) { + unsigned long end = addr+len; + + if ((addr < mpnt->vm_start && end <= mpnt->vm_start) || + (addr >= mpnt->vm_end && end > mpnt->vm_end)) + { + npp = &mpnt->vm_next; + continue; + } + + *npp = mpnt->vm_next; + mpnt->vm_next = free; + free = mpnt; + } + + if (free == NULL) + return 0; + + /* + * Ok - we have the memory areas we should free on the 'free' list, + * so release them, and unmap the page range.. + * If the one of the segments is only being partially unmapped, + * it will put new vm_area_struct(s) into the address space. + */ + while (free) { + unsigned long st, end; + + mpnt = free; + free = free->vm_next; + + st = addr < mpnt->vm_start ? mpnt->vm_start : addr; + end = addr+len; + end = end > mpnt->vm_end ? 
mpnt->vm_end : end; + + if (mpnt->vm_ops && mpnt->vm_ops->unmap) + mpnt->vm_ops->unmap(mpnt, st, end-st); + else + unmap_fixup(mpnt, st, end-st); + + kfree(mpnt); + } + + unmap_page_range(addr, len); + return 0; +} + +/* This is used for a general mmap of a disk file */ +int generic_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma) +{ + extern struct vm_operations_struct file_mmap; + + if (vma->vm_page_prot & PAGE_RW) /* only PAGE_COW or read-only supported right now */ + return -EINVAL; + if (vma->vm_offset & (inode->i_sb->s_blocksize - 1)) + return -EINVAL; + if (!inode->i_sb || !S_ISREG(inode->i_mode)) + return -EACCES; + if (!inode->i_op || !inode->i_op->bmap) + return -ENOEXEC; + if (!IS_RDONLY(inode)) { + inode->i_atime = CURRENT_TIME; + inode->i_dirt = 1; + } + vma->vm_inode = inode; + inode->i_count++; + vma->vm_ops = &file_mmap; + return 0; +} + +/* + * Insert vm structure into process list sorted by address. + */ +void insert_vm_struct(struct task_struct *t, struct vm_area_struct *vmp) +{ + struct vm_area_struct **p, *mpnt; + + p = &t->mm->mmap; + while ((mpnt = *p) != NULL) { + if (mpnt->vm_start > vmp->vm_start) + break; + if (mpnt->vm_end > vmp->vm_start) + printk("insert_vm_struct: overlapping memory areas\n"); + p = &mpnt->vm_next; + } + vmp->vm_next = mpnt; + *p = vmp; +} + +/* + * Merge a list of memory segments if possible. + * Redundant vm_area_structs are freed. + * This assumes that the list is ordered by address. + */ +void merge_segments(struct vm_area_struct *mpnt) +{ + struct vm_area_struct *prev, *next; + + if (mpnt == NULL) + return; + + for(prev = mpnt, mpnt = mpnt->vm_next; + mpnt != NULL; + prev = mpnt, mpnt = next) + { + next = mpnt->vm_next; + + /* + * To share, we must have the same inode, operations.. + */ + if (mpnt->vm_inode != prev->vm_inode) + continue; + if (mpnt->vm_pte != prev->vm_pte) + continue; + if (mpnt->vm_ops != prev->vm_ops) + continue; + if (mpnt->vm_page_prot != prev->vm_page_prot || + mpnt->vm_flags != prev->vm_flags) + continue; + if (prev->vm_end != mpnt->vm_start) + continue; + /* + * and if we have an inode, the offsets must be contiguous.. + */ + if ((mpnt->vm_inode != NULL) || (mpnt->vm_flags & VM_SHM)) { + if (prev->vm_offset + prev->vm_end - prev->vm_start != mpnt->vm_offset) + continue; + } + + /* + * merge prev with mpnt and set up pointers so the new + * big segment can possibly merge with the next one. + * The old unused mpnt is freed. + */ + prev->vm_end = mpnt->vm_end; + prev->vm_next = mpnt->vm_next; + if (mpnt->vm_ops && mpnt->vm_ops->close) { + mpnt->vm_offset += mpnt->vm_end - mpnt->vm_start; + mpnt->vm_start = mpnt->vm_end; + mpnt->vm_ops->close(mpnt); + } + if (mpnt->vm_inode) + mpnt->vm_inode->i_count--; + kfree_s(mpnt, sizeof(*mpnt)); + mpnt = prev; + } +} + +/* + * Map memory not associated with any file into a process + * address space. Adjacent memory is merged. 
+ */ +static int anon_map(struct inode *ino, struct file * file, struct vm_area_struct * vma) +{ + if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot)) + return -ENOMEM; + return 0; +} diff --git a/arch/i386/mm/mprotect.c b/arch/i386/mm/mprotect.c new file mode 100644 index 000000000..99252183b --- /dev/null +++ b/arch/i386/mm/mprotect.c @@ -0,0 +1,230 @@ +/* + * linux/mm/mprotect.c + * + * (C) Copyright 1994 Linus Torvalds + */ +#include <linux/stat.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/shm.h> +#include <linux/errno.h> +#include <linux/mman.h> +#include <linux/string.h> +#include <linux/malloc.h> + +#include <asm/segment.h> +#include <asm/system.h> + +#define CHG_MASK (PAGE_MASK | PAGE_ACCESSED | PAGE_DIRTY | PAGE_PWT | PAGE_PCD) + +static void change_protection(unsigned long start, unsigned long end, int prot) +{ + unsigned long *page_table, *dir; + unsigned long page, offset; + int nr; + + dir = PAGE_DIR_OFFSET(current->tss.cr3, start); + offset = (start >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + nr = (end - start) >> PAGE_SHIFT; + while (nr > 0) { + page = *dir; + dir++; + if (!(page & PAGE_PRESENT)) { + nr = nr - PTRS_PER_PAGE + offset; + offset = 0; + continue; + } + page_table = offset + (unsigned long *) (page & PAGE_MASK); + offset = PTRS_PER_PAGE - offset; + if (offset > nr) + offset = nr; + nr = nr - offset; + do { + page = *page_table; + if (page & PAGE_PRESENT) + *page_table = (page & CHG_MASK) | prot; + ++page_table; + } while (--offset); + } + return; +} + +static inline int mprotect_fixup_all(struct vm_area_struct * vma, + int newflags, int prot) +{ + vma->vm_flags = newflags; + vma->vm_page_prot = prot; + return 0; +} + +static inline int mprotect_fixup_start(struct vm_area_struct * vma, + unsigned long end, + int newflags, int prot) +{ + struct vm_area_struct * n; + + n = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!n) + return -ENOMEM; + *n = *vma; + vma->vm_start = end; + n->vm_end = end; + vma->vm_offset += vma->vm_start - n->vm_start; + n->vm_flags = newflags; + n->vm_page_prot = prot; + if (n->vm_inode) + n->vm_inode->i_count++; + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + insert_vm_struct(current, n); + return 0; +} + +static inline int mprotect_fixup_end(struct vm_area_struct * vma, + unsigned long start, + int newflags, int prot) +{ + struct vm_area_struct * n; + + n = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!n) + return -ENOMEM; + *n = *vma; + vma->vm_end = start; + n->vm_start = start; + n->vm_offset += n->vm_start - vma->vm_start; + n->vm_flags = newflags; + n->vm_page_prot = prot; + if (n->vm_inode) + n->vm_inode->i_count++; + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + insert_vm_struct(current, n); + return 0; +} + +static inline int mprotect_fixup_middle(struct vm_area_struct * vma, + unsigned long start, unsigned long end, + int newflags, int prot) +{ + struct vm_area_struct * left, * right; + + left = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!left) + return -ENOMEM; + right = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!right) { + kfree(left); + return -ENOMEM; + } + *left = *vma; + *right = *vma; + left->vm_end = start; + vma->vm_start = start; + vma->vm_end = end; + right->vm_start = end; + vma->vm_offset += vma->vm_start - left->vm_start; + right->vm_offset += right->vm_start 
- left->vm_start; + vma->vm_flags = newflags; + vma->vm_page_prot = prot; + if (vma->vm_inode) + vma->vm_inode->i_count += 2; + if (vma->vm_ops && vma->vm_ops->open) { + vma->vm_ops->open(left); + vma->vm_ops->open(right); + } + insert_vm_struct(current, left); + insert_vm_struct(current, right); + return 0; +} + +static int mprotect_fixup(struct vm_area_struct * vma, + unsigned long start, unsigned long end, unsigned int newflags) +{ + int prot, error; + + if (newflags == vma->vm_flags) + return 0; + prot = PAGE_PRESENT; + if (newflags & (VM_READ | VM_EXEC)) + prot |= PAGE_READONLY; + if (newflags & VM_WRITE) + if (newflags & VM_SHARED) + prot |= PAGE_SHARED; + else + prot |= PAGE_COPY; + + if (start == vma->vm_start) + if (end == vma->vm_end) + error = mprotect_fixup_all(vma, newflags, prot); + else + error = mprotect_fixup_start(vma, end, newflags, prot); + else if (end == vma->vm_end) + error = mprotect_fixup_end(vma, start, newflags, prot); + else + error = mprotect_fixup_middle(vma, start, end, newflags, prot); + + if (error) + return error; + + change_protection(start, end, prot); + return 0; +} + +asmlinkage int sys_mprotect(unsigned long start, size_t len, unsigned long prot) +{ + unsigned long end, tmp; + struct vm_area_struct * vma, * next; + int error; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len + ~PAGE_MASK) & PAGE_MASK; + end = start + len; + if (end < start) + return -EINVAL; + if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) + return -EINVAL; + if (end == start) + return 0; + for (vma = current->mm->mmap ; ; vma = vma->vm_next) { + if (!vma) + return -EFAULT; + if (vma->vm_end > start) + break; + } + if (vma->vm_start > start) + return -EFAULT; + + for ( ; ; ) { + unsigned int newflags; + + /* Here we know that vma->vm_start <= start < vma->vm_end. */ + + newflags = prot | (vma->vm_flags & ~(PROT_READ | PROT_WRITE | PROT_EXEC)); + if ((newflags & ~(newflags >> 4)) & 0xf) { + error = -EACCES; + break; + } + + if (vma->vm_end >= end) { + error = mprotect_fixup(vma, start, end, newflags); + break; + } + + tmp = vma->vm_end; + next = vma->vm_next; + error = mprotect_fixup(vma, start, tmp, newflags); + if (error) + break; + start = tmp; + vma = next; + if (!vma || vma->vm_start != start) { + error = -EFAULT; + break; + } + } + merge_segments(current->mm->mmap); + return error; +} diff --git a/arch/i386/mm/swap.c b/arch/i386/mm/swap.c new file mode 100644 index 000000000..f7a1f54b3 --- /dev/null +++ b/arch/i386/mm/swap.c @@ -0,0 +1,1017 @@ +/* + * linux/mm/swap.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * This file should contain most things doing the swapping from/to disk. 
+ * Started 18.12.91
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/head.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/fs.h>
+
+#include <asm/system.h> /* for cli()/sti() */
+#include <asm/bitops.h>
+
+#define MAX_SWAPFILES 8
+
+#define SWP_USED 1
+#define SWP_WRITEOK 3
+
+#define SWP_TYPE(entry) (((entry) & 0xfe) >> 1)
+#define SWP_OFFSET(entry) ((entry) >> PAGE_SHIFT)
+#define SWP_ENTRY(type,offset) (((type) << 1) | ((offset) << PAGE_SHIFT))
+
+int min_free_pages = 20;
+
+static int nr_swapfiles = 0;
+static struct wait_queue * lock_queue = NULL;
+
+static struct swap_info_struct {
+ unsigned long flags;
+ struct inode * swap_file;
+ unsigned int swap_device;
+ unsigned char * swap_map;
+ unsigned char * swap_lockmap;
+ int pages;
+ int lowest_bit;
+ int highest_bit;
+ unsigned long max;
+} swap_info[MAX_SWAPFILES];
+
+extern int shm_swap (int);
+
+unsigned long *swap_cache;
+
+#ifdef SWAP_CACHE_INFO
+unsigned long swap_cache_add_total = 0;
+unsigned long swap_cache_add_success = 0;
+unsigned long swap_cache_del_total = 0;
+unsigned long swap_cache_del_success = 0;
+unsigned long swap_cache_find_total = 0;
+unsigned long swap_cache_find_success = 0;
+
+extern inline void show_swap_cache_info(void)
+{
+ printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n",
+ swap_cache_add_total, swap_cache_add_success,
+ swap_cache_del_total, swap_cache_del_success,
+ swap_cache_find_total, swap_cache_find_success);
+}
+#endif
+
+extern inline int add_to_swap_cache(unsigned long addr, unsigned long entry)
+{
+ struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)];
+
+#ifdef SWAP_CACHE_INFO
+ swap_cache_add_total++;
+#endif
+ if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
+ __asm__ __volatile__ (
+ "xchgl %0,%1\n"
+ : "=m" (swap_cache[addr >> PAGE_SHIFT]),
+ "=r" (entry)
+ : "0" (swap_cache[addr >> PAGE_SHIFT]),
+ "1" (entry));
+ if (entry) {
+ printk("swap_cache: replacing non-NULL entry\n");
+ }
+#ifdef SWAP_CACHE_INFO
+ swap_cache_add_success++;
+#endif
+ return 1;
+ }
+ return 0;
+}
+
+static unsigned long init_swap_cache(unsigned long mem_start,
+ unsigned long mem_end)
+{
+ unsigned long swap_cache_size;
+
+ mem_start = (mem_start + 15) & ~15;
+ swap_cache = (unsigned long *) mem_start;
+ swap_cache_size = mem_end >> PAGE_SHIFT;
+ memset(swap_cache, 0, swap_cache_size * sizeof (unsigned long));
+ return (unsigned long) (swap_cache + swap_cache_size);
+}
+
+void rw_swap_page(int rw, unsigned long entry, char * buf)
+{
+ unsigned long type, offset;
+ struct swap_info_struct * p;
+
+ type = SWP_TYPE(entry);
+ if (type >= nr_swapfiles) {
+ printk("Internal error: bad swap-device\n");
+ return;
+ }
+ p = &swap_info[type];
+ offset = SWP_OFFSET(entry);
+ if (offset >= p->max) {
+ printk("rw_swap_page: weirdness\n");
+ return;
+ }
+ if (!(p->flags & SWP_USED)) {
+ printk("Trying to swap to
unused swap-device\n"); + return; + } + while (set_bit(offset,p->swap_lockmap)) + sleep_on(&lock_queue); + if (rw == READ) + kstat.pswpin++; + else + kstat.pswpout++; + if (p->swap_device) { + ll_rw_page(rw,p->swap_device,offset,buf); + } else if (p->swap_file) { + struct inode *swapf = p->swap_file; + unsigned int zones[8]; + int i; + if (swapf->i_op->bmap == NULL + && swapf->i_op->smap != NULL){ + /* + With MsDOS, we use msdos_smap which return + a sector number (not a cluster or block number). + It is a patch to enable the UMSDOS project. + Other people are working on better solution. + + It sounds like ll_rw_swap_file defined + it operation size (sector size) based on + PAGE_SIZE and the number of block to read. + So using bmap or smap should work even if + smap will require more blocks. + */ + int j; + unsigned int block = offset << 3; + + for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){ + if (!(zones[i] = swapf->i_op->smap(swapf,block++))) { + printk("rw_swap_page: bad swap file\n"); + return; + } + } + }else{ + int j; + unsigned int block = offset + << (12 - swapf->i_sb->s_blocksize_bits); + + for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize) + if (!(zones[i] = bmap(swapf,block++))) { + printk("rw_swap_page: bad swap file\n"); + return; + } + } + ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf); + } else + printk("re_swap_page: no swap file or device\n"); + if (offset && !clear_bit(offset,p->swap_lockmap)) + printk("rw_swap_page: lock already cleared\n"); + wake_up(&lock_queue); +} + +unsigned int get_swap_page(void) +{ + struct swap_info_struct * p; + unsigned int offset, type; + + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) { + if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK) + continue; + for (offset = p->lowest_bit; offset <= p->highest_bit ; offset++) { + if (p->swap_map[offset]) + continue; + p->swap_map[offset] = 1; + nr_swap_pages--; + if (offset == p->highest_bit) + p->highest_bit--; + p->lowest_bit = offset; + return SWP_ENTRY(type,offset); + } + } + return 0; +} + +unsigned long swap_duplicate(unsigned long entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + + if (!entry) + return 0; + offset = SWP_OFFSET(entry); + type = SWP_TYPE(entry); + if (type == SHM_SWP_TYPE) + return entry; + if (type >= nr_swapfiles) { + printk("Trying to duplicate nonexistent swap-page\n"); + return 0; + } + p = type + swap_info; + if (offset >= p->max) { + printk("swap_duplicate: weirdness\n"); + return 0; + } + if (!p->swap_map[offset]) { + printk("swap_duplicate: trying to duplicate unused page\n"); + return 0; + } + p->swap_map[offset]++; + return entry; +} + +void swap_free(unsigned long entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + + if (!entry) + return; + type = SWP_TYPE(entry); + if (type == SHM_SWP_TYPE) + return; + if (type >= nr_swapfiles) { + printk("Trying to free nonexistent swap-page\n"); + return; + } + p = & swap_info[type]; + offset = SWP_OFFSET(entry); + if (offset >= p->max) { + printk("swap_free: weirdness\n"); + return; + } + if (!(p->flags & SWP_USED)) { + printk("Trying to free swap from unused swap-device\n"); + return; + } + while (set_bit(offset,p->swap_lockmap)) + sleep_on(&lock_queue); + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + if (!p->swap_map[offset]) + printk("swap_free: swap-space map bad (entry %08lx)\n",entry); + else + if (!--p->swap_map[offset]) + nr_swap_pages++; + if (!clear_bit(offset,p->swap_lockmap)) + 
printk("swap_free: lock already cleared\n"); + wake_up(&lock_queue); +} + +unsigned long swap_in(unsigned long entry) +{ + unsigned long page; + + if (!(page = get_free_page(GFP_KERNEL))) { + oom(current); + return BAD_PAGE; + } + read_swap_page(entry, (char *) page); + if (add_to_swap_cache(page, entry)) + return page | PAGE_PRESENT; + swap_free(entry); + return page | PAGE_DIRTY | PAGE_PRESENT; +} + +static inline int try_to_swap_out(unsigned long * table_ptr) +{ + unsigned long page, entry; + + page = *table_ptr; + if (!(PAGE_PRESENT & page)) + return 0; + if (page >= high_memory) + return 0; + if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED) + return 0; + + if ((PAGE_DIRTY & page) && delete_from_swap_cache(page)) { + *table_ptr &= ~PAGE_ACCESSED; + return 0; + } + if (PAGE_ACCESSED & page) { + *table_ptr &= ~PAGE_ACCESSED; + return 0; + } + if (PAGE_DIRTY & page) { + page &= PAGE_MASK; + if (mem_map[MAP_NR(page)] != 1) + return 0; + if (!(entry = get_swap_page())) + return 0; + *table_ptr = entry; + invalidate(); + write_swap_page(entry, (char *) page); + free_page(page); + return 1; + } + if ((entry = find_in_swap_cache(page))) { + if (mem_map[MAP_NR(page)] != 1) { + *table_ptr |= PAGE_DIRTY; + printk("Aiee.. duplicated cached swap-cache entry\n"); + return 0; + } + *table_ptr = entry; + invalidate(); + free_page(page & PAGE_MASK); + return 1; + } + page &= PAGE_MASK; + *table_ptr = 0; + invalidate(); + free_page(page); + return 1 + mem_map[MAP_NR(page)]; +} + +/* + * A new implementation of swap_out(). We do not swap complete processes, + * but only a small number of blocks, before we continue with the next + * process. The number of blocks actually swapped is determined on the + * number of page faults, that this process actually had in the last time, + * so we won't swap heavily used processes all the time ... + * + * Note: the priority argument is a hint on much CPU to waste with the + * swap block search, not a hint, of how much blocks to swap with + * each process. + * + * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de + */ + +/* + * These are the minimum and maximum number of pages to swap from one process, + * before proceeding to the next: + */ +#define SWAP_MIN 4 +#define SWAP_MAX 32 + +/* + * The actual number of pages to swap is determined as: + * SWAP_RATIO / (number of recent major page faults) + */ +#define SWAP_RATIO 128 + +static int swap_out_process(struct task_struct * p) +{ + unsigned long address; + unsigned long offset; + unsigned long *pgdir; + unsigned long pg_table; + + /* + * Go through process' page directory. + */ + address = p->mm->swap_address; + pgdir = (address >> PGDIR_SHIFT) + (unsigned long *) p->tss.cr3; + offset = address & ~PGDIR_MASK; + address &= PGDIR_MASK; + for ( ; address < TASK_SIZE ; + pgdir++, address = address + PGDIR_SIZE, offset = 0) { + pg_table = *pgdir; + if (pg_table >= high_memory) + continue; + if (mem_map[MAP_NR(pg_table)] & MAP_PAGE_RESERVED) + continue; + if (!(PAGE_PRESENT & pg_table)) { + printk("swap_out_process (%s): bad page-table at vm %08lx: %08lx\n", + p->comm, address + offset, pg_table); + *pgdir = 0; + continue; + } + pg_table &= 0xfffff000; + + /* + * Go through this page table. 
+ */ + for( ; offset < ~PGDIR_MASK ; offset += PAGE_SIZE) { + switch(try_to_swap_out((unsigned long *) (pg_table + (offset >> 10)))) { + case 0: + break; + + case 1: + p->mm->rss--; + /* continue with the following page the next time */ + p->mm->swap_address = address + offset + PAGE_SIZE; + return 1; + + default: + p->mm->rss--; + break; + } + } + } + /* + * Finish work with this process, if we reached the end of the page + * directory. Mark restart from the beginning the next time. + */ + p->mm->swap_address = 0; + return 0; +} + +static int swap_out(unsigned int priority) +{ + static int swap_task; + int loop; + int counter = NR_TASKS * 2 >> priority; + struct task_struct *p; + + counter = NR_TASKS * 2 >> priority; + for(; counter >= 0; counter--, swap_task++) { + /* + * Check that swap_task is suitable for swapping. If not, look for + * the next suitable process. + */ + loop = 0; + while(1) { + if (swap_task >= NR_TASKS) { + swap_task = 1; + if (loop) + /* all processes are unswappable or already swapped out */ + return 0; + loop = 1; + } + + p = task[swap_task]; + if (p && p->mm->swappable && p->mm->rss) + break; + + swap_task++; + } + + /* + * Determine the number of pages to swap from this process. + */ + if (!p->mm->swap_cnt) { + p->mm->dec_flt = (p->mm->dec_flt * 3) / 4 + p->mm->maj_flt - p->mm->old_maj_flt; + p->mm->old_maj_flt = p->mm->maj_flt; + + if (p->mm->dec_flt >= SWAP_RATIO / SWAP_MIN) { + p->mm->dec_flt = SWAP_RATIO / SWAP_MIN; + p->mm->swap_cnt = SWAP_MIN; + } else if (p->mm->dec_flt <= SWAP_RATIO / SWAP_MAX) + p->mm->swap_cnt = SWAP_MAX; + else + p->mm->swap_cnt = SWAP_RATIO / p->mm->dec_flt; + } + if (swap_out_process(p)) { + if ((--p->mm->swap_cnt) == 0) + swap_task++; + return 1; + } + } + return 0; +} + +static int try_to_free_page(int priority) +{ + int i=6; + + while (i--) { + if (priority != GFP_NOBUFFER && shrink_buffers(i)) + return 1; + if (shm_swap(i)) + return 1; + if (swap_out(i)) + return 1; + } + return 0; +} + +static inline void add_mem_queue(struct mem_list * head, struct mem_list * entry) +{ + entry->prev = head; + entry->next = head->next; + entry->next->prev = entry; + head->next = entry; +} + +static inline void remove_mem_queue(struct mem_list * head, struct mem_list * entry) +{ + entry->next->prev = entry->prev; + entry->prev->next = entry->next; +} + +/* + * Free_page() adds the page to the free lists. This is optimized for + * fast normal cases (no error jumps taken normally). + * + * The way to optimize jumps for gcc-2.2.2 is to: + * - select the "normal" case and put it inside the if () { XXX } + * - no else-statements if you can avoid them + * + * With the above two rules, you get a straight-line execution path + * for the normal case, giving better asm-code. + */ + +/* + * Buddy system. Hairy. 
You really aren't expected to understand this + */ +static inline void free_pages_ok(unsigned long addr, unsigned long order) +{ + unsigned long index = addr >> (PAGE_SHIFT + 1 + order); + unsigned long mask = PAGE_MASK << order; + + addr &= mask; + nr_free_pages += 1 << order; + while (order < NR_MEM_LISTS-1) { + if (!change_bit(index, free_area_map[order])) + break; + remove_mem_queue(free_area_list+order, (struct mem_list *) (addr ^ (1+~mask))); + order++; + index >>= 1; + mask <<= 1; + addr &= mask; + } + add_mem_queue(free_area_list+order, (struct mem_list *) addr); +} + +static inline void check_free_buffers(unsigned long addr) +{ + struct buffer_head * bh; + + bh = buffer_pages[MAP_NR(addr)]; + if (bh) { + struct buffer_head *tmp = bh; + do { + if (tmp->b_list == BUF_SHARED && tmp->b_dev != 0xffff) + refile_buffer(tmp); + tmp = tmp->b_this_page; + } while (tmp != bh); + } +} + +void free_pages(unsigned long addr, unsigned long order) +{ + if (addr < high_memory) { + unsigned long flag; + unsigned short * map = mem_map + MAP_NR(addr); + if (*map) { + if (!(*map & MAP_PAGE_RESERVED)) { + save_flags(flag); + cli(); + if (!--*map) { + free_pages_ok(addr, order); + delete_from_swap_cache(addr); + } + restore_flags(flag); + if (*map == 1) + check_free_buffers(addr); + } + return; + } + printk("Trying to free free memory (%08lx): memory probably corrupted\n",addr); + printk("PC = %08lx\n",*(((unsigned long *)&addr)-1)); + return; + } +} + +/* + * Some ugly macros to speed up __get_free_pages().. + */ +#define RMQUEUE(order) \ +do { struct mem_list * queue = free_area_list+order; \ + unsigned long new_order = order; \ + do { struct mem_list *next = queue->next; \ + if (queue != next) { \ + (queue->next = next->next)->prev = queue; \ + mark_used((unsigned long) next, new_order); \ + nr_free_pages -= 1 << order; \ + restore_flags(flags); \ + EXPAND(next, order, new_order); \ + return (unsigned long) next; \ + } new_order++; queue++; \ + } while (new_order < NR_MEM_LISTS); \ +} while (0) + +static inline int mark_used(unsigned long addr, unsigned long order) +{ + return change_bit(addr >> (PAGE_SHIFT+1+order), free_area_map[order]); +} + +#define EXPAND(addr,low,high) \ +do { unsigned long size = PAGE_SIZE << high; \ + while (high > low) { \ + high--; size >>= 1; cli(); \ + add_mem_queue(free_area_list+high, addr); \ + mark_used((unsigned long) addr, high); \ + restore_flags(flags); \ + addr = (struct mem_list *) (size + (unsigned long) addr); \ + } mem_map[MAP_NR((unsigned long) addr)] = 1; \ +} while (0) + +unsigned long __get_free_pages(int priority, unsigned long order) +{ + unsigned long flags; + int reserved_pages; + + if (intr_count && priority != GFP_ATOMIC) { + static int count = 0; + if (++count < 5) { + printk("gfp called nonatomically from interrupt %p\n", + __builtin_return_address(0)); + priority = GFP_ATOMIC; + } + } + reserved_pages = 5; + if (priority != GFP_NFS) + reserved_pages = min_free_pages; + save_flags(flags); +repeat: + cli(); + if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) { + RMQUEUE(order); + restore_flags(flags); + return 0; + } + restore_flags(flags); + if (priority != GFP_BUFFER && try_to_free_page(priority)) + goto repeat; + return 0; +} + +/* + * Yes, I know this is ugly. Don't tell me. 
+ */ +unsigned long __get_dma_pages(int priority, unsigned long order) +{ + unsigned long list = 0; + unsigned long result; + unsigned long limit = 16*1024*1024; + + /* if (EISA_bus) limit = ~0UL; */ + if (priority != GFP_ATOMIC) + priority = GFP_BUFFER; + for (;;) { + result = __get_free_pages(priority, order); + if (result < limit) /* covers failure as well */ + break; + *(unsigned long *) result = list; + list = result; + } + while (list) { + unsigned long tmp = list; + list = *(unsigned long *) list; + free_pages(tmp, order); + } + return result; +} + +/* + * Show free area list (used inside shift_scroll-lock stuff) + * We also calculate the percentage fragmentation. We do this by counting the + * memory on each free list with the exception of the first item on the list. + */ +void show_free_areas(void) +{ + unsigned long order, flags; + unsigned long total = 0; + + printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10)); + save_flags(flags); + cli(); + for (order=0 ; order < NR_MEM_LISTS; order++) { + struct mem_list * tmp; + unsigned long nr = 0; + for (tmp = free_area_list[order].next ; tmp != free_area_list + order ; tmp = tmp->next) { + nr ++; + } + total += nr * (4 << order); + printk("%lu*%ukB ", nr, 4 << order); + } + restore_flags(flags); + printk("= %lukB)\n", total); +#ifdef SWAP_CACHE_INFO + show_swap_cache_info(); +#endif +} + +/* + * Trying to stop swapping from a file is fraught with races, so + * we repeat quite a bit here when we have to pause. swapoff() + * isn't exactly timing-critical, so who cares? + */ +static int try_to_unuse(unsigned int type) +{ + int nr, pgt, pg; + unsigned long page, *ppage; + unsigned long tmp = 0; + struct task_struct *p; + + nr = 0; + +/* + * When we have to sleep, we restart the whole algorithm from the same + * task we stopped in. That at least rids us of all races. 
+ */ +repeat: + for (; nr < NR_TASKS ; nr++) { + p = task[nr]; + if (!p) + continue; + for (pgt = 0 ; pgt < PTRS_PER_PAGE ; pgt++) { + ppage = pgt + ((unsigned long *) p->tss.cr3); + page = *ppage; + if (!page) + continue; + if (!(page & PAGE_PRESENT) || (page >= high_memory)) + continue; + if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED) + continue; + ppage = (unsigned long *) (page & PAGE_MASK); + for (pg = 0 ; pg < PTRS_PER_PAGE ; pg++,ppage++) { + page = *ppage; + if (!page) + continue; + if (page & PAGE_PRESENT) { + if (!(page = in_swap_cache(page))) + continue; + if (SWP_TYPE(page) != type) + continue; + *ppage |= PAGE_DIRTY; + delete_from_swap_cache(*ppage); + continue; + } + if (SWP_TYPE(page) != type) + continue; + if (!tmp) { + if (!(tmp = __get_free_page(GFP_KERNEL))) + return -ENOMEM; + goto repeat; + } + read_swap_page(page, (char *) tmp); + if (*ppage == page) { + *ppage = tmp | (PAGE_DIRTY | PAGE_PRIVATE); + ++p->mm->rss; + swap_free(page); + tmp = 0; + } + goto repeat; + } + } + } + free_page(tmp); + return 0; +} + +asmlinkage int sys_swapoff(const char * specialfile) +{ + struct swap_info_struct * p; + struct inode * inode; + unsigned int type; + int i; + + if (!suser()) + return -EPERM; + i = namei(specialfile,&inode); + if (i) + return i; + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) { + if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK) + continue; + if (p->swap_file) { + if (p->swap_file == inode) + break; + } else { + if (!S_ISBLK(inode->i_mode)) + continue; + if (p->swap_device == inode->i_rdev) + break; + } + } + iput(inode); + if (type >= nr_swapfiles) + return -EINVAL; + p->flags = SWP_USED; + i = try_to_unuse(type); + if (i) { + p->flags = SWP_WRITEOK; + return i; + } + nr_swap_pages -= p->pages; + iput(p->swap_file); + p->swap_file = NULL; + p->swap_device = 0; + vfree(p->swap_map); + p->swap_map = NULL; + free_page((long) p->swap_lockmap); + p->swap_lockmap = NULL; + p->flags = 0; + return 0; +} + +/* + * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 
+ * + * The swapon system call + */ +asmlinkage int sys_swapon(const char * specialfile) +{ + struct swap_info_struct * p; + struct inode * swap_inode; + unsigned int type; + int i,j; + int error; + + if (!suser()) + return -EPERM; + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) + if (!(p->flags & SWP_USED)) + break; + if (type >= MAX_SWAPFILES) + return -EPERM; + if (type >= nr_swapfiles) + nr_swapfiles = type+1; + p->flags = SWP_USED; + p->swap_file = NULL; + p->swap_device = 0; + p->swap_map = NULL; + p->swap_lockmap = NULL; + p->lowest_bit = 0; + p->highest_bit = 0; + p->max = 1; + error = namei(specialfile,&swap_inode); + if (error) + goto bad_swap; + p->swap_file = swap_inode; + error = -EBUSY; + if (swap_inode->i_count != 1) + goto bad_swap; + error = -EINVAL; + if (S_ISBLK(swap_inode->i_mode)) { + p->swap_device = swap_inode->i_rdev; + p->swap_file = NULL; + iput(swap_inode); + error = -ENODEV; + if (!p->swap_device) + goto bad_swap; + error = -EBUSY; + for (i = 0 ; i < nr_swapfiles ; i++) { + if (i == type) + continue; + if (p->swap_device == swap_info[i].swap_device) + goto bad_swap; + } + } else if (!S_ISREG(swap_inode->i_mode)) + goto bad_swap; + p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER); + if (!p->swap_lockmap) { + printk("Unable to start swapping: out of memory :-)\n"); + error = -ENOMEM; + goto bad_swap; + } + read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap); + if (memcmp("SWAP-SPACE",p->swap_lockmap+4086,10)) { + printk("Unable to find swap-space signature\n"); + error = -EINVAL; + goto bad_swap; + } + memset(p->swap_lockmap+PAGE_SIZE-10,0,10); + j = 0; + p->lowest_bit = 0; + p->highest_bit = 0; + for (i = 1 ; i < 8*PAGE_SIZE ; i++) { + if (test_bit(i,p->swap_lockmap)) { + if (!p->lowest_bit) + p->lowest_bit = i; + p->highest_bit = i; + p->max = i+1; + j++; + } + } + if (!j) { + printk("Empty swap-file\n"); + error = -EINVAL; + goto bad_swap; + } + p->swap_map = (unsigned char *) vmalloc(p->max); + if (!p->swap_map) { + error = -ENOMEM; + goto bad_swap; + } + for (i = 1 ; i < p->max ; i++) { + if (test_bit(i,p->swap_lockmap)) + p->swap_map[i] = 0; + else + p->swap_map[i] = 0x80; + } + p->swap_map[0] = 0x80; + memset(p->swap_lockmap,0,PAGE_SIZE); + p->flags = SWP_WRITEOK; + p->pages = j; + nr_swap_pages += j; + printk("Adding Swap: %dk swap-space\n",j<<2); + return 0; +bad_swap: + free_page((long) p->swap_lockmap); + vfree(p->swap_map); + iput(p->swap_file); + p->swap_device = 0; + p->swap_file = NULL; + p->swap_map = NULL; + p->swap_lockmap = NULL; + p->flags = 0; + return error; +} + +void si_swapinfo(struct sysinfo *val) +{ + unsigned int i, j; + + val->freeswap = val->totalswap = 0; + for (i = 0; i < nr_swapfiles; i++) { + if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK) + continue; + for (j = 0; j < swap_info[i].max; ++j) + switch (swap_info[i].swap_map[j]) { + case 128: + continue; + case 0: + ++val->freeswap; + default: + ++val->totalswap; + } + } + val->freeswap <<= PAGE_SHIFT; + val->totalswap <<= PAGE_SHIFT; + return; +} + +/* + * set up the free-area data structures: + * - mark all pages MAP_PAGE_RESERVED + * - mark all memory queues empty + * - clear the memory bitmaps + */ +unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem) +{ + unsigned short * p; + unsigned long mask = PAGE_MASK; + int i; + + /* + * select nr of pages we try to keep free for important stuff + * with a minimum of 16 pages. 
This is totally arbitrary + */ + i = end_mem >> (PAGE_SHIFT+6); + if (i < 16) + i = 16; + min_free_pages = i; + start_mem = init_swap_cache(start_mem, end_mem); + mem_map = (unsigned short *) start_mem; + p = mem_map + MAP_NR(end_mem); + start_mem = (unsigned long) p; + while (p > mem_map) + *--p = MAP_PAGE_RESERVED; + + for (i = 0 ; i < NR_MEM_LISTS ; i++, mask <<= 1) { + unsigned long bitmap_size; + free_area_list[i].prev = free_area_list[i].next = &free_area_list[i]; + end_mem = (end_mem + ~mask) & mask; + bitmap_size = end_mem >> (PAGE_SHIFT + i); + bitmap_size = (bitmap_size + 7) >> 3; + free_area_map[i] = (unsigned char *) start_mem; + memset((void *) start_mem, 0, bitmap_size); + start_mem += bitmap_size; + } + return start_mem; +} diff --git a/arch/i386/mm/vmalloc.c b/arch/i386/mm/vmalloc.c new file mode 100644 index 000000000..0dbd16d54 --- /dev/null +++ b/arch/i386/mm/vmalloc.c @@ -0,0 +1,202 @@ +/* + * linux/mm/vmalloc.c + * + * Copyright (C) 1993 Linus Torvalds + */ + +#include <asm/system.h> +#include <linux/config.h> + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/head.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/malloc.h> +#include <asm/segment.h> + +struct vm_struct { + unsigned long flags; + void * addr; + unsigned long size; + struct vm_struct * next; +}; + +static struct vm_struct * vmlist = NULL; + +/* Just any arbitrary offset to the start of the vmalloc VM area: the + * current 8MB value just means that there will be a 8MB "hole" after the + * physical memory until the kernel virtual memory starts. That means that + * any out-of-bounds memory accesses will hopefully be caught. + * The vmalloc() routines leaves a hole of 4kB between each vmalloced + * area for the same reason. 
;) + */ +#define VMALLOC_OFFSET (8*1024*1024) + +static inline void set_pgdir(unsigned long dindex, unsigned long value) +{ + struct task_struct * p; + + p = &init_task; + do { + ((unsigned long *) p->tss.cr3)[dindex] = value; + p = p->next_task; + } while (p != &init_task); +} + +static int free_area_pages(unsigned long dindex, unsigned long index, unsigned long nr) +{ + unsigned long page, *pte; + + if (!(PAGE_PRESENT & (page = swapper_pg_dir[dindex]))) + return 0; + page &= PAGE_MASK; + pte = index + (unsigned long *) page; + do { + unsigned long pg = *pte; + *pte = 0; + if (pg & PAGE_PRESENT) + free_page(pg); + pte++; + } while (--nr); + pte = (unsigned long *) page; + for (nr = 0 ; nr < 1024 ; nr++, pte++) + if (*pte) + return 0; + set_pgdir(dindex,0); + mem_map[MAP_NR(page)] = 1; + free_page(page); + invalidate(); + return 0; +} + +static int alloc_area_pages(unsigned long dindex, unsigned long index, unsigned long nr) +{ + unsigned long page, *pte; + + page = swapper_pg_dir[dindex]; + if (!page) { + page = get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + if (swapper_pg_dir[dindex]) { + free_page(page); + page = swapper_pg_dir[dindex]; + } else { + mem_map[MAP_NR(page)] = MAP_PAGE_RESERVED; + set_pgdir(dindex, page | PAGE_SHARED); + } + } + page &= PAGE_MASK; + pte = index + (unsigned long *) page; + *pte = PAGE_SHARED; /* remove a race with vfree() */ + do { + unsigned long pg = get_free_page(GFP_KERNEL); + + if (!pg) + return -ENOMEM; + *pte = pg | PAGE_SHARED; + pte++; + } while (--nr); + invalidate(); + return 0; +} + +static int do_area(void * addr, unsigned long size, + int (*area_fn)(unsigned long,unsigned long,unsigned long)) +{ + unsigned long nr, dindex, index; + + nr = size >> PAGE_SHIFT; + dindex = (TASK_SIZE + (unsigned long) addr) >> 22; + index = (((unsigned long) addr) >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + while (nr > 0) { + unsigned long i = PTRS_PER_PAGE - index; + + if (i > nr) + i = nr; + nr -= i; + if (area_fn(dindex, index, i)) + return -1; + index = 0; + dindex++; + } + return 0; +} + +void vfree(void * addr) +{ + struct vm_struct **p, *tmp; + + if (!addr) + return; + if ((PAGE_SIZE-1) & (unsigned long) addr) { + printk("Trying to vfree() bad address (%p)\n", addr); + return; + } + for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) { + if (tmp->addr == addr) { + *p = tmp->next; + do_area(tmp->addr, tmp->size, free_area_pages); + kfree(tmp); + return; + } + } + printk("Trying to vfree() nonexistent vm area (%p)\n", addr); +} + +void * vmalloc(unsigned long size) +{ + void * addr; + struct vm_struct **p, *tmp, *area; + + size = PAGE_ALIGN(size); + if (!size || size > high_memory) + return NULL; + area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL); + if (!area) + return NULL; + addr = (void *) ((high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)); + area->size = size + PAGE_SIZE; + area->next = NULL; + for (p = &vmlist; (tmp = *p) ; p = &tmp->next) { + if (size + (unsigned long) addr < (unsigned long) tmp->addr) + break; + addr = (void *) (tmp->size + (unsigned long) tmp->addr); + } + area->addr = addr; + area->next = *p; + *p = area; + if (do_area(addr, size, alloc_area_pages)) { + vfree(addr); + return NULL; + } + return addr; +} + +int vread(char *buf, char *addr, int count) +{ + struct vm_struct **p, *tmp; + char *vaddr, *buf_start = buf; + int n; + + for (p = &vmlist; (tmp = *p) ; p = &tmp->next) { + vaddr = (char *) tmp->addr; + while (addr < vaddr) { + if (count == 0) + goto finished; + put_fs_byte('\0', buf++), addr++, count--; + } + n 
= tmp->size - PAGE_SIZE; + if (addr > vaddr) + n -= addr - vaddr; + while (--n >= 0) { + if (count == 0) + goto finished; + put_fs_byte(*addr++, buf++), count--; + } + } +finished: + return buf - buf_start; +} |
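
The access check in sys_mprotect() above, (newflags & ~(newflags >> 4)) & 0xf, is terse. The stand-alone sketch below spells out the bit trick; it is not part of the patch, and it assumes the historical vm_flags layout in which each VM_MAY* bit sits exactly four bits above the corresponding VM_READ/VM_WRITE/VM_EXEC/VM_SHARED bit, so the shift lines every requested access up with its "may" counterpart.

#include <stdio.h>

#define VM_READ     0x0001  /* assumed historical values, four bits below the MAY bits */
#define VM_WRITE    0x0002
#define VM_EXEC     0x0004
#define VM_MAYREAD  0x0010
#define VM_MAYWRITE 0x0020
#define VM_MAYEXEC  0x0040

/* non-zero iff some requested access bit lacks its VM_MAY* counterpart */
static int mprotect_denied(unsigned int newflags)
{
        return (newflags & ~(newflags >> 4)) & 0xf;
}

int main(void)
{
        unsigned int ok  = VM_READ | VM_MAYREAD | VM_MAYWRITE;  /* read requested, read allowed    */
        unsigned int bad = VM_READ | VM_WRITE | VM_MAYREAD;     /* write requested, never allowed  */

        printf("ok:  %s\n", mprotect_denied(ok)  ? "-EACCES" : "allowed");
        printf("bad: %s\n", mprotect_denied(bad) ? "-EACCES" : "allowed");
        return 0;
}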
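
The SWP_TYPE/SWP_OFFSET/SWP_ENTRY macros in swap.c above pack a swap-device number and a page index into one unsigned long, keeping the low (present) bit clear so a swapped-out pte can never look like a present page. A minimal user-space sketch, assuming PAGE_SHIFT is 12 as on the i386; the macros are copied verbatim and the device and page numbers are made up:

#include <stdio.h>

#define PAGE_SHIFT 12                   /* assumed: 4 kB pages, as on the i386 */
#define SWP_TYPE(entry) (((entry) & 0xfe) >> 1)
#define SWP_OFFSET(entry) ((entry) >> PAGE_SHIFT)
#define SWP_ENTRY(type,offset) (((type) << 1) | ((offset) << PAGE_SHIFT))

int main(void)
{
        unsigned long entry = SWP_ENTRY(3UL, 517UL);    /* swap device 3, page 517 (example) */

        /* the type occupies bits 1-7, the offset the bits above PAGE_SHIFT */
        printf("entry=%#lx type=%lu offset=%lu\n",
               entry, SWP_TYPE(entry), SWP_OFFSET(entry));
        return 0;
}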
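
swap_out() above throttles itself per process: as its comment says, the number of pages taken per pass is SWAP_RATIO divided by the decayed major-fault count, clamped to [SWAP_MIN, SWAP_MAX]. A small sketch of just that clamp with made-up fault counts; it leaves out the decay of dec_flt that the kernel code also performs:

#include <stdio.h>

#define SWAP_MIN 4
#define SWAP_MAX 32
#define SWAP_RATIO 128

/* pages to take from a process whose decayed major-fault count is dec_flt */
static int swap_cnt_for(int dec_flt)
{
        if (dec_flt >= SWAP_RATIO / SWAP_MIN)   /* faulting hard: swap as little as possible */
                return SWAP_MIN;
        if (dec_flt <= SWAP_RATIO / SWAP_MAX)   /* nearly idle: swap the maximum */
                return SWAP_MAX;
        return SWAP_RATIO / dec_flt;
}

int main(void)
{
        int faults[] = { 0, 2, 8, 16, 40 };
        int i;

        for (i = 0; i < 5; i++)
                printf("dec_flt=%2d -> swap_cnt=%d\n", faults[i], swap_cnt_for(faults[i]));
        return 0;
}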
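
The buddy bookkeeping in free_pages_ok() above is dense: with mask = PAGE_MASK << order, the expression addr ^ (1+~mask) is, on the two's-complement i386, the same as addr ^ (PAGE_SIZE << order), i.e. the address of the block's buddy, and addr >> (PAGE_SHIFT+1+order) is the bit the pair shares in free_area_map[order]. A stand-alone sketch of that arithmetic with a made-up address, assuming 4 kB pages:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long addr = 0x12a000;                  /* example block being freed */
        unsigned long order = 1;                        /* 2-page (8 kB) block       */
        unsigned long mask = PAGE_MASK << order;
        unsigned long buddy = addr ^ (1 + ~mask);       /* == addr ^ (PAGE_SIZE << order) */
        unsigned long index = addr >> (PAGE_SHIFT + 1 + order);

        printf("block %#lx, order %lu: buddy at %#lx, shared map bit %lu\n",
               addr, order, buddy, index);
        return 0;
}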
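
vmalloc() above starts handing out addresses at (high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1) and reserves one extra page per area, which is the 4 kB hole its header comment mentions. A small sketch of that address arithmetic; the 16 MB high_memory and the request size are assumptions for the example:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define VMALLOC_OFFSET (8*1024*1024UL)

int main(void)
{
        unsigned long high_memory = 16UL*1024*1024;     /* assumed: 16 MB of physical RAM */
        unsigned long start = (high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET - 1);
        unsigned long request = 10000;                  /* bytes asked for (example)      */
        unsigned long size = (request + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); /* PAGE_ALIGN  */

        /* area->size = size + PAGE_SIZE: the unmapped guard page after the area */
        printf("first area at %#lx, %lu bytes reserved for a %lu byte request\n",
               start, size + PAGE_SIZE, request);
        return 0;
}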