Diffstat (limited to 'arch/i386/mm/memory.c')
-rw-r--r--	arch/i386/mm/memory.c	1320
1 files changed, 1320 insertions, 0 deletions
diff --git a/arch/i386/mm/memory.c b/arch/i386/mm/memory.c
new file mode 100644
index 000000000..3e5a67041
--- /dev/null
+++ b/arch/i386/mm/memory.c
@@ -0,0 +1,1320 @@
+/*
+ *  linux/mm/memory.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ */
+
+/*
+ * demand-loading started 01.12.91 - seems it is high on the list of
+ * things wanted, and it should be easy to implement. - Linus
+ */
+
+/*
+ * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
+ * pages started 02.12.91, seems to work. - Linus.
+ *
+ * Tested sharing by executing about 30 /bin/sh: under the old kernel it
+ * would have taken more than the 6M I have free, but it worked well as
+ * far as I could see.
+ *
+ * Also corrected some "invalidate()"s - I wasn't doing enough of them.
+ */
+
+/*
+ * Real VM (paging to/from disk) started 18.12.91. Much more work and
+ * thought has to go into this. Oh, well..
+ * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
+ *		Found it. Everything seems to work now.
+ * 20.12.91  -  Ok, making the swap-device changeable like the root.
+ */
+
+/*
+ * 05.04.94  -  Multi-page memory management added for v1.1.
+ *		Idea by Alex Bligh (alex@cconcepts.co.uk)
+ */
+
+#include <linux/config.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/head.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+
+#include <asm/system.h>
+#include <asm/segment.h>
+
+/*
+ * Define this if things work differently on a i386 and a i486:
+ * it will (on a i486) warn about kernel memory accesses that are
+ * done without a 'verify_area(VERIFY_WRITE,..)'
+ */
+#undef CONFIG_TEST_VERIFY_AREA
+
+unsigned long high_memory = 0;
+
+extern unsigned long pg0[1024];		/* page table for 0-4MB for everybody */
+
+extern void sound_mem_init(void);
+extern void die_if_kernel(char *,struct pt_regs *,long);
+extern void show_net_buffers(void);
+
+/*
+ * The free_area_list arrays point to the queue heads of the free areas
+ * of different sizes
+ */
+int nr_swap_pages = 0;
+int nr_free_pages = 0;
+struct mem_list free_area_list[NR_MEM_LISTS];
+unsigned char * free_area_map[NR_MEM_LISTS];
+
+#define copy_page(from,to) \
+__asm__("cld ; rep ; movsl": :"S" (from),"D" (to),"c" (1024):"cx","di","si")
+
+unsigned short * mem_map = NULL;
+
+#define CODE_SPACE(addr,p) ((addr) < (p)->end_code)
+
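Annotation — every routine in this file walks the two-level i386 page tables through the PAGE_DIR_OFFSET and PAGE_PTR macros. The sketch below is a minimal user-space illustration of the address split those macros rely on; the 10/10/12-bit layout is the i386 architecture itself, while the sample address and variable names are mine:

/* illustrative, hosted C - not kernel code */
#include <stdio.h>

int main(void)
{
	unsigned long address = 0xC01ABCDEUL;		/* arbitrary example */
	unsigned long dir_index   = address >> 22;	  /* top 10 bits: pde */
	unsigned long table_index = (address >> 12) & 0x3ff; /* next 10: pte */
	unsigned long offset      = address & 0xfff;	  /* low 12: byte */

	printf("pde index %lu, pte index %lu, offset 0x%03lx\n",
		dir_index, table_index, offset);
	return 0;
}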
+/*
+ * oom() prints a message (so that the user knows why the process died),
+ * and gives the process an untrappable SIGKILL.
+ */
+void oom(struct task_struct * task)
+{
+	printk("\nOut of memory.\n");
+	task->sigaction[SIGKILL-1].sa_handler = NULL;
+	task->blocked &= ~(1<<(SIGKILL-1));
+	send_sig(SIGKILL,task,1);
+}
+
+static void free_one_table(unsigned long * page_dir)
+{
+	int j;
+	unsigned long pg_table = *page_dir;
+	unsigned long * page_table;
+
+	if (!pg_table)
+		return;
+	*page_dir = 0;
+	if (pg_table >= high_memory || !(pg_table & PAGE_PRESENT)) {
+		printk("Bad page table: [%p]=%08lx\n",page_dir,pg_table);
+		return;
+	}
+	if (mem_map[MAP_NR(pg_table)] & MAP_PAGE_RESERVED)
+		return;
+	page_table = (unsigned long *) (pg_table & PAGE_MASK);
+	for (j = 0 ; j < PTRS_PER_PAGE ; j++,page_table++) {
+		unsigned long pg = *page_table;
+
+		if (!pg)
+			continue;
+		*page_table = 0;
+		if (pg & PAGE_PRESENT)
+			free_page(PAGE_MASK & pg);
+		else
+			swap_free(pg);
+	}
+	free_page(PAGE_MASK & pg_table);
+}
+
+/*
+ * This function clears all user-level page tables of a process - this
+ * is needed by execve(), so that old pages aren't in the way. Note that
+ * unlike 'free_page_tables()', this function still leaves a valid
+ * page-table-tree in memory: it just removes the user pages. The two
+ * functions are similar, but there is a fundamental difference.
+ */
+void clear_page_tables(struct task_struct * tsk)
+{
+	int i;
+	unsigned long pg_dir;
+	unsigned long * page_dir;
+
+	if (!tsk)
+		return;
+	if (tsk == task[0])
+		panic("task[0] (swapper) doesn't support exec()\n");
+	pg_dir = tsk->tss.cr3;
+	page_dir = (unsigned long *) pg_dir;
+	if (!page_dir || page_dir == swapper_pg_dir) {
+		printk("Trying to clear kernel page-directory: not good\n");
+		return;
+	}
+	if (mem_map[MAP_NR(pg_dir)] > 1) {
+		unsigned long * new_pg;
+
+		if (!(new_pg = (unsigned long*) get_free_page(GFP_KERNEL))) {
+			oom(tsk);
+			return;
+		}
+		for (i = 768 ; i < 1024 ; i++)
+			new_pg[i] = page_dir[i];
+		free_page(pg_dir);
+		tsk->tss.cr3 = (unsigned long) new_pg;
+		return;
+	}
+	for (i = 0 ; i < 768 ; i++,page_dir++)
+		free_one_table(page_dir);
+	invalidate();
+	return;
+}
+
+/*
+ * This function frees up all page tables of a process when it exits.
+ */
+void free_page_tables(struct task_struct * tsk)
+{
+	int i;
+	unsigned long pg_dir;
+	unsigned long * page_dir;
+
+	if (!tsk)
+		return;
+	if (tsk == task[0]) {
+		printk("task[0] (swapper) killed: unable to recover\n");
+		panic("Trying to free up swapper memory space");
+	}
+	pg_dir = tsk->tss.cr3;
+	if (!pg_dir || pg_dir == (unsigned long) swapper_pg_dir) {
+		printk("Trying to free kernel page-directory: not good\n");
+		return;
+	}
+	tsk->tss.cr3 = (unsigned long) swapper_pg_dir;
+	if (tsk == current)
+		__asm__ __volatile__("movl %0,%%cr3": :"a" (tsk->tss.cr3));
+	if (mem_map[MAP_NR(pg_dir)] > 1) {
+		free_page(pg_dir);
+		return;
+	}
+	page_dir = (unsigned long *) pg_dir;
+	for (i = 0 ; i < PTRS_PER_PAGE ; i++,page_dir++)
+		free_one_table(page_dir);
+	free_page(pg_dir);
+	invalidate();
+}
+
+/*
+ * clone_page_tables() clones the page table for a process - both
+ * processes will have the exact same pages in memory. There are
+ * probably races in the memory management with cloning, but we'll
+ * see..
+ */
+int clone_page_tables(struct task_struct * tsk)
+{
+	unsigned long pg_dir;
+
+	pg_dir = current->tss.cr3;
+	mem_map[MAP_NR(pg_dir)]++;
+	tsk->tss.cr3 = pg_dir;
+	return 0;
+}
+
+/*
+ * copy_page_tables() just copies the whole process memory range:
+ * note the special handling of RESERVED (ie kernel) pages, which
+ * means that they are always shared by all processes.
+ */
+int copy_page_tables(struct task_struct * tsk)
+{
+	int i;
+	unsigned long old_pg_dir, *old_page_dir;
+	unsigned long new_pg_dir, *new_page_dir;
+
+	if (!(new_pg_dir = get_free_page(GFP_KERNEL)))
+		return -ENOMEM;
+	old_pg_dir = current->tss.cr3;
+	tsk->tss.cr3 = new_pg_dir;
+	old_page_dir = (unsigned long *) old_pg_dir;
+	new_page_dir = (unsigned long *) new_pg_dir;
+	for (i = 0 ; i < PTRS_PER_PAGE ; i++,old_page_dir++,new_page_dir++) {
+		int j;
+		unsigned long old_pg_table, *old_page_table;
+		unsigned long new_pg_table, *new_page_table;
+
+		old_pg_table = *old_page_dir;
+		if (!old_pg_table)
+			continue;
+		if (old_pg_table >= high_memory || !(old_pg_table & PAGE_PRESENT)) {
+			printk("copy_page_tables: bad page table: "
+				"probable memory corruption\n");
+			*old_page_dir = 0;
+			continue;
+		}
+		if (mem_map[MAP_NR(old_pg_table)] & MAP_PAGE_RESERVED) {
+			*new_page_dir = old_pg_table;
+			continue;
+		}
+		if (!(new_pg_table = get_free_page(GFP_KERNEL))) {
+			free_page_tables(tsk);
+			return -ENOMEM;
+		}
+		old_page_table = (unsigned long *) (PAGE_MASK & old_pg_table);
+		new_page_table = (unsigned long *) (PAGE_MASK & new_pg_table);
+		for (j = 0 ; j < PTRS_PER_PAGE ; j++,old_page_table++,new_page_table++) {
+			unsigned long pg;
+			pg = *old_page_table;
+			if (!pg)
+				continue;
+			if (!(pg & PAGE_PRESENT)) {
+				*new_page_table = swap_duplicate(pg);
+				continue;
+			}
+			if (pg > high_memory || (mem_map[MAP_NR(pg)] & MAP_PAGE_RESERVED)) {
+				*new_page_table = pg;
+				continue;
+			}
+			if (pg & PAGE_COW)
+				pg &= ~PAGE_RW;
+			if (delete_from_swap_cache(pg))
+				pg |= PAGE_DIRTY;
+			*new_page_table = pg;
+			*old_page_table = pg;
+			mem_map[MAP_NR(pg)]++;
+		}
+		*new_page_dir = new_pg_table | PAGE_TABLE;
+	}
+	invalidate();
+	return 0;
+}
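Annotation — copy_page_tables() is the fork() path: writable private (PAGE_COW) pages are write-protected in both parent and child, so the first write from either side traps into do_wp_page() below and gets a private copy. A hosted user-space sketch of the visible effect (assumes POSIX fork/wait; not kernel code):

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int shared_value = 42;	/* one physical page until somebody writes */

int main(void)
{
	pid_t pid = fork();

	if (pid < 0) {
		perror("fork");
		return 1;
	}
	if (pid == 0) {		/* child: this write is what would fault */
		shared_value = 99;
		printf("child sees %d\n", shared_value);
		exit(0);
	}
	wait(NULL);
	printf("parent still sees %d\n", shared_value);	/* prints 42 */
	return 0;
}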
+
+/*
+ * a more complete version of free_page_tables which performs with page
+ * granularity.
+ */
+int unmap_page_range(unsigned long from, unsigned long size)
+{
+	unsigned long page, page_dir;
+	unsigned long *page_table, *dir;
+	unsigned long poff, pcnt, pc;
+
+	if (from & ~PAGE_MASK) {
+		printk("unmap_page_range called with wrong alignment\n");
+		return -EINVAL;
+	}
+	size = (size + ~PAGE_MASK) >> PAGE_SHIFT;
+	dir = PAGE_DIR_OFFSET(current->tss.cr3,from);
+	poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
+	if ((pcnt = PTRS_PER_PAGE - poff) > size)
+		pcnt = size;
+
+	for ( ; size > 0; ++dir, size -= pcnt,
+	     pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size)) {
+		if (!(page_dir = *dir)) {
+			poff = 0;
+			continue;
+		}
+		if (!(page_dir & PAGE_PRESENT)) {
+			printk("unmap_page_range: bad page directory.");
+			continue;
+		}
+		page_table = (unsigned long *)(PAGE_MASK & page_dir);
+		if (poff) {
+			page_table += poff;
+			poff = 0;
+		}
+		for (pc = pcnt; pc--; page_table++) {
+			if ((page = *page_table) != 0) {
+				*page_table = 0;
+				if (PAGE_PRESENT & page) {
+					if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED))
+						if (current->mm->rss > 0)
+							--current->mm->rss;
+					free_page(PAGE_MASK & page);
+				} else
+					swap_free(page);
+			}
+		}
+		if (pcnt == PTRS_PER_PAGE) {
+			*dir = 0;
+			free_page(PAGE_MASK & page_dir);
+		}
+	}
+	invalidate();
+	return 0;
+}
+
+int zeromap_page_range(unsigned long from, unsigned long size, int mask)
+{
+	unsigned long *page_table, *dir;
+	unsigned long poff, pcnt;
+	unsigned long page;
+
+	if (mask) {
+		if ((mask & (PAGE_MASK|PAGE_PRESENT)) != PAGE_PRESENT) {
+			printk("zeromap_page_range: mask = %08x\n",mask);
+			return -EINVAL;
+		}
+		mask |= ZERO_PAGE;
+	}
+	if (from & ~PAGE_MASK) {
+		printk("zeromap_page_range: from = %08lx\n",from);
+		return -EINVAL;
+	}
+	dir = PAGE_DIR_OFFSET(current->tss.cr3,from);
+	size = (size + ~PAGE_MASK) >> PAGE_SHIFT;
+	poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
+	if ((pcnt = PTRS_PER_PAGE - poff) > size)
+		pcnt = size;
+
+	while (size > 0) {
+		if (!(PAGE_PRESENT & *dir)) {
+				/* clear page needed here?  SRB. */
+			if (!(page_table = (unsigned long*) get_free_page(GFP_KERNEL))) {
+				invalidate();
+				return -ENOMEM;
+			}
+			if (PAGE_PRESENT & *dir) {
+				free_page((unsigned long) page_table);
+				page_table = (unsigned long *)(PAGE_MASK & *dir++);
+			} else
+				*dir++ = ((unsigned long) page_table) | PAGE_TABLE;
+		} else
+			page_table = (unsigned long *)(PAGE_MASK & *dir++);
+		page_table += poff;
+		poff = 0;
+		for (size -= pcnt; pcnt-- ;) {
+			if ((page = *page_table) != 0) {
+				*page_table = 0;
+				if (page & PAGE_PRESENT) {
+					if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED))
+						if (current->mm->rss > 0)
+							--current->mm->rss;
+					free_page(PAGE_MASK & page);
+				} else
+					swap_free(page);
+			}
+			*page_table++ = mask;
+		}
+		pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size);
+	}
+	invalidate();
+	return 0;
+}
+
+/*
+ * maps a range of physical memory into the requested pages. the old
+ * mappings are removed. any references to nonexistent pages results
+ * in null mappings (currently treated as "copy-on-access")
+ */
+int remap_page_range(unsigned long from, unsigned long to, unsigned long size, int mask)
+{
+	unsigned long *page_table, *dir;
+	unsigned long poff, pcnt;
+	unsigned long page;
+
+	if (mask) {
+		if ((mask & (PAGE_MASK|PAGE_PRESENT)) != PAGE_PRESENT) {
+			printk("remap_page_range: mask = %08x\n",mask);
+			return -EINVAL;
+		}
+	}
+	if ((from & ~PAGE_MASK) || (to & ~PAGE_MASK)) {
+		printk("remap_page_range: from = %08lx, to=%08lx\n",from,to);
+		return -EINVAL;
+	}
+	dir = PAGE_DIR_OFFSET(current->tss.cr3,from);
+	size = (size + ~PAGE_MASK) >> PAGE_SHIFT;
+	poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
+	if ((pcnt = PTRS_PER_PAGE - poff) > size)
+		pcnt = size;
+
+	while (size > 0) {
+		if (!(PAGE_PRESENT & *dir)) {
+			/* clearing page here, needed?  SRB. */
+			if (!(page_table = (unsigned long*) get_free_page(GFP_KERNEL))) {
+				invalidate();
+				return -1;
+			}
+			*dir++ = ((unsigned long) page_table) | PAGE_TABLE;
+		}
+		else
+			page_table = (unsigned long *)(PAGE_MASK & *dir++);
+		if (poff) {
+			page_table += poff;
+			poff = 0;
+		}
+
+		for (size -= pcnt; pcnt-- ;) {
+			if ((page = *page_table) != 0) {
+				*page_table = 0;
+				if (PAGE_PRESENT & page) {
+					if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED))
+						if (current->mm->rss > 0)
+							--current->mm->rss;
+					free_page(PAGE_MASK & page);
+				} else
+					swap_free(page);
+			}
+
+			/*
+			 * the first condition should return an invalid access
+			 * when the page is referenced. current assumptions
+			 * cause it to be treated as demand allocation in some
+			 * cases.
+			 */
+			if (!mask)
+				*page_table++ = 0;	/* not present */
+			else if (to >= high_memory)
+				*page_table++ = (to | mask);
+			else if (!mem_map[MAP_NR(to)])
+				*page_table++ = 0;	/* not present */
+			else {
+				*page_table++ = (to | mask);
+				if (!(mem_map[MAP_NR(to)] & MAP_PAGE_RESERVED)) {
+					++current->mm->rss;
+					mem_map[MAP_NR(to)]++;
+				}
+			}
+			to += PAGE_SIZE;
+		}
+		pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size);
+	}
+	invalidate();
+	return 0;
+}
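Annotation — unmap_page_range(), zeromap_page_range() and remap_page_range() all round a byte length up to whole pages with (size + ~PAGE_MASK) >> PAGE_SHIFT. A small stand-alone check of that arithmetic (the macro values mirror the i386 4kB page; the test sizes are arbitrary):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long sizes[] = { 1, PAGE_SIZE, PAGE_SIZE + 1, 3 * PAGE_SIZE };
	int i;

	for (i = 0; i < 4; i++)	/* ~PAGE_MASK == PAGE_SIZE-1, so this rounds up */
		printf("%8lu bytes -> %lu page(s)\n",
			sizes[i], (sizes[i] + ~PAGE_MASK) >> PAGE_SHIFT);
	return 0;
}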
+
+/*
+ * This function puts a page in memory at the wanted address.
+ * It returns the physical address of the page gotten, 0 if
+ * out of memory (either when trying to access page-table or
+ * page.)
+ */
+unsigned long put_page(struct task_struct * tsk,unsigned long page,
+	unsigned long address,int prot)
+{
+	unsigned long *page_table;
+
+	if ((prot & (PAGE_MASK|PAGE_PRESENT)) != PAGE_PRESENT)
+		printk("put_page: prot = %08x\n",prot);
+	if (page >= high_memory) {
+		printk("put_page: trying to put page %08lx at %08lx\n",page,address);
+		return 0;
+	}
+	page_table = PAGE_DIR_OFFSET(tsk->tss.cr3,address);
+	if ((*page_table) & PAGE_PRESENT)
+		page_table = (unsigned long *) (PAGE_MASK & *page_table);
+	else {
+		printk("put_page: bad page directory entry\n");
+		oom(tsk);
+		*page_table = BAD_PAGETABLE | PAGE_TABLE;
+		return 0;
+	}
+	page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
+	if (*page_table) {
+		printk("put_page: page already exists\n");
+		*page_table = 0;
+		invalidate();
+	}
+	*page_table = page | prot;
+/* no need for invalidate */
+	return page;
+}
+
+/*
+ * The previous function doesn't work very well if you also want to mark
+ * the page dirty: exec.c wants this, as it has earlier changed the page,
+ * and we want the dirty-status to be correct (for VM). Thus the same
+ * routine, but this time we mark it dirty too.
+ */
+unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
+{
+	unsigned long tmp, *page_table;
+
+	if (page >= high_memory)
+		printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
+	if (mem_map[MAP_NR(page)] != 1)
+		printk("mem_map disagrees with %08lx at %08lx\n",page,address);
+	page_table = PAGE_DIR_OFFSET(tsk->tss.cr3,address);
+	if (PAGE_PRESENT & *page_table)
+		page_table = (unsigned long *) (PAGE_MASK & *page_table);
+	else {
+		if (!(tmp = get_free_page(GFP_KERNEL)))
+			return 0;
+		if (PAGE_PRESENT & *page_table) {
+			free_page(tmp);
+			page_table = (unsigned long *) (PAGE_MASK & *page_table);
+		} else {
+			*page_table = tmp | PAGE_TABLE;
+			page_table = (unsigned long *) tmp;
+		}
+	}
+	page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
+	if (*page_table) {
+		printk("put_dirty_page: page already exists\n");
+		*page_table = 0;
+		invalidate();
+	}
+	*page_table = page | (PAGE_DIRTY | PAGE_PRIVATE);
+/* no need for invalidate */
+	return page;
+}
+
+/*
+ * This routine handles present pages, when users try to write
+ * to a shared page. It is done by copying the page to a new address
+ * and decrementing the shared-page counter for the old page.
+ *
+ * Goto-purists beware: the only reason for goto's here is that it results
+ * in better assembly code.. The "default" path will see no jumps at all.
+ */
+void do_wp_page(struct vm_area_struct * vma, unsigned long address,
+	unsigned long error_code)
+{
+	unsigned long *pde, pte, old_page, prot;
+	unsigned long new_page;
+
+	new_page = __get_free_page(GFP_KERNEL);
+	pde = PAGE_DIR_OFFSET(vma->vm_task->tss.cr3,address);
+	pte = *pde;
+	if (!(pte & PAGE_PRESENT))
+		goto end_wp_page;
+	if ((pte & PAGE_TABLE) != PAGE_TABLE || pte >= high_memory)
+		goto bad_wp_pagetable;
+	pte &= PAGE_MASK;
+	pte += PAGE_PTR(address);
+	old_page = *(unsigned long *) pte;
+	if (!(old_page & PAGE_PRESENT))
+		goto end_wp_page;
+	if (old_page >= high_memory)
+		goto bad_wp_page;
+	if (old_page & PAGE_RW)
+		goto end_wp_page;
+	vma->vm_task->mm->min_flt++;
+	prot = (old_page & ~PAGE_MASK) | PAGE_RW | PAGE_DIRTY;
+	old_page &= PAGE_MASK;
+	if (mem_map[MAP_NR(old_page)] != 1) {
+		if (new_page) {
+			if (mem_map[MAP_NR(old_page)] & MAP_PAGE_RESERVED)
+				++vma->vm_task->mm->rss;
+			copy_page(old_page,new_page);
+			*(unsigned long *) pte = new_page | prot;
+			free_page(old_page);
+			invalidate();
+			return;
+		}
+		free_page(old_page);
+		oom(vma->vm_task);
+		*(unsigned long *) pte = BAD_PAGE | prot;
+		invalidate();
+		return;
+	}
+	*(unsigned long *) pte |= PAGE_RW | PAGE_DIRTY;
+	invalidate();
+	if (new_page)
+		free_page(new_page);
+	return;
+bad_wp_page:
+	printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
+	*(unsigned long *) pte = BAD_PAGE | PAGE_SHARED;
+	send_sig(SIGKILL, vma->vm_task, 1);
+	goto end_wp_page;
+bad_wp_pagetable:
+	printk("do_wp_page: bogus page-table at address %08lx (%08lx)\n",address,pte);
+	*pde = BAD_PAGETABLE | PAGE_TABLE;
+	send_sig(SIGKILL, vma->vm_task, 1);
+end_wp_page:
+	if (new_page)
+		free_page(new_page);
+	return;
+}
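Annotation — the core decision in do_wp_page() hangs on the mem_map[] share count: a sole owner keeps its page and merely regains PAGE_RW, while a shared page is copied first and the old count dropped. A toy user-space model of just that decision (struct page, wp_fault and the fields here are illustrative stand-ins, not the kernel's types):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct page {
	int count;		/* analogous to mem_map[MAP_NR(page)] */
	char data[16];
};

/* returns the page the faulting process may now write to */
static struct page *wp_fault(struct page *old)
{
	struct page *new;

	if (old->count == 1)
		return old;		/* sole owner: just set PAGE_RW */
	new = malloc(sizeof(*new));	/* analogue of __get_free_page() */
	if (!new)
		return NULL;		/* the kernel falls back to BAD_PAGE */
	memcpy(new->data, old->data, sizeof(new->data));
	new->count = 1;
	old->count--;			/* analogue of free_page(old_page) */
	return new;
}

int main(void)
{
	struct page shared = { 2, "hello" };
	struct page *mine = wp_fault(&shared);

	if (!mine)
		return 1;
	printf("copied: %s, old count now %d\n",
		mine == &shared ? "no" : "yes", shared.count);
	return 0;
}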
+
+/*
+ * Ugly, ugly, but the goto's result in better assembly..
+ */
+int verify_area(int type, const void * addr, unsigned long size)
+{
+	struct vm_area_struct * vma;
+	unsigned long start = (unsigned long) addr;
+
+	/* If the current user space is mapped to kernel space (for the
+	 * case where we use a fake user buffer with get_fs/set_fs()) we
+	 * don't expect to find the address in the user vm map.
+	 */
+	if (get_fs() == get_ds())
+		return 0;
+
+	for (vma = current->mm->mmap ; ; vma = vma->vm_next) {
+		if (!vma)
+			goto bad_area;
+		if (vma->vm_end > start)
+			break;
+	}
+	if (vma->vm_start <= start)
+		goto good_area;
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		goto bad_area;
+	if (vma->vm_end - start > current->rlim[RLIMIT_STACK].rlim_cur)
+		goto bad_area;
+
+good_area:
+	if (!wp_works_ok && type == VERIFY_WRITE)
+		goto check_wp_fault_by_hand;
+	for (;;) {
+		struct vm_area_struct * next;
+		if (!(vma->vm_page_prot & PAGE_USER))
+			goto bad_area;
+		if (type != VERIFY_READ && !(vma->vm_page_prot & (PAGE_COW | PAGE_RW)))
+			goto bad_area;
+		if (vma->vm_end - start >= size)
+			return 0;
+		next = vma->vm_next;
+		if (!next || vma->vm_end != next->vm_start)
+			goto bad_area;
+		vma = next;
+	}
+
+check_wp_fault_by_hand:
+	size--;
+	size += start & ~PAGE_MASK;
+	size >>= PAGE_SHIFT;
+	start &= PAGE_MASK;
+
+	for (;;) {
+		if (!(vma->vm_page_prot & (PAGE_COW | PAGE_RW)))
+			goto bad_area;
+		do_wp_page(vma, start, PAGE_PRESENT);
+		if (!size)
+			return 0;
+		size--;
+		start += PAGE_SIZE;
+		if (start < vma->vm_end)
+			continue;
+		vma = vma->vm_next;
+		if (!vma || vma->vm_start != start)
+			break;
+	}
+
+bad_area:
+	return -EFAULT;
+}
+
+static inline void get_empty_page(struct task_struct * tsk, unsigned long address)
+{
+	unsigned long tmp;
+
+	if (!(tmp = get_free_page(GFP_KERNEL))) {
+		oom(tsk);
+		tmp = BAD_PAGE;
+	}
+	if (!put_page(tsk,tmp,address,PAGE_PRIVATE))
+		free_page(tmp);
+}
+
+/*
+ * try_to_share() checks the page at address "address" in the task "p",
+ * to see if it exists, and if it is clean. If so, share it with the current
+ * task.
+ *
+ * NOTE! This assumes we have checked that p != current, and that they
+ * share the same inode and can generally otherwise be shared.
+ */
+static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area,
+	unsigned long from_address, struct vm_area_struct * from_area,
+	unsigned long newpage)
+{
+	unsigned long from;
+	unsigned long to;
+	unsigned long from_page;
+	unsigned long to_page;
+
+	from_page = (unsigned long)PAGE_DIR_OFFSET(from_area->vm_task->tss.cr3,from_address);
+	to_page = (unsigned long)PAGE_DIR_OFFSET(to_area->vm_task->tss.cr3,to_address);
+/* is there a page-directory at from? */
+	from = *(unsigned long *) from_page;
+	if (!(from & PAGE_PRESENT))
+		return 0;
+	from &= PAGE_MASK;
+	from_page = from + PAGE_PTR(from_address);
+	from = *(unsigned long *) from_page;
+/* is the page present? */
+	if (!(from & PAGE_PRESENT))
+		return 0;
+/* if it is private, it must be clean to be shared */
+	if (from & PAGE_DIRTY) {
+		if (from_area->vm_page_prot & PAGE_COW)
+			return 0;
+		if (!(from_area->vm_page_prot & PAGE_RW))
+			return 0;
+	}
+/* is the page reasonable at all? */
+	if (from >= high_memory)
+		return 0;
+	if (mem_map[MAP_NR(from)] & MAP_PAGE_RESERVED)
+		return 0;
+/* is the destination ok? */
+	to = *(unsigned long *) to_page;
+	if (!(to & PAGE_PRESENT))
+		return 0;
+	to &= PAGE_MASK;
+	to_page = to + PAGE_PTR(to_address);
+	if (*(unsigned long *) to_page)
+		return 0;
+/* do we copy? */
+	if (newpage) {
+		if (in_swap_cache(from)) { /* implies PAGE_DIRTY */
+			if (from_area->vm_page_prot & PAGE_COW)
+				return 0;
+			if (!(from_area->vm_page_prot & PAGE_RW))
+				return 0;
+		}
+		copy_page((from & PAGE_MASK), newpage);
+		*(unsigned long *) to_page = newpage | to_area->vm_page_prot;
+		return 1;
+	}
+/* do a final swap-cache test before sharing them.. */
+	if (in_swap_cache(from)) {
+		if (from_area->vm_page_prot & PAGE_COW)
+			return 0;
+		if (!(from_area->vm_page_prot & PAGE_RW))
+			return 0;
+		from |= PAGE_DIRTY;
+		*(unsigned long *) from_page = from;
+		delete_from_swap_cache(from);
+		invalidate();
+	}
+	mem_map[MAP_NR(from)]++;
+/* fill in the 'to' field, checking for COW-stuff */
+	to = (from & (PAGE_MASK | PAGE_DIRTY)) | to_area->vm_page_prot;
+	if (to & PAGE_COW)
+		to &= ~PAGE_RW;
+	*(unsigned long *) to_page = to;
+/* Check if we need to do anything at all to the 'from' field */
+	if (!(from & PAGE_RW))
+		return 1;
+	if (!(from_area->vm_page_prot & PAGE_COW))
+		return 1;
+/* ok, need to mark it read-only, so invalidate any possible old TB entry */
+	from &= ~PAGE_RW;
+	*(unsigned long *) from_page = from;
+	invalidate();
+	return 1;
+}
+
+/*
+ * share_page() tries to find a process that could share a page with
+ * the current one.
+ *
+ * We first check if it is at all feasible by checking inode->i_count.
+ * It should be >1 if there are other tasks sharing this inode.
+ */
+static int share_page(struct vm_area_struct * area, unsigned long address,
+	unsigned long error_code, unsigned long newpage)
+{
+	struct inode * inode;
+	struct task_struct ** p;
+	unsigned long offset;
+	unsigned long from_address;
+	unsigned long give_page;
+
+	if (!area || !(inode = area->vm_inode) || inode->i_count < 2)
+		return 0;
+	/* do we need to copy or can we just share? */
+	give_page = 0;
+	if ((area->vm_page_prot & PAGE_COW) && (error_code & PAGE_RW)) {
+		if (!newpage)
+			return 0;
+		give_page = newpage;
+	}
+	offset = address - area->vm_start + area->vm_offset;
+	for (p = &LAST_TASK ; p > &FIRST_TASK ; --p) {
+		struct vm_area_struct * mpnt;
+		if (!*p)
+			continue;
+		if (area->vm_task == *p)
+			continue;
+		/* Now see if there is something in the VMM that
+		   we can share pages with */
+		for (mpnt = (*p)->mm->mmap; mpnt; mpnt = mpnt->vm_next) {
+			/* must be same inode */
+			if (mpnt->vm_inode != inode)
+				continue;
+			/* offsets must be mutually page-aligned */
+			if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK)
+				continue;
+			/* the other area must actually cover the wanted page.. */
+			from_address = offset + mpnt->vm_start - mpnt->vm_offset;
+			if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end)
+				continue;
+			/* .. NOW we can actually try to use the same physical page */
+			if (!try_to_share(address, area, from_address, mpnt, give_page))
+				continue;
+			/* free newpage if we never used it.. */
+			if (give_page || !newpage)
+				return 1;
+			free_page(newpage);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * fill in an empty page-table if none exists.
+ */
+static inline unsigned long get_empty_pgtable(struct task_struct * tsk,unsigned long address)
+{
+	unsigned long page;
+	unsigned long *p;
+
+	p = PAGE_DIR_OFFSET(tsk->tss.cr3,address);
+	if (PAGE_PRESENT & *p)
+		return *p;
+	if (*p) {
+		printk("get_empty_pgtable: bad page-directory entry \n");
+		*p = 0;
+	}
+	page = get_free_page(GFP_KERNEL);
+	p = PAGE_DIR_OFFSET(tsk->tss.cr3,address);
+	if (PAGE_PRESENT & *p) {
+		free_page(page);
+		return *p;
+	}
+	if (*p) {
+		printk("get_empty_pgtable: bad page-directory entry \n");
+		*p = 0;
+	}
+	if (page) {
+		*p = page | PAGE_TABLE;
+		return *p;
+	}
+	oom(current);
+	*p = BAD_PAGETABLE | PAGE_TABLE;
+	return 0;
+}
+
+static inline void do_swap_page(struct vm_area_struct * vma,
+	unsigned long address, unsigned long * pge, unsigned long entry)
+{
+	unsigned long page;
+
+	if (vma->vm_ops && vma->vm_ops->swapin)
+		page = vma->vm_ops->swapin(vma, entry);
+	else
+		page = swap_in(entry);
+	if (*pge != entry) {
+		free_page(page);
+		return;
+	}
+	page = page | vma->vm_page_prot;
+	if (mem_map[MAP_NR(page)] > 1 && (page & PAGE_COW))
+		page &= ~PAGE_RW;
+	++vma->vm_task->mm->rss;
+	++vma->vm_task->mm->maj_flt;
+	*pge = page;
+	return;
+}
+
+void do_no_page(struct vm_area_struct * vma, unsigned long address,
+	unsigned long error_code)
+{
+	unsigned long page, entry, prot;
+
+	page = get_empty_pgtable(vma->vm_task,address);
+	if (!page)
+		return;
+	page &= PAGE_MASK;
+	page += PAGE_PTR(address);
+	entry = *(unsigned long *) page;
+	if (entry & PAGE_PRESENT)
+		return;
+	if (entry) {
+		do_swap_page(vma, address, (unsigned long *) page, entry);
+		return;
+	}
+	address &= PAGE_MASK;
+
+	if (!vma->vm_ops || !vma->vm_ops->nopage) {
+		++vma->vm_task->mm->rss;
+		++vma->vm_task->mm->min_flt;
+		get_empty_page(vma->vm_task,address);
+		return;
+	}
+	page = get_free_page(GFP_KERNEL);
+	if (share_page(vma, address, error_code, page)) {
+		++vma->vm_task->mm->min_flt;
+		++vma->vm_task->mm->rss;
+		return;
+	}
+	if (!page) {
+		oom(current);
+		put_page(vma->vm_task, BAD_PAGE, address, PAGE_PRIVATE);
+		return;
+	}
+	++vma->vm_task->mm->maj_flt;
+	++vma->vm_task->mm->rss;
+	prot = vma->vm_page_prot;
+	/*
+	 * The fourth argument is "no_share", which tells the low-level code
+	 * to copy, not share the page even if sharing is possible. It's
+	 * essentially an early COW detection ("moo at 5 AM").
+	 */
+	page = vma->vm_ops->nopage(vma, address, page, (error_code & PAGE_RW) && (prot & PAGE_COW));
+	if (share_page(vma, address, error_code, 0)) {
+		free_page(page);
+		return;
+	}
+	/*
+	 * This silly early PAGE_DIRTY setting removes a race
+	 * due to the bad i386 page protection.
+	 */
+	if (error_code & PAGE_RW) {
+		prot |= PAGE_DIRTY;	/* can't be COW-shared: see "no_share" above */
+	} else if ((prot & PAGE_COW) && mem_map[MAP_NR(page)] > 1)
+		prot &= ~PAGE_RW;
+	if (put_page(vma->vm_task, page, address, prot))
+		return;
+	free_page(page);
+	oom(current);
+}
+
+/*
+ * This routine handles page faults. It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+	struct vm_area_struct * vma;
+	unsigned long address;
+	unsigned long page;
+
+	/* get the address */
+	__asm__("movl %%cr2,%0":"=r" (address));
+	for (vma = current->mm->mmap ; ; vma = vma->vm_next) {
+		if (!vma)
+			goto bad_area;
+		if (vma->vm_end > address)
+			break;
+	}
+	if (vma->vm_start <= address)
+		goto good_area;
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		goto bad_area;
+	if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur)
+		goto bad_area;
+	vma->vm_offset -= vma->vm_start - (address & PAGE_MASK);
+	vma->vm_start = (address & PAGE_MASK);
+/*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+good_area:
+	if (regs->eflags & VM_MASK) {
+		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
+		if (bit < 32)
+			current->screen_bitmap |= 1 << bit;
+	}
+	if (!(vma->vm_page_prot & PAGE_USER))
+		goto bad_area;
+	if (error_code & PAGE_PRESENT) {
+		if (!(vma->vm_page_prot & (PAGE_RW | PAGE_COW)))
+			goto bad_area;
+#ifdef CONFIG_TEST_VERIFY_AREA
+		if (regs->cs == KERNEL_CS)
+			printk("WP fault at %08x\n", regs->eip);
+#endif
+		do_wp_page(vma, address, error_code);
+		return;
+	}
+	do_no_page(vma, address, error_code);
+	return;
+
+/*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+bad_area:
+	if (error_code & PAGE_USER) {
+		current->tss.cr2 = address;
+		current->tss.error_code = error_code;
+		current->tss.trap_no = 14;
+		send_sig(SIGSEGV, current, 1);
+		return;
+	}
+/*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice.
+ */
+	if (wp_works_ok < 0 && address == TASK_SIZE && (error_code & PAGE_PRESENT)) {
+		wp_works_ok = 1;
+		pg0[0] = PAGE_SHARED;
+		invalidate();
+		printk("This processor honours the WP bit even when in supervisor mode. Good.\n");
+		return;
+	}
+	if ((unsigned long) (address-TASK_SIZE) < PAGE_SIZE) {
+		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
+		pg0[0] = PAGE_SHARED;
+	} else
+		printk(KERN_ALERT "Unable to handle kernel paging request");
+	printk(" at virtual address %08lx\n",address);
+	__asm__("movl %%cr3,%0" : "=r" (page));
+	printk(KERN_ALERT "current->tss.cr3 = %08lx, %%cr3 = %08lx\n",
+		current->tss.cr3, page);
+	page = ((unsigned long *) page)[address >> 22];
+	printk(KERN_ALERT "*pde = %08lx\n", page);
+	if (page & PAGE_PRESENT) {
+		page &= PAGE_MASK;
+		address &= 0x003ff000;
+		page = ((unsigned long *) page)[address >> PAGE_SHIFT];
+		printk(KERN_ALERT "*pte = %08lx\n", page);
+	}
+	die_if_kernel("Oops", regs, error_code);
+	do_exit(SIGKILL);
+}
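Annotation — do_page_fault() masks error_code directly with PAGE_PRESENT, PAGE_RW and PAGE_USER because the i386 page-fault error code uses the same low three bits as the pte: bit 0 distinguishes a protection violation from a not-present page, bit 1 means the access was a write, bit 2 means it came from user mode. A trivial stand-alone decoder:

#include <stdio.h>

int main(void)
{
	unsigned long error_code = 6;	/* example: user write to a non-present page */

	printf("%s fault, %s access, %s mode\n",
		(error_code & 1) ? "protection" : "not-present",
		(error_code & 2) ? "write" : "read",
		(error_code & 4) ? "user" : "supervisor");
	return 0;
}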
+
+/*
+ * BAD_PAGE is the page that is used for page faults when linux
+ * is out-of-memory. Older versions of linux just did a
+ * do_exit(), but using this instead means there is less risk
+ * for a process dying in kernel mode, possibly leaving a inode
+ * unused etc..
+ *
+ * BAD_PAGETABLE is the accompanying page-table: it is initialized
+ * to point to BAD_PAGE entries.
+ *
+ * ZERO_PAGE is a special page that is used for zero-initialized
+ * data and COW.
+ */
+unsigned long __bad_pagetable(void)
+{
+	extern char empty_bad_page_table[PAGE_SIZE];
+
+	__asm__ __volatile__("cld ; rep ; stosl":
+		:"a" (BAD_PAGE + PAGE_TABLE),
+		 "D" ((long) empty_bad_page_table),
+		 "c" (PTRS_PER_PAGE)
+		:"di","cx");
+	return (unsigned long) empty_bad_page_table;
+}
+
+unsigned long __bad_page(void)
+{
+	extern char empty_bad_page[PAGE_SIZE];
+
+	__asm__ __volatile__("cld ; rep ; stosl":
+		:"a" (0),
+		 "D" ((long) empty_bad_page),
+		 "c" (PTRS_PER_PAGE)
+		:"di","cx");
+	return (unsigned long) empty_bad_page;
+}
+
+unsigned long __zero_page(void)
+{
+	extern char empty_zero_page[PAGE_SIZE];
+
+	__asm__ __volatile__("cld ; rep ; stosl":
+		:"a" (0),
+		 "D" ((long) empty_zero_page),
+		 "c" (PTRS_PER_PAGE)
+		:"di","cx");
+	return (unsigned long) empty_zero_page;
+}
+
+void show_mem(void)
+{
+	int i,free = 0,total = 0,reserved = 0;
+	int shared = 0;
+
+	printk("Mem-info:\n");
+	show_free_areas();
+	printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10));
+	i = high_memory >> PAGE_SHIFT;
+	while (i-- > 0) {
+		total++;
+		if (mem_map[i] & MAP_PAGE_RESERVED)
+			reserved++;
+		else if (!mem_map[i])
+			free++;
+		else
+			shared += mem_map[i]-1;
+	}
+	printk("%d pages of RAM\n",total);
+	printk("%d free pages\n",free);
+	printk("%d reserved pages\n",reserved);
+	printk("%d pages shared\n",shared);
+	show_buffers();
+#ifdef CONFIG_NET
+	show_net_buffers();
+#endif
+}
+
+extern unsigned long free_area_init(unsigned long, unsigned long);
+
+/*
+ * paging_init() sets up the page tables - note that the first 4MB are
+ * already mapped by head.S.
+ *
+ * This routines also unmaps the page at virtual kernel address 0, so
+ * that we can trap those pesky NULL-reference errors in the kernel.
+ */
+unsigned long paging_init(unsigned long start_mem, unsigned long end_mem)
+{
+	unsigned long * pg_dir;
+	unsigned long * pg_table;
+	unsigned long tmp;
+	unsigned long address;
+
+/*
+ * Physical page 0 is special; it's not touched by Linux since BIOS
+ * and SMM (for laptops with [34]86/SL chips) may need it.  It is read
+ * and write protected to detect null pointer references in the
+ * kernel.
+ */
+#if 0
+	memset((void *) 0, 0, PAGE_SIZE);
+#endif
+	start_mem = PAGE_ALIGN(start_mem);
+	address = 0;
+	pg_dir = swapper_pg_dir;
+	while (address < end_mem) {
+		tmp = *(pg_dir + 768);	/* at virtual addr 0xC0000000 */
+		if (!tmp) {
+			tmp = start_mem | PAGE_TABLE;
+			*(pg_dir + 768) = tmp;
+			start_mem += PAGE_SIZE;
+		}
+		*pg_dir = tmp;	/* also map it in at 0x0000000 for init */
+		pg_dir++;
+		pg_table = (unsigned long *) (tmp & PAGE_MASK);
+		for (tmp = 0 ; tmp < PTRS_PER_PAGE ; tmp++,pg_table++) {
+			if (address < end_mem)
+				*pg_table = address | PAGE_SHARED;
+			else
+				*pg_table = 0;
+			address += PAGE_SIZE;
+		}
+	}
+	invalidate();
+	return free_area_init(start_mem, end_mem);
+}
+
+void mem_init(unsigned long start_low_mem,
+	      unsigned long start_mem, unsigned long end_mem)
+{
+	int codepages = 0;
+	int reservedpages = 0;
+	int datapages = 0;
+	unsigned long tmp;
+	extern int etext;
+
+	cli();
+	end_mem &= PAGE_MASK;
+	high_memory = end_mem;
+
+	/* mark usable pages in the mem_map[] */
+	start_low_mem = PAGE_ALIGN(start_low_mem);
+	start_mem = PAGE_ALIGN(start_mem);
+
+	/*
+	 * IBM messed up *AGAIN* in their thinkpad: 0xA0000 -> 0x9F000.
+	 * They seem to have done something stupid with the floppy
+	 * controller as well..
+	 */
+	while (start_low_mem < 0x9f000) {
+		mem_map[MAP_NR(start_low_mem)] = 0;
+		start_low_mem += PAGE_SIZE;
+	}
+
+	while (start_mem < high_memory) {
+		mem_map[MAP_NR(start_mem)] = 0;
+		start_mem += PAGE_SIZE;
+	}
+#ifdef CONFIG_SOUND
+	sound_mem_init();
+#endif
+	for (tmp = 0 ; tmp < high_memory ; tmp += PAGE_SIZE) {
+		if (mem_map[MAP_NR(tmp)]) {
+			if (tmp >= 0xA0000 && tmp < 0x100000)
+				reservedpages++;
+			else if (tmp < (unsigned long) &etext)
+				codepages++;
+			else
+				datapages++;
+			continue;
+		}
+		mem_map[MAP_NR(tmp)] = 1;
+		free_page(tmp);
+	}
+	tmp = nr_free_pages << PAGE_SHIFT;
+	printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data)\n",
+		tmp >> 10,
+		high_memory >> 10,
+		codepages << (PAGE_SHIFT-10),
+		reservedpages << (PAGE_SHIFT-10),
+		datapages << (PAGE_SHIFT-10));
+/* test if the WP bit is honoured in supervisor mode */
+	wp_works_ok = -1;
+	pg0[0] = PAGE_READONLY;
+	invalidate();
+	__asm__ __volatile__("movb 0,%%al ; movb %%al,0": : :"ax", "memory");
+	pg0[0] = 0;
+	invalidate();
+	if (wp_works_ok < 0)
+		wp_works_ok = 0;
+#ifdef CONFIG_TEST_VERIFY_AREA
+	wp_works_ok = 0;
+#endif
+	return;
+}
+
+void si_meminfo(struct sysinfo *val)
+{
+	int i;
+
+	i = high_memory >> PAGE_SHIFT;
+	val->totalram = 0;
+	val->sharedram = 0;
+	val->freeram = nr_free_pages << PAGE_SHIFT;
+	val->bufferram = buffermem;
+	while (i-- > 0)  {
+		if (mem_map[i] & MAP_PAGE_RESERVED)
+			continue;
+		val->totalram++;
+		if (!mem_map[i])
+			continue;
+		val->sharedram += mem_map[i]-1;
+	}
+	val->totalram <<= PAGE_SHIFT;
+	val->sharedram <<= PAGE_SHIFT;
+	return;
+}
+
+
+/*
+ * This handles a generic mmap of a disk file.
+ */
+static unsigned long file_mmap_nopage(struct vm_area_struct * area, unsigned long address,
+	unsigned long page, int no_share)
+{
+	struct inode * inode = area->vm_inode;
+	unsigned int block;
+	int nr[8];
+	int i, *p;
+
+	address &= PAGE_MASK;
+	block = address - area->vm_start + area->vm_offset;
+	block >>= inode->i_sb->s_blocksize_bits;
+	i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
+	p = nr;
+	do {
+		*p = bmap(inode,block);
+		i--;
+		block++;
+		p++;
+	} while (i > 0);
+	return bread_page(page, inode->i_dev, nr, inode->i_sb->s_blocksize, no_share);
+}
+
+struct vm_operations_struct file_mmap = {
+	NULL,			/* open */
+	NULL,			/* close */
+	file_mmap_nopage,	/* nopage */
+	NULL,			/* wppage */
+	NULL,			/* share */
+	NULL,			/* unmap */
+};
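Annotation — the vm_operations_struct above is the hook through which do_no_page() and do_swap_page() defer to a mapping type. A hypothetical sketch of another provider following the same field order; my_nopage and my_vma_ops are invented names, and the fragment assumes this era's kernel headers for the struct definitions:

/* hypothetical, compiles only against kernel headers of this vintage */
static unsigned long my_nopage(struct vm_area_struct * area, unsigned long address,
	unsigned long page, int no_share)
{
	char *p = (char *) page;
	int i;

	/* fill the freshly allocated physical page; do_no_page() maps
	 * whatever address we return with put_page() */
	for (i = 0; i < PAGE_SIZE; i++)
		p[i] = 0;
	return page;
}

static struct vm_operations_struct my_vma_ops = {
	NULL,		/* open */
	NULL,		/* close */
	my_nopage,	/* nopage */
	NULL,		/* wppage */
	NULL,		/* share */
	NULL,		/* unmap */
};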