From 59223edaa18759982db0a8aced0e77457d10c68e Mon Sep 17 00:00:00 2001
From: Ralf Baechle
Date: Thu, 17 Jun 1999 13:25:08 +0000
Subject: Merge with Linux 2.3.6.

Sorry, this isn't tested on silicon, I don't have a MIPS box at hand.
---
 mm/filemap.c    |  40 ++++-------------
 mm/memory.c     |  65 +++++++++++----------
 mm/mlock.c      |   7 +--
 mm/mmap.c       | 133 +++++++++++++++++++++++++++++++++++++++++++++++---------
 mm/mremap.c     |   4 +-
 mm/page_alloc.c |   1 +
 mm/page_io.c    |   2 +-
 mm/slab.c       |   4 +-
 mm/swapfile.c   |  49 ++++++++++++++++-----
 9 files changed, 196 insertions(+), 109 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 62d85dc02..455f334f3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -48,7 +48,7 @@ struct pio_request
 };
 static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
 static kmem_cache_t *pio_request_cache;
-static struct wait_queue *pio_wait = NULL;
+static DECLARE_WAIT_QUEUE_HEAD(pio_wait);
 
 static inline void
 make_pio_request(struct file *, unsigned long, unsigned long);
@@ -300,9 +300,8 @@ static unsigned long try_to_read_ahead(struct file * file,
 void __wait_on_page(struct page *page)
 {
 	struct task_struct *tsk = current;
-	struct wait_queue wait;
+	DECLARE_WAITQUEUE(wait, tsk);
 
-	wait.task = tsk;
 	add_wait_queue(&page->wait, &wait);
 repeat:
 	tsk->state = TASK_UNINTERRUPTIBLE;
@@ -1312,18 +1311,9 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
 	struct vm_operations_struct * ops;
 	struct inode *inode = file->f_dentry->d_inode;
 
-	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
+	ops = &file_private_mmap;
+	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
 		ops = &file_shared_mmap;
-		/* share_page() can only guarantee proper page sharing if
-		 * the offsets are all page aligned. */
-		if (vma->vm_offset & (PAGE_SIZE - 1))
-			return -EINVAL;
-	} else {
-		ops = &file_private_mmap;
-		if (inode->i_op && inode->i_op->bmap &&
-		    (vma->vm_offset & (inode->i_sb->s_blocksize - 1)))
-			return -EINVAL;
-	}
 	if (!inode->i_sb || !S_ISREG(inode->i_mode))
 		return -EACCES;
 	if (!inode->i_op || !inode->i_op->readpage)
@@ -1435,7 +1425,8 @@ out:
  */
 ssize_t
 generic_file_write(struct file *file, const char *buf,
-		   size_t count, loff_t *ppos)
+		   size_t count, loff_t *ppos,
+		   writepage_t write_one_page)
 {
 	struct dentry *dentry = file->f_dentry;
 	struct inode *inode = dentry->d_inode;
@@ -1444,10 +1435,7 @@ generic_file_write(struct file *file, const char *buf,
 	struct page *page, **hash;
 	unsigned long page_cache = 0;
 	unsigned long written;
-	long status, sync;
-
-	if (!inode->i_op || !inode->i_op->updatepage)
-		return -EIO;
+	long status;
 
 	if (file->f_error) {
 		int error = file->f_error;
@@ -1455,7 +1443,6 @@ generic_file_write(struct file *file, const char *buf,
 		return error;
 	}
 
-	sync = file->f_flags & O_SYNC;
 	written = 0;
 
 	if (file->f_flags & O_APPEND)
@@ -1511,15 +1498,7 @@ generic_file_write(struct file *file, const char *buf,
 		wait_on_page(page);
 		set_bit(PG_locked, &page->flags);
 
-		/*
-		 * Do the real work.. If the writer ends up delaying the write,
-		 * the writer needs to increment the page use counts until he
-		 * is done with the page.
-		 */
-		bytes -= copy_from_user((u8*)page_address(page) + offset, buf, bytes);
-		status = -EFAULT;
-		if (bytes)
-			status = inode->i_op->updatepage(file, page, offset, bytes, sync);
+		status = write_one_page(file, page, offset, bytes, buf);
 
 		/* Mark it unlocked again and drop the page.. */
 		clear_bit(PG_locked, &page->flags);
@@ -1677,7 +1656,7 @@ static inline void make_pio_request(struct file *file,
 int kpiod(void * unused)
 {
 	struct task_struct *tsk = current;
-	struct wait_queue wait = { tsk, };
+	DECLARE_WAITQUEUE(wait, tsk);
 	struct inode * inode;
 	struct dentry * dentry;
 	struct pio_request * p;
@@ -1686,7 +1665,6 @@ int kpiod(void * unused)
 	tsk->pgrp = 1;
 	strcpy(tsk->comm, "kpiod");
 	sigfillset(&tsk->blocked);
-	init_waitqueue(&pio_wait);
 	/*
 	 * Mark this task as a memory allocator - we don't want to get caught
 	 * up in the regular mm freeing frenzy if we have to allocate memory
diff --git a/mm/memory.c b/mm/memory.c
index 49b02737f..ae56831b3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -130,16 +130,14 @@ void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
 {
 	pgd_t * page_dir = mm->pgd;
 
-	if (page_dir && page_dir != swapper_pg_dir) {
-		page_dir += first;
-		do {
-			free_one_pgd(page_dir);
-			page_dir++;
-		} while (--nr);
+	page_dir += first;
+	do {
+		free_one_pgd(page_dir);
+		page_dir++;
+	} while (--nr);
 
-		/* keep the page table cache within bounds */
-		check_pgt_cache();
-	}
+	/* keep the page table cache within bounds */
+	check_pgt_cache();
 }
 
 /*
@@ -546,19 +544,6 @@ int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long
 	return error;
 }
 
-/*
- * sanity-check function..
- */
-static void put_page(pte_t * page_table, pte_t pte)
-{
-	if (!pte_none(*page_table)) {
-		free_page_and_swap_cache(pte_page(pte));
-		return;
-	}
-/* no need for flush_tlb */
-	set_pte(page_table, pte);
-}
-
 /*
  * This routine is used to map in a page into an address space: needed by
  * execve() for the initial stack and environment pages.
@@ -616,21 +601,15 @@ unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsig
  * and potentially makes it more efficient.
  */
 static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
-	unsigned long address, pte_t *page_table)
+	unsigned long address, pte_t *page_table, pte_t pte)
 {
-	pte_t pte;
 	unsigned long old_page, new_page;
 	struct page * page_map;
 
-	pte = *page_table;
 	new_page = __get_free_page(GFP_USER);
-	/* Did someone else copy this page for us while we slept? */
+	/* Did swap_out() unmapped the protected page while we slept? */
 	if (pte_val(*page_table) != pte_val(pte))
 		goto end_wp_page;
-	if (!pte_present(pte))
-		goto end_wp_page;
-	if (pte_write(pte))
-		goto end_wp_page;
 	old_page = pte_page(pte);
 	if (MAP_NR(old_page) >= max_mapnr)
 		goto bad_wp_page;
@@ -654,36 +633,42 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
 		delete_from_swap_cache(page_map);
 		/* FallThrough */
 	case 1:
-		/* We can release the kernel lock now.. */
-		unlock_kernel();
-
 		flush_cache_page(vma, address);
 		set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
 		flush_tlb_page(vma, address);
 end_wp_page:
+		/*
+		 * We can release the kernel lock now.. Now swap_out will see
+		 * a dirty page and so won't get confused and flush_tlb_page
+		 * won't SMP race. -Andrea
-Andrea + */ + unlock_kernel(); + if (new_page) free_page(new_page); return 1; } - unlock_kernel(); if (!new_page) - return 0; + goto no_new_page; - if (PageReserved(mem_map + MAP_NR(old_page))) + if (PageReserved(page_map)) ++vma->vm_mm->rss; copy_cow_page(old_page,new_page); flush_page_to_ram(old_page); flush_page_to_ram(new_page); flush_cache_page(vma, address); set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); - free_page(old_page); flush_tlb_page(vma, address); + unlock_kernel(); + __free_page(page_map); return 1; bad_wp_page: printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page); send_sig(SIGKILL, tsk, 1); +no_new_page: + unlock_kernel(); if (new_page) free_page(new_page); return 0; @@ -820,7 +805,7 @@ static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * v tsk->min_flt++; flush_page_to_ram(page); } - put_page(page_table, entry); + set_pte(page_table, entry); return 1; } @@ -879,7 +864,7 @@ static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, } else if (atomic_read(&mem_map[MAP_NR(page)].count) > 1 && !(vma->vm_flags & VM_SHARED)) entry = pte_wrprotect(entry); - put_page(page_table, entry); + set_pte(page_table, entry); /* no need to invalidate: a not-present page shouldn't be cached */ return 1; } @@ -913,7 +898,7 @@ static inline int handle_pte_fault(struct task_struct *tsk, flush_tlb_page(vma, address); if (write_access) { if (!pte_write(entry)) - return do_wp_page(tsk, vma, address, pte); + return do_wp_page(tsk, vma, address, pte, entry); entry = pte_mkdirty(entry); set_pte(pte, entry); diff --git a/mm/mlock.c b/mm/mlock.c index 1c9035095..4a938c958 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -115,10 +115,11 @@ static int mlock_fixup(struct vm_area_struct * vma, if (!retval) { /* keep track of amount of locked VM */ pages = (end - start) >> PAGE_SHIFT; - if (!(newflags & VM_LOCKED)) + if (newflags & VM_LOCKED) { pages = -pages; - vma->vm_mm->locked_vm += pages; - make_pages_present(start, end); + make_pages_present(start, end); + } + vma->vm_mm->locked_vm -= pages; } return retval; } diff --git a/mm/mmap.c b/mm/mmap.c index 9f7d32851..6e5eda00d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -84,6 +84,13 @@ static inline void remove_shared_vm_struct(struct vm_area_struct *vma) } } +/* + * sys_brk() for the most part doesn't need the global kernel + * lock, except when an application is doing something nasty + * like trying to un-brk an area that has already been mapped + * to a regular file. in this case, the unmapping will need + * to invoke file system routines that need the global lock. + */ asmlinkage unsigned long sys_brk(unsigned long brk) { unsigned long rlim, retval; @@ -92,20 +99,6 @@ asmlinkage unsigned long sys_brk(unsigned long brk) down(&mm->mmap_sem); - /* - * This lock-kernel is one of the main contention points for - * certain normal loads. And it really should not be here: almost - * everything in brk()/mmap()/munmap() is protected sufficiently by - * the mmap semaphore that we got above. - * - * We should move this into the few things that really want the - * lock, namely anything that actually touches a file descriptor - * etc. We can do all the normal anonymous mapping cases without - * ever getting the lock at all - the actual memory management - * code is already completely thread-safe. 
-	 */
-	lock_kernel();
-
 	if (brk < mm->end_code)
 		goto out;
 	newbrk = PAGE_ALIGN(brk);
@@ -134,15 +127,12 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
 		goto out;
 
 	/* Ok, looks good - let it rip. */
-	if (do_mmap(NULL, oldbrk, newbrk-oldbrk,
-		    PROT_READ|PROT_WRITE|PROT_EXEC,
-		    MAP_FIXED|MAP_PRIVATE, 0) != oldbrk)
+	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
 		goto out;
 set_brk:
 	mm->brk = brk;
 out:
 	retval = mm->brk;
-	unlock_kernel();
 	up(&mm->mmap_sem);
 	return retval;
 }
@@ -185,6 +175,9 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
 	if (len > TASK_SIZE || addr > TASK_SIZE-len)
 		return -EINVAL;
 
+	if (off & ~PAGE_MASK)
+		return -EINVAL;
+
 	/* offset overflow? */
 	if (off + len < off)
 		return -EINVAL;
@@ -467,6 +460,28 @@ struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
 	return NULL;
 }
 
+struct vm_area_struct * find_extend_vma(struct task_struct * tsk, unsigned long addr)
+{
+	struct vm_area_struct * vma;
+	unsigned long start;
+
+	addr &= PAGE_MASK;
+	vma = find_vma(tsk->mm,addr);
+	if (!vma)
+		return NULL;
+	if (vma->vm_start <= addr)
+		return vma;
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		return NULL;
+	start = vma->vm_start;
+	if (expand_stack(vma, addr))
+		return NULL;
+	if (vma->vm_flags & VM_LOCKED) {
+		make_pages_present(addr, start);
+	}
+	return vma;
+}
+
 /* Normal function to fix up a mapping
  * This function is the default for when an area has no specific
  * function. This may be used as part of a more specific routine.
@@ -662,6 +677,8 @@ int do_munmap(unsigned long addr, size_t len)
 		end = end > mpnt->vm_end ? mpnt->vm_end : end;
 		size = end - st;
 
+		lock_kernel();
+
 		if (mpnt->vm_ops && mpnt->vm_ops->unmap)
 			mpnt->vm_ops->unmap(mpnt, st, size);
 
@@ -676,6 +693,8 @@ int do_munmap(unsigned long addr, size_t len)
 		 * Fix the mapping, and free the old area if it wasn't reused.
 		 */
 		extra = unmap_fixup(mpnt, st, size, extra);
+
+		unlock_kernel();
 	}
 
 	/* Release the extra vma struct if it wasn't used */
@@ -693,13 +712,87 @@ asmlinkage int sys_munmap(unsigned long addr, size_t len)
 	int ret;
 
 	down(&current->mm->mmap_sem);
-	lock_kernel();
 	ret = do_munmap(addr, len);
-	unlock_kernel();
 	up(&current->mm->mmap_sem);
 	return ret;
 }
 
+/*
+ * this is really a simplified "do_mmap". it only handles
+ * anonymous maps. eventually we may be able to do some
+ * brk-specific accounting here.
+ */
+unsigned long do_brk(unsigned long addr, unsigned long len)
+{
+	struct mm_struct * mm = current->mm;
+	struct vm_area_struct * vma;
+	unsigned long flags, retval;
+
+	/*
+	 * mlock MCL_FUTURE?
+	 */
+	if (mm->def_flags & VM_LOCKED) {
+		unsigned long locked = mm->locked_vm << PAGE_SHIFT;
+		locked += len;
+		if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
+			return -EAGAIN;
+	}
+
+	/*
+	 * Clear old maps. this also does some error checking for us
+	 */
+	retval = do_munmap(addr, len);
+	if (retval != 0)
+		return retval;
+
+	/* Check against address space limits *after* clearing old maps... */
+	if ((mm->total_vm << PAGE_SHIFT) + len
+	    > current->rlim[RLIMIT_AS].rlim_cur)
+		return -ENOMEM;
+
+	if (mm->map_count > MAX_MAP_COUNT)
+		return -ENOMEM;
+
+	if (!vm_enough_memory(len >> PAGE_SHIFT))
+		return -ENOMEM;
+
+	/*
+	 * create a vma struct for an anonymous mapping
+	 */
+	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!vma)
+		return -ENOMEM;
+
+	vma->vm_mm = mm;
+	vma->vm_start = addr;
+	vma->vm_end = addr + len;
+	vma->vm_flags = vm_flags(PROT_READ|PROT_WRITE|PROT_EXEC,
+				MAP_FIXED|MAP_PRIVATE) | mm->def_flags;
+
+	vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+	vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
+	vma->vm_ops = NULL;
+	vma->vm_offset = 0;
+	vma->vm_file = NULL;
+	vma->vm_pte = 0;
+
+	/*
+	 * merge_segments may merge our vma, so we can't refer to it
+	 * after the call. Save the values we need now ...
+	 */
+	flags = vma->vm_flags;
+	addr = vma->vm_start;
+	insert_vm_struct(mm, vma);
+	merge_segments(mm, vma->vm_start, vma->vm_end);
+
+	mm->total_vm += len >> PAGE_SHIFT;
+	if (flags & VM_LOCKED) {
+		mm->locked_vm += len >> PAGE_SHIFT;
+		make_pages_present(addr, addr + len);
+	}
+	return addr;
+}
+
 /* Build the AVL tree corresponding to the VMA list. */
 void build_mmap_avl(struct mm_struct * mm)
 {
diff --git a/mm/mremap.c b/mm/mremap.c
index a10870318..b50e00dec 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -134,12 +134,14 @@ static inline unsigned long move_vma(struct vm_area_struct * vma,
 		new_vma->vm_start = new_addr;
 		new_vma->vm_end = new_addr+new_len;
 		new_vma->vm_offset = vma->vm_offset + (addr - vma->vm_start);
+		lock_kernel();
 		if (new_vma->vm_file)
 			new_vma->vm_file->f_count++;
 		if (new_vma->vm_ops && new_vma->vm_ops->open)
 			new_vma->vm_ops->open(new_vma);
 		insert_vm_struct(current->mm, new_vma);
 		merge_segments(current->mm, new_vma->vm_start, new_vma->vm_end);
+		unlock_kernel();
 		do_munmap(addr, old_len);
 		current->mm->total_vm += new_len >> PAGE_SHIFT;
 		if (new_vma->vm_flags & VM_LOCKED) {
@@ -166,7 +168,6 @@ asmlinkage unsigned long sys_mremap(unsigned long addr,
 	unsigned long ret = -EINVAL;
 
 	down(&current->mm->mmap_sem);
-	lock_kernel();
 	if (addr & ~PAGE_MASK)
 		goto out;
 	old_len = PAGE_ALIGN(old_len);
@@ -239,7 +240,6 @@ asmlinkage unsigned long sys_mremap(unsigned long addr,
 	else
 		ret = -ENOMEM;
 out:
-	unlock_kernel();
 	up(&current->mm->mmap_sem);
 	return ret;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 16bbf0179..8826b9af1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -323,6 +323,7 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m
 		--p;
 		atomic_set(&p->count, 0);
 		p->flags = (1 << PG_DMA) | (1 << PG_reserved);
+		init_waitqueue_head(&p->wait);
 	} while (p > mem_map);
 
 	for (i = 0 ; i < NR_MEM_LISTS ; i++) {
diff --git a/mm/page_io.c b/mm/page_io.c
index 498e4f63d..9f5e82446 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,7 +18,7 @@
 
 #include 
 
-static struct wait_queue * lock_queue = NULL;
+static DECLARE_WAIT_QUEUE_HEAD(lock_queue);
 
 /*
  * Reads or writes a swap page.
diff --git a/mm/slab.c b/mm/slab.c
index 29680bd68..ef7ec9279 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -438,7 +438,7 @@ long __init kmem_cache_init(long start, long end)
 #undef kmem_slab_offset
 #undef kmem_slab_diff
 
-	cache_chain_sem = MUTEX;
+	init_MUTEX(&cache_chain_sem);
 
 	size = cache_cache.c_offset + sizeof(kmem_bufctl_t);
 	size += (L1_CACHE_BYTES-1);
@@ -902,7 +902,7 @@ next:
 		left_over -= slab_align_size;
 	}
 
-	/* Offset must be a factor of the alignment. */
+	/* Offset must be a multiple of the alignment. */
 	offset += (align-1);
 	offset &= ~(align-1);
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 42ca4900a..de29f1006 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -5,6 +5,7 @@
  * Swap reorganised 29.12.95, Stephen Tweedie
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -473,6 +474,18 @@ int get_swaparea_info(char *buf)
 	return len;
 }
 
+int is_swap_partition(kdev_t dev) {
+	struct swap_info_struct *ptr = swap_info;
+	int i;
+
+	for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
+		if (ptr->flags & SWP_USED)
+			if (ptr->swap_device == dev)
+				return 1;
+	}
+	return 0;
+}
+
 /*
  * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
  *
@@ -491,7 +504,9 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
 	int swap_header_version;
 	int lock_map_size = PAGE_SIZE;
 	int nr_good_pages = 0;
+	unsigned long maxpages;
 	unsigned long tmp_lock_map = 0;
+	int swapfilesize;
 
 	lock_kernel();
 	if (!capable(CAP_SYS_ADMIN))
@@ -530,35 +545,41 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
 	error = -EINVAL;
 
 	if (S_ISBLK(swap_dentry->d_inode->i_mode)) {
-		p->swap_device = swap_dentry->d_inode->i_rdev;
-		set_blocksize(p->swap_device, PAGE_SIZE);
+		kdev_t dev = swap_dentry->d_inode->i_rdev;
+
+		p->swap_device = dev;
+		set_blocksize(dev, PAGE_SIZE);
 
 		filp.f_dentry = swap_dentry;
 		filp.f_mode = 3; /* read write */
 		error = blkdev_open(swap_dentry->d_inode, &filp);
 		if (error)
 			goto bad_swap_2;
-		set_blocksize(p->swap_device, PAGE_SIZE);
+		set_blocksize(dev, PAGE_SIZE);
 		error = -ENODEV;
-		if (!p->swap_device ||
-		    (blk_size[MAJOR(p->swap_device)] &&
-		     !blk_size[MAJOR(p->swap_device)][MINOR(p->swap_device)]))
+		if (!dev || (blk_size[MAJOR(dev)] &&
+		    !blk_size[MAJOR(dev)][MINOR(dev)]))
 			goto bad_swap;
 		error = -EBUSY;
 		for (i = 0 ; i < nr_swapfiles ; i++) {
 			if (i == type)
 				continue;
-			if (p->swap_device == swap_info[i].swap_device)
+			if (dev == swap_info[i].swap_device)
 				goto bad_swap;
 		}
+		swapfilesize = 0;
+		if (blk_size[MAJOR(dev)])
+			swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)]
+				/ (PAGE_SIZE / 1024);
 	} else if (S_ISREG(swap_dentry->d_inode->i_mode)) {
 		error = -EBUSY;
 		for (i = 0 ; i < nr_swapfiles ; i++) {
-			if (i == type)
+			if (i == type || !swap_info[i].swap_file)
 				continue;
 			if (swap_dentry->d_inode == swap_info[i].swap_file->d_inode)
 				goto bad_swap;
 		}
+		swapfilesize = swap_dentry->d_inode->i_size / PAGE_SIZE;
 	} else
 		goto bad_swap;
 
@@ -627,11 +648,13 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
 		p->highest_bit = swap_header->info.last_page - 1;
 		p->max = swap_header->info.last_page;
 
+		maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL));
+		if (p->max >= maxpages)
+			p->max = maxpages-1;
+
 		error = -EINVAL;
 		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
 			goto bad_swap;
-		if (p->max >= SWP_OFFSET(SWP_ENTRY(0,~0UL)))
-			goto bad_swap;
 
 		/* OK, set up the swap map and apply the bad block list */
 		if (!(p->swap_map = vmalloc (p->max * sizeof(short)))) {
@@ -654,6 +677,12 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
 		goto bad_swap;
 	}
 
+	if (swapfilesize && p->max > swapfilesize) {
+		printk(KERN_WARNING
+		       "Swap area shorter than signature indicates\n");
+		error = -EINVAL;
+		goto bad_swap;
+	}
 	if (!nr_good_pages) {
 		printk(KERN_WARNING "Empty swap-file\n");
 		error = -EINVAL;