author | Ralf Baechle <ralf@linux-mips.org> | 1998-08-25 09:12:35 +0000
committer | Ralf Baechle <ralf@linux-mips.org> | 1998-08-25 09:12:35 +0000
commit | c7fc24dc4420057f103afe8fc64524ebc25c5d37 (patch)
tree | 3682407a599b8f9f03fc096298134cafba1c9b2f /mm
parent | 1d793fade8b063fde3cf275bf1a5c2d381292cd9 (diff)
o Merge with Linux 2.1.116.
o New Newport console code.
o New G364 console code.
Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c | 433
-rw-r--r-- | mm/memory.c | 90
-rw-r--r-- | mm/mlock.c | 17
-rw-r--r-- | mm/mmap.c | 96
-rw-r--r-- | mm/mprotect.c | 15
-rw-r--r-- | mm/mremap.c | 11
-rw-r--r-- | mm/page_alloc.c | 150
-rw-r--r-- | mm/page_io.c | 24
-rw-r--r-- | mm/simp.c | 435
-rw-r--r-- | mm/slab.c | 97
-rw-r--r-- | mm/swap.c | 8
-rw-r--r-- | mm/swap_state.c | 27
-rw-r--r-- | mm/swapfile.c | 10
-rw-r--r-- | mm/vmalloc.c | 39
-rw-r--r-- | mm/vmscan.c | 101 |
15 files changed, 680 insertions, 873 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 3f2632a15..d0bf1270f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -117,12 +117,100 @@ repeat: } } +/* + * Remove a page from the page cache and free it. + */ +void remove_inode_page(struct page *page) +{ + remove_page_from_hash_queue(page); + remove_page_from_inode_queue(page); + __free_page(page); +} + +/* + * Check whether we can free this page. + */ +static inline int shrink_one_page(struct page *page, int gfp_mask) +{ + struct buffer_head *tmp, *bh; + + if (PageLocked(page)) + goto next; + if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) + goto next; + /* First of all, regenerate the page's referenced bit + * from any buffers in the page + */ + bh = page->buffers; + if (bh) { + tmp = bh; + do { + if (buffer_touched(tmp)) { + clear_bit(BH_Touched, &tmp->b_state); + set_bit(PG_referenced, &page->flags); + } + tmp = tmp->b_this_page; + } while (tmp != bh); + + /* Refuse to swap out all buffer pages */ + if ((buffermem >> PAGE_SHIFT) * 100 < (buffer_mem.min_percent * num_physpages)) + goto next; + } + + /* We can't throw away shared pages, but we do mark + them as referenced. This relies on the fact that + no page is currently in both the page cache and the + buffer cache; we'd have to modify the following + test to allow for that case. */ + + switch (atomic_read(&page->count)) { + case 1: + /* is it a swap-cache or page-cache page? */ + if (page->inode) { + if (test_and_clear_bit(PG_referenced, &page->flags)) { + touch_page(page); + break; + } + age_page(page); +#if 0 + if (page->age) + break; + if (page_cache_size * 100 < (page_cache.min_percent * num_physpages)) + break; +#endif + if (PageSwapCache(page)) { + delete_from_swap_cache(page); + return 1; + } + remove_inode_page(page); + return 1; + } + /* It's not a cache page, so we don't do aging. + * If it has been referenced recently, don't free it */ + if (test_and_clear_bit(PG_referenced, &page->flags)) + break; + + /* is it a buffer cache page? */ + if (bh && try_to_free_buffer(bh, &bh, 6)) + return 1; + break; + + default: + /* more than one user: we can't throw it away */ + set_bit(PG_referenced, &page->flags); + /* fall through */ + case 0: + /* nothing */ + } +next: + return 0; +} + int shrink_mmap(int priority, int gfp_mask) { static unsigned long clock = 0; - struct page * page; unsigned long limit = num_physpages; - struct buffer_head *tmp, *bh; + struct page * page; int count_max, count_min; count_max = (limit<<1) >> (priority>>1); @@ -130,79 +218,20 @@ int shrink_mmap(int priority, int gfp_mask) page = mem_map + clock; do { + if (PageSkip(page)) { + /* next_hash is overloaded for PageSkip */ + page = page->next_hash; + clock = page->map_nr; + } + + if (shrink_one_page(page, gfp_mask)) + return 1; count_max--; if (page->inode || page->buffers) count_min--; - - if (PageLocked(page)) - goto next; - if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) - goto next; - /* First of all, regenerate the page's referenced bit - from any buffers in the page */ - bh = page->buffers; - if (bh) { - tmp = bh; - do { - if (buffer_touched(tmp)) { - clear_bit(BH_Touched, &tmp->b_state); - set_bit(PG_referenced, &page->flags); - } - tmp = tmp->b_this_page; - } while (tmp != bh); - - /* Refuse to swap out all buffer pages */ - if ((buffermem >> PAGE_SHIFT) * 100 < (buffer_mem.min_percent * num_physpages)) - goto next; - } - - /* We can't throw away shared pages, but we do mark - them as referenced. 
This relies on the fact that - no page is currently in both the page cache and the - buffer cache; we'd have to modify the following - test to allow for that case. */ - - switch (atomic_read(&page->count)) { - case 1: - /* is it a swap-cache or page-cache page? */ - if (page->inode) { - if (test_and_clear_bit(PG_referenced, &page->flags)) { - touch_page(page); - break; - } - age_page(page); - if (page->age || page_cache_size * 100 < (page_cache.min_percent * num_physpages)) - break; - if (PageSwapCache(page)) { - delete_from_swap_cache(page); - return 1; - } - remove_page_from_hash_queue(page); - remove_page_from_inode_queue(page); - __free_page(page); - return 1; - } - /* It's not a cache page, so we don't do aging. - * If it has been referenced recently, don't free it */ - if (test_and_clear_bit(PG_referenced, &page->flags)) - break; - - /* is it a buffer cache page? */ - if ((gfp_mask & __GFP_IO) && bh && try_to_free_buffer(bh, &bh, 6)) - return 1; - break; - - default: - /* more than one users: we can't throw it away */ - set_bit(PG_referenced, &page->flags); - /* fall through */ - case 0: - /* nothing */ - } -next: page++; clock++; - if (clock >= limit) { + if (clock >= max_mapnr) { clock = 0; page = mem_map; } @@ -216,20 +245,17 @@ next: * free it from the page hash-queues etc, as we don't want to keep it * in-core unnecessarily. */ -unsigned long page_unuse(unsigned long page) +unsigned long page_unuse(struct page * page) { - struct page * p = mem_map + MAP_NR(page); - int count = atomic_read(&p->count); + int count = atomic_read(&page->count); if (count != 2) return count; - if (!p->inode) + if (!page->inode) return count; - if (PageSwapCache(p)) + if (PageSwapCache(page)) panic ("Doing a normal page_unuse of a swap cache page"); - remove_page_from_hash_queue(p); - remove_page_from_inode_queue(p); - free_page(page); + remove_inode_page(page); return 1; } @@ -303,6 +329,7 @@ static unsigned long try_to_read_ahead(struct file * file, */ page = mem_map + MAP_NR(page_cache); add_to_page_cache(page, inode, offset, hash); + set_bit(PG_referenced, &page->flags); inode->i_op->readpage(file, page); page_cache = 0; } @@ -568,6 +595,23 @@ static inline unsigned long generic_file_readahead(int reada_ok, return page_cache; } +/* + * "descriptor" for what we're up to with a read. + * This allows us to use the same read code yet + * have multiple different users of the data that + * we read from a file. + * + * The simplest case just copies the data to user + * mode. + */ +typedef struct { + size_t written; + size_t count; + char * buf; + int error; +} read_descriptor_t; + +typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long); /* * This is a generic file read routine, and uses the @@ -577,23 +621,14 @@ static inline unsigned long generic_file_readahead(int reada_ok, * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. 
*/ - -ssize_t generic_file_read(struct file * filp, char * buf, - size_t count, loff_t *ppos) +static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor) { struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; - ssize_t error, read; size_t pos, pgpos, page_cache; int reada_ok; int max_readahead = get_max_readahead(inode); - if (!access_ok(VERIFY_WRITE, buf, count)) - return -EFAULT; - if (!count) - return 0; - error = 0; - read = 0; page_cache = 0; pos = *ppos; @@ -621,12 +656,12 @@ ssize_t generic_file_read(struct file * filp, char * buf, * Then, at least MIN_READAHEAD if read ahead is ok, * and at most MAX_READAHEAD in all cases. */ - if (pos + count <= (PAGE_SIZE >> 1)) { + if (pos + desc->count <= (PAGE_SIZE >> 1)) { filp->f_ramax = 0; } else { unsigned long needed; - needed = ((pos + count) & PAGE_MASK) - pgpos; + needed = ((pos + desc->count) & PAGE_MASK) - pgpos; if (filp->f_ramax < needed) filp->f_ramax = needed; @@ -679,20 +714,20 @@ success: offset = pos & ~PAGE_MASK; nr = PAGE_SIZE - offset; - if (nr > count) - nr = count; if (nr > inode->i_size - pos) nr = inode->i_size - pos; - nr -= copy_to_user(buf, (void *) (page_address(page) + offset), nr); - release_page(page); - error = -EFAULT; - if (!nr) - break; - buf += nr; + + /* + * The actor routine returns how many bytes were actually used.. + * NOTE! This may not be the same as how much of a user buffer + * we filled up (we may be padding etc), so we can only update + * "pos" here (the actor routine has to update the user buffer + * pointers and the remaining count). + */ + nr = actor(desc, (const char *) (page_address(page) + offset), nr); pos += nr; - read += nr; - count -= nr; - if (count) + release_page(page); + if (nr && desc->count) continue; break; } @@ -710,7 +745,7 @@ no_cached_page: */ if (page_cache) continue; - error = -ENOMEM; + desc->error = -ENOMEM; break; } @@ -739,11 +774,14 @@ no_cached_page: if (reada_ok && filp->f_ramax > MIN_READAHEAD) filp->f_ramax = MIN_READAHEAD; - error = inode->i_op->readpage(filp, page); - if (!error) - goto found_page; - release_page(page); - break; + { + int error = inode->i_op->readpage(filp, page); + if (!error) + goto found_page; + desc->error = error; + release_page(page); + break; + } page_read_error: /* @@ -751,15 +789,18 @@ page_read_error: * Try to re-read it _once_. We do this synchronously, * because this happens only if there were errors. */ - error = inode->i_op->readpage(filp, page); - if (!error) { - wait_on_page(page); - if (PageUptodate(page) && !PageError(page)) - goto success; - error = -EIO; /* Some unspecified error occurred.. */ + { + int error = inode->i_op->readpage(filp, page); + if (!error) { + wait_on_page(page); + if (PageUptodate(page) && !PageError(page)) + goto success; + error = -EIO; /* Some unspecified error occurred.. 
*/ + } + desc->error = error; + release_page(page); + break; } - release_page(page); - break; } *ppos = pos; @@ -767,9 +808,159 @@ page_read_error: if (page_cache) free_page(page_cache); UPDATE_ATIME(inode); - if (!read) - read = error; - return read; +} + +static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size) +{ + unsigned long left; + unsigned long count = desc->count; + + if (size > count) + size = count; + left = __copy_to_user(desc->buf, area, size); + if (left) { + size -= left; + desc->error = -EFAULT; + } + desc->count = count - size; + desc->written += size; + desc->buf += size; + return size; +} + +/* + * This is the "read()" routine for all filesystems + * that can use the page cache directly. + */ +ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos) +{ + ssize_t retval; + + retval = -EFAULT; + if (access_ok(VERIFY_WRITE, buf, count)) { + retval = 0; + if (count) { + read_descriptor_t desc; + + desc.written = 0; + desc.count = count; + desc.buf = buf; + desc.error = 0; + do_generic_file_read(filp, ppos, &desc, file_read_actor); + + retval = desc.written; + if (!retval) + retval = desc.error; + } + } + return retval; +} + +static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size) +{ + ssize_t written; + unsigned long count = desc->count; + struct file *file = (struct file *) desc->buf; + struct inode *inode = file->f_dentry->d_inode; + mm_segment_t old_fs; + + if (size > count) + size = count; + down(&inode->i_sem); + old_fs = get_fs(); + set_fs(KERNEL_DS); + written = file->f_op->write(file, area, size, &file->f_pos); + set_fs(old_fs); + up(&inode->i_sem); + if (written < 0) { + desc->error = written; + written = 0; + } + desc->count = count - written; + desc->written += written; + return written; +} + +asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count) +{ + ssize_t retval; + struct file * in_file, * out_file; + struct inode * in_inode, * out_inode; + + lock_kernel(); + + /* + * Get input file, and verify that it is ok.. + */ + retval = -EBADF; + in_file = fget(in_fd); + if (!in_file) + goto out; + if (!(in_file->f_mode & FMODE_READ)) + goto fput_in; + retval = -EINVAL; + in_inode = in_file->f_dentry->d_inode; + if (!in_inode) + goto fput_in; + if (!in_inode->i_op || !in_inode->i_op->readpage) + goto fput_in; + retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count); + if (retval) + goto fput_in; + + /* + * Get output file, and verify that it is ok.. 
+ */ + retval = -EBADF; + out_file = fget(out_fd); + if (!out_file) + goto fput_in; + if (!(out_file->f_mode & FMODE_WRITE)) + goto fput_out; + retval = -EINVAL; + if (!out_file->f_op || !out_file->f_op->write) + goto fput_out; + out_inode = out_file->f_dentry->d_inode; + if (!out_inode) + goto fput_out; + retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count); + if (retval) + goto fput_out; + + retval = 0; + if (count) { + read_descriptor_t desc; + loff_t pos = 0, *ppos; + + retval = -EFAULT; + ppos = &in_file->f_pos; + if (offset) { + if (get_user(pos, offset)) + goto fput_out; + ppos = &pos; + } + + desc.written = 0; + desc.count = count; + desc.buf = (char *) out_file; + desc.error = 0; + do_generic_file_read(in_file, ppos, &desc, file_send_actor); + + retval = desc.written; + if (!retval) + retval = desc.error; + if (offset) + put_user(pos, offset); + } + + +fput_out: + fput(out_file); +fput_in: + fput(in_file); +out: + unlock_kernel(); + return retval; } /* @@ -903,7 +1094,7 @@ page_read_error: goto success; /* - * Uhhuh.. Things didn't work out. Return zero to tell the + * Things didn't work out. Return zero to tell the * mm layer so, possibly freeing the page cache page first. */ failure: @@ -1257,6 +1448,7 @@ asmlinkage int sys_msync(unsigned long start, size_t len, int flags) struct vm_area_struct * vma; int unmapped_error, error = -EINVAL; + down(¤t->mm->mmap_sem); lock_kernel(); if (start & ~PAGE_MASK) goto out; @@ -1304,6 +1496,7 @@ asmlinkage int sys_msync(unsigned long start, size_t len, int flags) } out: unlock_kernel(); + up(¤t->mm->mmap_sem); return error; } @@ -1412,7 +1605,7 @@ page_wait: set_bit(PG_uptodate, &page->flags); do_update_page: - /* Alright, the page is there. Now update it. */ + /* All right, the page is there. Now update it. */ status = inode->i_op->updatepage(file, page, buf, offset, bytes, sync); done_with_page: diff --git a/mm/memory.c b/mm/memory.c index af4297702..77a814f07 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -44,6 +44,8 @@ #include <linux/mman.h> #include <linux/mm.h> #include <linux/swap.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -68,8 +70,6 @@ static inline void copy_cow_page(unsigned long from, unsigned long to) copy_page(to, from); } -#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) - mem_map_t * mem_map = NULL; /* @@ -121,22 +121,41 @@ static inline void free_one_pgd(pgd_t * dir) pmd_free(pmd); } +/* Low and high watermarks for page table cache. + The system should try to have pgt_water[0] <= cache elements <= pgt_water[1] + */ +int pgt_cache_water[2] = { 25, 50 }; + +/* Returns the number of pages freed */ +int check_pgt_cache(void) +{ + return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]); +} + + /* * This function clears all user-level page tables of a process - this * is needed by execve(), so that old pages aren't in the way. 
*/ void clear_page_tables(struct task_struct * tsk) { + pgd_t * page_dir = tsk->mm->pgd; int i; - pgd_t * page_dir; - page_dir = tsk->mm->pgd; - if (!page_dir || page_dir == swapper_pg_dir) { - printk("%s trying to clear kernel page-directory: not good\n", tsk->comm); - return; - } + if (!page_dir || page_dir == swapper_pg_dir) + goto out_bad; for (i = 0 ; i < USER_PTRS_PER_PGD ; i++) free_one_pgd(page_dir + i); + + /* keep the page table cache within bounds */ + check_pgt_cache(); + return; + +out_bad: + printk(KERN_ERR + "clear_page_tables: %s trying to clear kernel pgd\n", + tsk->comm); + return; } /* @@ -146,30 +165,34 @@ void clear_page_tables(struct task_struct * tsk) */ void free_page_tables(struct mm_struct * mm) { + pgd_t * page_dir = mm->pgd; int i; - pgd_t * page_dir; - page_dir = mm->pgd; - if (page_dir) { - if (page_dir == swapper_pg_dir) { - printk("free_page_tables: Trying to free kernel pgd\n"); - return; - } - for (i = 0 ; i < USER_PTRS_PER_PGD ; i++) - free_one_pgd(page_dir + i); - pgd_free(page_dir); - } + if (!page_dir) + goto out; + if (page_dir == swapper_pg_dir) + goto out_bad; + for (i = 0 ; i < USER_PTRS_PER_PGD ; i++) + free_one_pgd(page_dir + i); + pgd_free(page_dir); + + /* keep the page table cache within bounds */ + check_pgt_cache(); +out: + return; + +out_bad: + printk(KERN_ERR + "free_page_tables: Trying to free kernel pgd\n"); + return; } int new_page_tables(struct task_struct * tsk) { - pgd_t * page_dir, * new_pg; + pgd_t * new_pg; if (!(new_pg = pgd_alloc())) return -ENOMEM; - page_dir = pgd_offset(&init_mm, 0); - memcpy(new_pg + USER_PTRS_PER_PGD, page_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof (pgd_t)); SET_PAGE_DIR(tsk, new_pg); tsk->mm->pgd = new_pg; return 0; @@ -898,6 +921,9 @@ static inline void handle_pte_fault(struct task_struct *tsk, do_wp_page(tsk, vma, address, write_access, pte); } +/* + * By the time we get here, we already hold the mm semaphore + */ void handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma, unsigned long address, int write_access) { @@ -912,9 +938,27 @@ void handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma, pte = pte_alloc(pmd, address); if (!pte) goto no_memory; + lock_kernel(); handle_pte_fault(tsk, vma, address, write_access, pte); + unlock_kernel(); update_mmu_cache(vma, address, *pte); return; no_memory: oom(tsk); } + +/* + * Simplistic page force-in.. 
+ */ +void make_pages_present(unsigned long addr, unsigned long end) +{ + int write; + struct vm_area_struct * vma; + + vma = find_vma(current->mm, addr); + write = (vma->vm_flags & VM_WRITE) != 0; + while (addr < end) { + handle_mm_fault(current, vma, addr, write); + addr += PAGE_SIZE; + } +} diff --git a/mm/mlock.c b/mm/mlock.c index 3a322f8a5..527443946 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -126,14 +126,7 @@ static int mlock_fixup(struct vm_area_struct * vma, if (!(newflags & VM_LOCKED)) pages = -pages; vma->vm_mm->locked_vm += pages; - - if (newflags & VM_LOCKED) - while (start < end) { - char c; - get_user(c,(char *) start); - __asm__ __volatile__("": :"r" (c)); - start += PAGE_SIZE; - } + make_pages_present(start, end); } return retval; } @@ -192,6 +185,7 @@ asmlinkage int sys_mlock(unsigned long start, size_t len) unsigned long lock_limit; int error = -ENOMEM; + down(¤t->mm->mmap_sem); lock_kernel(); len = (len + (start & ~PAGE_MASK) + ~PAGE_MASK) & PAGE_MASK; start &= PAGE_MASK; @@ -214,6 +208,7 @@ asmlinkage int sys_mlock(unsigned long start, size_t len) error = do_mlock(start, len, 1); out: unlock_kernel(); + up(¤t->mm->mmap_sem); return error; } @@ -221,11 +216,13 @@ asmlinkage int sys_munlock(unsigned long start, size_t len) { int ret; + down(¤t->mm->mmap_sem); lock_kernel(); len = (len + (start & ~PAGE_MASK) + ~PAGE_MASK) & PAGE_MASK; start &= PAGE_MASK; ret = do_mlock(start, len, 0); unlock_kernel(); + up(¤t->mm->mmap_sem); return ret; } @@ -263,6 +260,7 @@ asmlinkage int sys_mlockall(int flags) unsigned long lock_limit; int ret = -EINVAL; + down(¤t->mm->mmap_sem); lock_kernel(); if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE))) goto out; @@ -282,6 +280,7 @@ asmlinkage int sys_mlockall(int flags) ret = do_mlockall(flags); out: unlock_kernel(); + up(¤t->mm->mmap_sem); return ret; } @@ -289,8 +288,10 @@ asmlinkage int sys_munlockall(void) { int ret; + down(¤t->mm->mmap_sem); lock_kernel(); ret = do_mlockall(0); unlock_kernel(); + up(¤t->mm->mmap_sem); return ret; } @@ -57,19 +57,19 @@ int vm_enough_memory(long pages) * simple, it hopefully works in most obvious cases.. Easy to * fool it, but this should catch most mistakes. */ - long freepages; + long free; /* Sometimes we want to use more memory than we have. */ if (sysctl_overcommit_memory) return 1; - freepages = buffermem >> PAGE_SHIFT; - freepages += page_cache_size; - freepages >>= 1; - freepages += nr_free_pages; - freepages += nr_swap_pages; - freepages -= num_physpages >> 4; - return freepages > pages; + free = buffermem >> PAGE_SHIFT; + free += page_cache_size; + free >>= 1; + free += nr_free_pages; + free += nr_swap_pages; + free -= num_physpages >> 4; + return free > pages; } /* Remove one vm structure from the inode's i_mmap ring. */ @@ -92,6 +92,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk) unsigned long newbrk, oldbrk; struct mm_struct *mm = current->mm; + down(&mm->mmap_sem); lock_kernel(); if (brk < mm->end_code) goto out; @@ -109,9 +110,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk) /* Check against rlimit and stack.. */ rlim = current->rlim[RLIMIT_DATA].rlim_cur; - if (rlim >= RLIM_INFINITY) - rlim = ~0; - if (brk - mm->end_code > rlim) + if (rlim < RLIM_INFINITY && brk - mm->end_code > rlim) goto out; /* Check against existing mmap mappings. 
*/ @@ -132,6 +131,7 @@ set_brk: out: retval = mm->brk; unlock_kernel(); + up(&mm->mmap_sem); return retval; } @@ -196,9 +196,14 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, if ((prot & PROT_WRITE) && !(file->f_mode & 2)) return -EACCES; + /* Make sure we don't allow writing to an append-only file.. */ + if (IS_APPEND(file->f_dentry->d_inode) && (file->f_mode & 2)) + return -EACCES; + /* make sure there are no mandatory locks on the file. */ if (locks_verify_locked(file->f_dentry->d_inode)) return -EAGAIN; + /* fall through */ case MAP_PRIVATE: if (!(file->f_mode & 1)) @@ -316,16 +321,9 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, merge_segments(mm, vma->vm_start, vma->vm_end); mm->total_vm += len >> PAGE_SHIFT; - if ((flags & VM_LOCKED) && !(flags & VM_IO)) { - unsigned long start = addr; + if (flags & VM_LOCKED) { mm->locked_vm += len >> PAGE_SHIFT; - do { - char c; - get_user(c,(char *) start); - len -= PAGE_SIZE; - start += PAGE_SIZE; - __asm__ __volatile__("": :"r" (c)); - } while (len > 0); + make_pages_present(addr, addr + len); } return addr; @@ -428,30 +426,10 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr, insert_vm_struct(current->mm, mpnt); } - /* Close the current area ... */ - if (area->vm_ops && area->vm_ops->close) { - end = area->vm_end; /* save new end */ - area->vm_end = area->vm_start; - area->vm_ops->close(area); - area->vm_end = end; - } - /* ... then reopen and reinsert. */ - if (area->vm_ops && area->vm_ops->open) - area->vm_ops->open(area); insert_vm_struct(current->mm, area); return 1; } -asmlinkage int sys_munmap(unsigned long addr, size_t len) -{ - int ret; - - lock_kernel(); - ret = do_munmap(addr, len); - unlock_kernel(); - return ret; -} - /* Munmap is split into 2 main parts -- this part which finds * what needs doing, and the areas themselves, which do the * work. This now handles partial unmappings. @@ -460,7 +438,7 @@ asmlinkage int sys_munmap(unsigned long addr, size_t len) int do_munmap(unsigned long addr, size_t len) { struct mm_struct * mm; - struct vm_area_struct *mpnt, *next, *free, *extra; + struct vm_area_struct *mpnt, *free, *extra; int freed; if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr) @@ -481,6 +459,11 @@ int do_munmap(unsigned long addr, size_t len) if (!mpnt) return 0; + /* If we'll make "hole", check the vm areas limit */ + if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) && + mm->map_count > MAX_MAP_COUNT) + return -ENOMEM; + /* * We may need one additional vma to fix up the mappings ... * and this is the last chance for an easy error exit. @@ -489,9 +472,7 @@ int do_munmap(unsigned long addr, size_t len) if (!extra) return -ENOMEM; - next = mpnt->vm_next; - - /* we have mpnt->vm_next = next and addr < mpnt->vm_end */ + /* we have addr < mpnt->vm_end */ free = NULL; for ( ; mpnt && mpnt->vm_start < addr+len; ) { struct vm_area_struct *next = mpnt->vm_next; @@ -505,13 +486,6 @@ int do_munmap(unsigned long addr, size_t len) mpnt = next; } - if (free && (free->vm_start < addr) && (free->vm_end > addr+len)) { - if (mm->map_count > MAX_MAP_COUNT) { - kmem_cache_free(vm_area_cachep, extra); - return -ENOMEM; - } - } - /* Ok - we have the memory areas we should free on the 'free' list, * so release them, and unmap the page range.. 
* If the one of the segments is only being partially unmapped, @@ -555,6 +529,18 @@ int do_munmap(unsigned long addr, size_t len) return 0; } +asmlinkage int sys_munmap(unsigned long addr, size_t len) +{ + int ret; + + down(¤t->mm->mmap_sem); + lock_kernel(); + ret = do_munmap(addr, len); + unlock_kernel(); + up(¤t->mm->mmap_sem); + return ret; +} + /* Release all mmaps. */ void exit_mmap(struct mm_struct * mm) { @@ -630,13 +616,13 @@ void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp) * This assumes that the list is ordered by address. * We don't need to traverse the entire list, only those segments * which intersect or are adjacent to a given interval. + * + * We must already hold the mm semaphore when we get here.. */ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) { struct vm_area_struct *prev, *mpnt, *next; - down(&mm->mmap_sem); - prev = NULL; mpnt = mm->mmap; while(mpnt && mpnt->vm_end <= start_addr) { @@ -644,7 +630,7 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l mpnt = mpnt->vm_next; } if (!mpnt) - goto no_vma; + return; next = mpnt->vm_next; @@ -700,8 +686,6 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l mpnt = prev; } mm->mmap_cache = NULL; /* Kill the cache. */ -no_vma: - up(&mm->mmap_sem); } __initfunc(void vma_init(void)) diff --git a/mm/mprotect.c b/mm/mprotect.c index 0c5dac4cd..cc78e10ab 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -208,18 +208,20 @@ asmlinkage int sys_mprotect(unsigned long start, size_t len, unsigned long prot) struct vm_area_struct * vma, * next; int error = -EINVAL; - lock_kernel(); if (start & ~PAGE_MASK) - goto out; + return -EINVAL; len = (len + ~PAGE_MASK) & PAGE_MASK; end = start + len; if (end < start) - goto out; + return -EINVAL; if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) - goto out; - error = 0; + return -EINVAL; if (end == start) - goto out; + return 0; + + down(¤t->mm->mmap_sem); + lock_kernel(); + vma = find_vma(current->mm, start); error = -EFAULT; if (!vma || vma->vm_start > start) @@ -256,5 +258,6 @@ asmlinkage int sys_mprotect(unsigned long start, size_t len, unsigned long prot) merge_segments(current->mm, start, end); out: unlock_kernel(); + up(¤t->mm->mmap_sem); return error; } diff --git a/mm/mremap.c b/mm/mremap.c index a31a0ae14..cd7a7eb4a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -21,6 +21,8 @@ #include <asm/system.h> #include <asm/pgtable.h> +extern int vm_enough_memory(long pages); + static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr) { pgd_t * pgd; @@ -167,6 +169,7 @@ asmlinkage unsigned long sys_mremap(unsigned long addr, struct vm_area_struct *vma; unsigned long ret = -EINVAL; + down(¤t->mm->mmap_sem); lock_kernel(); if (addr & ~PAGE_MASK) goto out; @@ -178,7 +181,7 @@ asmlinkage unsigned long sys_mremap(unsigned long addr, * the unnecessary pages.. */ ret = addr; - if (old_len > new_len) { + if (old_len >= new_len) { do_munmap(addr+new_len, old_len - new_len); goto out; } @@ -204,6 +207,11 @@ asmlinkage unsigned long sys_mremap(unsigned long addr, if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) > current->rlim[RLIMIT_AS].rlim_cur) goto out; + /* Private writable mapping? Check memory availability.. */ + if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE && + !(flags & MAP_NORESERVE) && + !vm_enough_memory((new_len - old_len) >> PAGE_SHIFT)) + goto out; /* old_len exactly to the end of the area.. 
*/ if (old_len == vma->vm_end - addr && @@ -233,5 +241,6 @@ asmlinkage unsigned long sys_mremap(unsigned long addr, ret = -ENOMEM; out: unlock_kernel(); + up(¤t->mm->mmap_sem); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d61d74f44..c51db59d9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -98,53 +98,33 @@ static inline void remove_mem_queue(struct page * entry) * * Hint: -mask = 1+~mask */ -static spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED; +spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED; /* - * This routine is used by the kernel swap deamon to determine + * This routine is used by the kernel swap daemon to determine * whether we have "enough" free pages. It is fairly arbitrary, - * but this had better return false if any reasonable "get_free_page()" - * allocation could currently fail.. + * having a low-water and high-water mark. * - * This will return zero if no list was found, non-zero - * if there was memory (the bigger, the better). + * This returns: + * 0 - urgent need for memory + * 1 - need some memory, but do it slowly in the background + * 2 - no need to even think about it. */ -int free_memory_available(int nr) +int free_memory_available(void) { - int retval = 0; - unsigned long flags; - struct free_area_struct * list; + static int available = 1; - /* - * If we have more than about 3% to 5% of all memory free, - * consider it to be good enough for anything. - * It may not be, due to fragmentation, but we - * don't want to keep on forever trying to find - * free unfragmented memory. - * Added low/high water marks to avoid thrashing -- Rik. - */ - if (nr_free_pages > (nr ? freepages.low : freepages.high)) - return nr+1; + if (nr_free_pages < freepages.low) { + available = 0; + return 0; + } - list = free_area + NR_MEM_LISTS; - spin_lock_irqsave(&page_alloc_lock, flags); - /* We fall through the loop if the list contains one - * item. -- thanks to Colin Plumb <colin@nyx.net> - */ - do { - list--; - /* Empty list? Bad - we need more memory */ - if (list->next == memory_head(list)) - break; - /* One item on the list? Look further */ - if (list->next->next == memory_head(list)) - continue; - /* More than one item? 
We're ok */ - retval = nr + 1; - break; - } while (--nr >= 0); - spin_unlock_irqrestore(&page_alloc_lock, flags); - return retval; + if (nr_free_pages > freepages.high) { + available = 1; + return 2; + } + + return available; } static inline void free_pages_ok(unsigned long map_nr, unsigned long order) @@ -182,9 +162,11 @@ void __free_page(struct page *page) if (PageSwapCache(page)) panic ("Freeing swap cache page"); free_pages_ok(page->map_nr, 0); + return; } if (PageSwapCache(page) && atomic_read(&page->count) == 1) - panic ("Releasing swap cache page"); + printk(KERN_WARNING "VM: Releasing swap cache page at %p", + __builtin_return_address(0)); } void free_pages(unsigned long addr, unsigned long order) @@ -202,8 +184,9 @@ void free_pages(unsigned long addr, unsigned long order) return; } if (PageSwapCache(map) && atomic_read(&map->count) == 1) - panic ("Releasing swap cache pages at %p", - __builtin_return_address(0)); + printk(KERN_WARNING + "VM: Releasing swap cache pages at %p", + __builtin_return_address(0)); } } @@ -214,13 +197,11 @@ void free_pages(unsigned long addr, unsigned long order) change_bit((index) >> (1+(order)), (area)->map) #define CAN_DMA(x) (PageDMA(x)) #define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT)) -#define RMQUEUE(order, maxorder, dma) \ +#define RMQUEUE(order, dma) \ do { struct free_area_struct * area = free_area+order; \ unsigned long new_order = order; \ do { struct page *prev = memory_head(area), *ret = prev->next; \ while (memory_head(area) != ret) { \ - if (new_order >= maxorder && ret->next == prev) \ - break; \ if (!dma || CAN_DMA(ret)) { \ unsigned long map_nr = ret->map_nr; \ (prev->next = ret->next)->prev = prev; \ @@ -252,39 +233,46 @@ do { unsigned long size = 1 << high; \ unsigned long __get_free_pages(int gfp_mask, unsigned long order) { - unsigned long flags, maxorder; + unsigned long flags; if (order >= NR_MEM_LISTS) goto nopage; - /* - * "maxorder" is the highest order number that we're allowed - * to empty in order to find a free page.. - */ - maxorder = NR_MEM_LISTS-1; - if (gfp_mask & __GFP_HIGH) - maxorder = NR_MEM_LISTS; - - if (in_interrupt() && (gfp_mask & __GFP_WAIT)) { - static int count = 0; - if (++count < 5) { - printk("gfp called nonatomically from interrupt %p\n", - return_address()); - gfp_mask &= ~__GFP_WAIT; + if (gfp_mask & __GFP_WAIT) { + if (in_interrupt()) { + static int count = 0; + if (++count < 5) { + printk("gfp called nonatomically from interrupt %p\n", + __builtin_return_address(0)); + } + goto nopage; } - } - for (;;) { - spin_lock_irqsave(&page_alloc_lock, flags); - RMQUEUE(order, maxorder, (gfp_mask & GFP_DMA)); - spin_unlock_irqrestore(&page_alloc_lock, flags); - if (!(gfp_mask & __GFP_WAIT)) - break; - if (!try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX)) - break; - gfp_mask &= ~__GFP_WAIT; /* go through this only once */ - maxorder = NR_MEM_LISTS; /* Allow anything this time */ + if (freepages.min > nr_free_pages) { + int freed; + freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX); + /* + * Low priority (user) allocations must not + * succeed if we didn't have enough memory + * and we couldn't get more.. + */ + if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH))) + goto nopage; + } } + spin_lock_irqsave(&page_alloc_lock, flags); + RMQUEUE(order, (gfp_mask & GFP_DMA)); + spin_unlock_irqrestore(&page_alloc_lock, flags); + + /* + * If we failed to find anything, we'll return NULL, but we'll + * wake up kswapd _now_ ad even wait for it synchronously if + * we can.. 
This way we'll at least make some forward progress + * over time. + */ + wake_up(&kswapd_wait); + if (gfp_mask & __GFP_WAIT) + schedule(); nopage: return 0; } @@ -300,6 +288,11 @@ void show_free_areas(void) unsigned long total = 0; printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10)); + printk("Free: %d (%d %d %d)\n", + nr_free_pages, + freepages.min, + freepages.low, + freepages.high); spin_lock_irqsave(&page_alloc_lock, flags); for (order=0 ; order < NR_MEM_LISTS; order++) { struct page * tmp; @@ -329,22 +322,23 @@ __initfunc(unsigned long free_area_init(unsigned long start_mem, unsigned long e { mem_map_t * p; unsigned long mask = PAGE_MASK; - int i; + unsigned long i; /* * Select nr of pages we try to keep free for important stuff - * with a minimum of 48 pages and a maximum of 256 pages, so + * with a minimum of 10 pages and a maximum of 256 pages, so * that we don't waste too much memory on large systems. - * This is totally arbitrary. + * This is fairly arbitrary, but based on some behaviour + * analysis. */ i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7); - if (i < 48) - i = 48; + if (i < 10) + i = 10; if (i > 256) i = 256; freepages.min = i; - freepages.low = i << 1; - freepages.high = freepages.low + i; + freepages.low = i * 2; + freepages.high = i * 3; mem_map = (mem_map_t *) LONG_ALIGN(start_mem); p = mem_map + MAP_NR(end_mem); start_mem = LONG_ALIGN((unsigned long) p); diff --git a/mm/page_io.c b/mm/page_io.c index eb436f7b7..7e5a35186 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -74,18 +74,19 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) return; } if (p->swap_map && !p->swap_map[offset]) { - printk("Hmm.. Trying to %s unallocated swap (%08lx)\n", - (rw == READ) ? "read" : "write", - entry); + printk(KERN_ERR "rw_swap_page: " + "Trying to %s unallocated swap (%08lx)\n", + (rw == READ) ? "read" : "write", entry); return; } if (!(p->flags & SWP_USED)) { - printk("Trying to swap to unused swap-device\n"); + printk(KERN_ERR "rw_swap_page: " + "Trying to swap to unused swap-device\n"); return; } if (!PageLocked(page)) { - printk("VM: swap page is unlocked\n"); + printk(KERN_ERR "VM: swap page is unlocked\n"); return; } @@ -111,11 +112,11 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) * hashing for locked pages. */ if (!PageSwapCache(page)) { - printk("VM: swap page is not in swap cache\n"); + printk(KERN_ERR "VM: swap page is not in swap cache\n"); return; } if (page->offset != entry) { - printk ("swap entry mismatch"); + printk (KERN_ERR "VM: swap entry mismatch\n"); return; } @@ -142,7 +143,7 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) if (swapf->i_op->bmap == NULL && swapf->i_op->smap != NULL){ /* - With MsDOS, we use msdos_smap which return + With MS-DOS, we use msdos_smap which return a sector number (not a cluster or block number). It is a patch to enable the UMSDOS project. Other people are working on better solution. @@ -179,11 +180,14 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) clear_bit(PG_locked, &page->flags); wake_up(&page->wait); } else - printk("rw_swap_page: no swap file or device\n"); + printk(KERN_ERR "rw_swap_page: no swap file or device\n"); + /* This shouldn't happen, but check to be sure. 
*/ + if (atomic_read(&page->count) == 1) + printk(KERN_ERR "rw_swap_page: page unused while waiting!\n"); atomic_dec(&page->count); if (offset && !test_and_clear_bit(offset,p->swap_lockmap)) - printk("rw_swap_page: lock already cleared\n"); + printk(KERN_ERR "rw_swap_page: lock already cleared\n"); wake_up(&lock_queue); #ifdef DEBUG_SWAP printk ("DebugVM: %s_swap_page finished on page %p (count %d)\n", diff --git a/mm/simp.c b/mm/simp.c deleted file mode 100644 index 581cde3d7..000000000 --- a/mm/simp.c +++ /dev/null @@ -1,435 +0,0 @@ -#define NULL 0 -/* - * mm/simp.c -- simple allocator for cached objects - * - * (C) 1997 Thomas Schoebel-Theuer - */ - -#include <linux/simp.h> -#include <linux/tasks.h> -#include <linux/smp.h> -#include <linux/mm.h> -#include <asm/spinlock.h> - -/* The next two defines can be independently enabled for debugging */ -/*#define DEBUG*/ -/*#define DEAD_BEEF*/ - -#ifdef DEAD_BEEF -#define DEBUG_BEEF 1 -#else -#define DEBUG_BEEF 0 -#endif - -#ifdef __SMP__ -#define NR_PROCESSORS NR_CPUS -#define GLOBAL_SIZE CHUNK_SIZE -#else -#define NR_PROCESSORS 1 -#define GLOBAL_SIZE PAGE_SIZE -#endif - -#define POSTBUFFER_SIZE 63 -#define ORDER 2 -#define CHUNK_SIZE (PAGE_SIZE*(1<<ORDER)) -#define CHUNK_BASE(ptr) (struct header*)(((unsigned long)(ptr)) & ~(CHUNK_SIZE-1)) -#define CHUNK_END(hdr) (void**)((char*)(hdr) + CHUNK_SIZE) - -#define COLOR_INCREMENT (8*sizeof(void*)) /* should be 1 cache line */ -#define ALIGN_CACHE(adr) ((((((unsigned long)adr) - 1) / COLOR_INCREMENT) + 1) * COLOR_INCREMENT) -#define HEADER_SIZE ALIGN_CACHE(sizeof(struct header)) -#define ELEM_SIZE ALIGN_CACHE(sizeof(struct elem)) -#define FILL_TYPE(name,wrongsize) char name[ALIGN_CACHE(wrongsize)-(wrongsize)] - -#define MAX_SIMPS ((GLOBAL_SIZE / sizeof(struct simp)) - 1) - -struct header { /* this is at the beginning of each memory region */ - /* 1st cache line */ - void ** index; - void ** fresh; - struct simp * father; - void ** emptypos; - struct header * next; - structor again_ctor; - structor first_ctor; - void * fill[1]; -#ifdef DEBUG - /* 2nd cache line */ - char magic[32]; -#endif -}; - -struct per_processor { - void ** buffer_pos; - void * postbuffer[POSTBUFFER_SIZE]; -}; - -struct simp { - /* 1st cache lines */ - struct per_processor private[NR_PROCESSORS]; - /* next cache line */ - struct header * usable_list; - spinlock_t lock; - /* This value is negative on Alpha SMP. 
*/ - /* char fill[sizeof(void*) - sizeof(spinlock_t)]; */ - long real_size; - long max_elems; - structor again_ctor; - structor first_ctor; - structor dtor; - long fill2; - /* next cache line */ - long create_offset; - long color; - long max_color; - long size; - long fill3[4]; - /* next cache line */ - char name[32]; -}; - -struct global_data { - /* 1st cache line */ - long changed_flag; - long nr_simps; - spinlock_t lock; - char fill[(6+8)*sizeof(void*)+sizeof(void*)-sizeof(spinlock_t)]; - /* rest */ - struct simp simps[MAX_SIMPS]; -}; - -static struct global_data * global = NULL; - -#ifdef DEBUG -static char global_magic[32] = "SIMP header SdC581oi9rY20051962\n"; -#endif - -struct simp * simp_create(char * name, long size, - structor first_ctor, - structor again_ctor, - structor dtor) -{ - struct simp * simp; - long fraction; - long real_size; - int cpu; - - if(!global) { -#ifdef __SMP__ - global = (struct global_data*)__get_free_pages(GFP_KERNEL, ORDER); - memset(global, 0, CHUNK_SIZE); -#else - global = (struct global_data*)get_free_page(GFP_KERNEL); -#endif - spin_lock_init(&global->lock); - } - - spin_lock(&global->lock); - simp = &global->simps[global->nr_simps++]; - spin_unlock(&global->lock); - - if(global->nr_simps >= MAX_SIMPS) { - printk("SIMP: too many simps allocated\n"); - return NULL; - } - memset(simp, 0, sizeof(struct simp)); - spin_lock_init(&simp->lock); - strncpy(simp->name, name, 15); - simp->size = size; - simp->real_size = real_size = ALIGN_CACHE(size); - /* allow aggregation of very small objects in 2-power fractions of - * cachelines */ - fraction = COLOR_INCREMENT / 2; - while(size <= fraction && fraction >= sizeof(void*)) { - simp->real_size = fraction; - fraction >>= 1; - } - simp->first_ctor = first_ctor; - simp->again_ctor = again_ctor; - simp->dtor = dtor; - - real_size += sizeof(void*); - simp->max_elems = (CHUNK_SIZE - HEADER_SIZE) / real_size; - simp->max_color = (CHUNK_SIZE - HEADER_SIZE) % real_size; - for(cpu = 0; cpu < NR_PROCESSORS; cpu++) { - struct per_processor * private = &simp->private[cpu]; - private->buffer_pos = private->postbuffer; - } - return simp; -} - -/* Do *not* inline this, it clobbers too many registers... */ -static void alloc_header(struct simp * simp) -{ - struct header * hdr; - char * ptr; - void ** index; - long count; - - spin_unlock(&simp->lock); - for(;;) { - hdr = (struct header*)__get_free_pages(GFP_KERNEL, ORDER); - if(hdr) - break; - if(!simp_garbage()) - return; - } -#ifdef DEBUG - if(CHUNK_BASE(hdr) != hdr) - panic("simp: bad kernel page alignment"); -#endif - - memset(hdr, 0, HEADER_SIZE); -#ifdef DEBUG - memcpy(hdr->magic, global_magic, sizeof(global_magic)); -#endif - hdr->father = simp; - hdr->again_ctor = simp->again_ctor; - hdr->first_ctor = simp->first_ctor; - - /* note: races on simp->color don't produce any error :-) */ - ptr = ((char*)hdr) + HEADER_SIZE + simp->color; - index = CHUNK_END(hdr); - for(count = 0; count < simp->max_elems; count++) { - *--index = ptr; - ptr += simp->real_size; - /* note: constructors are not called here in bunch but - * instead at each single simp_alloc(), in order - * to maximize chances that the cache will be - * polluted after a simp_alloc() anyway, - * and not here. 
*/ - } - hdr->index = hdr->fresh = hdr->emptypos = index; - - spin_lock(&simp->lock); - simp->color += COLOR_INCREMENT; - if(simp->color >= simp->max_color) - simp->color = 0; - hdr->next = simp->usable_list; - simp->usable_list = hdr; -} - -/* current x86 memcpy() is horribly moving around registers for nothing, - * is doing unnecessary work if the size is dividable by a power-of-two, - * and it clobbers way too many registers. - * This results in nearly any other register being transfered to stack. - * Fixing this would be a major win for the whole kernel! - */ -static void ** bunch_alloc(struct simp * simp, void ** buffer) -{ - struct header * hdr; - void ** index; - void ** to; - void ** end; - structor todo; - long length; - - spin_lock(&simp->lock); - hdr = simp->usable_list; - if(!hdr) { - alloc_header(simp); - hdr = simp->usable_list; - if(!hdr) { - spin_unlock(&simp->lock); - *buffer = NULL; - return buffer+1; - } - } - - index = hdr->index; - end = hdr->fresh; - todo = hdr->again_ctor; - if(index == end) { - end = CHUNK_END(hdr); - todo = hdr->first_ctor; - } - to = index + POSTBUFFER_SIZE/2; - if(to >= end) { - to = end; - if(to == CHUNK_END(hdr)) { - simp->usable_list = hdr->next; - hdr->next = NULL; - } - } - if(to > hdr->fresh) - hdr->fresh = to; - hdr->index = to; - length = ((unsigned long)to) - (unsigned long)index; - to = buffer + (length/sizeof(void**)); - - memcpy(buffer, index, length); - - spin_unlock(&simp->lock); - - if(todo) { - do { - todo(*buffer++); - } while(buffer < to); - } - return to; -} - -void * simp_alloc(struct simp * simp) -{ -#ifdef __SMP__ - const long cpu = smp_processor_id(); - struct per_processor * priv = &simp->private[cpu]; -#else -#define priv (&simp->private[0]) /*fool gcc to use no extra register*/ -#endif - void ** buffer_pos = priv->buffer_pos; - void * res; - - if(buffer_pos == priv->postbuffer) { - buffer_pos = bunch_alloc(simp, buffer_pos); - } - buffer_pos--; - res = *buffer_pos; - priv->buffer_pos = buffer_pos; - return res; -} - -#ifdef DEBUG -long check_header(struct header * hdr, void * ptr) -{ - void ** test; - - if(!hdr) { - printk("SIMP: simp_free() with NULL pointer\n"); - return 1; - } - if(strncmp(hdr->magic, global_magic, 32)) { - printk("SIMP: simpe_free() with bad ptr %p, or header corruption\n", ptr); - return 1; - } - /* This is brute force, but I don't want to pay for any - * overhead if debugging is not enabled, in particular - * no space overhead for keeping hashtables etc. 
*/ - test = hdr->index; - while(test < CHUNK_END(hdr)) { - if(*test++ == ptr) { - printk("SIMP: trying to simp_free(%p) again\n", ptr); - return 1; - } - } - return 0; -} -#endif - -static void ** bunch_free(struct simp * simp, void ** buffer) -{ - void ** stop; - - stop = buffer - POSTBUFFER_SIZE/3; - - spin_lock(&simp->lock); - while(buffer > stop) { - void * elem = buffer[-1]; - struct header * hdr = CHUNK_BASE(elem); - void ** index = hdr->index; - index--; - hdr->index = index; - *index = elem; - if(!hdr->next) { - hdr->next = simp->usable_list; - simp->usable_list = hdr; - } - - buffer -= 2; - elem = *buffer; - hdr = CHUNK_BASE(elem); - index = hdr->index; - index--; - hdr->index = index; - *index = elem; - if(!hdr->next) { - hdr->next = simp->usable_list; - simp->usable_list = hdr; - } - } - spin_unlock(&simp->lock); - global->changed_flag = 1; - return buffer; -} - -void simp_free(void * objp) -{ - struct header * hdr; - void ** buffer_pos; - struct per_processor * private; -#ifdef __SMP__ - const long cpu = smp_processor_id(); -#else - const long cpu = 0; -#endif - - hdr = CHUNK_BASE(objp); -#ifdef DEBUG - if(check_header(hdr, objp)) - return; -#endif - - private = &hdr->father->private[cpu]; - buffer_pos = private->buffer_pos; - if(buffer_pos >= private->postbuffer+POSTBUFFER_SIZE) { - buffer_pos = bunch_free(hdr->father, buffer_pos); - } - - *buffer_pos++ = objp; - private->buffer_pos = buffer_pos; - -#ifdef DEAD_BEEF - { - unsigned int * ptr = (unsigned int*)objp; - int count = (hdr->father->real_size - ELEM_SIZE) / sizeof(unsigned int); - while(count--) - *ptr++ = 0xdeadbeef; - } -#endif -} - -long simp_garbage(void) -{ - int i; - int res; - - if(!global->changed_flag) - return 0; /* shortcut */ - /* Note: costs do not matter here. Any heavy thrashing of - * simp chunks that could be caused by pools stealing each - * other's memory has to be considered a BUG :-) - * Simply avoid memory shortages by conservative allocating - * policies. - */ - global->changed_flag = 0; - res = 0; - for(i = 0; i < global->nr_simps; i++) { - struct simp * simp = &global->simps[i]; - struct header ** base = &simp->usable_list; - struct header * del; - - spin_lock(&simp->lock); - del = *base; - while(del) { - if(del->index == del->emptypos) { - if(simp->dtor) { - void ** ptr = del->index; - while(ptr < CHUNK_END(del)) { - simp->dtor(*ptr++); - } - } - *base = del->next; -#ifdef DEBUG - memset(del, 0, CHUNK_SIZE); -#endif - free_pages((unsigned long)del, ORDER); - res++; - } else - base = &del->next; - del = *base; - } - spin_unlock(&simp->lock); - } - return res; -} - @@ -70,7 +70,7 @@ * * Calls to printk() are not 100% safe (the function is not threaded). However, * printk() is only used under an error condition, and the risk is v. small (not - * sure if the console write functions 'enjoy' executing multiple contextes in + * sure if the console write functions 'enjoy' executing multiple contexts in * parallel. I guess they don't...). * Note, for most calls to printk() any held cache-lock is dropped. This is not * always done for text size reasons - having *_unlock() everywhere is bloat. @@ -92,11 +92,11 @@ * index to hold the bufctls. This allows the bufctl structure to * be small (one word), but limits the number of objects a slab (not * a cache) can contain when off-slab bufctls are used. The limit is the - * size of the largest general-cache that does not use off-slab bufctls, + * size of the largest general cache that does not use off-slab bufctls, * divided by the size of a bufctl. 
For 32bit archs, is this 256/4 = 64. * This is not serious, as it is only for large objects, when it is unwise * to have too many per slab. - * Note: This limit can be raised by introducing a general-cache whose size + * Note: This limit can be raised by introducing a general cache whose size * is less than 512 (PAGE_SIZE<<3), but greater than 256. */ @@ -109,7 +109,6 @@ #include <asm/system.h> #include <asm/atomic.h> -#include <asm/smp_lock.h> #include <asm/spinlock.h> #ifdef __mips__ #include <asm/pgtable.h> @@ -128,12 +127,12 @@ * * SLAB_DEBUG_SUPPORT - 1 for kmem_cache_create() to honour; SLAB_DEBUG_FREE, * SLAB_DEBUG_INITIAL, SLAB_RED_ZONE & SLAB_POISON. - * 0 for faster, smaller, code (espically in the critical paths). + * 0 for faster, smaller, code (especially in the critical paths). * * SLAB_STATS - 1 to collect stats for /proc/slabinfo. - * 0 for faster, smaller, code (espically in the critical paths). + * 0 for faster, smaller, code (especially in the critical paths). * - * SLAB_SELFTEST - 1 to perform a few tests, mainly for developement. + * SLAB_SELFTEST - 1 to perform a few tests, mainly for development. */ #define SLAB_MGMT_CHECKS 1 #define SLAB_DEBUG_SUPPORT 0 @@ -184,7 +183,7 @@ typedef struct kmem_slab_s { s_dma:1; } kmem_slab_t; -/* When the slab mgmt is on-slab, this gives the size to use. */ +/* When the slab management is on-slab, this gives the size to use. */ #define slab_align_size (L1_CACHE_ALIGN(sizeof(kmem_slab_t))) /* Test for end of slab chain. */ @@ -192,7 +191,7 @@ typedef struct kmem_slab_s { /* s_magic */ #define SLAB_MAGIC_ALLOC 0xA5C32F2BUL /* slab is alive */ -#define SLAB_MAGIC_DESTROYED 0xB2F23C5AUL /* slab has been destoryed */ +#define SLAB_MAGIC_DESTROYED 0xB2F23C5AUL /* slab has been destroyed */ /* Bufctl's are used for linking objs within a slab, identifying what slab an obj * is in, and the address of the associated obj (for sanity checking with off-slab @@ -264,9 +263,9 @@ struct kmem_cache_s { }; /* internal c_flags */ -#define SLAB_CFLGS_OFF_SLAB 0x010000UL /* slab mgmt in own cache */ +#define SLAB_CFLGS_OFF_SLAB 0x010000UL /* slab management in own cache */ #define SLAB_CFLGS_BUFCTL 0x020000UL /* bufctls in own cache */ -#define SLAB_CFLGS_GENERAL 0x080000UL /* a general-cache */ +#define SLAB_CFLGS_GENERAL 0x080000UL /* a general cache */ /* c_dflags (dynamic flags). Need to hold the spinlock to access this member */ #define SLAB_CFLGS_GROWN 0x000002UL /* don't reap a recently grown */ @@ -311,13 +310,15 @@ static void kmem_self_test(void); /* maximum num of pages for a slab (prevents large requests to the VM layer) */ #define SLAB_MAX_GFP_ORDER 5 /* 32 pages */ -/* the 'prefered' minimum num of objs per slab - maybe less for large objs */ +/* the 'preferred' minimum num of objs per slab - maybe less for large objs */ #define SLAB_MIN_OBJS_PER_SLAB 4 /* If the num of objs per slab is <= SLAB_MIN_OBJS_PER_SLAB, * then the page order must be less than this before trying the next order. */ -#define SLAB_BREAK_GFP_ORDER 2 +#define SLAB_BREAK_GFP_ORDER_HI 2 +#define SLAB_BREAK_GFP_ORDER_LO 1 +static int slab_break_gfp_order = SLAB_BREAK_GFP_ORDER_LO; /* Macros for storing/retrieving the cachep and or slab from the * global 'mem_map'. With off-slab bufctls, these are used to find the @@ -329,7 +330,7 @@ static void kmem_self_test(void); #define SLAB_SET_PAGE_SLAB(pg, x) ((pg)->prev = (struct page *)(x)) #define SLAB_GET_PAGE_SLAB(pg) ((kmem_slab_t *)(pg)->prev) -/* Size description struct for general-caches. 
*/ +/* Size description struct for general caches. */ typedef struct cache_sizes { size_t cs_size; kmem_cache_t *cs_cachep; @@ -354,7 +355,7 @@ static cache_sizes_t cache_sizes[] = { {0, NULL} }; -/* Names for the general-caches. Not placed into the sizes struct for +/* Names for the general caches. Not placed into the sizes struct for * a good reason; the string ptr is not needed while searching in kmalloc(), * and would 'get-in-the-way' in the h/w cache. */ @@ -400,7 +401,7 @@ static struct semaphore cache_chain_sem; /* Place maintainer for reaping. */ static kmem_cache_t *clock_searchp = &cache_cache; -/* Internal slab mgmt cache, for when slab mgmt is off-slab. */ +/* Internal slab management cache, for when slab management is off-slab. */ static kmem_cache_t *cache_slabp = NULL; /* Max number of objs-per-slab for caches which use bufctl's. @@ -451,6 +452,12 @@ __initfunc(long kmem_cache_init(long start, long end)) cache_cache.c_colour = (i-(cache_cache.c_num*size))/L1_CACHE_BYTES; cache_cache.c_colour_next = cache_cache.c_colour; + /* + * Fragmentation resistance on low memory - only use bigger + * page orders on machines with more than 32MB of memory. + */ + if (num_physpages > (32 << 20) >> PAGE_SHIFT) + slab_break_gfp_order = SLAB_BREAK_GFP_ORDER_HI; return start; } @@ -467,9 +474,9 @@ __initfunc(void kmem_cache_sizes_init(void)) char **names = cache_sizes_name; cache_sizes_t *sizes = cache_sizes; do { - /* For performance, all the general-caches are L1 aligned. + /* For performance, all the general caches are L1 aligned. * This should be particularly beneficial on SMP boxes, as it - * elimantes "false sharing". + * eliminates "false sharing". * Note for systems short on memory removing the alignment will * allow tighter packing of the smaller caches. */ if (!(sizes->cs_cachep = @@ -566,7 +573,7 @@ kmem_check_poison_obj(kmem_cache_t *cachep, void *addr) } #endif /* SLAB_DEBUG_SUPPORT */ -/* Three slab chain funcs - all called with ints disabled and the appropiate +/* Three slab chain funcs - all called with ints disabled and the appropriate * cache-lock held. */ static inline void @@ -608,7 +615,7 @@ kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp) { if (cachep->c_dtor #if SLAB_DEBUG_SUPPORT - || cachep->c_flags & (SLAB_POISON || SLAB_RED_ZONE) + || cachep->c_flags & (SLAB_POISON | SLAB_RED_ZONE) #endif /*SLAB_DEBUG_SUPPORT*/ ) { /* Doesn't use the bufctl ptrs to find objs. */ @@ -634,7 +641,7 @@ kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp) #if SLAB_DEBUG_SUPPORT else if (cachep->c_flags & SLAB_POISON) { if (kmem_check_poison_obj(cachep, objp)) - printk(KERN_ERR "kmem_slab_destory: " + printk(KERN_ERR "kmem_slab_destroy: " "Bad poison - %s\n", cachep->c_name); } if (cachep->c_flags & SLAB_RED_ZONE) @@ -718,7 +725,7 @@ kmem_cache_create(const char *name, size_t size, size_t offset, } if (offset < 0 || offset > size) { - printk("%sOffset weired %d - %s\n", func_nm, (int) offset, name); + printk("%sOffset weird %d - %s\n", func_nm, (int) offset, name); offset = 0; } @@ -785,11 +792,11 @@ kmem_cache_create(const char *name, size_t size, size_t offset, if (flags & SLAB_HWCACHE_ALIGN) align = L1_CACHE_BYTES; - /* Determine if the slab mgmt and/or bufclts are 'on' or 'off' slab. */ + /* Determine if the slab management and/or bufclts are 'on' or 'off' slab. */ extra = sizeof(kmem_bufctl_t); if (size < (PAGE_SIZE>>3)) { /* Size is small(ish). Use packing where bufctl size per - * obj is low, and slab mngmnt is on-slab. 
+ * obj is low, and slab management is on-slab. */ #if 0 if ((flags & SLAB_HIGH_PACK)) { @@ -806,7 +813,7 @@ kmem_cache_create(const char *name, size_t size, size_t offset, } #endif } else { - /* Size is large, assume best to place the slab mngmnt obj + /* Size is large, assume best to place the slab management obj * off-slab (should allow better packing of objs). */ flags |= SLAB_CFLGS_OFF_SLAB; @@ -815,7 +822,7 @@ kmem_cache_create(const char *name, size_t size, size_t offset, /* To avoid waste the bufctls are off-slab... */ flags |= SLAB_CFLGS_BUFCTL; extra = 0; - } /* else slab mngmnt is off-slab, but freelist ptrs are on. */ + } /* else slab management is off-slab, but freelist pointers are on. */ } size += extra; @@ -873,7 +880,7 @@ cal_wastage: * bad for the gfp()s. */ if (cachep->c_num <= SLAB_MIN_OBJS_PER_SLAB) { - if (cachep->c_gfporder < SLAB_BREAK_GFP_ORDER) + if (cachep->c_gfporder < slab_break_gfp_order) goto next; } @@ -1022,8 +1029,8 @@ kmem_cache_shrink(kmem_cache_t *cachep) printk(KERN_ERR "kmem_shrink: Invalid cache addr %p\n", cachep); return 2; found: - /* Relase the sempahore before getting the cache-lock. This could - * mean multiple engines are shrinking the cache, but so what... + /* Release the semaphore before getting the cache-lock. This could + * mean multiple engines are shrinking the cache, but so what. */ up(&cache_chain_sem); spin_lock_irq(&cachep->c_spinlock); @@ -1045,17 +1052,17 @@ found: return ret; } -/* Get the mem for a slab mgmt obj. */ +/* Get the memory for a slab management obj. */ static inline kmem_slab_t * kmem_cache_slabmgmt(kmem_cache_t *cachep, void *objp, int local_flags) { kmem_slab_t *slabp; if (SLAB_OFF_SLAB(cachep->c_flags)) { - /* Slab mgmt obj is off-slab. */ + /* Slab management obj is off-slab. */ slabp = kmem_cache_alloc(cache_slabp, local_flags); } else { - /* Slab mgmnt at end of slab mem, placed so that + /* Slab management at end of slab memory, placed so that * the position is 'coloured'. */ void *end; @@ -1203,7 +1210,7 @@ re_try: if (!(objp = kmem_getpages(cachep, flags, &dma))) goto failed; - /* Get slab mgmt. */ + /* Get slab management. */ if (!(slabp = kmem_cache_slabmgmt(cachep, objp+offset, local_flags))) goto opps1; if (dma) @@ -1257,7 +1264,7 @@ failed: if (local_flags != SLAB_ATOMIC && cachep->c_gfporder) { /* For large order (>0) slabs, we try again. * Needed because the gfp() functions are not good at giving - * out contigious pages unless pushed (but do not push too hard). + * out contiguous pages unless pushed (but do not push too hard). */ if (cachep->c_failures++ < 4 && cachep->c_freep == kmem_slab_end(cachep)) goto re_try; @@ -1648,19 +1655,19 @@ kfree(const void *objp) goto bad_ptr; /* Assume we own the page structure - hence no locking. - * If someone is misbehaving (eg. someone calling us with a bad + * If someone is misbehaving (for example, calling us with a bad * address), then access to the page structure can race with the - * kmem_slab_destory() code. Need to add a spin_lock to each page + * kmem_slab_destroy() code. Need to add a spin_lock to each page * structure, which would be useful in threading the gfp() functions.... */ page = &mem_map[nr]; if (PageSlab(page)) { kmem_cache_t *cachep; - /* Here, we (again) assume the obj address is good. + /* Here, we again assume the obj address is good. * If it isn't, and happens to map onto another - * general-cache page which has no active objs, then - * we race.... + * general cache page which has no active objs, then + * we race. 
*/ cachep = SLAB_GET_PAGE_CACHE(page); if (cachep && (cachep->c_flags & SLAB_CFLGS_GENERAL)) { @@ -1714,9 +1721,9 @@ kmem_find_general_cachep(size_t size) { cache_sizes_t *csizep = cache_sizes; - /* This function could be moved to the header-file, and + /* This function could be moved to the header file, and * made inline so consumers can quickly determine what - * cache-ptr they require. + * cache pointer they require. */ for (; csizep->cs_size; csizep++) { if (size > csizep->cs_size) @@ -1745,7 +1752,7 @@ kmem_cache_reap(int gfp_mask) return; } - /* We really need a test semphore op so we can avoid sleeping when + /* We really need a test semaphore op so we can avoid sleeping when * !wait is true. */ down(&cache_chain_sem); @@ -1778,8 +1785,8 @@ kmem_cache_reap(int gfp_mask) dma_flag = 0; full_free = 0; - /* Count num of fully free slabs. Hopefully there are not many, - * we are holding the cache lock.... + /* Count the fully free slabs. There should not be not many, + * since we are holding the cache lock. */ slabp = searchp->c_lastp; while (!slabp->s_inuse && slabp != kmem_slab_end(searchp)) { @@ -1819,7 +1826,7 @@ next: up(&cache_chain_sem); if (!best_cachep) { - /* couldn't find anthying to reap */ + /* couldn't find anything to reap */ return; } @@ -6,7 +6,7 @@ /* * This file contains the default values for the opereation of the - * Linux VM subsystem. Finetuning documentation can be found in + * Linux VM subsystem. Fine-tuning documentation can be found in * linux/Documentation/sysctl/vm.txt. * Started 18.12.91 * Swap aging added 23.2.95, Stephen Tweedie. @@ -67,9 +67,9 @@ swap_control_t swap_control = { swapstat_t swapstats = {0}; buffer_mem_t buffer_mem = { - 3, /* minimum percent buffer */ - 10, /* borrow percent buffer */ - 30 /* maximum percent buffer */ + 5, /* minimum percent buffer */ + 25, /* borrow percent buffer */ + 50 /* maximum percent buffer */ }; buffer_mem_t page_cache = { diff --git a/mm/swap_state.c b/mm/swap_state.c index b91583340..401c7a1fc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -146,42 +146,23 @@ void remove_from_swap_cache(struct page *page) "on page %08lx\n", page_address(page)); } /* - * This will be a legal case once we have a more mature swap cache. + * This is a legal case, but warn about it. 
*/ if (atomic_read(&page->count) == 1) { - printk ("VM: Removing page cache on unshared page %08lx\n", + printk (KERN_WARNING + "VM: Removing page cache on unshared page %08lx\n", page_address(page)); - return; } - #ifdef DEBUG_SWAP printk("DebugVM: remove_from_swap_cache(%08lx count %d)\n", page_address(page), atomic_read(&page->count)); #endif - remove_page_from_hash_queue (page); - remove_page_from_inode_queue (page); PageClearSwapCache (page); - __free_page (page); + remove_inode_page(page); } -long find_in_swap_cache(struct page *page) -{ -#ifdef SWAP_CACHE_INFO - swap_cache_find_total++; -#endif - if (PageSwapCache (page)) { - long entry = page->offset; -#ifdef SWAP_CACHE_INFO - swap_cache_find_success++; -#endif - remove_from_swap_cache (page); - return entry; - } - return 0; -} - int delete_from_swap_cache(struct page *page) { #ifdef SWAP_CACHE_INFO diff --git a/mm/swapfile.c b/mm/swapfile.c index d935433bb..45f73de02 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -28,10 +28,7 @@ unsigned int nr_swapfiles = 0; -static struct { - int head; /* head of priority-ordered swapfile list */ - int next; /* swapfile to be used next */ -} swap_list = {-1, -1}; +struct swap_list_t swap_list = {-1, -1}; struct swap_info_struct swap_info[MAX_SWAPFILES]; @@ -180,7 +177,7 @@ bad_free: * that the page has been used or is no longer needed. * * Always set the resulting pte to be nowrite (the same as COW pages - * after one process has exited). We don't know just how many ptes will + * after one process has exited). We don't know just how many PTEs will * share this swap entry, so be cautious and let do_wp_page work out * what to do if a write is requested later. */ @@ -535,6 +532,7 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) error = blkdev_open(swap_dentry->d_inode, &filp); if (error) goto bad_swap_2; + set_blocksize(p->swap_device, PAGE_SIZE); error = -ENODEV; if (!p->swap_device || (blk_size[MAJOR(p->swap_device)] && @@ -595,7 +593,7 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) p->flags = SWP_WRITEOK; p->pages = j; nr_swap_pages += j; - printk("Adding Swap: %dk swap-space (priority %d)\n", + printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n", j<<(PAGE_SHIFT-10), p->prio); /* insert swap space into swap_list: */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6b87beaa2..e7711c23c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -135,12 +135,16 @@ int vmalloc_area_pages(unsigned long address, unsigned long size, pgprot_t prot) dir = pgd_offset_k(address); flush_cache_all(); while (address < end) { - pmd_t *pmd = pmd_alloc_kernel(dir, address); + pmd_t *pmd; + pgd_t olddir = *dir; + + pmd = pmd_alloc_kernel(dir, address); if (!pmd) return -ENOMEM; if (alloc_area_pmd(pmd, address, end - address, prot)) return -ENOMEM; - set_pgdir(address, *dir); + if (pgd_val(olddir) != pgd_val(*dir)) + set_pgdir(address, *dir); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } @@ -150,21 +154,22 @@ int vmalloc_area_pages(unsigned long address, unsigned long size, pgprot_t prot) struct vm_struct * get_vm_area(unsigned long size) { - void *addr; + unsigned long addr; struct vm_struct **p, *tmp, *area; area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL); if (!area) return NULL; - addr = (void *) VMALLOC_START; - area->size = size + PAGE_SIZE; - area->next = NULL; + addr = VMALLOC_START; for (p = &vmlist; (tmp = *p) ; p = &tmp->next) { - if (size + (unsigned long) addr < (unsigned long) tmp->addr) + if (size + addr < (unsigned long) 
tmp->addr) break; - addr = (void *) (tmp->size + (unsigned long) tmp->addr); + if (addr > VMALLOC_END-size) + return NULL; + addr = tmp->size + (unsigned long) tmp->addr; } - area->addr = addr; + area->addr = (void *)addr; + area->size = size + PAGE_SIZE; area->next = *p; *p = area; return area; @@ -217,16 +222,18 @@ void * vmalloc(unsigned long size) long vread(char *buf, char *addr, unsigned long count) { - struct vm_struct **p, *tmp; + struct vm_struct *tmp; char *vaddr, *buf_start = buf; - int n; + unsigned long n; /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; - for (p = &vmlist; (tmp = *p) ; p = &tmp->next) { + for (tmp = vmlist; tmp; tmp = tmp->next) { vaddr = (char *) tmp->addr; + if (addr >= vaddr + tmp->size - PAGE_SIZE) + continue; while (addr < vaddr) { if (count == 0) goto finished; @@ -235,17 +242,15 @@ long vread(char *buf, char *addr, unsigned long count) addr++; count--; } - n = tmp->size - PAGE_SIZE; - if (addr > vaddr) - n -= addr - vaddr; - while (--n >= 0) { + n = vaddr + tmp->size - PAGE_SIZE - addr; + do { if (count == 0) goto finished; put_user(*addr, buf); buf++; addr++; count--; - } + } while (--n > 0); } finished: return buf - buf_start; diff --git a/mm/vmscan.c b/mm/vmscan.c index 919b97244..b586bce72 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -42,7 +42,7 @@ int swapout_interval = HZ / 4; /* * The wait queue for waking up the pageout daemon: */ -static struct wait_queue * kswapd_wait = NULL; +struct wait_queue * kswapd_wait = NULL; static void init_swap_timer(void); @@ -88,7 +88,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc * pages, then delete the swap cache. We can only do this if * the swap page's reference count is one: ie. there are no * other references to it beyond the swap cache (as there must - * still be pte's pointing to it if count > 1). + * still be PTEs pointing to it if count > 1). * * If the page has NOT been touched, and its age reaches zero, * then we are swapping it out: @@ -107,7 +107,17 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc if (PageSwapCache(page_map)) { if (pte_write(pte)) { + struct page *found; printk ("VM: Found a writable swap-cached page!\n"); + /* Try to diagnose the problem ... */ + found = find_page(&swapper_inode, page_map->offset); + if (found) { + printk("page=%p@%08lx, found=%p, count=%d\n", + page_map, page_map->offset, + found, atomic_read(&found->count)); + __free_page(found); + } else + printk ("Spurious, page not in cache\n"); return 0; } } @@ -144,9 +154,8 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc * we have the swap cache set up to associate the * page with that swap entry. */ - if (PageSwapCache(page_map)) { - entry = page_map->offset; - } else { + entry = in_swap_cache(page_map); + if (!entry) { entry = get_swap_page(); if (!entry) return 0; /* No swap space left */ @@ -219,8 +228,8 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc flush_cache_page(vma, address); pte_clear(page_table); flush_tlb_page(vma, address); - entry = page_unuse(page); - free_page(page); + entry = page_unuse(page_map); + __free_page(page_map); return entry; } @@ -435,7 +444,7 @@ out: * to be. This works out OK, because we now do proper aging on page * contents. 
  */
-static inline int do_try_to_free_page(int gfp_mask)
+static int do_try_to_free_page(int gfp_mask)
 {
 	static int state = 0;
 	int i=6;
@@ -448,9 +457,10 @@ static inline int do_try_to_free_page(int gfp_mask)
 	stop = 3;
 	if (gfp_mask & __GFP_WAIT)
 		stop = 0;
+
 	if (((buffermem >> PAGE_SHIFT) * 100 > buffer_mem.borrow_percent * num_physpages) ||
 	    (page_cache_size * 100 > page_cache.borrow_percent * num_physpages))
-		state = 0;
+		shrink_mmap(i, gfp_mask);
 
 	switch (state) {
 		do {
@@ -459,7 +469,7 @@ static inline int do_try_to_free_page(int gfp_mask)
 				return 1;
 			state = 1;
 		case 1:
-			if ((gfp_mask & __GFP_IO) && shm_swap(i, gfp_mask))
+			if (shm_swap(i, gfp_mask))
 				return 1;
 			state = 2;
 		case 2:
@@ -476,23 +486,6 @@ static inline int do_try_to_free_page(int gfp_mask)
 }
 
 /*
- * This is REALLY ugly.
- *
- * We need to make the locks finer granularity, but right
- * now we need this so that we can do page allocations
- * without holding the kernel lock etc.
- */
-int try_to_free_page(int gfp_mask)
-{
-	int retval;
-
-	lock_kernel();
-	retval = do_try_to_free_page(gfp_mask);
-	unlock_kernel();
-	return retval;
-}
-
-/*
  * Before we start the kernel thread, print out the
  * kswapd initialization message (otherwise the init message
  * may be printed in the middle of another driver's init
@@ -532,7 +525,7 @@ int kswapd(void *unused)
 
 	/* Give kswapd a realtime priority. */
 	current->policy = SCHED_FIFO;
-	current->priority = 32;	/* Fixme --- we need to standardise our
+	current->rt_priority = 32;	/* Fixme --- we need to standardise our
				namings for POSIX.4 realtime scheduling
				priorities. */
 
@@ -540,7 +533,6 @@ int kswapd(void *unused)
 	add_wait_queue(&kswapd_wait, &wait);
 	while (1) {
 		int tries;
-		int tried = 0;
 
 		current->state = TASK_INTERRUPTIBLE;
 		flush_signals(current);
@@ -564,29 +556,56 @@ int kswapd(void *unused)
 		 * woken up more often and the rate will be even
 		 * higher).
 		 */
-		tries = pager_daemon.tries_base >> free_memory_available(3);
-
-		while (tries--) {
-			int gfp_mask;
+		tries = pager_daemon.tries_base;
+		tries >>= 4*free_memory_available();
 
-			if (++tried > pager_daemon.tries_min && free_memory_available(0))
-				break;
-			gfp_mask = __GFP_IO;
-			try_to_free_page(gfp_mask);
+		do {
+			do_try_to_free_page(0);
 			/*
 			 * Syncing large chunks is faster than swapping
 			 * synchronously (less head movement). -- Rik.
 			 */
 			if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
 				run_task_queue(&tq_disk);
-
-		}
+			if (free_memory_available() > 1)
+				break;
+		} while (--tries > 0);
 	}
 	/* As if we could ever get here - maybe we want to make this killable */
 	remove_wait_queue(&kswapd_wait, &wait);
+	unlock_kernel();
 	return 0;
 }
 
+/*
+ * We need to make the locks finer granularity, but right
+ * now we need this so that we can do page allocations
+ * without holding the kernel lock etc.
+ *
+ * The "PF_MEMALLOC" flag protects us against recursion:
+ * if we need more memory as part of a swap-out effort we
+ * will just silently return "success" to tell the page
+ * allocator to accept the allocation.
+ */
+int try_to_free_pages(unsigned int gfp_mask, int count)
+{
+	int retval = 1;
+
+	lock_kernel();
+	if (!(current->flags & PF_MEMALLOC)) {
+		current->flags |= PF_MEMALLOC;
+		do {
+			retval = do_try_to_free_page(gfp_mask);
+			if (!retval)
+				break;
+			count--;
+		} while (count > 0);
+		current->flags &= ~PF_MEMALLOC;
+	}
+	unlock_kernel();
+	return retval;
+}
+
 /*
  * The swap_tick function gets called on every clock tick.
  */
@@ -606,11 +625,11 @@ void swap_tick(void)
 	 * Schedule for wakeup if there isn't lots
 	 * of free memory.
 	 */
-	switch (free_memory_available(3)) {
+	switch (free_memory_available()) {
 	case 0:
 		want = now;
 		/* Fall through */
-	case 1 ... 3:
+	case 1:
 		want_wakeup = 1;
 	default:
 	}
 }
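
The kmem_cache_init() hunk in mm/slab.c only enables larger slab page orders on machines with more than 32MB of memory. The stand-alone sketch below just works through that threshold arithmetic; PAGE_SHIFT_EXAMPLE and the 64 MB machine size are assumptions made for illustration, not values taken from the patch.

#include <stdio.h>

#define PAGE_SHIFT_EXAMPLE 12	/* assume 4 KiB pages; the real value is per-arch */

int main(void)
{
	/* A hypothetical 64 MB machine, expressed in pages. */
	unsigned long num_physpages = (64UL << 20) >> PAGE_SHIFT_EXAMPLE;
	/* The patch's threshold: 32 MB worth of pages. */
	unsigned long threshold = (32UL << 20) >> PAGE_SHIFT_EXAMPLE;

	/* Mirrors the test: if (num_physpages > (32 << 20) >> PAGE_SHIFT) ... */
	printf("allow higher slab page orders: %s\n",
	       num_physpages > threshold ? "yes" : "no");
	return 0;
}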
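
The comment on kmem_find_general_cachep() in mm/slab.c notes that the lookup could move into a header as an inline so callers can quickly pick the cache pointer they need. The sketch below imitates that sentinel-terminated table walk with toy types; toy_cache_t, toy_sizes[] and the entry values are invented for the example, and only the loop shape mirrors the patched function.

#include <stdio.h>
#include <stddef.h>

/* Simplified stand-ins; these are not the kernel's real slab types. */
typedef struct toy_cache {
	const char *name;
} toy_cache_t;

typedef struct toy_cache_sizes {
	size_t cs_size;
	toy_cache_t *cs_cachep;
} toy_cache_sizes_t;

static toy_cache_t c32 = { "size-32" };
static toy_cache_t c64 = { "size-64" };
static toy_cache_t c128 = { "size-128" };

static toy_cache_sizes_t toy_sizes[] = {
	{  32, &c32  },
	{  64, &c64  },
	{ 128, &c128 },
	{   0, NULL  }	/* sentinel, like the {0, NULL} entry in cache_sizes[] */
};

/* Walk the table until the first cache large enough for the request. */
static toy_cache_t *toy_find_general_cachep(size_t size)
{
	toy_cache_sizes_t *csizep = toy_sizes;

	for (; csizep->cs_size; csizep++) {
		if (size > csizep->cs_size)
			continue;
		break;
	}
	return csizep->cs_cachep;	/* NULL if the request is too large */
}

int main(void)
{
	toy_cache_t *cachep = toy_find_general_cachep(100);

	printf("100-byte request -> %s\n", cachep ? cachep->name : "(none)");
	return 0;
}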
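
The reworked get_vm_area() in mm/vmalloc.c switches to plain unsigned long arithmetic and adds a VMALLOC_END bound check while doing a first-fit walk of the address-sorted vmlist. Below is a user-space approximation of that search; the *_SK names, the toy address window and the malloc()-based bookkeeping are all stand-ins for the kernel's definitions.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE_SK	 4096UL
#define VMALLOC_START_SK 0x10000000UL	/* toy bounds, not the kernel's layout */
#define VMALLOC_END_SK	 0x20000000UL

struct vm_area_sk {
	unsigned long addr;
	unsigned long size;		/* requested size plus one guard page */
	struct vm_area_sk *next;
};

static struct vm_area_sk *vmlist_sk;

/*
 * First-fit search over an address-sorted list: keep 'addr' just past the
 * previous area and stop at the first hole that is big enough, failing once
 * the candidate address would run past the end of the window.
 */
static struct vm_area_sk *get_vm_area_sk(unsigned long size)
{
	unsigned long addr = VMALLOC_START_SK;
	struct vm_area_sk **p, *tmp, *area;

	area = malloc(sizeof(*area));
	if (!area)
		return NULL;

	for (p = &vmlist_sk; (tmp = *p) != NULL; p = &tmp->next) {
		if (size + addr < tmp->addr)
			break;			/* the hole before this area fits */
		if (addr > VMALLOC_END_SK - size) {
			free(area);		/* out of address space */
			return NULL;
		}
		addr = tmp->addr + tmp->size;	/* skip past this area */
	}
	area->addr = addr;
	area->size = size + PAGE_SIZE_SK;	/* trailing guard page, as in the patch */
	area->next = *p;
	*p = area;
	return area;
}

int main(void)
{
	struct vm_area_sk *a = get_vm_area_sk(2 * PAGE_SIZE_SK);
	struct vm_area_sk *b = get_vm_area_sk(2 * PAGE_SIZE_SK);

	if (a && b)
		printf("first at %#lx, second at %#lx\n", a->addr, b->addr);
	return 0;
}

One difference worth noting: on the out-of-space path the patched get_vm_area() returns NULL without releasing the vm_struct it kmalloc()ed, whereas the sketch frees its node.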
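
The new try_to_free_pages() in mm/vmscan.c uses the per-task PF_MEMALLOC flag so that allocations made while reclaiming do not recurse back into reclaim and instead report success straight away. The fragment below shows only that guard idiom in user space; the flag's bit value, the current_flags variable and reclaim_one_page() are placeholders rather than kernel definitions.

#include <stdio.h>

#define PF_MEMALLOC_SK 0x0800		/* illustrative bit, not necessarily the kernel's value */

static unsigned long current_flags;	/* stands in for current->flags */

/* Pretend reclaim step; the kernel calls do_try_to_free_page() here. */
static int reclaim_one_page(void)
{
	return 1;			/* report "freed something" */
}

/*
 * Recursion guard: if this task is already reclaiming memory, report
 * success immediately instead of reclaiming again from within reclaim.
 */
static int try_to_free_pages_sk(int count)
{
	int retval = 1;

	if (!(current_flags & PF_MEMALLOC_SK)) {
		current_flags |= PF_MEMALLOC_SK;
		do {
			retval = reclaim_one_page();
			if (!retval)
				break;
		} while (--count > 0);
		current_flags &= ~PF_MEMALLOC_SK;
	}
	return retval;
}

int main(void)
{
	printf("first pass freed: %d\n", try_to_free_pages_sk(4));
	return 0;
}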