Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c    | 262
-rw-r--r-- | mm/memory.c     | 113
-rw-r--r-- | mm/mmap.c       | 254
-rw-r--r-- | mm/mmap_avl.c   | 374
-rw-r--r-- | mm/page_alloc.c | 153
-rw-r--r-- | mm/page_io.c    | 215
-rw-r--r-- | mm/swap.c       |  39
-rw-r--r-- | mm/swap_state.c |  60
-rw-r--r-- | mm/swapfile.c   |  13
-rw-r--r-- | mm/vmalloc.c    |   4
-rw-r--r-- | mm/vmscan.c     | 526
11 files changed, 1191 insertions, 822 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 227bcd5a9..3c15ea63b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -118,140 +118,83 @@ void remove_inode_page(struct page *page) __free_page(page); } -/* - * Check whether we can free this page. - */ -static inline int shrink_one_page(struct page *page, int gfp_mask) -{ - struct buffer_head *tmp, *bh; - - if (PageLocked(page)) - goto next; - if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) - goto next; - /* First of all, regenerate the page's referenced bit - * from any buffers in the page - */ - bh = page->buffers; - if (bh) { - tmp = bh; - do { - if (buffer_touched(tmp)) { - clear_bit(BH_Touched, &tmp->b_state); - set_bit(PG_referenced, &page->flags); - } - tmp = tmp->b_this_page; - } while (tmp != bh); - - /* Refuse to swap out all buffer pages */ - if (buffer_under_min()) - goto next; - } - - /* We can't throw away shared pages, but we do mark - them as referenced. This relies on the fact that - no page is currently in both the page cache and the - buffer cache; we'd have to modify the following - test to allow for that case. */ - - switch (atomic_read(&page->count)) { - case 1: - /* is it a swap-cache or page-cache page? */ - if (page->inode) { - if (test_and_clear_bit(PG_referenced, &page->flags)) - break; - if (pgcache_under_min()) - break; - if (PageSwapCache(page)) { - delete_from_swap_cache(page); - return 1; - } - remove_inode_page(page); - return 1; - } - /* It's not a cache page, so we don't do aging. - * If it has been referenced recently, don't free it */ - if (test_and_clear_bit(PG_referenced, &page->flags)) - break; - - if (buffer_under_min()) - break; - - /* is it a buffer cache page? */ - if (bh && try_to_free_buffer(bh, &bh, 6)) - return 1; - break; - - default: - /* more than one user: we can't throw it away */ - set_bit(PG_referenced, &page->flags); - /* fall through */ - case 0: - /* nothing */ - } -next: - return 0; -} - int shrink_mmap(int priority, int gfp_mask) { static unsigned long clock = 0; unsigned long limit = num_physpages; struct page * page; - int count_max, count_min; + int count; - count_max = limit; - count_min = (limit<<2) >> (priority); + count = limit >> priority; page = mem_map + clock; do { - if (PageSkip(page)) { - /* next_hash is overloaded for PageSkip */ - page = page->next_hash; - clock = page->map_nr; - } - - if (shrink_one_page(page, gfp_mask)) - return 1; - count_max--; - /* - * If the page we looked at was recyclable but we didn't - * reclaim it (presumably due to PG_referenced), don't - * count it as scanned. This way, the more referenced - * page cache pages we encounter, the more rapidly we - * will age them. + int referenced; + + /* This works even in the presence of PageSkip because + * the first two entries at the beginning of a hole will + * be marked, not just the first. */ - if (atomic_read(&page->count) != 1 || - (!page->inode && !page->buffers)) - count_min--; page++; clock++; if (clock >= max_mapnr) { clock = 0; page = mem_map; } - } while (count_max > 0 && count_min > 0); - return 0; -} + if (PageSkip(page)) { + /* next_hash is overloaded for PageSkip */ + page = page->next_hash; + clock = page - mem_map; + } + + referenced = test_and_clear_bit(PG_referenced, &page->flags); -/* - * This is called from try_to_swap_out() when we try to get rid of some - * pages.. If we're unmapping the last occurrence of this page, we also - * free it from the page hash-queues etc, as we don't want to keep it - * in-core unnecessarily. 
- */ -unsigned long page_unuse(struct page * page) -{ - int count = atomic_read(&page->count); - - if (count != 2) - return count; - if (!page->inode) - return count; - if (PageSwapCache(page)) - panic ("Doing a normal page_unuse of a swap cache page"); - remove_inode_page(page); - return 1; + if (PageLocked(page)) + continue; + + if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) + continue; + + /* We can't free pages unless there's just one user */ + if (atomic_read(&page->count) != 1) + continue; + + count--; + + /* + * Is it a page swap page? If so, we want to + * drop it if it is no longer used, even if it + * were to be marked referenced.. + */ + if (PageSwapCache(page)) { + if (referenced && swap_count(page->offset) != 1) + continue; + delete_from_swap_cache(page); + return 1; + } + + if (referenced) + continue; + + /* Is it a buffer page? */ + if (page->buffers) { + if (buffer_under_min()) + continue; + if (!try_to_free_buffers(page)) + continue; + return 1; + } + + /* is it a page-cache page? */ + if (page->inode) { + if (pgcache_under_min()) + continue; + remove_inode_page(page); + return 1; + } + + } while (count > 0); + return 0; } /* @@ -974,7 +917,7 @@ static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long struct file * file = area->vm_file; struct dentry * dentry = file->f_dentry; struct inode * inode = dentry->d_inode; - unsigned long offset; + unsigned long offset, reada, i; struct page * page, **hash; unsigned long old_page, new_page; @@ -1035,7 +978,18 @@ success: return new_page; no_cached_page: - new_page = __get_free_page(GFP_USER); + /* + * Try to read in an entire cluster at once. + */ + reada = offset; + reada >>= PAGE_SHIFT + page_cluster; + reada <<= PAGE_SHIFT + page_cluster; + + for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_SIZE) + new_page = try_to_read_ahead(file, reada, new_page); + + if (!new_page) + new_page = __get_free_page(GFP_USER); if (!new_page) goto no_page; @@ -1059,11 +1013,6 @@ no_cached_page: if (inode->i_op->readpage(file, page) != 0) goto failure; - /* - * Do a very limited read-ahead if appropriate - */ - if (PageLocked(page)) - new_page = try_to_read_ahead(file, offset + PAGE_SIZE, 0); goto found_page; page_locked_wait: @@ -1137,22 +1086,6 @@ static int filemap_write_page(struct vm_area_struct * vma, struct file * file; struct dentry * dentry; struct inode * inode; - struct buffer_head * bh; - - bh = mem_map[MAP_NR(page)].buffers; - if (bh) { - /* whee.. just mark the buffer heads dirty */ - struct buffer_head * tmp = bh; - do { - /* - * WSH: There's a race here: mark_buffer_dirty() - * could block, and the buffers aren't pinned down. - */ - mark_buffer_dirty(tmp, 0); - tmp = tmp->b_this_page; - } while (tmp != bh); - return 0; - } file = vma->vm_file; dentry = file->f_dentry; @@ -1174,50 +1107,15 @@ static int filemap_write_page(struct vm_area_struct * vma, /* - * Swapping to a shared file: while we're busy writing out the page - * (and the page still exists in memory), we save the page information - * in the page table, so that "filemap_swapin()" can re-use the page - * immediately if it is called while we're busy swapping it out.. - * - * Once we've written it all out, we mark the page entry "empty", which - * will result in a normal page-in (instead of a swap-in) from the now - * up-to-date disk file. + * The page cache takes care of races between somebody + * trying to swap something out and swap something in + * at the same time.. 
*/ -int filemap_swapout(struct vm_area_struct * vma, - unsigned long offset, - pte_t *page_table) -{ - int error; - unsigned long page = pte_page(*page_table); - unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page)); - - flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset)); - set_pte(page_table, __pte(entry)); - flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset)); - error = filemap_write_page(vma, offset, page); - if (pte_val(*page_table) == entry) - pte_clear(page_table); - return error; -} - -/* - * filemap_swapin() is called only if we have something in the page - * tables that is non-zero (but not present), which we know to be the - * page index of a page that is busy being swapped out (see above). - * So we just use it directly.. - */ -static pte_t filemap_swapin(struct vm_area_struct * vma, - unsigned long offset, - unsigned long entry) +int filemap_swapout(struct vm_area_struct * vma, struct page * page) { - unsigned long page = SWP_OFFSET(entry); - - atomic_inc(&mem_map[page].count); - page = (page << PAGE_SHIFT) + PAGE_OFFSET; - return mk_pte(page,vma->vm_page_prot); + return filemap_write_page(vma, page->offset, page_address(page)); } - static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { @@ -1358,7 +1256,7 @@ static struct vm_operations_struct file_shared_mmap = { filemap_nopage, /* nopage */ NULL, /* wppage */ filemap_swapout, /* swapout */ - filemap_swapin, /* swapin */ + NULL, /* swapin */ }; /* @@ -1637,7 +1535,7 @@ unsigned long get_cached_page(struct inode * inode, unsigned long offset, if (!page) { if (!new) goto out; - page_cache = get_free_page(GFP_KERNEL); + page_cache = get_free_page(GFP_USER); if (!page_cache) goto out; page = mem_map + MAP_NR(page_cache); diff --git a/mm/memory.c b/mm/memory.c index 932c35648..f788163c2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -126,48 +126,36 @@ int check_pgt_cache(void) * This function clears all user-level page tables of a process - this * is needed by execve(), so that old pages aren't in the way. */ -void clear_page_tables(struct task_struct * tsk) +void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr) { - pgd_t * page_dir = tsk->mm->pgd; - int i; - - if (!page_dir || page_dir == swapper_pg_dir) - goto out_bad; - for (i = 0 ; i < USER_PTRS_PER_PGD ; i++) - free_one_pgd(page_dir + i); + pgd_t * page_dir = mm->pgd; - /* keep the page table cache within bounds */ - check_pgt_cache(); - return; + if (page_dir && page_dir != swapper_pg_dir) { + page_dir += first; + do { + free_one_pgd(page_dir); + page_dir++; + } while (--nr); -out_bad: - printk(KERN_ERR - "clear_page_tables: %s trying to clear kernel pgd\n", - tsk->comm); - return; + /* keep the page table cache within bounds */ + check_pgt_cache(); + } } /* - * This function frees up all page tables of a process when it exits. It - * is the same as "clear_page_tables()", except it also frees the old - * page table directory. + * This function just free's the page directory - the + * pages tables themselves have been freed earlier by + * clear_page_tables(). 
*/ void free_page_tables(struct mm_struct * mm) { pgd_t * page_dir = mm->pgd; - int i; - - if (!page_dir) - goto out; - if (page_dir == swapper_pg_dir) - goto out_bad; - for (i = 0 ; i < USER_PTRS_PER_PGD ; i++) - free_one_pgd(page_dir + i); - pgd_free(page_dir); - - /* keep the page table cache within bounds */ - check_pgt_cache(); -out: + + if (page_dir) { + if (page_dir == swapper_pg_dir) + goto out_bad; + pgd_free(page_dir); + } return; out_bad: @@ -204,7 +192,7 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, pgd_t * src_pgd, * dst_pgd; unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; - unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE; + unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; @@ -277,10 +265,15 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; set_pte(dst_pte, pte); goto cont_copy_pte_range; } - if (cow) + /* If it's a COW mapping, write protect it both in the parent and the child */ + if (cow) { pte = pte_wrprotect(pte); + set_pte(src_pte, pte); + } + /* If it's a shared mapping, mark it clean in the child */ + if (vma->vm_flags & VM_SHARED) + pte = pte_mkclean(pte); set_pte(dst_pte, pte_mkold(pte)); - set_pte(src_pte, pte); atomic_inc(&mem_map[page_nr].count); cont_copy_pte_range: address += PAGE_SIZE; @@ -644,37 +637,47 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, page_map = mem_map + MAP_NR(old_page); /* - * Do we need to copy? + * We can avoid the copy if: + * - we're the only user (count == 1) + * - the only other user is the swap cache, + * and the only swap cache user is itself, + * in which case we can remove the page + * from the swap cache. */ - if (is_page_shared(page_map)) { + switch (atomic_read(&page_map->count)) { + case 2: + if (!PageSwapCache(page_map)) + break; + if (swap_count(page_map->offset) != 1) + break; + delete_from_swap_cache(page_map); + /* FallThrough */ + case 1: + /* We can release the kernel lock now.. */ unlock_kernel(); - if (!new_page) - return 0; - if (PageReserved(mem_map + MAP_NR(old_page))) - ++vma->vm_mm->rss; - copy_cow_page(old_page,new_page); - flush_page_to_ram(old_page); - flush_page_to_ram(new_page); flush_cache_page(vma, address); - set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); - free_page(old_page); + set_pte(page_table, pte_mkdirty(pte_mkwrite(pte))); flush_tlb_page(vma, address); +end_wp_page: + if (new_page) + free_page(new_page); return 1; } - - if (PageSwapCache(page_map)) - delete_from_swap_cache(page_map); - - /* We can release the kernel lock now.. */ + unlock_kernel(); + if (!new_page) + return 0; + if (PageReserved(mem_map + MAP_NR(old_page))) + ++vma->vm_mm->rss; + copy_cow_page(old_page,new_page); + flush_page_to_ram(old_page); + flush_page_to_ram(new_page); flush_cache_page(vma, address); - set_pte(page_table, pte_mkdirty(pte_mkwrite(pte))); + set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); + free_page(old_page); flush_tlb_page(vma, address); -end_wp_page: - if (new_page) - free_page(new_page); return 1; bad_wp_page: @@ -371,6 +371,100 @@ unsigned long get_unmapped_area(unsigned long addr, unsigned long len) } } +#define vm_avl_empty (struct vm_area_struct *) NULL + +#include "mmap_avl.c" + +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. 
*/ +struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) +{ + struct vm_area_struct *vma = NULL; + + if (mm) { + /* Check the cache first. */ + /* (Cache hit rate is typically around 35%.) */ + vma = mm->mmap_cache; + if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { + if (!mm->mmap_avl) { + /* Go through the linear list. */ + vma = mm->mmap; + while (vma && vma->vm_end <= addr) + vma = vma->vm_next; + } else { + /* Then go through the AVL tree quickly. */ + struct vm_area_struct * tree = mm->mmap_avl; + vma = NULL; + for (;;) { + if (tree == vm_avl_empty) + break; + if (tree->vm_end > addr) { + vma = tree; + if (tree->vm_start <= addr) + break; + tree = tree->vm_avl_left; + } else + tree = tree->vm_avl_right; + } + } + if (vma) + mm->mmap_cache = vma; + } + } + return vma; +} + +/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ +struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct **pprev) +{ + if (mm) { + if (!mm->mmap_avl) { + /* Go through the linear list. */ + struct vm_area_struct * prev = NULL; + struct vm_area_struct * vma = mm->mmap; + while (vma && vma->vm_end <= addr) { + prev = vma; + vma = vma->vm_next; + } + *pprev = prev; + return vma; + } else { + /* Go through the AVL tree quickly. */ + struct vm_area_struct * vma = NULL; + struct vm_area_struct * last_turn_right = NULL; + struct vm_area_struct * prev = NULL; + struct vm_area_struct * tree = mm->mmap_avl; + for (;;) { + if (tree == vm_avl_empty) + break; + if (tree->vm_end > addr) { + vma = tree; + prev = last_turn_right; + if (tree->vm_start <= addr) + break; + tree = tree->vm_avl_left; + } else { + last_turn_right = tree; + tree = tree->vm_avl_right; + } + } + if (vma) { + if (vma->vm_avl_left != vm_avl_empty) { + prev = vma->vm_avl_left; + while (prev->vm_avl_right != vm_avl_empty) + prev = prev->vm_avl_right; + } + if ((prev ? prev->vm_next : mm->mmap) != vma) + printk("find_vma_prev: tree inconsistent with list\n"); + *pprev = prev; + return vma; + } + } + } + *pprev = NULL; + return NULL; +} + /* Normal function to fix up a mapping * This function is the default for when an area has no specific * function. This may be used as part of a more specific routine. @@ -446,6 +540,57 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr, return 1; } +/* + * Try to free as many page directory entries as we can, + * without having to work very hard at actually scanning + * the page tables themselves. + * + * Right now we try to free page tables if we have a nice + * PGDIR-aligned area that got free'd up. We could be more + * granular if we want to, but this is fast and simple, + * and covers the bad cases. + * + * "prev", if it exists, points to a vma before the one + * we just free'd - but there's no telling how much before. 
+ */ +static void free_pgtables(struct mm_struct * mm, struct vm_area_struct *prev, + unsigned long start, unsigned long end) +{ + unsigned long first = start & PGDIR_MASK; + unsigned long last = (end + PGDIR_SIZE - 1) & PGDIR_MASK; + + if (!prev) { + prev = mm->mmap; + if (!prev) + goto no_mmaps; + if (prev->vm_end > start) { + if (last > prev->vm_end) + last = prev->vm_end; + goto no_mmaps; + } + } + for (;;) { + struct vm_area_struct *next = prev->vm_next; + + if (next) { + if (next->vm_start < start) { + prev = next; + continue; + } + if (last > next->vm_start) + last = next->vm_start; + } + if (prev->vm_end > first) + first = prev->vm_end + PGDIR_SIZE - 1; + break; + } +no_mmaps: + first = first >> PGDIR_SHIFT; + last = last >> PGDIR_SHIFT; + if (last > first) + clear_page_tables(mm, first, last-first); +} + /* Munmap is split into 2 main parts -- this part which finds * what needs doing, and the areas themselves, which do the * work. This now handles partial unmappings. @@ -454,8 +599,7 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr, int do_munmap(unsigned long addr, size_t len) { struct mm_struct * mm; - struct vm_area_struct *mpnt, *free, *extra; - int freed; + struct vm_area_struct *mpnt, *prev, **npp, *free, *extra; if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr) return -EINVAL; @@ -469,15 +613,17 @@ int do_munmap(unsigned long addr, size_t len) * on the list. If nothing is put on, nothing is affected. */ mm = current->mm; - mpnt = mm->mmap; - while(mpnt && mpnt->vm_end <= addr) - mpnt = mpnt->vm_next; + mpnt = find_vma_prev(mm, addr, &prev); if (!mpnt) return 0; + /* we have addr < mpnt->vm_end */ + + if (mpnt->vm_start >= addr+len) + return 0; /* If we'll make "hole", check the vm areas limit */ - if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) && - mm->map_count > MAX_MAP_COUNT) + if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) + && mm->map_count >= MAX_MAP_COUNT) return -ENOMEM; /* @@ -488,18 +634,14 @@ int do_munmap(unsigned long addr, size_t len) if (!extra) return -ENOMEM; - /* we have addr < mpnt->vm_end */ + npp = (prev ? &prev->vm_next : &mm->mmap); free = NULL; - for ( ; mpnt && mpnt->vm_start < addr+len; ) { - struct vm_area_struct *next = mpnt->vm_next; - - if(mpnt->vm_next) - mpnt->vm_next->vm_pprev = mpnt->vm_pprev; - *mpnt->vm_pprev = mpnt->vm_next; - + for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) { + *npp = mpnt->vm_next; mpnt->vm_next = free; free = mpnt; - mpnt = next; + if (mm->mmap_avl) + avl_remove(mpnt, &mm->mmap_avl); } /* Ok - we have the memory areas we should free on the 'free' list, @@ -507,15 +649,10 @@ int do_munmap(unsigned long addr, size_t len) * If the one of the segments is only being partially unmapped, * it will put new vm_area_struct(s) into the address space. */ - freed = 0; while ((mpnt = free) != NULL) { unsigned long st, end, size; free = free->vm_next; - freed = 1; - - mm->map_count--; - remove_shared_vm_struct(mpnt); st = addr < mpnt->vm_start ? mpnt->vm_start : addr; end = addr+len; @@ -525,6 +662,9 @@ int do_munmap(unsigned long addr, size_t len) if (mpnt->vm_ops && mpnt->vm_ops->unmap) mpnt->vm_ops->unmap(mpnt, st, size); + remove_shared_vm_struct(mpnt); + mm->map_count--; + flush_cache_range(mm, st, end); zap_page_range(mm, st, size); flush_tlb_range(mm, st, end); @@ -540,8 +680,9 @@ int do_munmap(unsigned long addr, size_t len) if (extra) kmem_cache_free(vm_area_cachep, extra); - if (freed) - mm->mmap_cache = NULL; /* Kill the cache. 
*/ + free_pgtables(mm, prev, addr, addr+len); + + mm->mmap_cache = NULL; /* Kill the cache. */ return 0; } @@ -557,13 +698,23 @@ asmlinkage int sys_munmap(unsigned long addr, size_t len) return ret; } +/* Build the AVL tree corresponding to the VMA list. */ +void build_mmap_avl(struct mm_struct * mm) +{ + struct vm_area_struct * vma; + + mm->mmap_avl = NULL; + for (vma = mm->mmap; vma; vma = vma->vm_next) + avl_insert(vma, &mm->mmap_avl); +} + /* Release all mmaps. */ void exit_mmap(struct mm_struct * mm) { struct vm_area_struct * mpnt; mpnt = mm->mmap; - mm->mmap = mm->mmap_cache = NULL; + mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL; mm->rss = 0; mm->total_vm = 0; mm->locked_vm = 0; @@ -591,6 +742,8 @@ void exit_mmap(struct mm_struct * mm) /* This is just debugging */ if (mm->map_count) printk("exit_mmap: map count is %d\n", mm->map_count); + + clear_page_tables(mm, 0, USER_PTRS_PER_PGD); } /* Insert vm structure into process list sorted by address @@ -598,20 +751,26 @@ void exit_mmap(struct mm_struct * mm) */ void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp) { - struct vm_area_struct **pprev = &mm->mmap; + struct vm_area_struct **pprev; struct file * file; - mm->map_count++; - - /* Find where to link it in. */ - while(*pprev && (*pprev)->vm_start <= vmp->vm_start) - pprev = &(*pprev)->vm_next; - - /* Insert it. */ - if((vmp->vm_next = *pprev) != NULL) - (*pprev)->vm_pprev = &vmp->vm_next; + if (!mm->mmap_avl) { + pprev = &mm->mmap; + while (*pprev && (*pprev)->vm_start <= vmp->vm_start) + pprev = &(*pprev)->vm_next; + } else { + struct vm_area_struct *prev, *next; + avl_insert_neighbours(vmp, &mm->mmap_avl, &prev, &next); + pprev = (prev ? &prev->vm_next : &mm->mmap); + if (*pprev != next) + printk("insert_vm_struct: tree inconsistent with list\n"); + } + vmp->vm_next = *pprev; *pprev = vmp; - vmp->vm_pprev = pprev; + + mm->map_count++; + if (mm->map_count >= AVL_MIN_MAP_COUNT && !mm->mmap_avl) + build_mmap_avl(mm); file = vmp->vm_file; if (file) { @@ -637,23 +796,17 @@ void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp) */ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) { - struct vm_area_struct *prev, *mpnt, *next; + struct vm_area_struct *prev, *mpnt, *next, *prev1; - prev = NULL; - mpnt = mm->mmap; - while(mpnt && mpnt->vm_end <= start_addr) { - prev = mpnt; - mpnt = mpnt->vm_next; - } + mpnt = find_vma_prev(mm, start_addr, &prev1); if (!mpnt) return; - next = mpnt->vm_next; - - /* we have prev->vm_next == mpnt && mpnt->vm_next = next */ - if (!prev) { + if (prev1) { + prev = prev1; + } else { prev = mpnt; - mpnt = next; + mpnt = mpnt->vm_next; } /* prev and mpnt cycle through the list, as long as @@ -684,11 +837,10 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l * big segment can possibly merge with the next one. * The old unused mpnt is freed. */ - if(mpnt->vm_next) - mpnt->vm_next->vm_pprev = mpnt->vm_pprev; - *mpnt->vm_pprev = mpnt->vm_next; - + if (mm->mmap_avl) + avl_remove(mpnt, &mm->mmap_avl); prev->vm_end = mpnt->vm_end; + prev->vm_next = mpnt->vm_next; if (mpnt->vm_ops && mpnt->vm_ops->close) { mpnt->vm_offset += mpnt->vm_end - mpnt->vm_start; mpnt->vm_start = mpnt->vm_end; diff --git a/mm/mmap_avl.c b/mm/mmap_avl.c new file mode 100644 index 000000000..5a48ce89b --- /dev/null +++ b/mm/mmap_avl.c @@ -0,0 +1,374 @@ +/* + * Searching a VMA in the linear list task->mm->mmap is horribly slow. 
+ * Use an AVL (Adelson-Velskii and Landis) tree to speed up this search + * from O(n) to O(log n), where n is the number of VMAs of the task + * n is typically around 6, but may reach 3000 in some cases: object-oriented + * databases, persistent store, generational garbage collection (Java, Lisp), + * ElectricFence. + * Written by Bruno Haible <haible@ma2s2.mathematik.uni-karlsruhe.de>. + */ + +/* We keep the list and tree sorted by address. */ +#define vm_avl_key vm_end +#define vm_avl_key_t unsigned long /* typeof(vma->avl_key) */ + +/* + * task->mm->mmap_avl is the AVL tree corresponding to task->mm->mmap + * or, more exactly, its root. + * A vm_area_struct has the following fields: + * vm_avl_left left son of a tree node + * vm_avl_right right son of a tree node + * vm_avl_height 1+max(heightof(left),heightof(right)) + * The empty tree is represented as NULL. + */ + +/* Since the trees are balanced, their height will never be large. */ +#define avl_maxheight 41 /* why this? a small exercise */ +#define heightof(tree) ((tree) == vm_avl_empty ? 0 : (tree)->vm_avl_height) +/* + * Consistency and balancing rules: + * 1. tree->vm_avl_height == 1+max(heightof(tree->vm_avl_left),heightof(tree->vm_avl_right)) + * 2. abs( heightof(tree->vm_avl_left) - heightof(tree->vm_avl_right) ) <= 1 + * 3. foreach node in tree->vm_avl_left: node->vm_avl_key <= tree->vm_avl_key, + * foreach node in tree->vm_avl_right: node->vm_avl_key >= tree->vm_avl_key. + */ + +#ifdef DEBUG_AVL + +/* Look up the nodes at the left and at the right of a given node. */ +static void avl_neighbours (struct vm_area_struct * node, struct vm_area_struct * tree, struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right) +{ + vm_avl_key_t key = node->vm_avl_key; + + *to_the_left = *to_the_right = NULL; + for (;;) { + if (tree == vm_avl_empty) { + printk("avl_neighbours: node not found in the tree\n"); + return; + } + if (key == tree->vm_avl_key) + break; + if (key < tree->vm_avl_key) { + *to_the_right = tree; + tree = tree->vm_avl_left; + } else { + *to_the_left = tree; + tree = tree->vm_avl_right; + } + } + if (tree != node) { + printk("avl_neighbours: node not exactly found in the tree\n"); + return; + } + if (tree->vm_avl_left != vm_avl_empty) { + struct vm_area_struct * node; + for (node = tree->vm_avl_left; node->vm_avl_right != vm_avl_empty; node = node->vm_avl_right) + continue; + *to_the_left = node; + } + if (tree->vm_avl_right != vm_avl_empty) { + struct vm_area_struct * node; + for (node = tree->vm_avl_right; node->vm_avl_left != vm_avl_empty; node = node->vm_avl_left) + continue; + *to_the_right = node; + } + if ((*to_the_left && ((*to_the_left)->vm_next != node)) || (node->vm_next != *to_the_right)) + printk("avl_neighbours: tree inconsistent with list\n"); +} + +#endif + +/* + * Rebalance a tree. + * After inserting or deleting a node of a tree we have a sequence of subtrees + * nodes[0]..nodes[k-1] such that + * nodes[0] is the root and nodes[i+1] = nodes[i]->{vm_avl_left|vm_avl_right}. 
+ */ +static void avl_rebalance (struct vm_area_struct *** nodeplaces_ptr, int count) +{ + for ( ; count > 0 ; count--) { + struct vm_area_struct ** nodeplace = *--nodeplaces_ptr; + struct vm_area_struct * node = *nodeplace; + struct vm_area_struct * nodeleft = node->vm_avl_left; + struct vm_area_struct * noderight = node->vm_avl_right; + int heightleft = heightof(nodeleft); + int heightright = heightof(noderight); + if (heightright + 1 < heightleft) { + /* */ + /* * */ + /* / \ */ + /* n+2 n */ + /* */ + struct vm_area_struct * nodeleftleft = nodeleft->vm_avl_left; + struct vm_area_struct * nodeleftright = nodeleft->vm_avl_right; + int heightleftright = heightof(nodeleftright); + if (heightof(nodeleftleft) >= heightleftright) { + /* */ + /* * n+2|n+3 */ + /* / \ / \ */ + /* n+2 n --> / n+1|n+2 */ + /* / \ | / \ */ + /* n+1 n|n+1 n+1 n|n+1 n */ + /* */ + node->vm_avl_left = nodeleftright; nodeleft->vm_avl_right = node; + nodeleft->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightleftright); + *nodeplace = nodeleft; + } else { + /* */ + /* * n+2 */ + /* / \ / \ */ + /* n+2 n --> n+1 n+1 */ + /* / \ / \ / \ */ + /* n n+1 n L R n */ + /* / \ */ + /* L R */ + /* */ + nodeleft->vm_avl_right = nodeleftright->vm_avl_left; + node->vm_avl_left = nodeleftright->vm_avl_right; + nodeleftright->vm_avl_left = nodeleft; + nodeleftright->vm_avl_right = node; + nodeleft->vm_avl_height = node->vm_avl_height = heightleftright; + nodeleftright->vm_avl_height = heightleft; + *nodeplace = nodeleftright; + } + } + else if (heightleft + 1 < heightright) { + /* similar to the above, just interchange 'left' <--> 'right' */ + struct vm_area_struct * noderightright = noderight->vm_avl_right; + struct vm_area_struct * noderightleft = noderight->vm_avl_left; + int heightrightleft = heightof(noderightleft); + if (heightof(noderightright) >= heightrightleft) { + node->vm_avl_right = noderightleft; noderight->vm_avl_left = node; + noderight->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightrightleft); + *nodeplace = noderight; + } else { + noderight->vm_avl_left = noderightleft->vm_avl_right; + node->vm_avl_right = noderightleft->vm_avl_left; + noderightleft->vm_avl_right = noderight; + noderightleft->vm_avl_left = node; + noderight->vm_avl_height = node->vm_avl_height = heightrightleft; + noderightleft->vm_avl_height = heightright; + *nodeplace = noderightleft; + } + } + else { + int height = (heightleft<heightright ? heightright : heightleft) + 1; + if (height == node->vm_avl_height) + break; + node->vm_avl_height = height; + } + } +} + +/* Insert a node into a tree. */ +static inline void avl_insert (struct vm_area_struct * new_node, struct vm_area_struct ** ptree) +{ + vm_avl_key_t key = new_node->vm_avl_key; + struct vm_area_struct ** nodeplace = ptree; + struct vm_area_struct ** stack[avl_maxheight]; + int stack_count = 0; + struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */ + for (;;) { + struct vm_area_struct * node = *nodeplace; + if (node == vm_avl_empty) + break; + *stack_ptr++ = nodeplace; stack_count++; + if (key < node->vm_avl_key) + nodeplace = &node->vm_avl_left; + else + nodeplace = &node->vm_avl_right; + } + new_node->vm_avl_left = vm_avl_empty; + new_node->vm_avl_right = vm_avl_empty; + new_node->vm_avl_height = 1; + *nodeplace = new_node; + avl_rebalance(stack_ptr,stack_count); +} + +/* Insert a node into a tree, and + * return the node to the left of it and the node to the right of it. 
+ */ +static inline void avl_insert_neighbours (struct vm_area_struct * new_node, struct vm_area_struct ** ptree, + struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right) +{ + vm_avl_key_t key = new_node->vm_avl_key; + struct vm_area_struct ** nodeplace = ptree; + struct vm_area_struct ** stack[avl_maxheight]; + int stack_count = 0; + struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */ + *to_the_left = *to_the_right = NULL; + for (;;) { + struct vm_area_struct * node = *nodeplace; + if (node == vm_avl_empty) + break; + *stack_ptr++ = nodeplace; stack_count++; + if (key < node->vm_avl_key) { + *to_the_right = node; + nodeplace = &node->vm_avl_left; + } else { + *to_the_left = node; + nodeplace = &node->vm_avl_right; + } + } + new_node->vm_avl_left = vm_avl_empty; + new_node->vm_avl_right = vm_avl_empty; + new_node->vm_avl_height = 1; + *nodeplace = new_node; + avl_rebalance(stack_ptr,stack_count); +} + +/* Removes a node out of a tree. */ +static void avl_remove (struct vm_area_struct * node_to_delete, struct vm_area_struct ** ptree) +{ + vm_avl_key_t key = node_to_delete->vm_avl_key; + struct vm_area_struct ** nodeplace = ptree; + struct vm_area_struct ** stack[avl_maxheight]; + int stack_count = 0; + struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */ + struct vm_area_struct ** nodeplace_to_delete; + for (;;) { + struct vm_area_struct * node = *nodeplace; +#ifdef DEBUG_AVL + if (node == vm_avl_empty) { + /* what? node_to_delete not found in tree? */ + printk("avl_remove: node to delete not found in tree\n"); + return; + } +#endif + *stack_ptr++ = nodeplace; stack_count++; + if (key == node->vm_avl_key) + break; + if (key < node->vm_avl_key) + nodeplace = &node->vm_avl_left; + else + nodeplace = &node->vm_avl_right; + } + nodeplace_to_delete = nodeplace; + /* Have to remove node_to_delete = *nodeplace_to_delete. 
*/ + if (node_to_delete->vm_avl_left == vm_avl_empty) { + *nodeplace_to_delete = node_to_delete->vm_avl_right; + stack_ptr--; stack_count--; + } else { + struct vm_area_struct *** stack_ptr_to_delete = stack_ptr; + struct vm_area_struct ** nodeplace = &node_to_delete->vm_avl_left; + struct vm_area_struct * node; + for (;;) { + node = *nodeplace; + if (node->vm_avl_right == vm_avl_empty) + break; + *stack_ptr++ = nodeplace; stack_count++; + nodeplace = &node->vm_avl_right; + } + *nodeplace = node->vm_avl_left; + /* node replaces node_to_delete */ + node->vm_avl_left = node_to_delete->vm_avl_left; + node->vm_avl_right = node_to_delete->vm_avl_right; + node->vm_avl_height = node_to_delete->vm_avl_height; + *nodeplace_to_delete = node; /* replace node_to_delete */ + *stack_ptr_to_delete = &node->vm_avl_left; /* replace &node_to_delete->vm_avl_left */ + } + avl_rebalance(stack_ptr,stack_count); +} + +#ifdef DEBUG_AVL + +/* print a list */ +static void printk_list (struct vm_area_struct * vma) +{ + printk("["); + while (vma) { + printk("%08lX-%08lX", vma->vm_start, vma->vm_end); + vma = vma->vm_next; + if (!vma) + break; + printk(" "); + } + printk("]"); +} + +/* print a tree */ +static void printk_avl (struct vm_area_struct * tree) +{ + if (tree != vm_avl_empty) { + printk("("); + if (tree->vm_avl_left != vm_avl_empty) { + printk_avl(tree->vm_avl_left); + printk("<"); + } + printk("%08lX-%08lX", tree->vm_start, tree->vm_end); + if (tree->vm_avl_right != vm_avl_empty) { + printk(">"); + printk_avl(tree->vm_avl_right); + } + printk(")"); + } +} + +static char *avl_check_point = "somewhere"; + +/* check a tree's consistency and balancing */ +static void avl_checkheights (struct vm_area_struct * tree) +{ + int h, hl, hr; + + if (tree == vm_avl_empty) + return; + avl_checkheights(tree->vm_avl_left); + avl_checkheights(tree->vm_avl_right); + h = tree->vm_avl_height; + hl = heightof(tree->vm_avl_left); + hr = heightof(tree->vm_avl_right); + if ((h == hl+1) && (hr <= hl) && (hl <= hr+1)) + return; + if ((h == hr+1) && (hl <= hr) && (hr <= hl+1)) + return; + printk("%s: avl_checkheights: heights inconsistent\n",avl_check_point); +} + +/* check that all values stored in a tree are < key */ +static void avl_checkleft (struct vm_area_struct * tree, vm_avl_key_t key) +{ + if (tree == vm_avl_empty) + return; + avl_checkleft(tree->vm_avl_left,key); + avl_checkleft(tree->vm_avl_right,key); + if (tree->vm_avl_key < key) + return; + printk("%s: avl_checkleft: left key %lu >= top key %lu\n",avl_check_point,tree->vm_avl_key,key); +} + +/* check that all values stored in a tree are > key */ +static void avl_checkright (struct vm_area_struct * tree, vm_avl_key_t key) +{ + if (tree == vm_avl_empty) + return; + avl_checkright(tree->vm_avl_left,key); + avl_checkright(tree->vm_avl_right,key); + if (tree->vm_avl_key > key) + return; + printk("%s: avl_checkright: right key %lu <= top key %lu\n",avl_check_point,tree->vm_avl_key,key); +} + +/* check that all values are properly increasing */ +static void avl_checkorder (struct vm_area_struct * tree) +{ + if (tree == vm_avl_empty) + return; + avl_checkorder(tree->vm_avl_left); + avl_checkorder(tree->vm_avl_right); + avl_checkleft(tree->vm_avl_left,tree->vm_avl_key); + avl_checkright(tree->vm_avl_right,tree->vm_avl_key); +} + +/* all checks */ +static void avl_check (struct task_struct * task, char *caller) +{ + avl_check_point = caller; +/* printk("task \"%s\", %s\n",task->comm,caller); */ +/* printk("task \"%s\" list: ",task->comm); printk_list(task->mm->mmap); 
printk("\n"); */ +/* printk("task \"%s\" tree: ",task->comm); printk_avl(task->mm->mmap_avl); printk("\n"); */ + avl_checkheights(task->mm->mmap_avl); + avl_checkorder(task->mm->mmap_avl); +} + +#endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7ceec01b9..4a956c085 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -90,33 +90,6 @@ static inline void remove_mem_queue(struct page * entry) */ spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED; -/* - * This routine is used by the kernel swap daemon to determine - * whether we have "enough" free pages. It is fairly arbitrary, - * having a low-water and high-water mark. - * - * This returns: - * 0 - urgent need for memory - * 1 - need some memory, but do it slowly in the background - * 2 - no need to even think about it. - */ -int free_memory_available(void) -{ - static int available = 1; - - if (nr_free_pages < freepages.low) { - available = 0; - return 0; - } - - if (nr_free_pages > freepages.high) { - available = 1; - return 2; - } - - return available; -} - static inline void free_pages_ok(unsigned long map_nr, unsigned long order) { struct free_area_struct *area = free_area + order; @@ -151,14 +124,10 @@ void __free_page(struct page *page) if (!PageReserved(page) && atomic_dec_and_test(&page->count)) { if (PageSwapCache(page)) panic ("Freeing swap cache page"); - free_pages_ok(page->map_nr, 0); + page->flags &= ~(1 << PG_referenced); + free_pages_ok(page - mem_map, 0); return; } -#if 0 - if (PageSwapCache(page) && atomic_read(&page->count) == 1) - printk(KERN_WARNING "VM: Releasing swap cache page at %p", - __builtin_return_address(0)); -#endif } void free_pages(unsigned long addr, unsigned long order) @@ -172,15 +141,10 @@ void free_pages(unsigned long addr, unsigned long order) if (atomic_dec_and_test(&map->count)) { if (PageSwapCache(map)) panic ("Freeing swap cache pages"); + map->flags &= ~(1 << PG_referenced); free_pages_ok(map_nr, order); return; } -#if 0 - if (PageSwapCache(map) && atomic_read(&map->count) == 1) - printk(KERN_WARNING - "VM: Releasing swap cache pages at %p", - __builtin_return_address(0)); -#endif } } @@ -191,14 +155,15 @@ void free_pages(unsigned long addr, unsigned long order) change_bit((index) >> (1+(order)), (area)->map) #define CAN_DMA(x) (PageDMA(x)) #define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT)) -#define RMQUEUE(order, dma) \ +#define RMQUEUE(order, gfp_mask) \ do { struct free_area_struct * area = free_area+order; \ unsigned long new_order = order; \ do { struct page *prev = memory_head(area), *ret = prev->next; \ while (memory_head(area) != ret) { \ - if (!dma || CAN_DMA(ret)) { \ - unsigned long map_nr = ret->map_nr; \ + if (!(gfp_mask & __GFP_DMA) || CAN_DMA(ret)) { \ + unsigned long map_nr; \ (prev->next = ret->next)->prev = prev; \ + map_nr = ret - mem_map; \ MARK_USED(map_nr, new_order, area); \ nr_free_pages -= 1 << order; \ EXPAND(ret, map_nr, order, new_order, area); \ @@ -224,6 +189,8 @@ do { unsigned long size = 1 << high; \ atomic_set(&map->count, 1); \ } while (0) +int low_on_memory = 0; + unsigned long __get_free_pages(int gfp_mask, unsigned long order) { unsigned long flags; @@ -231,30 +198,45 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order) if (order >= NR_MEM_LISTS) goto nopage; - if (gfp_mask & __GFP_WAIT) { - if (in_interrupt()) { - static int count = 0; - if (++count < 5) { - printk("gfp called nonatomically from interrupt %p\n", - __builtin_return_address(0)); - } - goto nopage; +#ifdef ATOMIC_MEMORY_DEBUGGING + if ((gfp_mask & __GFP_WAIT) && 
in_interrupt()) { + static int count = 0; + if (++count < 5) { + printk("gfp called nonatomically from interrupt %p\n", + __builtin_return_address(0)); } + goto nopage; + } +#endif - if (freepages.min > nr_free_pages) { - int freed; - freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX); - /* - * Low priority (user) allocations must not - * succeed if we didn't have enough memory - * and we couldn't get more.. - */ - if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH))) - goto nopage; + /* + * If this is a recursive call, we'd better + * do our best to just allocate things without + * further thought. + */ + if (!(current->flags & PF_MEMALLOC)) { + int freed; + + if (nr_free_pages > freepages.min) { + if (!low_on_memory) + goto ok_to_allocate; + if (nr_free_pages >= freepages.high) { + low_on_memory = 0; + goto ok_to_allocate; + } } + + low_on_memory = 1; + current->flags |= PF_MEMALLOC; + freed = try_to_free_pages(gfp_mask); + current->flags &= ~PF_MEMALLOC; + + if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH))) + goto nopage; } +ok_to_allocate: spin_lock_irqsave(&page_alloc_lock, flags); - RMQUEUE(order, (gfp_mask & GFP_DMA)); + RMQUEUE(order, gfp_mask); spin_unlock_irqrestore(&page_alloc_lock, flags); /* @@ -341,7 +323,6 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m --p; atomic_set(&p->count, 0); p->flags = (1 << PG_DMA) | (1 << PG_reserved); - p->map_nr = p - mem_map; } while (p > mem_map); for (i = 0 ; i < NR_MEM_LISTS ; i++) { @@ -359,6 +340,46 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m return start_mem; } +/* + * Primitive swap readahead code. We simply read an aligned block of + * (1 << page_cluster) entries in the swap area. This method is chosen + * because it doesn't cost us any seek time. We also make sure to queue + * the 'original' request together with the readahead ones... + */ +void swapin_readahead(unsigned long entry) +{ + int i; + struct page *new_page; + unsigned long offset = SWP_OFFSET(entry); + struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info; + + offset = (offset >> page_cluster) << page_cluster; + + i = 1 << page_cluster; + do { + /* Don't read-ahead past the end of the swap area */ + if (offset >= swapdev->max) + break; + /* Don't block on I/O for read-ahead */ + if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster) + break; + /* Don't read in bad or busy pages */ + if (!swapdev->swap_map[offset]) + break; + if (swapdev->swap_map[offset] == SWAP_MAP_BAD) + break; + if (test_bit(offset, swapdev->swap_lockmap)) + break; + + /* Ok, do the async read-ahead now */ + new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0); + if (new_page != NULL) + __free_page(new_page); + offset++; + } while (--i); + return; +} + /* * The tests may look silly, but it essentially makes sure that * no other process did a swap-in on us just as we were waiting. 
@@ -370,10 +391,12 @@ void swap_in(struct task_struct * tsk, struct vm_area_struct * vma, pte_t * page_table, unsigned long entry, int write_access) { unsigned long page; - struct page *page_map; - - page_map = read_swap_cache(entry); + struct page *page_map = lookup_swap_cache(entry); + if (!page_map) { + swapin_readahead(entry); + page_map = read_swap_cache(entry); + } if (pte_val(*page_table) != entry) { if (page_map) free_page_and_swap_cache(page_address(page_map)); diff --git a/mm/page_io.c b/mm/page_io.c index 2dd24facc..498e4f63d 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -7,6 +7,7 @@ * Asynchronous swapping added 30.12.95. Stephen Tweedie * Removed race in async swapping. 14.4.1996. Bruno Haible * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie + * Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman */ #include <linux/mm.h> @@ -15,8 +16,6 @@ #include <linux/locks.h> #include <linux/swapctl.h> -#include <asm/dma.h> -#include <asm/uaccess.h> /* for copy_to/from_user */ #include <asm/pgtable.h> static struct wait_queue * lock_queue = NULL; @@ -24,8 +23,6 @@ static struct wait_queue * lock_queue = NULL; /* * Reads or writes a swap page. * wait=1: start I/O and wait for completion. wait=0: start asynchronous I/O. - * All IO to swap files (as opposed to swap partitions) is done - * synchronously. * * Important prevention of race condition: the caller *must* atomically * create a unique swap cache entry for this swap page before calling @@ -38,21 +35,22 @@ static struct wait_queue * lock_queue = NULL; * that shared pages stay shared while being swapped. */ -void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) +static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, int wait) { unsigned long type, offset; struct swap_info_struct * p; - struct page *page = mem_map + MAP_NR(buf); + int zones[PAGE_SIZE/512]; + int zones_used; + kdev_t dev = 0; + int block_size; #ifdef DEBUG_SWAP printk ("DebugVM: %s_swap_page entry %08lx, page %p (count %d), %s\n", (rw == READ) ? "read" : "write", - entry, buf, atomic_read(&page->count), + entry, (char *) page_address(page), atomic_read(&page->count), wait ? "wait" : "nowait"); #endif - if (page->inode && page->inode != &swapper_inode) - panic ("Tried to swap a non-swapper page"); type = SWP_TYPE(entry); if (type >= nr_swapfiles) { printk("Internal error: bad swap-device\n"); @@ -60,7 +58,7 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) } /* Don't allow too many pending pages in flight.. */ - if (atomic_read(&nr_async_pages) > SWAP_CLUSTER_MAX) + if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster) wait = 1; p = &swap_info[type]; @@ -85,13 +83,27 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) printk(KERN_ERR "VM: swap page is unlocked\n"); return; } - - /* Make sure we are the only process doing I/O with this swap page. */ - while (test_and_set_bit(offset,p->swap_lockmap)) { - run_task_queue(&tq_disk); - sleep_on(&lock_queue); + + if (PageSwapCache(page)) { + /* Make sure we are the only process doing I/O with this swap page. */ + while (test_and_set_bit(offset,p->swap_lockmap)) { + run_task_queue(&tq_disk); + sleep_on(&lock_queue); + } + + /* + * Make sure that we have a swap cache association for this + * page. We need this to find which swap page to unlock once + * the swap IO has completed to the physical page. 
If the page + * is not already in the cache, just overload the offset entry + * as if it were: we are not allowed to manipulate the inode + * hashing for locked pages. + */ + if (page->offset != entry) { + printk ("swap entry mismatch"); + return; + } } - if (rw == READ) { clear_bit(PG_uptodate, &page->flags); kstat.pswpin++; @@ -99,54 +111,25 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) kstat.pswpout++; atomic_inc(&page->count); - /* - * Make sure that we have a swap cache association for this - * page. We need this to find which swap page to unlock once - * the swap IO has completed to the physical page. If the page - * is not already in the cache, just overload the offset entry - * as if it were: we are not allowed to manipulate the inode - * hashing for locked pages. - */ - if (!PageSwapCache(page)) { - printk(KERN_ERR "VM: swap page is not in swap cache\n"); - return; - } - if (page->offset != entry) { - printk (KERN_ERR "VM: swap entry mismatch\n"); - return; - } - if (p->swap_device) { - if (!wait) { - set_bit(PG_free_after, &page->flags); - set_bit(PG_decr_after, &page->flags); - set_bit(PG_swap_unlock_after, &page->flags); - atomic_inc(&nr_async_pages); - } - ll_rw_page(rw,p->swap_device,offset,buf); - /* - * NOTE! We don't decrement the page count if we - * don't wait - that will happen asynchronously - * when the IO completes. - */ - if (!wait) - return; - wait_on_page(page); + zones[0] = offset; + zones_used = 1; + dev = p->swap_device; + block_size = PAGE_SIZE; } else if (p->swap_file) { struct inode *swapf = p->swap_file->d_inode; - unsigned int zones[PAGE_SIZE/512]; int i; if (swapf->i_op->bmap == NULL && swapf->i_op->smap != NULL){ /* - With MS-DOS, we use msdos_smap which return + With MS-DOS, we use msdos_smap which returns a sector number (not a cluster or block number). It is a patch to enable the UMSDOS project. Other people are working on better solution. It sounds like ll_rw_swap_file defined - it operation size (sector size) based on - PAGE_SIZE and the number of block to read. + its operation size (sector size) based on + PAGE_SIZE and the number of blocks to read. So using bmap or smap should work even if smap will require more blocks. */ @@ -159,39 +142,72 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) return; } } + block_size = 512; }else{ int j; unsigned int block = offset << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits); - for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize) + block_size = swapf->i_sb->s_blocksize; + for (i=0, j=0; j< PAGE_SIZE ; i++, j += block_size) if (!(zones[i] = bmap(swapf,block++))) { printk("rw_swap_page: bad swap file\n"); return; } + zones_used = i; + dev = swapf->i_dev; } - ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf); - /* Unlike ll_rw_page, ll_rw_swap_file won't unlock the - page for us. */ - clear_bit(PG_locked, &page->flags); - wake_up(&page->wait); - } else + } else { printk(KERN_ERR "rw_swap_page: no swap file or device\n"); + /* Do some cleaning up so if this ever happens we can hopefully + * trigger controlled shutdown. + */ + if (PageSwapCache(page)) { + if (!test_and_clear_bit(offset,p->swap_lockmap)) + printk("swap_after_unlock_page: lock already cleared\n"); + wake_up(&lock_queue); + } + atomic_dec(&page->count); + return; + } + if (!wait) { + set_bit(PG_decr_after, &page->flags); + atomic_inc(&nr_async_pages); + } + if (PageSwapCache(page)) { + /* only lock/unlock swap cache pages! 
*/ + set_bit(PG_swap_unlock_after, &page->flags); + } + set_bit(PG_free_after, &page->flags); + /* block_size == PAGE_SIZE/zones_used */ + brw_page(rw, page, dev, zones, block_size, 0); + + /* Note! For consistency we do all of the logic, + * decrementing the page count, and unlocking the page in the + * swap lock map - in the IO completion handler. + */ + if (!wait) + return; + wait_on_page(page); /* This shouldn't happen, but check to be sure. */ - if (atomic_read(&page->count) == 1) + if (atomic_read(&page->count) == 0) printk(KERN_ERR "rw_swap_page: page unused while waiting!\n"); - atomic_dec(&page->count); - if (offset && !test_and_clear_bit(offset,p->swap_lockmap)) - printk(KERN_ERR "rw_swap_page: lock already cleared\n"); - wake_up(&lock_queue); + #ifdef DEBUG_SWAP printk ("DebugVM: %s_swap_page finished on page %p (count %d)\n", (rw == READ) ? "read" : "write", - buf, atomic_read(&page->count)); + (char *) page_adddress(page), + atomic_read(&page->count)); #endif } +/* Note: We could remove this totally asynchronous function, + * and improve swap performance, and remove the need for the swap lock map, + * by not removing pages from the swap cache until after I/O has been + * processed and letting remove_from_page_cache decrement the swap count + * just before it removes the page from the page cache. + */ /* This is run when asynchronous page I/O has completed. */ void swap_after_unlock_page (unsigned long entry) { @@ -214,6 +230,35 @@ void swap_after_unlock_page (unsigned long entry) wake_up(&lock_queue); } +/* A simple wrapper so the base function doesn't need to enforce + * that all swap pages go through the swap cache! + */ +void rw_swap_page(int rw, unsigned long entry, char *buf, int wait) +{ + struct page *page = mem_map + MAP_NR(buf); + + if (page->inode && page->inode != &swapper_inode) + panic ("Tried to swap a non-swapper page"); + + /* + * Make sure that we have a swap cache association for this + * page. We need this to find which swap page to unlock once + * the swap IO has completed to the physical page. If the page + * is not already in the cache, just overload the offset entry + * as if it were: we are not allowed to manipulate the inode + * hashing for locked pages. + */ + if (!PageSwapCache(page)) { + printk("VM: swap page is not in swap cache\n"); + return; + } + if (page->offset != entry) { + printk ("swap entry mismatch"); + return; + } + rw_swap_page_base(rw, entry, page, wait); +} + /* * Setting up a new swap file needs a simple wrapper just to read the * swap signature. SysV shared memory also needs a simple wrapper. @@ -242,33 +287,23 @@ void rw_swap_page_nocache(int rw, unsigned long entry, char *buffer) clear_bit(PG_swap_cache, &page->flags); } - - /* - * Swap partitions are now read via brw_page. ll_rw_page is an - * asynchronous function now --- we must call wait_on_page afterwards - * if synchronous IO is required. + * shmfs needs a version that doesn't put the page in the page cache! + * The swap lock map insists that pages be in the page cache! + * Therefore we can't use it. Later when we can remove the need for the + * lock map and we can reduce the number of functions exported. 
*/ -void ll_rw_page(int rw, kdev_t dev, unsigned long offset, char * buffer) +void rw_swap_page_nolock(int rw, unsigned long entry, char *buffer, int wait) { - int block = offset; - struct page *page; - - switch (rw) { - case READ: - break; - case WRITE: - if (is_read_only(dev)) { - printk("Can't page to read-only device %s\n", - kdevname(dev)); - return; - } - break; - default: - panic("ll_rw_page: bad block dev cmd, must be R/W"); + struct page *page = mem_map + MAP_NR((unsigned long) buffer); + + if (!PageLocked(page)) { + printk("VM: rw_swap_page_nolock: page not locked!\n"); + return; + } + if (PageSwapCache(page)) { + printk ("VM: rw_swap_page_nolock: page in swap cache!\n"); + return; } - page = mem_map + MAP_NR(buffer); - if (!PageLocked(page)) - panic ("ll_rw_page: page not already locked"); - brw_page(rw, page, dev, &block, PAGE_SIZE, 0); + rw_swap_page_base(rw, entry, page, wait); } @@ -39,35 +39,21 @@ freepages_t freepages = { 144 /* freepages.high */ }; +/* How many pages do we try to swap or page in/out together? */ +int page_cluster = 4; /* Default value modified in swap_setup() */ + /* We track the number of pages currently being asynchronously swapped out, so that we don't try to swap TOO many pages out at once */ atomic_t nr_async_pages = ATOMIC_INIT(0); -/* - * Constants for the page aging mechanism: the maximum age (actually, - * the maximum "youthfulness"); the quanta by which pages rejuvenate - * and age; and the initial age for new pages. - * - * The "pageout_weight" is strictly a fixedpoint number with the - * ten low bits being the fraction (ie 8192 really means "8.0"). - */ -swap_control_t swap_control = { - 20, 3, 1, 3, /* Page aging */ - 32, 4, /* Aging cluster */ - 8192, /* sc_pageout_weight aka PAGEOUT_WEIGHT */ - 8192, /* sc_bufferout_weight aka BUFFEROUT_WEIGHT */ -}; - -swapstat_t swapstats = {0}; - buffer_mem_t buffer_mem = { - 5, /* minimum percent buffer */ + 2, /* minimum percent buffer */ 10, /* borrow percent buffer */ 60 /* maximum percent buffer */ }; buffer_mem_t page_cache = { - 5, /* minimum percent page cache */ + 2, /* minimum percent page cache */ 15, /* borrow percent page cache */ 75 /* maximum */ }; @@ -77,3 +63,18 @@ pager_daemon_t pager_daemon = { SWAP_CLUSTER_MAX, /* minimum number of tries */ SWAP_CLUSTER_MAX, /* do swap I/O in clusters of this size */ }; + +/* + * Perform any setup for the swap system + */ + +void __init swap_setup(void) +{ + /* Use a smaller cluster for memory <16MB or <32MB */ + if (num_physpages < ((16 * 1024 * 1024) >> PAGE_SHIFT)) + page_cluster = 2; + else if (num_physpages < ((32 * 1024 * 1024) >> PAGE_SHIFT)) + page_cluster = 3; + else + page_cluster = 4; +} diff --git a/mm/swap_state.c b/mm/swap_state.c index e098974b2..8c5e7176c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -29,18 +29,16 @@ struct inode swapper_inode; #ifdef SWAP_CACHE_INFO unsigned long swap_cache_add_total = 0; -unsigned long swap_cache_add_success = 0; unsigned long swap_cache_del_total = 0; -unsigned long swap_cache_del_success = 0; unsigned long swap_cache_find_total = 0; unsigned long swap_cache_find_success = 0; void show_swap_cache_info(void) { - printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n", - swap_cache_add_total, swap_cache_add_success, - swap_cache_del_total, swap_cache_del_success, - swap_cache_find_total, swap_cache_find_success); + printk("Swap cache: add %ld, delete %ld, find %ld/%ld\n", + swap_cache_add_total, + swap_cache_del_total, + swap_cache_find_success, swap_cache_find_total); } #endif @@ 
-69,9 +67,6 @@ int add_to_swap_cache(struct page *page, unsigned long entry) page->offset = entry; add_page_to_hash_queue(page, &swapper_inode, entry); add_page_to_inode_queue(&swapper_inode, page); -#ifdef SWAP_CACHE_INFO - swap_cache_add_success++; -#endif return 1; } @@ -192,16 +187,6 @@ static inline void remove_from_swap_cache(struct page *page) printk ("VM: Removing swap cache page with wrong inode hash " "on page %08lx\n", page_address(page)); } -#if 0 - /* - * This is a legal case, but warn about it. - */ - if (atomic_read(&page->count) == 1) { - printk (KERN_WARNING - "VM: Removing page cache on unshared page %08lx\n", - page_address(page)); - } -#endif #ifdef DEBUG_SWAP printk("DebugVM: remove_from_swap_cache(%08lx count %d)\n", @@ -222,7 +207,6 @@ void delete_from_swap_cache(struct page *page) #ifdef SWAP_CACHE_INFO swap_cache_del_total++; - swap_cache_del_success++; #endif #ifdef DEBUG_SWAP printk("DebugVM: delete_from_swap_cache(%08lx count %d, " @@ -241,6 +225,7 @@ void delete_from_swap_cache(struct page *page) void free_page_and_swap_cache(unsigned long addr) { struct page *page = mem_map + MAP_NR(addr); + /* * If we are the only user, then free up the swap cache. */ @@ -248,7 +233,7 @@ void free_page_and_swap_cache(unsigned long addr) delete_from_swap_cache(page); } - free_page(addr); + __free_page(page); } @@ -258,18 +243,25 @@ void free_page_and_swap_cache(unsigned long addr) * incremented. */ -static struct page * lookup_swap_cache(unsigned long entry) +struct page * lookup_swap_cache(unsigned long entry) { struct page *found; - + +#ifdef SWAP_CACHE_INFO + swap_cache_find_total++; +#endif while (1) { found = find_page(&swapper_inode, entry); if (!found) return 0; if (found->inode != &swapper_inode || !PageSwapCache(found)) goto out_bad; - if (!PageLocked(found)) + if (!PageLocked(found)) { +#ifdef SWAP_CACHE_INFO + swap_cache_find_success++; +#endif return found; + } __free_page(found); __wait_on_page(found); } @@ -291,7 +283,7 @@ out_bad: struct page * read_swap_cache_async(unsigned long entry, int wait) { - struct page *found_page, *new_page; + struct page *found_page = 0, *new_page; unsigned long new_page_addr; #ifdef DEBUG_SWAP @@ -299,15 +291,20 @@ struct page * read_swap_cache_async(unsigned long entry, int wait) entry, wait ? ", wait" : ""); #endif /* + * Make sure the swap entry is still in use. + */ + if (!swap_duplicate(entry)) /* Account for the swap cache */ + goto out; + /* * Look for the page in the swap cache. */ found_page = lookup_swap_cache(entry); if (found_page) - goto out; + goto out_free_swap; - new_page_addr = __get_free_page(GFP_KERNEL); + new_page_addr = __get_free_page(GFP_USER); if (!new_page_addr) - goto out; /* Out of memory */ + goto out_free_swap; /* Out of memory */ new_page = mem_map + MAP_NR(new_page_addr); /* @@ -316,11 +313,6 @@ struct page * read_swap_cache_async(unsigned long entry, int wait) found_page = lookup_swap_cache(entry); if (found_page) goto out_free_page; - /* - * Make sure the swap entry is still in use. - */ - if (!swap_duplicate(entry)) /* Account for the swap cache */ - goto out_free_page; /* * Add it to the swap cache and read its contents. 
*/ @@ -338,6 +330,8 @@ struct page * read_swap_cache_async(unsigned long entry, int wait) out_free_page: __free_page(new_page); +out_free_swap: + swap_free(entry); out: return found_page; } diff --git a/mm/swapfile.c b/mm/swapfile.c index c574fb59a..dd66701b5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -23,6 +23,7 @@ struct swap_list_t swap_list = {-1, -1}; struct swap_info_struct swap_info[MAX_SWAPFILES]; +#define SWAPFILE_CLUSTER 256 static inline int scan_swap_map(struct swap_info_struct *si) { @@ -30,7 +31,7 @@ static inline int scan_swap_map(struct swap_info_struct *si) /* * We try to cluster swap pages by allocating them * sequentially in swap. Once we've allocated - * SWAP_CLUSTER_MAX pages this way, however, we resort to + * SWAPFILE_CLUSTER pages this way, however, we resort to * first-free allocation, starting a new cluster. This * prevents us from scattering swap pages all over the entire * swap partition, so that we reduce overall disk seek times @@ -46,7 +47,7 @@ static inline int scan_swap_map(struct swap_info_struct *si) goto got_page; } } - si->cluster_nr = SWAP_CLUSTER_MAX; + si->cluster_nr = SWAPFILE_CLUSTER; for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { if (si->swap_map[offset]) continue; @@ -626,11 +627,11 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) p->highest_bit = swap_header->info.last_page - 1; p->max = swap_header->info.last_page; - if (p->max >= 0x7fffffffL/PAGE_SIZE || - (void *) &swap_header->info.badpages[(int) swap_header->info.nr_badpages-1] >= (void *) swap_header->magic.magic) { - error = -EINVAL; + error = -EINVAL; + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) + goto bad_swap; + if (p->max >= SWP_OFFSET(SWP_ENTRY(0,~0UL))) goto bad_swap; - } /* OK, set up the swap map and apply the bad block list */ if (!(p->swap_map = vmalloc (p->max * sizeof(short)))) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e99ad35fb..7063b2df1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -161,8 +161,10 @@ struct vm_struct * get_vm_area(unsigned long size) for (p = &vmlist; (tmp = *p) ; p = &tmp->next) { if (size + addr < (unsigned long) tmp->addr) break; - if (addr > VMALLOC_END-size) + if (addr > VMALLOC_END-size) { + kfree(area); return NULL; + } addr = tmp->size + (unsigned long) tmp->addr; } area->addr = (void *)addr; diff --git a/mm/vmscan.c b/mm/vmscan.c index c5efa52a2..116096153 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -20,13 +20,6 @@ #include <asm/pgtable.h> -/* - * The wait queue for waking up the pageout daemon: - */ -static struct task_struct * kswapd_task = NULL; - -static void init_swap_timer(void); - /* * The swap-out functions return 1 if they successfully * threw something out, and we got a free page. It returns @@ -38,7 +31,7 @@ static void init_swap_timer(void); * using a process that no longer actually exists (it might * have died while we slept). */ -static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma, +static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask) { pte_t pte; @@ -59,50 +52,6 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map))) return 0; - /* - * Deal with page aging. There are several special cases to - * consider: - * - * Page has been accessed, but is swap cached. 
If the page is - * getting sufficiently "interesting" --- its age is getting - * high --- then if we are sufficiently short of free swap - * pages, then delete the swap cache. We can only do this if - * the swap page's reference count is one: ie. there are no - * other references to it beyond the swap cache (as there must - * still be PTEs pointing to it if count > 1). - * - * If the page has NOT been touched, and its age reaches zero, - * then we are swapping it out: - * - * If there is already a swap cache page for this page, then - * another process has already allocated swap space, so just - * dereference the physical page and copy in the swap entry - * from the swap cache. - * - * Note, we rely on all pages read in from swap either having - * the swap cache flag set, OR being marked writable in the pte, - * but NEVER BOTH. (It IS legal to be neither cached nor dirty, - * however.) - * - * -- Stephen Tweedie 1998 */ - - if (PageSwapCache(page_map)) { - if (pte_write(pte)) { - struct page *found; - printk ("VM: Found a writable swap-cached page!\n"); - /* Try to diagnose the problem ... */ - found = find_page(&swapper_inode, page_map->offset); - if (found) { - printk("page=%p@%08lx, found=%p, count=%d\n", - page_map, page_map->offset, - found, atomic_read(&found->count)); - __free_page(found); - } else - printk ("Spurious, page not in cache\n"); - return 0; - } - } - if (pte_young(pte)) { /* * Transfer the "accessed" bit from the page @@ -110,109 +59,111 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc */ set_pte(page_table, pte_mkold(pte)); set_bit(PG_referenced, &page_map->flags); + return 0; + } - /* - * We should test here to see if we want to recover any - * swap cache page here. We do this if the page seeing - * enough activity, AND we are sufficiently low on swap - * - * We need to track both the number of available swap - * pages and the total number present before we can do - * this... - */ + /* + * Is the page already in the swap cache? If so, then + * we can just drop our reference to it without doing + * any IO - it's already up-to-date on disk. + * + * Return 0, as we didn't actually free any real + * memory, and we should just continue our scan. + */ + if (PageSwapCache(page_map)) { + entry = page_map->offset; + swap_duplicate(entry); + set_pte(page_table, __pte(entry)); +drop_pte: + vma->vm_mm->rss--; + flush_tlb_page(vma, address); + __free_page(page_map); return 0; } - if (pte_dirty(pte)) { - if (vma->vm_ops && vma->vm_ops->swapout) { - pid_t pid = tsk->pid; - vma->vm_mm->rss--; - if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table)) - kill_proc(pid, SIGBUS, 1); - } else { - /* - * This is a dirty, swappable page. First of all, - * get a suitable swap entry for it, and make sure - * we have the swap cache set up to associate the - * page with that swap entry. - */ - entry = in_swap_cache(page_map); - if (!entry) { - entry = get_swap_page(); - if (!entry) - return 0; /* No swap space left */ - } - - vma->vm_mm->rss--; - tsk->nswap++; - flush_cache_page(vma, address); - set_pte(page_table, __pte(entry)); - flush_tlb_page(vma, address); - swap_duplicate(entry); - - /* Now to write back the page. We have two - * cases: if the page is already part of the - * swap cache, then it is already on disk. Just - * free the page and return (we release the swap - * cache on the last accessor too). - * - * If we have made a new swap entry, then we - * start the write out to disk. 
If the page is - * shared, however, we still need to keep the - * copy in memory, so we add it to the swap - * cache. */ - if (PageSwapCache(page_map)) { - free_page(page); - return (atomic_read(&page_map->count) == 0); - } - add_to_swap_cache(page_map, entry); - /* We checked we were unlocked way up above, and we - have been careful not to stall until here */ - set_bit(PG_locked, &page_map->flags); - /* OK, do a physical write to swap. */ - rw_swap_page(WRITE, entry, (char *) page, (gfp_mask & __GFP_WAIT)); - } - /* Now we can free the current physical page. We also - * free up the swap cache if this is the last use of the - * page. Note that there is a race here: the page may - * still be shared COW by another process, but that - * process may exit while we are writing out the page - * asynchronously. That's no problem, shrink_mmap() can - * correctly clean up the occassional unshared page - * which gets left behind in the swap cache. */ - free_page(page); - return 1; /* we slept: the process may not exist any more */ + /* + * Is it a clean page? Then it must be recoverable + * by just paging it in again, and we can just drop + * it.. + * + * However, this won't actually free any real + * memory, as the page will just be in the page cache + * somewhere, and as such we should just continue + * our scan. + * + * Basically, this just makes it possible for us to do + * some real work in the future in "shrink_mmap()". + */ + if (!pte_dirty(pte)) { + pte_clear(page_table); + goto drop_pte; } - /* The page was _not_ dirty, but still has a zero age. It must - * already be uptodate on disk. If it is in the swap cache, - * then we can just unlink the page now. Remove the swap cache - * too if this is the last user. */ - if ((entry = in_swap_cache(page_map))) { - vma->vm_mm->rss--; - flush_cache_page(vma, address); - set_pte(page_table, __pte(entry)); - flush_tlb_page(vma, address); - swap_duplicate(entry); - free_page(page); - return (atomic_read(&page_map->count) == 0); - } - /* - * A clean page to be discarded? Must be mmap()ed from - * somewhere. Unlink the pte, and tell the filemap code to - * discard any cached backing page if this is the last user. + /* + * Don't go down into the swap-out stuff if + * we cannot do I/O! Avoid recursing on FS + * locks etc. */ - if (PageSwapCache(page_map)) { - printk ("VM: How can this page _still_ be cached?"); + if (!(gfp_mask & __GFP_IO)) return 0; + + /* + * Ok, it's really dirty. That means that + * we should either create a new swap cache + * entry for it, or we should write it back + * to its own backing store. + * + * Note that in neither case do we actually + * know that we make a page available, but + * as we potentially sleep we can no longer + * continue scanning, so we might as well + * assume we free'd something. + * + * NOTE NOTE NOTE! This should just set a + * dirty bit in page_map, and just drop the + * pte. All the hard work would be done by + * shrink_mmap(). + * + * That would get rid of a lot of problems. + */ + flush_cache_page(vma, address); + if (vma->vm_ops && vma->vm_ops->swapout) { + pid_t pid = tsk->pid; + pte_clear(page_table); + flush_tlb_page(vma, address); + vma->vm_mm->rss--; + + if (vma->vm_ops->swapout(vma, page_map)) + kill_proc(pid, SIGBUS, 1); + __free_page(page_map); + return 1; } + + /* + * This is a dirty, swappable page. First of all, + * get a suitable swap entry for it, and make sure + * we have the swap cache set up to associate the + * page with that swap entry. 
+ */ + entry = get_swap_page(); + if (!entry) + return 0; /* No swap space left */ + vma->vm_mm->rss--; - flush_cache_page(vma, address); - pte_clear(page_table); + tsk->nswap++; + set_pte(page_table, __pte(entry)); flush_tlb_page(vma, address); - entry = (atomic_read(&page_map->count) == 1); + swap_duplicate(entry); /* One for the process, one for the swap cache */ + add_to_swap_cache(page_map, entry); + /* We checked we were unlocked way up above, and we + have been careful not to stall until here */ + set_bit(PG_locked, &page_map->flags); + + /* OK, do a physical asynchronous write to swap. */ + rw_swap_page(WRITE, entry, (char *) page, 0); + __free_page(page_map); - return entry; + return 1; } /* @@ -363,13 +314,23 @@ static int swap_out(unsigned int priority, int gfp_mask) /* * We make one or two passes through the task list, indexed by * assign = {0, 1}: - * Pass 1: select the swappable task with maximal swap_cnt. - * Pass 2: assign new swap_cnt values, then select as above. + * Pass 1: select the swappable task with maximal RSS that has + * not yet been swapped out. + * Pass 2: re-assign rss swap_cnt values, then select as above. + * * With this approach, there's no need to remember the last task * swapped out. If the swap-out fails, we clear swap_cnt so the * task won't be selected again until all others have been tried. + * + * Think of swap_cnt as a "shadow rss" - it tells us which process + * we want to page out (always try largest first). */ - counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority; + counter = nr_tasks / (priority+1); + if (counter < 1) + counter = 1; + if (counter > nr_tasks) + counter = nr_tasks; + for (; counter >= 0; counter--) { assign = 0; max_cnt = 0; @@ -382,15 +343,9 @@ static int swap_out(unsigned int priority, int gfp_mask) continue; if (p->mm->rss <= 0) continue; - if (assign) { - /* - * If we didn't select a task on pass 1, - * assign each task a new swap_cnt. - * Normalise the number of pages swapped - * by multiplying by (RSS / 1MB) - */ - p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss); - } + /* Refresh swap_cnt? */ + if (assign) + p->swap_cnt = p->mm->rss; if (p->swap_cnt > max_cnt) { max_cnt = p->swap_cnt; pbest = p; @@ -404,56 +359,60 @@ static int swap_out(unsigned int priority, int gfp_mask) } goto out; } - pbest->swap_cnt--; - /* - * Nonzero means we cleared out something, but only "1" means - * that we actually free'd up a page as a result. - */ - if (swap_out_process(pbest, gfp_mask) == 1) - return 1; + if (swap_out_process(pbest, gfp_mask)) + return 1; } out: return 0; } /* - * We are much more aggressive about trying to swap out than we used - * to be. This works out OK, because we now do proper aging on page - * contents. + * We need to make the locks finer granularity, but right + * now we need this so that we can do page allocations + * without holding the kernel lock etc. + * + * We want to try to free "count" pages, and we need to + * cluster them so that we get good swap-out behaviour. See + * the "free_memory()" macro for details. */ -static int do_try_to_free_page(int gfp_mask) +static int do_try_to_free_pages(unsigned int gfp_mask) { - static int state = 0; - int i=6; + int priority; + int count = SWAP_CLUSTER_MAX; + + lock_kernel(); /* Always trim SLAB caches when memory gets low. 
*/ kmem_cache_reap(gfp_mask); - if (buffer_over_borrow() || pgcache_over_borrow()) - state = 0; + priority = 6; + do { + while (shrink_mmap(priority, gfp_mask)) { + if (!--count) + goto done; + } - switch (state) { - do { - case 0: - if (shrink_mmap(i, gfp_mask)) - return 1; - state = 1; - case 1: - if (shm_swap(i, gfp_mask)) - return 1; - state = 2; - case 2: - if (swap_out(i, gfp_mask)) - return 1; - state = 3; - case 3: - shrink_dcache_memory(i, gfp_mask); - state = 0; - i--; - } while (i >= 0); - } - return 0; + /* Try to get rid of some shared memory pages.. */ + if (gfp_mask & __GFP_IO) { + while (shm_swap(priority, gfp_mask)) { + if (!--count) + goto done; + } + } + + /* Then, try to page stuff out.. */ + while (swap_out(priority, gfp_mask)) { + if (!--count) + goto done; + } + + shrink_dcache_memory(priority, gfp_mask); + } while (--priority >= 0); +done: + unlock_kernel(); + + return priority >= 0; } /* @@ -467,6 +426,8 @@ void __init kswapd_setup(void) int i; char *revision="$Revision: 1.5 $", *s, *e; + swap_setup(); + if ((s = strchr(revision, ':')) && (e = strchr(s, '$'))) s++, i = e - s; @@ -475,35 +436,36 @@ void __init kswapd_setup(void) printk ("Starting kswapd v%.*s\n", i, s); } +static struct task_struct *kswapd_process; + /* - * The background pageout daemon. - * Started as a kernel thread from the init process. + * The background pageout daemon, started as a kernel thread + * from the init process. + * + * This basically executes once a second, trickling out pages + * so that we have _some_ free memory available even if there + * is no other activity that frees anything up. This is needed + * for things like routing etc, where we otherwise might have + * all activity going on in asynchronous contexts that cannot + * page things out. + * + * If there are applications that are active memory-allocators + * (most normal use), this basically shouldn't matter. */ int kswapd(void *unused) { - current->session = 1; - current->pgrp = 1; - strcpy(current->comm, "kswapd"); - sigfillset(¤t->blocked); - - /* - * As a kernel thread we want to tamper with system buffers - * and other internals and thus be subject to the SMP locking - * rules. (On a uniprocessor box this does nothing). - */ - lock_kernel(); - - /* - * Set the base priority to something smaller than a - * regular process. We will scale up the priority - * dynamically depending on how much memory we need. - */ - current->priority = (DEF_PRIORITY * 2) / 3; + struct task_struct *tsk = current; + kswapd_process = tsk; + tsk->session = 1; + tsk->pgrp = 1; + strcpy(tsk->comm, "kswapd"); + sigfillset(&tsk->blocked); + /* * Tell the memory management that we're a "memory allocator", * and that if we need more memory we should get access to it - * regardless (see "try_to_free_pages()"). "kswapd" should + * regardless (see "__get_free_pages()"). "kswapd" should * never get caught in the normal page freeing logic. * * (Kswapd normally doesn't need memory anyway, but sometimes @@ -512,128 +474,52 @@ int kswapd(void *unused) * us from recursively trying to free more memory as we're * trying to free the first piece of memory in the first place). 
*/ - current->flags |= PF_MEMALLOC; + tsk->flags |= PF_MEMALLOC; - init_swap_timer(); - kswapd_task = current; while (1) { - unsigned long end_time; - - current->state = TASK_INTERRUPTIBLE; - flush_signals(current); - run_task_queue(&tq_disk); - schedule(); - swapstats.wakeups++; - - /* max one hundreth of a second */ - end_time = jiffies + (HZ-1)/100; + /* + * Wake up once a second to see if we need to make + * more memory available. + * + * If we actually get into a low-memory situation, + * the processes needing more memory will wake us + * up on a more timely basis. + */ do { - if (!do_try_to_free_page(0)) + if (nr_free_pages >= freepages.high) break; - if (nr_free_pages > freepages.high + SWAP_CLUSTER_MAX) + + if (!do_try_to_free_pages(GFP_KSWAPD)) break; - } while (time_before_eq(jiffies,end_time)); + } while (!tsk->need_resched); + run_task_queue(&tq_disk); + tsk->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ); } - /* As if we could ever get here - maybe we want to make this killable */ - kswapd_task = NULL; - unlock_kernel(); - return 0; } /* - * We need to make the locks finer granularity, but right - * now we need this so that we can do page allocations - * without holding the kernel lock etc. + * Called by non-kswapd processes when they want more + * memory. + * + * In a perfect world, this should just wake up kswapd + * and return. We don't actually want to swap stuff out + * from user processes, because the locking issues are + * nasty to the extreme (file write locks, and MM locking) * - * The "PF_MEMALLOC" flag protects us against recursion: - * if we need more memory as part of a swap-out effort we - * will just silently return "success" to tell the page - * allocator to accept the allocation. + * One option might be to let kswapd do all the page-out + * and VM page table scanning that needs locking, and this + * process thread could do just the mmap shrink stage that + * can be done by just dropping cached pages without having + * any deadlock issues. */ -int try_to_free_pages(unsigned int gfp_mask, int count) +int try_to_free_pages(unsigned int gfp_mask) { int retval = 1; - lock_kernel(); - if (!(current->flags & PF_MEMALLOC)) { - current->flags |= PF_MEMALLOC; - do { - retval = do_try_to_free_page(gfp_mask); - if (!retval) - break; - count--; - } while (count > 0); - current->flags &= ~PF_MEMALLOC; - } - unlock_kernel(); + wake_up_process(kswapd_process); + if (gfp_mask & __GFP_WAIT) + retval = do_try_to_free_pages(gfp_mask); return retval; } - -/* - * Wake up kswapd according to the priority - * 0 - no wakeup - * 1 - wake up as a low-priority process - * 2 - wake up as a normal process - * 3 - wake up as an almost real-time process - * - * This plays mind-games with the "goodness()" - * function in kernel/sched.c. - */ -static inline void kswapd_wakeup(struct task_struct *p, int priority) -{ - if (priority) { - p->counter = p->priority << priority; - wake_up_process(p); - } -} - -/* - * The swap_tick function gets called on every clock tick. - */ -void swap_tick(void) -{ - struct task_struct *p = kswapd_task; - - /* - * Only bother to try to wake kswapd up - * if the task exists and can be woken. - */ - if (p && (p->state & TASK_INTERRUPTIBLE)) { - unsigned int pages; - int want_wakeup; - - /* - * Schedule for wakeup if there isn't lots - * of free memory or if there is too much - * of it used for buffers or pgcache. - * - * "want_wakeup" is our priority: 0 means - * not to wake anything up, while 3 means - * that we'd better give kswapd a realtime - * priority. 
- */ - want_wakeup = 0; - pages = nr_free_pages; - if (pages < freepages.high) - want_wakeup = 1; - if (pages < freepages.low) - want_wakeup = 2; - if (pages < freepages.min) - want_wakeup = 3; - kswapd_wakeup(p,want_wakeup); - } - - timer_active |= (1<<SWAP_TIMER); -} - -/* - * Initialise the swap timer - */ - -void init_swap_timer(void) -{ - timer_table[SWAP_TIMER].expires = jiffies; - timer_table[SWAP_TIMER].fn = swap_tick; - timer_active |= (1<<SWAP_TIMER); -}
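
To make the clustering policy in the scan_swap_map() hunk above concrete, here is a small self-contained C model of the described behaviour: allocate swap slots sequentially inside the current cluster of SWAPFILE_CLUSTER entries, and fall back to a first-free scan (which starts a new cluster) once it is exhausted. The map size, the name scan_swap_map_model() and the main() driver are illustrative only; this is a sketch of the policy, not the kernel code.

/* Illustrative model of the clustered swap-slot allocator (not kernel code). */
#include <stdio.h>

#define NSLOTS           4096
#define SWAPFILE_CLUSTER 256

static unsigned char swap_map[NSLOTS];  /* 0 = free, non-zero = in use */
static int cluster_next, cluster_nr;

/* Return a free slot index, or -1 if the map is full. */
static int scan_swap_map_model(void)
{
	int offset;

	if (cluster_nr) {
		/* Keep allocating sequentially inside the current cluster. */
		while (cluster_next < NSLOTS) {
			offset = cluster_next++;
			if (!swap_map[offset]) {
				cluster_nr--;
				swap_map[offset] = 1;
				return offset;
			}
		}
	}
	/* Cluster exhausted: a first-free scan starts a new cluster. */
	cluster_nr = SWAPFILE_CLUSTER;
	for (offset = 0; offset < NSLOTS; offset++) {
		if (!swap_map[offset]) {
			swap_map[offset] = 1;
			cluster_next = offset + 1;
			return offset;
		}
	}
	return -1;
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		printf("allocated slot %d\n", scan_swap_map_model());
	return 0;
}

Keeping allocations contiguous within a cluster is what reduces seek time on rotating swap devices, which is the stated reason for replacing SWAP_CLUSTER_MAX with the larger, dedicated SWAPFILE_CLUSTER.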
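The swap_out() hunk above replaces the page-aging weights with a "largest resident set first" victim policy: swap_cnt acts as a shadow of rss, is consumed as the task is scanned, and is refreshed on a second pass once every candidate has been used up. A minimal stand-alone sketch of that two-pass selection follows; the toy task table and function names are assumptions for illustration, not part of the patch.

/* Illustrative model of the two-pass "largest RSS first" victim selection. */
#include <stdio.h>

struct task {
	const char *comm;
	long rss;        /* resident pages */
	long swap_cnt;   /* shadow rss, consumed as the task is scanned */
};

static struct task tasks[] = {
	{ "editor",  120, 0 },
	{ "browser", 900, 0 },
	{ "shell",    40, 0 },
};
#define NR_TASKS (sizeof(tasks) / sizeof(tasks[0]))

static struct task *select_victim(void)
{
	int assign = 0;

	for (;;) {
		struct task *best = NULL;
		long max_cnt = 0;

		for (unsigned i = 0; i < NR_TASKS; i++) {
			struct task *p = &tasks[i];
			if (p->rss <= 0)
				continue;
			if (assign)             /* pass 2: refresh the shadow rss */
				p->swap_cnt = p->rss;
			if (p->swap_cnt > max_cnt) {
				max_cnt = p->swap_cnt;
				best = p;
			}
		}
		if (best)
			return best;
		if (assign)                 /* both passes failed: nothing left */
			return NULL;
		assign = 1;                 /* retry, assigning fresh swap_cnt */
	}
}

int main(void)
{
	struct task *victim = select_victim();
	if (victim)
		printf("would page out from '%s' (rss %ld)\n", victim->comm, victim->rss);
	return 0;
}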
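Finally, the new do_try_to_free_pages() walks a priority level from 6 down to 0, trying the cheap reclaim paths first and escalating only while the target of SWAP_CLUSTER_MAX freed pages has not been reached. The sketch below models just that control flow; the *_stub() functions stand in for shrink_mmap(), shm_swap() and swap_out() and are placeholders, not the real implementations.

/* Control-flow model of the priority-escalating reclaim loop. */
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32

/* Stubs standing in for shrink_mmap(), shm_swap() and swap_out().
 * Each returns 1 if it managed to free a page at this priority. */
static int shrink_mmap_stub(int priority) { return priority < 4; }
static int shm_swap_stub(int priority)    { return 0; }
static int swap_out_stub(int priority)    { return priority < 2; }

static int try_to_free_pages_model(void)
{
	int count = SWAP_CLUSTER_MAX;
	int priority = 6;

	do {
		/* Cheapest first: drop clean page-cache/buffer pages. */
		while (shrink_mmap_stub(priority))
			if (!--count)
				goto done;
		/* Then shared memory, then dirty process pages. */
		while (shm_swap_stub(priority))
			if (!--count)
				goto done;
		while (swap_out_stub(priority))
			if (!--count)
				goto done;
	} while (--priority >= 0);
done:
	return priority >= 0;   /* non-zero if the target was met early */
}

int main(void)
{
	printf("freed target reached: %d\n", try_to_free_pages_model());
	return 0;
}

The same structure explains the new try_to_free_pages() calling convention in the patch: callers that cannot sleep merely wake kswapd, while __GFP_WAIT callers run this loop synchronously themselves.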