Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c     262
-rw-r--r--  mm/memory.c      113
-rw-r--r--  mm/mmap.c        254
-rw-r--r--  mm/mmap_avl.c    374
-rw-r--r--  mm/page_alloc.c  153
-rw-r--r--  mm/page_io.c     215
-rw-r--r--  mm/swap.c         39
-rw-r--r--  mm/swap_state.c   60
-rw-r--r--  mm/swapfile.c     13
-rw-r--r--  mm/vmalloc.c       4
-rw-r--r--  mm/vmscan.c      526
11 files changed, 1191 insertions(+), 822 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 227bcd5a9..3c15ea63b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -118,140 +118,83 @@ void remove_inode_page(struct page *page)
__free_page(page);
}
-/*
- * Check whether we can free this page.
- */
-static inline int shrink_one_page(struct page *page, int gfp_mask)
-{
- struct buffer_head *tmp, *bh;
-
- if (PageLocked(page))
- goto next;
- if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
- goto next;
- /* First of all, regenerate the page's referenced bit
- * from any buffers in the page
- */
- bh = page->buffers;
- if (bh) {
- tmp = bh;
- do {
- if (buffer_touched(tmp)) {
- clear_bit(BH_Touched, &tmp->b_state);
- set_bit(PG_referenced, &page->flags);
- }
- tmp = tmp->b_this_page;
- } while (tmp != bh);
-
- /* Refuse to swap out all buffer pages */
- if (buffer_under_min())
- goto next;
- }
-
- /* We can't throw away shared pages, but we do mark
- them as referenced. This relies on the fact that
- no page is currently in both the page cache and the
- buffer cache; we'd have to modify the following
- test to allow for that case. */
-
- switch (atomic_read(&page->count)) {
- case 1:
- /* is it a swap-cache or page-cache page? */
- if (page->inode) {
- if (test_and_clear_bit(PG_referenced, &page->flags))
- break;
- if (pgcache_under_min())
- break;
- if (PageSwapCache(page)) {
- delete_from_swap_cache(page);
- return 1;
- }
- remove_inode_page(page);
- return 1;
- }
- /* It's not a cache page, so we don't do aging.
- * If it has been referenced recently, don't free it */
- if (test_and_clear_bit(PG_referenced, &page->flags))
- break;
-
- if (buffer_under_min())
- break;
-
- /* is it a buffer cache page? */
- if (bh && try_to_free_buffer(bh, &bh, 6))
- return 1;
- break;
-
- default:
- /* more than one user: we can't throw it away */
- set_bit(PG_referenced, &page->flags);
- /* fall through */
- case 0:
- /* nothing */
- }
-next:
- return 0;
-}
-
int shrink_mmap(int priority, int gfp_mask)
{
static unsigned long clock = 0;
unsigned long limit = num_physpages;
struct page * page;
- int count_max, count_min;
+ int count;
- count_max = limit;
- count_min = (limit<<2) >> (priority);
+ count = limit >> priority;
page = mem_map + clock;
do {
- if (PageSkip(page)) {
- /* next_hash is overloaded for PageSkip */
- page = page->next_hash;
- clock = page->map_nr;
- }
-
- if (shrink_one_page(page, gfp_mask))
- return 1;
- count_max--;
- /*
- * If the page we looked at was recyclable but we didn't
- * reclaim it (presumably due to PG_referenced), don't
- * count it as scanned. This way, the more referenced
- * page cache pages we encounter, the more rapidly we
- * will age them.
+ int referenced;
+
+ /* This works even in the presence of PageSkip because
+ * the first two entries at the beginning of a hole will
+ * be marked, not just the first.
*/
- if (atomic_read(&page->count) != 1 ||
- (!page->inode && !page->buffers))
- count_min--;
page++;
clock++;
if (clock >= max_mapnr) {
clock = 0;
page = mem_map;
}
- } while (count_max > 0 && count_min > 0);
- return 0;
-}
+ if (PageSkip(page)) {
+ /* next_hash is overloaded for PageSkip */
+ page = page->next_hash;
+ clock = page - mem_map;
+ }
+
+ referenced = test_and_clear_bit(PG_referenced, &page->flags);
-/*
- * This is called from try_to_swap_out() when we try to get rid of some
- * pages.. If we're unmapping the last occurrence of this page, we also
- * free it from the page hash-queues etc, as we don't want to keep it
- * in-core unnecessarily.
- */
-unsigned long page_unuse(struct page * page)
-{
- int count = atomic_read(&page->count);
-
- if (count != 2)
- return count;
- if (!page->inode)
- return count;
- if (PageSwapCache(page))
- panic ("Doing a normal page_unuse of a swap cache page");
- remove_inode_page(page);
- return 1;
+ if (PageLocked(page))
+ continue;
+
+ if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
+ continue;
+
+ /* We can't free pages unless there's just one user */
+ if (atomic_read(&page->count) != 1)
+ continue;
+
+ count--;
+
+ /*
+ * Is it a page swap page? If so, we want to
+ * drop it if it is no longer used, even if it
+ * were to be marked referenced..
+ */
+ if (PageSwapCache(page)) {
+ if (referenced && swap_count(page->offset) != 1)
+ continue;
+ delete_from_swap_cache(page);
+ return 1;
+ }
+
+ if (referenced)
+ continue;
+
+ /* Is it a buffer page? */
+ if (page->buffers) {
+ if (buffer_under_min())
+ continue;
+ if (!try_to_free_buffers(page))
+ continue;
+ return 1;
+ }
+
+ /* is it a page-cache page? */
+ if (page->inode) {
+ if (pgcache_under_min())
+ continue;
+ remove_inode_page(page);
+ return 1;
+ }
+
+ } while (count > 0);
+ return 0;
}
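
The rewritten shrink_mmap() above is a classic "second chance" clock sweep: every page the hand passes loses its referenced bit, busy pages are skipped without charging the scan budget, and the first quiet, reclaimable single-user page found is freed. A compressed user-space model of that control flow (struct toy_page, clock_scan and the field names are illustrative, not kernel names, and the swap-cache special case is folded into "reclaimable"):

    #include <stddef.h>

    struct toy_page {
        int referenced;     /* PG_referenced analogue */
        int busy;           /* locked, DMA-restricted, or multiply used */
        int reclaimable;    /* swap cache, buffers or page cache to drop */
    };

    static size_t clock_hand;   /* persists across calls, like the static 'clock' */

    /* One sweep: returns the index of a reclaimed page, or -1 if the budget ran out. */
    static long clock_scan(struct toy_page *pages, size_t nr, int priority)
    {
        size_t budget = nr >> priority;     /* 'count = limit >> priority' */

        while (budget) {
            struct toy_page *p = &pages[clock_hand];
            int referenced;

            clock_hand = (clock_hand + 1) % nr;     /* advance the hand, wrap at the end */

            referenced = p->referenced;             /* test_and_clear_bit(PG_referenced) */
            p->referenced = 0;

            if (p->busy)
                continue;                           /* not charged to the scan budget */

            budget--;                               /* a real candidate was examined */

            if (referenced)
                continue;                           /* second chance */
            if (p->reclaimable)
                return (long)(p - pages);
        }
        return -1;
    }
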
/*
@@ -974,7 +917,7 @@ static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long
struct file * file = area->vm_file;
struct dentry * dentry = file->f_dentry;
struct inode * inode = dentry->d_inode;
- unsigned long offset;
+ unsigned long offset, reada, i;
struct page * page, **hash;
unsigned long old_page, new_page;
@@ -1035,7 +978,18 @@ success:
return new_page;
no_cached_page:
- new_page = __get_free_page(GFP_USER);
+ /*
+ * Try to read in an entire cluster at once.
+ */
+ reada = offset;
+ reada >>= PAGE_SHIFT + page_cluster;
+ reada <<= PAGE_SHIFT + page_cluster;
+
+ for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_SIZE)
+ new_page = try_to_read_ahead(file, reada, new_page);
+
+ if (!new_page)
+ new_page = __get_free_page(GFP_USER);
if (!new_page)
goto no_page;
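
The two shift operations above simply round the faulting offset down to the start of an aligned cluster of (1 << page_cluster) pages, so the whole cluster can be read ahead in one go. A stand-alone check of that arithmetic (the PAGE_SHIFT and page_cluster values here are only examples):

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    int main(void)
    {
        unsigned long page_cluster = 4;            /* 16-page clusters */
        unsigned long offset = 0x12345678UL;       /* arbitrary file offset */

        unsigned long reada = offset;
        reada >>= PAGE_SHIFT + page_cluster;
        reada <<= PAGE_SHIFT + page_cluster;

        /* Equivalent mask formulation */
        unsigned long mask = ~((PAGE_SIZE << page_cluster) - 1);
        assert(reada == (offset & mask));

        printf("offset %#lx starts its %lu-page cluster at %#lx\n",
               offset, 1UL << page_cluster, reada);
        return 0;
    }
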
@@ -1059,11 +1013,6 @@ no_cached_page:
if (inode->i_op->readpage(file, page) != 0)
goto failure;
- /*
- * Do a very limited read-ahead if appropriate
- */
- if (PageLocked(page))
- new_page = try_to_read_ahead(file, offset + PAGE_SIZE, 0);
goto found_page;
page_locked_wait:
@@ -1137,22 +1086,6 @@ static int filemap_write_page(struct vm_area_struct * vma,
struct file * file;
struct dentry * dentry;
struct inode * inode;
- struct buffer_head * bh;
-
- bh = mem_map[MAP_NR(page)].buffers;
- if (bh) {
- /* whee.. just mark the buffer heads dirty */
- struct buffer_head * tmp = bh;
- do {
- /*
- * WSH: There's a race here: mark_buffer_dirty()
- * could block, and the buffers aren't pinned down.
- */
- mark_buffer_dirty(tmp, 0);
- tmp = tmp->b_this_page;
- } while (tmp != bh);
- return 0;
- }
file = vma->vm_file;
dentry = file->f_dentry;
@@ -1174,50 +1107,15 @@ static int filemap_write_page(struct vm_area_struct * vma,
/*
- * Swapping to a shared file: while we're busy writing out the page
- * (and the page still exists in memory), we save the page information
- * in the page table, so that "filemap_swapin()" can re-use the page
- * immediately if it is called while we're busy swapping it out..
- *
- * Once we've written it all out, we mark the page entry "empty", which
- * will result in a normal page-in (instead of a swap-in) from the now
- * up-to-date disk file.
+ * The page cache takes care of races between somebody
+ * trying to swap something out and swap something in
+ * at the same time..
*/
-int filemap_swapout(struct vm_area_struct * vma,
- unsigned long offset,
- pte_t *page_table)
-{
- int error;
- unsigned long page = pte_page(*page_table);
- unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page));
-
- flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset));
- set_pte(page_table, __pte(entry));
- flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset));
- error = filemap_write_page(vma, offset, page);
- if (pte_val(*page_table) == entry)
- pte_clear(page_table);
- return error;
-}
-
-/*
- * filemap_swapin() is called only if we have something in the page
- * tables that is non-zero (but not present), which we know to be the
- * page index of a page that is busy being swapped out (see above).
- * So we just use it directly..
- */
-static pte_t filemap_swapin(struct vm_area_struct * vma,
- unsigned long offset,
- unsigned long entry)
+int filemap_swapout(struct vm_area_struct * vma, struct page * page)
{
- unsigned long page = SWP_OFFSET(entry);
-
- atomic_inc(&mem_map[page].count);
- page = (page << PAGE_SHIFT) + PAGE_OFFSET;
- return mk_pte(page,vma->vm_page_prot);
+ return filemap_write_page(vma, page->offset, page_address(page));
}
-
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
@@ -1358,7 +1256,7 @@ static struct vm_operations_struct file_shared_mmap = {
filemap_nopage, /* nopage */
NULL, /* wppage */
filemap_swapout, /* swapout */
- filemap_swapin, /* swapin */
+ NULL, /* swapin */
};
/*
@@ -1637,7 +1535,7 @@ unsigned long get_cached_page(struct inode * inode, unsigned long offset,
if (!page) {
if (!new)
goto out;
- page_cache = get_free_page(GFP_KERNEL);
+ page_cache = get_free_page(GFP_USER);
if (!page_cache)
goto out;
page = mem_map + MAP_NR(page_cache);
diff --git a/mm/memory.c b/mm/memory.c
index 932c35648..f788163c2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -126,48 +126,36 @@ int check_pgt_cache(void)
* This function clears all user-level page tables of a process - this
* is needed by execve(), so that old pages aren't in the way.
*/
-void clear_page_tables(struct task_struct * tsk)
+void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
{
- pgd_t * page_dir = tsk->mm->pgd;
- int i;
-
- if (!page_dir || page_dir == swapper_pg_dir)
- goto out_bad;
- for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
- free_one_pgd(page_dir + i);
+ pgd_t * page_dir = mm->pgd;
- /* keep the page table cache within bounds */
- check_pgt_cache();
- return;
+ if (page_dir && page_dir != swapper_pg_dir) {
+ page_dir += first;
+ do {
+ free_one_pgd(page_dir);
+ page_dir++;
+ } while (--nr);
-out_bad:
- printk(KERN_ERR
- "clear_page_tables: %s trying to clear kernel pgd\n",
- tsk->comm);
- return;
+ /* keep the page table cache within bounds */
+ check_pgt_cache();
+ }
}
/*
- * This function frees up all page tables of a process when it exits. It
- * is the same as "clear_page_tables()", except it also frees the old
- * page table directory.
+ * This function just free's the page directory - the
+ * pages tables themselves have been freed earlier by
+ * clear_page_tables().
*/
void free_page_tables(struct mm_struct * mm)
{
pgd_t * page_dir = mm->pgd;
- int i;
-
- if (!page_dir)
- goto out;
- if (page_dir == swapper_pg_dir)
- goto out_bad;
- for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
- free_one_pgd(page_dir + i);
- pgd_free(page_dir);
-
- /* keep the page table cache within bounds */
- check_pgt_cache();
-out:
+
+ if (page_dir) {
+ if (page_dir == swapper_pg_dir)
+ goto out_bad;
+ pgd_free(page_dir);
+ }
return;
out_bad:
@@ -204,7 +192,7 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
pgd_t * src_pgd, * dst_pgd;
unsigned long address = vma->vm_start;
unsigned long end = vma->vm_end;
- unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
+ unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
src_pgd = pgd_offset(src, address)-1;
dst_pgd = pgd_offset(dst, address)-1;
@@ -277,10 +265,15 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
set_pte(dst_pte, pte);
goto cont_copy_pte_range;
}
- if (cow)
+ /* If it's a COW mapping, write protect it both in the parent and the child */
+ if (cow) {
pte = pte_wrprotect(pte);
+ set_pte(src_pte, pte);
+ }
+ /* If it's a shared mapping, mark it clean in the child */
+ if (vma->vm_flags & VM_SHARED)
+ pte = pte_mkclean(pte);
set_pte(dst_pte, pte_mkold(pte));
- set_pte(src_pte, pte);
atomic_inc(&mem_map[page_nr].count);
cont_copy_pte_range: address += PAGE_SIZE;
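
The 'cow' test used by copy_page_range() selects private mappings that may ever become writable: VM_MAYWRITE set and VM_SHARED clear. A minimal illustration of just that predicate (the flag values below are stand-ins for the ones in the kernel headers):

    #include <stdio.h>

    #define VM_SHARED   0x08UL
    #define VM_MAYWRITE 0x20UL

    static int is_cow_mapping(unsigned long vm_flags)
    {
        return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
    }

    int main(void)
    {
        printf("private read-only : %d\n", is_cow_mapping(0));                        /* 0 */
        printf("private writable  : %d\n", is_cow_mapping(VM_MAYWRITE));              /* 1 */
        printf("shared writable   : %d\n", is_cow_mapping(VM_SHARED | VM_MAYWRITE));  /* 0 */
        return 0;
    }
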
@@ -644,37 +637,47 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
page_map = mem_map + MAP_NR(old_page);
/*
- * Do we need to copy?
+ * We can avoid the copy if:
+ * - we're the only user (count == 1)
+ * - the only other user is the swap cache,
+ * and the only swap cache user is itself,
+ * in which case we can remove the page
+ * from the swap cache.
*/
- if (is_page_shared(page_map)) {
+ switch (atomic_read(&page_map->count)) {
+ case 2:
+ if (!PageSwapCache(page_map))
+ break;
+ if (swap_count(page_map->offset) != 1)
+ break;
+ delete_from_swap_cache(page_map);
+ /* FallThrough */
+ case 1:
+ /* We can release the kernel lock now.. */
unlock_kernel();
- if (!new_page)
- return 0;
- if (PageReserved(mem_map + MAP_NR(old_page)))
- ++vma->vm_mm->rss;
- copy_cow_page(old_page,new_page);
- flush_page_to_ram(old_page);
- flush_page_to_ram(new_page);
flush_cache_page(vma, address);
- set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
- free_page(old_page);
+ set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
flush_tlb_page(vma, address);
+end_wp_page:
+ if (new_page)
+ free_page(new_page);
return 1;
}
-
- if (PageSwapCache(page_map))
- delete_from_swap_cache(page_map);
-
- /* We can release the kernel lock now.. */
+
unlock_kernel();
+ if (!new_page)
+ return 0;
+ if (PageReserved(mem_map + MAP_NR(old_page)))
+ ++vma->vm_mm->rss;
+ copy_cow_page(old_page,new_page);
+ flush_page_to_ram(old_page);
+ flush_page_to_ram(new_page);
flush_cache_page(vma, address);
- set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
+ set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
+ free_page(old_page);
flush_tlb_page(vma, address);
-end_wp_page:
- if (new_page)
- free_page(new_page);
return 1;
bad_wp_page:
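
The reworked do_wp_page() above avoids the copy-on-write copy whenever the faulting process is, or can trivially be made, the only user of the page. A sketch of only that decision, with toy types standing in for struct page and swap_count():

    struct toy_page_state {
        int users;          /* page->count analogue */
        int in_swap_cache;  /* PageSwapCache() analogue */
        int swap_refs;      /* swap_count() analogue */
    };

    /* Returns 1 if the page may simply be made writable, 0 if it must be copied. */
    int can_reuse_for_write(const struct toy_page_state *p)
    {
        if (p->users == 1)
            return 1;       /* sole user: just mark it writable and dirty */
        if (p->users == 2 && p->in_swap_cache && p->swap_refs == 1)
            return 1;       /* the only other reference is our own swap cache
                               entry, which do_wp_page drops before reusing */
        return 0;
    }
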
diff --git a/mm/mmap.c b/mm/mmap.c
index 4cbdbe3ca..9a9146754 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -371,6 +371,100 @@ unsigned long get_unmapped_area(unsigned long addr, unsigned long len)
}
}
+#define vm_avl_empty (struct vm_area_struct *) NULL
+
+#include "mmap_avl.c"
+
+/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
+struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
+{
+ struct vm_area_struct *vma = NULL;
+
+ if (mm) {
+ /* Check the cache first. */
+ /* (Cache hit rate is typically around 35%.) */
+ vma = mm->mmap_cache;
+ if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
+ if (!mm->mmap_avl) {
+ /* Go through the linear list. */
+ vma = mm->mmap;
+ while (vma && vma->vm_end <= addr)
+ vma = vma->vm_next;
+ } else {
+ /* Then go through the AVL tree quickly. */
+ struct vm_area_struct * tree = mm->mmap_avl;
+ vma = NULL;
+ for (;;) {
+ if (tree == vm_avl_empty)
+ break;
+ if (tree->vm_end > addr) {
+ vma = tree;
+ if (tree->vm_start <= addr)
+ break;
+ tree = tree->vm_avl_left;
+ } else
+ tree = tree->vm_avl_right;
+ }
+ }
+ if (vma)
+ mm->mmap_cache = vma;
+ }
+ }
+ return vma;
+}
+
+/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
+struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
+ struct vm_area_struct **pprev)
+{
+ if (mm) {
+ if (!mm->mmap_avl) {
+ /* Go through the linear list. */
+ struct vm_area_struct * prev = NULL;
+ struct vm_area_struct * vma = mm->mmap;
+ while (vma && vma->vm_end <= addr) {
+ prev = vma;
+ vma = vma->vm_next;
+ }
+ *pprev = prev;
+ return vma;
+ } else {
+ /* Go through the AVL tree quickly. */
+ struct vm_area_struct * vma = NULL;
+ struct vm_area_struct * last_turn_right = NULL;
+ struct vm_area_struct * prev = NULL;
+ struct vm_area_struct * tree = mm->mmap_avl;
+ for (;;) {
+ if (tree == vm_avl_empty)
+ break;
+ if (tree->vm_end > addr) {
+ vma = tree;
+ prev = last_turn_right;
+ if (tree->vm_start <= addr)
+ break;
+ tree = tree->vm_avl_left;
+ } else {
+ last_turn_right = tree;
+ tree = tree->vm_avl_right;
+ }
+ }
+ if (vma) {
+ if (vma->vm_avl_left != vm_avl_empty) {
+ prev = vma->vm_avl_left;
+ while (prev->vm_avl_right != vm_avl_empty)
+ prev = prev->vm_avl_right;
+ }
+ if ((prev ? prev->vm_next : mm->mmap) != vma)
+ printk("find_vma_prev: tree inconsistent with list\n");
+ *pprev = prev;
+ return vma;
+ }
+ }
+ }
+ *pprev = NULL;
+ return NULL;
+}
+
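
Both lookups above preserve the invariant the old linear walk had: return the first vma whose vm_end lies strictly above addr, which may or may not actually contain addr. The same invariant expressed as a lower-bound search over a plain sorted array (toy types, no AVL tree involved):

    #include <stdio.h>

    struct toy_vma { unsigned long start, end; };    /* covers [start, end) */

    static struct toy_vma *toy_find_vma(struct toy_vma *v, int n, unsigned long addr)
    {
        int lo = 0, hi = n;             /* candidates are v[lo..hi) */

        while (lo < hi) {
            int mid = lo + (hi - lo) / 2;
            if (v[mid].end > addr)
                hi = mid;               /* mid is a candidate, keep it */
            else
                lo = mid + 1;
        }
        return lo < n ? &v[lo] : NULL;
    }

    int main(void)
    {
        struct toy_vma vmas[] = {
            { 0x08048000, 0x08050000 },
            { 0x40000000, 0x40010000 },
            { 0xbfffe000, 0xc0000000 },
        };
        struct toy_vma *v = toy_find_vma(vmas, 3, 0x20000000);

        if (v)      /* returns the 0x40000000 area: addr falls in the gap before it */
            printf("first vma ending above addr: %#lx-%#lx\n", v->start, v->end);
        return 0;
    }
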
/* Normal function to fix up a mapping
* This function is the default for when an area has no specific
* function. This may be used as part of a more specific routine.
@@ -446,6 +540,57 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr,
return 1;
}
+/*
+ * Try to free as many page directory entries as we can,
+ * without having to work very hard at actually scanning
+ * the page tables themselves.
+ *
+ * Right now we try to free page tables if we have a nice
+ * PGDIR-aligned area that got free'd up. We could be more
+ * granular if we want to, but this is fast and simple,
+ * and covers the bad cases.
+ *
+ * "prev", if it exists, points to a vma before the one
+ * we just free'd - but there's no telling how much before.
+ */
+static void free_pgtables(struct mm_struct * mm, struct vm_area_struct *prev,
+ unsigned long start, unsigned long end)
+{
+ unsigned long first = start & PGDIR_MASK;
+ unsigned long last = (end + PGDIR_SIZE - 1) & PGDIR_MASK;
+
+ if (!prev) {
+ prev = mm->mmap;
+ if (!prev)
+ goto no_mmaps;
+ if (prev->vm_end > start) {
+ if (last > prev->vm_end)
+ last = prev->vm_end;
+ goto no_mmaps;
+ }
+ }
+ for (;;) {
+ struct vm_area_struct *next = prev->vm_next;
+
+ if (next) {
+ if (next->vm_start < start) {
+ prev = next;
+ continue;
+ }
+ if (last > next->vm_start)
+ last = next->vm_start;
+ }
+ if (prev->vm_end > first)
+ first = prev->vm_end + PGDIR_SIZE - 1;
+ break;
+ }
+no_mmaps:
+ first = first >> PGDIR_SHIFT;
+ last = last >> PGDIR_SHIFT;
+ if (last > first)
+ clear_page_tables(mm, first, last-first);
+}
+
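
free_pgtables() only ever considers whole PGDIR-sized slots: the unmapped range is rounded outward to PGDIR_SIZE and then clipped against the neighbouring vmas before the slot range is handed to clear_page_tables(). The rounding and index arithmetic on its own, with the clipping left out (the PGDIR_SHIFT value is just the classic i386 one, used here purely as an example):

    #include <stdio.h>

    #define PGDIR_SHIFT 22
    #define PGDIR_SIZE  (1UL << PGDIR_SHIFT)
    #define PGDIR_MASK  (~(PGDIR_SIZE - 1))

    int main(void)
    {
        unsigned long start = 0x08912000, end = 0x0a3f0000;   /* an unmapped range */

        unsigned long first = start & PGDIR_MASK;                   /* round down */
        unsigned long last  = (end + PGDIR_SIZE - 1) & PGDIR_MASK;  /* round up   */

        printf("pgd slots %lu..%lu cover the hole\n",
               first >> PGDIR_SHIFT, (last >> PGDIR_SHIFT) - 1);
        return 0;
    }
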
/* Munmap is split into 2 main parts -- this part which finds
* what needs doing, and the areas themselves, which do the
* work. This now handles partial unmappings.
@@ -454,8 +599,7 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr,
int do_munmap(unsigned long addr, size_t len)
{
struct mm_struct * mm;
- struct vm_area_struct *mpnt, *free, *extra;
- int freed;
+ struct vm_area_struct *mpnt, *prev, **npp, *free, *extra;
if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr)
return -EINVAL;
@@ -469,15 +613,17 @@ int do_munmap(unsigned long addr, size_t len)
* on the list. If nothing is put on, nothing is affected.
*/
mm = current->mm;
- mpnt = mm->mmap;
- while(mpnt && mpnt->vm_end <= addr)
- mpnt = mpnt->vm_next;
+ mpnt = find_vma_prev(mm, addr, &prev);
if (!mpnt)
return 0;
+ /* we have addr < mpnt->vm_end */
+
+ if (mpnt->vm_start >= addr+len)
+ return 0;
/* If we'll make "hole", check the vm areas limit */
- if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) &&
- mm->map_count > MAX_MAP_COUNT)
+ if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len)
+ && mm->map_count >= MAX_MAP_COUNT)
return -ENOMEM;
/*
@@ -488,18 +634,14 @@ int do_munmap(unsigned long addr, size_t len)
if (!extra)
return -ENOMEM;
- /* we have addr < mpnt->vm_end */
+ npp = (prev ? &prev->vm_next : &mm->mmap);
free = NULL;
- for ( ; mpnt && mpnt->vm_start < addr+len; ) {
- struct vm_area_struct *next = mpnt->vm_next;
-
- if(mpnt->vm_next)
- mpnt->vm_next->vm_pprev = mpnt->vm_pprev;
- *mpnt->vm_pprev = mpnt->vm_next;
-
+ for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) {
+ *npp = mpnt->vm_next;
mpnt->vm_next = free;
free = mpnt;
- mpnt = next;
+ if (mm->mmap_avl)
+ avl_remove(mpnt, &mm->mmap_avl);
}
/* Ok - we have the memory areas we should free on the 'free' list,
@@ -507,15 +649,10 @@ int do_munmap(unsigned long addr, size_t len)
* If the one of the segments is only being partially unmapped,
* it will put new vm_area_struct(s) into the address space.
*/
- freed = 0;
while ((mpnt = free) != NULL) {
unsigned long st, end, size;
free = free->vm_next;
- freed = 1;
-
- mm->map_count--;
- remove_shared_vm_struct(mpnt);
st = addr < mpnt->vm_start ? mpnt->vm_start : addr;
end = addr+len;
@@ -525,6 +662,9 @@ int do_munmap(unsigned long addr, size_t len)
if (mpnt->vm_ops && mpnt->vm_ops->unmap)
mpnt->vm_ops->unmap(mpnt, st, size);
+ remove_shared_vm_struct(mpnt);
+ mm->map_count--;
+
flush_cache_range(mm, st, end);
zap_page_range(mm, st, size);
flush_tlb_range(mm, st, end);
@@ -540,8 +680,9 @@ int do_munmap(unsigned long addr, size_t len)
if (extra)
kmem_cache_free(vm_area_cachep, extra);
- if (freed)
- mm->mmap_cache = NULL; /* Kill the cache. */
+ free_pgtables(mm, prev, addr, addr+len);
+
+ mm->mmap_cache = NULL; /* Kill the cache. */
return 0;
}
@@ -557,13 +698,23 @@ asmlinkage int sys_munmap(unsigned long addr, size_t len)
return ret;
}
+/* Build the AVL tree corresponding to the VMA list. */
+void build_mmap_avl(struct mm_struct * mm)
+{
+ struct vm_area_struct * vma;
+
+ mm->mmap_avl = NULL;
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ avl_insert(vma, &mm->mmap_avl);
+}
+
/* Release all mmaps. */
void exit_mmap(struct mm_struct * mm)
{
struct vm_area_struct * mpnt;
mpnt = mm->mmap;
- mm->mmap = mm->mmap_cache = NULL;
+ mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL;
mm->rss = 0;
mm->total_vm = 0;
mm->locked_vm = 0;
@@ -591,6 +742,8 @@ void exit_mmap(struct mm_struct * mm)
/* This is just debugging */
if (mm->map_count)
printk("exit_mmap: map count is %d\n", mm->map_count);
+
+ clear_page_tables(mm, 0, USER_PTRS_PER_PGD);
}
/* Insert vm structure into process list sorted by address
@@ -598,20 +751,26 @@ void exit_mmap(struct mm_struct * mm)
*/
void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp)
{
- struct vm_area_struct **pprev = &mm->mmap;
+ struct vm_area_struct **pprev;
struct file * file;
- mm->map_count++;
-
- /* Find where to link it in. */
- while(*pprev && (*pprev)->vm_start <= vmp->vm_start)
- pprev = &(*pprev)->vm_next;
-
- /* Insert it. */
- if((vmp->vm_next = *pprev) != NULL)
- (*pprev)->vm_pprev = &vmp->vm_next;
+ if (!mm->mmap_avl) {
+ pprev = &mm->mmap;
+ while (*pprev && (*pprev)->vm_start <= vmp->vm_start)
+ pprev = &(*pprev)->vm_next;
+ } else {
+ struct vm_area_struct *prev, *next;
+ avl_insert_neighbours(vmp, &mm->mmap_avl, &prev, &next);
+ pprev = (prev ? &prev->vm_next : &mm->mmap);
+ if (*pprev != next)
+ printk("insert_vm_struct: tree inconsistent with list\n");
+ }
+ vmp->vm_next = *pprev;
*pprev = vmp;
- vmp->vm_pprev = pprev;
+
+ mm->map_count++;
+ if (mm->map_count >= AVL_MIN_MAP_COUNT && !mm->mmap_avl)
+ build_mmap_avl(mm);
file = vmp->vm_file;
if (file) {
@@ -637,23 +796,17 @@ void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp)
*/
void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
{
- struct vm_area_struct *prev, *mpnt, *next;
+ struct vm_area_struct *prev, *mpnt, *next, *prev1;
- prev = NULL;
- mpnt = mm->mmap;
- while(mpnt && mpnt->vm_end <= start_addr) {
- prev = mpnt;
- mpnt = mpnt->vm_next;
- }
+ mpnt = find_vma_prev(mm, start_addr, &prev1);
if (!mpnt)
return;
- next = mpnt->vm_next;
-
- /* we have prev->vm_next == mpnt && mpnt->vm_next = next */
- if (!prev) {
+ if (prev1) {
+ prev = prev1;
+ } else {
prev = mpnt;
- mpnt = next;
+ mpnt = mpnt->vm_next;
}
/* prev and mpnt cycle through the list, as long as
@@ -684,11 +837,10 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l
* big segment can possibly merge with the next one.
* The old unused mpnt is freed.
*/
- if(mpnt->vm_next)
- mpnt->vm_next->vm_pprev = mpnt->vm_pprev;
- *mpnt->vm_pprev = mpnt->vm_next;
-
+ if (mm->mmap_avl)
+ avl_remove(mpnt, &mm->mmap_avl);
prev->vm_end = mpnt->vm_end;
+ prev->vm_next = mpnt->vm_next;
if (mpnt->vm_ops && mpnt->vm_ops->close) {
mpnt->vm_offset += mpnt->vm_end - mpnt->vm_start;
mpnt->vm_start = mpnt->vm_end;
diff --git a/mm/mmap_avl.c b/mm/mmap_avl.c
new file mode 100644
index 000000000..5a48ce89b
--- /dev/null
+++ b/mm/mmap_avl.c
@@ -0,0 +1,374 @@
+/*
+ * Searching a VMA in the linear list task->mm->mmap is horribly slow.
+ * Use an AVL (Adelson-Velskii and Landis) tree to speed up this search
+ * from O(n) to O(log n), where n is the number of VMAs of the task
+ * n is typically around 6, but may reach 3000 in some cases: object-oriented
+ * databases, persistent store, generational garbage collection (Java, Lisp),
+ * ElectricFence.
+ * Written by Bruno Haible <haible@ma2s2.mathematik.uni-karlsruhe.de>.
+ */
+
+/* We keep the list and tree sorted by address. */
+#define vm_avl_key vm_end
+#define vm_avl_key_t unsigned long /* typeof(vma->avl_key) */
+
+/*
+ * task->mm->mmap_avl is the AVL tree corresponding to task->mm->mmap
+ * or, more exactly, its root.
+ * A vm_area_struct has the following fields:
+ * vm_avl_left left son of a tree node
+ * vm_avl_right right son of a tree node
+ * vm_avl_height 1+max(heightof(left),heightof(right))
+ * The empty tree is represented as NULL.
+ */
+
+/* Since the trees are balanced, their height will never be large. */
+#define avl_maxheight 41 /* why this? a small exercise */
+#define heightof(tree) ((tree) == vm_avl_empty ? 0 : (tree)->vm_avl_height)
+/*
+ * Consistency and balancing rules:
+ * 1. tree->vm_avl_height == 1+max(heightof(tree->vm_avl_left),heightof(tree->vm_avl_right))
+ * 2. abs( heightof(tree->vm_avl_left) - heightof(tree->vm_avl_right) ) <= 1
+ * 3. foreach node in tree->vm_avl_left: node->vm_avl_key <= tree->vm_avl_key,
+ * foreach node in tree->vm_avl_right: node->vm_avl_key >= tree->vm_avl_key.
+ */
+
+#ifdef DEBUG_AVL
+
+/* Look up the nodes at the left and at the right of a given node. */
+static void avl_neighbours (struct vm_area_struct * node, struct vm_area_struct * tree, struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right)
+{
+ vm_avl_key_t key = node->vm_avl_key;
+
+ *to_the_left = *to_the_right = NULL;
+ for (;;) {
+ if (tree == vm_avl_empty) {
+ printk("avl_neighbours: node not found in the tree\n");
+ return;
+ }
+ if (key == tree->vm_avl_key)
+ break;
+ if (key < tree->vm_avl_key) {
+ *to_the_right = tree;
+ tree = tree->vm_avl_left;
+ } else {
+ *to_the_left = tree;
+ tree = tree->vm_avl_right;
+ }
+ }
+ if (tree != node) {
+ printk("avl_neighbours: node not exactly found in the tree\n");
+ return;
+ }
+ if (tree->vm_avl_left != vm_avl_empty) {
+ struct vm_area_struct * node;
+ for (node = tree->vm_avl_left; node->vm_avl_right != vm_avl_empty; node = node->vm_avl_right)
+ continue;
+ *to_the_left = node;
+ }
+ if (tree->vm_avl_right != vm_avl_empty) {
+ struct vm_area_struct * node;
+ for (node = tree->vm_avl_right; node->vm_avl_left != vm_avl_empty; node = node->vm_avl_left)
+ continue;
+ *to_the_right = node;
+ }
+ if ((*to_the_left && ((*to_the_left)->vm_next != node)) || (node->vm_next != *to_the_right))
+ printk("avl_neighbours: tree inconsistent with list\n");
+}
+
+#endif
+
+/*
+ * Rebalance a tree.
+ * After inserting or deleting a node of a tree we have a sequence of subtrees
+ * nodes[0]..nodes[k-1] such that
+ * nodes[0] is the root and nodes[i+1] = nodes[i]->{vm_avl_left|vm_avl_right}.
+ */
+static void avl_rebalance (struct vm_area_struct *** nodeplaces_ptr, int count)
+{
+ for ( ; count > 0 ; count--) {
+ struct vm_area_struct ** nodeplace = *--nodeplaces_ptr;
+ struct vm_area_struct * node = *nodeplace;
+ struct vm_area_struct * nodeleft = node->vm_avl_left;
+ struct vm_area_struct * noderight = node->vm_avl_right;
+ int heightleft = heightof(nodeleft);
+ int heightright = heightof(noderight);
+ if (heightright + 1 < heightleft) {
+ /* */
+ /* * */
+ /* / \ */
+ /* n+2 n */
+ /* */
+ struct vm_area_struct * nodeleftleft = nodeleft->vm_avl_left;
+ struct vm_area_struct * nodeleftright = nodeleft->vm_avl_right;
+ int heightleftright = heightof(nodeleftright);
+ if (heightof(nodeleftleft) >= heightleftright) {
+ /* */
+ /* * n+2|n+3 */
+ /* / \ / \ */
+ /* n+2 n --> / n+1|n+2 */
+ /* / \ | / \ */
+ /* n+1 n|n+1 n+1 n|n+1 n */
+ /* */
+ node->vm_avl_left = nodeleftright; nodeleft->vm_avl_right = node;
+ nodeleft->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightleftright);
+ *nodeplace = nodeleft;
+ } else {
+ /* */
+ /* * n+2 */
+ /* / \ / \ */
+ /* n+2 n --> n+1 n+1 */
+ /* / \ / \ / \ */
+ /* n n+1 n L R n */
+ /* / \ */
+ /* L R */
+ /* */
+ nodeleft->vm_avl_right = nodeleftright->vm_avl_left;
+ node->vm_avl_left = nodeleftright->vm_avl_right;
+ nodeleftright->vm_avl_left = nodeleft;
+ nodeleftright->vm_avl_right = node;
+ nodeleft->vm_avl_height = node->vm_avl_height = heightleftright;
+ nodeleftright->vm_avl_height = heightleft;
+ *nodeplace = nodeleftright;
+ }
+ }
+ else if (heightleft + 1 < heightright) {
+ /* similar to the above, just interchange 'left' <--> 'right' */
+ struct vm_area_struct * noderightright = noderight->vm_avl_right;
+ struct vm_area_struct * noderightleft = noderight->vm_avl_left;
+ int heightrightleft = heightof(noderightleft);
+ if (heightof(noderightright) >= heightrightleft) {
+ node->vm_avl_right = noderightleft; noderight->vm_avl_left = node;
+ noderight->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightrightleft);
+ *nodeplace = noderight;
+ } else {
+ noderight->vm_avl_left = noderightleft->vm_avl_right;
+ node->vm_avl_right = noderightleft->vm_avl_left;
+ noderightleft->vm_avl_right = noderight;
+ noderightleft->vm_avl_left = node;
+ noderight->vm_avl_height = node->vm_avl_height = heightrightleft;
+ noderightleft->vm_avl_height = heightright;
+ *nodeplace = noderightleft;
+ }
+ }
+ else {
+ int height = (heightleft<heightright ? heightright : heightleft) + 1;
+ if (height == node->vm_avl_height)
+ break;
+ node->vm_avl_height = height;
+ }
+ }
+}
+
+/* Insert a node into a tree. */
+static inline void avl_insert (struct vm_area_struct * new_node, struct vm_area_struct ** ptree)
+{
+ vm_avl_key_t key = new_node->vm_avl_key;
+ struct vm_area_struct ** nodeplace = ptree;
+ struct vm_area_struct ** stack[avl_maxheight];
+ int stack_count = 0;
+ struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */
+ for (;;) {
+ struct vm_area_struct * node = *nodeplace;
+ if (node == vm_avl_empty)
+ break;
+ *stack_ptr++ = nodeplace; stack_count++;
+ if (key < node->vm_avl_key)
+ nodeplace = &node->vm_avl_left;
+ else
+ nodeplace = &node->vm_avl_right;
+ }
+ new_node->vm_avl_left = vm_avl_empty;
+ new_node->vm_avl_right = vm_avl_empty;
+ new_node->vm_avl_height = 1;
+ *nodeplace = new_node;
+ avl_rebalance(stack_ptr,stack_count);
+}
+
+/* Insert a node into a tree, and
+ * return the node to the left of it and the node to the right of it.
+ */
+static inline void avl_insert_neighbours (struct vm_area_struct * new_node, struct vm_area_struct ** ptree,
+ struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right)
+{
+ vm_avl_key_t key = new_node->vm_avl_key;
+ struct vm_area_struct ** nodeplace = ptree;
+ struct vm_area_struct ** stack[avl_maxheight];
+ int stack_count = 0;
+ struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */
+ *to_the_left = *to_the_right = NULL;
+ for (;;) {
+ struct vm_area_struct * node = *nodeplace;
+ if (node == vm_avl_empty)
+ break;
+ *stack_ptr++ = nodeplace; stack_count++;
+ if (key < node->vm_avl_key) {
+ *to_the_right = node;
+ nodeplace = &node->vm_avl_left;
+ } else {
+ *to_the_left = node;
+ nodeplace = &node->vm_avl_right;
+ }
+ }
+ new_node->vm_avl_left = vm_avl_empty;
+ new_node->vm_avl_right = vm_avl_empty;
+ new_node->vm_avl_height = 1;
+ *nodeplace = new_node;
+ avl_rebalance(stack_ptr,stack_count);
+}
+
+/* Removes a node out of a tree. */
+static void avl_remove (struct vm_area_struct * node_to_delete, struct vm_area_struct ** ptree)
+{
+ vm_avl_key_t key = node_to_delete->vm_avl_key;
+ struct vm_area_struct ** nodeplace = ptree;
+ struct vm_area_struct ** stack[avl_maxheight];
+ int stack_count = 0;
+ struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */
+ struct vm_area_struct ** nodeplace_to_delete;
+ for (;;) {
+ struct vm_area_struct * node = *nodeplace;
+#ifdef DEBUG_AVL
+ if (node == vm_avl_empty) {
+ /* what? node_to_delete not found in tree? */
+ printk("avl_remove: node to delete not found in tree\n");
+ return;
+ }
+#endif
+ *stack_ptr++ = nodeplace; stack_count++;
+ if (key == node->vm_avl_key)
+ break;
+ if (key < node->vm_avl_key)
+ nodeplace = &node->vm_avl_left;
+ else
+ nodeplace = &node->vm_avl_right;
+ }
+ nodeplace_to_delete = nodeplace;
+ /* Have to remove node_to_delete = *nodeplace_to_delete. */
+ if (node_to_delete->vm_avl_left == vm_avl_empty) {
+ *nodeplace_to_delete = node_to_delete->vm_avl_right;
+ stack_ptr--; stack_count--;
+ } else {
+ struct vm_area_struct *** stack_ptr_to_delete = stack_ptr;
+ struct vm_area_struct ** nodeplace = &node_to_delete->vm_avl_left;
+ struct vm_area_struct * node;
+ for (;;) {
+ node = *nodeplace;
+ if (node->vm_avl_right == vm_avl_empty)
+ break;
+ *stack_ptr++ = nodeplace; stack_count++;
+ nodeplace = &node->vm_avl_right;
+ }
+ *nodeplace = node->vm_avl_left;
+ /* node replaces node_to_delete */
+ node->vm_avl_left = node_to_delete->vm_avl_left;
+ node->vm_avl_right = node_to_delete->vm_avl_right;
+ node->vm_avl_height = node_to_delete->vm_avl_height;
+ *nodeplace_to_delete = node; /* replace node_to_delete */
+ *stack_ptr_to_delete = &node->vm_avl_left; /* replace &node_to_delete->vm_avl_left */
+ }
+ avl_rebalance(stack_ptr,stack_count);
+}
+
+#ifdef DEBUG_AVL
+
+/* print a list */
+static void printk_list (struct vm_area_struct * vma)
+{
+ printk("[");
+ while (vma) {
+ printk("%08lX-%08lX", vma->vm_start, vma->vm_end);
+ vma = vma->vm_next;
+ if (!vma)
+ break;
+ printk(" ");
+ }
+ printk("]");
+}
+
+/* print a tree */
+static void printk_avl (struct vm_area_struct * tree)
+{
+ if (tree != vm_avl_empty) {
+ printk("(");
+ if (tree->vm_avl_left != vm_avl_empty) {
+ printk_avl(tree->vm_avl_left);
+ printk("<");
+ }
+ printk("%08lX-%08lX", tree->vm_start, tree->vm_end);
+ if (tree->vm_avl_right != vm_avl_empty) {
+ printk(">");
+ printk_avl(tree->vm_avl_right);
+ }
+ printk(")");
+ }
+}
+
+static char *avl_check_point = "somewhere";
+
+/* check a tree's consistency and balancing */
+static void avl_checkheights (struct vm_area_struct * tree)
+{
+ int h, hl, hr;
+
+ if (tree == vm_avl_empty)
+ return;
+ avl_checkheights(tree->vm_avl_left);
+ avl_checkheights(tree->vm_avl_right);
+ h = tree->vm_avl_height;
+ hl = heightof(tree->vm_avl_left);
+ hr = heightof(tree->vm_avl_right);
+ if ((h == hl+1) && (hr <= hl) && (hl <= hr+1))
+ return;
+ if ((h == hr+1) && (hl <= hr) && (hr <= hl+1))
+ return;
+ printk("%s: avl_checkheights: heights inconsistent\n",avl_check_point);
+}
+
+/* check that all values stored in a tree are < key */
+static void avl_checkleft (struct vm_area_struct * tree, vm_avl_key_t key)
+{
+ if (tree == vm_avl_empty)
+ return;
+ avl_checkleft(tree->vm_avl_left,key);
+ avl_checkleft(tree->vm_avl_right,key);
+ if (tree->vm_avl_key < key)
+ return;
+ printk("%s: avl_checkleft: left key %lu >= top key %lu\n",avl_check_point,tree->vm_avl_key,key);
+}
+
+/* check that all values stored in a tree are > key */
+static void avl_checkright (struct vm_area_struct * tree, vm_avl_key_t key)
+{
+ if (tree == vm_avl_empty)
+ return;
+ avl_checkright(tree->vm_avl_left,key);
+ avl_checkright(tree->vm_avl_right,key);
+ if (tree->vm_avl_key > key)
+ return;
+ printk("%s: avl_checkright: right key %lu <= top key %lu\n",avl_check_point,tree->vm_avl_key,key);
+}
+
+/* check that all values are properly increasing */
+static void avl_checkorder (struct vm_area_struct * tree)
+{
+ if (tree == vm_avl_empty)
+ return;
+ avl_checkorder(tree->vm_avl_left);
+ avl_checkorder(tree->vm_avl_right);
+ avl_checkleft(tree->vm_avl_left,tree->vm_avl_key);
+ avl_checkright(tree->vm_avl_right,tree->vm_avl_key);
+}
+
+/* all checks */
+static void avl_check (struct task_struct * task, char *caller)
+{
+ avl_check_point = caller;
+/* printk("task \"%s\", %s\n",task->comm,caller); */
+/* printk("task \"%s\" list: ",task->comm); printk_list(task->mm->mmap); printk("\n"); */
+/* printk("task \"%s\" tree: ",task->comm); printk_avl(task->mm->mmap_avl); printk("\n"); */
+ avl_checkheights(task->mm->mmap_avl);
+ avl_checkorder(task->mm->mmap_avl);
+}
+
+#endif
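
On the avl_maxheight question raised in the header comment ("why this? a small exercise"): one way to convince yourself that 41 is generous, though not necessarily the argument the author had in mind, is that the sparsest AVL tree of height h has N(h) = N(h-1) + N(h-2) + 1 nodes, so a tree would need hundreds of millions of vmas before it could reach height 42, far more than a 32-bit address space of 4 KB pages can ever hold. A few lines to tabulate that recurrence:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long n_prev = 1, n = 2;   /* N(1), N(2) */
        int h;

        for (h = 3; h <= 42; h++) {
            unsigned long long next = n + n_prev + 1;
            n_prev = n;
            n = next;
        }
        /* Prints 433494436 for h=41 and 701408732 for h=42. */
        printf("min nodes at height 41: %llu\n", n_prev);
        printf("min nodes at height 42: %llu\n", n);
        return 0;
    }
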
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7ceec01b9..4a956c085 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -90,33 +90,6 @@ static inline void remove_mem_queue(struct page * entry)
*/
spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;
-/*
- * This routine is used by the kernel swap daemon to determine
- * whether we have "enough" free pages. It is fairly arbitrary,
- * having a low-water and high-water mark.
- *
- * This returns:
- * 0 - urgent need for memory
- * 1 - need some memory, but do it slowly in the background
- * 2 - no need to even think about it.
- */
-int free_memory_available(void)
-{
- static int available = 1;
-
- if (nr_free_pages < freepages.low) {
- available = 0;
- return 0;
- }
-
- if (nr_free_pages > freepages.high) {
- available = 1;
- return 2;
- }
-
- return available;
-}
-
static inline void free_pages_ok(unsigned long map_nr, unsigned long order)
{
struct free_area_struct *area = free_area + order;
@@ -151,14 +124,10 @@ void __free_page(struct page *page)
if (!PageReserved(page) && atomic_dec_and_test(&page->count)) {
if (PageSwapCache(page))
panic ("Freeing swap cache page");
- free_pages_ok(page->map_nr, 0);
+ page->flags &= ~(1 << PG_referenced);
+ free_pages_ok(page - mem_map, 0);
return;
}
-#if 0
- if (PageSwapCache(page) && atomic_read(&page->count) == 1)
- printk(KERN_WARNING "VM: Releasing swap cache page at %p",
- __builtin_return_address(0));
-#endif
}
void free_pages(unsigned long addr, unsigned long order)
@@ -172,15 +141,10 @@ void free_pages(unsigned long addr, unsigned long order)
if (atomic_dec_and_test(&map->count)) {
if (PageSwapCache(map))
panic ("Freeing swap cache pages");
+ map->flags &= ~(1 << PG_referenced);
free_pages_ok(map_nr, order);
return;
}
-#if 0
- if (PageSwapCache(map) && atomic_read(&map->count) == 1)
- printk(KERN_WARNING
- "VM: Releasing swap cache pages at %p",
- __builtin_return_address(0));
-#endif
}
}
@@ -191,14 +155,15 @@ void free_pages(unsigned long addr, unsigned long order)
change_bit((index) >> (1+(order)), (area)->map)
#define CAN_DMA(x) (PageDMA(x))
#define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT))
-#define RMQUEUE(order, dma) \
+#define RMQUEUE(order, gfp_mask) \
do { struct free_area_struct * area = free_area+order; \
unsigned long new_order = order; \
do { struct page *prev = memory_head(area), *ret = prev->next; \
while (memory_head(area) != ret) { \
- if (!dma || CAN_DMA(ret)) { \
- unsigned long map_nr = ret->map_nr; \
+ if (!(gfp_mask & __GFP_DMA) || CAN_DMA(ret)) { \
+ unsigned long map_nr; \
(prev->next = ret->next)->prev = prev; \
+ map_nr = ret - mem_map; \
MARK_USED(map_nr, new_order, area); \
nr_free_pages -= 1 << order; \
EXPAND(ret, map_nr, order, new_order, area); \
@@ -224,6 +189,8 @@ do { unsigned long size = 1 << high; \
atomic_set(&map->count, 1); \
} while (0)
+int low_on_memory = 0;
+
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
unsigned long flags;
@@ -231,30 +198,45 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order)
if (order >= NR_MEM_LISTS)
goto nopage;
- if (gfp_mask & __GFP_WAIT) {
- if (in_interrupt()) {
- static int count = 0;
- if (++count < 5) {
- printk("gfp called nonatomically from interrupt %p\n",
- __builtin_return_address(0));
- }
- goto nopage;
+#ifdef ATOMIC_MEMORY_DEBUGGING
+ if ((gfp_mask & __GFP_WAIT) && in_interrupt()) {
+ static int count = 0;
+ if (++count < 5) {
+ printk("gfp called nonatomically from interrupt %p\n",
+ __builtin_return_address(0));
}
+ goto nopage;
+ }
+#endif
- if (freepages.min > nr_free_pages) {
- int freed;
- freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);
- /*
- * Low priority (user) allocations must not
- * succeed if we didn't have enough memory
- * and we couldn't get more..
- */
- if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
- goto nopage;
+ /*
+ * If this is a recursive call, we'd better
+ * do our best to just allocate things without
+ * further thought.
+ */
+ if (!(current->flags & PF_MEMALLOC)) {
+ int freed;
+
+ if (nr_free_pages > freepages.min) {
+ if (!low_on_memory)
+ goto ok_to_allocate;
+ if (nr_free_pages >= freepages.high) {
+ low_on_memory = 0;
+ goto ok_to_allocate;
+ }
}
+
+ low_on_memory = 1;
+ current->flags |= PF_MEMALLOC;
+ freed = try_to_free_pages(gfp_mask);
+ current->flags &= ~PF_MEMALLOC;
+
+ if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
+ goto nopage;
}
+ok_to_allocate:
spin_lock_irqsave(&page_alloc_lock, flags);
- RMQUEUE(order, (gfp_mask & GFP_DMA));
+ RMQUEUE(order, gfp_mask);
spin_unlock_irqrestore(&page_alloc_lock, flags);
/*
@@ -341,7 +323,6 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m
--p;
atomic_set(&p->count, 0);
p->flags = (1 << PG_DMA) | (1 << PG_reserved);
- p->map_nr = p - mem_map;
} while (p > mem_map);
for (i = 0 ; i < NR_MEM_LISTS ; i++) {
@@ -359,6 +340,46 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m
return start_mem;
}
+/*
+ * Primitive swap readahead code. We simply read an aligned block of
+ * (1 << page_cluster) entries in the swap area. This method is chosen
+ * because it doesn't cost us any seek time. We also make sure to queue
+ * the 'original' request together with the readahead ones...
+ */
+void swapin_readahead(unsigned long entry)
+{
+ int i;
+ struct page *new_page;
+ unsigned long offset = SWP_OFFSET(entry);
+ struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
+
+ offset = (offset >> page_cluster) << page_cluster;
+
+ i = 1 << page_cluster;
+ do {
+ /* Don't read-ahead past the end of the swap area */
+ if (offset >= swapdev->max)
+ break;
+ /* Don't block on I/O for read-ahead */
+ if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
+ break;
+ /* Don't read in bad or busy pages */
+ if (!swapdev->swap_map[offset])
+ break;
+ if (swapdev->swap_map[offset] == SWAP_MAP_BAD)
+ break;
+ if (test_bit(offset, swapdev->swap_lockmap))
+ break;
+
+ /* Ok, do the async read-ahead now */
+ new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
+ if (new_page != NULL)
+ __free_page(new_page);
+ offset++;
+ } while (--i);
+ return;
+}
+
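
The rounding in swapin_readahead() is the same idea as the file-map read-ahead earlier in this diff, but offset here counts swap slots rather than bytes, so only page_cluster bits are cleared. A stand-alone check of that detail (values chosen arbitrarily):

    #include <assert.h>

    int main(void)
    {
        unsigned long page_cluster = 4;
        unsigned long offset = 12345;           /* swap slot number */
        unsigned long base = (offset >> page_cluster) << page_cluster;

        assert(base == (offset & ~((1UL << page_cluster) - 1)));
        assert(base <= offset && offset < base + (1UL << page_cluster));
        return 0;
    }
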
/*
* The tests may look silly, but it essentially makes sure that
* no other process did a swap-in on us just as we were waiting.
@@ -370,10 +391,12 @@ void swap_in(struct task_struct * tsk, struct vm_area_struct * vma,
pte_t * page_table, unsigned long entry, int write_access)
{
unsigned long page;
- struct page *page_map;
-
- page_map = read_swap_cache(entry);
+ struct page *page_map = lookup_swap_cache(entry);
+ if (!page_map) {
+ swapin_readahead(entry);
+ page_map = read_swap_cache(entry);
+ }
if (pte_val(*page_table) != entry) {
if (page_map)
free_page_and_swap_cache(page_address(page_map));
diff --git a/mm/page_io.c b/mm/page_io.c
index 2dd24facc..498e4f63d 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -7,6 +7,7 @@
* Asynchronous swapping added 30.12.95. Stephen Tweedie
* Removed race in async swapping. 14.4.1996. Bruno Haible
* Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
+ * Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
*/
#include <linux/mm.h>
@@ -15,8 +16,6 @@
#include <linux/locks.h>
#include <linux/swapctl.h>
-#include <asm/dma.h>
-#include <asm/uaccess.h> /* for copy_to/from_user */
#include <asm/pgtable.h>
static struct wait_queue * lock_queue = NULL;
@@ -24,8 +23,6 @@ static struct wait_queue * lock_queue = NULL;
/*
* Reads or writes a swap page.
* wait=1: start I/O and wait for completion. wait=0: start asynchronous I/O.
- * All IO to swap files (as opposed to swap partitions) is done
- * synchronously.
*
* Important prevention of race condition: the caller *must* atomically
* create a unique swap cache entry for this swap page before calling
@@ -38,21 +35,22 @@ static struct wait_queue * lock_queue = NULL;
* that shared pages stay shared while being swapped.
*/
-void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
+static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, int wait)
{
unsigned long type, offset;
struct swap_info_struct * p;
- struct page *page = mem_map + MAP_NR(buf);
+ int zones[PAGE_SIZE/512];
+ int zones_used;
+ kdev_t dev = 0;
+ int block_size;
#ifdef DEBUG_SWAP
printk ("DebugVM: %s_swap_page entry %08lx, page %p (count %d), %s\n",
(rw == READ) ? "read" : "write",
- entry, buf, atomic_read(&page->count),
+ entry, (char *) page_address(page), atomic_read(&page->count),
wait ? "wait" : "nowait");
#endif
- if (page->inode && page->inode != &swapper_inode)
- panic ("Tried to swap a non-swapper page");
type = SWP_TYPE(entry);
if (type >= nr_swapfiles) {
printk("Internal error: bad swap-device\n");
@@ -60,7 +58,7 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
}
/* Don't allow too many pending pages in flight.. */
- if (atomic_read(&nr_async_pages) > SWAP_CLUSTER_MAX)
+ if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster)
wait = 1;
p = &swap_info[type];
@@ -85,13 +83,27 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
printk(KERN_ERR "VM: swap page is unlocked\n");
return;
}
-
- /* Make sure we are the only process doing I/O with this swap page. */
- while (test_and_set_bit(offset,p->swap_lockmap)) {
- run_task_queue(&tq_disk);
- sleep_on(&lock_queue);
+
+ if (PageSwapCache(page)) {
+ /* Make sure we are the only process doing I/O with this swap page. */
+ while (test_and_set_bit(offset,p->swap_lockmap)) {
+ run_task_queue(&tq_disk);
+ sleep_on(&lock_queue);
+ }
+
+ /*
+ * Make sure that we have a swap cache association for this
+ * page. We need this to find which swap page to unlock once
+ * the swap IO has completed to the physical page. If the page
+ * is not already in the cache, just overload the offset entry
+ * as if it were: we are not allowed to manipulate the inode
+ * hashing for locked pages.
+ */
+ if (page->offset != entry) {
+ printk ("swap entry mismatch");
+ return;
+ }
}
-
if (rw == READ) {
clear_bit(PG_uptodate, &page->flags);
kstat.pswpin++;
@@ -99,54 +111,25 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
kstat.pswpout++;
atomic_inc(&page->count);
- /*
- * Make sure that we have a swap cache association for this
- * page. We need this to find which swap page to unlock once
- * the swap IO has completed to the physical page. If the page
- * is not already in the cache, just overload the offset entry
- * as if it were: we are not allowed to manipulate the inode
- * hashing for locked pages.
- */
- if (!PageSwapCache(page)) {
- printk(KERN_ERR "VM: swap page is not in swap cache\n");
- return;
- }
- if (page->offset != entry) {
- printk (KERN_ERR "VM: swap entry mismatch\n");
- return;
- }
-
if (p->swap_device) {
- if (!wait) {
- set_bit(PG_free_after, &page->flags);
- set_bit(PG_decr_after, &page->flags);
- set_bit(PG_swap_unlock_after, &page->flags);
- atomic_inc(&nr_async_pages);
- }
- ll_rw_page(rw,p->swap_device,offset,buf);
- /*
- * NOTE! We don't decrement the page count if we
- * don't wait - that will happen asynchronously
- * when the IO completes.
- */
- if (!wait)
- return;
- wait_on_page(page);
+ zones[0] = offset;
+ zones_used = 1;
+ dev = p->swap_device;
+ block_size = PAGE_SIZE;
} else if (p->swap_file) {
struct inode *swapf = p->swap_file->d_inode;
- unsigned int zones[PAGE_SIZE/512];
int i;
if (swapf->i_op->bmap == NULL
&& swapf->i_op->smap != NULL){
/*
- With MS-DOS, we use msdos_smap which return
+ With MS-DOS, we use msdos_smap which returns
a sector number (not a cluster or block number).
It is a patch to enable the UMSDOS project.
Other people are working on better solution.
It sounds like ll_rw_swap_file defined
- it operation size (sector size) based on
- PAGE_SIZE and the number of block to read.
+ its operation size (sector size) based on
+ PAGE_SIZE and the number of blocks to read.
So using bmap or smap should work even if
smap will require more blocks.
*/
@@ -159,39 +142,72 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
return;
}
}
+ block_size = 512;
}else{
int j;
unsigned int block = offset
<< (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits);
- for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize)
+ block_size = swapf->i_sb->s_blocksize;
+ for (i=0, j=0; j< PAGE_SIZE ; i++, j += block_size)
if (!(zones[i] = bmap(swapf,block++))) {
printk("rw_swap_page: bad swap file\n");
return;
}
+ zones_used = i;
+ dev = swapf->i_dev;
}
- ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf);
- /* Unlike ll_rw_page, ll_rw_swap_file won't unlock the
- page for us. */
- clear_bit(PG_locked, &page->flags);
- wake_up(&page->wait);
- } else
+ } else {
printk(KERN_ERR "rw_swap_page: no swap file or device\n");
+ /* Do some cleaning up so if this ever happens we can hopefully
+ * trigger controlled shutdown.
+ */
+ if (PageSwapCache(page)) {
+ if (!test_and_clear_bit(offset,p->swap_lockmap))
+ printk("swap_after_unlock_page: lock already cleared\n");
+ wake_up(&lock_queue);
+ }
+ atomic_dec(&page->count);
+ return;
+ }
+ if (!wait) {
+ set_bit(PG_decr_after, &page->flags);
+ atomic_inc(&nr_async_pages);
+ }
+ if (PageSwapCache(page)) {
+ /* only lock/unlock swap cache pages! */
+ set_bit(PG_swap_unlock_after, &page->flags);
+ }
+ set_bit(PG_free_after, &page->flags);
+ /* block_size == PAGE_SIZE/zones_used */
+ brw_page(rw, page, dev, zones, block_size, 0);
+
+ /* Note! For consistency we do all of the logic,
+ * decrementing the page count, and unlocking the page in the
+ * swap lock map - in the IO completion handler.
+ */
+ if (!wait)
+ return;
+ wait_on_page(page);
/* This shouldn't happen, but check to be sure. */
- if (atomic_read(&page->count) == 1)
+ if (atomic_read(&page->count) == 0)
printk(KERN_ERR "rw_swap_page: page unused while waiting!\n");
- atomic_dec(&page->count);
- if (offset && !test_and_clear_bit(offset,p->swap_lockmap))
- printk(KERN_ERR "rw_swap_page: lock already cleared\n");
- wake_up(&lock_queue);
+
#ifdef DEBUG_SWAP
printk ("DebugVM: %s_swap_page finished on page %p (count %d)\n",
(rw == READ) ? "read" : "write",
- buf, atomic_read(&page->count));
+ (char *) page_address(page),

+ atomic_read(&page->count));
#endif
}
+/* Note: We could remove this totally asynchronous function,
+ * and improve swap performance, and remove the need for the swap lock map,
+ * by not removing pages from the swap cache until after I/O has been
+ * processed and letting remove_from_page_cache decrement the swap count
+ * just before it removes the page from the page cache.
+ */
/* This is run when asynchronous page I/O has completed. */
void swap_after_unlock_page (unsigned long entry)
{
@@ -214,6 +230,35 @@ void swap_after_unlock_page (unsigned long entry)
wake_up(&lock_queue);
}
+/* A simple wrapper so the base function doesn't need to enforce
+ * that all swap pages go through the swap cache!
+ */
+void rw_swap_page(int rw, unsigned long entry, char *buf, int wait)
+{
+ struct page *page = mem_map + MAP_NR(buf);
+
+ if (page->inode && page->inode != &swapper_inode)
+ panic ("Tried to swap a non-swapper page");
+
+ /*
+ * Make sure that we have a swap cache association for this
+ * page. We need this to find which swap page to unlock once
+ * the swap IO has completed to the physical page. If the page
+ * is not already in the cache, just overload the offset entry
+ * as if it were: we are not allowed to manipulate the inode
+ * hashing for locked pages.
+ */
+ if (!PageSwapCache(page)) {
+ printk("VM: swap page is not in swap cache\n");
+ return;
+ }
+ if (page->offset != entry) {
+ printk ("swap entry mismatch");
+ return;
+ }
+ rw_swap_page_base(rw, entry, page, wait);
+}
+
/*
* Setting up a new swap file needs a simple wrapper just to read the
* swap signature. SysV shared memory also needs a simple wrapper.
@@ -242,33 +287,23 @@ void rw_swap_page_nocache(int rw, unsigned long entry, char *buffer)
clear_bit(PG_swap_cache, &page->flags);
}
-
-
/*
- * Swap partitions are now read via brw_page. ll_rw_page is an
- * asynchronous function now --- we must call wait_on_page afterwards
- * if synchronous IO is required.
+ * shmfs needs a version that doesn't put the page in the page cache!
+ * The swap lock map insists that pages be in the page cache!
+ * Therefore we can't use it. Later when we can remove the need for the
+ * lock map and we can reduce the number of functions exported.
*/
-void ll_rw_page(int rw, kdev_t dev, unsigned long offset, char * buffer)
+void rw_swap_page_nolock(int rw, unsigned long entry, char *buffer, int wait)
{
- int block = offset;
- struct page *page;
-
- switch (rw) {
- case READ:
- break;
- case WRITE:
- if (is_read_only(dev)) {
- printk("Can't page to read-only device %s\n",
- kdevname(dev));
- return;
- }
- break;
- default:
- panic("ll_rw_page: bad block dev cmd, must be R/W");
+ struct page *page = mem_map + MAP_NR((unsigned long) buffer);
+
+ if (!PageLocked(page)) {
+ printk("VM: rw_swap_page_nolock: page not locked!\n");
+ return;
+ }
+ if (PageSwapCache(page)) {
+ printk ("VM: rw_swap_page_nolock: page in swap cache!\n");
+ return;
}
- page = mem_map + MAP_NR(buffer);
- if (!PageLocked(page))
- panic ("ll_rw_page: page not already locked");
- brw_page(rw, page, dev, &block, PAGE_SIZE, 0);
+ rw_swap_page_base(rw, entry, page, wait);
}
diff --git a/mm/swap.c b/mm/swap.c
index 1e2d8c36b..1d6b0d4d0 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -39,35 +39,21 @@ freepages_t freepages = {
144 /* freepages.high */
};
+/* How many pages do we try to swap or page in/out together? */
+int page_cluster = 4; /* Default value modified in swap_setup() */
+
/* We track the number of pages currently being asynchronously swapped
out, so that we don't try to swap TOO many pages out at once */
atomic_t nr_async_pages = ATOMIC_INIT(0);
-/*
- * Constants for the page aging mechanism: the maximum age (actually,
- * the maximum "youthfulness"); the quanta by which pages rejuvenate
- * and age; and the initial age for new pages.
- *
- * The "pageout_weight" is strictly a fixedpoint number with the
- * ten low bits being the fraction (ie 8192 really means "8.0").
- */
-swap_control_t swap_control = {
- 20, 3, 1, 3, /* Page aging */
- 32, 4, /* Aging cluster */
- 8192, /* sc_pageout_weight aka PAGEOUT_WEIGHT */
- 8192, /* sc_bufferout_weight aka BUFFEROUT_WEIGHT */
-};
-
-swapstat_t swapstats = {0};
-
buffer_mem_t buffer_mem = {
- 5, /* minimum percent buffer */
+ 2, /* minimum percent buffer */
10, /* borrow percent buffer */
60 /* maximum percent buffer */
};
buffer_mem_t page_cache = {
- 5, /* minimum percent page cache */
+ 2, /* minimum percent page cache */
15, /* borrow percent page cache */
75 /* maximum */
};
@@ -77,3 +63,18 @@ pager_daemon_t pager_daemon = {
SWAP_CLUSTER_MAX, /* minimum number of tries */
SWAP_CLUSTER_MAX, /* do swap I/O in clusters of this size */
};
+
+/*
+ * Perform any setup for the swap system
+ */
+
+void __init swap_setup(void)
+{
+ /* Use a smaller cluster for memory <16MB or <32MB */
+ if (num_physpages < ((16 * 1024 * 1024) >> PAGE_SHIFT))
+ page_cluster = 2;
+ else if (num_physpages < ((32 * 1024 * 1024) >> PAGE_SHIFT))
+ page_cluster = 3;
+ else
+ page_cluster = 4;
+}
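
For the common 4 KB page size, the clusters chosen above work out to 16 KB, 32 KB and 64 KB of swap read-ahead respectively; a quick check (nothing kernel-specific):

    #include <stdio.h>

    int main(void)
    {
        unsigned long page_size = 4096;
        int clusters[] = { 2, 3, 4 };

        for (int i = 0; i < 3; i++)
            printf("page_cluster=%d -> %lu KB per read-ahead cluster\n",
                   clusters[i], (page_size << clusters[i]) >> 10);
        return 0;
    }
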
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e098974b2..8c5e7176c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -29,18 +29,16 @@ struct inode swapper_inode;
#ifdef SWAP_CACHE_INFO
unsigned long swap_cache_add_total = 0;
-unsigned long swap_cache_add_success = 0;
unsigned long swap_cache_del_total = 0;
-unsigned long swap_cache_del_success = 0;
unsigned long swap_cache_find_total = 0;
unsigned long swap_cache_find_success = 0;
void show_swap_cache_info(void)
{
- printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n",
- swap_cache_add_total, swap_cache_add_success,
- swap_cache_del_total, swap_cache_del_success,
- swap_cache_find_total, swap_cache_find_success);
+ printk("Swap cache: add %ld, delete %ld, find %ld/%ld\n",
+ swap_cache_add_total,
+ swap_cache_del_total,
+ swap_cache_find_success, swap_cache_find_total);
}
#endif
@@ -69,9 +67,6 @@ int add_to_swap_cache(struct page *page, unsigned long entry)
page->offset = entry;
add_page_to_hash_queue(page, &swapper_inode, entry);
add_page_to_inode_queue(&swapper_inode, page);
-#ifdef SWAP_CACHE_INFO
- swap_cache_add_success++;
-#endif
return 1;
}
@@ -192,16 +187,6 @@ static inline void remove_from_swap_cache(struct page *page)
printk ("VM: Removing swap cache page with wrong inode hash "
"on page %08lx\n", page_address(page));
}
-#if 0
- /*
- * This is a legal case, but warn about it.
- */
- if (atomic_read(&page->count) == 1) {
- printk (KERN_WARNING
- "VM: Removing page cache on unshared page %08lx\n",
- page_address(page));
- }
-#endif
#ifdef DEBUG_SWAP
printk("DebugVM: remove_from_swap_cache(%08lx count %d)\n",
@@ -222,7 +207,6 @@ void delete_from_swap_cache(struct page *page)
#ifdef SWAP_CACHE_INFO
swap_cache_del_total++;
- swap_cache_del_success++;
#endif
#ifdef DEBUG_SWAP
printk("DebugVM: delete_from_swap_cache(%08lx count %d, "
@@ -241,6 +225,7 @@ void delete_from_swap_cache(struct page *page)
void free_page_and_swap_cache(unsigned long addr)
{
struct page *page = mem_map + MAP_NR(addr);
+
/*
* If we are the only user, then free up the swap cache.
*/
@@ -248,7 +233,7 @@ void free_page_and_swap_cache(unsigned long addr)
delete_from_swap_cache(page);
}
- free_page(addr);
+ __free_page(page);
}
@@ -258,18 +243,25 @@ void free_page_and_swap_cache(unsigned long addr)
* incremented.
*/
-static struct page * lookup_swap_cache(unsigned long entry)
+struct page * lookup_swap_cache(unsigned long entry)
{
struct page *found;
-
+
+#ifdef SWAP_CACHE_INFO
+ swap_cache_find_total++;
+#endif
while (1) {
found = find_page(&swapper_inode, entry);
if (!found)
return 0;
if (found->inode != &swapper_inode || !PageSwapCache(found))
goto out_bad;
- if (!PageLocked(found))
+ if (!PageLocked(found)) {
+#ifdef SWAP_CACHE_INFO
+ swap_cache_find_success++;
+#endif
return found;
+ }
__free_page(found);
__wait_on_page(found);
}
@@ -291,7 +283,7 @@ out_bad:
struct page * read_swap_cache_async(unsigned long entry, int wait)
{
- struct page *found_page, *new_page;
+ struct page *found_page = 0, *new_page;
unsigned long new_page_addr;
#ifdef DEBUG_SWAP
@@ -299,15 +291,20 @@ struct page * read_swap_cache_async(unsigned long entry, int wait)
entry, wait ? ", wait" : "");
#endif
/*
+ * Make sure the swap entry is still in use.
+ */
+ if (!swap_duplicate(entry)) /* Account for the swap cache */
+ goto out;
+ /*
* Look for the page in the swap cache.
*/
found_page = lookup_swap_cache(entry);
if (found_page)
- goto out;
+ goto out_free_swap;
- new_page_addr = __get_free_page(GFP_KERNEL);
+ new_page_addr = __get_free_page(GFP_USER);
if (!new_page_addr)
- goto out; /* Out of memory */
+ goto out_free_swap; /* Out of memory */
new_page = mem_map + MAP_NR(new_page_addr);
/*
@@ -316,11 +313,6 @@ struct page * read_swap_cache_async(unsigned long entry, int wait)
found_page = lookup_swap_cache(entry);
if (found_page)
goto out_free_page;
- /*
- * Make sure the swap entry is still in use.
- */
- if (!swap_duplicate(entry)) /* Account for the swap cache */
- goto out_free_page;
/*
* Add it to the swap cache and read its contents.
*/
@@ -338,6 +330,8 @@ struct page * read_swap_cache_async(unsigned long entry, int wait)
out_free_page:
__free_page(new_page);
+out_free_swap:
+ swap_free(entry);
out:
return found_page;
}
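
The read_swap_cache_async() hunks move the swap_duplicate() call to the very top, so the swap entry is pinned before the cache lookup or the page allocation, and the reference is dropped again on every early-exit path via the new out_free_swap label. A toy, array-backed model of that ordering; the helpers below are stand-ins that only borrow the kernel's names, and the post-allocation cache re-check is folded out since the model is single-threaded:

    #include <stdio.h>
    #include <stdlib.h>

    #define NSLOTS 8

    static int swap_map[NSLOTS] = { 0, 2, 1, 0, 3, 0, 0, 1 }; /* entry use counts */
    static void *swap_cache[NSLOTS];                          /* cached pages */

    static int swap_duplicate(unsigned long entry)
    {
        if (entry >= NSLOTS || !swap_map[entry])
            return 0;               /* entry is no longer in use */
        swap_map[entry]++;
        return 1;
    }

    static void swap_free(unsigned long entry)
    {
        if (entry < NSLOTS && swap_map[entry])
            swap_map[entry]--;
    }

    /* Same shape as the patched read_swap_cache_async(): pin the swap
     * entry first, release it on every path that doesn't add a page. */
    static void *read_swap_page(unsigned long entry)
    {
        void *found = NULL, *page;

        if (!swap_duplicate(entry))     /* account for the swap cache */
            goto out;

        found = swap_cache[entry];
        if (found)
            goto out_free_swap;         /* cache hit */

        page = malloc(4096);            /* stand-in for __get_free_page() */
        if (!page)
            goto out_free_swap;         /* out of memory */

        /* The kernel re-checks the cache here; single-threaded, we don't. */
        swap_cache[entry] = page;       /* add to the cache, then start the read */
        return page;

    out_free_swap:
        swap_free(entry);
    out:
        return found;
    }

    int main(void)
    {
        printf("entry 1: %p\n", read_swap_page(1)); /* read and cached */
        printf("entry 1: %p\n", read_swap_page(1)); /* cache hit */
        printf("entry 0: %p\n", read_swap_page(0)); /* unused entry: NULL */
        return 0;
    }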
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c574fb59a..dd66701b5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -23,6 +23,7 @@ struct swap_list_t swap_list = {-1, -1};
struct swap_info_struct swap_info[MAX_SWAPFILES];
+#define SWAPFILE_CLUSTER 256
static inline int scan_swap_map(struct swap_info_struct *si)
{
@@ -30,7 +31,7 @@ static inline int scan_swap_map(struct swap_info_struct *si)
/*
* We try to cluster swap pages by allocating them
* sequentially in swap. Once we've allocated
- * SWAP_CLUSTER_MAX pages this way, however, we resort to
+ * SWAPFILE_CLUSTER pages this way, however, we resort to
* first-free allocation, starting a new cluster. This
* prevents us from scattering swap pages all over the entire
* swap partition, so that we reduce overall disk seek times
@@ -46,7 +47,7 @@ static inline int scan_swap_map(struct swap_info_struct *si)
goto got_page;
}
}
- si->cluster_nr = SWAP_CLUSTER_MAX;
+ si->cluster_nr = SWAPFILE_CLUSTER;
for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
if (si->swap_map[offset])
continue;
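
The scan_swap_map() comment above describes the allocation policy: keep handing out slots sequentially from the current cluster until SWAPFILE_CLUSTER of them have been allocated, then fall back to a first-free scan that starts a new cluster. A user-space sketch of that policy over a plain array; the lowest_bit/highest_bit bookkeeping and the lock map are omitted, and the cluster size is shrunk so the demo output stays readable:

    #include <stdio.h>

    #define SWAPFILE_CLUSTER 4      /* 256 in the patch; tiny here for the demo */
    #define NSLOTS 16

    static unsigned char swap_map[NSLOTS];  /* 0 = free slot */
    static int cluster_next, cluster_nr;

    /* Allocate one swap slot, preferring to continue the current cluster. */
    static int scan_swap_map(void)
    {
        int offset;

        if (cluster_nr) {
            while (cluster_next < NSLOTS) {
                offset = cluster_next++;
                if (!swap_map[offset]) {
                    cluster_nr--;
                    goto got_slot;
                }
            }
        }
        /* Cluster used up (or we ran off the end): first-free scan,
         * and start a fresh cluster at whatever we find. */
        cluster_nr = SWAPFILE_CLUSTER;
        for (offset = 0; offset < NSLOTS; offset++) {
            if (swap_map[offset])
                continue;
            cluster_next = offset + 1;
            goto got_slot;
        }
        return -1;                      /* swap area is full */

    got_slot:
        swap_map[offset] = 1;
        return offset;
    }

    int main(void)
    {
        swap_map[1] = swap_map[2] = 1;  /* pretend two slots are already taken */
        for (int i = 0; i < 8; i++)
            printf("allocated slot %d\n", scan_swap_map());
        return 0;
    }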
@@ -626,11 +627,11 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
p->highest_bit = swap_header->info.last_page - 1;
p->max = swap_header->info.last_page;
- if (p->max >= 0x7fffffffL/PAGE_SIZE ||
- (void *) &swap_header->info.badpages[(int) swap_header->info.nr_badpages-1] >= (void *) swap_header->magic.magic) {
- error = -EINVAL;
+ error = -EINVAL;
+ if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
+ goto bad_swap;
+ if (p->max >= SWP_OFFSET(SWP_ENTRY(0,~0UL)))
goto bad_swap;
- }
/* OK, set up the swap map and apply the bad block list */
if (!(p->swap_map = vmalloc (p->max * sizeof(short)))) {
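
The new sys_swapon() checks bound nr_badpages by MAX_SWAP_BADPAGES and reject swap areas whose last page cannot be represented in a swap entry; SWP_OFFSET(SWP_ENTRY(0, ~0UL)) is simply the largest offset the entry format can carry. A sketch of that limit under an assumed 32-bit, i386-style encoding (present bit, 7-bit type, offset in the high bits); the real SWP_* macros are per-architecture, so treat the numbers below as illustrative only:

    #include <stdio.h>
    #include <stdint.h>

    /* Assumed encoding, modeled on the 32-bit i386 layout of the day:
     * bit 0 is the hardware "present" bit, bits 1-7 carry the swap type,
     * and the swap offset lives in bits 8-31.  The real SWP_* macros are
     * defined per architecture in asm/pgtable.h. */
    typedef uint32_t swp_entry_t;

    #define SWP_TYPE(e)             (((e) >> 1) & 0x3f)
    #define SWP_OFFSET(e)           ((e) >> 8)
    #define SWP_ENTRY(type, off)    ((swp_entry_t)(((type) << 1) | ((off) << 8)))

    int main(void)
    {
        /* The sys_swapon() bound: the largest offset the format can hold. */
        swp_entry_t max_offset = SWP_OFFSET(SWP_ENTRY(0, ~(swp_entry_t)0));

        printf("largest encodable swap offset: %u pages\n",
               (unsigned)max_offset);
        printf("the encoding alone would allow about %u MB per swap area (4KB pages)\n",
               (unsigned)(max_offset >> 8));
        return 0;
    }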
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e99ad35fb..7063b2df1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -161,8 +161,10 @@ struct vm_struct * get_vm_area(unsigned long size)
for (p = &vmlist; (tmp = *p) ; p = &tmp->next) {
if (size + addr < (unsigned long) tmp->addr)
break;
- if (addr > VMALLOC_END-size)
+ if (addr > VMALLOC_END-size) {
+ kfree(area);
return NULL;
+ }
addr = tmp->size + (unsigned long) tmp->addr;
}
area->addr = (void *)addr;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5efa52a2..116096153 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -20,13 +20,6 @@
#include <asm/pgtable.h>
-/*
- * The wait queue for waking up the pageout daemon:
- */
-static struct task_struct * kswapd_task = NULL;
-
-static void init_swap_timer(void);
-
/*
* The swap-out functions return 1 if they successfully
* threw something out, and we got a free page. It returns
@@ -38,7 +31,7 @@ static void init_swap_timer(void);
* using a process that no longer actually exists (it might
* have died while we slept).
*/
-static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
+static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
unsigned long address, pte_t * page_table, int gfp_mask)
{
pte_t pte;
@@ -59,50 +52,6 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
|| ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)))
return 0;
- /*
- * Deal with page aging. There are several special cases to
- * consider:
- *
- * Page has been accessed, but is swap cached. If the page is
- * getting sufficiently "interesting" --- its age is getting
- * high --- then if we are sufficiently short of free swap
- * pages, then delete the swap cache. We can only do this if
- * the swap page's reference count is one: ie. there are no
- * other references to it beyond the swap cache (as there must
- * still be PTEs pointing to it if count > 1).
- *
- * If the page has NOT been touched, and its age reaches zero,
- * then we are swapping it out:
- *
- * If there is already a swap cache page for this page, then
- * another process has already allocated swap space, so just
- * dereference the physical page and copy in the swap entry
- * from the swap cache.
- *
- * Note, we rely on all pages read in from swap either having
- * the swap cache flag set, OR being marked writable in the pte,
- * but NEVER BOTH. (It IS legal to be neither cached nor dirty,
- * however.)
- *
- * -- Stephen Tweedie 1998 */
-
- if (PageSwapCache(page_map)) {
- if (pte_write(pte)) {
- struct page *found;
- printk ("VM: Found a writable swap-cached page!\n");
- /* Try to diagnose the problem ... */
- found = find_page(&swapper_inode, page_map->offset);
- if (found) {
- printk("page=%p@%08lx, found=%p, count=%d\n",
- page_map, page_map->offset,
- found, atomic_read(&found->count));
- __free_page(found);
- } else
- printk ("Spurious, page not in cache\n");
- return 0;
- }
- }
-
if (pte_young(pte)) {
/*
* Transfer the "accessed" bit from the page
@@ -110,109 +59,111 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
*/
set_pte(page_table, pte_mkold(pte));
set_bit(PG_referenced, &page_map->flags);
+ return 0;
+ }
- /*
- * We should test here to see if we want to recover any
- * swap cache page here. We do this if the page seeing
- * enough activity, AND we are sufficiently low on swap
- *
- * We need to track both the number of available swap
- * pages and the total number present before we can do
- * this...
- */
+ /*
+ * Is the page already in the swap cache? If so, then
+ * we can just drop our reference to it without doing
+ * any IO - it's already up-to-date on disk.
+ *
+ * Return 0, as we didn't actually free any real
+ * memory, and we should just continue our scan.
+ */
+ if (PageSwapCache(page_map)) {
+ entry = page_map->offset;
+ swap_duplicate(entry);
+ set_pte(page_table, __pte(entry));
+drop_pte:
+ vma->vm_mm->rss--;
+ flush_tlb_page(vma, address);
+ __free_page(page_map);
return 0;
}
- if (pte_dirty(pte)) {
- if (vma->vm_ops && vma->vm_ops->swapout) {
- pid_t pid = tsk->pid;
- vma->vm_mm->rss--;
- if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
- kill_proc(pid, SIGBUS, 1);
- } else {
- /*
- * This is a dirty, swappable page. First of all,
- * get a suitable swap entry for it, and make sure
- * we have the swap cache set up to associate the
- * page with that swap entry.
- */
- entry = in_swap_cache(page_map);
- if (!entry) {
- entry = get_swap_page();
- if (!entry)
- return 0; /* No swap space left */
- }
-
- vma->vm_mm->rss--;
- tsk->nswap++;
- flush_cache_page(vma, address);
- set_pte(page_table, __pte(entry));
- flush_tlb_page(vma, address);
- swap_duplicate(entry);
-
- /* Now to write back the page. We have two
- * cases: if the page is already part of the
- * swap cache, then it is already on disk. Just
- * free the page and return (we release the swap
- * cache on the last accessor too).
- *
- * If we have made a new swap entry, then we
- * start the write out to disk. If the page is
- * shared, however, we still need to keep the
- * copy in memory, so we add it to the swap
- * cache. */
- if (PageSwapCache(page_map)) {
- free_page(page);
- return (atomic_read(&page_map->count) == 0);
- }
- add_to_swap_cache(page_map, entry);
- /* We checked we were unlocked way up above, and we
- have been careful not to stall until here */
- set_bit(PG_locked, &page_map->flags);
- /* OK, do a physical write to swap. */
- rw_swap_page(WRITE, entry, (char *) page, (gfp_mask & __GFP_WAIT));
- }
- /* Now we can free the current physical page. We also
- * free up the swap cache if this is the last use of the
- * page. Note that there is a race here: the page may
- * still be shared COW by another process, but that
- * process may exit while we are writing out the page
- * asynchronously. That's no problem, shrink_mmap() can
- * correctly clean up the occassional unshared page
- * which gets left behind in the swap cache. */
- free_page(page);
- return 1; /* we slept: the process may not exist any more */
+ /*
+ * Is it a clean page? Then it must be recoverable
+ * by just paging it in again, and we can just drop
+ * it..
+ *
+ * However, this won't actually free any real
+ * memory, as the page will just be in the page cache
+ * somewhere, and as such we should just continue
+ * our scan.
+ *
+ * Basically, this just makes it possible for us to do
+ * some real work in the future in "shrink_mmap()".
+ */
+ if (!pte_dirty(pte)) {
+ pte_clear(page_table);
+ goto drop_pte;
}
- /* The page was _not_ dirty, but still has a zero age. It must
- * already be uptodate on disk. If it is in the swap cache,
- * then we can just unlink the page now. Remove the swap cache
- * too if this is the last user. */
- if ((entry = in_swap_cache(page_map))) {
- vma->vm_mm->rss--;
- flush_cache_page(vma, address);
- set_pte(page_table, __pte(entry));
- flush_tlb_page(vma, address);
- swap_duplicate(entry);
- free_page(page);
- return (atomic_read(&page_map->count) == 0);
- }
- /*
- * A clean page to be discarded? Must be mmap()ed from
- * somewhere. Unlink the pte, and tell the filemap code to
- * discard any cached backing page if this is the last user.
+ /*
+ * Don't go down into the swap-out stuff if
+ * we cannot do I/O! Avoid recursing on FS
+ * locks etc.
*/
- if (PageSwapCache(page_map)) {
- printk ("VM: How can this page _still_ be cached?");
+ if (!(gfp_mask & __GFP_IO))
return 0;
+
+ /*
+ * Ok, it's really dirty. That means that
+ * we should either create a new swap cache
+ * entry for it, or we should write it back
+ * to its own backing store.
+ *
+ * Note that in neither case do we actually
+ * know that we make a page available, but
+ * as we potentially sleep we can no longer
+	 * continue scanning, so we might as well
+ * assume we free'd something.
+ *
+ * NOTE NOTE NOTE! This should just set a
+ * dirty bit in page_map, and just drop the
+ * pte. All the hard work would be done by
+ * shrink_mmap().
+ *
+ * That would get rid of a lot of problems.
+ */
+ flush_cache_page(vma, address);
+ if (vma->vm_ops && vma->vm_ops->swapout) {
+ pid_t pid = tsk->pid;
+ pte_clear(page_table);
+ flush_tlb_page(vma, address);
+ vma->vm_mm->rss--;
+
+ if (vma->vm_ops->swapout(vma, page_map))
+ kill_proc(pid, SIGBUS, 1);
+ __free_page(page_map);
+ return 1;
}
+
+ /*
+ * This is a dirty, swappable page. First of all,
+ * get a suitable swap entry for it, and make sure
+ * we have the swap cache set up to associate the
+ * page with that swap entry.
+ */
+ entry = get_swap_page();
+ if (!entry)
+ return 0; /* No swap space left */
+
vma->vm_mm->rss--;
- flush_cache_page(vma, address);
- pte_clear(page_table);
+ tsk->nswap++;
+ set_pte(page_table, __pte(entry));
flush_tlb_page(vma, address);
- entry = (atomic_read(&page_map->count) == 1);
+ swap_duplicate(entry); /* One for the process, one for the swap cache */
+ add_to_swap_cache(page_map, entry);
+ /* We checked we were unlocked way up above, and we
+ have been careful not to stall until here */
+ set_bit(PG_locked, &page_map->flags);
+
+ /* OK, do a physical asynchronous write to swap. */
+ rw_swap_page(WRITE, entry, (char *) page, 0);
+
__free_page(page_map);
- return entry;
+ return 1;
}
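
The rewritten try_to_swap_out() above boils down to a decision ladder: recently used pages are only aged, swap-cached and clean pages are dropped without any I/O, and only dirty pages reach the swap-out path, and then only when __GFP_IO allows it. A user-space sketch of just that ladder, with the page reduced to a few flags; struct fake_page and classify() are illustrative names, not kernel ones:

    #include <stdio.h>

    struct fake_page {
        int young;          /* accessed since the last scan */
        int swap_cached;    /* already has an up-to-date copy on swap */
        int dirty;          /* modified since it was read in */
    };

    enum action { AGE, DROP_SWAP_CACHED, DROP_CLEAN, SKIP_NO_IO, WRITE_OUT };

    /* Mirror of the control flow above: the return value says what
     * try_to_swap_out() would have done with the pte/page. */
    static enum action classify(const struct fake_page *p, int can_do_io)
    {
        if (p->young)
            return AGE;                 /* clear "young", set PG_referenced */
        if (p->swap_cached)
            return DROP_SWAP_CACHED;    /* reuse the swap entry, drop the pte */
        if (!p->dirty)
            return DROP_CLEAN;          /* clean: can simply be paged in again */
        if (!can_do_io)
            return SKIP_NO_IO;          /* don't recurse into FS locks */
        return WRITE_OUT;               /* get swap space or call vm_ops->swapout */
    }

    int main(void)
    {
        static const char *names[] = {
            "age it", "drop (swap-cached)", "drop (clean)",
            "skip (no __GFP_IO)", "write it out",
        };
        struct fake_page pages[] = {
            { 1, 0, 1 }, { 0, 1, 1 }, { 0, 0, 0 }, { 0, 0, 1 },
        };

        for (int i = 0; i < 4; i++)
            printf("page %d -> %s\n", i, names[classify(&pages[i], 1)]);
        return 0;
    }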
/*
@@ -363,13 +314,23 @@ static int swap_out(unsigned int priority, int gfp_mask)
/*
* We make one or two passes through the task list, indexed by
* assign = {0, 1}:
- * Pass 1: select the swappable task with maximal swap_cnt.
- * Pass 2: assign new swap_cnt values, then select as above.
+ * Pass 1: select the swappable task with maximal RSS that has
+ * not yet been swapped out.
+ *	Pass 2: re-assign swap_cnt values from rss, then select as above.
+ *
* With this approach, there's no need to remember the last task
* swapped out. If the swap-out fails, we clear swap_cnt so the
* task won't be selected again until all others have been tried.
+ *
+ * Think of swap_cnt as a "shadow rss" - it tells us which process
+ * we want to page out (always try largest first).
*/
- counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority;
+ counter = nr_tasks / (priority+1);
+ if (counter < 1)
+ counter = 1;
+ if (counter > nr_tasks)
+ counter = nr_tasks;
+
for (; counter >= 0; counter--) {
assign = 0;
max_cnt = 0;
@@ -382,15 +343,9 @@ static int swap_out(unsigned int priority, int gfp_mask)
continue;
if (p->mm->rss <= 0)
continue;
- if (assign) {
- /*
- * If we didn't select a task on pass 1,
- * assign each task a new swap_cnt.
- * Normalise the number of pages swapped
- * by multiplying by (RSS / 1MB)
- */
- p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss);
- }
+ /* Refresh swap_cnt? */
+ if (assign)
+ p->swap_cnt = p->mm->rss;
if (p->swap_cnt > max_cnt) {
max_cnt = p->swap_cnt;
pbest = p;
@@ -404,56 +359,60 @@ static int swap_out(unsigned int priority, int gfp_mask)
}
goto out;
}
- pbest->swap_cnt--;
- /*
- * Nonzero means we cleared out something, but only "1" means
- * that we actually free'd up a page as a result.
- */
- if (swap_out_process(pbest, gfp_mask) == 1)
- return 1;
+ if (swap_out_process(pbest, gfp_mask))
+ return 1;
}
out:
return 0;
}
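
swap_out() now treats swap_cnt as the comment's "shadow rss": pass 1 picks the candidate with the largest remaining swap_cnt, and pass 2 refreshes swap_cnt from rss once every count has been used up, so the biggest process is always tried first. A small user-space model of that two-pass selection; tasks are reduced to rss/swap_cnt pairs and the counter/priority throttling is left out:

    #include <stdio.h>

    struct fake_task {
        const char *comm;
        int rss;        /* resident pages */
        int swap_cnt;   /* "shadow rss", counted down as pages are taken */
    };

    /* Pass 1: pick the largest remaining swap_cnt.
     * Pass 2: refresh swap_cnt from rss, then pick again. */
    static struct fake_task *select_victim(struct fake_task *tasks, int n)
    {
        for (int assign = 0; assign <= 1; assign++) {
            struct fake_task *best = NULL;
            int max_cnt = 0;

            for (int i = 0; i < n; i++) {
                if (tasks[i].rss <= 0)
                    continue;
                if (assign)
                    tasks[i].swap_cnt = tasks[i].rss;
                if (tasks[i].swap_cnt > max_cnt) {
                    max_cnt = tasks[i].swap_cnt;
                    best = &tasks[i];
                }
            }
            if (best)
                return best;
        }
        return NULL;    /* nothing left to swap */
    }

    int main(void)
    {
        struct fake_task tasks[] = {
            { "editor", 300, 0 }, { "shell", 40, 0 }, { "daemon", 120, 0 },
        };

        for (int round = 0; round < 3; round++) {
            struct fake_task *t = select_victim(tasks, 3);

            if (!t)
                break;
            printf("round %d: page out from %s (swap_cnt %d)\n",
                   round, t->comm, t->swap_cnt);
            /* Pretend swap_out_process() took 100 pages from it. */
            t->swap_cnt = t->swap_cnt > 100 ? t->swap_cnt - 100 : 0;
            t->rss = t->rss > 100 ? t->rss - 100 : 0;
        }
        return 0;
    }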
/*
- * We are much more aggressive about trying to swap out than we used
- * to be. This works out OK, because we now do proper aging on page
- * contents.
+ * We need to make the locking finer-grained, but right
+ * now we need this so that we can do page allocations
+ * without holding the kernel lock etc.
+ *
+ * We want to try to free "count" pages, and we need to
+ * cluster them so that we get good swap-out behaviour. See
+ * the "free_memory()" macro for details.
*/
-static int do_try_to_free_page(int gfp_mask)
+static int do_try_to_free_pages(unsigned int gfp_mask)
{
- static int state = 0;
- int i=6;
+ int priority;
+ int count = SWAP_CLUSTER_MAX;
+
+ lock_kernel();
/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);
- if (buffer_over_borrow() || pgcache_over_borrow())
- state = 0;
+ priority = 6;
+ do {
+ while (shrink_mmap(priority, gfp_mask)) {
+ if (!--count)
+ goto done;
+ }
- switch (state) {
- do {
- case 0:
- if (shrink_mmap(i, gfp_mask))
- return 1;
- state = 1;
- case 1:
- if (shm_swap(i, gfp_mask))
- return 1;
- state = 2;
- case 2:
- if (swap_out(i, gfp_mask))
- return 1;
- state = 3;
- case 3:
- shrink_dcache_memory(i, gfp_mask);
- state = 0;
- i--;
- } while (i >= 0);
- }
- return 0;
+ /* Try to get rid of some shared memory pages.. */
+ if (gfp_mask & __GFP_IO) {
+ while (shm_swap(priority, gfp_mask)) {
+ if (!--count)
+ goto done;
+ }
+ }
+
+ /* Then, try to page stuff out.. */
+ while (swap_out(priority, gfp_mask)) {
+ if (!--count)
+ goto done;
+ }
+
+ shrink_dcache_memory(priority, gfp_mask);
+ } while (--priority >= 0);
+done:
+ unlock_kernel();
+
+ return priority >= 0;
}
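
do_try_to_free_pages() now walks a fixed priority ladder from 6 down to 0, trying the cheap reclaim first (shrink_mmap), then shared memory, then process swap-out, and stops as soon as SWAP_CLUSTER_MAX pages have been freed. A sketch of the loop's shape with the reclaimers stubbed out as simple budgets (the stubs are obviously not the kernel routines):

    #include <stdio.h>

    #define SWAP_CLUSTER_MAX 32

    /* Stub reclaimers standing in for shrink_mmap(), shm_swap() and
     * swap_out(): each "frees" one page per call until its budget is gone. */
    static int budget[3] = { 10, 5, 40 };

    static int reclaim(int which, int priority)
    {
        (void)priority;         /* the real routines scan harder at low priority */
        if (budget[which] > 0) {
            budget[which]--;
            return 1;
        }
        return 0;
    }

    static int do_try_to_free_pages(int can_do_io)
    {
        int priority = 6;
        int count = SWAP_CLUSTER_MAX;

        do {
            while (reclaim(0, priority))        /* page/buffer cache */
                if (!--count)
                    goto done;
            if (can_do_io)
                while (reclaim(1, priority))    /* shared memory */
                    if (!--count)
                        goto done;
            while (reclaim(2, priority))        /* swap out process pages */
                if (!--count)
                    goto done;
            /* shrink_dcache_memory(priority, ...) would go here */
        } while (--priority >= 0);
    done:
        return priority >= 0;   /* freed a full cluster before giving up? */
    }

    int main(void)
    {
        printf("freed a full cluster: %s\n",
               do_try_to_free_pages(1) ? "yes" : "no");
        return 0;
    }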
/*
@@ -467,6 +426,8 @@ void __init kswapd_setup(void)
int i;
char *revision="$Revision: 1.5 $", *s, *e;
+ swap_setup();
+
if ((s = strchr(revision, ':')) &&
(e = strchr(s, '$')))
s++, i = e - s;
@@ -475,35 +436,36 @@ void __init kswapd_setup(void)
printk ("Starting kswapd v%.*s\n", i, s);
}
+static struct task_struct *kswapd_process;
+
/*
- * The background pageout daemon.
- * Started as a kernel thread from the init process.
+ * The background pageout daemon, started as a kernel thread
+ * from the init process.
+ *
+ * This basically executes once a second, trickling out pages
+ * so that we have _some_ free memory available even if there
+ * is no other activity that frees anything up. This is needed
+ * for things like routing etc, where we otherwise might have
+ * all activity going on in asynchronous contexts that cannot
+ * page things out.
+ *
+ * If there are applications that are active memory-allocators
+ * (most normal use), this basically shouldn't matter.
*/
int kswapd(void *unused)
{
- current->session = 1;
- current->pgrp = 1;
- strcpy(current->comm, "kswapd");
- sigfillset(&current->blocked);
-
- /*
- * As a kernel thread we want to tamper with system buffers
- * and other internals and thus be subject to the SMP locking
- * rules. (On a uniprocessor box this does nothing).
- */
- lock_kernel();
-
- /*
- * Set the base priority to something smaller than a
- * regular process. We will scale up the priority
- * dynamically depending on how much memory we need.
- */
- current->priority = (DEF_PRIORITY * 2) / 3;
+ struct task_struct *tsk = current;
+ kswapd_process = tsk;
+ tsk->session = 1;
+ tsk->pgrp = 1;
+ strcpy(tsk->comm, "kswapd");
+ sigfillset(&tsk->blocked);
+
/*
* Tell the memory management that we're a "memory allocator",
* and that if we need more memory we should get access to it
- * regardless (see "try_to_free_pages()"). "kswapd" should
+ * regardless (see "__get_free_pages()"). "kswapd" should
* never get caught in the normal page freeing logic.
*
* (Kswapd normally doesn't need memory anyway, but sometimes
@@ -512,128 +474,52 @@ int kswapd(void *unused)
* us from recursively trying to free more memory as we're
* trying to free the first piece of memory in the first place).
*/
- current->flags |= PF_MEMALLOC;
+ tsk->flags |= PF_MEMALLOC;
- init_swap_timer();
- kswapd_task = current;
while (1) {
- unsigned long end_time;
-
- current->state = TASK_INTERRUPTIBLE;
- flush_signals(current);
- run_task_queue(&tq_disk);
- schedule();
- swapstats.wakeups++;
-
- /* max one hundreth of a second */
- end_time = jiffies + (HZ-1)/100;
+ /*
+ * Wake up once a second to see if we need to make
+ * more memory available.
+ *
+ * If we actually get into a low-memory situation,
+ * the processes needing more memory will wake us
+ * up on a more timely basis.
+ */
do {
- if (!do_try_to_free_page(0))
+ if (nr_free_pages >= freepages.high)
break;
- if (nr_free_pages > freepages.high + SWAP_CLUSTER_MAX)
+
+ if (!do_try_to_free_pages(GFP_KSWAPD))
break;
- } while (time_before_eq(jiffies,end_time));
+ } while (!tsk->need_resched);
+ run_task_queue(&tq_disk);
+ tsk->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(HZ);
}
- /* As if we could ever get here - maybe we want to make this killable */
- kswapd_task = NULL;
- unlock_kernel();
- return 0;
}
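
kswapd's new main loop trickles pages out roughly once a second: each pass keeps reclaiming until nr_free_pages reaches freepages.high (or the scheduler wants the CPU back), then sleeps for HZ. A user-space caricature of that loop, with sleep() standing in for schedule_timeout(HZ) and a plain counter standing in for the real memory state:

    #include <stdio.h>
    #include <unistd.h>

    static int nr_free_pages = 40;
    static const int freepages_high = 100;

    /* Stand-in for do_try_to_free_pages(): pretend each call frees a
     * few pages and report whether anything was reclaimed. */
    static int try_free(void)
    {
        nr_free_pages += 8;
        return 1;
    }

    int main(void)
    {
        for (int tick = 0; tick < 3; tick++) {  /* while (1) in the kernel */
            do {
                if (nr_free_pages >= freepages_high)
                    break;                      /* enough memory is free */
                if (!try_free())
                    break;                      /* nothing left to reclaim */
            } while (1);                        /* kernel: while (!tsk->need_resched) */

            printf("tick %d: %d pages free, sleeping\n", tick, nr_free_pages);
            sleep(1);                           /* schedule_timeout(HZ) */
        }
        return 0;
    }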
/*
- * We need to make the locks finer granularity, but right
- * now we need this so that we can do page allocations
- * without holding the kernel lock etc.
+ * Called by non-kswapd processes when they want more
+ * memory.
+ *
+ * In a perfect world, this should just wake up kswapd
+ * and return. We don't actually want to swap stuff out
+ * from user processes, because the locking issues are
+ * nasty to the extreme (file write locks, and MM locking)
*
- * The "PF_MEMALLOC" flag protects us against recursion:
- * if we need more memory as part of a swap-out effort we
- * will just silently return "success" to tell the page
- * allocator to accept the allocation.
+ * One option might be to let kswapd do all the page-out
+ * and VM page table scanning that needs locking, and this
+ * process thread could do just the mmap shrink stage that
+ * can be done by just dropping cached pages without having
+ * any deadlock issues.
*/
-int try_to_free_pages(unsigned int gfp_mask, int count)
+int try_to_free_pages(unsigned int gfp_mask)
{
int retval = 1;
- lock_kernel();
- if (!(current->flags & PF_MEMALLOC)) {
- current->flags |= PF_MEMALLOC;
- do {
- retval = do_try_to_free_page(gfp_mask);
- if (!retval)
- break;
- count--;
- } while (count > 0);
- current->flags &= ~PF_MEMALLOC;
- }
- unlock_kernel();
+ wake_up_process(kswapd_process);
+ if (gfp_mask & __GFP_WAIT)
+ retval = do_try_to_free_pages(gfp_mask);
return retval;
}
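
try_to_free_pages() itself is now deliberately thin: it always wakes kswapd, and only reclaims synchronously when the caller is allowed to sleep (__GFP_WAIT); atomic allocators just kick kswapd and carry on. A sketch of that split, with assumed flag values and stub helpers (the real __GFP_WAIT value and the wake_up_process() call are not reproduced here):

    #include <stdio.h>

    #define __GFP_WAIT  0x01    /* assumed flag values, for this sketch only */
    #define GFP_ATOMIC  0x00
    #define GFP_KERNEL  __GFP_WAIT

    static void wake_up_kswapd(void)
    {
        printf("kswapd woken\n");
    }

    static int do_try_to_free_pages(unsigned int gfp_mask)
    {
        printf("synchronous reclaim (gfp_mask 0x%x)\n", gfp_mask);
        return 1;
    }

    /* Thin front end: always kick kswapd, reclaim inline only if the
     * caller may sleep. */
    static int try_to_free_pages(unsigned int gfp_mask)
    {
        int retval = 1;

        wake_up_kswapd();
        if (gfp_mask & __GFP_WAIT)
            retval = do_try_to_free_pages(gfp_mask);
        return retval;
    }

    int main(void)
    {
        try_to_free_pages(GFP_ATOMIC);  /* atomic context: no inline reclaim */
        try_to_free_pages(GFP_KERNEL);  /* process context: reclaim synchronously */
        return 0;
    }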
-
-/*
- * Wake up kswapd according to the priority
- * 0 - no wakeup
- * 1 - wake up as a low-priority process
- * 2 - wake up as a normal process
- * 3 - wake up as an almost real-time process
- *
- * This plays mind-games with the "goodness()"
- * function in kernel/sched.c.
- */
-static inline void kswapd_wakeup(struct task_struct *p, int priority)
-{
- if (priority) {
- p->counter = p->priority << priority;
- wake_up_process(p);
- }
-}
-
-/*
- * The swap_tick function gets called on every clock tick.
- */
-void swap_tick(void)
-{
- struct task_struct *p = kswapd_task;
-
- /*
- * Only bother to try to wake kswapd up
- * if the task exists and can be woken.
- */
- if (p && (p->state & TASK_INTERRUPTIBLE)) {
- unsigned int pages;
- int want_wakeup;
-
- /*
- * Schedule for wakeup if there isn't lots
- * of free memory or if there is too much
- * of it used for buffers or pgcache.
- *
- * "want_wakeup" is our priority: 0 means
- * not to wake anything up, while 3 means
- * that we'd better give kswapd a realtime
- * priority.
- */
- want_wakeup = 0;
- pages = nr_free_pages;
- if (pages < freepages.high)
- want_wakeup = 1;
- if (pages < freepages.low)
- want_wakeup = 2;
- if (pages < freepages.min)
- want_wakeup = 3;
- kswapd_wakeup(p,want_wakeup);
- }
-
- timer_active |= (1<<SWAP_TIMER);
-}
-
-/*
- * Initialise the swap timer
- */
-
-void init_swap_timer(void)
-{
- timer_table[SWAP_TIMER].expires = jiffies;
- timer_table[SWAP_TIMER].fn = swap_tick;
- timer_active |= (1<<SWAP_TIMER);
-}