summary refs log tree commit diff stats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/filemap.c 18
-rw-r--r-- mm/memory.c 14
-rw-r--r-- mm/mmap.c 4
-rw-r--r-- mm/page_alloc.c 49
-rw-r--r-- mm/shmem.c 111
-rw-r--r-- mm/slab.c 2
-rw-r--r-- mm/swap.c 3
-rw-r--r-- mm/vmalloc.c 1
-rw-r--r-- mm/vmscan.c 383
9 files changed, 246 insertions, 339 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index dedd7911e..4c89ad3e9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -143,7 +143,8 @@ void __set_page_dirty(struct page *page)
list_add(&page->list, &mapping->dirty_pages);
spin_unlock(&pagecache_lock);
- mark_inode_dirty_pages(mapping->host);
+ if (mapping->host)
+ mark_inode_dirty_pages(mapping->host);
}
/**
@@ -306,7 +307,7 @@ inside:
*/
age_page_up(page);
if (inactive_shortage() > inactive_target / 2 && free_shortage())
- wakeup_kswapd(0);
+ wakeup_kswapd();
not_found:
return page;
}
@@ -974,10 +975,6 @@ static void generic_file_readahead(int reada_ok,
* accessed sequentially.
*/
if (ahead) {
- if (reada_ok == 2) {
- run_task_queue(&tq_disk);
- }
-
filp->f_ralen += ahead;
filp->f_rawin += filp->f_ralen;
filp->f_raend = raend + ahead + 1;
@@ -1835,7 +1832,8 @@ static long madvise_fixup_start(struct vm_area_struct * vma,
n->vm_end = end;
setup_read_behavior(n, behavior);
n->vm_raend = 0;
- get_file(n->vm_file);
+ if (n->vm_file)
+ get_file(n->vm_file);
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
lock_vma_mappings(vma);
@@ -1861,7 +1859,8 @@ static long madvise_fixup_end(struct vm_area_struct * vma,
n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
setup_read_behavior(n, behavior);
n->vm_raend = 0;
- get_file(n->vm_file);
+ if (n->vm_file)
+ get_file(n->vm_file);
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
lock_vma_mappings(vma);
@@ -1893,7 +1892,8 @@ static long madvise_fixup_middle(struct vm_area_struct * vma,
right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
left->vm_raend = 0;
right->vm_raend = 0;
- atomic_add(2, &vma->vm_file->f_count);
+ if (vma->vm_file)
+ atomic_add(2, &vma->vm_file->f_count);
if (vma->vm_ops && vma->vm_ops->open) {
vma->vm_ops->open(left);
diff --git a/mm/memory.c b/mm/memory.c
index 6f1f318a3..7fc8de5eb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -207,7 +207,8 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
src_pte = pte_offset(src_pmd, address);
dst_pte = pte_offset(dst_pmd, address);
-
+
+ spin_lock(&src->page_table_lock);
do {
pte_t pte = *src_pte;
struct page *ptepage;
@@ -240,10 +241,11 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
cont_copy_pte_range: set_pte(dst_pte, pte);
cont_copy_pte_range_noset: address += PAGE_SIZE;
if (address >= end)
- goto out;
+ goto out_unlock;
src_pte++;
dst_pte++;
} while ((unsigned long)src_pte & PTE_TABLE_MASK);
+ spin_unlock(&src->page_table_lock);
cont_copy_pmd_range: src_pmd++;
dst_pmd++;
@@ -252,6 +254,10 @@ cont_copy_pmd_range: src_pmd++;
out:
return 0;
+out_unlock:
+ spin_unlock(&src->page_table_lock);
+ return 0;
+
nomem:
return -ENOMEM;
}
@@ -939,7 +945,6 @@ void vmtruncate(struct inode * inode, loff_t offset)
if (inode->i_size < offset)
goto do_expand;
inode->i_size = offset;
- truncate_inode_pages(mapping, offset);
spin_lock(&mapping->i_shared_lock);
if (!mapping->i_mmap && !mapping->i_mmap_shared)
goto out_unlock;
@@ -954,8 +959,7 @@ void vmtruncate(struct inode * inode, loff_t offset)
out_unlock:
spin_unlock(&mapping->i_shared_lock);
- /* this should go into ->truncate */
- inode->i_size = offset;
+ truncate_inode_pages(mapping, offset);
if (inode->i_op && inode->i_op->truncate)
inode->i_op->truncate(inode);
return;
diff --git a/mm/mmap.c b/mm/mmap.c
index e5b3a989e..e1faba3c7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -883,6 +883,8 @@ void exit_mmap(struct mm_struct * mm)
mm->rss = 0;
mm->total_vm = 0;
mm->locked_vm = 0;
+
+ flush_cache_mm(mm);
while (mpnt) {
struct vm_area_struct * next = mpnt->vm_next;
unsigned long start = mpnt->vm_start;
@@ -895,13 +897,13 @@ void exit_mmap(struct mm_struct * mm)
}
mm->map_count--;
remove_shared_vm_struct(mpnt);
- flush_cache_range(mm, start, end);
zap_page_range(mm, start, size);
if (mpnt->vm_file)
fput(mpnt->vm_file);
kmem_cache_free(vm_area_cachep, mpnt);
mpnt = next;
}
+ flush_tlb_mm(mm);
/* This is just debugging */
if (mm->map_count)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b67aa4913..09ac27284 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -16,6 +16,7 @@
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
+#include <linux/slab.h>
int nr_swap_pages;
int nr_active_pages;
@@ -303,7 +304,7 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
* an inactive page shortage, wake up kswapd.
*/
if (inactive_shortage() > inactive_target / 2 && free_shortage())
- wakeup_kswapd(0);
+ wakeup_kswapd();
/*
* If we are about to get low on free pages and cleaning
* the inactive_dirty pages would fix the situation,
@@ -379,7 +380,7 @@ try_again:
* - if we don't have __GFP_IO set, kswapd may be
* able to free some memory we can't free ourselves
*/
- wakeup_kswapd(0);
+ wakeup_kswapd();
if (gfp_mask & __GFP_WAIT) {
__set_current_state(TASK_RUNNING);
current->policy |= SCHED_YIELD;
@@ -404,7 +405,7 @@ try_again:
* - we're doing a higher-order allocation
* --> move pages to the free list until we succeed
* - we're /really/ tight on memory
- * --> wait on the kswapd waitqueue until memory is freed
+ * --> try to free pages ourselves with page_launder
*/
if (!(current->flags & PF_MEMALLOC)) {
/*
@@ -443,36 +444,20 @@ try_again:
/*
* When we arrive here, we are really tight on memory.
*
- * We wake up kswapd and sleep until kswapd wakes us
- * up again. After that we loop back to the start.
- *
- * We have to do this because something else might eat
- * the memory kswapd frees for us and we need to be
- * reliable. Note that we don't loop back for higher
- * order allocations since it is possible that kswapd
- * simply cannot free a large enough contiguous area
- * of memory *ever*.
+ * We try to free pages ourselves by:
+ * - shrinking the i/d caches.
+ * - reclaiming unused memory from the slab caches.
+ * - swapping/syncing pages to disk (done by page_launder)
+ * - moving clean pages from the inactive dirty list to
+ * the inactive clean list. (done by page_launder)
*/
- if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {
- wakeup_kswapd(1);
+ if (gfp_mask & __GFP_WAIT) {
memory_pressure++;
- if (!order)
- goto try_again;
- /*
- * If __GFP_IO isn't set, we can't wait on kswapd because
- * kswapd just might need some IO locks /we/ are holding ...
- *
- * SUBTLE: The scheduling point above makes sure that
- * kswapd does get the chance to free memory we can't
- * free ourselves...
- */
- } else if (gfp_mask & __GFP_WAIT) {
try_to_free_pages(gfp_mask);
- memory_pressure++;
+ wakeup_bdflush(0);
if (!order)
goto try_again;
}
-
}
/*
@@ -554,14 +539,8 @@ void __free_pages(struct page *page, unsigned long order)
void free_pages(unsigned long addr, unsigned long order)
{
- struct page *fpage;
-
-#ifdef CONFIG_DISCONTIGMEM
- if (addr == 0) return;
-#endif
- fpage = virt_to_page(addr);
- if (VALID_PAGE(fpage))
- __free_pages(fpage, order);
+ if (addr != 0)
+ __free_pages(virt_to_page(addr), order);
}
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index a81a74659..00426ca27 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -117,11 +117,43 @@ shmem_truncate_part (swp_entry_t * dir, unsigned long size,
return 0;
}
+/*
+ * shmem_recalc_inode - recalculate the size of an inode
+ *
+ * @inode: inode to recalc
+ *
+ * We have to calculate the free blocks since the mm can drop pages
+ * behind our back
+ *
+ * But we know that normally
+ * inodes->i_blocks == inode->i_mapping->nrpages + info->swapped
+ *
+ * So the mm freed
+ * inodes->i_blocks - (inode->i_mapping->nrpages + info->swapped)
+ *
+ * It has to be called with the spinlock held.
+ */
+
+static void shmem_recalc_inode(struct inode * inode)
+{
+ unsigned long freed;
+
+ freed = inode->i_blocks -
+ (inode->i_mapping->nrpages + inode->u.shmem_i.swapped);
+ if (freed){
+ struct shmem_sb_info * info = &inode->i_sb->u.shmem_sb;
+ inode->i_blocks -= freed;
+ spin_lock (&info->stat_lock);
+ info->free_blocks += freed;
+ spin_unlock (&info->stat_lock);
+ }
+}
+
static void shmem_truncate (struct inode * inode)
{
int clear_base;
unsigned long start;
- unsigned long mmfreed, freed = 0;
+ unsigned long freed = 0;
swp_entry_t **base, **ptr;
struct shmem_inode_info * info = &inode->u.shmem_i;
@@ -154,26 +186,9 @@ static void shmem_truncate (struct inode * inode)
info->i_indirect = 0;
out:
-
- /*
- * We have to calculate the free blocks since we do not know
- * how many pages the mm discarded
- *
- * But we know that normally
- * inodes->i_blocks == inode->i_mapping->nrpages + info->swapped
- *
- * So the mm freed
- * inodes->i_blocks - (inode->i_mapping->nrpages + info->swapped)
- */
-
- mmfreed = inode->i_blocks - (inode->i_mapping->nrpages + info->swapped);
info->swapped -= freed;
- inode->i_blocks -= freed + mmfreed;
+ shmem_recalc_inode(inode);
spin_unlock (&info->lock);
-
- spin_lock (&inode->i_sb->u.shmem_sb.stat_lock);
- inode->i_sb->u.shmem_sb.free_blocks += freed + mmfreed;
- spin_unlock (&inode->i_sb->u.shmem_sb.stat_lock);
}
static void shmem_delete_inode(struct inode * inode)
@@ -201,13 +216,15 @@ static int shmem_writepage(struct page * page)
swp_entry_t *entry, swap;
info = &page->mapping->host->u.shmem_i;
- if (info->locked)
- return 1;
swap = __get_swap_page(2);
- if (!swap.val)
- return 1;
+ if (!swap.val) {
+ set_page_dirty(page);
+ UnlockPage(page);
+ return -ENOMEM;
+ }
spin_lock(&info->lock);
+ shmem_recalc_inode(page->mapping->host);
entry = shmem_swp_entry (info, page->index);
if (!entry) /* this had been allocted on page allocation */
BUG();
@@ -269,6 +286,9 @@ struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, i
entry = shmem_swp_entry (info, idx);
if (!entry)
goto oom;
+ spin_lock (&info->lock);
+ shmem_recalc_inode(inode);
+ spin_unlock (&info->lock);
if (entry->val) {
unsigned long flags;
@@ -310,6 +330,8 @@ struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, i
}
/* We have the page */
SetPageUptodate (page);
+ if (info->locked)
+ page_cache_get(page);
cached_page:
UnlockPage (page);
@@ -374,8 +396,7 @@ struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev)
inode->i_fop = &shmem_dir_operations;
break;
case S_IFLNK:
- inode->i_op = &page_symlink_inode_operations;
- break;
+ BUG();
}
spin_lock (&shmem_ilock);
list_add (&inode->u.shmem_i.list, &shmem_inodes);
@@ -401,6 +422,32 @@ static int shmem_statfs(struct super_block *sb, struct statfs *buf)
return 0;
}
+void shmem_lock(struct file * file, int lock)
+{
+ struct inode * inode = file->f_dentry->d_inode;
+ struct shmem_inode_info * info = &inode->u.shmem_i;
+ struct page * page;
+ unsigned long idx, size;
+
+ if (info->locked == lock)
+ return;
+ down(&inode->i_sem);
+ info->locked = lock;
+ size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ for (idx = 0; idx < size; idx++) {
+ page = find_lock_page(inode->i_mapping, idx);
+ if (!page)
+ continue;
+ if (!lock) {
+ /* release the extra count and our reference */
+ page_cache_release(page);
+ page_cache_release(page);
+ }
+ UnlockPage(page);
+ }
+ up(&inode->i_sem);
+}
+
/*
* Lookup the data. This is trivial - if the dentry didn't already
* exist, we know it is negative.
@@ -528,19 +575,6 @@ static int shmem_rename(struct inode * old_dir, struct dentry *old_dentry, struc
return error;
}
-static int shmem_symlink(struct inode * dir, struct dentry *dentry, const char * symname)
-{
- int error;
-
- error = shmem_mknod(dir, dentry, S_IFLNK | S_IRWXUGO, 0);
- if (!error) {
- int l = strlen(symname)+1;
- struct inode *inode = dentry->d_inode;
- error = block_symlink(inode, symname, l);
- }
- return error;
-}
-
static int shmem_mmap(struct file * file, struct vm_area_struct * vma)
{
struct vm_operations_struct * ops;
@@ -677,7 +711,6 @@ static struct inode_operations shmem_dir_inode_operations = {
lookup: shmem_lookup,
link: shmem_link,
unlink: shmem_unlink,
- symlink: shmem_symlink,
mkdir: shmem_mkdir,
rmdir: shmem_rmdir,
mknod: shmem_mknod,
diff --git a/mm/slab.c b/mm/slab.c
index b3bd852d1..f6f8be1db 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1702,7 +1702,7 @@ static void enable_all_cpucaches (void)
* kmem_cache_reap - Reclaim memory from caches.
* @gfp_mask: the type of memory required.
*
- * Called from try_to_free_page().
+ * Called from do_try_to_free_pages() and __alloc_pages()
*/
void kmem_cache_reap (int gfp_mask)
{
diff --git a/mm/swap.c b/mm/swap.c
index 693773ccd..b1a6640bc 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -30,8 +30,7 @@
* start background swapping if we fall below freepages.high free
* pages, and we begin intensive swapping below freepages.low.
*
- * Actual initialization is done in mm/page_alloc.c or
- * arch/sparc(64)/mm/init.c.
+ * Actual initialization is done in mm/page_alloc.c
*/
freepages_t freepages = {
0, /* freepages.min */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 62ce5f1ff..93edab662 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -9,6 +9,7 @@
#include <linux/malloc.h>
#include <linux/vmalloc.h>
#include <linux/spinlock.h>
+#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/uaccess.h>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index afa5261c1..f41c53328 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -35,45 +35,21 @@
* using a process that no longer actually exists (it might
* have died while we slept).
*/
-static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
+static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page)
{
pte_t pte;
swp_entry_t entry;
- struct page * page;
- int onlist;
-
- pte = *page_table;
- if (!pte_present(pte))
- goto out_failed;
- page = pte_page(pte);
- if ((!VALID_PAGE(page)) || PageReserved(page))
- goto out_failed;
-
- if (!mm->swap_cnt)
- return 1;
-
- mm->swap_cnt--;
- onlist = PageActive(page);
/* Don't look at this pte if it's been accessed recently. */
if (ptep_test_and_clear_young(page_table)) {
- age_page_up(page);
- goto out_failed;
+ page->age += PAGE_AGE_ADV;
+ if (page->age > PAGE_AGE_MAX)
+ page->age = PAGE_AGE_MAX;
+ return;
}
- if (!onlist)
- /* The page is still mapped, so it can't be freeable... */
- age_page_down_ageonly(page);
-
- /*
- * If the page is in active use by us, or if the page
- * is in active use by others, don't unmap it or
- * (worse) start unneeded IO.
- */
- if (page->age > 0)
- goto out_failed;
if (TryLockPage(page))
- goto out_failed;
+ return;
/* From this point on, the odds are that we're going to
* nuke this pte, so read and clear the pte. This hook
@@ -87,9 +63,6 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un
* Is the page already in the swap cache? If so, then
* we can just drop our reference to it without doing
* any IO - it's already up-to-date on disk.
- *
- * Return 0, as we didn't actually free any real
- * memory, and we should just continue our scan.
*/
if (PageSwapCache(page)) {
entry.val = page->index;
@@ -99,12 +72,12 @@ set_swap_pte:
swap_duplicate(entry);
set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
- UnlockPage(page);
mm->rss--;
- deactivate_page(page);
+ if (!page->age)
+ deactivate_page(page);
+ UnlockPage(page);
page_cache_release(page);
-out_failed:
- return 0;
+ return;
}
/*
@@ -153,34 +126,20 @@ out_failed:
out_unlock_restore:
set_pte(page_table, pte);
UnlockPage(page);
- return 0;
+ return;
}
-/*
- * A new implementation of swap_out(). We do not swap complete processes,
- * but only a small number of blocks, before we continue with the next
- * process. The number of blocks actually swapped is determined on the
- * number of page faults, that this process actually had in the last time,
- * so we won't swap heavily used processes all the time ...
- *
- * Note: the priority argument is a hint on much CPU to waste with the
- * swap block search, not a hint, of how much blocks to swap with
- * each process.
- *
- * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
- */
-
-static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count)
{
pte_t * pte;
unsigned long pmd_end;
if (pmd_none(*dir))
- return 0;
+ return count;
if (pmd_bad(*dir)) {
pmd_ERROR(*dir);
pmd_clear(dir);
- return 0;
+ return count;
}
pte = pte_offset(dir, address);
@@ -190,28 +149,33 @@ static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vm
end = pmd_end;
do {
- int result;
- mm->swap_address = address + PAGE_SIZE;
- result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
- if (result)
- return result;
+ if (pte_present(*pte)) {
+ struct page *page = pte_page(*pte);
+
+ if (VALID_PAGE(page) && !PageReserved(page)) {
+ try_to_swap_out(mm, vma, address, pte, page);
+ if (!--count)
+ break;
+ }
+ }
address += PAGE_SIZE;
pte++;
} while (address && (address < end));
- return 0;
+ mm->swap_address = address + PAGE_SIZE;
+ return count;
}
-static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count)
{
pmd_t * pmd;
unsigned long pgd_end;
if (pgd_none(*dir))
- return 0;
+ return count;
if (pgd_bad(*dir)) {
pgd_ERROR(*dir);
pgd_clear(dir);
- return 0;
+ return count;
}
pmd = pmd_offset(dir, address);
@@ -221,23 +185,23 @@ static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vm
end = pgd_end;
do {
- int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
- if (result)
- return result;
+ count = swap_out_pmd(mm, vma, pmd, address, end, count);
+ if (!count)
+ break;
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address && (address < end));
- return 0;
+ return count;
}
-static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
+static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count)
{
pgd_t *pgdir;
unsigned long end;
/* Don't swap out areas which are locked down */
if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
- return 0;
+ return count;
pgdir = pgd_offset(mm, address);
@@ -245,18 +209,17 @@ static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsi
if (address >= end)
BUG();
do {
- int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
- if (result)
- return result;
+ count = swap_out_pgd(mm, vma, pgdir, address, end, count);
+ if (!count)
+ break;
address = (address + PGDIR_SIZE) & PGDIR_MASK;
pgdir++;
} while (address && (address < end));
- return 0;
+ return count;
}
-static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
+static int swap_out_mm(struct mm_struct * mm, int count)
{
- int result = 0;
unsigned long address;
struct vm_area_struct* vma;
@@ -276,8 +239,8 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
address = vma->vm_start;
for (;;) {
- result = swap_out_vma(mm, vma, address, gfp_mask);
- if (result)
+ count = swap_out_vma(mm, vma, address, count);
+ if (!count)
goto out_unlock;
vma = vma->vm_next;
if (!vma)
@@ -287,94 +250,63 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
}
/* Reset to 0 when we reach the end of address space */
mm->swap_address = 0;
- mm->swap_cnt = 0;
out_unlock:
spin_unlock(&mm->page_table_lock);
- return result;
+ return !count;
}
/*
- * Select the task with maximal swap_cnt and try to swap out a page.
* N.B. This function returns only 0 or 1. Return values != 1 from
* the lower level routines result in continued processing.
*/
#define SWAP_SHIFT 5
#define SWAP_MIN 8
+static inline int swap_amount(struct mm_struct *mm)
+{
+ int nr = mm->rss >> SWAP_SHIFT;
+ return nr < SWAP_MIN ? SWAP_MIN : nr;
+}
+
static int swap_out(unsigned int priority, int gfp_mask)
{
int counter;
- int __ret = 0;
-
- /*
- * We make one or two passes through the task list, indexed by
- * assign = {0, 1}:
- * Pass 1: select the swappable task with maximal RSS that has
- * not yet been swapped out.
- * Pass 2: re-assign rss swap_cnt values, then select as above.
- *
- * With this approach, there's no need to remember the last task
- * swapped out. If the swap-out fails, we clear swap_cnt so the
- * task won't be selected again until all others have been tried.
- *
- * Think of swap_cnt as a "shadow rss" - it tells us which process
- * we want to page out (always try largest first).
- */
- counter = (nr_threads << SWAP_SHIFT) >> priority;
- if (counter < 1)
- counter = 1;
+ int retval = 0;
+ struct mm_struct *mm = current->mm;
- for (; counter >= 0; counter--) {
+ /* Always start by trying to penalize the process that is allocating memory */
+ if (mm)
+ retval = swap_out_mm(mm, swap_amount(mm));
+
+ /* Then, look at the other mm's */
+ counter = mmlist_nr >> priority;
+ do {
struct list_head *p;
- unsigned long max_cnt = 0;
- struct mm_struct *best = NULL;
- int assign = 0;
- int found_task = 0;
- select:
+
spin_lock(&mmlist_lock);
p = init_mm.mmlist.next;
- for (; p != &init_mm.mmlist; p = p->next) {
- struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist);
- if (mm->rss <= 0)
- continue;
- found_task++;
- /* Refresh swap_cnt? */
- if (assign == 1) {
- mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
- if (mm->swap_cnt < SWAP_MIN)
- mm->swap_cnt = SWAP_MIN;
- }
- if (mm->swap_cnt > max_cnt) {
- max_cnt = mm->swap_cnt;
- best = mm;
- }
- }
+ if (p == &init_mm.mmlist)
+ goto empty;
+
+ /* Move it to the back of the queue.. */
+ list_del(p);
+ list_add_tail(p, &init_mm.mmlist);
+ mm = list_entry(p, struct mm_struct, mmlist);
- /* Make sure it doesn't disappear */
- if (best)
- atomic_inc(&best->mm_users);
+ /* Make sure the mm doesn't disappear when we drop the lock.. */
+ atomic_inc(&mm->mm_users);
spin_unlock(&mmlist_lock);
- /*
- * We have dropped the tasklist_lock, but we
- * know that "mm" still exists: we are running
- * with the big kernel lock, and exit_mm()
- * cannot race with us.
- */
- if (!best) {
- if (!assign && found_task > 0) {
- assign = 1;
- goto select;
- }
- break;
- } else {
- __ret = swap_out_mm(best, gfp_mask);
- mmput(best);
- break;
- }
- }
- return __ret;
+ /* Walk about 6% of the address space each time */
+ retval |= swap_out_mm(mm, swap_amount(mm));
+ mmput(mm);
+ } while (--counter >= 0);
+ return retval;
+
+empty:
+ spin_unlock(&mmlist_lock);
+ return 0;
}
@@ -540,7 +472,6 @@ dirty_page_rescan:
*/
if (PageDirty(page)) {
int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
- int result;
if (!writepage)
goto page_active;
@@ -558,16 +489,12 @@ dirty_page_rescan:
page_cache_get(page);
spin_unlock(&pagemap_lru_lock);
- result = writepage(page);
+ writepage(page);
page_cache_release(page);
/* And re-start the thing.. */
spin_lock(&pagemap_lru_lock);
- if (result != 1)
- continue;
- /* writepage refused to do anything */
- set_page_dirty(page);
- goto page_active;
+ continue;
}
/*
@@ -808,6 +735,9 @@ int free_shortage(void)
int inactive_shortage(void)
{
int shortage = 0;
+ pg_data_t *pgdat = pgdat_list;
+
+ /* Is the inactive dirty list too small? */
shortage += freepages.high;
shortage += inactive_target;
@@ -818,7 +748,27 @@ int inactive_shortage(void)
if (shortage > 0)
return shortage;
- return 0;
+ /* If not, do we have enough per-zone pages on the inactive list? */
+
+ shortage = 0;
+
+ do {
+ int i;
+ for(i = 0; i < MAX_NR_ZONES; i++) {
+ int zone_shortage;
+ zone_t *zone = pgdat->node_zones+ i;
+
+ zone_shortage = zone->pages_high;
+ zone_shortage -= zone->inactive_dirty_pages;
+ zone_shortage -= zone->inactive_clean_pages;
+ zone_shortage -= zone->free_pages;
+ if (zone_shortage > 0)
+ shortage += zone_shortage;
+ }
+ pgdat = pgdat->node_next;
+ } while (pgdat);
+
+ return shortage;
}
/*
@@ -833,72 +783,35 @@ int inactive_shortage(void)
* really care about latency. In that case we don't try
* to free too many pages.
*/
+#define DEF_PRIORITY (6)
static int refill_inactive(unsigned int gfp_mask, int user)
{
- int priority, count, start_count, made_progress;
+ int count, start_count, maxtry;
count = inactive_shortage() + free_shortage();
if (user)
count = (1 << page_cluster);
start_count = count;
- /* Always trim SLAB caches when memory gets low. */
- kmem_cache_reap(gfp_mask);
-
- priority = 6;
+ maxtry = 6;
do {
- made_progress = 0;
-
if (current->need_resched) {
__set_current_state(TASK_RUNNING);
schedule();
}
- while (refill_inactive_scan(priority, 1)) {
- made_progress = 1;
- if (--count <= 0)
- goto done;
- }
-
- /*
- * don't be too light against the d/i cache since
- * refill_inactive() almost never fail when there's
- * really plenty of memory free.
- */
- shrink_dcache_memory(priority, gfp_mask);
- shrink_icache_memory(priority, gfp_mask);
-
- /*
- * Then, try to page stuff out..
- */
- while (swap_out(priority, gfp_mask)) {
- made_progress = 1;
+ while (refill_inactive_scan(DEF_PRIORITY, 1)) {
if (--count <= 0)
goto done;
}
- /*
- * If we either have enough free memory, or if
- * page_launder() will be able to make enough
- * free memory, then stop.
- */
- if (!inactive_shortage() || !free_shortage())
- goto done;
+ /* If refill_inactive_scan failed, try to page stuff out.. */
+ swap_out(DEF_PRIORITY, gfp_mask);
- /*
- * Only switch to a lower "priority" if we
- * didn't make any useful progress in the
- * last loop.
- */
- if (!made_progress)
- priority--;
- } while (priority >= 0);
-
- /* Always end on a refill_inactive.., may sleep... */
- while (refill_inactive_scan(0, 1)) {
- if (--count <= 0)
- goto done;
- }
+ if (--maxtry <= 0)
+ return 0;
+
+ } while (inactive_shortage());
done:
return (count < start_count);
@@ -922,20 +835,29 @@ static int do_try_to_free_pages(unsigned int gfp_mask, int user)
/*
* If needed, we move pages from the active list
- * to the inactive list. We also "eat" pages from
- * the inode and dentry cache whenever we do this.
+ * to the inactive list.
*/
- if (free_shortage() || inactive_shortage()) {
- shrink_dcache_memory(6, gfp_mask);
- shrink_icache_memory(6, gfp_mask);
+ if (inactive_shortage())
ret += refill_inactive(gfp_mask, user);
+
+ /*
+ * Delete pages from the inode and dentry caches and
+ * reclaim unused slab cache if memory is low.
+ */
+ if (free_shortage()) {
+ shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
+ shrink_icache_memory(DEF_PRIORITY, gfp_mask);
} else {
/*
- * Reclaim unused slab cache memory.
+ * Illogical, but true. At least for now.
+ *
+ * If we're _not_ under shortage any more, we
+ * reap the caches. Why? Because a noticeable
+ * part of the caches are the buffer-heads,
+ * which we'll want to keep if under shortage.
*/
kmem_cache_reap(gfp_mask);
- ret = 1;
- }
+ }
return ret;
}
@@ -988,13 +910,8 @@ int kswapd(void *unused)
static int recalc = 0;
/* If needed, try to free some memory. */
- if (inactive_shortage() || free_shortage()) {
- int wait = 0;
- /* Do we need to do some synchronous flushing? */
- if (waitqueue_active(&kswapd_done))
- wait = 1;
- do_try_to_free_pages(GFP_KSWAPD, wait);
- }
+ if (inactive_shortage() || free_shortage())
+ do_try_to_free_pages(GFP_KSWAPD, 0);
/*
* Do some (very minimal) background scanning. This
@@ -1002,7 +919,7 @@ int kswapd(void *unused)
* every minute. This clears old referenced bits
* and moves unused pages to the inactive list.
*/
- refill_inactive_scan(6, 0);
+ refill_inactive_scan(DEF_PRIORITY, 0);
/* Once a second, recalculate some VM stats. */
if (time_after(jiffies, recalc + HZ)) {
@@ -1010,11 +927,6 @@ int kswapd(void *unused)
recalculate_vm_stats();
}
- /*
- * Wake up everybody waiting for free memory
- * and unplug the disk queue.
- */
- wake_up_all(&kswapd_done);
run_task_queue(&tq_disk);
/*
@@ -1045,33 +957,10 @@ int kswapd(void *unused)
}
}
-void wakeup_kswapd(int block)
+void wakeup_kswapd(void)
{
- DECLARE_WAITQUEUE(wait, current);
-
- if (current == kswapd_task)
- return;
-
- if (!block) {
- if (waitqueue_active(&kswapd_wait))
- wake_up(&kswapd_wait);
- return;
- }
-
- /*
- * Kswapd could wake us up before we get a chance
- * to sleep, so we have to be very careful here to
- * prevent SMP races...
- */
- __set_current_state(TASK_UNINTERRUPTIBLE);
- add_wait_queue(&kswapd_done, &wait);
-
- if (waitqueue_active(&kswapd_wait))
- wake_up(&kswapd_wait);
- schedule();
-
- remove_wait_queue(&kswapd_done, &wait);
- __set_current_state(TASK_RUNNING);
+ if (current != kswapd_task)
+ wake_up_process(kswapd_task);
}
/*
@@ -1096,7 +985,7 @@ DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
/*
* Kreclaimd will move pages from the inactive_clean list to the
* free list, in order to keep atomic allocations possible under
- * all circumstances. Even when kswapd is blocked on IO.
+ * all circumstances.
*/
int kreclaimd(void *unused)
{