author    Ralf Baechle <ralf@linux-mips.org>    2000-10-05 01:18:40 +0000
committer Ralf Baechle <ralf@linux-mips.org>    2000-10-05 01:18:40 +0000
commit    012bb3e61e5eced6c610f9e036372bf0c8def2d1 (patch)
tree      87efc733f9b164e8c85c0336f92c8fb7eff6d183 /mm
parent    625a1589d3d6464b5d90b8a0918789e3afffd220 (diff)
Merge with Linux 2.4.0-test9. Please check DECstation; I had a number
of rejects to fix up while integrating Linus' patches. I also found that this kernel will only boot SMP on Origin; the UP kernel freezes soon after bootup with SCSI timeout messages. I'm committing this anyway, since I found that the last CVS versions had the same problem.
Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c     303
-rw-r--r--  mm/memory.c       71
-rw-r--r--  mm/mmap.c         14
-rw-r--r--  mm/mremap.c        4
-rw-r--r--  mm/numa.c          8
-rw-r--r--  mm/page_alloc.c  400
-rw-r--r--  mm/page_io.c       3
-rw-r--r--  mm/slab.c        126
-rw-r--r--  mm/swap.c        254
-rw-r--r--  mm/swap_state.c    6
-rw-r--r--  mm/vmalloc.c       2
-rw-r--r--  mm/vmscan.c      821
12 files changed, 1480 insertions(+), 532 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 977225432..6aca16409 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -44,9 +44,8 @@
atomic_t page_cache_size = ATOMIC_INIT(0);
unsigned int page_hash_bits;
struct page **page_hash_table;
-struct list_head lru_cache;
-static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
/*
* NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
* the pagemap_lru_lock held.
@@ -92,7 +91,7 @@ static inline int sync_page(struct page *page)
* sure the page is locked and that nobody else uses it - or that usage
* is safe.
*/
-static inline void __remove_inode_page(struct page *page)
+void __remove_inode_page(struct page *page)
{
remove_page_from_inode_queue(page);
remove_page_from_hash_queue(page);
@@ -146,9 +145,40 @@ void invalidate_inode_pages(struct inode * inode)
spin_unlock(&pagecache_lock);
}
-/*
+static inline void truncate_partial_page(struct page *page, unsigned partial)
+{
+ memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
+
+ if (page->buffers)
+ block_flushpage(page, partial);
+
+}
+
+static inline void truncate_complete_page(struct page *page)
+{
+ if (!page->buffers || block_flushpage(page, 0))
+ lru_cache_del(page);
+
+ /*
+ * We remove the page from the page cache _after_ we have
+ * destroyed all buffer-cache references to it. Otherwise some
+ * other process might think this inode page is not in the
+ * page cache and creates a buffer-cache alias to it causing
+ * all sorts of fun problems ...
+ */
+ ClearPageDirty(page);
+ remove_inode_page(page);
+ page_cache_release(page);
+}
+
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
* Truncate the page cache at a set offset, removing the pages
* that are beyond that offset (and zeroing out partial pages).
+ * If any page is locked we wait for it to become unlocked.
*/
void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
{
@@ -168,11 +198,10 @@ repeat:
page = list_entry(curr, struct page, list);
curr = curr->next;
-
offset = page->index;
- /* page wholly truncated - free it */
- if (offset >= start) {
+	/* Is this one of the pages to truncate? */
+ if ((offset >= start) || (partial && (offset + 1) == start)) {
if (TryLockPage(page)) {
page_cache_get(page);
spin_unlock(&pagecache_lock);
@@ -183,23 +212,14 @@ repeat:
page_cache_get(page);
spin_unlock(&pagecache_lock);
- if (!page->buffers || block_flushpage(page, 0))
- lru_cache_del(page);
-
- /*
- * We remove the page from the page cache
- * _after_ we have destroyed all buffer-cache
- * references to it. Otherwise some other process
- * might think this inode page is not in the
- * page cache and creates a buffer-cache alias
- * to it causing all sorts of fun problems ...
- */
- remove_inode_page(page);
- ClearPageDirty(page);
+ if (partial && (offset + 1) == start) {
+ truncate_partial_page(page, partial);
+ partial = 0;
+ } else
+ truncate_complete_page(page);
UnlockPage(page);
page_cache_release(page);
- page_cache_release(page);
/*
* We have done things without the pagecache lock,
@@ -210,176 +230,10 @@ repeat:
*/
goto repeat;
}
- /*
- * there is only one partial page possible.
- */
- if (!partial)
- continue;
-
- /* and it's the one preceeding the first wholly truncated page */
- if ((offset + 1) != start)
- continue;
-
- /* partial truncate, clear end of page */
- if (TryLockPage(page)) {
- spin_unlock(&pagecache_lock);
- goto repeat;
- }
- page_cache_get(page);
- spin_unlock(&pagecache_lock);
-
- memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
- if (page->buffers)
- block_flushpage(page, partial);
-
- partial = 0;
-
- /*
- * we have dropped the spinlock so we have to
- * restart.
- */
- UnlockPage(page);
- page_cache_release(page);
- goto repeat;
}
spin_unlock(&pagecache_lock);
}
-/*
- * nr_dirty represents the number of dirty pages that we will write async
- * before doing sync writes. We can only do sync writes if we can
- * wait for IO (__GFP_IO set).
- */
-int shrink_mmap(int priority, int gfp_mask)
-{
- int ret = 0, count, nr_dirty;
- struct list_head * page_lru;
- struct page * page = NULL;
-
- count = nr_lru_pages / (priority + 1);
- nr_dirty = priority;
-
- /* we need pagemap_lru_lock for list_del() ... subtle code below */
- spin_lock(&pagemap_lru_lock);
- while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
- page = list_entry(page_lru, struct page, lru);
- list_del(page_lru);
-
- if (PageTestandClearReferenced(page))
- goto dispose_continue;
-
- count--;
- /*
- * Avoid unscalable SMP locking for pages we can
- * immediate tell are untouchable..
- */
- if (!page->buffers && page_count(page) > 1)
- goto dispose_continue;
-
- if (TryLockPage(page))
- goto dispose_continue;
-
- /* Release the pagemap_lru lock even if the page is not yet
- queued in any lru queue since we have just locked down
- the page so nobody else may SMP race with us running
- a lru_cache_del() (lru_cache_del() always run with the
- page locked down ;). */
- spin_unlock(&pagemap_lru_lock);
-
- /* avoid freeing the page while it's locked */
- page_cache_get(page);
-
- /*
- * Is it a buffer page? Try to clean it up regardless
- * of zone - it's old.
- */
- if (page->buffers) {
- int wait;
- /*
- * 0 - free it if can do so without IO
- * 1 - start write-out of dirty buffers
- * 2 - wait for locked buffers
- */
- wait = (gfp_mask & __GFP_IO) ? (nr_dirty-- < 0) ? 2 : 1 : 0;
- if (!try_to_free_buffers(page, wait))
- goto unlock_continue;
- /* page was locked, inode can't go away under us */
- if (!page->mapping) {
- atomic_dec(&buffermem_pages);
- goto made_buffer_progress;
- }
- }
-
- /* Take the pagecache_lock spinlock held to avoid
- other tasks to notice the page while we are looking at its
- page count. If it's a pagecache-page we'll free it
- in one atomic transaction after checking its page count. */
- spin_lock(&pagecache_lock);
-
- /*
- * We can't free pages unless there's just one user
- * (count == 2 because we added one ourselves above).
- */
- if (page_count(page) != 2)
- goto cache_unlock_continue;
-
- /*
- * Is it a page swap page? If so, we want to
- * drop it if it is no longer used, even if it
- * were to be marked referenced..
- */
- if (PageSwapCache(page)) {
- spin_unlock(&pagecache_lock);
- __delete_from_swap_cache(page);
- goto made_inode_progress;
- }
-
- /*
- * Page is from a zone we don't care about.
- * Don't drop page cache entries in vain.
- */
- if (page->zone->free_pages > page->zone->pages_high)
- goto cache_unlock_continue;
-
- /* is it a page-cache page? */
- if (page->mapping) {
- if (!PageDirty(page) && !pgcache_under_min()) {
- __remove_inode_page(page);
- spin_unlock(&pagecache_lock);
- goto made_inode_progress;
- }
- goto cache_unlock_continue;
- }
-
- printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
-
-cache_unlock_continue:
- spin_unlock(&pagecache_lock);
-unlock_continue:
- spin_lock(&pagemap_lru_lock);
- UnlockPage(page);
- page_cache_release(page);
-dispose_continue:
- list_add(page_lru, &lru_cache);
- }
- goto out;
-
-made_inode_progress:
- page_cache_release(page);
-made_buffer_progress:
- UnlockPage(page);
- page_cache_release(page);
- ret = 1;
- spin_lock(&pagemap_lru_lock);
- /* nr_lru_pages needs the spinlock */
- nr_lru_pages--;
-
-out:
- spin_unlock(&pagemap_lru_lock);
-
- return ret;
-}
-
static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
{
goto inside;
@@ -394,7 +248,14 @@ inside:
if (page->index == offset)
break;
}
- SetPageReferenced(page);
+ /*
+ * Touching the page may move it to the active list.
+ * If we end up with too few inactive pages, we wake
+ * up kswapd.
+ */
+ age_page_up(page);
+ if (inactive_shortage() > inactive_target / 2 && free_shortage())
+ wakeup_kswapd(0);
not_found:
return page;
}
@@ -626,6 +487,7 @@ void ___wait_on_page(struct page *page)
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!PageLocked(page))
break;
+ run_task_queue(&tq_disk);
schedule();
} while (PageLocked(page));
tsk->state = TASK_RUNNING;
@@ -749,6 +611,53 @@ repeat:
#endif
/*
+ * We combine this with read-ahead to deactivate pages when we
+ * think there's sequential IO going on. Note that this is
+ * harmless since we don't actually evict the pages from memory
+ * but just move them to the inactive list.
+ *
+ * TODO:
+ * - make the readahead code smarter
+ * - move readahead to the VMA level so we can do the same
+ * trick with mmap()
+ *
+ * Rik van Riel, 2000
+ */
+static void drop_behind(struct file * file, unsigned long index)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+ struct page **hash;
+ struct page *page;
+ unsigned long start;
+
+ /* Nothing to drop-behind if we're on the first page. */
+ if (!index)
+ return;
+
+ if (index > file->f_rawin)
+ start = index - file->f_rawin;
+ else
+ start = 0;
+
+ /*
+ * Go backwards from index-1 and drop all pages in the
+ * readahead window. Since the readahead window may have
+ * been increased since the last time we were called, we
+ * stop when the page isn't there.
+ */
+ spin_lock(&pagecache_lock);
+ while (--index >= start) {
+ hash = page_hash(mapping, index);
+ page = __find_page_nolock(mapping, index, *hash);
+ if (!page)
+ break;
+ deactivate_page(page);
+ }
+ spin_unlock(&pagecache_lock);
+}
+
+/*
* Read-ahead profiling information
* --------------------------------
* Every PROFILE_MAXREADCOUNT, the following information is written
@@ -971,6 +880,12 @@ static void generic_file_readahead(int reada_ok,
if (filp->f_ramax > max_readahead)
filp->f_ramax = max_readahead;
+ /*
+ * Move the pages that have already been passed
+ * to the inactive list.
+ */
+ drop_behind(filp, index);
+
#ifdef PROFILE_READAHEAD
profile_readahead((reada_ok == 2), filp);
#endif
@@ -1074,6 +989,13 @@ found_page:
goto page_not_up_to_date;
generic_file_readahead(reada_ok, filp, inode, page);
page_ok:
+ /* If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (page->mapping->i_mmap_shared != NULL)
+ flush_dcache_page(page);
+
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
@@ -2002,10 +1924,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
* data it wants to keep. Be sure to free swap resources too. The
- * zap_page_range call sets things up for shrink_mmap to actually free
+ * zap_page_range call sets things up for refill_inactive to actually free
* these pages later if no one else has touched them in the meantime,
* although we could add these pages to a global reuse list for
- * shrink_mmap to pick up before reclaiming other pages.
+ * refill_inactive to pick up before reclaiming other pages.
*
* NB: This interface discards data rather than pushes it out to swap,
* as some implementations do. This has performance implications for
@@ -2530,6 +2452,7 @@ generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
unlock:
/* Mark it unlocked again and drop the page.. */
UnlockPage(page);
+ deactivate_page(page);
page_cache_release(page);
if (status < 0)
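
The filemap.c hunks above fold the old two-pass truncate loop into a single pass: pages wholly past the new end-of-file go through truncate_complete_page(), and the single page that straddles the new size goes through truncate_partial_page(). A minimal standalone sketch of that index arithmetic follows; the 4 KB page size and the classify() helper are assumptions for illustration, not kernel interfaces.

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

enum action { KEEP, TRIM_TAIL, DROP };

/* Same test as the patched loop: whole pages at or past 'start' are
 * dropped; the page just before 'start' is trimmed when the new size
 * ends inside it (partial != 0). */
static enum action classify(unsigned long offset, unsigned long start,
                             unsigned long partial)
{
	if (offset >= start)
		return DROP;
	if (partial && offset + 1 == start)
		return TRIM_TAIL;
	return KEEP;
}

int main(void)
{
	unsigned long lstart = 10000;	/* new file size in bytes */
	unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	unsigned long partial = lstart & (PAGE_CACHE_SIZE - 1);
	unsigned long offset;

	for (offset = 0; offset < 5; offset++) {
		enum action a = classify(offset, start, partial);
		if (a == DROP)
			printf("page %lu: truncate_complete_page()\n", offset);
		else if (a == TRIM_TAIL)
			printf("page %lu: truncate_partial_page(), zero from byte %lu\n",
			       offset, partial);
		else
			printf("page %lu: keep\n", offset);
	}
	return 0;
}

With a 4 KB page size and lstart = 10000, start is 3 and partial is 1808, so pages 3 and up are dropped and page 2 is zeroed from byte 1808 onward.
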
diff --git a/mm/memory.c b/mm/memory.c
index 83fc97cb3..6b047821d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -67,7 +67,7 @@ static inline void copy_cow_page(struct page * from, struct page * to, unsigned
copy_user_highpage(to, from, address);
}
-mem_map_t * mem_map = NULL;
+mem_map_t * mem_map;
/*
* Note: this doesn't free the actual pages themselves. That
@@ -924,33 +924,9 @@ static void partial_clear(struct vm_area_struct *vma, unsigned long address)
memclear_highpage_flush(page, offset, PAGE_SIZE - offset);
}
-/*
- * Handle all mappings that got truncated by a "truncate()"
- * system call.
- *
- * NOTE! We have to be ready to update the memory sharing
- * between the file and the memory map for a potential last
- * incomplete page. Ugly, but necessary.
- */
-void vmtruncate(struct inode * inode, loff_t offset)
+static void vmtruncate_list(struct vm_area_struct *mpnt,
+ unsigned long pgoff, unsigned long partial)
{
- unsigned long partial, pgoff;
- struct vm_area_struct * mpnt;
- struct address_space *mapping = inode->i_mapping;
- unsigned long limit;
-
- if (inode->i_size < offset)
- goto do_expand;
- inode->i_size = offset;
- truncate_inode_pages(mapping, offset);
- spin_lock(&mapping->i_shared_lock);
- if (!mapping->i_mmap)
- goto out_unlock;
-
- pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- partial = (unsigned long)offset & (PAGE_CACHE_SIZE - 1);
-
- mpnt = mapping->i_mmap;
do {
struct mm_struct *mm = mpnt->vm_mm;
unsigned long start = mpnt->vm_start;
@@ -983,6 +959,39 @@ void vmtruncate(struct inode * inode, loff_t offset)
zap_page_range(mm, start, len);
flush_tlb_range(mm, start, end);
} while ((mpnt = mpnt->vm_next_share) != NULL);
+}
+
+
+/*
+ * Handle all mappings that got truncated by a "truncate()"
+ * system call.
+ *
+ * NOTE! We have to be ready to update the memory sharing
+ * between the file and the memory map for a potential last
+ * incomplete page. Ugly, but necessary.
+ */
+void vmtruncate(struct inode * inode, loff_t offset)
+{
+ unsigned long partial, pgoff;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long limit;
+
+ if (inode->i_size < offset)
+ goto do_expand;
+ inode->i_size = offset;
+ truncate_inode_pages(mapping, offset);
+ spin_lock(&mapping->i_shared_lock);
+ if (!mapping->i_mmap && !mapping->i_mmap_shared)
+ goto out_unlock;
+
+ pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ partial = (unsigned long)offset & (PAGE_CACHE_SIZE - 1);
+
+ if (mapping->i_mmap != NULL)
+ vmtruncate_list(mapping->i_mmap, pgoff, partial);
+ if (mapping->i_mmap_shared != NULL)
+ vmtruncate_list(mapping->i_mmap_shared, pgoff, partial);
+
out_unlock:
spin_unlock(&mapping->i_shared_lock);
/* this should go into ->truncate */
@@ -1031,7 +1040,8 @@ void swapin_readahead(swp_entry_t entry)
num = valid_swaphandles(entry, &offset);
for (i = 0; i < num; offset++, i++) {
/* Don't block on I/O for read-ahead */
- if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster) {
+ if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster
+ * (1 << page_cluster)) {
while (i++ < num)
swap_free(SWP_ENTRY(SWP_TYPE(entry), offset++));
break;
@@ -1095,15 +1105,12 @@ static int do_swap_page(struct mm_struct * mm,
*/
static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
{
- int high = 0;
struct page *page = NULL;
pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
if (write_access) {
page = alloc_page(GFP_HIGHUSER);
if (!page)
return -1;
- if (PageHighMem(page))
- high = 1;
clear_user_highpage(page, addr);
entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
mm->rss++;
@@ -1233,7 +1240,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
pgd = pgd_offset(mm, address);
pmd = pmd_alloc(pgd, address);
-
+
if (pmd) {
pte_t * pte = pte_alloc(pmd, address);
if (pte)
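
The memory.c change above splits the share-list walk out of vmtruncate() into vmtruncate_list(), so that both mapping->i_mmap and the new mapping->i_mmap_shared chain get their mappings unmapped. A rough userspace sketch of that shape; the 'area' struct, the names and the printf stand in for the real vma walk, and the kernel helper uses a do/while because its callers guarantee a non-NULL head.

#include <stdio.h>
#include <stddef.h>

struct area {
	const char *name;
	struct area *next_share;	/* like vma->vm_next_share */
};

/* Walk one share chain; stands in for vmtruncate_list(). */
static void truncate_list(struct area *head)
{
	struct area *a;
	for (a = head; a != NULL; a = a->next_share)
		printf("unmapping truncated range of %s\n", a->name);
}

int main(void)
{
	struct area shared = { "shared mapping",    NULL };
	struct area privb  = { "private mapping B", NULL };
	struct area priva  = { "private mapping A", &privb };

	struct area *i_mmap        = &priva;	/* private mappings */
	struct area *i_mmap_shared = &shared;	/* VM_SHARED mappings */

	/* Mirrors the new NULL checks in vmtruncate(). */
	if (i_mmap != NULL)
		truncate_list(i_mmap);
	if (i_mmap_shared != NULL)
		truncate_list(i_mmap_shared);
	return 0;
}
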
diff --git a/mm/mmap.c b/mm/mmap.c
index 9667d19db..9c0027563 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -906,15 +906,21 @@ void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp)
if (file) {
struct inode * inode = file->f_dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
+ struct vm_area_struct **head;
+
if (vmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
+
+ head = &mapping->i_mmap;
+ if (vmp->vm_flags & VM_SHARED)
+ head = &mapping->i_mmap_shared;
/* insert vmp into inode's share list */
spin_lock(&mapping->i_shared_lock);
- if((vmp->vm_next_share = mapping->i_mmap) != NULL)
- mapping->i_mmap->vm_pprev_share = &vmp->vm_next_share;
- mapping->i_mmap = vmp;
- vmp->vm_pprev_share = &mapping->i_mmap;
+ if((vmp->vm_next_share = *head) != NULL)
+ (*head)->vm_pprev_share = &vmp->vm_next_share;
+ *head = vmp;
+ vmp->vm_pprev_share = head;
spin_unlock(&mapping->i_shared_lock);
}
}
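
insert_vm_struct() now files a VMA on mapping->i_mmap_shared rather than mapping->i_mmap when VM_SHARED is set, using the same pprev-style head insertion as before. A small sketch of that insertion pattern; struct node, FLAG_SHARED and insert_share() are invented for the example.

#include <stdio.h>
#include <stddef.h>

#define FLAG_SHARED 0x1

struct node {
	int flags;
	struct node *next;	/* vm_next_share  */
	struct node **pprev;	/* vm_pprev_share */
};

static void insert_share(struct node **i_mmap, struct node **i_mmap_shared,
                         struct node *n)
{
	struct node **head = i_mmap;

	if (n->flags & FLAG_SHARED)
		head = i_mmap_shared;

	/* Classic pprev insertion: link n in front of *head and keep the
	 * back-pointer so a later unlink needs no list walk. */
	if ((n->next = *head) != NULL)
		(*head)->pprev = &n->next;
	*head = n;
	n->pprev = head;
}

int main(void)
{
	struct node *i_mmap = NULL, *i_mmap_shared = NULL;
	struct node a = { 0, NULL, NULL }, b = { FLAG_SHARED, NULL, NULL };

	insert_share(&i_mmap, &i_mmap_shared, &a);
	insert_share(&i_mmap, &i_mmap_shared, &b);

	printf("private head: %p, shared head: %p\n",
	       (void *)i_mmap, (void *)i_mmap_shared);
	return 0;
}

The pprev back-pointer is what lets a later unlink run in constant time without knowing which of the two lists the node sits on.
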
diff --git a/mm/mremap.c b/mm/mremap.c
index a48125178..d1f6a7b8b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -225,6 +225,10 @@ unsigned long do_mremap(unsigned long addr,
/* We can't remap across vm area boundaries */
if (old_len > vma->vm_end - addr)
goto out;
+ if (vma->vm_flags & VM_DONTEXPAND) {
+ if (new_len > old_len)
+ goto out;
+ }
if (vma->vm_flags & VM_LOCKED) {
unsigned long locked = current->mm->locked_vm << PAGE_SHIFT;
locked += new_len - old_len;
diff --git a/mm/numa.c b/mm/numa.c
index bbe9ec6fb..06ad9ec63 100644
--- a/mm/numa.c
+++ b/mm/numa.c
@@ -21,12 +21,12 @@ pg_data_t contig_page_data = { bdata: &contig_bootmem_data };
* at a considerably higher value than 0. Examples are Super-H, ARM, m68k.
* Should be invoked with paramters (0, 0, unsigned long *[], start_paddr).
*/
-void __init free_area_init_node(int nid, pg_data_t *pgdat,
+void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
unsigned long *zones_size, unsigned long zone_start_paddr,
unsigned long *zholes_size)
{
free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size,
- zone_start_paddr, zholes_size);
+ zone_start_paddr, zholes_size, pmap);
}
#endif /* !CONFIG_DISCONTIGMEM */
@@ -55,7 +55,7 @@ void show_free_areas_node(int nid)
/*
* Nodes can be initialized parallely, in no particular order.
*/
-void __init free_area_init_node(int nid, pg_data_t *pgdat,
+void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
unsigned long *zones_size, unsigned long zone_start_paddr,
unsigned long *zholes_size)
{
@@ -66,7 +66,7 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat,
mem_map = (mem_map_t *)PAGE_OFFSET;
free_area_init_core(nid, pgdat, &discard, zones_size, zone_start_paddr,
- zholes_size);
+ zholes_size, pmap);
pgdat->node_id = nid;
/*
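
The numa.c wrappers now take a pmap argument and pass it through to free_area_init_core(), so a platform can hand in a mem_map it allocated itself; passing 0/NULL keeps the old behaviour of allocating the map from bootmem (see the lmem_map hunk in page_alloc.c below). A sketch of that caller-may-supply-the-buffer pattern, with calloc() standing in for alloc_bootmem_node() and all names invented.

#include <stdio.h>
#include <stdlib.h>

struct page_stub { unsigned long flags; };

static struct page_stub *init_map(struct page_stub *pmap, size_t npages)
{
	if (pmap == NULL)		/* old behaviour: allocate the map here */
		pmap = calloc(npages, sizeof(*pmap));
	return pmap;			/* new behaviour: use the caller's map */
}

int main(void)
{
	struct page_stub preallocated[16];

	struct page_stub *a = init_map(NULL, 16);		/* fallback path   */
	struct page_stub *b = init_map(preallocated, 16);	/* caller-supplied */

	printf("allocated=%p caller=%p\n", (void *)a, (void *)b);
	free(a);
	return 0;
}
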
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8b74a73db..0b5990a11 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -25,7 +25,8 @@
#endif
int nr_swap_pages;
-int nr_lru_pages;
+int nr_active_pages;
+int nr_inactive_dirty_pages;
pg_data_t *pgdat_list;
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -33,6 +34,8 @@ static int zone_balance_ratio[MAX_NR_ZONES] = { 32, 128, 128, };
static int zone_balance_min[MAX_NR_ZONES] = { 10 , 10, 10, };
static int zone_balance_max[MAX_NR_ZONES] = { 255 , 255, 255, };
+struct list_head active_list;
+struct list_head inactive_dirty_list;
/*
* Free_page() adds the page to the free lists. This is optimized for
* fast normal cases (no error jumps taken normally).
@@ -96,7 +99,16 @@ static void __free_pages_ok (struct page *page, unsigned long order)
BUG();
if (PageDirty(page))
BUG();
+ if (PageActive(page))
+ BUG();
+ if (PageInactiveDirty(page))
+ BUG();
+ if (PageInactiveClean(page))
+ BUG();
+ page->flags &= ~(1<<PG_referenced);
+ page->age = PAGE_AGE_START;
+
zone = page->zone;
mask = (~0UL) << order;
@@ -142,10 +154,13 @@ static void __free_pages_ok (struct page *page, unsigned long order)
spin_unlock_irqrestore(&zone->lock, flags);
- if (zone->free_pages > zone->pages_high) {
- zone->zone_wake_kswapd = 0;
- zone->low_on_memory = 0;
- }
+ /*
+ * We don't want to protect this variable from race conditions
+ * since it's nothing important, but we do want to make sure
+ * it never gets negative.
+ */
+ if (memory_pressure > NR_CPUS)
+ memory_pressure--;
}
#define MARK_USED(index, order, area) \
@@ -203,6 +218,7 @@ static struct page * rmqueue(zone_t *zone, unsigned long order)
set_page_count(page, 1);
if (BAD_RANGE(zone,page))
BUG();
+ DEBUG_ADD_PAGE
return page;
}
curr_order++;
@@ -213,13 +229,77 @@ static struct page * rmqueue(zone_t *zone, unsigned long order)
return NULL;
}
+#define PAGES_MIN 0
+#define PAGES_LOW 1
+#define PAGES_HIGH 2
+
+/*
+ * This function does the dirty work for __alloc_pages
+ * and is separated out to keep the code size smaller.
+ * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
+ */
+static struct page * __alloc_pages_limit(zonelist_t *zonelist,
+ unsigned long order, int limit, int direct_reclaim)
+{
+ zone_t **zone = zonelist->zones;
+
+ for (;;) {
+ zone_t *z = *(zone++);
+ unsigned long water_mark;
+
+ if (!z)
+ break;
+ if (!z->size)
+ BUG();
+
+ /*
+ * We allocate if the number of free + inactive_clean
+ * pages is above the watermark.
+ */
+ switch (limit) {
+ default:
+ case PAGES_MIN:
+ water_mark = z->pages_min;
+ break;
+ case PAGES_LOW:
+ water_mark = z->pages_low;
+ break;
+ case PAGES_HIGH:
+ water_mark = z->pages_high;
+ }
+
+ if (z->free_pages + z->inactive_clean_pages > water_mark) {
+ struct page *page = NULL;
+ /* If possible, reclaim a page directly. */
+ if (direct_reclaim && z->free_pages < z->pages_min + 8)
+ page = reclaim_page(z);
+ /* If that fails, fall back to rmqueue. */
+ if (!page)
+ page = rmqueue(z, order);
+ if (page)
+ return page;
+ }
+ }
+
+ /* Found nothing. */
+ return NULL;
+}
+
+
/*
* This is the 'heart' of the zoned buddy allocator:
*/
struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
{
zone_t **zone;
- extern wait_queue_head_t kswapd_wait;
+ int direct_reclaim = 0;
+ unsigned int gfp_mask = zonelist->gfp_mask;
+ struct page * page = NULL;
+
+ /*
+ * Allocations put pressure on the VM subsystem.
+ */
+ memory_pressure++;
/*
* (If anyone calls gfp from interrupts nonatomically then it
@@ -229,6 +309,36 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
* in a higher zone fails.
*/
+ /*
+ * Can we take pages directly from the inactive_clean
+ * list?
+ */
+ if (order == 0 && (gfp_mask & __GFP_WAIT) &&
+ !(current->flags & PF_MEMALLOC))
+ direct_reclaim = 1;
+
+ /*
+ * If we are about to get low on free pages and we also have
+ * an inactive page shortage, wake up kswapd.
+ */
+ if (inactive_shortage() > inactive_target / 2 && free_shortage())
+ wakeup_kswapd(0);
+ /*
+ * If we are about to get low on free pages and cleaning
+ * the inactive_dirty pages would fix the situation,
+ * wake up bdflush.
+ */
+ else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
+ && nr_inactive_dirty_pages > freepages.high)
+ wakeup_bdflush(0);
+
+try_again:
+ /*
+ * First, see if we have any zones with lots of free memory.
+ *
+ * We allocate free memory first because it doesn't contain
+ * any data ... DUH!
+ */
zone = zonelist->zones;
for (;;) {
zone_t *z = *(zone++);
@@ -237,82 +347,193 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
if (!z->size)
BUG();
- /* Are we supposed to free memory? Don't make it worse.. */
- if (!z->zone_wake_kswapd) {
- struct page *page = rmqueue(z, order);
- if (z->free_pages < z->pages_low) {
- z->zone_wake_kswapd = 1;
- if (waitqueue_active(&kswapd_wait))
- wake_up_interruptible(&kswapd_wait);
- }
+ if (z->free_pages > z->pages_low) {
+ page = rmqueue(z, order);
if (page)
return page;
+ } else if (z->free_pages < z->pages_min &&
+ waitqueue_active(&kreclaimd_wait)) {
+ wake_up_interruptible(&kreclaimd_wait);
}
}
- /* Three possibilities to get here
- * - Previous alloc_pages resulted in last zone set to have
- * zone_wake_kswapd and start it. kswapd has not been able
- * to release enough pages so that one zone does not have
- * zone_wake_kswapd set.
- * - Different sets of zones (zonelist)
- * previous did not have all zones with zone_wake_kswapd but
- * this one has... should kswapd be woken up? it will run once.
- * - SMP race, kswapd went to sleep slightly after it as running
- * in 'if (waitqueue_active(...))' above.
- * + anyway the test is very cheap to do...
+ /*
+ * Try to allocate a page from a zone with a HIGH
+ * amount of free + inactive_clean pages.
+ *
+ * If there is a lot of activity, inactive_target
+ * will be high and we'll have a good chance of
+ * finding a page using the HIGH limit.
*/
- if (waitqueue_active(&kswapd_wait))
- wake_up_interruptible(&kswapd_wait);
+ page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
+ if (page)
+ return page;
/*
- * Ok, we don't have any zones that don't need some
- * balancing.. See if we have any that aren't critical..
+ * Then try to allocate a page from a zone with more
+ * than zone->pages_low free + inactive_clean pages.
+ *
+ * When the working set is very large and VM activity
+ * is low, we're most likely to have our allocation
+ * succeed here.
*/
- zone = zonelist->zones;
- for (;;) {
- zone_t *z = *(zone++);
- if (!z)
- break;
- if (!z->low_on_memory) {
- struct page *page = rmqueue(z, order);
- if (z->free_pages < z->pages_min)
- z->low_on_memory = 1;
- if (page)
- return page;
- }
+ page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
+ if (page)
+ return page;
+
+ /*
+ * OK, none of the zones on our zonelist has lots
+ * of pages free.
+ *
+ * We wake up kswapd, in the hope that kswapd will
+ * resolve this situation before memory gets tight.
+ *
+ * We also yield the CPU, because that:
+ * - gives kswapd a chance to do something
+ * - slows down allocations, in particular the
+ * allocations from the fast allocator that's
+ * causing the problems ...
+ * - ... which minimises the impact the "bad guys"
+ * have on the rest of the system
+ * - if we don't have __GFP_IO set, kswapd may be
+ * able to free some memory we can't free ourselves
+ */
+ wakeup_kswapd(0);
+ if (gfp_mask & __GFP_WAIT) {
+ __set_current_state(TASK_RUNNING);
+ current->policy |= SCHED_YIELD;
+ schedule();
}
/*
- * Uhhuh. All the zones have been critical, which means that
- * we'd better do some synchronous swap-out. kswapd has not
- * been able to cope..
+ * After waking up kswapd, we try to allocate a page
+ * from any zone which isn't critical yet.
+ *
+ * Kswapd should, in most situations, bring the situation
+ * back to normal in no time.
+ */
+ page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
+ if (page)
+ return page;
+
+ /*
+ * Damn, we didn't succeed.
+ *
+ * This can be due to 2 reasons:
+ * - we're doing a higher-order allocation
+ * --> move pages to the free list until we succeed
+ * - we're /really/ tight on memory
+ * --> wait on the kswapd waitqueue until memory is freed
*/
if (!(current->flags & PF_MEMALLOC)) {
- int gfp_mask = zonelist->gfp_mask;
- if (!try_to_free_pages(gfp_mask)) {
- if (!(gfp_mask & __GFP_HIGH))
- goto fail;
+ /*
+ * Are we dealing with a higher order allocation?
+ *
+ * Move pages from the inactive_clean to the free list
+ * in the hope of creating a large, physically contiguous
+ * piece of free memory.
+ */
+ if (order > 0 && (gfp_mask & __GFP_WAIT)) {
+ zone = zonelist->zones;
+ /* First, clean some dirty pages. */
+ page_launder(gfp_mask, 1);
+ for (;;) {
+ zone_t *z = *(zone++);
+ if (!z)
+ break;
+ if (!z->size)
+ continue;
+ while (z->inactive_clean_pages) {
+ struct page * page;
+ /* Move one page to the free list. */
+ page = reclaim_page(z);
+ if (!page)
+ break;
+ __free_page(page);
+ /* Try if the allocation succeeds. */
+ page = rmqueue(z, order);
+ if (page)
+ return page;
+ }
+ }
}
+ /*
+ * When we arrive here, we are really tight on memory.
+ *
+ * We wake up kswapd and sleep until kswapd wakes us
+ * up again. After that we loop back to the start.
+ *
+ * We have to do this because something else might eat
+ * the memory kswapd frees for us and we need to be
+ * reliable. Note that we don't loop back for higher
+ * order allocations since it is possible that kswapd
+ * simply cannot free a large enough contiguous area
+ * of memory *ever*.
+ */
+ if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {
+ wakeup_kswapd(1);
+ memory_pressure++;
+ if (!order)
+ goto try_again;
+ /*
+ * If __GFP_IO isn't set, we can't wait on kswapd because
+ * kswapd just might need some IO locks /we/ are holding ...
+ *
+ * SUBTLE: The scheduling point above makes sure that
+ * kswapd does get the chance to free memory we can't
+ * free ourselves...
+ */
+ } else if (gfp_mask & __GFP_WAIT) {
+ try_to_free_pages(gfp_mask);
+ memory_pressure++;
+ if (!order)
+ goto try_again;
+ }
+
}
/*
* Final phase: allocate anything we can!
+ *
+ * Higher order allocations, GFP_ATOMIC allocations and
+ * recursive allocations (PF_MEMALLOC) end up here.
+ *
+ * Only recursive allocations can use the very last pages
+ * in the system, otherwise it would be just too easy to
+ * deadlock the system...
*/
zone = zonelist->zones;
for (;;) {
- struct page *page;
-
zone_t *z = *(zone++);
+ struct page * page = NULL;
if (!z)
break;
- page = rmqueue(z, order);
+ if (!z->size)
+ BUG();
+
+ /*
+ * SUBTLE: direct_reclaim is only possible if the task
+ * becomes PF_MEMALLOC while looping above. This will
+ * happen when the OOM killer selects this task for
+ * instant execution...
+ */
+ if (direct_reclaim)
+ page = reclaim_page(z);
+ if (page)
+ return page;
+
+ /* XXX: is pages_min/4 a good amount to reserve for this? */
+ if (z->free_pages < z->pages_min / 4 &&
+ !(current->flags & PF_MEMALLOC))
+ continue;
+ if (!page)
+ page = rmqueue(z, order);
if (page)
return page;
}
-fail:
/* No luck.. */
+ printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
return NULL;
}
@@ -377,18 +598,46 @@ unsigned int nr_free_pages (void)
}
/*
- * Amount of free RAM allocatable as buffer memory:
+ * Total amount of inactive_clean (allocatable) RAM:
*/
-unsigned int nr_free_buffer_pages (void)
+unsigned int nr_inactive_clean_pages (void)
{
unsigned int sum;
zone_t *zone;
int i;
- sum = nr_lru_pages / 3;
+ sum = 0;
for (i = 0; i < NUMNODES; i++)
- for (zone = NODE_DATA(i)->node_zones; zone <= NODE_DATA(i)->node_zones+ZONE_NORMAL; zone++)
- sum += zone->free_pages;
+ for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++)
+ sum += zone->inactive_clean_pages;
+ return sum;
+}
+
+/*
+ * Amount of free RAM allocatable as buffer memory:
+ */
+unsigned int nr_free_buffer_pages (void)
+{
+ unsigned int sum;
+
+ sum = nr_free_pages();
+ sum += nr_inactive_clean_pages();
+ sum += nr_inactive_dirty_pages;
+
+ /*
+ * Keep our write behind queue filled, even if
+ * kswapd lags a bit right now.
+ */
+ if (sum < freepages.high + inactive_target)
+ sum = freepages.high + inactive_target;
+ /*
+ * We don't want dirty page writebehind to put too
+ * much pressure on the working set, but we want it
+ * to be possible to have some dirty pages in the
+ * working set without upsetting the writebehind logic.
+ */
+ sum += nr_active_pages >> 4;
+
return sum;
}
@@ -418,9 +667,11 @@ void show_free_areas_core(int nid)
nr_free_pages() << (PAGE_SHIFT-10),
nr_free_highpages() << (PAGE_SHIFT-10));
- printk("( Free: %d, lru_cache: %d (%d %d %d) )\n",
+ printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d (%d %d %d) )\n",
+ nr_active_pages,
+ nr_inactive_dirty_pages,
+ nr_inactive_clean_pages(),
nr_free_pages(),
- nr_lru_pages,
freepages.min,
freepages.low,
freepages.high);
@@ -430,17 +681,6 @@ void show_free_areas_core(int nid)
zone_t *zone = NODE_DATA(nid)->node_zones + type;
unsigned long nr, total, flags;
- printk(" %c%d%d %s: ",
- (zone->free_pages > zone->pages_low
- ? (zone->free_pages > zone->pages_high
- ? ' '
- : 'H')
- : (zone->free_pages > zone->pages_min
- ? 'M'
- : 'L')),
- zone->zone_wake_kswapd, zone->low_on_memory,
- zone->name);
-
total = 0;
if (zone->size) {
spin_lock_irqsave(&zone->lock, flags);
@@ -532,9 +772,9 @@ static inline void build_zonelists(pg_data_t *pgdat)
*/
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
unsigned long *zones_size, unsigned long zone_start_paddr,
- unsigned long *zholes_size)
+ unsigned long *zholes_size, struct page *lmem_map)
{
- struct page *p, *lmem_map;
+ struct page *p;
unsigned long i, j;
unsigned long map_size;
unsigned long totalpages, offset, realtotalpages;
@@ -570,7 +810,8 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
freepages.min += i;
freepages.low += i * 2;
freepages.high += i * 3;
- memlist_init(&lru_cache);
+ memlist_init(&active_list);
+ memlist_init(&inactive_dirty_list);
/*
* Some architectures (with lots of mem and discontinous memory
@@ -580,9 +821,11 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
* boundary, so that MAP_NR works.
*/
map_size = (totalpages + 1)*sizeof(struct page);
- lmem_map = (struct page *) alloc_bootmem_node(nid, map_size);
- lmem_map = (struct page *)(PAGE_OFFSET +
+ if (lmem_map == (struct page *)0) {
+ lmem_map = (struct page *) alloc_bootmem_node(nid, map_size);
+ lmem_map = (struct page *)(PAGE_OFFSET +
MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
+ }
*gmap = pgdat->node_mem_map = lmem_map;
pgdat->node_size = totalpages;
pgdat->node_start_paddr = zone_start_paddr;
@@ -616,6 +859,9 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
zone->lock = SPIN_LOCK_UNLOCKED;
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
+ zone->inactive_clean_pages = 0;
+ zone->inactive_dirty_pages = 0;
+ memlist_init(&zone->inactive_clean_list);
if (!size)
continue;
@@ -629,8 +875,6 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
zone->pages_min = mask;
zone->pages_low = mask*2;
zone->pages_high = mask*3;
- zone->low_on_memory = 0;
- zone->zone_wake_kswapd = 0;
zone->zone_mem_map = mem_map + offset;
zone->zone_start_mapnr = offset;
zone->zone_start_paddr = zone_start_paddr;
@@ -664,7 +908,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
void __init free_area_init(unsigned long *zones_size)
{
- free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size, 0, 0);
+ free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size, 0, 0, 0);
}
static int __init setup_mem_frac(char *str)
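
The heart of the page_alloc.c rewrite is the watermark ladder: __alloc_pages() first tries every zone against PAGES_HIGH, then PAGES_LOW, wakes kswapd and yields, and only then falls back to PAGES_MIN and the really-tight paths, each time counting free plus inactive_clean pages against the mark. A standalone sketch of that ladder; the zone numbers and the try_alloc() helper are made up, only the ordering and the "free + inactive_clean > watermark" test mirror the patch.

#include <stdio.h>

struct zone_stub {
	const char *name;
	long free, inactive_clean;
	long pages_min, pages_low, pages_high;
};

static int try_alloc(struct zone_stub *z, long watermark)
{
	/* Allocate only if free + instantly-reclaimable pages stay above
	 * the requested watermark. */
	return z->free + z->inactive_clean > watermark;
}

int main(void)
{
	struct zone_stub z = { "Normal", 120, 40, 64, 128, 192 };
	long marks[3] = { z.pages_high, z.pages_low, z.pages_min };
	const char *names[3] = { "PAGES_HIGH", "PAGES_LOW", "PAGES_MIN" };
	int i;

	for (i = 0; i < 3; i++) {
		if (i == 2)
			printf("waking kswapd before the last, critical attempt\n");
		if (try_alloc(&z, marks[i])) {
			printf("allocated from %s at %s\n", z.name, names[i]);
			return 0;
		}
		printf("%s refused at %s (%ld+%ld <= %ld)\n",
		       z.name, names[i], z.free, z.inactive_clean, marks[i]);
	}
	printf("falling through to the really-tight-on-memory path\n");
	return 0;
}

With the numbers above the allocation succeeds at PAGES_LOW; shrink the zone's counters and the sketch walks further down the ladder, just as the real allocator does.
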
diff --git a/mm/page_io.c b/mm/page_io.c
index 25ed62221..185e19247 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -43,7 +43,8 @@ static int rw_swap_page_base(int rw, swp_entry_t entry, struct page *page, int w
struct inode *swapf = 0;
/* Don't allow too many pending pages in flight.. */
- if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster)
+ if ((rw == WRITE) && atomic_read(&nr_async_pages) >
+ pager_daemon.swap_cluster * (1 << page_cluster))
wait = 1;
if (rw == READ) {
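
The page_io.c change throttles only swap writes and raises the in-flight limit from swap_cluster pages to swap_cluster * (1 << page_cluster). A worked example of the new limit, with both values assumed rather than read from a live system: swap_cluster is set to 8 in the mm/swap.c hunk below, and page_cluster is chosen by swap_setup() from the machine's memory size.

#include <stdio.h>

int main(void)
{
	int swap_cluster = 8;	/* pager_daemon.swap_cluster after this patch */
	int page_cluster = 4;	/* assumed; picked by swap_setup() at boot */
	int limit = swap_cluster * (1 << page_cluster);

	/* A writer starts waiting once more than 'limit' async swap pages
	 * are in flight; reads are no longer throttled by this test. */
	printf("async swap write limit: %d pages\n", limit);	/* 128 */
	return 0;
}
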
diff --git a/mm/slab.c b/mm/slab.c
index ed5d018f1..b3bd852d1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -579,7 +579,6 @@ static void kmem_slab_destroy (kmem_cache_t *cachep, slab_t *slabp)
kmem_cache_free(cachep->slabp_cache, slabp);
}
-
/**
* kmem_cache_create - Create a cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -838,48 +837,60 @@ static int is_chained_kmem_cache(kmem_cache_t * cachep)
}
#ifdef CONFIG_SMP
-static DECLARE_MUTEX(cache_drain_sem);
-static kmem_cache_t *cache_to_drain = NULL;
-static DECLARE_WAIT_QUEUE_HEAD(cache_drain_wait);
-unsigned long slab_cache_drain_mask;
-
/*
- * Waits for all CPUs to execute slab_drain_local_cache().
- * Caller must be holding cache_drain_sem.
+ * Waits for all CPUs to execute func().
*/
-static void slab_drain_all_sync(void)
+static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
{
- DECLARE_WAITQUEUE(wait, current);
-
local_irq_disable();
- slab_drain_local_cache();
+ func(arg);
local_irq_enable();
- add_wait_queue(&cache_drain_wait, &wait);
- current->state = TASK_UNINTERRUPTIBLE;
- while (slab_cache_drain_mask != 0UL)
- schedule();
- current->state = TASK_RUNNING;
- remove_wait_queue(&cache_drain_wait, &wait);
+ if (smp_call_function(func, arg, 1, 1))
+ BUG();
+}
+typedef struct ccupdate_struct_s
+{
+ kmem_cache_t *cachep;
+ cpucache_t *new[NR_CPUS];
+} ccupdate_struct_t;
+
+static void do_ccupdate_local(void *info)
+{
+ ccupdate_struct_t *new = (ccupdate_struct_t *)info;
+ cpucache_t *old = cc_data(new->cachep);
+
+ cc_data(new->cachep) = new->new[smp_processor_id()];
+ new->new[smp_processor_id()] = old;
}
+static void free_block (kmem_cache_t* cachep, void** objpp, int len);
+
static void drain_cpu_caches(kmem_cache_t *cachep)
{
- unsigned long cpu_mask = 0;
+ ccupdate_struct_t new;
int i;
- for (i = 0; i < smp_num_cpus; i++)
- cpu_mask |= (1UL << cpu_logical_map(i));
+ memset(&new.new,0,sizeof(new.new));
- down(&cache_drain_sem);
+ new.cachep = cachep;
- cache_to_drain = cachep;
- slab_cache_drain_mask = cpu_mask;
- slab_drain_all_sync();
- cache_to_drain = NULL;
+ down(&cache_chain_sem);
+ smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
- up(&cache_drain_sem);
+ for (i = 0; i < smp_num_cpus; i++) {
+ cpucache_t* ccold = new.new[cpu_logical_map(i)];
+ if (!ccold || (ccold->avail == 0))
+ continue;
+ local_irq_disable();
+ free_block(cachep, cc_entry(ccold), ccold->avail);
+ local_irq_enable();
+ ccold->avail = 0;
+ }
+ smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
+ up(&cache_chain_sem);
}
+
#else
#define drain_cpu_caches(cachep) do { } while (0)
#endif
@@ -1593,56 +1604,6 @@ kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags)
#ifdef CONFIG_SMP
-typedef struct ccupdate_struct_s
-{
- kmem_cache_t *cachep;
- cpucache_t *new[NR_CPUS];
-} ccupdate_struct_t;
-
-static ccupdate_struct_t *ccupdate_state = NULL;
-
-/* Called from per-cpu timer interrupt. */
-void slab_drain_local_cache(void)
-{
- if (ccupdate_state != NULL) {
- ccupdate_struct_t *new = ccupdate_state;
- cpucache_t *old = cc_data(new->cachep);
-
- cc_data(new->cachep) = new->new[smp_processor_id()];
- new->new[smp_processor_id()] = old;
- } else {
- kmem_cache_t *cachep = cache_to_drain;
- cpucache_t *cc = cc_data(cachep);
-
- if (cc && cc->avail) {
- free_block(cachep, cc_entry(cc), cc->avail);
- cc->avail = 0;
- }
- }
-
- clear_bit(smp_processor_id(), &slab_cache_drain_mask);
- if (slab_cache_drain_mask == 0)
- wake_up(&cache_drain_wait);
-}
-
-static void do_ccupdate(ccupdate_struct_t *data)
-{
- unsigned long cpu_mask = 0;
- int i;
-
- for (i = 0; i < smp_num_cpus; i++)
- cpu_mask |= (1UL << cpu_logical_map(i));
-
- down(&cache_drain_sem);
-
- ccupdate_state = data;
- slab_cache_drain_mask = cpu_mask;
- slab_drain_all_sync();
- ccupdate_state = NULL;
-
- up(&cache_drain_sem);
-}
-
/* called with cache_chain_sem acquired. */
static int kmem_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount)
{
@@ -1666,7 +1627,6 @@ static int kmem_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount)
for (i = 0; i< smp_num_cpus; i++) {
cpucache_t* ccnew;
-
ccnew = kmalloc(sizeof(void*)*limit+
sizeof(cpucache_t), GFP_KERNEL);
if (!ccnew)
@@ -1681,7 +1641,7 @@ static int kmem_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount)
cachep->batchcount = batchcount;
spin_unlock_irq(&cachep->spinlock);
- do_ccupdate(&new);
+ smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
for (i = 0; i < smp_num_cpus; i++) {
cpucache_t* ccold = new.new[cpu_logical_map(i)];
@@ -1772,14 +1732,6 @@ void kmem_cache_reap (int gfp_mask)
/* It's safe to test this without holding the cache-lock. */
if (searchp->flags & SLAB_NO_REAP)
goto next;
- /* FIXME: is this really a good idea? */
- if (gfp_mask & GFP_DMA) {
- if (!(searchp->gfpflags & GFP_DMA))
- goto next;
- } else {
- if (searchp->gfpflags & GFP_DMA)
- goto next;
- }
spin_lock_irq(&searchp->spinlock);
if (searchp->growing)
goto next_unlock;
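
The slab.c rework replaces the timer-driven per-CPU drain with a direct cross-call: do_ccupdate_local() swaps each CPU's cpucache pointer with a replacement, after which the old caches belong to the caller and can be drained without further locking. A serial sketch of that swap-then-drain pattern; a plain loop stands in for smp_call_function(), and the struct and names are invented.

#include <stdio.h>

#define NCPUS 4

struct cpucache_stub { int avail; };

static struct cpucache_stub *cc_data[NCPUS];	/* per-CPU pointers */

static void do_ccupdate_one(int cpu, struct cpucache_stub **newv)
{
	/* The per-CPU step: publish the new cache, hand back the old one. */
	struct cpucache_stub *old = cc_data[cpu];
	cc_data[cpu] = newv[cpu];
	newv[cpu] = old;
}

int main(void)
{
	struct cpucache_stub oldcc[NCPUS] = { {3}, {0}, {5}, {1} };
	struct cpucache_stub *newv[NCPUS] = { NULL, NULL, NULL, NULL };
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		cc_data[cpu] = &oldcc[cpu];

	/* "Cross-call": swap in the (here: NULL) replacement caches. */
	for (cpu = 0; cpu < NCPUS; cpu++)
		do_ccupdate_one(cpu, newv);

	/* Now the old caches are private to this thread and can be drained
	 * without racing against the other CPUs. */
	for (cpu = 0; cpu < NCPUS; cpu++)
		if (newv[cpu] && newv[cpu]->avail)
			printf("cpu %d: freeing %d cached objects\n",
			       cpu, newv[cpu]->avail);
	return 0;
}
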
diff --git a/mm/swap.c b/mm/swap.c
index 460707ff7..8cb160b81 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -40,7 +40,18 @@ freepages_t freepages = {
};
/* How many pages do we try to swap or page in/out together? */
-int page_cluster = 4; /* Default value modified in swap_setup() */
+int page_cluster;
+
+/*
+ * This variable contains the amount of page steals the system
+ * is doing, averaged over a minute. We use this to determine how
+ * many inactive pages we should have.
+ *
+ * In reclaim_page and __alloc_pages: memory_pressure++
+ * In __free_pages_ok: memory_pressure--
+ * In recalculate_vm_stats the value is decayed (once a second)
+ */
+int memory_pressure;
/* We track the number of pages currently being asynchronously swapped
out, so that we don't try to swap TOO many pages out at once */
@@ -61,13 +72,250 @@ buffer_mem_t page_cache = {
pager_daemon_t pager_daemon = {
512, /* base number for calculating the number of tries */
SWAP_CLUSTER_MAX, /* minimum number of tries */
- SWAP_CLUSTER_MAX, /* do swap I/O in clusters of this size */
+ 8, /* do swap I/O in clusters of this size */
};
+/**
+ * age_page_{up,down} - page aging helper functions
+ * @page - the page we want to age
+ * @nolock - are we already holding the pagelist_lru_lock?
+ *
+ * If the page is on one of the lists (active, inactive_dirty or
+ * inactive_clean), we will grab the pagelist_lru_lock as needed.
+ * If you're already holding the lock, call this function with the
+ * nolock argument non-zero.
+ */
+void age_page_up_nolock(struct page * page)
+{
+ /*
+ * We're dealing with an inactive page, move the page
+ * to the active list.
+ */
+ if (!page->age)
+ activate_page_nolock(page);
+
+ /* The actual page aging bit */
+ page->age += PAGE_AGE_ADV;
+ if (page->age > PAGE_AGE_MAX)
+ page->age = PAGE_AGE_MAX;
+}
+
/*
- * Perform any setup for the swap system
+ * We use this (minimal) function in the case where we
+ * know we can't deactivate the page (yet).
*/
+void age_page_down_ageonly(struct page * page)
+{
+ page->age /= 2;
+}
+
+void age_page_down_nolock(struct page * page)
+{
+ /* The actual page aging bit */
+ page->age /= 2;
+
+ /*
+ * The page is now an old page. Move to the inactive
+ * list (if possible ... see below).
+ */
+ if (!page->age)
+ deactivate_page_nolock(page);
+}
+void age_page_up(struct page * page)
+{
+ /*
+ * We're dealing with an inactive page, move the page
+ * to the active list.
+ */
+ if (!page->age)
+ activate_page(page);
+
+ /* The actual page aging bit */
+ page->age += PAGE_AGE_ADV;
+ if (page->age > PAGE_AGE_MAX)
+ page->age = PAGE_AGE_MAX;
+}
+
+void age_page_down(struct page * page)
+{
+ /* The actual page aging bit */
+ page->age /= 2;
+
+ /*
+ * The page is now an old page. Move to the inactive
+ * list (if possible ... see below).
+ */
+ if (!page->age)
+ deactivate_page(page);
+}
+
+
+/**
+ * (de)activate_page - move pages from/to active and inactive lists
+ * @page: the page we want to move
+ * @nolock - are we already holding the pagemap_lru_lock?
+ *
+ * Deactivate_page will move an active page to the right
+ * inactive list, while activate_page will move a page back
+ * from one of the inactive lists to the active list. If
+ * called on a page which is not on any of the lists, the
+ * page is left alone.
+ */
+void deactivate_page_nolock(struct page * page)
+{
+ /*
+ * One for the cache, one for the extra reference the
+ * caller has and (maybe) one for the buffers.
+ *
+ * This isn't perfect, but works for just about everything.
+ * Besides, as long as we don't move unfreeable pages to the
+ * inactive_clean list it doesn't need to be perfect...
+ */
+ int maxcount = (page->buffers ? 3 : 2);
+ page->age = 0;
+
+ /*
+ * Don't touch it if it's not on the active list.
+ * (some pages aren't on any list at all)
+ */
+ if (PageActive(page) && page_count(page) <= maxcount &&
+ !page_ramdisk(page)) {
+
+ /*
+ * We can move the page to the inactive_dirty list
+ * if we have the strong suspicion that they might
+ * become freeable in the near future.
+ *
+ * That is, the page has buffer heads attached (that
+ * need to be cleared away) and/or the function calling
+ * us has an extra reference count on the page.
+ */
+ if (page->buffers || page_count(page) == 2) {
+ del_page_from_active_list(page);
+ add_page_to_inactive_dirty_list(page);
+ /*
+ * Only if we are SURE the page is clean and immediately
+ * reusable, we move it to the inactive_clean list.
+ */
+ } else if (page->mapping && !PageDirty(page) &&
+ !PageLocked(page)) {
+ del_page_from_active_list(page);
+ add_page_to_inactive_clean_list(page);
+ }
+ /*
+ * OK, we cannot free the page. Leave it alone.
+ */
+ }
+}
+
+void deactivate_page(struct page * page)
+{
+ spin_lock(&pagemap_lru_lock);
+ deactivate_page_nolock(page);
+ spin_unlock(&pagemap_lru_lock);
+}
+
+/*
+ * Move an inactive page to the active list.
+ */
+void activate_page_nolock(struct page * page)
+{
+ if (PageInactiveDirty(page)) {
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_active_list(page);
+ } else if (PageInactiveClean(page)) {
+ del_page_from_inactive_clean_list(page);
+ add_page_to_active_list(page);
+ } else {
+ /*
+ * The page was not on any list, so we take care
+ * not to do anything.
+ */
+ }
+
+ /* Make sure the page gets a fair chance at staying active. */
+ if (page->age < PAGE_AGE_START)
+ page->age = PAGE_AGE_START;
+}
+
+void activate_page(struct page * page)
+{
+ spin_lock(&pagemap_lru_lock);
+ activate_page_nolock(page);
+ spin_unlock(&pagemap_lru_lock);
+}
+
+/**
+ * lru_cache_add: add a page to the page lists
+ * @page: the page to add
+ */
+void lru_cache_add(struct page * page)
+{
+ spin_lock(&pagemap_lru_lock);
+ if (!PageLocked(page))
+ BUG();
+ DEBUG_ADD_PAGE
+ add_page_to_active_list(page);
+ /* This should be relatively rare */
+ if (!page->age)
+ deactivate_page_nolock(page);
+ spin_unlock(&pagemap_lru_lock);
+}
+
+/**
+ * __lru_cache_del: remove a page from the page lists
+ * @page: the page to add
+ *
+ * This function is for when the caller already holds
+ * the pagemap_lru_lock.
+ */
+void __lru_cache_del(struct page * page)
+{
+ if (PageActive(page)) {
+ del_page_from_active_list(page);
+ } else if (PageInactiveDirty(page)) {
+ del_page_from_inactive_dirty_list(page);
+ } else if (PageInactiveClean(page)) {
+ del_page_from_inactive_clean_list(page);
+ } else {
+ printk("VM: __lru_cache_del, found unknown page ?!\n");
+ }
+ DEBUG_ADD_PAGE
+}
+
+/**
+ * lru_cache_del: remove a page from the page lists
+ * @page: the page to remove
+ */
+void lru_cache_del(struct page * page)
+{
+ if (!PageLocked(page))
+ BUG();
+ spin_lock(&pagemap_lru_lock);
+ __lru_cache_del(page);
+ spin_unlock(&pagemap_lru_lock);
+}
+
+/**
+ * recalculate_vm_stats - recalculate VM statistics
+ *
+ * This function should be called once a second to recalculate
+ * some useful statistics the VM subsystem uses to determine
+ * its behaviour.
+ */
+void recalculate_vm_stats(void)
+{
+ /*
+ * Subtract one second's worth of memory_pressure from
+ * memory_pressure.
+ */
+ memory_pressure -= (memory_pressure >> INACTIVE_SHIFT);
+}
+
+/*
+ * Perform any setup for the swap system
+ */
void __init swap_setup(void)
{
/* Use a smaller cluster for memory <16MB or <32MB */
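
swap.c introduces explicit page aging: age_page_up() bumps page->age by PAGE_AGE_ADV (capped at PAGE_AGE_MAX) on every reference, age_page_down() halves it on every scan, and a page whose age reaches zero gets deactivated onto the inactive lists. A tiny simulation of that arithmetic; the constants are assumptions taken from the 2.4 headers rather than from this diff, and 'touched' is just a made-up reference pattern.

#include <stdio.h>

#define PAGE_AGE_START 2
#define PAGE_AGE_ADV   3
#define PAGE_AGE_MAX   64

int main(void)
{
	int age = PAGE_AGE_START;
	int touched[10] = { 1, 1, 0, 1, 0, 0, 0, 0, 0, 0 };
	int scan;

	for (scan = 0; scan < 10; scan++) {
		if (touched[scan]) {
			age += PAGE_AGE_ADV;	/* age_page_up()   */
			if (age > PAGE_AGE_MAX)
				age = PAGE_AGE_MAX;
		} else {
			age /= 2;		/* age_page_down() */
		}
		printf("scan %d: age=%d%s\n", scan, age,
		       age == 0 ? "  -> deactivate (move to inactive list)" : "");
	}
	return 0;
}

The halving means a page has to keep being referenced to stay active; a handful of idle scans is enough to push it onto the inactive side, where page_launder() and reclaim_page() can get at it.
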
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 506160354..d26c66f54 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -73,7 +73,7 @@ static inline void remove_from_swap_cache(struct page *page)
PAGE_BUG(page);
PageClearSwapCache(page);
- remove_inode_page(page);
+ __remove_inode_page(page);
}
/*
@@ -105,7 +105,9 @@ void delete_from_swap_cache_nolock(struct page *page)
if (block_flushpage(page, 0))
lru_cache_del(page);
+ spin_lock(&pagecache_lock);
__delete_from_swap_cache(page);
+ spin_unlock(&pagecache_lock);
page_cache_release(page);
}
@@ -164,7 +166,7 @@ repeat:
return 0;
/*
* Though the "found" page was in the swap cache an instant
- * earlier, it might have been removed by shrink_mmap etc.
+ * earlier, it might have been removed by refill_inactive etc.
* Re search ... Since find_lock_page grabs a reference on
* the page, it can not be reused for anything else, namely
* it can not be associated with another swaphandle, so it
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 817a3966b..e8c557e04 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -14,7 +14,7 @@
#include <asm/pgalloc.h>
rwlock_t vmlist_lock = RW_LOCK_UNLOCKED;
-struct vm_struct * vmlist = NULL;
+struct vm_struct * vmlist;
static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size)
{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 95098e4d1..aacd9a5b0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -9,6 +9,7 @@
* to bring the system back to freepages.high: 2.4.97, Rik van Riel.
* Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
* Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
+ * Multiqueue VM started 5.8.00, Rik van Riel.
*/
#include <linux/slab.h>
@@ -40,6 +41,7 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un
swp_entry_t entry;
struct page * page;
int (*swapout)(struct page *, struct file *);
+ int onlist;
pte = *page_table;
if (!pte_present(pte))
@@ -51,16 +53,37 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un
if (mm->swap_cnt)
mm->swap_cnt--;
+ onlist = PageActive(page);
/* Don't look at this pte if it's been accessed recently. */
if (pte_young(pte)) {
- /*
- * Transfer the "accessed" bit from the page
- * tables to the global page map.
- */
set_pte(page_table, pte_mkold(pte));
- SetPageReferenced(page);
+ if (onlist) {
+ /*
+ * Transfer the "accessed" bit from the page
+ * tables to the global page map. Page aging
+ * will be done by refill_inactive_scan().
+ */
+ SetPageReferenced(page);
+ } else {
+ /*
+ * The page is not on the active list, so
+ * we have to do the page aging ourselves.
+ */
+ age_page_up(page);
+ }
goto out_failed;
}
+ if (!onlist)
+ /* The page is still mapped, so it can't be freeable... */
+ age_page_down_ageonly(page);
+
+ /*
+ * If the page is in active use by us, or if the page
+ * is in active use by others, don't unmap it or
+ * (worse) start unneeded IO.
+ */
+ if (page->age > 0)
+ goto out_failed;
if (TryLockPage(page))
goto out_failed;
@@ -79,8 +102,9 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un
set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
UnlockPage(page);
- vma->vm_mm->rss--;
+ mm->rss--;
flush_tlb_page(vma, address);
+ deactivate_page(page);
page_cache_release(page);
goto out_failed;
}
@@ -96,7 +120,7 @@ drop_pte:
* our scan.
*
* Basically, this just makes it possible for us to do
- * some real work in the future in "shrink_mmap()".
+ * some real work in the future in "refill_inactive()".
*/
if (!pte_dirty(pte)) {
flush_cache_page(vma, address);
@@ -116,7 +140,9 @@ drop_pte:
* Don't do any of the expensive stuff if
* we're not really interested in this zone.
*/
- if (page->zone->free_pages > page->zone->pages_high)
+ if (page->zone->free_pages + page->zone->inactive_clean_pages
+ + page->zone->inactive_dirty_pages
+ > page->zone->pages_high + inactive_target)
goto out_unlock;
/*
@@ -134,7 +160,7 @@ drop_pte:
* NOTE NOTE NOTE! This should just set a
* dirty bit in 'page', and just drop the
* pte. All the hard work would be done by
- * shrink_mmap().
+ * refill_inactive().
*
* That would get rid of a lot of problems.
*/
@@ -144,14 +170,15 @@ drop_pte:
struct file *file = vma->vm_file;
if (file) get_file(file);
pte_clear(page_table);
- vma->vm_mm->rss--;
+ mm->rss--;
flush_tlb_page(vma, address);
- vmlist_access_unlock(vma->vm_mm);
+ vmlist_access_unlock(mm);
error = swapout(page, file);
UnlockPage(page);
if (file) fput(file);
if (!error)
goto out_free_success;
+ deactivate_page(page);
page_cache_release(page);
return error;
}
@@ -175,13 +202,14 @@ drop_pte:
add_to_swap_cache(page, entry);
/* Put the swap entry into the pte after the page is in swapcache */
- vma->vm_mm->rss--;
+ mm->rss--;
set_pte(page_table, swp_entry_to_pte(entry));
flush_tlb_page(vma, address);
- vmlist_access_unlock(vma->vm_mm);
+ vmlist_access_unlock(mm);
/* OK, do a physical asynchronous write to swap. */
rw_swap_page(WRITE, page, 0);
+ deactivate_page(page);
out_free_success:
page_cache_release(page);
@@ -230,7 +258,7 @@ static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vm
do {
int result;
- vma->vm_mm->swap_address = address + PAGE_SIZE;
+ mm->swap_address = address + PAGE_SIZE;
result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
if (result)
return result;
@@ -282,7 +310,7 @@ static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsi
if (vma->vm_flags & VM_LOCKED)
return 0;
- pgdir = pgd_offset(vma->vm_mm, address);
+ pgdir = pgd_offset(mm, address);
end = vma->vm_end;
if (address >= end)
@@ -323,17 +351,22 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
int result = swap_out_vma(mm, vma, address, gfp_mask);
if (result)
return result;
+ if (!mm->swap_cnt)
+ goto out_unlock;
vma = vma->vm_next;
if (!vma)
break;
address = vma->vm_start;
}
}
+ /* Reset to 0 when we reach the end of address space */
+ mm->swap_address = 0;
+ mm->swap_cnt = 0;
+
+out_unlock:
vmlist_access_unlock(mm);
/* We didn't find anything for the process */
- mm->swap_cnt = 0;
- mm->swap_address = 0;
return 0;
}
@@ -342,7 +375,10 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
* N.B. This function returns only 0 or 1. Return values != 1 from
* the lower level routines result in continued processing.
*/
-static int swap_out(unsigned int priority, int gfp_mask)
+#define SWAP_SHIFT 5
+#define SWAP_MIN 8
+
+static int swap_out(unsigned int priority, int gfp_mask, unsigned long idle_time)
{
struct task_struct * p;
int counter;
@@ -363,7 +399,7 @@ static int swap_out(unsigned int priority, int gfp_mask)
* Think of swap_cnt as a "shadow rss" - it tells us which process
* we want to page out (always try largest first).
*/
- counter = (nr_threads << 2) >> (priority >> 2);
+ counter = (nr_threads << SWAP_SHIFT) >> priority;
if (counter < 1)
counter = 1;
@@ -372,6 +408,7 @@ static int swap_out(unsigned int priority, int gfp_mask)
struct mm_struct *best = NULL;
int pid = 0;
int assign = 0;
+ int found_task = 0;
select:
read_lock(&tasklist_lock);
p = init_task.next_task;
@@ -381,9 +418,17 @@ static int swap_out(unsigned int priority, int gfp_mask)
continue;
if (mm->rss <= 0)
continue;
+ /* Skip tasks which haven't slept long enough yet when idle-swapping. */
+ if (idle_time && !assign && (!(p->state & TASK_INTERRUPTIBLE) ||
+ time_after(p->sleep_time + idle_time * HZ, jiffies)))
+ continue;
+ found_task++;
/* Refresh swap_cnt? */
- if (assign == 1)
- mm->swap_cnt = mm->rss;
+ if (assign == 1) {
+ mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
+ if (mm->swap_cnt < SWAP_MIN)
+ mm->swap_cnt = SWAP_MIN;
+ }
if (mm->swap_cnt > max_cnt) {
max_cnt = mm->swap_cnt;
best = mm;
@@ -392,7 +437,7 @@ static int swap_out(unsigned int priority, int gfp_mask)
}
read_unlock(&tasklist_lock);
if (!best) {
- if (!assign) {
+ if (!assign && found_task > 0) {
assign = 1;
goto select;
}
@@ -418,50 +463,409 @@ out:
return __ret;
}
-/*
- * Check if there is any memory pressure (free_pages < pages_low)
+
+/**
+ * reclaim_page - reclaims one page from the inactive_clean list
+ * @zone: reclaim a page from this zone
+ *
+ * The pages on the inactive_clean can be instantly reclaimed.
+ * The tests look impressive, but most of the time we'll grab
+ * the first page of the list and exit successfully.
*/
-static inline int memory_pressure(void)
+struct page * reclaim_page(zone_t * zone)
{
- pg_data_t *pgdat = pgdat_list;
+ struct page * page = NULL;
+ struct list_head * page_lru;
+ int maxscan;
- do {
- int i;
- for(i = 0; i < MAX_NR_ZONES; i++) {
- zone_t *zone = pgdat->node_zones+ i;
- if (zone->size &&
- zone->free_pages < zone->pages_low)
- return 1;
+ /*
+ * We only need the pagemap_lru_lock if we don't reclaim the page,
+ * but we have to grab the pagecache_lock before the pagemap_lru_lock
+ * to avoid deadlocks and most of the time we'll succeed anyway.
+ */
+ spin_lock(&pagecache_lock);
+ spin_lock(&pagemap_lru_lock);
+ maxscan = zone->inactive_clean_pages;
+ while ((page_lru = zone->inactive_clean_list.prev) !=
+ &zone->inactive_clean_list && maxscan--) {
+ page = list_entry(page_lru, struct page, lru);
+
+ /* Wrong page on list?! (list corruption, should not happen) */
+ if (!PageInactiveClean(page)) {
+ printk("VM: reclaim_page, wrong page on list.\n");
+ list_del(page_lru);
+ page->zone->inactive_clean_pages--;
+ continue;
}
- pgdat = pgdat->node_next;
- } while (pgdat);
- return 0;
+ /* Page is or was in use? Move it to the active list. */
+ if (PageTestandClearReferenced(page) || page->age > 0 ||
+ (!page->buffers && page_count(page) > 1)) {
+ del_page_from_inactive_clean_list(page);
+ add_page_to_active_list(page);
+ continue;
+ }
+
+ /* The page is dirty or locked; move it to the inactive_dirty list. */
+ if (page->buffers || TryLockPage(page)) {
+ del_page_from_inactive_clean_list(page);
+ add_page_to_inactive_dirty_list(page);
+ continue;
+ }
+
+ /* OK, remove the page from the caches. */
+ if (PageSwapCache(page)) {
+ __delete_from_swap_cache(page);
+ goto found_page;
+ }
+
+ if (page->mapping) {
+ __remove_inode_page(page);
+ goto found_page;
+ }
+
+ /* We should never ever get here. */
+ printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
+ list_del(page_lru);
+ zone->inactive_clean_pages--;
+ UnlockPage(page);
+ }
+ /* Reset page pointer, maybe we encountered an unfreeable page. */
+ page = NULL;
+ goto out;
+
+found_page:
+ del_page_from_inactive_clean_list(page);
+ UnlockPage(page);
+ page->age = PAGE_AGE_START;
+ if (page_count(page) != 1)
+ printk("VM: reclaim_page, found page with count %d!\n",
+ page_count(page));
+out:
+ spin_unlock(&pagemap_lru_lock);
+ spin_unlock(&pagecache_lock);
+ memory_pressure++;
+ return page;
+}
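
The order of the tests in reclaim_page() is the interesting part: referenced, aged or shared pages go back to the active list, buffer or locked pages go to inactive_dirty, and only clean cache pages are actually handed out. The sketch below is a standalone userspace model of that ordering; struct fake_page, classify() and the verdict enum are invented stand-ins, not kernel interfaces:

	#include <stdbool.h>

	/* Simplified stand-ins for the fields reclaim_page() inspects. */
	struct fake_page {
		bool referenced;	/* PageTestandClearReferenced() */
		int  age;
		int  count;		/* page_count() */
		bool has_buffers;	/* page->buffers != NULL */
		bool locked;		/* TryLockPage() would fail */
		bool swap_cache;	/* PageSwapCache() */
		bool has_mapping;	/* page->mapping != NULL */
	};

	enum verdict { MOVE_TO_ACTIVE, MOVE_TO_INACTIVE_DIRTY, RECLAIM, UNKNOWN };

	/* Mirrors the order of the tests inside the scan loop. */
	static enum verdict classify(const struct fake_page *p)
	{
		/* Recently used, aged, or mapped by someone else: keep it. */
		if (p->referenced || p->age > 0 || (!p->has_buffers && p->count > 1))
			return MOVE_TO_ACTIVE;

		/* Dirty buffers or a held lock: page_launder() has to run first. */
		if (p->has_buffers || p->locked)
			return MOVE_TO_INACTIVE_DIRTY;

		/* Clean and in a known cache: can be handed out right away. */
		if (p->swap_cache || p->has_mapping)
			return RECLAIM;

		return UNKNOWN;		/* "should never ever get here" */
	}

	int main(void)
	{
		struct fake_page clean = { .count = 1, .has_mapping = true };
		return classify(&clean) == RECLAIM ? 0 : 1;
	}
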
+
+/**
+ * page_launder - clean dirty inactive pages, move to inactive_clean list
+ * @gfp_mask: what operations we are allowed to do
+ * @sync: should we wait synchronously for the cleaning of pages
+ *
+ * When this function is called, we are most likely low on free +
+ * inactive_clean pages. Since we want to refill those pages as
+ * soon as possible, we'll make two loops over the inactive list,
+ * one to move the already cleaned pages to the inactive_clean lists
+ * and one to (often asynchronously) clean the dirty inactive pages.
+ *
+ * In situations where kswapd cannot keep up, user processes will
+ * end up calling this function. Since the user process needs to
+ * have a page before it can continue with its allocation, we'll
+ * do synchronous page flushing in that case.
+ *
+ * This code is heavily inspired by the FreeBSD source code. Thanks
+ * go out to Matthew Dillon.
+ */
+#define MAX_LAUNDER (4 * (1 << page_cluster))
+int page_launder(int gfp_mask, int sync)
+{
+ int launder_loop, maxscan, cleaned_pages, maxlaunder;
+ int can_get_io_locks;
+ struct list_head * page_lru;
+ struct page * page;
+
+ /*
+ * We can only grab the IO locks (eg. for flushing dirty
+ * buffers to disk) if __GFP_IO is set.
+ */
+ can_get_io_locks = gfp_mask & __GFP_IO;
+
+ launder_loop = 0;
+ maxlaunder = 0;
+ cleaned_pages = 0;
+
+dirty_page_rescan:
+ spin_lock(&pagemap_lru_lock);
+ maxscan = nr_inactive_dirty_pages;
+ while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
+ maxscan-- > 0) {
+ page = list_entry(page_lru, struct page, lru);
+
+ /* Wrong page on list?! (list corruption, should not happen) */
+ if (!PageInactiveDirty(page)) {
+ printk("VM: page_launder, wrong page on list.\n");
+ list_del(page_lru);
+ nr_inactive_dirty_pages--;
+ page->zone->inactive_dirty_pages--;
+ continue;
+ }
+
+ /* Page is or was in use? Move it to the active list. */
+ if (PageTestandClearReferenced(page) || page->age > 0 ||
+ (!page->buffers && page_count(page) > 1) ||
+ page_ramdisk(page)) {
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_active_list(page);
+ continue;
+ }
+
+ /*
+ * The page is locked. IO in progress?
+ * Move it to the back of the list.
+ */
+ if (TryLockPage(page)) {
+ list_del(page_lru);
+ list_add(page_lru, &inactive_dirty_list);
+ continue;
+ }
+
+ /*
+ * If the page has buffers, try to free the buffer mappings
+ * associated with this page. If we succeed we either free
+ * the page (in case it was a buffercache only page) or we
+ * move the page to the inactive_clean list.
+ *
+ * On the first round, we should free all previously cleaned
+ * buffer pages.
+ */
+ if (page->buffers) {
+ int wait, clearedbuf;
+ int freed_page = 0;
+ /*
+ * Since we might be doing disk IO, we have to
+ * drop the spinlock and take an extra reference
+ * on the page so it doesn't go away from under us.
+ */
+ del_page_from_inactive_dirty_list(page);
+ page_cache_get(page);
+ spin_unlock(&pagemap_lru_lock);
+
+ /* Will we do (asynchronous) IO? */
+ if (launder_loop && maxlaunder == 0 && sync)
+ wait = 2; /* Synchronous IO */
+ else if (launder_loop && maxlaunder-- > 0)
+ wait = 1; /* Async IO */
+ else
+ wait = 0; /* No IO */
+
+ /* Try to free the page buffers. */
+ clearedbuf = try_to_free_buffers(page, wait);
+
+ /*
+ * Re-take the spinlock. Note that we cannot
+ * unlock the page yet since we're still
+ * accessing the struct page here...
+ */
+ spin_lock(&pagemap_lru_lock);
+
+ /* The buffers were not freed. */
+ if (!clearedbuf) {
+ add_page_to_inactive_dirty_list(page);
+
+ /* The page was only in the buffer cache. */
+ } else if (!page->mapping) {
+ atomic_dec(&buffermem_pages);
+ freed_page = 1;
+ cleaned_pages++;
+
+ /* The page has more users besides the cache and us. */
+ } else if (page_count(page) > 2) {
+ add_page_to_active_list(page);
+
+ /* OK, we "created" a freeable page. */
+ } else /* page->mapping && page_count(page) == 2 */ {
+ add_page_to_inactive_clean_list(page);
+ cleaned_pages++;
+ }
+
+ /*
+ * Unlock the page and drop the extra reference.
+ * We can only do it here because we are accessing
+ * the page struct above.
+ */
+ UnlockPage(page);
+ page_cache_release(page);
+
+ /*
+ * If we're freeing buffer cache pages, stop when
+ * we've got enough free memory.
+ */
+ if (freed_page && !free_shortage())
+ break;
+ continue;
+ } else if (page->mapping && !PageDirty(page)) {
+ /*
+ * If a page had an extra reference in
+ * deactivate_page(), we will find it here.
+ * Now the page is really freeable, so we
+ * move it to the inactive_clean list.
+ */
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_inactive_clean_list(page);
+ UnlockPage(page);
+ cleaned_pages++;
+ } else {
+ /*
+ * OK, we don't know what to do with the page.
+ * It's no use keeping it here, so we move it to
+ * the active list.
+ */
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_active_list(page);
+ UnlockPage(page);
+ }
+ }
+ spin_unlock(&pagemap_lru_lock);
+
+ /*
+ * If we don't have enough free pages, we loop back once
+ * to queue the dirty pages for writeout. When we were called
+ * by a user process (that /needs/ a free page) and we didn't
+ * free anything yet, we wait synchronously on the writeout of
+ * the remaining dirty pages.
+ *
+ * We also wake up bdflush, since bdflush should, under most
+ * loads, flush out the dirty pages before we have to wait on
+ * IO.
+ */
+ if (can_get_io_locks && !launder_loop && free_shortage()) {
+ launder_loop = 1;
+ /* If we cleaned pages, never do synchronous IO. */
+ if (cleaned_pages)
+ sync = 0;
+ /* We only do a few "out of order" flushes. */
+ maxlaunder = MAX_LAUNDER;
+ /* Kflushd takes care of the rest. */
+ wakeup_bdflush(0);
+ goto dirty_page_rescan;
+ }
+
+ /* Return the number of pages moved to the inactive_clean list. */
+ return cleaned_pages;
+}
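
The core of the laundering policy is how the IO mode is chosen per buffer page: no IO on the first pass, asynchronous writeout on the second pass while the MAX_LAUNDER budget lasts, and synchronous writeout only when the caller asked for it and the budget is gone. A minimal userspace sketch of just that selection (pick_wait_mode() and the maxlaunder value of 2 are invented for the example):

	#include <stdio.h>

	/* Userspace model of how page_launder() picks the IO mode for a
	 * buffer page (0 = no IO, 1 = async writeout, 2 = synchronous). */
	static int pick_wait_mode(int launder_loop, int *maxlaunder, int sync)
	{
		if (launder_loop && *maxlaunder == 0 && sync)
			return 2;		/* last resort: wait for the writeout */
		if (launder_loop && (*maxlaunder)-- > 0)
			return 1;		/* queue async IO, budget permitting */
		return 0;			/* first pass: only pick up clean pages */
	}

	int main(void)
	{
		int maxlaunder = 2;	/* pretend MAX_LAUNDER evaluated to 2 */

		/* The first pass never issues IO... */
		printf("%d\n", pick_wait_mode(0, &maxlaunder, 1));	/* 0 */
		/* ...the second pass does async IO until the budget runs out... */
		printf("%d\n", pick_wait_mode(1, &maxlaunder, 1));	/* 1 */
		printf("%d\n", pick_wait_mode(1, &maxlaunder, 1));	/* 1 */
		/* ...and only then falls back to synchronous IO (if sync). */
		printf("%d\n", pick_wait_mode(1, &maxlaunder, 1));	/* 2 */
		return 0;
	}
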
+
+/**
+ * refill_inactive_scan - scan the active list and find pages to deactivate
+ * @priority: the priority at which to scan
+ * @oneshot: exit after deactivating one page
+ *
+ * This function will scan a portion of the active list to find
+ * unused pages; those pages will then be moved to the inactive list.
+ */
+int refill_inactive_scan(unsigned int priority, int oneshot)
+{
+ struct list_head * page_lru;
+ struct page * page;
+ int maxscan, page_active = 0;
+ int ret = 0;
+
+ /* Take the lock while messing with the list... */
+ spin_lock(&pagemap_lru_lock);
+ maxscan = nr_active_pages >> priority;
+ while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
+ page = list_entry(page_lru, struct page, lru);
+
+ /* Wrong page on list?! (list corruption, should not happen) */
+ if (!PageActive(page)) {
+ printk("VM: refill_inactive, wrong page on list.\n");
+ list_del(page_lru);
+ nr_active_pages--;
+ continue;
+ }
+
+ /* Do aging on the pages. */
+ if (PageTestandClearReferenced(page)) {
+ age_page_up_nolock(page);
+ page_active = 1;
+ } else {
+ age_page_down_ageonly(page);
+ /*
+ * Since we don't hold a reference on the page
+ * ourselves, we have to do our test a bit more
+ * strictly than deactivate_page() does. This is needed
+ * since otherwise the system could hang shuffling
+ * unfreeable pages from the active list to the
+ * inactive_dirty list and back again...
+ *
+ * SUBTLE: we can have buffer pages with count 1.
+ */
+ if (page_count(page) <= (page->buffers ? 2 : 1)) {
+ deactivate_page_nolock(page);
+ page_active = 0;
+ } else {
+ page_active = 1;
+ }
+ }
+ /*
+ * If the page is still on the active list, move it
+ * to the other end of the list. Otherwise it was
+ * deactivated by age_page_down and we exit successfully.
+ */
+ if (page_active || PageActive(page)) {
+ list_del(page_lru);
+ list_add(page_lru, &active_list);
+ } else {
+ ret = 1;
+ if (oneshot)
+ break;
+ }
+ }
+ spin_unlock(&pagemap_lru_lock);
+
+ return ret;
}
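
Per page, refill_inactive_scan() either ages the page up (when the referenced bit was set) or ages it down and tries to deactivate it, taking care not to deactivate pages someone else still holds. Below is a rough userspace model of that step; struct fake_page and the aging constants only mirror the shape of the real age_page_up_nolock()/age_page_down_ageonly()/deactivate_page_nolock() helpers, whose actual values live in the VM headers:

	#include <stdbool.h>

	/* Stand-in aging parameters; not the authoritative kernel values. */
	#define PAGE_AGE_MAX	64
	#define PAGE_AGE_ADV	3

	struct fake_page {
		bool referenced;	/* PageTestandClearReferenced() */
		int  age;
		int  count;		/* page_count() */
		bool has_buffers;
	};

	/* Rough model of one step of the scan: returns true if the page
	 * stays on the active list, false if it would be deactivated. */
	static bool age_one_page(struct fake_page *p)
	{
		if (p->referenced) {
			p->age += PAGE_AGE_ADV;		/* age_page_up_nolock() */
			if (p->age > PAGE_AGE_MAX)
				p->age = PAGE_AGE_MAX;
			return true;
		}
		p->age /= 2;				/* age_page_down_ageonly() */
		/* Only deactivate pages nobody else holds; buffer pages carry
		 * one extra reference for their buffer heads. */
		if (p->age == 0 && p->count <= (p->has_buffers ? 2 : 1))
			return false;			/* deactivate_page_nolock() */
		return true;
	}

	int main(void)
	{
		struct fake_page p = { .age = 1, .count = 1 };
		return age_one_page(&p) ? 1 : 0;	/* deactivated -> exit 0 */
	}
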
/*
- * Check if all zones have recently had memory_pressure (zone_wake_kswapd)
+ * Check if there are zones with a severe shortage of free pages,
+ * or if all zones have a minor shortage.
*/
-static inline int keep_kswapd_awake(void)
+int free_shortage(void)
{
- int all_recent = 1;
pg_data_t *pgdat = pgdat_list;
+ int sum = 0;
+ int freeable = nr_free_pages() + nr_inactive_clean_pages();
+ int freetarget = freepages.high + inactive_target / 3;
+ /* Are we low on free pages globally? */
+ if (freeable < freetarget)
+ return freetarget - freeable;
+
+ /* If not, are we very low on any particular zone? */
do {
int i;
for(i = 0; i < MAX_NR_ZONES; i++) {
zone_t *zone = pgdat->node_zones+ i;
- if (zone->size) {
- if (zone->free_pages < zone->pages_min)
- return 1;
- if (!zone->zone_wake_kswapd)
- all_recent = 0;
+ if (zone->size && (zone->inactive_clean_pages +
+ zone->free_pages < zone->pages_min)) {
+ sum += zone->pages_min;
+ sum -= zone->free_pages;
+ sum -= zone->inactive_clean_pages;
}
}
pgdat = pgdat->node_next;
} while (pgdat);
- return all_recent;
+ return sum;
+}
+
+/*
+ * How many inactive pages are we short?
+ */
+int inactive_shortage(void)
+{
+ int shortage = 0;
+
+ shortage += freepages.high;
+ shortage += inactive_target;
+ shortage -= nr_free_pages();
+ shortage -= nr_inactive_clean_pages();
+ shortage -= nr_inactive_dirty_pages;
+
+ if (shortage > 0)
+ return shortage;
+
+ return 0;
}
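
With some invented numbers, the two shortage calculations look like this (only the global part of free_shortage() is modelled; the per-zone fallback is omitted):

	#include <stdio.h>

	/* All numbers are invented, just to show the arithmetic. */
	int main(void)
	{
		int freepages_high = 1024, inactive_target = 3072;
		int nr_free = 600, nr_inactive_clean = 300, nr_inactive_dirty = 1000;

		/* Global part of free_shortage(): freeable pages vs. the target. */
		int freeable = nr_free + nr_inactive_clean;
		int freetarget = freepages_high + inactive_target / 3;
		int free_short = freeable < freetarget ? freetarget - freeable : 0;

		/* inactive_shortage(): distance from the inactive target. */
		int inact_short = freepages_high + inactive_target
				- nr_free - nr_inactive_clean - nr_inactive_dirty;
		if (inact_short < 0)
			inact_short = 0;

		printf("free shortage:     %d pages\n", free_short);	/* 1148 */
		printf("inactive shortage: %d pages\n", inact_short);	/* 2196 */
		return 0;
	}
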
/*
@@ -472,96 +876,140 @@ static inline int keep_kswapd_awake(void)
* We want to try to free "count" pages, and we want to
* cluster them so that we get good swap-out behaviour.
*
- * Don't try _too_ hard, though. We don't want to have bad
- * latency.
- *
- * Note: only called by kswapd and try_to_free_pages
- * both can WAIT at top level.
+ * OTOH, if we're a user process (and not kswapd), we
+ * really care about latency. In that case we don't try
+ * to free too many pages.
*/
-#define FREE_COUNT 8
-#define SWAP_COUNT 16
-static int do_try_to_free_pages(unsigned int gfp_mask)
+static int refill_inactive(unsigned int gfp_mask, int user)
{
- int priority;
- int count = FREE_COUNT;
- int swap_count;
+ int priority, count, start_count, made_progress;
+ unsigned long idle_time;
+
+ count = inactive_shortage() + free_shortage();
+ if (user)
+ count = (1 << page_cluster);
+ start_count = count;
/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);
- priority = 64;
+ /*
+ * Calculate the minimum time (in seconds) a process must
+ * have slept before we consider it for idle swapping.
+ * This must be the number of seconds it takes to go through
+ * all of the cache. Doing this idle swapping makes the VM
+ * smoother once we start hitting swap.
+ */
+ idle_time = atomic_read(&page_cache_size);
+ idle_time += atomic_read(&buffermem_pages);
+ idle_time /= (inactive_target + 1);
+
+ priority = 6;
do {
+ made_progress = 0;
+
if (current->need_resched) {
+ __set_current_state(TASK_RUNNING);
schedule();
- /* time has passed - pressure too? */
- if (!memory_pressure())
- goto done;
}
- while (shrink_mmap(priority, gfp_mask)) {
- if (!--count)
+ while (refill_inactive_scan(priority, 1) ||
+ swap_out(priority, gfp_mask, idle_time)) {
+ made_progress = 1;
+ if (--count <= 0)
goto done;
}
- /* check if mission completed */
- if (!keep_kswapd_awake())
- goto done;
+ /*
+ * Don't be too light against the d/i cache, since
+ * refill_inactive() almost never fails when there's
+ * really plenty of memory free.
+ */
+ shrink_dcache_memory(priority, gfp_mask);
+ shrink_icache_memory(priority, gfp_mask);
/* Try to get rid of some shared memory pages.. */
- if (gfp_mask & __GFP_IO) {
- /*
- * don't be too light against the d/i cache since
- * shrink_mmap() almost never fail when there's
- * really plenty of memory free.
- */
- count -= shrink_dcache_memory(priority, gfp_mask);
- count -= shrink_icache_memory(priority, gfp_mask);
- /*
- * Not currently working, see fixme in shrink_?cache_memory
- * In the inner funtions there is a comment:
- * "To help debugging, a zero exit status indicates
- * all slabs were released." (-arca?)
- * lets handle it in a primitive but working way...
- * if (count <= 0)
- * goto done;
- */
- if (!keep_kswapd_awake())
+ while (shm_swap(priority, gfp_mask)) {
+ made_progress = 1;
+ if (--count <= 0)
goto done;
-
- while (shm_swap(priority, gfp_mask)) {
- if (!--count)
- goto done;
- }
}
/*
* Then, try to page stuff out..
- *
- * This will not actually free any pages (they get
- * put in the swap cache), so we must not count this
- * as a "count" success.
*/
- swap_count = SWAP_COUNT;
- while (swap_out(priority, gfp_mask))
- if (--swap_count < 0)
- break;
+ while (swap_out(priority, gfp_mask, 0)) {
+ made_progress = 1;
+ if (--count <= 0)
+ goto done;
+ }
- } while (--priority >= 0);
+ /*
+ * If we either have enough free memory, or if
+ * page_launder() will be able to make enough
+ * free memory, then stop.
+ */
+ if (!inactive_shortage() || !free_shortage())
+ goto done;
+
+ /*
+ * Only switch to a lower "priority" if we
+ * didn't make any useful progress in the
+ * last loop.
+ */
+ if (!made_progress)
+ priority--;
+ } while (priority >= 0);
- /* Always end on a shrink_mmap.., may sleep... */
- while (shrink_mmap(0, gfp_mask)) {
- if (!--count)
+ /* Always end on a refill_inactive.., may sleep... */
+ while (refill_inactive_scan(0, 1)) {
+ if (--count <= 0)
goto done;
}
- /* Return 1 if any page is freed, or
- * there are no more memory pressure */
- return (count < FREE_COUNT || !keep_kswapd_awake());
-
+
done:
- return 1;
+ return (count < start_count);
+}
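
The setup at the top of refill_inactive() boils down to two things: how many pages this call should try to move (the full shortage for kswapd, a single page cluster for a user process) and how long a task must have been asleep before the idle-swap pass in swap_out() will touch it. A small userspace sketch with made-up page counts:

	#include <stdio.h>

	/* All page counts are invented example inputs. */
	int main(void)
	{
		int page_cluster = 4;		/* stand-in for the global tunable */
		long page_cache_size = 20000, buffermem_pages = 4000;
		long inactive_target = 3000;
		int shortage = 2500;	/* inactive_shortage() + free_shortage() */

		/* kswapd works off the whole shortage; a user process that just
		 * needs one allocation frees only a small cluster of pages. */
		int count_kswapd = shortage;
		int count_user = 1 << page_cluster;

		/* Idle-swap threshold: seconds a task must have slept before the
		 * idle pass of swap_out() will touch it. */
		long idle_time = (page_cache_size + buffermem_pages)
				/ (inactive_target + 1);

		printf("kswapd target: %d pages, user target: %d pages\n",
		       count_kswapd, count_user);
		printf("idle-swap threshold: %ld seconds\n", idle_time);	/* 7 */
		return 0;
	}
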
+
+static int do_try_to_free_pages(unsigned int gfp_mask, int user)
+{
+ int ret = 0;
+
+ /*
+ * If we're low on free pages, move pages from the
+ * inactive_dirty list to the inactive_clean list.
+ *
+ * Usually bdflush will have pre-cleaned the pages
+ * before we get around to moving them to the other
+ * list, so this is a relatively cheap operation.
+ */
+ if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +
+ nr_inactive_clean_pages())
+ ret += page_launder(gfp_mask, user);
+
+ /*
+ * If needed, we move pages from the active list
+ * to the inactive list. We also "eat" pages from
+ * the inode and dentry cache whenever we do this.
+ */
+ if (free_shortage() || inactive_shortage()) {
+ shrink_dcache_memory(6, gfp_mask);
+ shrink_icache_memory(6, gfp_mask);
+ ret += refill_inactive(gfp_mask, user);
+ } else {
+ /*
+ * Reclaim unused slab cache memory.
+ */
+ kmem_cache_reap(gfp_mask);
+ ret = 1;
+ }
+
+ return ret;
}
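
The decision order in do_try_to_free_pages() can be modelled as a tiny standalone program; the inputs below are invented shortage values, not real kernel state:

	#include <stdio.h>

	/* Toy model of the decision order; the inputs are invented values. */
	static void free_pages_model(int free_short, int inact_short,
				     int inactive_dirty, int free, int inactive_clean)
	{
		if (free_short || inactive_dirty > free + inactive_clean)
			puts("page_launder(): move pre-cleaned dirty pages across");
		if (free_short || inact_short)
			puts("shrink d/i caches, then refill_inactive()");
		else
			puts("no shortage: just kmem_cache_reap()");
	}

	int main(void)
	{
		/* Plenty of memory: only the slab caches get trimmed. */
		free_pages_model(0, 0, 100, 5000, 1000);
		/* Under pressure: laundering and refilling both run. */
		free_pages_model(800, 1200, 3000, 200, 100);
		return 0;
	}
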
DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
+DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
+struct task_struct *kswapd_task;
/*
* The background pageout daemon, started as a kernel thread
@@ -584,6 +1032,7 @@ int kswapd(void *unused)
tsk->pgrp = 1;
strcpy(tsk->comm, "kswapd");
sigfillset(&tsk->blocked);
+ kswapd_task = tsk;
/*
* Tell the memory management that we're a "memory allocator",
@@ -599,54 +1048,166 @@ int kswapd(void *unused)
*/
tsk->flags |= PF_MEMALLOC;
+ /*
+ * Kswapd main loop.
+ */
for (;;) {
- if (!keep_kswapd_awake()) {
- interruptible_sleep_on(&kswapd_wait);
+ static int recalc = 0;
+
+ /* If needed, try to free some memory. */
+ if (inactive_shortage() || free_shortage()) {
+ int wait = 0;
+ /* Do we need to do some synchronous flushing? */
+ if (waitqueue_active(&kswapd_done))
+ wait = 1;
+ do_try_to_free_pages(GFP_KSWAPD, wait);
+ }
+
+ /*
+ * Do some (very minimal) background scanning. This
+ * will scan all pages on the active list once
+ * every minute. This clears old referenced bits
+ * and moves unused pages to the inactive list.
+ */
+ refill_inactive_scan(6, 0);
+
+ /* Once a second, recalculate some VM stats. */
+ if (time_after(jiffies, recalc + HZ)) {
+ recalc = jiffies;
+ recalculate_vm_stats();
}
- do_try_to_free_pages(GFP_KSWAPD);
+ /*
+ * Wake up everybody waiting for free memory
+ * and unplug the disk queue.
+ */
+ wake_up_all(&kswapd_done);
+ run_task_queue(&tq_disk);
+
+ /*
+ * We go to sleep if either the free page shortage
+ * or the inactive page shortage is gone. We do this
+ * because:
+ * 1) we need no more free pages or
+ * 2) the inactive pages need to be flushed to disk,
+ * it wouldn't help to eat CPU time now ...
+ *
+ * We go to sleep for one second, but if it's needed
+ * we'll be woken up earlier...
+ */
+ if (!free_shortage() || !inactive_shortage())
+ interruptible_sleep_on_timeout(&kswapd_wait, HZ);
+ /*
+ * TODO: insert out of memory check & oom killer
+ * invocation in an else branch here.
+ */
}
}
+void wakeup_kswapd(int block)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ if (current == kswapd_task)
+ return;
+
+ if (!block) {
+ if (waitqueue_active(&kswapd_wait))
+ wake_up(&kswapd_wait);
+ return;
+ }
+
+ /*
+ * Kswapd could wake us up before we get a chance
+ * to sleep, so we have to be very careful here to
+ * prevent SMP races...
+ */
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue(&kswapd_done, &wait);
+
+ if (waitqueue_active(&kswapd_wait))
+ wake_up(&kswapd_wait);
+ schedule();
+
+ remove_wait_queue(&kswapd_done, &wait);
+ __set_current_state(TASK_RUNNING);
+}
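
wakeup_kswapd() registers the caller on kswapd_done and sets its task state before kicking kswapd, so the wakeup from wake_up_all() cannot slip in between "wake kswapd" and "go to sleep". A loose userspace analogue of the same lost-wakeup problem, using a pthread condition variable rather than the kernel's wait queues (the worker thread stands in for kswapd):

	/* Only an analogy: the requester takes the lock and re-checks the
	 * predicate before sleeping, so the completion cannot be missed. */
	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
	static int work_done;

	static void *worker(void *unused)
	{
		(void)unused;
		pthread_mutex_lock(&lock);
		work_done = 1;			/* "kswapd freed some memory" */
		pthread_cond_broadcast(&done);	/* wake_up_all(&kswapd_done)   */
		pthread_mutex_unlock(&lock);
		return NULL;
	}

	int main(void)
	{
		pthread_t tid;

		pthread_mutex_lock(&lock);	/* register interest first...  */
		pthread_create(&tid, NULL, worker, NULL);
		while (!work_done)		/* ...then sleep; the wakeup   */
			pthread_cond_wait(&done, &lock);	/* can't be lost */
		pthread_mutex_unlock(&lock);

		pthread_join(tid, NULL);
		puts("woken after the work was done");
		return 0;
	}
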
+
/*
* Called by non-kswapd processes when they want more
- * memory.
- *
- * In a perfect world, this should just wake up kswapd
- * and return. We don't actually want to swap stuff out
- * from user processes, because the locking issues are
- * nasty to the extreme (file write locks, and MM locking)
- *
- * One option might be to let kswapd do all the page-out
- * and VM page table scanning that needs locking, and this
- * process thread could do just the mmap shrink stage that
- * can be done by just dropping cached pages without having
- * any deadlock issues.
+ * memory but are unable to sleep on kswapd because
+ * they might be holding some IO locks ...
*/
int try_to_free_pages(unsigned int gfp_mask)
{
- int retval = 1;
+ int ret = 1;
if (gfp_mask & __GFP_WAIT) {
- current->state = TASK_RUNNING;
current->flags |= PF_MEMALLOC;
- retval = do_try_to_free_pages(gfp_mask);
+ ret = do_try_to_free_pages(gfp_mask, 1);
current->flags &= ~PF_MEMALLOC;
}
- /* someone needed memory that kswapd had not provided
- * make sure kswapd runs, should not happen often */
- if (waitqueue_active(&kswapd_wait))
- wake_up_interruptible(&kswapd_wait);
+ return ret;
+}
+
+DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
+/*
+ * Kreclaimd will move pages from the inactive_clean list to the
+ * free list, in order to keep atomic allocations possible under
+ * all circumstances, even when kswapd is blocked on IO.
+ */
+int kreclaimd(void *unused)
+{
+ struct task_struct *tsk = current;
+ pg_data_t *pgdat;
- return retval;
+ tsk->session = 1;
+ tsk->pgrp = 1;
+ strcpy(tsk->comm, "kreclaimd");
+ sigfillset(&tsk->blocked);
+ current->flags |= PF_MEMALLOC;
+
+ while (1) {
+
+ /*
+ * We sleep until someone wakes us up from
+ * page_alloc.c::__alloc_pages().
+ */
+ interruptible_sleep_on(&kreclaimd_wait);
+
+ /*
+ * Move some pages from the inactive_clean lists to
+ * the free lists, if it is needed.
+ */
+ pgdat = pgdat_list;
+ do {
+ int i;
+ for(i = 0; i < MAX_NR_ZONES; i++) {
+ zone_t *zone = pgdat->node_zones + i;
+ if (!zone->size)
+ continue;
+
+ while (zone->free_pages < zone->pages_low) {
+ struct page * page;
+ page = reclaim_page(zone);
+ if (!page)
+ break;
+ __free_page(page);
+ }
+ }
+ pgdat = pgdat->node_next;
+ } while (pgdat);
+ }
}
+
static int __init kswapd_init(void)
{
- printk("Starting kswapd v1.7\n");
+ printk("Starting kswapd v1.8\n");
swap_setup();
- kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+ kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+ kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
return 0;
}