From 012bb3e61e5eced6c610f9e036372bf0c8def2d1 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Thu, 5 Oct 2000 01:18:40 +0000 Subject: Merge with Linux 2.4.0-test9. Please check DECstation, I had a number of rejects to fixup while integrating Linus patches. I also found that this kernel will only boot SMP on Origin; the UP kernel freeze soon after bootup with SCSI timeout messages. I commit this anyway since I found that the last CVS versions had the same problem. --- mm/filemap.c | 303 ++++++++------------- mm/memory.c | 71 ++--- mm/mmap.c | 14 +- mm/mremap.c | 4 + mm/numa.c | 8 +- mm/page_alloc.c | 400 +++++++++++++++++++++------ mm/page_io.c | 3 +- mm/slab.c | 126 +++------ mm/swap.c | 254 +++++++++++++++++- mm/swap_state.c | 6 +- mm/vmalloc.c | 2 +- mm/vmscan.c | 821 +++++++++++++++++++++++++++++++++++++++++++++++--------- 12 files changed, 1480 insertions(+), 532 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 977225432..6aca16409 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -44,9 +44,8 @@ atomic_t page_cache_size = ATOMIC_INIT(0); unsigned int page_hash_bits; struct page **page_hash_table; -struct list_head lru_cache; -static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED; +spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED; /* * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with * the pagemap_lru_lock held. @@ -92,7 +91,7 @@ static inline int sync_page(struct page *page) * sure the page is locked and that nobody else uses it - or that usage * is safe. */ -static inline void __remove_inode_page(struct page *page) +void __remove_inode_page(struct page *page) { remove_page_from_inode_queue(page); remove_page_from_hash_queue(page); @@ -146,9 +145,40 @@ void invalidate_inode_pages(struct inode * inode) spin_unlock(&pagecache_lock); } -/* +static inline void truncate_partial_page(struct page *page, unsigned partial) +{ + memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); + + if (page->buffers) + block_flushpage(page, partial); + +} + +static inline void truncate_complete_page(struct page *page) +{ + if (!page->buffers || block_flushpage(page, 0)) + lru_cache_del(page); + + /* + * We remove the page from the page cache _after_ we have + * destroyed all buffer-cache references to it. Otherwise some + * other process might think this inode page is not in the + * page cache and creates a buffer-cache alias to it causing + * all sorts of fun problems ... + */ + ClearPageDirty(page); + remove_inode_page(page); + page_cache_release(page); +} + +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from with to truncate + * * Truncate the page cache at a set offset, removing the pages * that are beyond that offset (and zeroing out partial pages). + * If any page is locked we wait for it to become unlocked. */ void truncate_inode_pages(struct address_space * mapping, loff_t lstart) { @@ -168,11 +198,10 @@ repeat: page = list_entry(curr, struct page, list); curr = curr->next; - offset = page->index; - /* page wholly truncated - free it */ - if (offset >= start) { + /* Is one of the pages to truncate? 
*/ + if ((offset >= start) || (partial && (offset + 1) == start)) { if (TryLockPage(page)) { page_cache_get(page); spin_unlock(&pagecache_lock); @@ -183,23 +212,14 @@ repeat: page_cache_get(page); spin_unlock(&pagecache_lock); - if (!page->buffers || block_flushpage(page, 0)) - lru_cache_del(page); - - /* - * We remove the page from the page cache - * _after_ we have destroyed all buffer-cache - * references to it. Otherwise some other process - * might think this inode page is not in the - * page cache and creates a buffer-cache alias - * to it causing all sorts of fun problems ... - */ - remove_inode_page(page); - ClearPageDirty(page); + if (partial && (offset + 1) == start) { + truncate_partial_page(page, partial); + partial = 0; + } else + truncate_complete_page(page); UnlockPage(page); page_cache_release(page); - page_cache_release(page); /* * We have done things without the pagecache lock, @@ -210,176 +230,10 @@ repeat: */ goto repeat; } - /* - * there is only one partial page possible. - */ - if (!partial) - continue; - - /* and it's the one preceeding the first wholly truncated page */ - if ((offset + 1) != start) - continue; - - /* partial truncate, clear end of page */ - if (TryLockPage(page)) { - spin_unlock(&pagecache_lock); - goto repeat; - } - page_cache_get(page); - spin_unlock(&pagecache_lock); - - memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); - if (page->buffers) - block_flushpage(page, partial); - - partial = 0; - - /* - * we have dropped the spinlock so we have to - * restart. - */ - UnlockPage(page); - page_cache_release(page); - goto repeat; } spin_unlock(&pagecache_lock); } -/* - * nr_dirty represents the number of dirty pages that we will write async - * before doing sync writes. We can only do sync writes if we can - * wait for IO (__GFP_IO set). - */ -int shrink_mmap(int priority, int gfp_mask) -{ - int ret = 0, count, nr_dirty; - struct list_head * page_lru; - struct page * page = NULL; - - count = nr_lru_pages / (priority + 1); - nr_dirty = priority; - - /* we need pagemap_lru_lock for list_del() ... subtle code below */ - spin_lock(&pagemap_lru_lock); - while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) { - page = list_entry(page_lru, struct page, lru); - list_del(page_lru); - - if (PageTestandClearReferenced(page)) - goto dispose_continue; - - count--; - /* - * Avoid unscalable SMP locking for pages we can - * immediate tell are untouchable.. - */ - if (!page->buffers && page_count(page) > 1) - goto dispose_continue; - - if (TryLockPage(page)) - goto dispose_continue; - - /* Release the pagemap_lru lock even if the page is not yet - queued in any lru queue since we have just locked down - the page so nobody else may SMP race with us running - a lru_cache_del() (lru_cache_del() always run with the - page locked down ;). */ - spin_unlock(&pagemap_lru_lock); - - /* avoid freeing the page while it's locked */ - page_cache_get(page); - - /* - * Is it a buffer page? Try to clean it up regardless - * of zone - it's old. - */ - if (page->buffers) { - int wait; - /* - * 0 - free it if can do so without IO - * 1 - start write-out of dirty buffers - * 2 - wait for locked buffers - */ - wait = (gfp_mask & __GFP_IO) ? (nr_dirty-- < 0) ? 
2 : 1 : 0; - if (!try_to_free_buffers(page, wait)) - goto unlock_continue; - /* page was locked, inode can't go away under us */ - if (!page->mapping) { - atomic_dec(&buffermem_pages); - goto made_buffer_progress; - } - } - - /* Take the pagecache_lock spinlock held to avoid - other tasks to notice the page while we are looking at its - page count. If it's a pagecache-page we'll free it - in one atomic transaction after checking its page count. */ - spin_lock(&pagecache_lock); - - /* - * We can't free pages unless there's just one user - * (count == 2 because we added one ourselves above). - */ - if (page_count(page) != 2) - goto cache_unlock_continue; - - /* - * Is it a page swap page? If so, we want to - * drop it if it is no longer used, even if it - * were to be marked referenced.. - */ - if (PageSwapCache(page)) { - spin_unlock(&pagecache_lock); - __delete_from_swap_cache(page); - goto made_inode_progress; - } - - /* - * Page is from a zone we don't care about. - * Don't drop page cache entries in vain. - */ - if (page->zone->free_pages > page->zone->pages_high) - goto cache_unlock_continue; - - /* is it a page-cache page? */ - if (page->mapping) { - if (!PageDirty(page) && !pgcache_under_min()) { - __remove_inode_page(page); - spin_unlock(&pagecache_lock); - goto made_inode_progress; - } - goto cache_unlock_continue; - } - - printk(KERN_ERR "shrink_mmap: unknown LRU page!\n"); - -cache_unlock_continue: - spin_unlock(&pagecache_lock); -unlock_continue: - spin_lock(&pagemap_lru_lock); - UnlockPage(page); - page_cache_release(page); -dispose_continue: - list_add(page_lru, &lru_cache); - } - goto out; - -made_inode_progress: - page_cache_release(page); -made_buffer_progress: - UnlockPage(page); - page_cache_release(page); - ret = 1; - spin_lock(&pagemap_lru_lock); - /* nr_lru_pages needs the spinlock */ - nr_lru_pages--; - -out: - spin_unlock(&pagemap_lru_lock); - - return ret; -} - static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page) { goto inside; @@ -394,7 +248,14 @@ inside: if (page->index == offset) break; } - SetPageReferenced(page); + /* + * Touching the page may move it to the active list. + * If we end up with too few inactive pages, we wake + * up kswapd. + */ + age_page_up(page); + if (inactive_shortage() > inactive_target / 2 && free_shortage()) + wakeup_kswapd(0); not_found: return page; } @@ -626,6 +487,7 @@ void ___wait_on_page(struct page *page) set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (!PageLocked(page)) break; + run_task_queue(&tq_disk); schedule(); } while (PageLocked(page)); tsk->state = TASK_RUNNING; @@ -748,6 +610,53 @@ repeat: #define DEBUG_READAHEAD #endif +/* + * We combine this with read-ahead to deactivate pages when we + * think there's sequential IO going on. Note that this is + * harmless since we don't actually evict the pages from memory + * but just move them to the inactive list. + * + * TODO: + * - make the readahead code smarter + * - move readahead to the VMA level so we can do the same + * trick with mmap() + * + * Rik van Riel, 2000 + */ +static void drop_behind(struct file * file, unsigned long index) +{ + struct inode *inode = file->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + struct page **hash; + struct page *page; + unsigned long start; + + /* Nothing to drop-behind if we're on the first page. 
*/ + if (!index) + return; + + if (index > file->f_rawin) + start = index - file->f_rawin; + else + start = 0; + + /* + * Go backwards from index-1 and drop all pages in the + * readahead window. Since the readahead window may have + * been increased since the last time we were called, we + * stop when the page isn't there. + */ + spin_lock(&pagecache_lock); + while (--index >= start) { + hash = page_hash(mapping, index); + page = __find_page_nolock(mapping, index, *hash); + if (!page) + break; + deactivate_page(page); + } + spin_unlock(&pagecache_lock); +} + /* * Read-ahead profiling information * -------------------------------- @@ -971,6 +880,12 @@ static void generic_file_readahead(int reada_ok, if (filp->f_ramax > max_readahead) filp->f_ramax = max_readahead; + /* + * Move the pages that have already been passed + * to the inactive list. + */ + drop_behind(filp, index); + #ifdef PROFILE_READAHEAD profile_readahead((reada_ok == 2), filp); #endif @@ -1074,6 +989,13 @@ found_page: goto page_not_up_to_date; generic_file_readahead(reada_ok, filp, inode, page); page_ok: + /* If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (page->mapping->i_mmap_shared != NULL) + flush_dcache_page(page); + /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... @@ -2002,10 +1924,10 @@ static long madvise_willneed(struct vm_area_struct * vma, * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The - * zap_page_range call sets things up for shrink_mmap to actually free + * zap_page_range call sets things up for refill_inactive to actually free * these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for - * shrink_mmap to pick up before reclaiming other pages. + * refill_inactive to pick up before reclaiming other pages. * * NB: This interface discards data rather than pushes it out to swap, * as some implementations do. This has performance implications for @@ -2530,6 +2452,7 @@ generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos) unlock: /* Mark it unlocked again and drop the page.. */ UnlockPage(page); + deactivate_page(page); page_cache_release(page); if (status < 0) diff --git a/mm/memory.c b/mm/memory.c index 83fc97cb3..6b047821d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -67,7 +67,7 @@ static inline void copy_cow_page(struct page * from, struct page * to, unsigned copy_user_highpage(to, from, address); } -mem_map_t * mem_map = NULL; +mem_map_t * mem_map; /* * Note: this doesn't free the actual pages themselves. That @@ -924,33 +924,9 @@ static void partial_clear(struct vm_area_struct *vma, unsigned long address) memclear_highpage_flush(page, offset, PAGE_SIZE - offset); } -/* - * Handle all mappings that got truncated by a "truncate()" - * system call. - * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. 
- */ -void vmtruncate(struct inode * inode, loff_t offset) +static void vmtruncate_list(struct vm_area_struct *mpnt, + unsigned long pgoff, unsigned long partial) { - unsigned long partial, pgoff; - struct vm_area_struct * mpnt; - struct address_space *mapping = inode->i_mapping; - unsigned long limit; - - if (inode->i_size < offset) - goto do_expand; - inode->i_size = offset; - truncate_inode_pages(mapping, offset); - spin_lock(&mapping->i_shared_lock); - if (!mapping->i_mmap) - goto out_unlock; - - pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - partial = (unsigned long)offset & (PAGE_CACHE_SIZE - 1); - - mpnt = mapping->i_mmap; do { struct mm_struct *mm = mpnt->vm_mm; unsigned long start = mpnt->vm_start; @@ -983,6 +959,39 @@ void vmtruncate(struct inode * inode, loff_t offset) zap_page_range(mm, start, len); flush_tlb_range(mm, start, end); } while ((mpnt = mpnt->vm_next_share) != NULL); +} + + +/* + * Handle all mappings that got truncated by a "truncate()" + * system call. + * + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. + */ +void vmtruncate(struct inode * inode, loff_t offset) +{ + unsigned long partial, pgoff; + struct address_space *mapping = inode->i_mapping; + unsigned long limit; + + if (inode->i_size < offset) + goto do_expand; + inode->i_size = offset; + truncate_inode_pages(mapping, offset); + spin_lock(&mapping->i_shared_lock); + if (!mapping->i_mmap && !mapping->i_mmap_shared) + goto out_unlock; + + pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + partial = (unsigned long)offset & (PAGE_CACHE_SIZE - 1); + + if (mapping->i_mmap != NULL) + vmtruncate_list(mapping->i_mmap, pgoff, partial); + if (mapping->i_mmap_shared != NULL) + vmtruncate_list(mapping->i_mmap_shared, pgoff, partial); + out_unlock: spin_unlock(&mapping->i_shared_lock); /* this should go into ->truncate */ @@ -1031,7 +1040,8 @@ void swapin_readahead(swp_entry_t entry) num = valid_swaphandles(entry, &offset); for (i = 0; i < num; offset++, i++) { /* Don't block on I/O for read-ahead */ - if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster) { + if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster + * (1 << page_cluster)) { while (i++ < num) swap_free(SWP_ENTRY(SWP_TYPE(entry), offset++)); break; @@ -1095,15 +1105,12 @@ static int do_swap_page(struct mm_struct * mm, */ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) { - int high = 0; struct page *page = NULL; pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); if (write_access) { page = alloc_page(GFP_HIGHUSER); if (!page) return -1; - if (PageHighMem(page)) - high = 1; clear_user_highpage(page, addr); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); mm->rss++; @@ -1233,7 +1240,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, pgd = pgd_offset(mm, address); pmd = pmd_alloc(pgd, address); - + if (pmd) { pte_t * pte = pte_alloc(pmd, address); if (pte) diff --git a/mm/mmap.c b/mm/mmap.c index 9667d19db..9c0027563 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -906,15 +906,21 @@ void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp) if (file) { struct inode * inode = file->f_dentry->d_inode; struct address_space *mapping = inode->i_mapping; + struct vm_area_struct **head; + if (vmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); + 
+ head = &mapping->i_mmap; + if (vmp->vm_flags & VM_SHARED) + head = &mapping->i_mmap_shared; /* insert vmp into inode's share list */ spin_lock(&mapping->i_shared_lock); - if((vmp->vm_next_share = mapping->i_mmap) != NULL) - mapping->i_mmap->vm_pprev_share = &vmp->vm_next_share; - mapping->i_mmap = vmp; - vmp->vm_pprev_share = &mapping->i_mmap; + if((vmp->vm_next_share = *head) != NULL) + (*head)->vm_pprev_share = &vmp->vm_next_share; + *head = vmp; + vmp->vm_pprev_share = head; spin_unlock(&mapping->i_shared_lock); } } diff --git a/mm/mremap.c b/mm/mremap.c index a48125178..d1f6a7b8b 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -225,6 +225,10 @@ unsigned long do_mremap(unsigned long addr, /* We can't remap across vm area boundaries */ if (old_len > vma->vm_end - addr) goto out; + if (vma->vm_flags & VM_DONTEXPAND) { + if (new_len > old_len) + goto out; + } if (vma->vm_flags & VM_LOCKED) { unsigned long locked = current->mm->locked_vm << PAGE_SHIFT; locked += new_len - old_len; diff --git a/mm/numa.c b/mm/numa.c index bbe9ec6fb..06ad9ec63 100644 --- a/mm/numa.c +++ b/mm/numa.c @@ -21,12 +21,12 @@ pg_data_t contig_page_data = { bdata: &contig_bootmem_data }; * at a considerably higher value than 0. Examples are Super-H, ARM, m68k. * Should be invoked with paramters (0, 0, unsigned long *[], start_paddr). */ -void __init free_area_init_node(int nid, pg_data_t *pgdat, +void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, unsigned long *zones_size, unsigned long zone_start_paddr, unsigned long *zholes_size) { free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size, - zone_start_paddr, zholes_size); + zone_start_paddr, zholes_size, pmap); } #endif /* !CONFIG_DISCONTIGMEM */ @@ -55,7 +55,7 @@ void show_free_areas_node(int nid) /* * Nodes can be initialized parallely, in no particular order. */ -void __init free_area_init_node(int nid, pg_data_t *pgdat, +void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, unsigned long *zones_size, unsigned long zone_start_paddr, unsigned long *zholes_size) { @@ -66,7 +66,7 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, mem_map = (mem_map_t *)PAGE_OFFSET; free_area_init_core(nid, pgdat, &discard, zones_size, zone_start_paddr, - zholes_size); + zholes_size, pmap); pgdat->node_id = nid; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8b74a73db..0b5990a11 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -25,7 +25,8 @@ #endif int nr_swap_pages; -int nr_lru_pages; +int nr_active_pages; +int nr_inactive_dirty_pages; pg_data_t *pgdat_list; static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; @@ -33,6 +34,8 @@ static int zone_balance_ratio[MAX_NR_ZONES] = { 32, 128, 128, }; static int zone_balance_min[MAX_NR_ZONES] = { 10 , 10, 10, }; static int zone_balance_max[MAX_NR_ZONES] = { 255 , 255, 255, }; +struct list_head active_list; +struct list_head inactive_dirty_list; /* * Free_page() adds the page to the free lists. This is optimized for * fast normal cases (no error jumps taken normally). 
@@ -96,7 +99,16 @@ static void __free_pages_ok (struct page *page, unsigned long order) BUG(); if (PageDirty(page)) BUG(); + if (PageActive(page)) + BUG(); + if (PageInactiveDirty(page)) + BUG(); + if (PageInactiveClean(page)) + BUG(); + page->flags &= ~(1<age = PAGE_AGE_START; + zone = page->zone; mask = (~0UL) << order; @@ -142,10 +154,13 @@ static void __free_pages_ok (struct page *page, unsigned long order) spin_unlock_irqrestore(&zone->lock, flags); - if (zone->free_pages > zone->pages_high) { - zone->zone_wake_kswapd = 0; - zone->low_on_memory = 0; - } + /* + * We don't want to protect this variable from race conditions + * since it's nothing important, but we do want to make sure + * it never gets negative. + */ + if (memory_pressure > NR_CPUS) + memory_pressure--; } #define MARK_USED(index, order, area) \ @@ -203,6 +218,7 @@ static struct page * rmqueue(zone_t *zone, unsigned long order) set_page_count(page, 1); if (BAD_RANGE(zone,page)) BUG(); + DEBUG_ADD_PAGE return page; } curr_order++; @@ -213,13 +229,77 @@ static struct page * rmqueue(zone_t *zone, unsigned long order) return NULL; } +#define PAGES_MIN 0 +#define PAGES_LOW 1 +#define PAGES_HIGH 2 + +/* + * This function does the dirty work for __alloc_pages + * and is separated out to keep the code size smaller. + * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM) + */ +static struct page * __alloc_pages_limit(zonelist_t *zonelist, + unsigned long order, int limit, int direct_reclaim) +{ + zone_t **zone = zonelist->zones; + + for (;;) { + zone_t *z = *(zone++); + unsigned long water_mark; + + if (!z) + break; + if (!z->size) + BUG(); + + /* + * We allocate if the number of free + inactive_clean + * pages is above the watermark. + */ + switch (limit) { + default: + case PAGES_MIN: + water_mark = z->pages_min; + break; + case PAGES_LOW: + water_mark = z->pages_low; + break; + case PAGES_HIGH: + water_mark = z->pages_high; + } + + if (z->free_pages + z->inactive_clean_pages > water_mark) { + struct page *page = NULL; + /* If possible, reclaim a page directly. */ + if (direct_reclaim && z->free_pages < z->pages_min + 8) + page = reclaim_page(z); + /* If that fails, fall back to rmqueue. */ + if (!page) + page = rmqueue(z, order); + if (page) + return page; + } + } + + /* Found nothing. */ + return NULL; +} + + /* * This is the 'heart' of the zoned buddy allocator: */ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order) { zone_t **zone; - extern wait_queue_head_t kswapd_wait; + int direct_reclaim = 0; + unsigned int gfp_mask = zonelist->gfp_mask; + struct page * page = NULL; + + /* + * Allocations put pressure on the VM subsystem. + */ + memory_pressure++; /* * (If anyone calls gfp from interrupts nonatomically then it @@ -229,6 +309,36 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order) * in a higher zone fails. */ + /* + * Can we take pages directly from the inactive_clean + * list? + */ + if (order == 0 && (gfp_mask & __GFP_WAIT) && + !(current->flags & PF_MEMALLOC)) + direct_reclaim = 1; + + /* + * If we are about to get low on free pages and we also have + * an inactive page shortage, wake up kswapd. + */ + if (inactive_shortage() > inactive_target / 2 && free_shortage()) + wakeup_kswapd(0); + /* + * If we are about to get low on free pages and cleaning + * the inactive_dirty pages would fix the situation, + * wake up bdflush. 
+ */ + else if (free_shortage() && nr_inactive_dirty_pages > free_shortage() + && nr_inactive_dirty_pages > freepages.high) + wakeup_bdflush(0); + +try_again: + /* + * First, see if we have any zones with lots of free memory. + * + * We allocate free memory first because it doesn't contain + * any data ... DUH! + */ zone = zonelist->zones; for (;;) { zone_t *z = *(zone++); @@ -237,82 +347,193 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order) if (!z->size) BUG(); - /* Are we supposed to free memory? Don't make it worse.. */ - if (!z->zone_wake_kswapd) { - struct page *page = rmqueue(z, order); - if (z->free_pages < z->pages_low) { - z->zone_wake_kswapd = 1; - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); - } + if (z->free_pages > z->pages_low) { + page = rmqueue(z, order); if (page) return page; + } else if (z->free_pages < z->pages_min && + waitqueue_active(&kreclaimd_wait)) { + wake_up_interruptible(&kreclaimd_wait); } } - /* Three possibilities to get here - * - Previous alloc_pages resulted in last zone set to have - * zone_wake_kswapd and start it. kswapd has not been able - * to release enough pages so that one zone does not have - * zone_wake_kswapd set. - * - Different sets of zones (zonelist) - * previous did not have all zones with zone_wake_kswapd but - * this one has... should kswapd be woken up? it will run once. - * - SMP race, kswapd went to sleep slightly after it as running - * in 'if (waitqueue_active(...))' above. - * + anyway the test is very cheap to do... + /* + * Try to allocate a page from a zone with a HIGH + * amount of free + inactive_clean pages. + * + * If there is a lot of activity, inactive_target + * will be high and we'll have a good chance of + * finding a page using the HIGH limit. */ - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim); + if (page) + return page; /* - * Ok, we don't have any zones that don't need some - * balancing.. See if we have any that aren't critical.. + * Then try to allocate a page from a zone with more + * than zone->pages_low free + inactive_clean pages. + * + * When the working set is very large and VM activity + * is low, we're most likely to have our allocation + * succeed here. */ - zone = zonelist->zones; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - if (!z->low_on_memory) { - struct page *page = rmqueue(z, order); - if (z->free_pages < z->pages_min) - z->low_on_memory = 1; - if (page) - return page; - } + page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim); + if (page) + return page; + + /* + * OK, none of the zones on our zonelist has lots + * of pages free. + * + * We wake up kswapd, in the hope that kswapd will + * resolve this situation before memory gets tight. + * + * We also yield the CPU, because that: + * - gives kswapd a chance to do something + * - slows down allocations, in particular the + * allocations from the fast allocator that's + * causing the problems ... + * - ... which minimises the impact the "bad guys" + * have on the rest of the system + * - if we don't have __GFP_IO set, kswapd may be + * able to free some memory we can't free ourselves + */ + wakeup_kswapd(0); + if (gfp_mask & __GFP_WAIT) { + __set_current_state(TASK_RUNNING); + current->policy |= SCHED_YIELD; + schedule(); } /* - * Uhhuh. All the zones have been critical, which means that - * we'd better do some synchronous swap-out. 
kswapd has not - * been able to cope.. + * After waking up kswapd, we try to allocate a page + * from any zone which isn't critical yet. + * + * Kswapd should, in most situations, bring the situation + * back to normal in no time. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim); + if (page) + return page; + + /* + * Damn, we didn't succeed. + * + * This can be due to 2 reasons: + * - we're doing a higher-order allocation + * --> move pages to the free list until we succeed + * - we're /really/ tight on memory + * --> wait on the kswapd waitqueue until memory is freed */ if (!(current->flags & PF_MEMALLOC)) { - int gfp_mask = zonelist->gfp_mask; - if (!try_to_free_pages(gfp_mask)) { - if (!(gfp_mask & __GFP_HIGH)) - goto fail; + /* + * Are we dealing with a higher order allocation? + * + * Move pages from the inactive_clean to the free list + * in the hope of creating a large, physically contiguous + * piece of free memory. + */ + if (order > 0 && (gfp_mask & __GFP_WAIT)) { + zone = zonelist->zones; + /* First, clean some dirty pages. */ + page_launder(gfp_mask, 1); + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; + if (!z->size) + continue; + while (z->inactive_clean_pages) { + struct page * page; + /* Move one page to the free list. */ + page = reclaim_page(z); + if (!page) + break; + __free_page(page); + /* Try if the allocation succeeds. */ + page = rmqueue(z, order); + if (page) + return page; + } + } } + /* + * When we arrive here, we are really tight on memory. + * + * We wake up kswapd and sleep until kswapd wakes us + * up again. After that we loop back to the start. + * + * We have to do this because something else might eat + * the memory kswapd frees for us and we need to be + * reliable. Note that we don't loop back for higher + * order allocations since it is possible that kswapd + * simply cannot free a large enough contiguous area + * of memory *ever*. + */ + if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) { + wakeup_kswapd(1); + memory_pressure++; + if (!order) + goto try_again; + /* + * If __GFP_IO isn't set, we can't wait on kswapd because + * kswapd just might need some IO locks /we/ are holding ... + * + * SUBTLE: The scheduling point above makes sure that + * kswapd does get the chance to free memory we can't + * free ourselves... + */ + } else if (gfp_mask & __GFP_WAIT) { + try_to_free_pages(gfp_mask); + memory_pressure++; + if (!order) + goto try_again; + } + } /* * Final phase: allocate anything we can! + * + * Higher order allocations, GFP_ATOMIC allocations and + * recursive allocations (PF_MEMALLOC) end up here. + * + * Only recursive allocations can use the very last pages + * in the system, otherwise it would be just too easy to + * deadlock the system... */ zone = zonelist->zones; for (;;) { - struct page *page; - zone_t *z = *(zone++); + struct page * page = NULL; if (!z) break; - page = rmqueue(z, order); + if (!z->size) + BUG(); + + /* + * SUBTLE: direct_reclaim is only possible if the task + * becomes PF_MEMALLOC while looping above. This will + * happen when the OOM killer selects this task for + * instant execution... + */ + if (direct_reclaim) + page = reclaim_page(z); + if (page) + return page; + + /* XXX: is pages_min/4 a good amount to reserve for this? */ + if (z->free_pages < z->pages_min / 4 && + !(current->flags & PF_MEMALLOC)) + continue; + if (!page) + page = rmqueue(z, order); if (page) return page; } -fail: /* No luck.. 
*/ + printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order); return NULL; } @@ -377,18 +598,46 @@ unsigned int nr_free_pages (void) } /* - * Amount of free RAM allocatable as buffer memory: + * Total amount of inactive_clean (allocatable) RAM: */ -unsigned int nr_free_buffer_pages (void) +unsigned int nr_inactive_clean_pages (void) { unsigned int sum; zone_t *zone; int i; - sum = nr_lru_pages / 3; + sum = 0; for (i = 0; i < NUMNODES; i++) - for (zone = NODE_DATA(i)->node_zones; zone <= NODE_DATA(i)->node_zones+ZONE_NORMAL; zone++) - sum += zone->free_pages; + for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++) + sum += zone->inactive_clean_pages; + return sum; +} + +/* + * Amount of free RAM allocatable as buffer memory: + */ +unsigned int nr_free_buffer_pages (void) +{ + unsigned int sum; + + sum = nr_free_pages(); + sum += nr_inactive_clean_pages(); + sum += nr_inactive_dirty_pages; + + /* + * Keep our write behind queue filled, even if + * kswapd lags a bit right now. + */ + if (sum < freepages.high + inactive_target) + sum = freepages.high + inactive_target; + /* + * We don't want dirty page writebehind to put too + * much pressure on the working set, but we want it + * to be possible to have some dirty pages in the + * working set without upsetting the writebehind logic. + */ + sum += nr_active_pages >> 4; + return sum; } @@ -418,9 +667,11 @@ void show_free_areas_core(int nid) nr_free_pages() << (PAGE_SHIFT-10), nr_free_highpages() << (PAGE_SHIFT-10)); - printk("( Free: %d, lru_cache: %d (%d %d %d) )\n", + printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d (%d %d %d) )\n", + nr_active_pages, + nr_inactive_dirty_pages, + nr_inactive_clean_pages(), nr_free_pages(), - nr_lru_pages, freepages.min, freepages.low, freepages.high); @@ -430,17 +681,6 @@ void show_free_areas_core(int nid) zone_t *zone = NODE_DATA(nid)->node_zones + type; unsigned long nr, total, flags; - printk(" %c%d%d %s: ", - (zone->free_pages > zone->pages_low - ? (zone->free_pages > zone->pages_high - ? ' ' - : 'H') - : (zone->free_pages > zone->pages_min - ? 'M' - : 'L')), - zone->zone_wake_kswapd, zone->low_on_memory, - zone->name); - total = 0; if (zone->size) { spin_lock_irqsave(&zone->lock, flags); @@ -532,9 +772,9 @@ static inline void build_zonelists(pg_data_t *pgdat) */ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, unsigned long *zones_size, unsigned long zone_start_paddr, - unsigned long *zholes_size) + unsigned long *zholes_size, struct page *lmem_map) { - struct page *p, *lmem_map; + struct page *p; unsigned long i, j; unsigned long map_size; unsigned long totalpages, offset, realtotalpages; @@ -570,7 +810,8 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, freepages.min += i; freepages.low += i * 2; freepages.high += i * 3; - memlist_init(&lru_cache); + memlist_init(&active_list); + memlist_init(&inactive_dirty_list); /* * Some architectures (with lots of mem and discontinous memory @@ -580,9 +821,11 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, * boundary, so that MAP_NR works. 
*/ map_size = (totalpages + 1)*sizeof(struct page); - lmem_map = (struct page *) alloc_bootmem_node(nid, map_size); - lmem_map = (struct page *)(PAGE_OFFSET + + if (lmem_map == (struct page *)0) { + lmem_map = (struct page *) alloc_bootmem_node(nid, map_size); + lmem_map = (struct page *)(PAGE_OFFSET + MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET)); + } *gmap = pgdat->node_mem_map = lmem_map; pgdat->node_size = totalpages; pgdat->node_start_paddr = zone_start_paddr; @@ -616,6 +859,9 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, zone->lock = SPIN_LOCK_UNLOCKED; zone->zone_pgdat = pgdat; zone->free_pages = 0; + zone->inactive_clean_pages = 0; + zone->inactive_dirty_pages = 0; + memlist_init(&zone->inactive_clean_list); if (!size) continue; @@ -629,8 +875,6 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, zone->pages_min = mask; zone->pages_low = mask*2; zone->pages_high = mask*3; - zone->low_on_memory = 0; - zone->zone_wake_kswapd = 0; zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; zone->zone_start_paddr = zone_start_paddr; @@ -664,7 +908,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, void __init free_area_init(unsigned long *zones_size) { - free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size, 0, 0); + free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size, 0, 0, 0); } static int __init setup_mem_frac(char *str) diff --git a/mm/page_io.c b/mm/page_io.c index 25ed62221..185e19247 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -43,7 +43,8 @@ static int rw_swap_page_base(int rw, swp_entry_t entry, struct page *page, int w struct inode *swapf = 0; /* Don't allow too many pending pages in flight.. */ - if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster) + if ((rw == WRITE) && atomic_read(&nr_async_pages) > + pager_daemon.swap_cluster * (1 << page_cluster)) wait = 1; if (rw == READ) { diff --git a/mm/slab.c b/mm/slab.c index ed5d018f1..b3bd852d1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -579,7 +579,6 @@ static void kmem_slab_destroy (kmem_cache_t *cachep, slab_t *slabp) kmem_cache_free(cachep->slabp_cache, slabp); } - /** * kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. @@ -838,48 +837,60 @@ static int is_chained_kmem_cache(kmem_cache_t * cachep) } #ifdef CONFIG_SMP -static DECLARE_MUTEX(cache_drain_sem); -static kmem_cache_t *cache_to_drain = NULL; -static DECLARE_WAIT_QUEUE_HEAD(cache_drain_wait); -unsigned long slab_cache_drain_mask; - /* - * Waits for all CPUs to execute slab_drain_local_cache(). - * Caller must be holding cache_drain_sem. + * Waits for all CPUs to execute func(). 
*/ -static void slab_drain_all_sync(void) +static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) { - DECLARE_WAITQUEUE(wait, current); - local_irq_disable(); - slab_drain_local_cache(); + func(arg); local_irq_enable(); - add_wait_queue(&cache_drain_wait, &wait); - current->state = TASK_UNINTERRUPTIBLE; - while (slab_cache_drain_mask != 0UL) - schedule(); - current->state = TASK_RUNNING; - remove_wait_queue(&cache_drain_wait, &wait); + if (smp_call_function(func, arg, 1, 1)) + BUG(); +} +typedef struct ccupdate_struct_s +{ + kmem_cache_t *cachep; + cpucache_t *new[NR_CPUS]; +} ccupdate_struct_t; + +static void do_ccupdate_local(void *info) +{ + ccupdate_struct_t *new = (ccupdate_struct_t *)info; + cpucache_t *old = cc_data(new->cachep); + + cc_data(new->cachep) = new->new[smp_processor_id()]; + new->new[smp_processor_id()] = old; } +static void free_block (kmem_cache_t* cachep, void** objpp, int len); + static void drain_cpu_caches(kmem_cache_t *cachep) { - unsigned long cpu_mask = 0; + ccupdate_struct_t new; int i; - for (i = 0; i < smp_num_cpus; i++) - cpu_mask |= (1UL << cpu_logical_map(i)); + memset(&new.new,0,sizeof(new.new)); - down(&cache_drain_sem); + new.cachep = cachep; - cache_to_drain = cachep; - slab_cache_drain_mask = cpu_mask; - slab_drain_all_sync(); - cache_to_drain = NULL; + down(&cache_chain_sem); + smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); - up(&cache_drain_sem); + for (i = 0; i < smp_num_cpus; i++) { + cpucache_t* ccold = new.new[cpu_logical_map(i)]; + if (!ccold || (ccold->avail == 0)) + continue; + local_irq_disable(); + free_block(cachep, cc_entry(ccold), ccold->avail); + local_irq_enable(); + ccold->avail = 0; + } + smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); + up(&cache_chain_sem); } + #else #define drain_cpu_caches(cachep) do { } while (0) #endif @@ -1593,56 +1604,6 @@ kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags) #ifdef CONFIG_SMP -typedef struct ccupdate_struct_s -{ - kmem_cache_t *cachep; - cpucache_t *new[NR_CPUS]; -} ccupdate_struct_t; - -static ccupdate_struct_t *ccupdate_state = NULL; - -/* Called from per-cpu timer interrupt. */ -void slab_drain_local_cache(void) -{ - if (ccupdate_state != NULL) { - ccupdate_struct_t *new = ccupdate_state; - cpucache_t *old = cc_data(new->cachep); - - cc_data(new->cachep) = new->new[smp_processor_id()]; - new->new[smp_processor_id()] = old; - } else { - kmem_cache_t *cachep = cache_to_drain; - cpucache_t *cc = cc_data(cachep); - - if (cc && cc->avail) { - free_block(cachep, cc_entry(cc), cc->avail); - cc->avail = 0; - } - } - - clear_bit(smp_processor_id(), &slab_cache_drain_mask); - if (slab_cache_drain_mask == 0) - wake_up(&cache_drain_wait); -} - -static void do_ccupdate(ccupdate_struct_t *data) -{ - unsigned long cpu_mask = 0; - int i; - - for (i = 0; i < smp_num_cpus; i++) - cpu_mask |= (1UL << cpu_logical_map(i)); - - down(&cache_drain_sem); - - ccupdate_state = data; - slab_cache_drain_mask = cpu_mask; - slab_drain_all_sync(); - ccupdate_state = NULL; - - up(&cache_drain_sem); -} - /* called with cache_chain_sem acquired. 
*/ static int kmem_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount) { @@ -1666,7 +1627,6 @@ static int kmem_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount) for (i = 0; i< smp_num_cpus; i++) { cpucache_t* ccnew; - ccnew = kmalloc(sizeof(void*)*limit+ sizeof(cpucache_t), GFP_KERNEL); if (!ccnew) @@ -1681,7 +1641,7 @@ static int kmem_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount) cachep->batchcount = batchcount; spin_unlock_irq(&cachep->spinlock); - do_ccupdate(&new); + smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); for (i = 0; i < smp_num_cpus; i++) { cpucache_t* ccold = new.new[cpu_logical_map(i)]; @@ -1772,14 +1732,6 @@ void kmem_cache_reap (int gfp_mask) /* It's safe to test this without holding the cache-lock. */ if (searchp->flags & SLAB_NO_REAP) goto next; - /* FIXME: is this really a good idea? */ - if (gfp_mask & GFP_DMA) { - if (!(searchp->gfpflags & GFP_DMA)) - goto next; - } else { - if (searchp->gfpflags & GFP_DMA) - goto next; - } spin_lock_irq(&searchp->spinlock); if (searchp->growing) goto next_unlock; diff --git a/mm/swap.c b/mm/swap.c index 460707ff7..8cb160b81 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -40,7 +40,18 @@ freepages_t freepages = { }; /* How many pages do we try to swap or page in/out together? */ -int page_cluster = 4; /* Default value modified in swap_setup() */ +int page_cluster; + +/* + * This variable contains the amount of page steals the system + * is doing, averaged over a minute. We use this to determine how + * many inactive pages we should have. + * + * In reclaim_page and __alloc_pages: memory_pressure++ + * In __free_pages_ok: memory_pressure-- + * In recalculate_vm_stats the value is decayed (once a second) + */ +int memory_pressure; /* We track the number of pages currently being asynchronously swapped out, so that we don't try to swap TOO many pages out at once */ @@ -61,13 +72,250 @@ buffer_mem_t page_cache = { pager_daemon_t pager_daemon = { 512, /* base number for calculating the number of tries */ SWAP_CLUSTER_MAX, /* minimum number of tries */ - SWAP_CLUSTER_MAX, /* do swap I/O in clusters of this size */ + 8, /* do swap I/O in clusters of this size */ }; +/** + * age_page_{up,down} - page aging helper functions + * @page - the page we want to age + * @nolock - are we already holding the pagelist_lru_lock? + * + * If the page is on one of the lists (active, inactive_dirty or + * inactive_clean), we will grab the pagelist_lru_lock as needed. + * If you're already holding the lock, call this function with the + * nolock argument non-zero. + */ +void age_page_up_nolock(struct page * page) +{ + /* + * We're dealing with an inactive page, move the page + * to the active list. + */ + if (!page->age) + activate_page_nolock(page); + + /* The actual page aging bit */ + page->age += PAGE_AGE_ADV; + if (page->age > PAGE_AGE_MAX) + page->age = PAGE_AGE_MAX; +} + /* - * Perform any setup for the swap system + * We use this (minimal) function in the case where we + * know we can't deactivate the page (yet). */ +void age_page_down_ageonly(struct page * page) +{ + page->age /= 2; +} + +void age_page_down_nolock(struct page * page) +{ + /* The actual page aging bit */ + page->age /= 2; + + /* + * The page is now an old page. Move to the inactive + * list (if possible ... see below). + */ + if (!page->age) + deactivate_page_nolock(page); +} +void age_page_up(struct page * page) +{ + /* + * We're dealing with an inactive page, move the page + * to the active list. 
+ */ + if (!page->age) + activate_page(page); + + /* The actual page aging bit */ + page->age += PAGE_AGE_ADV; + if (page->age > PAGE_AGE_MAX) + page->age = PAGE_AGE_MAX; +} + +void age_page_down(struct page * page) +{ + /* The actual page aging bit */ + page->age /= 2; + + /* + * The page is now an old page. Move to the inactive + * list (if possible ... see below). + */ + if (!page->age) + deactivate_page(page); +} + + +/** + * (de)activate_page - move pages from/to active and inactive lists + * @page: the page we want to move + * @nolock - are we already holding the pagemap_lru_lock? + * + * Deactivate_page will move an active page to the right + * inactive list, while activate_page will move a page back + * from one of the inactive lists to the active list. If + * called on a page which is not on any of the lists, the + * page is left alone. + */ +void deactivate_page_nolock(struct page * page) +{ + /* + * One for the cache, one for the extra reference the + * caller has and (maybe) one for the buffers. + * + * This isn't perfect, but works for just about everything. + * Besides, as long as we don't move unfreeable pages to the + * inactive_clean list it doesn't need to be perfect... + */ + int maxcount = (page->buffers ? 3 : 2); + page->age = 0; + + /* + * Don't touch it if it's not on the active list. + * (some pages aren't on any list at all) + */ + if (PageActive(page) && page_count(page) <= maxcount && + !page_ramdisk(page)) { + + /* + * We can move the page to the inactive_dirty list + * if we have the strong suspicion that they might + * become freeable in the near future. + * + * That is, the page has buffer heads attached (that + * need to be cleared away) and/or the function calling + * us has an extra reference count on the page. + */ + if (page->buffers || page_count(page) == 2) { + del_page_from_active_list(page); + add_page_to_inactive_dirty_list(page); + /* + * Only if we are SURE the page is clean and immediately + * reusable, we move it to the inactive_clean list. + */ + } else if (page->mapping && !PageDirty(page) && + !PageLocked(page)) { + del_page_from_active_list(page); + add_page_to_inactive_clean_list(page); + } + /* + * OK, we cannot free the page. Leave it alone. + */ + } +} + +void deactivate_page(struct page * page) +{ + spin_lock(&pagemap_lru_lock); + deactivate_page_nolock(page); + spin_unlock(&pagemap_lru_lock); +} + +/* + * Move an inactive page to the active list. + */ +void activate_page_nolock(struct page * page) +{ + if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); + add_page_to_active_list(page); + } else { + /* + * The page was not on any list, so we take care + * not to do anything. + */ + } + + /* Make sure the page gets a fair chance at staying active. 
*/ + if (page->age < PAGE_AGE_START) + page->age = PAGE_AGE_START; +} + +void activate_page(struct page * page) +{ + spin_lock(&pagemap_lru_lock); + activate_page_nolock(page); + spin_unlock(&pagemap_lru_lock); +} + +/** + * lru_cache_add: add a page to the page lists + * @page: the page to add + */ +void lru_cache_add(struct page * page) +{ + spin_lock(&pagemap_lru_lock); + if (!PageLocked(page)) + BUG(); + DEBUG_ADD_PAGE + add_page_to_active_list(page); + /* This should be relatively rare */ + if (!page->age) + deactivate_page_nolock(page); + spin_unlock(&pagemap_lru_lock); +} + +/** + * __lru_cache_del: remove a page from the page lists + * @page: the page to add + * + * This function is for when the caller already holds + * the pagemap_lru_lock. + */ +void __lru_cache_del(struct page * page) +{ + if (PageActive(page)) { + del_page_from_active_list(page); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); + } else { + printk("VM: __lru_cache_del, found unknown page ?!\n"); + } + DEBUG_ADD_PAGE +} + +/** + * lru_cache_del: remove a page from the page lists + * @page: the page to remove + */ +void lru_cache_del(struct page * page) +{ + if (!PageLocked(page)) + BUG(); + spin_lock(&pagemap_lru_lock); + __lru_cache_del(page); + spin_unlock(&pagemap_lru_lock); +} + +/** + * recalculate_vm_stats - recalculate VM statistics + * + * This function should be called once a second to recalculate + * some useful statistics the VM subsystem uses to determine + * its behaviour. + */ +void recalculate_vm_stats(void) +{ + /* + * Substract one second worth of memory_pressure from + * memory_pressure. + */ + memory_pressure -= (memory_pressure >> INACTIVE_SHIFT); +} + +/* + * Perform any setup for the swap system + */ void __init swap_setup(void) { /* Use a smaller cluster for memory <16MB or <32MB */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 506160354..d26c66f54 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -73,7 +73,7 @@ static inline void remove_from_swap_cache(struct page *page) PAGE_BUG(page); PageClearSwapCache(page); - remove_inode_page(page); + __remove_inode_page(page); } /* @@ -105,7 +105,9 @@ void delete_from_swap_cache_nolock(struct page *page) if (block_flushpage(page, 0)) lru_cache_del(page); + spin_lock(&pagecache_lock); __delete_from_swap_cache(page); + spin_unlock(&pagecache_lock); page_cache_release(page); } @@ -164,7 +166,7 @@ repeat: return 0; /* * Though the "found" page was in the swap cache an instant - * earlier, it might have been removed by shrink_mmap etc. + * earlier, it might have been removed by refill_inactive etc. * Re search ... Since find_lock_page grabs a reference on * the page, it can not be reused for anything else, namely * it can not be associated with another swaphandle, so it diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 817a3966b..e8c557e04 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -14,7 +14,7 @@ #include rwlock_t vmlist_lock = RW_LOCK_UNLOCKED; -struct vm_struct * vmlist = NULL; +struct vm_struct * vmlist; static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 95098e4d1..aacd9a5b0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -9,6 +9,7 @@ * to bring the system back to freepages.high: 2.4.97, Rik van Riel. * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $ * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). 
+ * Multiqueue VM started 5.8.00, Rik van Riel. */ #include @@ -40,6 +41,7 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un swp_entry_t entry; struct page * page; int (*swapout)(struct page *, struct file *); + int onlist; pte = *page_table; if (!pte_present(pte)) @@ -51,16 +53,37 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un if (mm->swap_cnt) mm->swap_cnt--; + onlist = PageActive(page); /* Don't look at this pte if it's been accessed recently. */ if (pte_young(pte)) { - /* - * Transfer the "accessed" bit from the page - * tables to the global page map. - */ set_pte(page_table, pte_mkold(pte)); - SetPageReferenced(page); + if (onlist) { + /* + * Transfer the "accessed" bit from the page + * tables to the global page map. Page aging + * will be done by refill_inactive_scan(). + */ + SetPageReferenced(page); + } else { + /* + * The page is not on the active list, so + * we have to do the page aging ourselves. + */ + age_page_up(page); + } goto out_failed; } + if (!onlist) + /* The page is still mapped, so it can't be freeable... */ + age_page_down_ageonly(page); + + /* + * If the page is in active use by us, or if the page + * is in active use by others, don't unmap it or + * (worse) start unneeded IO. + */ + if (page->age > 0) + goto out_failed; if (TryLockPage(page)) goto out_failed; @@ -79,8 +102,9 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un set_pte(page_table, swp_entry_to_pte(entry)); drop_pte: UnlockPage(page); - vma->vm_mm->rss--; + mm->rss--; flush_tlb_page(vma, address); + deactivate_page(page); page_cache_release(page); goto out_failed; } @@ -96,7 +120,7 @@ drop_pte: * our scan. * * Basically, this just makes it possible for us to do - * some real work in the future in "shrink_mmap()". + * some real work in the future in "refill_inactive()". */ if (!pte_dirty(pte)) { flush_cache_page(vma, address); @@ -116,7 +140,9 @@ drop_pte: * Don't do any of the expensive stuff if * we're not really interested in this zone. */ - if (page->zone->free_pages > page->zone->pages_high) + if (page->zone->free_pages + page->zone->inactive_clean_pages + + page->zone->inactive_dirty_pages + > page->zone->pages_high + inactive_target) goto out_unlock; /* @@ -134,7 +160,7 @@ drop_pte: * NOTE NOTE NOTE! This should just set a * dirty bit in 'page', and just drop the * pte. All the hard work would be done by - * shrink_mmap(). + * refill_inactive(). * * That would get rid of a lot of problems. */ @@ -144,14 +170,15 @@ drop_pte: struct file *file = vma->vm_file; if (file) get_file(file); pte_clear(page_table); - vma->vm_mm->rss--; + mm->rss--; flush_tlb_page(vma, address); - vmlist_access_unlock(vma->vm_mm); + vmlist_access_unlock(mm); error = swapout(page, file); UnlockPage(page); if (file) fput(file); if (!error) goto out_free_success; + deactivate_page(page); page_cache_release(page); return error; } @@ -175,13 +202,14 @@ drop_pte: add_to_swap_cache(page, entry); /* Put the swap entry into the pte after the page is in swapcache */ - vma->vm_mm->rss--; + mm->rss--; set_pte(page_table, swp_entry_to_pte(entry)); flush_tlb_page(vma, address); - vmlist_access_unlock(vma->vm_mm); + vmlist_access_unlock(mm); /* OK, do a physical asynchronous write to swap. 
*/ rw_swap_page(WRITE, page, 0); + deactivate_page(page); out_free_success: page_cache_release(page); @@ -230,7 +258,7 @@ static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vm do { int result; - vma->vm_mm->swap_address = address + PAGE_SIZE; + mm->swap_address = address + PAGE_SIZE; result = try_to_swap_out(mm, vma, address, pte, gfp_mask); if (result) return result; @@ -282,7 +310,7 @@ static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsi if (vma->vm_flags & VM_LOCKED) return 0; - pgdir = pgd_offset(vma->vm_mm, address); + pgdir = pgd_offset(mm, address); end = vma->vm_end; if (address >= end) @@ -323,17 +351,22 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask) int result = swap_out_vma(mm, vma, address, gfp_mask); if (result) return result; + if (!mm->swap_cnt) + goto out_unlock; vma = vma->vm_next; if (!vma) break; address = vma->vm_start; } } + /* Reset to 0 when we reach the end of address space */ + mm->swap_address = 0; + mm->swap_cnt = 0; + +out_unlock: vmlist_access_unlock(mm); /* We didn't find anything for the process */ - mm->swap_cnt = 0; - mm->swap_address = 0; return 0; } @@ -342,7 +375,10 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask) * N.B. This function returns only 0 or 1. Return values != 1 from * the lower level routines result in continued processing. */ -static int swap_out(unsigned int priority, int gfp_mask) +#define SWAP_SHIFT 5 +#define SWAP_MIN 8 + +static int swap_out(unsigned int priority, int gfp_mask, unsigned long idle_time) { struct task_struct * p; int counter; @@ -363,7 +399,7 @@ static int swap_out(unsigned int priority, int gfp_mask) * Think of swap_cnt as a "shadow rss" - it tells us which process * we want to page out (always try largest first). */ - counter = (nr_threads << 2) >> (priority >> 2); + counter = (nr_threads << SWAP_SHIFT) >> priority; if (counter < 1) counter = 1; @@ -372,6 +408,7 @@ static int swap_out(unsigned int priority, int gfp_mask) struct mm_struct *best = NULL; int pid = 0; int assign = 0; + int found_task = 0; select: read_lock(&tasklist_lock); p = init_task.next_task; @@ -381,9 +418,17 @@ static int swap_out(unsigned int priority, int gfp_mask) continue; if (mm->rss <= 0) continue; + /* Skip tasks which haven't slept long enough yet when idle-swapping. */ + if (idle_time && !assign && (!(p->state & TASK_INTERRUPTIBLE) || + time_after(p->sleep_time + idle_time * HZ, jiffies))) + continue; + found_task++; /* Refresh swap_cnt? */ - if (assign == 1) - mm->swap_cnt = mm->rss; + if (assign == 1) { + mm->swap_cnt = (mm->rss >> SWAP_SHIFT); + if (mm->swap_cnt < SWAP_MIN) + mm->swap_cnt = SWAP_MIN; + } if (mm->swap_cnt > max_cnt) { max_cnt = mm->swap_cnt; best = mm; @@ -392,7 +437,7 @@ static int swap_out(unsigned int priority, int gfp_mask) } read_unlock(&tasklist_lock); if (!best) { - if (!assign) { + if (!assign && found_task > 0) { assign = 1; goto select; } @@ -418,50 +463,409 @@ out: return __ret; } -/* - * Check if there is any memory pressure (free_pages < pages_low) + +/** + * reclaim_page - reclaims one page from the inactive_clean list + * @zone: reclaim a page from this zone + * + * The pages on the inactive_clean can be instantly reclaimed. + * The tests look impressive, but most of the time we'll grab + * the first page of the list and exit successfully. 
*/ -static inline int memory_pressure(void) +struct page * reclaim_page(zone_t * zone) { - pg_data_t *pgdat = pgdat_list; + struct page * page = NULL; + struct list_head * page_lru; + int maxscan; - do { - int i; - for(i = 0; i < MAX_NR_ZONES; i++) { - zone_t *zone = pgdat->node_zones+ i; - if (zone->size && - zone->free_pages < zone->pages_low) - return 1; + /* + * We only need the pagemap_lru_lock if we don't reclaim the page, + * but we have to grab the pagecache_lock before the pagemap_lru_lock + * to avoid deadlocks and most of the time we'll succeed anyway. + */ + spin_lock(&pagecache_lock); + spin_lock(&pagemap_lru_lock); + maxscan = zone->inactive_clean_pages; + while ((page_lru = zone->inactive_clean_list.prev) != + &zone->inactive_clean_list && maxscan--) { + page = list_entry(page_lru, struct page, lru); + + /* Wrong page on list?! (list corruption, should not happen) */ + if (!PageInactiveClean(page)) { + printk("VM: reclaim_page, wrong page on list.\n"); + list_del(page_lru); + page->zone->inactive_clean_pages--; + continue; } - pgdat = pgdat->node_next; - } while (pgdat); - return 0; + /* Page is or was in use? Move it to the active list. */ + if (PageTestandClearReferenced(page) || page->age > 0 || + (!page->buffers && page_count(page) > 1)) { + del_page_from_inactive_clean_list(page); + add_page_to_active_list(page); + continue; + } + + /* The page is dirty, or locked, move to inactive_diry list. */ + if (page->buffers || TryLockPage(page)) { + del_page_from_inactive_clean_list(page); + add_page_to_inactive_dirty_list(page); + continue; + } + + /* OK, remove the page from the caches. */ + if (PageSwapCache(page)) { + __delete_from_swap_cache(page); + goto found_page; + } + + if (page->mapping) { + __remove_inode_page(page); + goto found_page; + } + + /* We should never ever get here. */ + printk(KERN_ERR "VM: reclaim_page, found unknown page\n"); + list_del(page_lru); + zone->inactive_clean_pages--; + UnlockPage(page); + } + /* Reset page pointer, maybe we encountered an unfreeable page. */ + page = NULL; + goto out; + +found_page: + del_page_from_inactive_clean_list(page); + UnlockPage(page); + page->age = PAGE_AGE_START; + if (page_count(page) != 1) + printk("VM: reclaim_page, found page with count %d!\n", + page_count(page)); +out: + spin_unlock(&pagemap_lru_lock); + spin_unlock(&pagecache_lock); + memory_pressure++; + return page; +} + +/** + * page_launder - clean dirty inactive pages, move to inactive_clean list + * @gfp_mask: what operations we are allowed to do + * @sync: should we wait synchronously for the cleaning of pages + * + * When this function is called, we are most likely low on free + + * inactive_clean pages. Since we want to refill those pages as + * soon as possible, we'll make two loops over the inactive list, + * one to move the already cleaned pages to the inactive_clean lists + * and one to (often asynchronously) clean the dirty inactive pages. + * + * In situations where kswapd cannot keep up, user processes will + * end up calling this function. Since the user process needs to + * have a page before it can continue with its allocation, we'll + * do synchronous page flushing in that case. + * + * This code is heavily inspired by the FreeBSD source code. Thanks + * go out to Matthew Dillon. 
+/**
+ * page_launder - clean dirty inactive pages, move to inactive_clean list
+ * @gfp_mask: what operations we are allowed to do
+ * @sync: should we wait synchronously for the cleaning of pages
+ *
+ * When this function is called, we are most likely low on free +
+ * inactive_clean pages. Since we want to refill those pages as
+ * soon as possible, we'll make two loops over the inactive list,
+ * one to move the already cleaned pages to the inactive_clean lists
+ * and one to (often asynchronously) clean the dirty inactive pages.
+ *
+ * In situations where kswapd cannot keep up, user processes will
+ * end up calling this function. Since the user process needs to
+ * have a page before it can continue with its allocation, we'll
+ * do synchronous page flushing in that case.
+ *
+ * This code is heavily inspired by the FreeBSD source code. Thanks
+ * go out to Matthew Dillon.
+ */
+#define MAX_LAUNDER (4 * (1 << page_cluster))
+int page_launder(int gfp_mask, int sync)
+{
+	int launder_loop, maxscan, cleaned_pages, maxlaunder;
+	int can_get_io_locks;
+	struct list_head * page_lru;
+	struct page * page;
+
+	/*
+	 * We can only grab the IO locks (eg. for flushing dirty
+	 * buffers to disk) if __GFP_IO is set.
+	 */
+	can_get_io_locks = gfp_mask & __GFP_IO;
+
+	launder_loop = 0;
+	maxlaunder = 0;
+	cleaned_pages = 0;
+
+dirty_page_rescan:
+	spin_lock(&pagemap_lru_lock);
+	maxscan = nr_inactive_dirty_pages;
+	while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
+				maxscan-- > 0) {
+		page = list_entry(page_lru, struct page, lru);
+
+		/* Wrong page on list?! (list corruption, should not happen) */
+		if (!PageInactiveDirty(page)) {
+			printk("VM: page_launder, wrong page on list.\n");
+			list_del(page_lru);
+			nr_inactive_dirty_pages--;
+			page->zone->inactive_dirty_pages--;
+			continue;
+		}
+
+		/* Page is or was in use? Move it to the active list. */
+		if (PageTestandClearReferenced(page) || page->age > 0 ||
+				(!page->buffers && page_count(page) > 1) ||
+				page_ramdisk(page)) {
+			del_page_from_inactive_dirty_list(page);
+			add_page_to_active_list(page);
+			continue;
+		}
+
+		/*
+		 * The page is locked. IO in progress?
+		 * Move it to the back of the list.
+		 */
+		if (TryLockPage(page)) {
+			list_del(page_lru);
+			list_add(page_lru, &inactive_dirty_list);
+			continue;
+		}
+
+		/*
+		 * If the page has buffers, try to free the buffer mappings
+		 * associated with this page. If we succeed we either free
+		 * the page (in case it was a buffercache only page) or we
+		 * move the page to the inactive_clean list.
+		 *
+		 * On the first round, we should free all previously cleaned
+		 * buffer pages.
+		 */
+		if (page->buffers) {
+			int wait, clearedbuf;
+			int freed_page = 0;
+			/*
+			 * Since we might be doing disk IO, we have to
+			 * drop the spinlock and take an extra reference
+			 * on the page so it doesn't go away from under us.
+			 */
+			del_page_from_inactive_dirty_list(page);
+			page_cache_get(page);
+			spin_unlock(&pagemap_lru_lock);
+
+			/* Will we do (asynchronous) IO? */
+			if (launder_loop && maxlaunder == 0 && sync)
+				wait = 2;	/* Synchronous IO */
+			else if (launder_loop && maxlaunder-- > 0)
+				wait = 1;	/* Async IO */
+			else
+				wait = 0;	/* No IO */
+
+			/* Try to free the page buffers. */
+			clearedbuf = try_to_free_buffers(page, wait);
+
+			/*
+			 * Re-take the spinlock. Note that we cannot
+			 * unlock the page yet since we're still
+			 * accessing the page_struct here...
+			 */
+			spin_lock(&pagemap_lru_lock);
+
+			/* The buffers were not freed. */
+			if (!clearedbuf) {
+				add_page_to_inactive_dirty_list(page);
+
+			/* The page was only in the buffer cache. */
+			} else if (!page->mapping) {
+				atomic_dec(&buffermem_pages);
+				freed_page = 1;
+				cleaned_pages++;
+
+			/* The page has more users besides the cache and us. */
+			} else if (page_count(page) > 2) {
+				add_page_to_active_list(page);
+
+			/* OK, we "created" a freeable page. */
+			} else /* page->mapping && page_count(page) == 2 */ {
+				add_page_to_inactive_clean_list(page);
+				cleaned_pages++;
+			}
+
+			/*
+			 * Unlock the page and drop the extra reference.
+			 * We can only do it here because we are accessing
+			 * the page struct above.
+			 */
+			UnlockPage(page);
+			page_cache_release(page);
+
+			/*
+			 * If we're freeing buffer cache pages, stop when
+			 * we've got enough free memory.
+			 */
+			if (freed_page && !free_shortage())
+				break;
+			continue;
+		} else if (page->mapping && !PageDirty(page)) {
+			/*
+			 * If a page had an extra reference in
+			 * deactivate_page(), we will find it here.
+			 * Now the page is really freeable, so we
+			 * move it to the inactive_clean list.
+			 */
+			del_page_from_inactive_dirty_list(page);
+			add_page_to_inactive_clean_list(page);
+			UnlockPage(page);
+			cleaned_pages++;
+		} else {
+			/*
+			 * OK, we don't know what to do with the page.
+			 * It's no use keeping it here, so we move it to
+			 * the active list.
+			 */
+			del_page_from_inactive_dirty_list(page);
+			add_page_to_active_list(page);
+			UnlockPage(page);
+		}
+	}
+	spin_unlock(&pagemap_lru_lock);
+
+	/*
+	 * If we don't have enough free pages, we loop back once
+	 * to queue the dirty pages for writeout. When we were called
+	 * by a user process (that /needs/ a free page) and we didn't
+	 * free anything yet, we wait synchronously on the writeout of
+	 * MAX_SYNC_LAUNDER pages.
+	 *
+	 * We also wake up bdflush, since bdflush should, under most
+	 * loads, flush out the dirty pages before we have to wait on
+	 * IO.
+	 */
+	if (can_get_io_locks && !launder_loop && free_shortage()) {
+		launder_loop = 1;
+		/* If we cleaned pages, never do synchronous IO. */
+		if (cleaned_pages)
+			sync = 0;
+		/* We only do a few "out of order" flushes. */
+		maxlaunder = MAX_LAUNDER;
+		/* Kflushd takes care of the rest. */
+		wakeup_bdflush(0);
+		goto dirty_page_rescan;
+	}
+
+	/* Return the number of pages moved to the inactive_clean list. */
+	return cleaned_pages;
+}
+
+/**
+ * refill_inactive_scan - scan the active list and find pages to deactivate
+ * @priority: the priority at which to scan
+ * @oneshot: exit after deactivating one page
+ *
+ * This function will scan a portion of the active list to find
+ * unused pages, those pages will then be moved to the inactive list.
+ */
+int refill_inactive_scan(unsigned int priority, int oneshot)
+{
+	struct list_head * page_lru;
+	struct page * page;
+	int maxscan, page_active = 0;
+	int ret = 0;
+
+	/* Take the lock while messing with the list... */
+	spin_lock(&pagemap_lru_lock);
+	maxscan = nr_active_pages >> priority;
+	while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
+		page = list_entry(page_lru, struct page, lru);
+
+		/* Wrong page on list?! (list corruption, should not happen) */
+		if (!PageActive(page)) {
+			printk("VM: refill_inactive, wrong page on list.\n");
+			list_del(page_lru);
+			nr_active_pages--;
+			continue;
+		}
+
+		/* Do aging on the pages. */
+		if (PageTestandClearReferenced(page)) {
+			age_page_up_nolock(page);
+			page_active = 1;
+		} else {
+			age_page_down_ageonly(page);
+			/*
+			 * Since we don't hold a reference on the page
+			 * ourselves, we have to do our test a bit more
+			 * strictly than deactivate_page(). This is needed
+			 * since otherwise the system could hang shuffling
+			 * unfreeable pages from the active list to the
+			 * inactive_dirty list and back again...
+			 *
+			 * SUBTLE: we can have buffer pages with count 1.
+			 */
+			if (page_count(page) <= (page->buffers ? 2 : 1)) {
+				deactivate_page_nolock(page);
+				page_active = 0;
+			} else {
+				page_active = 1;
+			}
+		}
+		/*
+		 * If the page is still on the active list, move it
+		 * to the other end of the list. Otherwise it was
+		 * deactivated by age_page_down and we exit successfully.
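[For illustration only, not part of the patch: on its first pass page_launder() above starts no IO at all, on the launder_loop pass it queues up to MAX_LAUNDER asynchronous flushes, and only once maxlaunder is exhausted (and the caller asked for sync) does it wait on the IO. A small stand-alone sketch of that ladder, with launder_wait_level() invented for the example, is:]

#include <stdio.h>

/* 0 = don't start IO, 1 = start asynchronous write-out, 2 = wait for IO */
static int launder_wait_level(int launder_loop, int *maxlaunder, int sync)
{
	if (launder_loop && *maxlaunder == 0 && sync)
		return 2;               /* last resort: synchronous IO    */
	if (launder_loop && (*maxlaunder)-- > 0)
		return 1;               /* second pass: async write-out   */
	return 0;                       /* first pass: free only if clean */
}

int main(void)
{
	int maxlaunder = 2, i;

	/* First pass over the list: no IO at all. */
	printf("pass 1: %d\n", launder_wait_level(0, &maxlaunder, 1));

	/* Second pass: a few async flushes, then synchronous waiting. */
	for (i = 0; i < 4; i++)
		printf("pass 2, page %d: %d\n", i,
		       launder_wait_level(1, &maxlaunder, 1));
	return 0;
}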
+ */ + if (page_active || PageActive(page)) { + list_del(page_lru); + list_add(page_lru, &active_list); + } else { + ret = 1; + if (oneshot) + break; + } + } + spin_unlock(&pagemap_lru_lock); + + return ret; } /* - * Check if all zones have recently had memory_pressure (zone_wake_kswapd) + * Check if there are zones with a severe shortage of free pages, + * or if all zones have a minor shortage. */ -static inline int keep_kswapd_awake(void) +int free_shortage(void) { - int all_recent = 1; pg_data_t *pgdat = pgdat_list; + int sum = 0; + int freeable = nr_free_pages() + nr_inactive_clean_pages(); + int freetarget = freepages.high + inactive_target / 3; + /* Are we low on free pages globally? */ + if (freeable < freetarget) + return freetarget - freeable; + + /* If not, are we very low on any particular zone? */ do { int i; for(i = 0; i < MAX_NR_ZONES; i++) { zone_t *zone = pgdat->node_zones+ i; - if (zone->size) { - if (zone->free_pages < zone->pages_min) - return 1; - if (!zone->zone_wake_kswapd) - all_recent = 0; + if (zone->size && (zone->inactive_clean_pages + + zone->free_pages < zone->pages_min)) { + sum += zone->pages_min; + sum -= zone->free_pages; + sum -= zone->inactive_clean_pages; } } pgdat = pgdat->node_next; } while (pgdat); - return all_recent; + return sum; +} + +/* + * How many inactive pages are we short? + */ +int inactive_shortage(void) +{ + int shortage = 0; + + shortage += freepages.high; + shortage += inactive_target; + shortage -= nr_free_pages(); + shortage -= nr_inactive_clean_pages(); + shortage -= nr_inactive_dirty_pages; + + if (shortage > 0) + return shortage; + + return 0; } /* @@ -472,96 +876,140 @@ static inline int keep_kswapd_awake(void) * We want to try to free "count" pages, and we want to * cluster them so that we get good swap-out behaviour. * - * Don't try _too_ hard, though. We don't want to have bad - * latency. - * - * Note: only called by kswapd and try_to_free_pages - * both can WAIT at top level. + * OTOH, if we're a user process (and not kswapd), we + * really care about latency. In that case we don't try + * to free too many pages. */ -#define FREE_COUNT 8 -#define SWAP_COUNT 16 -static int do_try_to_free_pages(unsigned int gfp_mask) +static int refill_inactive(unsigned int gfp_mask, int user) { - int priority; - int count = FREE_COUNT; - int swap_count; + int priority, count, start_count, made_progress; + unsigned long idle_time; + + count = inactive_shortage() + free_shortage(); + if (user) + count = (1 << page_cluster); + start_count = count; /* Always trim SLAB caches when memory gets low. */ kmem_cache_reap(gfp_mask); - priority = 64; + /* + * Calculate the minimum time (in seconds) a process must + * have slept before we consider it for idle swapping. + * This must be the number of seconds it takes to go through + * all of the cache. Doing this idle swapping makes the VM + * smoother once we start hitting swap. + */ + idle_time = atomic_read(&page_cache_size); + idle_time += atomic_read(&buffermem_pages); + idle_time /= (inactive_target + 1); + + priority = 6; do { + made_progress = 0; + if (current->need_resched) { + __set_current_state(TASK_RUNNING); schedule(); - /* time has passed - pressure too? 
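[For illustration only, not part of the patch: free_shortage() and inactive_shortage() above are simple bookkeeping. Ignoring the per-zone fallback check in free_shortage(), the global arithmetic can be modelled in plain C with the kernel counters passed in as parameters; all names below are invented for the example:]

#include <stdio.h>

/* Pages missing before we reach the "comfortable" free + clean target. */
static long global_free_shortage(long free_pages, long inactive_clean,
				 long freepages_high, long inactive_target)
{
	long freeable = free_pages + inactive_clean;
	long freetarget = freepages_high + inactive_target / 3;

	return (freeable < freetarget) ? freetarget - freeable : 0;
}

/* Pages missing before enough inactive (clean or dirty) pages exist. */
static long global_inactive_shortage(long free_pages, long inactive_clean,
				     long inactive_dirty,
				     long freepages_high, long inactive_target)
{
	long shortage = freepages_high + inactive_target
			- free_pages - inactive_clean - inactive_dirty;

	return (shortage > 0) ? shortage : 0;
}

int main(void)
{
	/* Example: 256 free, 128 clean, 512 dirty, high mark 256, target 1024. */
	printf("free shortage: %ld\n",
	       global_free_shortage(256, 128, 256, 1024));
	printf("inactive shortage: %ld\n",
	       global_inactive_shortage(256, 128, 512, 256, 1024));
	return 0;
}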
*/ - if (!memory_pressure()) - goto done; } - while (shrink_mmap(priority, gfp_mask)) { - if (!--count) + while (refill_inactive_scan(priority, 1) || + swap_out(priority, gfp_mask, idle_time)) { + made_progress = 1; + if (--count <= 0) goto done; } - /* check if mission completed */ - if (!keep_kswapd_awake()) - goto done; + /* + * don't be too light against the d/i cache since + * refill_inactive() almost never fail when there's + * really plenty of memory free. + */ + shrink_dcache_memory(priority, gfp_mask); + shrink_icache_memory(priority, gfp_mask); /* Try to get rid of some shared memory pages.. */ - if (gfp_mask & __GFP_IO) { - /* - * don't be too light against the d/i cache since - * shrink_mmap() almost never fail when there's - * really plenty of memory free. - */ - count -= shrink_dcache_memory(priority, gfp_mask); - count -= shrink_icache_memory(priority, gfp_mask); - /* - * Not currently working, see fixme in shrink_?cache_memory - * In the inner funtions there is a comment: - * "To help debugging, a zero exit status indicates - * all slabs were released." (-arca?) - * lets handle it in a primitive but working way... - * if (count <= 0) - * goto done; - */ - if (!keep_kswapd_awake()) + while (shm_swap(priority, gfp_mask)) { + made_progress = 1; + if (--count <= 0) goto done; - - while (shm_swap(priority, gfp_mask)) { - if (!--count) - goto done; - } } /* * Then, try to page stuff out.. - * - * This will not actually free any pages (they get - * put in the swap cache), so we must not count this - * as a "count" success. */ - swap_count = SWAP_COUNT; - while (swap_out(priority, gfp_mask)) - if (--swap_count < 0) - break; + while (swap_out(priority, gfp_mask, 0)) { + made_progress = 1; + if (--count <= 0) + goto done; + } - } while (--priority >= 0); + /* + * If we either have enough free memory, or if + * page_launder() will be able to make enough + * free memory, then stop. + */ + if (!inactive_shortage() || !free_shortage()) + goto done; + + /* + * Only switch to a lower "priority" if we + * didn't make any useful progress in the + * last loop. + */ + if (!made_progress) + priority--; + } while (priority >= 0); - /* Always end on a shrink_mmap.., may sleep... */ - while (shrink_mmap(0, gfp_mask)) { - if (!--count) + /* Always end on a refill_inactive.., may sleep... */ + while (refill_inactive_scan(0, 1)) { + if (--count <= 0) goto done; } - /* Return 1 if any page is freed, or - * there are no more memory pressure */ - return (count < FREE_COUNT || !keep_kswapd_awake()); - + done: - return 1; + return (count < start_count); +} + +static int do_try_to_free_pages(unsigned int gfp_mask, int user) +{ + int ret = 0; + + /* + * If we're low on free pages, move pages from the + * inactive_dirty list to the inactive_clean list. + * + * Usually bdflush will have pre-cleaned the pages + * before we get around to moving them to the other + * list, so this is a relatively cheap operation. + */ + if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() + + nr_inactive_clean_pages()) + ret += page_launder(gfp_mask, user); + + /* + * If needed, we move pages from the active list + * to the inactive list. We also "eat" pages from + * the inode and dentry cache whenever we do this. + */ + if (free_shortage() || inactive_shortage()) { + shrink_dcache_memory(6, gfp_mask); + shrink_icache_memory(6, gfp_mask); + ret += refill_inactive(gfp_mask, user); + } else { + /* + * Reclaim unused slab cache memory. 
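[For illustration only, not part of the patch: refill_inactive() above starts at priority 6 and only drops to a more aggressive priority when a full pass makes no progress, stopping as soon as the requested number of pages has been dealt with. A toy version of that control flow, with try_one_step() standing in for the refill_inactive_scan()/swap_out()/shm_swap() helpers, might be:]

#include <stdio.h>

/* Pretend we only start finding pages once we scan aggressively enough. */
static int try_one_step(int priority)
{
	return priority <= 3;
}

static int refill_loop(int count)
{
	int priority = 6;
	int start_count = count;

	do {
		int made_progress = 0;

		while (try_one_step(priority)) {
			made_progress = 1;
			if (--count <= 0)
				goto done;
		}
		/* Only scan more aggressively if this pass found nothing. */
		if (!made_progress)
			priority--;
	} while (priority >= 0);
done:
	return count < start_count;
}

int main(void)
{
	printf("freed something: %d\n", refill_loop(8));
	return 0;
}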
+ */ + kmem_cache_reap(gfp_mask); + ret = 1; + } + + return ret; } DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); +DECLARE_WAIT_QUEUE_HEAD(kswapd_done); +struct task_struct *kswapd_task; /* * The background pageout daemon, started as a kernel thread @@ -584,6 +1032,7 @@ int kswapd(void *unused) tsk->pgrp = 1; strcpy(tsk->comm, "kswapd"); sigfillset(&tsk->blocked); + kswapd_task = tsk; /* * Tell the memory management that we're a "memory allocator", @@ -599,54 +1048,166 @@ int kswapd(void *unused) */ tsk->flags |= PF_MEMALLOC; + /* + * Kswapd main loop. + */ for (;;) { - if (!keep_kswapd_awake()) { - interruptible_sleep_on(&kswapd_wait); + static int recalc = 0; + + /* If needed, try to free some memory. */ + if (inactive_shortage() || free_shortage()) { + int wait = 0; + /* Do we need to do some synchronous flushing? */ + if (waitqueue_active(&kswapd_done)) + wait = 1; + do_try_to_free_pages(GFP_KSWAPD, wait); + } + + /* + * Do some (very minimal) background scanning. This + * will scan all pages on the active list once + * every minute. This clears old referenced bits + * and moves unused pages to the inactive list. + */ + refill_inactive_scan(6, 0); + + /* Once a second, recalculate some VM stats. */ + if (time_after(jiffies, recalc + HZ)) { + recalc = jiffies; + recalculate_vm_stats(); } - do_try_to_free_pages(GFP_KSWAPD); + /* + * Wake up everybody waiting for free memory + * and unplug the disk queue. + */ + wake_up_all(&kswapd_done); + run_task_queue(&tq_disk); + + /* + * We go to sleep if either the free page shortage + * or the inactive page shortage is gone. We do this + * because: + * 1) we need no more free pages or + * 2) the inactive pages need to be flushed to disk, + * it wouldn't help to eat CPU time now ... + * + * We go to sleep for one second, but if it's needed + * we'll be woken up earlier... + */ + if (!free_shortage() || !inactive_shortage()) + interruptible_sleep_on_timeout(&kswapd_wait, HZ); + /* + * TODO: insert out of memory check & oom killer + * invocation in an else branch here. + */ } } +void wakeup_kswapd(int block) +{ + DECLARE_WAITQUEUE(wait, current); + + if (current == kswapd_task) + return; + + if (!block) { + if (waitqueue_active(&kswapd_wait)) + wake_up(&kswapd_wait); + return; + } + + /* + * Kswapd could wake us up before we get a chance + * to sleep, so we have to be very careful here to + * prevent SMP races... + */ + __set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&kswapd_done, &wait); + + if (waitqueue_active(&kswapd_wait)) + wake_up(&kswapd_wait); + schedule(); + + remove_wait_queue(&kswapd_done, &wait); + __set_current_state(TASK_RUNNING); +} + /* * Called by non-kswapd processes when they want more - * memory. - * - * In a perfect world, this should just wake up kswapd - * and return. We don't actually want to swap stuff out - * from user processes, because the locking issues are - * nasty to the extreme (file write locks, and MM locking) - * - * One option might be to let kswapd do all the page-out - * and VM page table scanning that needs locking, and this - * process thread could do just the mmap shrink stage that - * can be done by just dropping cached pages without having - * any deadlock issues. + * memory but are unable to sleep on kswapd because + * they might be holding some IO locks ... 
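[As a loose analogy only, not from the patch: wakeup_kswapd() above wakes the daemon through kswapd_wait and, in the blocking case, sleeps on kswapd_done until a pass has completed. The same handshake can be sketched in user space with a pthread condition variable; the kernel of course uses wait queues rather than pthreads, and every name below is invented for the illustration:]

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t work = PTHREAD_COND_INITIALIZER;   /* ~ kswapd_wait */
static pthread_cond_t done = PTHREAD_COND_INITIALIZER;   /* ~ kswapd_done */
static int need_work, pass_done;

static void *daemon_thread(void *unused)
{
	(void)unused;
	pthread_mutex_lock(&lock);
	while (!need_work)
		pthread_cond_wait(&work, &lock);
	/* ... a memory-freeing pass would happen here ... */
	pass_done = 1;
	pthread_cond_broadcast(&done);    /* ~ wake_up_all(&kswapd_done) */
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void wakeup_daemon(int block)
{
	pthread_mutex_lock(&lock);
	need_work = 1;
	pthread_cond_signal(&work);       /* ~ wake_up(&kswapd_wait) */
	while (block && !pass_done)       /* ~ sleep on kswapd_done  */
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, daemon_thread, NULL);
	wakeup_daemon(1);
	pthread_join(tid, NULL);
	printf("daemon pass completed\n");
	return 0;
}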
*/ int try_to_free_pages(unsigned int gfp_mask) { - int retval = 1; + int ret = 1; if (gfp_mask & __GFP_WAIT) { - current->state = TASK_RUNNING; current->flags |= PF_MEMALLOC; - retval = do_try_to_free_pages(gfp_mask); + ret = do_try_to_free_pages(gfp_mask, 1); current->flags &= ~PF_MEMALLOC; } - /* someone needed memory that kswapd had not provided - * make sure kswapd runs, should not happen often */ - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + return ret; +} + +DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait); +/* + * Kreclaimd will move pages from the inactive_clean list to the + * free list, in order to keep atomic allocations possible under + * all circumstances. Even when kswapd is blocked on IO. + */ +int kreclaimd(void *unused) +{ + struct task_struct *tsk = current; + pg_data_t *pgdat; - return retval; + tsk->session = 1; + tsk->pgrp = 1; + strcpy(tsk->comm, "kreclaimd"); + sigfillset(&tsk->blocked); + current->flags |= PF_MEMALLOC; + + while (1) { + + /* + * We sleep until someone wakes us up from + * page_alloc.c::__alloc_pages(). + */ + interruptible_sleep_on(&kreclaimd_wait); + + /* + * Move some pages from the inactive_clean lists to + * the free lists, if it is needed. + */ + pgdat = pgdat_list; + do { + int i; + for(i = 0; i < MAX_NR_ZONES; i++) { + zone_t *zone = pgdat->node_zones + i; + if (!zone->size) + continue; + + while (zone->free_pages < zone->pages_low) { + struct page * page; + page = reclaim_page(zone); + if (!page) + break; + __free_page(page); + } + } + pgdat = pgdat->node_next; + } while (pgdat); + } } + static int __init kswapd_init(void) { - printk("Starting kswapd v1.7\n"); + printk("Starting kswapd v1.8\n"); swap_setup(); - kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); + kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); return 0; } -- cgit v1.2.3
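[For illustration only, not part of the patch: kreclaimd above tops up each zone's free list from its inactive_clean list until pages_low is reached, or the clean list runs dry. A toy model of that loop, with the invented toy_zone type and reclaim_one() standing in for reclaim_page()/__free_page():]

#include <stdio.h>

struct toy_zone {
	long free_pages;
	long pages_low;
	long inactive_clean_pages;
};

/* Take one page off the zone's inactive_clean list, if there is one. */
static int reclaim_one(struct toy_zone *z)
{
	if (z->inactive_clean_pages == 0)
		return 0;
	z->inactive_clean_pages--;
	return 1;
}

/* Refill the free list until pages_low is reached or nothing is left. */
static void top_up(struct toy_zone *z)
{
	while (z->free_pages < z->pages_low) {
		if (!reclaim_one(z))
			break;
		z->free_pages++;	/* ~ __free_page(reclaim_page(zone)) */
	}
}

int main(void)
{
	struct toy_zone z = { .free_pages = 3, .pages_low = 16,
			      .inactive_clean_pages = 40 };

	top_up(&z);
	printf("free %ld, clean left %ld\n",
	       z.free_pages, z.inactive_clean_pages);
	return 0;
}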