/*
 * linux/mm/vmscan.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 *
 * Swap reorganised 29.12.95, Stephen Tweedie.
 * kswapd added: 7.1.96 sct
 * Removed kswapd_ctl limits, and swap out as many pages as needed
 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 */

/*
 * NOTE(review): the ten #include directives below carry no header names
 * in this copy of the file.  They have to be restored from the original
 * tree before the file can be built.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include

/*
 * try_to_swap_out - try to unmap the one page mapped by *page_table.
 *
 * mm:         mm whose swap_cnt budget is decremented (if non-zero) for
 *             every present, non-reserved page looked at
 * vma:        area the pte belongs to; vma->vm_mm->rss is decremented
 *             whenever the pte is dropped, and vma->vm_ops->swapout is
 *             used for dirty pages when it is set
 * address:    virtual address of the pte, used for cache and TLB flushes
 * page_table: the pte to examine
 * gfp_mask:   only __GFP_IO is tested here; without it dirty pages are
 *             left alone
 *
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. It returns
 * zero if it couldn't do anything, and any other value
 * indicates it decreased rss, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 *
 * As implemented below: 0 comes from the out_failed, out_unlock and
 * out_swap_free exits; 1 is returned after vm_ops->swapout() returned 0
 * or after rw_swap_page() has been issued; a non-zero result of
 * vm_ops->swapout() is passed through unchanged.  Both non-zero paths
 * call vmlist_access_unlock(vma->vm_mm) before doing the write-out,
 * the zero paths never do.
 */
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	swp_entry_t entry;
	struct page * page;
	int (*swapout)(struct page *, struct file *);

	pte = *page_table;
	/* pte not present: nothing to do for this address. */
	if (!pte_present(pte))
		goto out_failed;
	page = pte_page(pte);
	/* Skip pages whose mem_map index is beyond max_mapnr, and reserved pages. */
	if ((page-mem_map >= max_mapnr) || PageReserved(page))
		goto out_failed;

	/* Charge this page against the mm's scan budget (never below zero). */
	if (mm->swap_cnt)
		mm->swap_cnt--;

	/* Don't look at this pte if it's been accessed recently. */
	if (pte_young(pte)) {
		/*
		 * Transfer the "accessed" bit from the page
		 * tables to the global page map.
		 */
		set_pte(page_table, pte_mkold(pte));
		SetPageReferenced(page);
		goto out_failed;
	}

	/*
	 * A non-zero TryLockPage() sends us to out_failed, which does not
	 * call UnlockPage() - presumably because the lock was not obtained.
	 */
	if (TryLockPage(page))
		goto out_failed;

	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 *
	 * Return 0, as we didn't actually free any real
	 * memory, and we should just continue our scan.
	 */
	if (PageSwapCache(page)) {
		/* The swap entry is taken from page->index. */
		entry.val = page->index;
		swap_duplicate(entry);
		set_pte(page_table, swp_entry_to_pte(entry));
		/* drop_pte is also entered from the clean-page case below. */
drop_pte:
		UnlockPage(page);
		vma->vm_mm->rss--;
		flush_tlb_page(vma, address);
		page_cache_release(page);
		goto out_failed;
	}

	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it..
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "shrink_mmap()".
	 */
	if (!pte_dirty(pte)) {
		flush_cache_page(vma, address);
		pte_clear(page_table);
		goto drop_pte;
	}

	/*
	 * Don't go down into the swap-out stuff if
	 * we cannot do I/O! Avoid recursing on FS
	 * locks etc.
	 */
	if (!(gfp_mask & __GFP_IO))
		goto out_unlock;

	/*
	 * Don't do any of the expensive stuff if
	 * we're not really interested in this zone.
	 */
	if (page->zone->free_pages > page->zone->pages_high)
		goto out_unlock;

	/*
	 * Ok, it's really dirty. That means that
	 * we should either create a new swap cache
	 * entry for it, or we should write it back
	 * to its own backing store.
	 *
	 * Note that in neither case do we actually
	 * know that we make a page available, but
	 * as we potentially sleep we can no longer
	 * continue scanning, so we might as well
	 * assume we free'd something.
	 *
	 * NOTE NOTE NOTE! This should just set a
	 * dirty bit in 'page', and just drop the
	 * pte. All the hard work would be done by
	 * shrink_mmap().
	 *
	 * That would get rid of a lot of problems.
	 */
	flush_cache_page(vma, address);
	if (vma->vm_ops && (swapout = vma->vm_ops->swapout)) {
		int error;
		struct file *file = vma->vm_file;

		/* Hold a reference on the file across swapout(); dropped by fput() below. */
		if (file) get_file(file);
		pte_clear(page_table);
		vma->vm_mm->rss--;
		flush_tlb_page(vma, address);
		/* The vma list lock is released before calling into swapout(). */
		vmlist_access_unlock(vma->vm_mm);
		error = swapout(page, file);
		UnlockPage(page);
		if (file) fput(file);
		if (!error)
			goto out_free_success;
		/* swapout() failed: drop the page reference and hand its result back. */
		page_cache_release(page);
		return error;
	}

	/*
	 * This is a dirty, swappable page. First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	entry = get_swap_page();
	if (!entry.val)
		goto out_unlock; /* No swap space left */

	/*
	 * NOTE(review): on failure this goes to out_swap_free, which returns
	 * without UnlockPage(); presumably prepare_highmem_swapout() has
	 * already released the page lock in that case - confirm.
	 */
	if (!(page = prepare_highmem_swapout(page)))
		goto out_swap_free;

	swap_duplicate(entry); /* One for the process, one for the swap cache */

	/* Add it to the swap cache */
	add_to_swap_cache(page, entry);

	/* Put the swap entry into the pte after the page is in swapcache */
	vma->vm_mm->rss--;
	set_pte(page_table, swp_entry_to_pte(entry));
	flush_tlb_page(vma, address);
	vmlist_access_unlock(vma->vm_mm);

	/* OK, do a physical asynchronous write to swap. */
	rw_swap_page(WRITE, page, 0);

	/* Success exit, shared with the vm_ops->swapout() path. */
out_free_success:
	page_cache_release(page);
	return 1;
	/* Give the unused swap entry back, then fall through to return 0. */
out_swap_free:
	swap_free(entry);
out_failed:
	return 0;
	/* Page was locked by us but nothing was done with it. */
out_unlock:
	UnlockPage(page);
	return 0;
}

/*
 * A new implementation of swap_out(). We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process. The number of blocks actually swapped is determined on the
 * number of page faults, that this process actually had in the last time,
 * so we won't swap heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on much CPU to waste with the
 * swap block search, not a hint, of how much blocks to swap with
 * each process.
*
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */

/*
 * swap_out_pmd - scan the ptes below one pmd entry.
 *
 * Calls try_to_swap_out() for each page from address up to end, with end
 * clamped to the end of the pmd that contains address.  Before each
 * attempt the address of the following page is stored in
 * vma->vm_mm->swap_address.
 *
 * Returns the first non-zero result of try_to_swap_out(); 0 if the pmd
 * is none or bad (a bad pmd is reported and cleared), if mm->swap_cnt
 * drops to zero, or if the range is exhausted.
 *
 * NOTE(review): unlike swap_out_pgd() the clamp below does not test
 * pmd_end for being 0 (wrap-around) - confirm this cannot matter for
 * the address ranges passed in.
 */
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		pmd_ERROR(*dir);
		pmd_clear(dir);
		return 0;
	}

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	do {
		int result;
		/* Remember where to resume: the page after this one. */
		vma->vm_mm->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
		if (result)
			return result;
		if (!mm->swap_cnt)
			return 0;
		address += PAGE_SIZE;
		pte++;
	/* The "address &&" test stops the walk if address wrapped to 0. */
	} while (address && (address < end));
	return 0;
}

/*
 * swap_out_pgd - scan the pmds below one pgd entry.
 *
 * Calls swap_out_pmd() for each pmd from address up to end, with end
 * clamped to the end of the pgd that contains address (the clamp is
 * skipped when that end computes to 0).
 *
 * Returns the first non-zero result of swap_out_pmd(); 0 if the pgd is
 * none or bad (a bad pgd is reported and cleared), if mm->swap_cnt is
 * zero after a pmd, or if the range is exhausted.
 */
static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		pgd_ERROR(*dir);
		pgd_clear(dir);
		return 0;
	}

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (pgd_end && (end > pgd_end))
		end = pgd_end;

	do {
		int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
		if (result)
			return result;
		if (!mm->swap_cnt)
			return 0;
		/* Advance to the start of the next pmd. */
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
	return 0;
}

/*
 * swap_out_vma - scan one vm area from address to vma->vm_end.
 *
 * Areas with VM_LOCKED set are skipped (returns 0).  It is a BUG() to be
 * called with address at or beyond vma->vm_end.
 *
 * Returns the first non-zero result of swap_out_pgd(), otherwise 0.
 */
static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
	pgd_t *pgdir;
	unsigned long end;

	/* Don't swap out areas which are locked down */
	if (vma->vm_flags & VM_LOCKED)
		return 0;

	pgdir = pgd_offset(vma->vm_mm, address);

	end = vma->vm_end;
	if (address >= end)
		BUG();
	do {
		int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
		if (result)
			return result;
		if (!mm->swap_cnt)
			return 0;
		/* Advance to the start of the next pgd entry. */
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	} while (address && (address < end));
	return 0;
}

/*
 * swap_out_mm - continue scanning an mm at mm->swap_address.
 *
 * Takes vmlist_access_lock(mm) and walks the vmas starting with the one
 * returned by find_vma() for the saved address.
 *
 * A non-zero result from swap_out_vma() is returned immediately, without
 * vmlist_access_unlock() being called here: try_to_swap_out() calls
 * vmlist_access_unlock(vma->vm_mm) on every path on which it returns
 * non-zero (presumably vma->vm_mm is the same mm - confirm).
 *
 * If nothing could be done the lock is released, mm->swap_cnt and
 * mm->swap_address are reset to 0 and 0 is returned.
 */
static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
{
	unsigned long address;
	struct vm_area_struct* vma;

	/*
	 * Go through process' page directory.
	 */
	address = mm->swap_address;

	/*
	 * Find the proper vm-area after freezing the vma chain
	 * and ptes.
	 */
	vmlist_access_lock(mm);
	vma = find_vma(mm, address);
	if (vma) {
		/* The saved address may lie in a hole before the vma found. */
		if (address < vma->vm_start)
			address = vma->vm_start;

		for (;;) {
			int result = swap_out_vma(mm, vma, address, gfp_mask);
			if (result)
				return result;
			vma = vma->vm_next;
			if (!vma)
				break;
			address = vma->vm_start;
		}
	}
	vmlist_access_unlock(mm);

	/* We didn't find anything for the process */
	mm->swap_cnt = 0;
	mm->swap_address = 0;
	return 0;
}

/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1. Return values != 1 from
 * the lower level routines result in continued processing.
 *
 * priority: the number of selection rounds is derived from
 *           (nr_threads << 2) >> (priority >> 2), with a minimum of 1;
 *           since the loop runs while counter >= 0 one more round than
 *           that value may be made.
 * gfp_mask: passed down unchanged to swap_out_mm().
 *
 * The whole function runs between lock_kernel() and unlock_kernel().
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
	struct task_struct * p;
	int counter;
	int __ret = 0;

	lock_kernel();
	/*
	 * We make one or two passes through the task list, indexed by
	 * assign = {0, 1}:
	 * Pass 1: select the swappable task with maximal RSS that has
	 * not yet been swapped out.
	 * Pass 2: re-assign rss swap_cnt values, then select as above.
	 *
	 * With this approach, there's no need to remember the last task
	 * swapped out. If the swap-out fails, we clear swap_cnt so the
	 * task won't be selected again until all others have been tried.
	 *
	 * Think of swap_cnt as a "shadow rss" - it tells us which process
	 * we want to page out (always try largest first).
	 */
	counter = (nr_threads << 2) >> (priority >> 2);
	if (counter < 1)
		counter = 1;

	for (; counter >= 0; counter--) {
		unsigned long max_cnt = 0;
		struct mm_struct *best = NULL;
		int pid = 0;
		int assign = 0;
	/* Re-entered once with assign == 1 when the first pass finds no candidate. */
	select:
		read_lock(&tasklist_lock);
		p = init_task.next_task;
		for (; p != &init_task; p = p->next_task) {
			struct mm_struct *mm = p->mm;
			if (!p->swappable || !mm)
				continue;
			if (mm->rss <= 0)
				continue;
			/* Refresh swap_cnt? */
			if (assign == 1)
				mm->swap_cnt = mm->rss;
			if (mm->swap_cnt > max_cnt) {
				max_cnt = mm->swap_cnt;
				best = mm;
				pid = p->pid;
			}
		}
		read_unlock(&tasklist_lock);
		if (!best) {
			if (!assign) {
				assign = 1;
				goto select;
			}
			/* Even after refreshing swap_cnt nothing is eligible. */
			goto out;
		} else {
			int ret;

			/* Hold an mm_count reference across the scan; dropped by mmdrop(). */
			atomic_inc(&best->mm_count);
			ret = swap_out_mm(best, gfp_mask);
			mmdrop(best);

			if (!ret)
				continue;

			/* A negative result sends SIGBUS to the pid the mm was picked from. */
			if (ret < 0)
				kill_proc(pid, SIGBUS, 1);
			__ret = 1;
			goto out;
		}
	}
out:
	unlock_kernel();
	return __ret;
}

/*
 * Check if there is any memory pressure (free_pages < pages_low)
 *
 * Returns 1 as soon as a zone with non-zero size on any node of
 * pgdat_list has free_pages below pages_low, otherwise 0.
 * The do/while form dereferences pgdat_list without a NULL check, so it
 * assumes at least one node exists.
 */
static inline int memory_pressure(void)
{
	pg_data_t *pgdat = pgdat_list;

	do {
		int i;
		for(i = 0; i < MAX_NR_ZONES; i++) {
			zone_t *zone = pgdat->node_zones+ i;
			if (zone->size &&
			    zone->free_pages < zone->pages_low)
				return 1;
		}
		pgdat = pgdat->node_next;
	} while (pgdat);

	return 0;
}

/*
 * Check if there recently has been memory pressure (zone_wake_kswapd)
 *
 * Returns 1 as soon as a zone with non-zero size on any node of
 * pgdat_list has zone_wake_kswapd set, otherwise 0.
 * Like memory_pressure() this assumes pgdat_list is not NULL.
 */
static inline int keep_kswapd_awake(void)
{
	pg_data_t *pgdat = pgdat_list;

	do {
		int i;
		for(i = 0; i < MAX_NR_ZONES; i++) {
			zone_t *zone = pgdat->node_zones+ i;
			if (zone->size &&
			    zone->zone_wake_kswapd)
				return 1;
		}
		pgdat = pgdat->node_next;
	} while (pgdat);

	return 0;
}

/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we want to
 * cluster them so that we get good swap-out behaviour.
 *
 * Don't try _too_ hard, though. We don't want to have bad
 * latency.
 *
 * FREE_COUNT is the initial value of "count" in
 * do_try_to_free_pages(); SWAP_COUNT is the initial value of its
 * per-priority swap_out() budget.
 */
#define FREE_COUNT	8
#define SWAP_COUNT	16
/*
 * do_try_to_free_pages - one reclaim run, from priority 64 down to 0.
 *
 * Each priority level tries shrink_mmap(), then (only with __GFP_IO)
 * the dcache/icache shrinkers and shm_swap(), then swap_out().
 *
 * Returns 1 through the "done" exit (count reached zero, or
 * memory_pressure()/keep_kswapd_awake() reported nothing to do);
 * otherwise returns whether count fell below FREE_COUNT or
 * memory_pressure() is now false.
 */
static int do_try_to_free_pages(unsigned int gfp_mask)
{
	int priority;
	int count = FREE_COUNT;
	int swap_count;

	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);

	priority = 64;
	do {
		if (current->need_resched) {
			schedule();
			/* time has passed - pressure too? */
			if (!memory_pressure())
				goto done;
		}

		while (shrink_mmap(priority, gfp_mask)) {
			if (!--count)
				goto done;
		}

		/* not (been) low on memory - it is
		 * pointless to try to swap out.
		 */
		if (!keep_kswapd_awake())
			goto done;

		/* Try to get rid of some shared memory pages.. */
		if (gfp_mask & __GFP_IO) {
			/*
			 * don't be too light against the d/i cache since
			 * shrink_mmap() almost never fails when there's
			 * really plenty of memory free.
			 *
			 * NOTE(review): if these calls return a positive
			 * number, count can become <= 0 here and the
			 * "!--count" tests, which only fire at exactly
			 * zero, would no longer trigger - see the disabled
			 * check below.
			 */
			count -= shrink_dcache_memory(priority, gfp_mask);
			count -= shrink_icache_memory(priority, gfp_mask);
			/*
			 * Not currently working, see fixme in shrink_?cache_memory
			 * In the inner functions there is a comment:
			 * "To help debugging, a zero exit status indicates
			 * all slabs were released." (-arca?)
			 * lets handle it in a primitive but working way...
			 * if (count <= 0)
			 * goto done;
			 */
			if (!keep_kswapd_awake())
				goto done;

			while (shm_swap(priority, gfp_mask)) {
				if (!--count)
					goto done;
			}
		}

		/*
		 * Then, try to page stuff out..
		 *
		 * This will not actually free any pages (they get
		 * put in the swap cache), so we must not count this
		 * as a "count" success.
		 */
		swap_count = SWAP_COUNT;
		while (swap_out(priority, gfp_mask))
			if (--swap_count < 0)
				break;

	} while (--priority >= 0);

	/* Always end on a shrink_mmap.., may sleep... */
	while (shrink_mmap(0, gfp_mask)) {
		if (!--count)
			goto done;
	}
	/* Return 1 if any page is freed, or
	 * there is no more memory pressure */
	return (count < FREE_COUNT || !memory_pressure());

done:
	return 1;
}

/* Wait queue kswapd() sleeps on; try_to_free_pages() wakes it up. */
DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
*/
int kswapd(void *unused)
{
	struct task_struct *self = current;

	/* Identify this thread: session 1, process group 1, name "kswapd". */
	self->session = 1;
	self->pgrp = 1;
	strcpy(self->comm, "kswapd");

	/* Fill the blocked-signal set completely. */
	sigfillset(&self->blocked);

	/*
	 * Flag ourselves as a "memory allocator".  According to the
	 * original note here (which points at "__alloc_pages()") this
	 * gives kswapd access to memory regardless and keeps it out of
	 * the normal page freeing logic, so that the little memory it
	 * may need in order to page something out does not make it
	 * recurse into freeing memory again.
	 */
	self->flags |= PF_MEMALLOC;

	/* Never returns: one reclaim run per iteration. */
	while (1) {
		/*
		 * No zone asks for kswapd to stay awake: sleep on
		 * kswapd_wait with a timeout of HZ, then do an early
		 * reclaim attempt anyway.
		 */
		if (!keep_kswapd_awake())
			interruptible_sleep_on_timeout(&kswapd_wait, HZ);

		do_try_to_free_pages(GFP_KSWAPD);
	}
}

/*
 * Called by non-kswapd processes when they want more
 * memory.
 *
 * In a perfect world, this should just wake up kswapd
 * and return. We don't actually want to swap stuff out
 * from user processes, because the locking issues are
 * nasty to the extreme (file write locks, and MM locking)
 *
 * One option might be to let kswapd do all the page-out
 * and VM page table scanning that needs locking, and this
 * process thread could do just the mmap shrink stage that
 * can be done by just dropping cached pages without having
 * any deadlock issues.
*/
int try_to_free_pages(unsigned int gfp_mask)
{
	int freed;

	/*
	 * Without __GFP_WAIT no reclaim is done here: wake kswapd if it
	 * is sitting on its wait queue and report success.
	 */
	if (!(gfp_mask & __GFP_WAIT)) {
		/* make sure kswapd runs */
		if (waitqueue_active(&kswapd_wait))
			wake_up_interruptible(&kswapd_wait);
		return 1;
	}

	/*
	 * Do the reclaim run ourselves, with PF_MEMALLOC set on the
	 * current task for its duration.
	 */
	current->state = TASK_RUNNING;
	current->flags |= PF_MEMALLOC;
	freed = do_try_to_free_pages(gfp_mask);
	current->flags &= ~PF_MEMALLOC;

	return freed;
}

/*
 * Init-time hook: announce ourselves, run swap_setup() and start the
 * kswapd() kernel thread.  Always returns 0; the result of
 * kernel_thread() is not looked at.
 */
static int __init kswapd_init(void)
{
	const unsigned long clone_flags = CLONE_FS | CLONE_FILES | CLONE_SIGHAND;

	printk("Starting kswapd v1.6\n");
	swap_setup();
	kernel_thread(kswapd, NULL, clone_flags);

	return 0;
}

module_init(kswapd_init)