| author    | Ralf Baechle <ralf@linux-mips.org>           | 1999-10-09 00:00:47 +0000 |
|-----------|----------------------------------------------|---------------------------|
| committer | Ralf Baechle <ralf@linux-mips.org>           | 1999-10-09 00:00:47 +0000 |
| commit    | d6434e1042f3b0a6dfe1b1f615af369486f9b1fa     |                           |
| tree      | e2be02f33984c48ec019c654051d27964e42c441 /mm |                           |
| parent    | 609d1e803baf519487233b765eb487f9ec227a18     |                           |
Merge with 2.3.19.
Diffstat (limited to 'mm')

| mode       | file            | lines changed |
|------------|-----------------|---------------|
| -rw-r--r-- | mm/Makefile     |   4 |
| -rw-r--r-- | mm/bigmem.c     |  71 |
| -rw-r--r-- | mm/filemap.c    | 533 |
| -rw-r--r-- | mm/memory.c     | 245 |
| -rw-r--r-- | mm/mlock.c      |  10 |
| -rw-r--r-- | mm/mmap.c       |  11 |
| -rw-r--r-- | mm/mprotect.c   |   2 |
| -rw-r--r-- | mm/mremap.c     |   2 |
| -rw-r--r-- | mm/page_alloc.c |  95 |
| -rw-r--r-- | mm/page_io.c    |  82 |
| -rw-r--r-- | mm/slab.c       | 152 |
| -rw-r--r-- | mm/swap_state.c |  19 |
| -rw-r--r-- | mm/swapfile.c   |  98 |
| -rw-r--r-- | mm/vmalloc.c    |   3 |
| -rw-r--r-- | mm/vmscan.c     | 171 |

15 files changed, 951 insertions, 547 deletions
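
The largest self-contained addition in this merge is the new mm/bigmem.c, whose helpers bounce a BIGMEM (high-memory) page through an ordinary low page before swap I/O, because the low-level block drivers can only address the page contents through b_data. The following is a minimal userspace sketch of that bounce-copy idea; `struct page`, `is_high`, and `prepare_swapout()` here are illustrative stand-ins, not the kernel's types or API.

```c
/*
 * Minimal userspace model of the bounce-copy step in mm/bigmem.c:
 * a page the block layer cannot address directly ("high") is copied
 * into an ordinary ("low") page before being handed to swap I/O.
 * "struct page", "is_high" and prepare_swapout() are illustrative
 * stand-ins, not the kernel's types or API.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096

struct page {
	int   is_high;   /* models PageBIGMEM(page) */
	void *data;      /* page contents */
};

/* Return a page the low-memory-only I/O path may use, or NULL on OOM. */
static struct page *prepare_swapout(struct page *page)
{
	struct page *low;

	if (!page->is_high)
		return page;                 /* already addressable as-is */

	low = malloc(sizeof(*low));
	if (!low)
		return NULL;                 /* caller retries or fails the swapout */
	low->is_high = 0;
	low->data = malloc(PAGE_SIZE);
	if (!low->data) {
		free(low);
		return NULL;
	}

	memcpy(low->data, page->data, PAGE_SIZE); /* the "kmap + copy_page" step */

	free(page->data);                    /* forget the high page: its data */
	free(page);                          /* now lives in the low page      */
	return low;
}

int main(void)
{
	struct page *p = malloc(sizeof(*p));

	p->is_high = 1;
	p->data = malloc(PAGE_SIZE);
	memset(p->data, 0xaa, PAGE_SIZE);

	p = prepare_swapout(p);
	printf("after prepare_swapout: %s page\n",
	       p ? (p->is_high ? "high" : "low") : "no");
	if (p) {
		free(p->data);
		free(p);
	}
	return 0;
}
```

replace_with_bigmem() in the same new file performs the inverse copy, promoting a freshly swapped-in page back into BIGMEM when a free high page is available.
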
diff --git a/mm/Makefile b/mm/Makefile index c64eefbd2..68404aa67 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -12,4 +12,8 @@ O_OBJS := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o \ swap.o vmscan.o page_io.o page_alloc.o swap_state.o swapfile.o +ifeq ($(CONFIG_BIGMEM),y) +O_OBJS += bigmem.o +endif + include $(TOPDIR)/Rules.make diff --git a/mm/bigmem.c b/mm/bigmem.c new file mode 100644 index 000000000..af63e860c --- /dev/null +++ b/mm/bigmem.c @@ -0,0 +1,71 @@ +/* + * BIGMEM common code and variables. + * + * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de + * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de + */ + +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/bigmem.h> + +unsigned long bigmem_mapnr; +int nr_free_bigpages = 0; + +struct page * prepare_bigmem_swapout(struct page * page) +{ + /* if this is a bigmem page so it can't be swapped out directly + otherwise the b_data buffer addresses will break + the lowlevel device drivers. */ + if (PageBIGMEM(page)) { + unsigned long regular_page; + unsigned long vaddr; + + regular_page = __get_free_page(GFP_ATOMIC); + if (!regular_page) + return NULL; + + vaddr = kmap(page_address(page), KM_READ); + copy_page(regular_page, vaddr); + kunmap(vaddr, KM_READ); + + /* ok, we can just forget about our bigmem page since + we stored its data into the new regular_page. */ + __free_page(page); + + page = MAP_NR(regular_page) + mem_map; + } + return page; +} + +struct page * replace_with_bigmem(struct page * page) +{ + if (!PageBIGMEM(page) && nr_free_bigpages) { + unsigned long kaddr; + + kaddr = __get_free_page(GFP_ATOMIC|GFP_BIGMEM); + if (kaddr) { + struct page * bigmem_page; + + bigmem_page = MAP_NR(kaddr) + mem_map; + if (PageBIGMEM(bigmem_page)) { + unsigned long vaddr; + + vaddr = kmap(kaddr, KM_WRITE); + copy_page(vaddr, page_address(page)); + kunmap(vaddr, KM_WRITE); + + /* Preserve the caching of the swap_entry. */ + bigmem_page->offset = page->offset; + + /* We can just forget the old page since + we stored its data into the new + bigmem_page. */ + __free_page(page); + + page = bigmem_page; + } + } + } + return page; +} diff --git a/mm/filemap.c b/mm/filemap.c index 668c6c99f..5efa9aaf7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -33,6 +33,8 @@ * * finished 'unifying' the page and buffer cache and SMP-threaded the * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> + * + * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> */ atomic_t page_cache_size = ATOMIC_INIT(0); @@ -40,7 +42,16 @@ unsigned int page_hash_bits; struct page **page_hash_table; spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED; +/* + * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with + * the pagemap_lru_lock held. + */ +spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED; +#define CLUSTER_PAGES (1 << page_cluster) +#define CLUSTER_SHIFT (PAGE_CACHE_SHIFT + page_cluster) +#define CLUSTER_BYTES (1 << CLUSTER_SHIFT) +#define CLUSTER_OFFSET(x) (((x) >> CLUSTER_SHIFT) << CLUSTER_SHIFT) void __add_page_to_hash_queue(struct page * page, struct page **p) { @@ -117,6 +128,7 @@ repeat: } if (page_count(page) != 2) printk("hm, busy page invalidated? 
(not necesserily a bug)\n"); + lru_cache_del(page); remove_page_from_inode_queue(page); remove_page_from_hash_queue(page); @@ -151,8 +163,9 @@ repeat: lock_page(page); - if (inode->i_op->flushpage) - inode->i_op->flushpage(inode, page, 0); + if (!inode->i_op->flushpage || + inode->i_op->flushpage(inode, page, 0)) + lru_cache_del(page); /* * We remove the page from the page cache @@ -212,93 +225,75 @@ repeat: spin_unlock(&pagecache_lock); } -extern atomic_t too_many_dirty_buffers; - int shrink_mmap(int priority, int gfp_mask) { - static unsigned long clock = 0; - unsigned long limit = num_physpages << 1; + int ret = 0, count; + LIST_HEAD(young); + LIST_HEAD(old); + LIST_HEAD(forget); + struct list_head * page_lru, * dispose; struct page * page; - int count, users; - count = limit >> priority; + count = nr_lru_pages / (priority+1); - page = mem_map + clock; - do { - int referenced; + spin_lock(&pagemap_lru_lock); - /* This works even in the presence of PageSkip because - * the first two entries at the beginning of a hole will - * be marked, not just the first. - */ - page++; - clock++; - if (clock >= max_mapnr) { - clock = 0; - page = mem_map; - } - if (PageSkip(page)) { - /* next_hash is overloaded for PageSkip */ - page = page->next_hash; - clock = page - mem_map; - } - - referenced = test_and_clear_bit(PG_referenced, &page->flags); + while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) { + page = list_entry(page_lru, struct page, lru); + list_del(page_lru); + dispose = &lru_cache; + if (test_and_clear_bit(PG_referenced, &page->flags)) + /* Roll the page at the top of the lru list, + * we could also be more aggressive putting + * the page in the young-dispose-list, so + * avoiding to free young pages in each pass. + */ + goto dispose_continue; + + dispose = &old; + /* don't account passes over not DMA pages */ if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) - continue; + goto dispose_continue; + if (!(gfp_mask & __GFP_BIGMEM) && PageBIGMEM(page)) + goto dispose_continue; count--; - /* - * Some common cases that we just short-circuit without - * getting the locks - we need to re-check this once we - * have the lock, but that's fine. - */ - users = page_count(page); - if (!users) - continue; - if (!page->buffers) { - if (!page->inode) - continue; - if (users > 1) - continue; - } - - /* - * ok, now the page looks interesting. Re-check things - * and keep the lock. - */ + dispose = &young; + if (TryLockPage(page)) + goto dispose_continue; + + /* Release the pagemap_lru lock even if the page is not yet + queued in any lru queue since we have just locked down + the page so nobody else may SMP race with us running + a lru_cache_del() (lru_cache_del() always run with the + page locked down ;). */ + spin_unlock(&pagemap_lru_lock); + + /* avoid unscalable SMP locking */ + if (!page->buffers && page_count(page) > 1) + goto unlock_noput_continue; + + /* Take the pagecache_lock spinlock held to avoid + other tasks to notice the page while we are looking at its + page count. If it's a pagecache-page we'll free it + in one atomic transaction after checking its page count. 
*/ spin_lock(&pagecache_lock); - if (!page->inode && !page->buffers) { - spin_unlock(&pagecache_lock); - continue; - } - if (!page_count(page)) { - spin_unlock(&pagecache_lock); - BUG(); - continue; - } - get_page(page); - if (TryLockPage(page)) { - spin_unlock(&pagecache_lock); - goto put_continue; - } - /* - * we keep pagecache_lock locked and unlock it in - * each branch, so that the page->inode case doesnt - * have to re-grab it. Here comes the 'real' logic - * to free memory: - */ + /* avoid freeing the page while it's locked */ + get_page(page); /* Is it a buffer page? */ if (page->buffers) { - int mem = page->inode ? 0 : PAGE_CACHE_SIZE; spin_unlock(&pagecache_lock); if (!try_to_free_buffers(page)) goto unlock_continue; - atomic_sub(mem, &buffermem); + /* page was locked, inode can't go away under us */ + if (!page->inode) { + atomic_sub(PAGE_CACHE_SIZE, &buffermem); + goto made_buffer_progress; + } spin_lock(&pagecache_lock); } @@ -307,7 +302,7 @@ int shrink_mmap(int priority, int gfp_mask) * (count == 2 because we added one ourselves above). */ if (page_count(page) != 2) - goto spin_unlock_continue; + goto cache_unlock_continue; /* * Is it a page swap page? If so, we want to @@ -316,35 +311,68 @@ int shrink_mmap(int priority, int gfp_mask) */ if (PageSwapCache(page)) { spin_unlock(&pagecache_lock); - if (referenced && swap_count(page->offset) != 2) - goto unlock_continue; __delete_from_swap_cache(page); - page_cache_release(page); - goto made_progress; + goto made_inode_progress; } /* is it a page-cache page? */ - if (!referenced && page->inode && !pgcache_under_min()) { - remove_page_from_inode_queue(page); - remove_page_from_hash_queue(page); - page->inode = NULL; - spin_unlock(&pagecache_lock); - - page_cache_release(page); - goto made_progress; + if (page->inode) + { + dispose = &old; + if (!pgcache_under_min()) + { + remove_page_from_inode_queue(page); + remove_page_from_hash_queue(page); + page->inode = NULL; + spin_unlock(&pagecache_lock); + goto made_inode_progress; + } + goto cache_unlock_continue; } -spin_unlock_continue: + + dispose = &forget; + printk(KERN_ERR "shrink_mmap: unknown LRU page!\n"); + +cache_unlock_continue: spin_unlock(&pagecache_lock); unlock_continue: UnlockPage(page); -put_continue: put_page(page); - } while (count > 0); - return 0; -made_progress: +dispose_relock_continue: + /* even if the dispose list is local, a truncate_inode_page() + may remove a page from its queue so always + synchronize with the lru lock while accesing the + page->lru field */ + spin_lock(&pagemap_lru_lock); + list_add(page_lru, dispose); + continue; + +unlock_noput_continue: + UnlockPage(page); + goto dispose_relock_continue; + +dispose_continue: + list_add(page_lru, dispose); + } + goto out; + +made_inode_progress: + page_cache_release(page); +made_buffer_progress: UnlockPage(page); put_page(page); - return 1; + ret = 1; + spin_lock(&pagemap_lru_lock); + /* nr_lru_pages needs the spinlock */ + nr_lru_pages--; + +out: + list_splice(&young, &lru_cache); + list_splice(&old, lru_cache.prev); + + spin_unlock(&pagemap_lru_lock); + + return ret; } static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page) @@ -461,13 +489,14 @@ static inline void __add_to_page_cache(struct page * page, { unsigned long flags; - flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error)); - page->flags = flags | ((1 << PG_locked) | (1 << PG_referenced)); - page->owner = (int)current; /* REMOVEME */ + flags = page->flags & ~((1 << PG_uptodate) | (1 
<< PG_error) | (1 << PG_referenced)); + page->flags = flags | (1 << PG_locked); + page->owner = current; /* REMOVEME */ get_page(page); page->offset = offset; add_page_to_inode_queue(inode, page); __add_page_to_hash_queue(page, hash); + lru_cache_add(page); } void add_to_page_cache(struct page * page, struct inode * inode, unsigned long offset) @@ -498,39 +527,58 @@ int add_to_page_cache_unique(struct page * page, } /* - * Try to read ahead in the file. "page_cache" is a potentially free page - * that we could use for the cache (if it is 0 we can try to create one, - * this is all overlapped with the IO on the previous page finishing anyway) + * This adds the requested page to the page cache if it isn't already there, + * and schedules an I/O to read in its contents from disk. */ -static unsigned long try_to_read_ahead(struct file * file, - unsigned long offset, unsigned long page_cache) +static inline void page_cache_read(struct file * file, unsigned long offset) { + unsigned long new_page; struct inode *inode = file->f_dentry->d_inode; - struct page * page; - struct page ** hash; + struct page ** hash = page_hash(inode, offset); + struct page * page; - offset &= PAGE_CACHE_MASK; - switch (page_cache) { - case 0: - page_cache = page_cache_alloc(); - if (!page_cache) - break; - default: - if (offset >= inode->i_size) - break; - hash = page_hash(inode, offset); - page = page_cache_entry(page_cache); - if (!add_to_page_cache_unique(page, inode, offset, hash)) { - /* - * We do not have to check the return value here - * because it's a readahead. - */ - inode->i_op->readpage(file, page); - page_cache = 0; - page_cache_release(page); - } + spin_lock(&pagecache_lock); + page = __find_page_nolock(inode, offset, *hash); + spin_unlock(&pagecache_lock); + if (page) + return; + + new_page = page_cache_alloc(); + if (!new_page) + return; + page = page_cache_entry(new_page); + + if (!add_to_page_cache_unique(page, inode, offset, hash)) { + inode->i_op->readpage(file, page); + page_cache_release(page); + return; + } + + /* + * We arrive here in the unlikely event that someone + * raced with us and added our page to the cache first. + */ + page_cache_free(new_page); + return; +} + +/* + * Read in an entire cluster at once. A cluster is usually a 64k- + * aligned block that includes the address requested in "offset." 
+ */ +static void read_cluster_nonblocking(struct file * file, + unsigned long offset) +{ + off_t filesize = file->f_dentry->d_inode->i_size; + unsigned long pages = CLUSTER_PAGES; + + offset = CLUSTER_OFFSET(offset); + while ((pages-- > 0) && (offset < filesize)) { + page_cache_read(file, offset); + offset += PAGE_CACHE_SIZE; } - return page_cache; + + return; } /* @@ -547,8 +595,8 @@ void ___wait_on_page(struct page *page) add_wait_queue(&page->wait, &wait); do { - tsk->state = TASK_UNINTERRUPTIBLE; run_task_queue(&tq_disk); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (!PageLocked(page)) break; schedule(); @@ -562,23 +610,8 @@ void ___wait_on_page(struct page *page) */ void lock_page(struct page *page) { - if (TryLockPage(page)) { - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, current); - - run_task_queue(&tq_disk); - add_wait_queue(&page->wait, &wait); - tsk->state = TASK_UNINTERRUPTIBLE; - - while (TryLockPage(page)) { - run_task_queue(&tq_disk); - schedule(); - tsk->state = TASK_UNINTERRUPTIBLE; - } - - remove_wait_queue(&page->wait, &wait); - tsk->state = TASK_RUNNING; - } + while (TryLockPage(page)) + ___wait_on_page(page); } @@ -607,13 +640,14 @@ repeat: struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); + run_task_queue(&tq_disk); + + __set_task_state(tsk, TASK_UNINTERRUPTIBLE); add_wait_queue(&page->wait, &wait); - tsk->state = TASK_UNINTERRUPTIBLE; - run_task_queue(&tq_disk); if (PageLocked(page)) schedule(); - tsk->state = TASK_RUNNING; + __set_task_state(tsk, TASK_RUNNING); remove_wait_queue(&page->wait, &wait); /* @@ -656,13 +690,14 @@ repeat: struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); + run_task_queue(&tq_disk); + + __set_task_state(tsk, TASK_UNINTERRUPTIBLE); add_wait_queue(&page->wait, &wait); - tsk->state = TASK_UNINTERRUPTIBLE; - run_task_queue(&tq_disk); if (PageLocked(page)) schedule(); - tsk->state = TASK_RUNNING; + __set_task_state(tsk, TASK_RUNNING); remove_wait_queue(&page->wait, &wait); /* @@ -811,9 +846,9 @@ static inline int get_max_readahead(struct inode * inode) return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)]; } -static inline unsigned long generic_file_readahead(int reada_ok, +static void generic_file_readahead(int reada_ok, struct file * filp, struct inode * inode, - unsigned long ppos, struct page * page, unsigned long page_cache) + unsigned long ppos, struct page * page) { unsigned long max_ahead, ahead; unsigned long raend; @@ -877,8 +912,7 @@ static inline unsigned long generic_file_readahead(int reada_ok, ahead = 0; while (ahead < max_ahead) { ahead += PAGE_CACHE_SIZE; - page_cache = try_to_read_ahead(filp, raend + ahead, - page_cache); + page_cache_read(filp, raend + ahead); } /* * If we tried to read ahead some pages, @@ -910,26 +944,9 @@ static inline unsigned long generic_file_readahead(int reada_ok, #endif } - return page_cache; + return; } -/* - * "descriptor" for what we're up to with a read. - * This allows us to use the same read code yet - * have multiple different users of the data that - * we read from a file. - * - * The simplest case just copies the data to user - * mode. - */ -typedef struct { - size_t written; - size_t count; - char * buf; - int error; -} read_descriptor_t; - -typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long); /* * This is a generic file read routine, and uses the @@ -939,7 +956,7 @@ typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long); * This is really ugly. 
But the goto's actually try to clarify some * of the logic when it comes to error handling etc. */ -static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor) +void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor) { struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; @@ -1044,7 +1061,8 @@ page_ok: * Ok, the page was not immediately readable, so let's try to read ahead while we're at it.. */ page_not_up_to_date: - page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache); + generic_file_readahead(reada_ok, filp, inode, + pos & PAGE_CACHE_MASK, page); if (Page_Uptodate(page)) goto page_ok; @@ -1065,7 +1083,8 @@ readpage: goto page_ok; /* Again, try some read-ahead while waiting for the page to finish.. */ - page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache); + generic_file_readahead(reada_ok, filp, inode, + pos & PAGE_CACHE_MASK, page); wait_on_page(page); if (Page_Uptodate(page)) goto page_ok; @@ -1267,31 +1286,36 @@ out: } /* - * Semantics for shared and private memory areas are different past the end - * of the file. A shared mapping past the last page of the file is an error - * and results in a SIGBUS, while a private mapping just maps in a zero page. + * filemap_nopage() is invoked via the vma operations vector for a + * mapped memory region to read in file data during a page fault. * * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. * - * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page - * ahead of the wait if we're sure to need it. + * XXX - at some point, this should return unique values to indicate to + * the caller whether this is EIO, OOM, or SIGBUS. */ -static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share) +static unsigned long filemap_nopage(struct vm_area_struct * area, + unsigned long address, int no_share) { struct file * file = area->vm_file; struct dentry * dentry = file->f_dentry; struct inode * inode = dentry->d_inode; - unsigned long offset, reada, i; struct page * page, **hash; - unsigned long old_page, new_page; - int error; + unsigned long old_page; + + unsigned long offset = address - area->vm_start + area->vm_offset; - new_page = 0; - offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset; - if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm) - goto no_page; + /* + * Semantics for shared and private memory areas are different + * past the end of the file. A shared mapping past the last page + * of the file is an error and results in a SIGBUS, while a + * private mapping just maps in a zero page. + */ + if ((offset >= inode->i_size) && + (area->vm_flags & VM_SHARED) && (area->vm_mm == current->mm)) + return 0; /* * Do we have something in the page cache already? @@ -1302,24 +1326,12 @@ retry_find: if (!page) goto no_cached_page; -found_page: /* * Ok, found a page in the page cache, now we need to check - * that it's up-to-date. First check whether we'll need an - * extra page -- better to overlap the allocation with the I/O. + * that it's up-to-date. 
*/ - if (no_share && !new_page) { - new_page = page_cache_alloc(); - if (!new_page) - goto failure; - } - - if (!Page_Uptodate(page)) { - lock_page(page); - if (!Page_Uptodate(page)) - goto page_not_uptodate; - UnlockPage(page); - } + if (!Page_Uptodate(page)) + goto page_not_uptodate; success: /* @@ -1327,100 +1339,76 @@ success: * and possibly copy it over to another page.. */ old_page = page_address(page); - if (!no_share) { - /* - * Ok, we can share the cached page directly.. Get rid - * of any potential extra pages. - */ - if (new_page) - page_cache_free(new_page); + if (no_share) { + unsigned long new_page = page_cache_alloc(); - flush_page_to_ram(old_page); - return old_page; + if (new_page) { + copy_page(new_page, old_page); + flush_page_to_ram(new_page); + } + page_cache_release(page); + return new_page; } - - /* - * No sharing ... copy to the new page. - */ - copy_page(new_page, old_page); - flush_page_to_ram(new_page); - page_cache_release(page); - return new_page; + + flush_page_to_ram(old_page); + return old_page; no_cached_page: /* - * Try to read in an entire cluster at once. - */ - reada = offset; - reada >>= PAGE_CACHE_SHIFT + page_cluster; - reada <<= PAGE_CACHE_SHIFT + page_cluster; - - for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE) - new_page = try_to_read_ahead(file, reada, new_page); - - if (!new_page) - new_page = page_cache_alloc(); - if (!new_page) - goto no_page; - - /* - * During getting the above page we might have slept, - * so we need to re-check the situation with the page - * cache.. The page we just got may be useful if we - * can't share, so don't get rid of it here. - */ - page = __find_get_page(inode, offset, hash); - if (page) - goto found_page; - - /* - * Now, create a new page-cache page from the page we got + * If the requested offset is within our file, try to read a whole + * cluster of pages at once. + * + * Otherwise, we're off the end of a privately mapped file, + * so we need to map a zero page. */ - page = page_cache_entry(new_page); - if (add_to_page_cache_unique(page, inode, offset, hash)) - goto retry_find; + if (offset < inode->i_size) + read_cluster_nonblocking(file, offset); + else + page_cache_read(file, offset); /* - * Now it's ours and locked, we can do initial IO to it: + * The page we want has now been added to the page cache. + * In the unlikely event that someone removed it in the + * meantime, we'll just come back here and read it again. */ - new_page = 0; + goto retry_find; page_not_uptodate: - error = inode->i_op->readpage(file, page); + lock_page(page); + if (Page_Uptodate(page)) { + UnlockPage(page); + goto success; + } - if (!error) { + if (!inode->i_op->readpage(file, page)) { wait_on_page(page); - if (PageError(page)) - goto page_read_error; - goto success; + if (Page_Uptodate(page)) + goto success; } -page_read_error: /* * Umm, take care of errors if the page isn't up-to-date. * Try to re-read it _once_. We do this synchronously, * because there really aren't any performance issues here * and we need to check for errors. */ - if (!PageLocked(page)) - PAGE_BUG(page); - ClearPageError(page); - error = inode->i_op->readpage(file, page); - if (error) - goto failure; - wait_on_page(page); - if (Page_Uptodate(page)) + lock_page(page); + if (Page_Uptodate(page)) { + UnlockPage(page); goto success; + } + ClearPageError(page); + if (!inode->i_op->readpage(file, page)) { + wait_on_page(page); + if (Page_Uptodate(page)) + goto success; + } /* * Things didn't work out. 
Return zero to tell the * mm layer so, possibly freeing the page cache page first. */ -failure: page_cache_release(page); - if (new_page) - page_cache_free(new_page); -no_page: return 0; } @@ -1702,7 +1690,7 @@ static int msync_interval(struct vm_area_struct * vma, return 0; } -asmlinkage int sys_msync(unsigned long start, size_t len, int flags) +asmlinkage long sys_msync(unsigned long start, size_t len, int flags) { unsigned long end; struct vm_area_struct * vma; @@ -1855,28 +1843,29 @@ repeat_find: if (!PageLocked(page)) { PAGE_BUG(page); } else { - if (page->owner != (int)current) { + if (page->owner != current) { PAGE_BUG(page); } } status = write_one_page(file, page, offset, bytes, buf); + if (status >= 0) { + written += status; + count -= status; + pos += status; + buf += status; + if (pos > inode->i_size) + inode->i_size = pos; + } /* Mark it unlocked again and drop the page.. */ UnlockPage(page); page_cache_release(page); if (status < 0) break; - - written += status; - count -= status; - pos += status; - buf += status; } *ppos = pos; - if (pos > inode->i_size) - inode->i_size = pos; if (page_cache) page_cache_free(page_cache); diff --git a/mm/memory.c b/mm/memory.c index a31e862b2..5498dbcf0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -31,6 +31,9 @@ /* * 05.04.94 - Multi-page memory management added for v1.1. * Idea by Alex Bligh (alex@cconcepts.co.uk) + * + * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG + * (Gerhard.Wichert@pdb.siemens.de) */ #include <linux/mm.h> @@ -39,6 +42,8 @@ #include <linux/pagemap.h> #include <linux/smp_lock.h> #include <linux/swapctl.h> +#include <linux/iobuf.h> +#include <linux/bigmem.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -55,10 +60,10 @@ void * high_memory = NULL; static inline void copy_cow_page(unsigned long from, unsigned long to) { if (from == ZERO_PAGE(to)) { - clear_page(to); + clear_bigpage(to); return; } - copy_page(to, from); + copy_bigpage(to, from); } mem_map_t * mem_map = NULL; @@ -142,39 +147,6 @@ void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr) check_pgt_cache(); } -/* - * This function just free's the page directory - the - * pages tables themselves have been freed earlier by - * clear_page_tables(). - */ -void free_page_tables(struct mm_struct * mm) -{ - pgd_t * page_dir = mm->pgd; - - if (page_dir) { - if (page_dir == swapper_pg_dir) - goto out_bad; - pgd_free(page_dir); - } - return; - -out_bad: - printk(KERN_ERR - "free_page_tables: Trying to free kernel pgd\n"); - return; -} - -int new_page_tables(struct task_struct * tsk) -{ - pgd_t * new_pg; - - if (!(new_pg = pgd_alloc())) - return -ENOMEM; - SET_PAGE_DIR(tsk, new_pg); - tsk->mm->pgd = new_pg; - return 0; -} - #define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t)) #define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) @@ -417,6 +389,192 @@ void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long s } } + +/* + * Do a quick page-table lookup for a single page. + */ +static unsigned long follow_page(unsigned long address) +{ + pgd_t *pgd; + pmd_t *pmd; + + pgd = pgd_offset(current->mm, address); + pmd = pmd_offset(pgd, address); + if (pmd) { + pte_t * pte = pte_offset(pmd, address); + if (pte && pte_present(*pte)) { + return pte_page(*pte); + } + } + + printk(KERN_ERR "Missing page in follow_page\n"); + return 0; +} + +/* + * Given a physical address, is there a useful struct page pointing to it? 
+ */ + +static struct page * get_page_map(unsigned long page) +{ + struct page *map; + + if (MAP_NR(page) >= max_mapnr) + return 0; + if (page == ZERO_PAGE(page)) + return 0; + map = mem_map + MAP_NR(page); + if (PageReserved(map)) + return 0; + return map; +} + +/* + * Force in an entire range of pages from the current process's user VA, + * and pin and lock the pages for IO. + */ + +#define dprintk(x...) +int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len) +{ + unsigned long ptr, end; + int err; + struct mm_struct * mm; + struct vm_area_struct * vma = 0; + unsigned long page; + struct page * map; + int doublepage = 0; + int repeat = 0; + int i; + + /* Make sure the iobuf is not already mapped somewhere. */ + if (iobuf->nr_pages) + return -EINVAL; + + mm = current->mm; + dprintk ("map_user_kiobuf: begin\n"); + + ptr = va & PAGE_MASK; + end = (va + len + PAGE_SIZE - 1) & PAGE_MASK; + err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT); + if (err) + return err; + + repeat: + down(&mm->mmap_sem); + + err = -EFAULT; + iobuf->locked = 1; + iobuf->offset = va & ~PAGE_MASK; + iobuf->length = len; + + i = 0; + + /* + * First of all, try to fault in all of the necessary pages + */ + while (ptr < end) { + if (!vma || ptr >= vma->vm_end) { + vma = find_vma(current->mm, ptr); + if (!vma) + goto out_unlock; + } + if (handle_mm_fault(current, vma, ptr, (rw==READ)) <= 0) + goto out_unlock; + spin_lock(&mm->page_table_lock); + page = follow_page(ptr); + if (!page) { + dprintk (KERN_ERR "Missing page in map_user_kiobuf\n"); + map = NULL; + goto retry; + } + map = get_page_map(page); + if (map) { + if (TryLockPage(map)) { + goto retry; + } + atomic_inc(&map->count); + } + spin_unlock(&mm->page_table_lock); + dprintk ("Installing page %p %p: %d\n", (void *)page, map, i); + iobuf->pagelist[i] = page; + iobuf->maplist[i] = map; + iobuf->nr_pages = ++i; + + ptr += PAGE_SIZE; + } + + up(&mm->mmap_sem); + dprintk ("map_user_kiobuf: end OK\n"); + return 0; + + out_unlock: + up(&mm->mmap_sem); + unmap_kiobuf(iobuf); + dprintk ("map_user_kiobuf: end %d\n", err); + return err; + + retry: + + /* + * Undo the locking so far, wait on the page we got to, and try again. + */ + spin_unlock(&mm->page_table_lock); + unmap_kiobuf(iobuf); + up(&mm->mmap_sem); + + /* + * Did the release also unlock the page we got stuck on? + */ + if (map) { + if (!PageLocked(map)) { + /* If so, we may well have the page mapped twice + * in the IO address range. Bad news. Of + * course, it _might_ * just be a coincidence, + * but if it happens more than * once, chances + * are we have a double-mapped page. */ + if (++doublepage >= 3) { + return -EINVAL; + } + } + + /* + * Try again... + */ + wait_on_page(map); + } + + if (++repeat < 16) { + ptr = va & PAGE_MASK; + goto repeat; + } + return -EAGAIN; +} + + +/* + * Unmap all of the pages referenced by a kiobuf. We release the pages, + * and unlock them if they were locked. + */ + +void unmap_kiobuf (struct kiobuf *iobuf) +{ + int i; + struct page *map; + + for (i = 0; i < iobuf->nr_pages; i++) { + map = iobuf->maplist[i]; + + if (map && iobuf->locked) { + __free_page(map); + UnlockPage(map); + } + } + + iobuf->nr_pages = 0; + iobuf->locked = 0; +} + static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pgprot_t prot) { @@ -655,7 +813,7 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, * Ok, we need to copy. Oh, well.. 
*/ spin_unlock(&tsk->mm->page_table_lock); - new_page = __get_free_page(GFP_USER); + new_page = __get_free_page(GFP_BIGUSER); if (!new_page) return -1; spin_lock(&tsk->mm->page_table_lock); @@ -667,7 +825,6 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, if (PageReserved(page)) ++vma->vm_mm->rss; copy_cow_page(old_page,new_page); - flush_page_to_ram(old_page); flush_page_to_ram(new_page); flush_cache_page(vma, address); set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); @@ -681,6 +838,7 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, return 1; bad_wp_page: + spin_unlock(&tsk->mm->page_table_lock); printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page); return -1; } @@ -781,7 +939,7 @@ out_unlock: * because it doesn't cost us any seek time. We also make sure to queue * the 'original' request together with the readahead ones... */ -static void swapin_readahead(unsigned long entry) +void swapin_readahead(unsigned long entry) { int i; struct page *new_page; @@ -833,12 +991,17 @@ static int do_swap_page(struct task_struct * tsk, vma->vm_mm->rss++; tsk->min_flt++; + lock_kernel(); swap_free(entry); + unlock_kernel(); pte = mk_pte(page_address(page), vma->vm_page_prot); + set_bit(PG_swap_entry, &page->flags); if (write_access && !is_page_shared(page)) { delete_from_swap_cache(page); + page = replace_with_bigmem(page); + pte = mk_pte(page_address(page), vma->vm_page_prot); pte = pte_mkwrite(pte_mkdirty(pte)); } set_pte(page_table, pte); @@ -854,10 +1017,10 @@ static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * v { pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); if (write_access) { - unsigned long page = __get_free_page(GFP_USER); + unsigned long page = __get_free_page(GFP_BIGUSER); if (!page) return -1; - clear_page(page); + clear_bigpage(page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); vma->vm_mm->rss++; tsk->min_flt++; @@ -898,6 +1061,8 @@ static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access); if (!page) return 0; /* SIGBUS - but we _really_ should know whether it is OOM or SIGBUS */ + if (page == -1) + return -1; /* OOM */ ++tsk->maj_flt; ++vma->vm_mm->rss; diff --git a/mm/mlock.c b/mm/mlock.c index d6b19cfb1..be5e07cbf 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -130,7 +130,7 @@ static int do_mlock(unsigned long start, size_t len, int on) struct vm_area_struct * vma, * next; int error; - if (!capable(CAP_IPC_LOCK)) + if (on && !capable(CAP_IPC_LOCK)) return -EPERM; len = (len + ~PAGE_MASK) & PAGE_MASK; end = start + len; @@ -172,7 +172,7 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } -asmlinkage int sys_mlock(unsigned long start, size_t len) +asmlinkage long sys_mlock(unsigned long start, size_t len) { unsigned long locked; unsigned long lock_limit; @@ -203,7 +203,7 @@ out: return error; } -asmlinkage int sys_munlock(unsigned long start, size_t len) +asmlinkage long sys_munlock(unsigned long start, size_t len) { int ret; @@ -244,7 +244,7 @@ static int do_mlockall(int flags) return error; } -asmlinkage int sys_mlockall(int flags) +asmlinkage long sys_mlockall(int flags) { unsigned long lock_limit; int ret = -EINVAL; @@ -271,7 +271,7 @@ out: return ret; } -asmlinkage int sys_munlockall(void) +asmlinkage long sys_munlockall(void) { int ret; @@ -275,7 
+275,7 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, vma->vm_ops = NULL; vma->vm_offset = off; vma->vm_file = NULL; - vma->vm_pte = 0; + vma->vm_private_data = NULL; /* Clear old maps */ error = -ENOMEM; @@ -547,7 +547,7 @@ static struct vm_area_struct * unmap_fixup(struct vm_area_struct *area, mpnt->vm_ops = area->vm_ops; mpnt->vm_offset = area->vm_offset + (end - area->vm_start); mpnt->vm_file = area->vm_file; - mpnt->vm_pte = area->vm_pte; + mpnt->vm_private_data = area->vm_private_data; if (mpnt->vm_file) get_file(mpnt->vm_file); if (mpnt->vm_ops && mpnt->vm_ops->open) @@ -707,7 +707,7 @@ int do_munmap(unsigned long addr, size_t len) return 0; } -asmlinkage int sys_munmap(unsigned long addr, size_t len) +asmlinkage long sys_munmap(unsigned long addr, size_t len) { int ret; @@ -778,7 +778,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) vma->vm_ops = NULL; vma->vm_offset = 0; vma->vm_file = NULL; - vma->vm_pte = 0; + vma->vm_private_data = NULL; /* * merge_segments may merge our vma, so we can't refer to it @@ -813,6 +813,7 @@ void exit_mmap(struct mm_struct * mm) { struct vm_area_struct * mpnt; + release_segments(mm); mpnt = mm->mmap; mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL; mm->rss = 0; @@ -919,7 +920,7 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l /* To share, we must have the same file, operations.. */ if ((mpnt->vm_file != prev->vm_file)|| - (mpnt->vm_pte != prev->vm_pte) || + (mpnt->vm_private_data != prev->vm_private_data) || (mpnt->vm_ops != prev->vm_ops) || (mpnt->vm_flags != prev->vm_flags) || (prev->vm_end != mpnt->vm_start)) diff --git a/mm/mprotect.c b/mm/mprotect.c index b1504af83..61ef3116d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -194,7 +194,7 @@ static int mprotect_fixup(struct vm_area_struct * vma, return 0; } -asmlinkage int sys_mprotect(unsigned long start, size_t len, unsigned long prot) +asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot) { unsigned long nstart, end, tmp; struct vm_area_struct * vma, * next; diff --git a/mm/mremap.c b/mm/mremap.c index 2852f9b06..95f4b4f90 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -118,7 +118,7 @@ oops_we_failed: flush_cache_range(mm, new_addr, new_addr + len); while ((offset += PAGE_SIZE) < len) move_one_page(mm, new_addr + offset, old_addr + offset); - zap_page_range(mm, new_addr, new_addr + len); + zap_page_range(mm, new_addr, len); flush_tlb_range(mm, new_addr, new_addr + len); return -1; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 22ce7ac00..b62783c72 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3,6 +3,7 @@ * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 */ #include <linux/config.h> @@ -13,6 +14,7 @@ #include <linux/interrupt.h> #include <linux/init.h> #include <linux/pagemap.h> +#include <linux/bigmem.h> /* export bigmem vars */ #include <asm/dma.h> #include <asm/uaccess.h> /* for copy_to/from_user */ @@ -20,6 +22,8 @@ int nr_swap_pages = 0; int nr_free_pages = 0; +int nr_lru_pages; +LIST_HEAD(lru_cache); /* * Free area management @@ -45,7 +49,12 @@ struct free_area_struct { #define memory_head(x) ((struct page *)(x)) +#ifdef CONFIG_BIGMEM +#define BIGMEM_LISTS_OFFSET NR_MEM_LISTS +static struct free_area_struct free_area[NR_MEM_LISTS*2]; +#else static struct free_area_struct free_area[NR_MEM_LISTS]; +#endif static inline void 
init_mem_queue(struct free_area_struct * head) { @@ -101,6 +110,12 @@ static inline void free_pages_ok(unsigned long map_nr, unsigned long order) #define list(x) (mem_map+(x)) +#ifdef CONFIG_BIGMEM + if (map_nr >= bigmem_mapnr) { + area += BIGMEM_LISTS_OFFSET; + nr_free_bigpages -= mask; + } +#endif map_nr &= mask; nr_free_pages -= mask; while (mask + (1 << (NR_MEM_LISTS-1))) { @@ -127,7 +142,6 @@ int __free_page(struct page *page) if (PageLocked(page)) PAGE_BUG(page); - page->flags &= ~(1 << PG_referenced); free_pages_ok(page - mem_map, 0); return 1; } @@ -145,7 +159,6 @@ int free_pages(unsigned long addr, unsigned long order) PAGE_BUG(map); if (PageLocked(map)) PAGE_BUG(map); - map->flags &= ~(1 << PG_referenced); free_pages_ok(map_nr, order); return 1; } @@ -160,6 +173,29 @@ int free_pages(unsigned long addr, unsigned long order) change_bit((index) >> (1+(order)), (area)->map) #define CAN_DMA(x) (PageDMA(x)) #define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT)) + +#ifdef CONFIG_BIGMEM +#define RMQUEUEBIG(order, gfp_mask) \ +if (gfp_mask & __GFP_BIGMEM) { \ + struct free_area_struct * area = free_area+order+BIGMEM_LISTS_OFFSET; \ + unsigned long new_order = order; \ + do { struct page *prev = memory_head(area), *ret = prev->next; \ + if (memory_head(area) != ret) { \ + unsigned long map_nr; \ + (prev->next = ret->next)->prev = prev; \ + map_nr = ret - mem_map; \ + MARK_USED(map_nr, new_order, area); \ + nr_free_pages -= 1 << order; \ + nr_free_bigpages -= 1 << order; \ + EXPAND(ret, map_nr, order, new_order, area); \ + spin_unlock_irqrestore(&page_alloc_lock, flags); \ + return ADDRESS(map_nr); \ + } \ + new_order++; area++; \ + } while (new_order < NR_MEM_LISTS); \ +} +#endif + #define RMQUEUE(order, gfp_mask) \ do { struct free_area_struct * area = free_area+order; \ unsigned long new_order = order; \ @@ -194,8 +230,6 @@ do { unsigned long size = 1 << high; \ set_page_count(map, 1); \ } while (0) -int low_on_memory = 0; - unsigned long __get_free_pages(int gfp_mask, unsigned long order) { unsigned long flags; @@ -221,7 +255,9 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order) */ if (!(current->flags & PF_MEMALLOC)) { int freed; + static int low_on_memory = 0; +#ifndef CONFIG_BIGMEM if (nr_free_pages > freepages.min) { if (!low_on_memory) goto ok_to_allocate; @@ -232,6 +268,32 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order) } low_on_memory = 1; +#else + static int low_on_bigmemory = 0; + + if (gfp_mask & __GFP_BIGMEM) + { + if (nr_free_pages > freepages.min) { + if (!low_on_bigmemory) + goto ok_to_allocate; + if (nr_free_pages >= freepages.high) { + low_on_bigmemory = 0; + goto ok_to_allocate; + } + } + low_on_bigmemory = 1; + } else { + if (nr_free_pages-nr_free_bigpages > freepages.min) { + if (!low_on_memory) + goto ok_to_allocate; + if (nr_free_pages-nr_free_bigpages >= freepages.high) { + low_on_memory = 0; + goto ok_to_allocate; + } + } + low_on_memory = 1; + } +#endif current->flags |= PF_MEMALLOC; freed = try_to_free_pages(gfp_mask); current->flags &= ~PF_MEMALLOC; @@ -241,6 +303,9 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order) } ok_to_allocate: spin_lock_irqsave(&page_alloc_lock, flags); +#ifdef CONFIG_BIGMEM + RMQUEUEBIG(order, gfp_mask); +#endif RMQUEUE(order, gfp_mask); spin_unlock_irqrestore(&page_alloc_lock, flags); @@ -268,9 +333,12 @@ void show_free_areas(void) unsigned long order, flags; unsigned long total = 0; - printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10)); - printk("Free: %d (%d %d 
%d)\n", + printk("Free pages: %6dkB (%6dkB BigMem)\n ( ", + nr_free_pages<<(PAGE_SHIFT-10), + nr_free_bigpages<<(PAGE_SHIFT-10)); + printk("Free: %d, lru_cache: %d (%d %d %d)\n", nr_free_pages, + nr_lru_pages, freepages.min, freepages.low, freepages.high); @@ -281,6 +349,13 @@ void show_free_areas(void) for (tmp = free_area[order].next ; tmp != memory_head(free_area+order) ; tmp = tmp->next) { nr ++; } +#ifdef CONFIG_BIGMEM + for (tmp = free_area[BIGMEM_LISTS_OFFSET+order].next; + tmp != memory_head(free_area+BIGMEM_LISTS_OFFSET+order); + tmp = tmp->next) { + nr ++; + } +#endif total += nr * ((PAGE_SIZE>>10) << order); printk("%lu*%lukB ", nr, (unsigned long)((PAGE_SIZE>>10) << order)); } @@ -334,6 +409,9 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m for (i = 0 ; i < NR_MEM_LISTS ; i++) { unsigned long bitmap_size; init_mem_queue(free_area+i); +#ifdef CONFIG_BIGMEM + init_mem_queue(free_area+BIGMEM_LISTS_OFFSET+i); +#endif mask += mask; end_mem = (end_mem + ~mask) & mask; bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i); @@ -342,6 +420,11 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m free_area[i].map = (unsigned int *) start_mem; memset((void *) start_mem, 0, bitmap_size); start_mem += bitmap_size; +#ifdef CONFIG_BIGMEM + free_area[BIGMEM_LISTS_OFFSET+i].map = (unsigned int *) start_mem; + memset((void *) start_mem, 0, bitmap_size); + start_mem += bitmap_size; +#endif } return start_mem; } diff --git a/mm/page_io.c b/mm/page_io.c index 0f7e6d199..72e8cb95a 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -18,8 +18,6 @@ #include <asm/pgtable.h> -static DECLARE_WAIT_QUEUE_HEAD(lock_queue); - /* * Reads or writes a swap page. * wait=1: start I/O and wait for completion. wait=0: start asynchronous I/O. @@ -35,7 +33,7 @@ static DECLARE_WAIT_QUEUE_HEAD(lock_queue); * that shared pages stay shared while being swapped. */ -static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, int wait, int dolock) +static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, int wait) { unsigned long type, offset; struct swap_info_struct * p; @@ -90,7 +88,6 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in } else kstat.pswpout++; - get_page(page); if (p->swap_device) { zones[0] = offset; zones_used = 1; @@ -99,58 +96,26 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in } else if (p->swap_file) { struct inode *swapf = p->swap_file->d_inode; int i; - if (swapf->i_op->get_block == NULL - && swapf->i_op->smap != NULL){ - /* - With MS-DOS, we use msdos_smap which returns - a sector number (not a cluster or block number). - It is a patch to enable the UMSDOS project. - Other people are working on better solution. - - It sounds like ll_rw_swap_file defined - its operation size (sector size) based on - PAGE_SIZE and the number of blocks to read. - So using get_block or smap should work even if - smap will require more blocks. 
- */ - int j; - unsigned int block = offset << 3; - - for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){ - if (!(zones[i] = swapf->i_op->smap(swapf,block++))) { - printk("rw_swap_page: bad swap file\n"); - return; - } + int j; + unsigned int block = offset + << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits); + + block_size = swapf->i_sb->s_blocksize; + for (i=0, j=0; j< PAGE_SIZE ; i++, j += block_size) + if (!(zones[i] = bmap(swapf,block++))) { + printk("rw_swap_page: bad swap file\n"); + return; } - block_size = 512; - }else{ - int j; - unsigned int block = offset - << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits); - - block_size = swapf->i_sb->s_blocksize; - for (i=0, j=0; j< PAGE_SIZE ; i++, j += block_size) - if (!(zones[i] = bmap(swapf,block++))) { - printk("rw_swap_page: bad swap file\n"); - return; - } - zones_used = i; - dev = swapf->i_dev; - } + zones_used = i; + dev = swapf->i_dev; } else { printk(KERN_ERR "rw_swap_page: no swap file or device\n"); - put_page(page); return; } if (!wait) { set_bit(PG_decr_after, &page->flags); atomic_inc(&nr_async_pages); } - if (dolock) { - set_bit(PG_free_swap_after, &page->flags); - p->swap_map[offset]++; - } - set_bit(PG_free_after, &page->flags); /* block_size == PAGE_SIZE/zones_used */ brw_page(rw, page, dev, zones, block_size, 0); @@ -192,29 +157,10 @@ void rw_swap_page(int rw, struct page *page, int wait) PAGE_BUG(page); if (page->inode != &swapper_inode) PAGE_BUG(page); - rw_swap_page_base(rw, entry, page, wait, 1); -} - -/* - * Setting up a new swap file needs a simple wrapper just to read the - * swap signature. SysV shared memory also needs a simple wrapper. - */ -void rw_swap_page_nocache(int rw, unsigned long entry, char *buf) -{ - struct page *page = mem_map + MAP_NR(buf); - - if (TryLockPage(page)) - PAGE_BUG(page); - if (PageSwapCache(page)) - PAGE_BUG(page); - if (page->inode) - PAGE_BUG(page); - page->offset = entry; - rw_swap_page_base(rw, entry, page, 1, 1); + rw_swap_page_base(rw, entry, page, wait); } /* - * shmfs needs a version that doesn't put the page in the page cache! * The swap lock map insists that pages be in the page cache! * Therefore we can't use it. Later when we can remove the need for the * lock map and we can reduce the number of functions exported. @@ -227,5 +173,5 @@ void rw_swap_page_nolock(int rw, unsigned long entry, char *buf, int wait) PAGE_BUG(page); if (PageSwapCache(page)) PAGE_BUG(page); - rw_swap_page_base(rw, entry, page, wait, 0); + rw_swap_page_base(rw, entry, page, wait); } @@ -3,6 +3,8 @@ * Written by Mark Hemment, 1996/97. * (markhe@nextd.demon.co.uk) * + * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli + * * 11 April '97. Started multi-threading - markhe * The global cache-chain is protected by the semaphore 'cache_chain_sem'. * The sem is only needed when accessing/extending the cache-chain, which @@ -100,16 +102,10 @@ * is less than 512 (PAGE_SIZE<<3), but greater than 256. */ -#include <linux/mm.h> +#include <linux/config.h> #include <linux/slab.h> #include <linux/interrupt.h> -#include <linux/config.h> #include <linux/init.h> -#include <linux/smp.h> - -#include <asm/system.h> -#include <asm/atomic.h> -#include <asm/spinlock.h> #ifdef __mips__ #include <asm/pgtable.h> #include <asm/addrspace.h> @@ -989,6 +985,58 @@ opps: return cachep; } +/* + * This check if the kmem_cache_t pointer is chained in the cache_cache + * list. -arca + */ +static int is_chained_kmem_cache(kmem_cache_t * cachep) +{ + kmem_cache_t * searchp; + int ret = 0; + + /* Find the cache in the chain of caches. 
*/ + down(&cache_chain_sem); + for (searchp = &cache_cache; searchp->c_nextp != &cache_cache; + searchp = searchp->c_nextp) { + if (searchp->c_nextp != cachep) + continue; + + /* Accessing clock_searchp is safe - we hold the mutex. */ + if (cachep == clock_searchp) + clock_searchp = cachep->c_nextp; + ret = 1; + break; + } + up(&cache_chain_sem); + + return ret; +} + +/* returns 0 if every slab is been freed -arca */ +static int __kmem_cache_shrink(kmem_cache_t *cachep) +{ + kmem_slab_t *slabp; + int ret; + + spin_lock_irq(&cachep->c_spinlock); + + /* If the cache is growing, stop shrinking. */ + while (!cachep->c_growing) { + slabp = cachep->c_lastp; + if (slabp->s_inuse || slabp == kmem_slab_end(cachep)) + break; + kmem_slab_unlink(slabp); + spin_unlock_irq(&cachep->c_spinlock); + kmem_slab_destroy(cachep, slabp); + spin_lock_irq(&cachep->c_spinlock); + } + ret = 1; + if (cachep->c_lastp == kmem_slab_end(cachep)) + ret = 0; /* Cache is empty. */ + spin_unlock_irq(&cachep->c_spinlock); + return ret; +} + /* Shrink a cache. Releases as many slabs as possible for a cache. * It is expected this function will be called by a module when it is * unloaded. The cache is _not_ removed, this creates too many problems and @@ -1000,10 +1048,6 @@ opps: int kmem_cache_shrink(kmem_cache_t *cachep) { - kmem_cache_t *searchp; - kmem_slab_t *slabp; - int ret; - if (!cachep) { printk(KERN_ERR "kmem_shrink: NULL ptr\n"); return 2; @@ -1013,43 +1057,73 @@ kmem_cache_shrink(kmem_cache_t *cachep) return 2; } + if (!is_chained_kmem_cache(cachep)) { + printk(KERN_ERR "kmem_shrink: Invalid cache addr %p\n", + cachep); + return 2; + } + + return __kmem_cache_shrink(cachep); +} + +/* + * Remove a kmem_cache_t object from the slab cache. When returns 0 it + * completed succesfully. -arca + */ +int kmem_cache_destroy(kmem_cache_t * cachep) +{ + kmem_cache_t * prev; + int ret; + + if (!cachep) { + printk(KERN_ERR "kmem_destroy: NULL ptr\n"); + return 1; + } + if (in_interrupt()) { + printk(KERN_ERR "kmem_destroy: Called during int - %s\n", + cachep->c_name); + return 1; + } + + ret = 0; /* Find the cache in the chain of caches. */ - down(&cache_chain_sem); /* Semaphore is needed. */ - searchp = &cache_cache; - for (;searchp->c_nextp != &cache_cache; searchp = searchp->c_nextp) { - if (searchp->c_nextp != cachep) + down(&cache_chain_sem); + for (prev = &cache_cache; prev->c_nextp != &cache_cache; + prev = prev->c_nextp) { + if (prev->c_nextp != cachep) continue; /* Accessing clock_searchp is safe - we hold the mutex. */ if (cachep == clock_searchp) clock_searchp = cachep->c_nextp; - goto found; + + /* remove the cachep from the cache_cache list. -arca */ + prev->c_nextp = cachep->c_nextp; + + ret = 1; + break; } up(&cache_chain_sem); - printk(KERN_ERR "kmem_shrink: Invalid cache addr %p\n", cachep); - return 2; -found: - /* Release the semaphore before getting the cache-lock. This could - * mean multiple engines are shrinking the cache, but so what. - */ - up(&cache_chain_sem); - spin_lock_irq(&cachep->c_spinlock); - /* If the cache is growing, stop shrinking. */ - while (!cachep->c_growing) { - slabp = cachep->c_lastp; - if (slabp->s_inuse || slabp == kmem_slab_end(cachep)) - break; - kmem_slab_unlink(slabp); - spin_unlock_irq(&cachep->c_spinlock); - kmem_slab_destroy(cachep, slabp); - spin_lock_irq(&cachep->c_spinlock); + if (!ret) { + printk(KERN_ERR "kmem_destroy: Invalid cache addr %p\n", + cachep); + return 1; } - ret = 1; - if (cachep->c_lastp == kmem_slab_end(cachep)) - ret--; /* Cache is empty. 
*/ - spin_unlock_irq(&cachep->c_spinlock); - return ret; + + if (__kmem_cache_shrink(cachep)) { + printk(KERN_ERR "kmem_destroy: Can't free all objects %p\n", + cachep); + down(&cache_chain_sem); + cachep->c_nextp = cache_cache.c_nextp; + cache_cache.c_nextp = cachep; + up(&cache_chain_sem); + return 1; + } + + kmem_cache_free(&cache_cache, cachep); + + return 0; } /* Get the memory for a slab management obj. */ @@ -1587,7 +1661,7 @@ bad_slab: #if 1 /* FORCE A KERNEL DUMP WHEN THIS HAPPENS. SPEAK IN ALL CAPS. GET THE CALL CHAIN. */ -*(int *) 0 = 0; + BUG(); #endif return; diff --git a/mm/swap_state.c b/mm/swap_state.c index 2aa17d3a4..5cfc686dd 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -214,8 +214,6 @@ static inline void remove_from_swap_cache(struct page *page) page_address(page), page_count(page)); #endif PageClearSwapCache(page); - if (inode->i_op->flushpage) - inode->i_op->flushpage(inode, page, 0); remove_inode_page(page); } @@ -239,6 +237,15 @@ void __delete_from_swap_cache(struct page *page) swap_free (entry); } +static void delete_from_swap_cache_nolock(struct page *page) +{ + if (!swapper_inode.i_op->flushpage || + swapper_inode.i_op->flushpage(&swapper_inode, page, 0)) + lru_cache_del(page); + + __delete_from_swap_cache(page); +} + /* * This must be called only on pages that have * been verified to be in the swap cache. @@ -247,7 +254,7 @@ void delete_from_swap_cache(struct page *page) { lock_page(page); - __delete_from_swap_cache(page); + delete_from_swap_cache_nolock(page); UnlockPage(page); page_cache_release(page); @@ -267,13 +274,13 @@ void free_page_and_swap_cache(unsigned long addr) */ lock_page(page); if (PageSwapCache(page) && !is_page_shared(page)) { - long entry = page->offset; - remove_from_swap_cache(page); - swap_free(entry); + delete_from_swap_cache_nolock(page); page_cache_release(page); } UnlockPage(page); + clear_bit(PG_swap_entry, &page->flags); + __free_page(page); } diff --git a/mm/swapfile.c b/mm/swapfile.c index ce18f34f5..c4ce5377d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -46,16 +46,36 @@ static inline int scan_swap_map(struct swap_info_struct *si) } } si->cluster_nr = SWAPFILE_CLUSTER; + + /* try to find an empty (even not aligned) cluster. */ + offset = si->lowest_bit; + check_next_cluster: + if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit) + { + int nr; + for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++) + if (si->swap_map[nr]) + { + offset = nr+1; + goto check_next_cluster; + } + /* We found a completly empty cluster, so start + * using it. + */ + goto got_page; + } + /* No luck, so now go finegrined as usual. 
-Andrea */ for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { if (si->swap_map[offset]) continue; - si->lowest_bit = offset; -got_page: - si->swap_map[offset] = 1; - nr_swap_pages--; + got_page: + if (offset == si->lowest_bit) + si->lowest_bit++; if (offset == si->highest_bit) si->highest_bit--; - si->cluster_next = offset; + si->swap_map[offset] = 1; + nr_swap_pages--; + si->cluster_next = offset+1; return offset; } return 0; @@ -81,12 +101,9 @@ unsigned long get_swap_page(void) entry = SWP_ENTRY(type,offset); type = swap_info[type].next; if (type < 0 || - p->prio != swap_info[type].prio) - { + p->prio != swap_info[type].prio) { swap_list.next = swap_list.head; - } - else - { + } else { swap_list.next = type; } return entry; @@ -126,15 +143,16 @@ void swap_free(unsigned long entry) offset = SWP_OFFSET(entry); if (offset >= p->max) goto bad_offset; - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) - p->highest_bit = offset; if (!p->swap_map[offset]) goto bad_free; if (p->swap_map[offset] < SWAP_MAP_MAX) { - if (!--p->swap_map[offset]) + if (!--p->swap_map[offset]) { + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; nr_swap_pages++; + } } #ifdef DEBUG_SWAP printk("DebugVM: swap_free(entry %08lx, count now %d)\n", @@ -157,6 +175,44 @@ bad_free: goto out; } +/* needs the big kernel lock */ +unsigned long acquire_swap_entry(struct page *page) +{ + struct swap_info_struct * p; + unsigned long offset, type; + unsigned long entry; + + if (!test_bit(PG_swap_entry, &page->flags)) + goto new_swap_entry; + + /* We have the old entry in the page offset still */ + entry = page->offset; + if (!entry) + goto new_swap_entry; + type = SWP_TYPE(entry); + if (type & SHM_SWP_TYPE) + goto new_swap_entry; + if (type >= nr_swapfiles) + goto new_swap_entry; + p = type + swap_info; + if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK) + goto new_swap_entry; + offset = SWP_OFFSET(entry); + if (offset >= p->max) + goto new_swap_entry; + /* Has it been re-used for something else? */ + if (p->swap_map[offset]) + goto new_swap_entry; + + /* We're cool, we can just use the old one */ + p->swap_map[offset] = 1; + nr_swap_pages--; + return entry; + +new_swap_entry: + return get_swap_page(); +} + /* * The swap entry has been read in advance, and we return 1 to indicate * that the page has been used or is no longer needed. @@ -266,7 +322,7 @@ static void unuse_process(struct mm_struct * mm, unsigned long entry, /* * Go through process' page directory. 
*/ - if (!mm || mm == &init_mm) + if (!mm) return; for (vma = mm->mmap; vma; vma = vma->vm_next) { pgd_t * pgd = pgd_offset(mm, vma->vm_start); @@ -340,7 +396,7 @@ static int try_to_unuse(unsigned int type) return 0; } -asmlinkage int sys_swapoff(const char * specialfile) +asmlinkage long sys_swapoff(const char * specialfile) { struct swap_info_struct * p = NULL; struct dentry * dentry; @@ -484,7 +540,7 @@ int is_swap_partition(kdev_t dev) { * * The swapon system call */ -asmlinkage int sys_swapon(const char * specialfile, int swap_flags) +asmlinkage long sys_swapon(const char * specialfile, int swap_flags) { struct swap_info_struct * p; struct dentry * swap_dentry; @@ -495,7 +551,6 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) static int least_priority = 0; union swap_header *swap_header = 0; int swap_header_version; - int lock_map_size = PAGE_SIZE; int nr_good_pages = 0; unsigned long maxpages; int swapfilesize; @@ -661,8 +716,9 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) else p->swap_map[page] = SWAP_MAP_BAD; } - nr_good_pages = swap_header->info.last_page - i; - lock_map_size = (p->max + 7) / 8; + nr_good_pages = swap_header->info.last_page - + swap_header->info.nr_badpages - + 1 /* header page */; if (error) goto bad_swap; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a4eeb1dc5..9bd4142c3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2,6 +2,7 @@ * linux/mm/vmalloc.c * * Copyright (C) 1993 Linus Torvalds + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 */ #include <linux/malloc.h> @@ -94,7 +95,7 @@ static inline int alloc_area_pte(pte_t * pte, unsigned long address, unsigned lo unsigned long page; if (!pte_none(*pte)) printk("alloc_area_pte: page already exists\n"); - page = __get_free_page(GFP_KERNEL); + page = __get_free_page(GFP_KERNEL|GFP_BIGMEM); if (!page) return -ENOMEM; set_pte(pte, mk_pte(page, prot)); diff --git a/mm/vmscan.c b/mm/vmscan.c index 1ae052b94..8ee000fc0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -17,6 +17,7 @@ #include <linux/smp_lock.h> #include <linux/pagemap.h> #include <linux/init.h> +#include <linux/bigmem.h> #include <asm/pgtable.h> @@ -31,8 +32,7 @@ * using a process that no longer actually exists (it might * have died while we slept). */ -static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma, - unsigned long address, pte_t * page_table, int gfp_mask) +static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask) { pte_t pte; unsigned long entry; @@ -47,15 +47,12 @@ static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma, goto out_failed; page = mem_map + MAP_NR(page_addr); - spin_lock(&tsk->mm->page_table_lock); + spin_lock(&vma->vm_mm->page_table_lock); if (pte_val(pte) != pte_val(*page_table)) goto out_failed_unlock; - /* - * Dont be too eager to get aging right if - * memory is dangerously low. - */ - if (!low_on_memory && pte_young(pte)) { + /* Don't look at this pte if it's been accessed recently. */ + if (pte_young(pte)) { /* * Transfer the "accessed" bit from the page * tables to the global page map. 
@@ -67,7 +64,8 @@ static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
 
 	if (PageReserved(page)
 	    || PageLocked(page)
-	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page)))
+	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page))
+	    || (!(gfp_mask & __GFP_BIGMEM) && PageBIGMEM(page)))
 		goto out_failed_unlock;
 
 	/*
@@ -136,15 +134,16 @@ drop_pte:
 	 */
 	flush_cache_page(vma, address);
 	if (vma->vm_ops && vma->vm_ops->swapout) {
-		pid_t pid = tsk->pid;
+		int error;
 		pte_clear(page_table);
-		spin_unlock(&tsk->mm->page_table_lock);
+		spin_unlock(&vma->vm_mm->page_table_lock);
 		flush_tlb_page(vma, address);
 		vma->vm_mm->rss--;
-
-		if (vma->vm_ops->swapout(vma, page))
-			kill_proc(pid, SIGBUS, 1);
-		goto out_free_success;
+		error = vma->vm_ops->swapout(vma, page);
+		if (!error)
+			goto out_free_success;
+		__free_page(page);
+		return error;
 	}
 
 	/*
@@ -153,14 +152,16 @@ drop_pte:
 	 * we have the swap cache set up to associate the
 	 * page with that swap entry.
 	 */
-	entry = get_swap_page();
+	entry = acquire_swap_entry(page);
 	if (!entry)
-		goto out_failed; /* No swap space left */
+		goto out_failed_unlock; /* No swap space left */
 
+	if (!(page = prepare_bigmem_swapout(page)))
+		goto out_swap_free_unlock;
+
 	vma->vm_mm->rss--;
-	tsk->nswap++;
 	set_pte(page_table, __pte(entry));
-	spin_unlock(&tsk->mm->page_table_lock);
+	spin_unlock(&vma->vm_mm->page_table_lock);
 	flush_tlb_page(vma, address);
 
 	swap_duplicate(entry);	/* One for the process, one for the swap cache */
@@ -175,9 +176,14 @@ out_free_success:
 	__free_page(page);
 	return 1;
 out_failed_unlock:
-	spin_unlock(&tsk->mm->page_table_lock);
+	spin_unlock(&vma->vm_mm->page_table_lock);
 out_failed:
 	return 0;
+out_swap_free_unlock:
+	swap_free(entry);
+	spin_unlock(&vma->vm_mm->page_table_lock);
+	return 0;
+
 }
 
 /*
@@ -194,8 +200,7 @@ out_failed:
  * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
  */
 
-static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 {
 	pte_t * pte;
 	unsigned long pmd_end;
@@ -216,8 +221,8 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct *
 	do {
 		int result;
 
-		tsk->mm->swap_address = address + PAGE_SIZE;
-		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
+		vma->vm_mm->swap_address = address + PAGE_SIZE;
+		result = try_to_swap_out(vma, address, pte, gfp_mask);
 		if (result)
 			return result;
 		address += PAGE_SIZE;
@@ -226,8 +231,7 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct *
 	return 0;
 }
 
-static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 {
 	pmd_t * pmd;
 	unsigned long pgd_end;
@@ -247,7 +251,7 @@ static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct *
 		end = pgd_end;
 
 	do {
-		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
+		int result = swap_out_pmd(vma, pmd, address, end, gfp_mask);
 		if (result)
 			return result;
 		address = (address + PMD_SIZE) & PMD_MASK;
@@ -256,8 +260,7 @@ static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct *
 	return 0;
 }
 
-static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
-	unsigned long address, int gfp_mask)
+static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int gfp_mask)
 {
 	pgd_t *pgdir;
 	unsigned long end;
@@ -266,11 +269,11 @@ static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
 	if (vma->vm_flags & VM_LOCKED)
 		return 0;
 
-	pgdir = pgd_offset(tsk->mm, address);
+	pgdir = pgd_offset(vma->vm_mm, address);
 
 	end = vma->vm_end;
 	while (address < end) {
-		int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
+		int result = swap_out_pgd(vma, pgdir, address, end, gfp_mask);
 		if (result)
 			return result;
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
@@ -279,7 +282,7 @@ static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
 	return 0;
 }
 
-static int swap_out_process(struct task_struct * p, int gfp_mask)
+static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
 {
 	unsigned long address;
 	struct vm_area_struct* vma;
@@ -287,18 +290,18 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
 	/*
 	 * Go through process' page directory.
 	 */
-	address = p->mm->swap_address;
+	address = mm->swap_address;
 
 	/*
 	 * Find the proper vm-area
 	 */
-	vma = find_vma(p->mm, address);
+	vma = find_vma(mm, address);
 	if (vma) {
 		if (address < vma->vm_start)
 			address = vma->vm_start;
 
 		for (;;) {
-			int result = swap_out_vma(p, vma, address, gfp_mask);
+			int result = swap_out_vma(vma, address, gfp_mask);
 			if (result)
 				return result;
 			vma = vma->vm_next;
@@ -309,8 +312,8 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
 	}
 
 	/* We didn't find anything for the process */
-	p->mm->swap_cnt = 0;
-	p->mm->swap_address = 0;
+	mm->swap_cnt = 0;
+	mm->swap_address = 0;
 	return 0;
 }
 
@@ -321,9 +324,11 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
  */
 static int swap_out(unsigned int priority, int gfp_mask)
 {
-	struct task_struct * p, * pbest;
-	int counter, assign, max_cnt;
+	struct task_struct * p;
+	int counter;
+	int __ret = 0;
 
+	lock_kernel();
 	/*
 	 * We make one or two passes through the task list, indexed by
 	 * assign = {0, 1}:
@@ -338,46 +343,61 @@ static int swap_out(unsigned int priority, int gfp_mask)
 	 * Think of swap_cnt as a "shadow rss" - it tells us which process
 	 * we want to page out (always try largest first).
 	 */
-	counter = nr_tasks / (priority+1);
+	counter = nr_threads / (priority+1);
 	if (counter < 1)
 		counter = 1;
-	if (counter > nr_tasks)
-		counter = nr_tasks;
+	if (counter > nr_threads)
+		counter = nr_threads;
 
 	for (; counter >= 0; counter--) {
-		assign = 0;
-		max_cnt = 0;
-		pbest = NULL;
+		int assign = 0;
+		int max_cnt = 0;
+		struct mm_struct *best = NULL;
+		int pid = 0;
 	select:
 		read_lock(&tasklist_lock);
 		p = init_task.next_task;
 		for (; p != &init_task; p = p->next_task) {
-			if (!p->swappable)
+			struct mm_struct *mm = p->mm;
+			if (!p->swappable || !mm)
 				continue;
-			if (p->mm->rss <= 0)
+			if (mm->rss <= 0)
 				continue;
 			/* Refresh swap_cnt? */
 			if (assign)
-				p->mm->swap_cnt = p->mm->rss;
-			if (p->mm->swap_cnt > max_cnt) {
-				max_cnt = p->mm->swap_cnt;
-				pbest = p;
+				mm->swap_cnt = mm->rss;
+			if (mm->swap_cnt > max_cnt) {
+				max_cnt = mm->swap_cnt;
+				best = mm;
+				pid = p->pid;
 			}
 		}
 		read_unlock(&tasklist_lock);
-		if (!pbest) {
+		if (!best) {
 			if (!assign) {
 				assign = 1;
 				goto select;
 			}
 			goto out;
-		}
+		} else {
+			int ret;
+
+			atomic_inc(&best->mm_count);
+			ret = swap_out_mm(best, gfp_mask);
+			mmdrop(best);
+
+			if (!ret)
+				continue;
 
-		if (swap_out_process(pbest, gfp_mask))
-			return 1;
+			if (ret < 0)
+				kill_proc(pid, SIGBUS, 1);
+			__ret = 1;
+			goto out;
+		}
 	}
 out:
-	return 0;
+	unlock_kernel();
+	return __ret;
 }
 
 /*
@@ -394,8 +414,6 @@ static int do_try_to_free_pages(unsigned int gfp_mask)
 	int priority;
 	int count = SWAP_CLUSTER_MAX;
 
-	lock_kernel();
-
 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);
 
@@ -423,32 +441,10 @@ static int do_try_to_free_pages(unsigned int gfp_mask)
 		shrink_dcache_memory(priority, gfp_mask);
 	} while (--priority >= 0);
 done:
-	unlock_kernel();
 
 	return priority >= 0;
 }
 
-/*
- * Before we start the kernel thread, print out the
- * kswapd initialization message (otherwise the init message
- * may be printed in the middle of another driver's init
- * message). It looks very bad when that happens.
- */
-void __init kswapd_setup(void)
-{
-	int i;
-	char *revision="$Revision: 1.5 $", *s, *e;
-
-	swap_setup();
-
-	if ((s = strchr(revision, ':')) &&
-	    (e = strchr(s, '$')))
-		s++, i = e - s;
-	else
-		s = revision, i = -1;
-	printk ("Starting kswapd v%.*s\n", i, s);
-}
-
 static struct task_struct *kswapd_process;
 
 /*
@@ -499,7 +495,9 @@ int kswapd(void *unused)
 		 * up on a more timely basis.
 		 */
 		do {
-			if (nr_free_pages >= freepages.high)
+			/* kswapd is critical to provide GFP_ATOMIC
+			   allocations (not GFP_BIGMEM ones). */
+			if (nr_free_pages - nr_free_bigpages >= freepages.high)
 				break;
 
 			if (!do_try_to_free_pages(GFP_KSWAPD))
@@ -535,4 +533,13 @@ int try_to_free_pages(unsigned int gfp_mask)
 		retval = do_try_to_free_pages(gfp_mask);
 	return retval;
 }
-
+
+static int __init kswapd_init(void)
+{
+	printk("Starting kswapd v1.6\n");
+	swap_setup();
+	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	return 0;
+}
+
+module_init(kswapd_init)
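
Illustration: the swapfile.c hunks above add acquire_swap_entry(), which prefers to hand a page back the exact swap slot it held before (cached via PG_swap_entry and page->offset) and only falls back to get_swap_page() when that slot is gone or has been reused. The fragment below is a minimal user-space sketch of that reuse policy, not kernel code: struct toy_page, toy_swap_map, alloc_any_slot and acquire_slot are invented stand-ins for the real page flags, swap_map and allocator, and the kernel version additionally validates the swap device's type and SWP_WRITEOK flags before trusting the cached entry, as the hunk shows.

/* Toy model of the "reuse the old swap slot" policy sketched above.
 * All names here are hypothetical stand-ins, not kernel structures. */
#include <stdio.h>

#define NSLOTS 8

static unsigned char toy_swap_map[NSLOTS];	/* 0 = slot free, >0 = in use */

struct toy_page {
	int has_old_slot;	/* plays the role of the PG_swap_entry bit */
	int old_slot;		/* plays the role of the cached page->offset */
};

/* Fallback: linear scan for any free slot (stand-in for get_swap_page()). */
static int alloc_any_slot(void)
{
	for (int i = 0; i < NSLOTS; i++) {
		if (!toy_swap_map[i]) {
			toy_swap_map[i] = 1;
			return i;
		}
	}
	return -1;	/* no swap space left */
}

/* Prefer the slot the page used last time, if it is still free. */
static int acquire_slot(struct toy_page *page)
{
	if (page->has_old_slot &&
	    page->old_slot >= 0 && page->old_slot < NSLOTS &&
	    !toy_swap_map[page->old_slot]) {
		toy_swap_map[page->old_slot] = 1;
		return page->old_slot;	/* reuse keeps the page's swap location stable */
	}
	return alloc_any_slot();
}

int main(void)
{
	struct toy_page p = { .has_old_slot = 1, .old_slot = 5 };

	printf("first swap-out  -> slot %d\n", acquire_slot(&p));	/* reuses slot 5 */
	toy_swap_map[5] = 0;	/* slot freed ... */
	toy_swap_map[5] = 1;	/* ... then taken by someone else */
	printf("second swap-out -> slot %d\n", acquire_slot(&p));	/* falls back to a new slot */
	return 0;
}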