From 51d3b7814cdccef9188240fe0cbd8d97ff2c7470 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 22 Jun 1999 23:05:57 +0000 Subject: Merge with Linux 2.3.7. WARNING: 2.3.7 is known to eat filesystems for breakfast and little children for lunch, so if you try this on your machine make backups first ... --- mm/filemap.c | 959 +++++++++++++++++++++++++++++++++++++++----------------- mm/memory.c | 26 +- mm/mmap.c | 6 +- mm/page_alloc.c | 26 +- mm/page_io.c | 53 ++-- mm/swap_state.c | 106 ++++--- mm/swapfile.c | 2 +- mm/vmscan.c | 2 +- 8 files changed, 797 insertions(+), 383 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 455f334f3..4e885758f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1,7 +1,7 @@ /* * linux/mm/filemap.c * - * Copyright (C) 1994, 1995 Linus Torvalds + * Copyright (C) 1994-1999 Linus Torvalds */ /* @@ -29,9 +29,12 @@ * though. * * Shared mappings now work. 15.8.1995 Bruno. + * + * finished 'unifying' the page and buffer cache and SMP-threaded the + * page-cache, 21.05.1999, Ingo Molnar */ -unsigned long page_cache_size = 0; +atomic_t page_cache_size = ATOMIC_INIT(0); struct page * page_hash_table[PAGE_HASH_SIZE]; /* @@ -50,38 +53,97 @@ static struct pio_request *pio_first = NULL, **pio_last = &pio_first; static kmem_cache_t *pio_request_cache; static DECLARE_WAIT_QUEUE_HEAD(pio_wait); +spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED; + + static inline void make_pio_request(struct file *, unsigned long, unsigned long); +void __add_page_to_hash_queue(struct page * page, struct page **p){ + atomic_inc(&page_cache_size); + if((page->next_hash = *p) != NULL) + (*p)->pprev_hash = &page->next_hash; + *p = page; + page->pprev_hash = p; + if (page->buffers) + PAGE_BUG(page); +} + +static void remove_page_from_hash_queue(struct page * page) +{ + if(page->pprev_hash) { + if(page->next_hash) + page->next_hash->pprev_hash = page->pprev_hash; + *page->pprev_hash = page->next_hash; + page->pprev_hash = NULL; + } + atomic_dec(&page_cache_size); +} + +static void remove_page_from_inode_queue(struct page * page) +{ + struct inode * inode = page->inode; + struct page *prev, *next; + + inode->i_nrpages--; + next = page->next; + prev = page->prev; + if (inode->i_pages == page) + inode->i_pages = next; + if (next) + next->prev = prev; + if (prev) + prev->next = next; + page->next = NULL; + page->prev = NULL; +} /* - * Invalidate the pages of an inode, removing all pages that aren't - * locked down (those are sure to be up-to-date anyway, so we shouldn't - * invalidate them). + * Remove a page from the page cache and free it. Caller has to make + * sure the page is locked and that nobody else uses it - or that usage + * is safe. */ +void remove_inode_page(struct page *page) +{ + if (!PageLocked(page)) + PAGE_BUG(page); + + spin_lock(&pagecache_lock); + remove_page_from_inode_queue(page); + remove_page_from_hash_queue(page); + page->inode = NULL; + spin_unlock(&pagecache_lock); +} + void invalidate_inode_pages(struct inode * inode) { struct page ** p; struct page * page; +repeat: + spin_lock(&pagecache_lock); p = &inode->i_pages; while ((page = *p) != NULL) { - if (PageLocked(page)) { - p = &page->next; - continue; + get_page(page); + if (TryLockPage(page)) { + spin_unlock(&pagecache_lock); + wait_on_page(page); + page_cache_release(page); + goto repeat; } - inode->i_nrpages--; - if ((*p = page->next) != NULL) - (*p)->prev = page->prev; - page->next = NULL; - page->prev = NULL; + if (page_count(page) != 2) + printk("hm, busy page invalidated? 
(not necesserily a bug)\n"); + + remove_page_from_inode_queue(page); remove_page_from_hash_queue(page); page->inode = NULL; + UnlockPage(page); + page_cache_release(page); page_cache_release(page); - continue; + } + spin_unlock(&pagecache_lock); } - /* * Truncate the page cache at a set offset, removing the pages * that are beyond that offset (and zeroing out partial pages). @@ -90,55 +152,90 @@ void truncate_inode_pages(struct inode * inode, unsigned long start) { struct page ** p; struct page * page; + int partial = 0; repeat: + spin_lock(&pagecache_lock); p = &inode->i_pages; while ((page = *p) != NULL) { unsigned long offset = page->offset; /* page wholly truncated - free it */ if (offset >= start) { - if (PageLocked(page)) { - wait_on_page(page); - goto repeat; - } - inode->i_nrpages--; - if ((*p = page->next) != NULL) - (*p)->prev = page->prev; - page->next = NULL; - page->prev = NULL; - remove_page_from_hash_queue(page); - page->inode = NULL; + get_page(page); + spin_unlock(&pagecache_lock); + + lock_page(page); + + if (inode->i_op->flushpage) + inode->i_op->flushpage(inode, page, 0); + + /* + * We remove the page from the page cache + * _after_ we have destroyed all buffer-cache + * references to it. Otherwise some other process + * might think this inode page is not in the + * page cache and creates a buffer-cache alias + * to it causing all sorts of fun problems ... + */ + remove_inode_page(page); + + UnlockPage(page); page_cache_release(page); - continue; + page_cache_release(page); + + /* + * We have done things without the pagecache lock, + * so we'll have to repeat the scan. + * It's not possible to deadlock here because + * we are guaranteed to make progress. (ie. we have + * just removed a page) + */ + goto repeat; } p = &page->next; + /* + * there is only one partial page possible. + */ + if (partial) + continue; + offset = start - offset; /* partial truncate, clear end of page */ if (offset < PAGE_CACHE_SIZE) { - unsigned long address = page_address(page); + unsigned long address; + get_page(page); + spin_unlock(&pagecache_lock); + + lock_page(page); + partial = 1; + + address = page_address(page); memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset); flush_page_to_ram(address); + + if (inode->i_op->flushpage) + inode->i_op->flushpage(inode, page, offset); + /* + * we have dropped the spinlock so we have to + * restart. + */ + UnlockPage(page); + page_cache_release(page); + goto repeat; } } + spin_unlock(&pagecache_lock); } -/* - * Remove a page from the page cache and free it. - */ -void remove_inode_page(struct page *page) -{ - remove_page_from_hash_queue(page); - remove_page_from_inode_queue(page); - page_cache_release(page); -} +extern atomic_t too_many_dirty_buffers; int shrink_mmap(int priority, int gfp_mask) { static unsigned long clock = 0; unsigned long limit = num_physpages; struct page * page; - int count; + int count, users; count = limit >> priority; @@ -164,15 +261,67 @@ int shrink_mmap(int priority, int gfp_mask) referenced = test_and_clear_bit(PG_referenced, &page->flags); - if (PageLocked(page)) + if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) continue; - if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) + /* + * Some common cases that we just short-circuit without + * getting the locks - we need to re-check this once we + * have the lock, but that's fine. + */ + users = page_count(page); + if (!users) + continue; + if (!page->buffers) { + if (!page->inode) + continue; + if (users > 1) + continue; + } + + /* + * ok, now the page looks interesting. 
Re-check things + * and keep the lock. + */ + spin_lock(&pagecache_lock); + if (!page->inode && !page->buffers) { + spin_unlock(&pagecache_lock); continue; + } + if (!page_count(page)) { +// BUG(); + spin_unlock(&pagecache_lock); + continue; + } + get_page(page); + if (TryLockPage(page)) { + spin_unlock(&pagecache_lock); + goto put_continue; + } + + /* + * we keep pagecache_lock locked and unlock it in + * each branch, so that the page->inode case doesnt + * have to re-grab it. Here comes the 'real' logic + * to free memory: + */ + + /* Is it a buffer page? */ + if (page->buffers) { + kdev_t dev = page->buffers->b_dev; + spin_unlock(&pagecache_lock); + if (try_to_free_buffers(page)) + goto made_progress; + if (!atomic_read(&too_many_dirty_buffers)) { + atomic_set(&too_many_dirty_buffers, 1); + balance_dirty(dev); + } + goto unlock_continue; + } /* We can't free pages unless there's just one user */ - if (atomic_read(&page->count) != 1) - continue; + if (page_count(page) != 2) + goto spin_unlock_continue; count--; @@ -182,77 +331,180 @@ int shrink_mmap(int priority, int gfp_mask) * were to be marked referenced.. */ if (PageSwapCache(page)) { - if (referenced && swap_count(page->offset) != 1) - continue; - delete_from_swap_cache(page); - return 1; + spin_unlock(&pagecache_lock); + if (referenced && swap_count(page->offset) != 2) + goto unlock_continue; + __delete_from_swap_cache(page); + page_cache_release(page); + goto made_progress; } - if (referenced) - continue; - - /* Is it a buffer page? */ - if (page->buffers) { - if (buffer_under_min()) - continue; - if (!try_to_free_buffers(page)) - continue; - return 1; - } - /* is it a page-cache page? */ - if (page->inode) { - if (pgcache_under_min()) - continue; - remove_inode_page(page); - return 1; - } + if (!referenced && page->inode && !pgcache_under_min()) { + remove_page_from_inode_queue(page); + remove_page_from_hash_queue(page); + page->inode = NULL; + spin_unlock(&pagecache_lock); + page_cache_release(page); + goto made_progress; + } +spin_unlock_continue: + spin_unlock(&pagecache_lock); +unlock_continue: + UnlockPage(page); +put_continue: + put_page(page); } while (count > 0); return 0; +made_progress: + UnlockPage(page); + put_page(page); + return 1; +} + +static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page) +{ + goto inside; + + for (;;) { + page = page->next_hash; +inside: + if (!page) + goto not_found; + if (page->inode != inode) + continue; + if (page->offset == offset) + break; + } +not_found: + return page; } /* - * Update a page cache copy, when we're doing a "write()" system call - * See also "update_vm_cache()". + * By the time this is called, the page is locked and + * we don't have to worry about any races any more. + * + * Start the IO.. 
*/ -void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count) +static int writeout_one_page(struct page *page) { - unsigned long offset, len; + struct buffer_head *bh, *head = page->buffers; - offset = (pos & ~PAGE_CACHE_MASK); - pos = pos & PAGE_CACHE_MASK; - len = PAGE_CACHE_SIZE - offset; + bh = head; do { - struct page * page; + if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh)) + continue; - if (len > count) - len = count; - page = find_page(inode, pos); - if (page) { - wait_on_page(page); - memcpy((void *) (offset + page_address(page)), buf, len); - page_cache_release(page); - } - count -= len; - buf += len; - len = PAGE_CACHE_SIZE; - offset = 0; - pos += PAGE_CACHE_SIZE; - } while (count); + bh->b_flushtime = 0; + ll_rw_block(WRITE, 1, &bh); + } while ((bh = bh->b_this_page) != head); + return 0; +} + +static int waitfor_one_page(struct page *page) +{ + int error = 0; + struct buffer_head *bh, *head = page->buffers; + + bh = head; + do { + wait_on_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + error = -EIO; + } while ((bh = bh->b_this_page) != head); + return error; +} + +static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *)) +{ + struct page *next; + int retval = 0; + + start &= PAGE_MASK; + + spin_lock(&pagecache_lock); + next = inode->i_pages; + while (next) { + struct page *page = next; + next = page->next; + if (!page->buffers) + continue; + if (page->offset >= end) + continue; + if (page->offset < start) + continue; + + get_page(page); + spin_unlock(&pagecache_lock); + lock_page(page); + + /* The buffers could have been free'd while we waited for the page lock */ + if (page->buffers) + retval |= fn(page); + + UnlockPage(page); + spin_lock(&pagecache_lock); + next = page->next; + page_cache_release(page); + } + spin_unlock(&pagecache_lock); + + return retval; +} + +/* + * Two-stage data sync: first start the IO, then go back and + * collect the information.. + */ +int generic_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end) +{ + int retval; + + retval = do_buffer_fdatasync(inode, start, end, writeout_one_page); + retval |= do_buffer_fdatasync(inode, start, end, waitfor_one_page); + return retval; } -static inline void add_to_page_cache(struct page * page, +/* + * This adds a page to the page cache, starting out as locked, + * owned by us, referenced, but not uptodate and with no errors. + */ +static inline void __add_to_page_cache(struct page * page, struct inode * inode, unsigned long offset, struct page **hash) { - atomic_inc(&page->count); - page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced); + unsigned long flags; + + flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error)); + page->flags = flags | ((1 << PG_locked) | (1 << PG_referenced)); + page->owner = (int)current; /* REMOVEME */ + get_page(page); page->offset = offset; add_page_to_inode_queue(inode, page); __add_page_to_hash_queue(page, hash); } +int add_to_page_cache_unique(struct page * page, + struct inode * inode, unsigned long offset, + struct page **hash) +{ + int err; + struct page *alias; + + spin_lock(&pagecache_lock); + alias = __find_page_nolock(inode, offset, *hash); + + err = 1; + if (!alias) { + __add_to_page_cache(page,inode,offset,hash); + err = 0; + } + + spin_unlock(&pagecache_lock); + return err; +} + /* * Try to read ahead in the file. 
"page_cache" is a potentially free page * that we could use for the cache (if it is 0 we can try to create one, @@ -275,45 +527,173 @@ static unsigned long try_to_read_ahead(struct file * file, if (offset >= inode->i_size) break; hash = page_hash(inode, offset); - page = __find_page(inode, offset, *hash); - if (!page) { + page = page_cache_entry(page_cache); + if (!add_to_page_cache_unique(page, inode, offset, hash)) { /* - * Ok, add the new page to the hash-queues... + * We do not have to check the return value here + * because it's a readahead. */ - page = page_cache_entry(page_cache); - add_to_page_cache(page, inode, offset, hash); inode->i_op->readpage(file, page); page_cache = 0; + page_cache_release(page); } - page_cache_release(page); } return page_cache; } /* - * Wait for IO to complete on a locked page. + * Wait for a page to get unlocked. * * This must be called with the caller "holding" the page, * ie with increased "page->count" so that the page won't * go away during the wait.. */ -void __wait_on_page(struct page *page) +void ___wait_on_page(struct page *page) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); add_wait_queue(&page->wait, &wait); -repeat: - tsk->state = TASK_UNINTERRUPTIBLE; - run_task_queue(&tq_disk); - if (PageLocked(page)) { + do { + tsk->state = TASK_UNINTERRUPTIBLE; + run_task_queue(&tq_disk); + if (!PageLocked(page)) + break; schedule(); - goto repeat; - } + } while (PageLocked(page)); tsk->state = TASK_RUNNING; remove_wait_queue(&page->wait, &wait); } +/* + * Get an exclusive lock on the page.. + */ +void lock_page(struct page *page) +{ + if (TryLockPage(page)) { + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, current); + + run_task_queue(&tq_disk); + add_wait_queue(&page->wait, &wait); + tsk->state = TASK_UNINTERRUPTIBLE; + + while (TryLockPage(page)) { + run_task_queue(&tq_disk); + schedule(); + tsk->state = TASK_UNINTERRUPTIBLE; + } + + remove_wait_queue(&page->wait, &wait); + tsk->state = TASK_RUNNING; + } +} + + +/* + * a rather lightweight function, finding and getting a reference to a + * hashed page atomically, waiting for it if it's locked. + */ +struct page * __find_get_page (struct inode * inode, + unsigned long offset, struct page **hash) +{ + struct page *page; + + /* + * We scan the hash list read-only. Addition to and removal from + * the hash-list needs a held write-lock. + */ +repeat: + spin_lock(&pagecache_lock); + page = __find_page_nolock(inode, offset, *hash); + if (page) + get_page(page); + spin_unlock(&pagecache_lock); + + /* Found the page, sleep if locked. */ + if (page && PageLocked(page)) { + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue(&page->wait, &wait); + tsk->state = TASK_UNINTERRUPTIBLE; + + run_task_queue(&tq_disk); + if (PageLocked(page)) + schedule(); + tsk->state = TASK_RUNNING; + remove_wait_queue(&page->wait, &wait); + + /* + * The page might have been unhashed meanwhile. It's + * not freed though because we hold a reference to it. + * If this is the case then it will be freed _here_, + * and we recheck the hash anyway. + */ + page_cache_release(page); + goto repeat; + } + /* + * It's not locked so we can return the page and we hold + * a reference to it. + */ + return page; +} + +/* + * Get the lock to a page atomically. + */ +struct page * __find_lock_page (struct inode * inode, + unsigned long offset, struct page **hash) +{ + int locked; + struct page *page; + + /* + * We scan the hash list read-only. 
Addition to and removal from + * the hash-list needs a held write-lock. + */ +repeat: + spin_lock(&pagecache_lock); + page = __find_page_nolock(inode, offset, *hash); + locked = 0; + if (page) { + get_page(page); + if (TryLockPage(page)) + locked = 1; + } + spin_unlock(&pagecache_lock); + + /* Found the page, sleep if locked. */ + if (page && locked) { + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue(&page->wait, &wait); + tsk->state = TASK_UNINTERRUPTIBLE; + + run_task_queue(&tq_disk); + if (PageLocked(page)) + schedule(); + tsk->state = TASK_RUNNING; + remove_wait_queue(&page->wait, &wait); + + /* + * The page might have been unhashed meanwhile. It's + * not freed though because we hold a reference to it. + * If this is the case then it will be freed _here_, + * and we recheck the hash anyway. + */ + page_cache_release(page); + goto repeat; + } + /* + * It's not locked so we can return the page and we hold + * a reference to it. + */ + return page; +} + #if 0 #define PROFILE_READAHEAD #define DEBUG_READAHEAD @@ -386,14 +766,14 @@ static void profile_readahead(int async, struct file *filp) * ------------------- * The read ahead context fields of the "struct file" are the following: * - f_raend : position of the first byte after the last page we tried to - * read ahead. + * read ahead. * - f_ramax : current read-ahead maximum size. * - f_ralen : length of the current IO read block we tried to read-ahead. * - f_rawin : length of the current read-ahead window. - * if last read-ahead was synchronous then - * f_rawin = f_ralen - * otherwise (was asynchronous) - * f_rawin = previous value of f_ralen + f_ralen + * if last read-ahead was synchronous then + * f_rawin = f_ralen + * otherwise (was asynchronous) + * f_rawin = previous value of f_ralen + f_ralen * * Read-ahead limits: * ------------------ @@ -485,7 +865,7 @@ static inline unsigned long generic_file_readahead(int reada_ok, * We will later force unplug device in order to force asynchronous read IO. */ else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE && - ppos <= raend && ppos + filp->f_ralen >= raend) { + ppos <= raend && ppos + filp->f_ralen >= raend) { /* * Add ONE page to max_ahead in order to try to have about the same IO max size * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE. @@ -578,6 +958,7 @@ static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descript struct inode *inode = dentry->d_inode; size_t pos, pgpos, page_cache; int reada_ok; + int error; int max_readahead = get_max_readahead(inode); page_cache = 0; @@ -633,33 +1014,22 @@ static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descript * Try to find the data in the page cache.. */ hash = page_hash(inode, pos & PAGE_CACHE_MASK); - page = __find_page(inode, pos & PAGE_CACHE_MASK, *hash); + + spin_lock(&pagecache_lock); + page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash); if (!page) goto no_cached_page; - found_page: -/* - * Try to read ahead only if the current page is filled or being filled. - * Otherwise, if we were reading ahead, decrease max read ahead size to - * the minimum value. - * In this context, that seems to may happen only on some read error or if - * the page has been rewritten. 
- */ - if (PageUptodate(page) || PageLocked(page)) - page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache); - else if (reada_ok && filp->f_ramax > MIN_READAHEAD) - filp->f_ramax = MIN_READAHEAD; - - wait_on_page(page); - - if (!PageUptodate(page)) - goto page_read_error; + get_page(page); + spin_unlock(&pagecache_lock); -success: - /* - * Ok, we have the page, it's up-to-date and ok, - * so now we can finally copy it to user space... - */ + if (!Page_Uptodate(page)) + goto page_not_up_to_date; +page_ok: + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + */ { unsigned long offset, nr; @@ -683,75 +1053,77 @@ success: break; } +/* + * Ok, the page was not immediately readable, so let's try to read ahead while we're at it.. + */ +page_not_up_to_date: + page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache); + + if (Page_Uptodate(page)) + goto page_ok; + + /* Get exclusive access to the page ... */ + lock_page(page); + if (Page_Uptodate(page)) { + UnlockPage(page); + goto page_ok; + } + +readpage: + /* ... and start the actual read. The read will unlock the page. */ + error = inode->i_op->readpage(filp, page); + + if (!error) { + if (Page_Uptodate(page)) + goto page_ok; + + /* Again, try some read-ahead while waiting for the page to finish.. */ + page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache); + wait_on_page(page); + if (Page_Uptodate(page)) + goto page_ok; + error = -EIO; + } + + /* UHHUH! A synchronous read error occurred. Report it */ + desc->error = error; + page_cache_release(page); + break; + no_cached_page: /* * Ok, it wasn't cached, so we need to create a new * page.. + * + * We get here with the page cache lock held. */ if (!page_cache) { + spin_unlock(&pagecache_lock); page_cache = page_cache_alloc(); + if (!page_cache) { + desc->error = -ENOMEM; + break; + } + /* - * That could have slept, so go around to the - * very beginning.. + * Somebody may have added the page while we + * dropped the page cache lock. Check for that. */ - if (page_cache) - continue; - desc->error = -ENOMEM; - break; + spin_lock(&pagecache_lock); + page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash); + if (page) + goto found_page; } /* * Ok, add the new page to the hash-queues... */ page = page_cache_entry(page_cache); - page_cache = 0; - add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash); - - /* - * Error handling is tricky. If we get a read error, - * the cached page stays in the cache (but uptodate=0), - * and the next process that accesses it will try to - * re-read it. This is needed for NFS etc, where the - * identity of the reader can decide if we can read the - * page or not.. - */ -/* - * We have to read the page. - * If we were reading ahead, we had previously tried to read this page, - * That means that the page has probably been removed from the cache before - * the application process needs it, or has been rewritten. - * Decrease max readahead size to the minimum value in that situation. - */ - if (reada_ok && filp->f_ramax > MIN_READAHEAD) - filp->f_ramax = MIN_READAHEAD; - - { - int error = inode->i_op->readpage(filp, page); - if (!error) - goto found_page; - desc->error = error; - page_cache_release(page); - break; - } + __add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash); + spin_unlock(&pagecache_lock); -page_read_error: - /* - * We found the page, but it wasn't up-to-date. 
- * Try to re-read it _once_. We do this synchronously, - * because this happens only if there were errors. - */ - { - int error = inode->i_op->readpage(filp, page); - if (!error) { - wait_on_page(page); - if (PageUptodate(page) && !PageError(page)) - goto success; - error = -EIO; /* Some unspecified error occurred.. */ - } - desc->error = error; - page_cache_release(page); - break; - } + page_cache = 0; + goto readpage; } *ppos = pos; @@ -787,6 +1159,7 @@ ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t * { ssize_t retval; + unlock_kernel(); retval = -EFAULT; if (access_ok(VERIFY_WRITE, buf, count)) { retval = 0; @@ -804,6 +1177,7 @@ ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t * retval = desc.error; } } + lock_kernel(); return retval; } @@ -812,17 +1186,14 @@ static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned ssize_t written; unsigned long count = desc->count; struct file *file = (struct file *) desc->buf; - struct inode *inode = file->f_dentry->d_inode; mm_segment_t old_fs; if (size > count) size = count; - down(&inode->i_sem); old_fs = get_fs(); set_fs(KERNEL_DS); written = file->f_op->write(file, area, size, &file->f_pos); set_fs(old_fs); - up(&inode->i_sem); if (written < 0) { desc->error = written; written = 0; @@ -878,6 +1249,7 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t cou if (retval) goto fput_out; + unlock_kernel(); retval = 0; if (count) { read_descriptor_t desc; @@ -887,7 +1259,7 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t cou ppos = &in_file->f_pos; if (offset) { if (get_user(pos, offset)) - goto fput_out; + goto fput_out_lock; ppos = &pos; } @@ -904,7 +1276,8 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t cou put_user(pos, offset); } - +fput_out_lock: + lock_kernel(); fput_out: fput(out_file); fput_in: @@ -934,17 +1307,21 @@ static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long unsigned long offset, reada, i; struct page * page, **hash; unsigned long old_page, new_page; + int error; new_page = 0; offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset; if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm) - goto no_page; + goto no_page_nolock; + + unlock_kernel(); /* * Do we have something in the page cache already? */ hash = page_hash(inode, offset); - page = __find_page(inode, offset, *hash); +retry_find: + page = __find_get_page(inode, offset, hash); if (!page) goto no_cached_page; @@ -960,15 +1337,17 @@ found_page: goto failure; } - if (PageLocked(page)) - goto page_locked_wait; - if (!PageUptodate(page)) - goto page_read_error; + if (!Page_Uptodate(page)) { + lock_page(page); + if (!Page_Uptodate(page)) + goto page_not_uptodate; + UnlockPage(page); + } success: /* - * Found the page, need to check sharing and possibly - * copy it over to another page.. + * Found the page and have a reference on it, need to check sharing + * and possibly copy it over to another page.. */ old_page = page_address(page); if (!no_share) { @@ -980,6 +1359,7 @@ success: page_cache_free(new_page); flush_page_to_ram(old_page); + lock_kernel(); return old_page; } @@ -989,6 +1369,7 @@ success: copy_page(new_page, old_page); flush_page_to_ram(new_page); page_cache_release(page); + lock_kernel(); return new_page; no_cached_page: @@ -1013,7 +1394,7 @@ no_cached_page: * cache.. 
The page we just got may be useful if we * can't share, so don't get rid of it here. */ - page = find_page(inode, offset); + page = __find_get_page(inode, offset, hash); if (page) goto found_page; @@ -1021,19 +1402,24 @@ no_cached_page: * Now, create a new page-cache page from the page we got */ page = page_cache_entry(new_page); - new_page = 0; - add_to_page_cache(page, inode, offset, hash); + if (add_to_page_cache_unique(page, inode, offset, hash)) + goto retry_find; - if (inode->i_op->readpage(file, page) != 0) - goto failure; + /* + * Now it's ours and locked, we can do initial IO to it: + */ + new_page = 0; - goto found_page; +page_not_uptodate: + error = inode->i_op->readpage(file, page); -page_locked_wait: - __wait_on_page(page); - if (PageUptodate(page)) + if (!error) { + wait_on_page(page); + if (PageError(page)) + goto page_read_error; goto success; - + } + page_read_error: /* * Umm, take care of errors if the page isn't up-to-date. @@ -1041,12 +1427,14 @@ page_read_error: * because there really aren't any performance issues here * and we need to check for errors. */ - if (inode->i_op->readpage(file, page) != 0) + if (!PageLocked(page)) + PAGE_BUG(page); + ClearPageError(page); + error = inode->i_op->readpage(file, page); + if (error) goto failure; wait_on_page(page); - if (PageError(page)) - goto failure; - if (PageUptodate(page)) + if (Page_Uptodate(page)) goto success; /* @@ -1058,6 +1446,8 @@ failure: if (new_page) page_cache_free(new_page); no_page: + lock_kernel(); +no_page_nolock: return 0; } @@ -1066,12 +1456,13 @@ no_page: * if the disk is full. */ static inline int do_write_page(struct inode * inode, struct file * file, - const char * page, unsigned long offset) + const char * page_addr, unsigned long offset) { int retval; unsigned long size; loff_t loff = offset; - mm_segment_t old_fs; + int (*writepage) (struct file *, struct page *); + struct page * page; size = offset + PAGE_SIZE; /* refuse to extend file size.. 
*/ @@ -1083,12 +1474,21 @@ static inline int do_write_page(struct inode * inode, struct file * file, return -EIO; } size -= offset; - old_fs = get_fs(); - set_fs(KERNEL_DS); retval = -EIO; - if (size == file->f_op->write(file, (const char *) page, size, &loff)) - retval = 0; - set_fs(old_fs); + writepage = inode->i_op->writepage; + page = mem_map + MAP_NR(page_addr); + lock_page(page); + + if (writepage) { + retval = writepage(file, page); + } else { + mm_segment_t old_fs = get_fs(); + set_fs(KERNEL_DS); + if (size == file->f_op->write(file, page_addr, size, &loff)) + retval = 0; + set_fs(old_fs); + } + UnlockPage(page); return retval; } @@ -1124,9 +1524,7 @@ static int filemap_write_page(struct vm_area_struct * vma, return 0; } - down(&inode->i_sem); result = do_write_page(inode, file, (const char *) page, offset); - up(&inode->i_sem); fput(file); return result; } @@ -1146,7 +1544,8 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { pte_t pte = *ptep; - unsigned long page; + unsigned long pageaddr; + struct page *page; int error; if (!(flags & MS_INVALIDATE)) { @@ -1158,8 +1557,9 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, flush_cache_page(vma, address); set_pte(ptep, pte_mkclean(pte)); flush_tlb_page(vma, address); - page = pte_page(pte); - atomic_inc(&page_cache_entry(page)->count); + pageaddr = pte_page(pte); + page = page_cache_entry(pageaddr); + get_page(page); } else { if (pte_none(pte)) return 0; @@ -1170,14 +1570,14 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, swap_free(pte_val(pte)); return 0; } - page = pte_page(pte); + pageaddr = pte_page(pte); if (!pte_dirty(pte) || flags == MS_INVALIDATE) { - page_cache_free(page); + page_cache_free(pageaddr); return 0; } } - error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1); - page_cache_free(page); + error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, pageaddr, 1); + page_cache_free(pageaddr); return error; } @@ -1338,10 +1738,7 @@ static int msync_interval(struct vm_area_struct * vma, struct file * file = vma->vm_file; if (file) { struct dentry * dentry = file->f_dentry; - struct inode * inode = dentry->d_inode; - down(&inode->i_sem); error = file_fsync(file, dentry); - up(&inode->i_sem); } } return error; @@ -1436,11 +1833,12 @@ generic_file_write(struct file *file, const char *buf, unsigned long page_cache = 0; unsigned long written; long status; + int err; - if (file->f_error) { - int error = file->f_error; + err = file->f_error; + if (err) { file->f_error = 0; - return error; + goto out; } written = 0; @@ -1451,7 +1849,7 @@ generic_file_write(struct file *file, const char *buf, /* * Check whether we've reached the file size limit. 
*/ - status = -EFBIG; + err = -EFBIG; if (pos >= limit) { send_sig(SIGXFSZ, current, 0); goto out; @@ -1467,6 +1865,8 @@ generic_file_write(struct file *file, const char *buf, count = limit - pos; } + unlock_kernel(); + while (count) { unsigned long bytes, pgpos, offset; /* @@ -1480,29 +1880,36 @@ generic_file_write(struct file *file, const char *buf, bytes = count; hash = page_hash(inode, pgpos); - page = __find_page(inode, pgpos, *hash); +repeat_find: + page = __find_lock_page(inode, pgpos, hash); if (!page) { if (!page_cache) { page_cache = page_cache_alloc(); if (page_cache) - continue; + goto repeat_find; status = -ENOMEM; break; } page = page_cache_entry(page_cache); - add_to_page_cache(page, inode, pgpos, hash); + if (add_to_page_cache_unique(page,inode,pgpos,hash)) + goto repeat_find; + page_cache = 0; } - /* Get exclusive IO access to the page.. */ - wait_on_page(page); - set_bit(PG_locked, &page->flags); + /* We have exclusive IO access to the page.. */ + if (!PageLocked(page)) { + PAGE_BUG(page); + } else { + if (page->owner != (int)current) { + PAGE_BUG(page); + } + } status = write_one_page(file, page, offset, bytes, buf); /* Mark it unlocked again and drop the page.. */ - clear_bit(PG_locked, &page->flags); - wake_up(&page->wait); + UnlockPage(page); page_cache_release(page); if (status < 0) @@ -1519,51 +1926,16 @@ generic_file_write(struct file *file, const char *buf, if (page_cache) page_cache_free(page_cache); + + err = written ? written : status; + lock_kernel(); out: - return written ? written : status; + return err; } /* - * Support routines for directory cacheing using the page cache. - */ - -/* - * Finds the page at the specified offset, installing a new page - * if requested. The count is incremented and the page is locked. - * - * Note: we don't have to worry about races here, as the caller - * is holding the inode semaphore. + * Support routines for directory caching using the page cache. */ -unsigned long get_cached_page(struct inode * inode, unsigned long offset, - int new) -{ - struct page * page; - struct page ** hash; - unsigned long page_cache = 0; - - hash = page_hash(inode, offset); - page = __find_page(inode, offset, *hash); - if (!page) { - if (!new) - goto out; - page_cache = page_cache_alloc(); - if (!page_cache) - goto out; - clear_page(page_cache); - page = page_cache_entry(page_cache); - add_to_page_cache(page, inode, offset, hash); - } - if (atomic_read(&page->count) != 2) - printk(KERN_ERR "get_cached_page: page count=%d\n", - atomic_read(&page->count)); - if (test_bit(PG_locked, &page->flags)) - printk(KERN_ERR "get_cached_page: page already locked!\n"); - set_bit(PG_locked, &page->flags); - page_cache = page_address(page); - -out: - return page_cache; -} /* * Unlock and free a page. 
@@ -1572,13 +1944,10 @@ void put_cached_page(unsigned long addr) { struct page * page = page_cache_entry(addr); - if (!test_bit(PG_locked, &page->flags)) - printk("put_cached_page: page not locked!\n"); - if (atomic_read(&page->count) != 2) - printk("put_cached_page: page count=%d\n", - atomic_read(&page->count)); - clear_bit(PG_locked, &page->flags); - wake_up(&page->wait); + UnlockPage(page); + if (page_count(page) != 2) + panic("put_cached_page: page count=%d\n", + page_count(page)); page_cache_release(page); } @@ -1607,11 +1976,13 @@ static inline struct pio_request * get_pio_request(void) static inline void make_pio_request(struct file *file, unsigned long offset, - unsigned long page) + unsigned long pageaddr) { struct pio_request *p; + struct page *page; - atomic_inc(&page_cache_entry(page)->count); + page = page_cache_entry(pageaddr); + get_page(page); /* * We need to allocate without causing any recursive IO in the @@ -1634,7 +2005,7 @@ static inline void make_pio_request(struct file *file, p->file = file; p->offset = offset; - p->page = page; + p->page = pageaddr; put_pio_request(p); wake_up(&pio_wait); @@ -1694,10 +2065,8 @@ int kpiod(void * unused) dentry = p->file->f_dentry; inode = dentry->d_inode; - down(&inode->i_sem); do_write_page(inode, p->file, (const char *) p->page, p->offset); - up(&inode->i_sem); fput(p->file); page_cache_free(p->page); kmem_cache_free(pio_request_cache, p); diff --git a/mm/memory.c b/mm/memory.c index ae56831b3..aac203bbb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -272,7 +272,7 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; if (vma->vm_flags & VM_SHARED) pte = pte_mkclean(pte); set_pte(dst_pte, pte_mkold(pte)); - atomic_inc(&mem_map[page_nr].count); + get_page(mem_map + page_nr); cont_copy_pte_range: address += PAGE_SIZE; if (address >= end) @@ -556,7 +556,7 @@ unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsig if (MAP_NR(page) >= max_mapnr) printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address); - if (atomic_read(&mem_map[MAP_NR(page)].count) != 1) + if (page_count(mem_map + MAP_NR(page)) != 1) printk("mem_map disagrees with %08lx at %08lx\n",page,address); pgd = pgd_offset(tsk->mm,address); pmd = pmd_alloc(pgd, address); @@ -604,17 +604,17 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t pte) { unsigned long old_page, new_page; - struct page * page_map; + struct page * page; new_page = __get_free_page(GFP_USER); - /* Did swap_out() unmapped the protected page while we slept? */ + /* Did swap_out() unmap the protected page while we slept? */ if (pte_val(*page_table) != pte_val(pte)) goto end_wp_page; old_page = pte_page(pte); if (MAP_NR(old_page) >= max_mapnr) goto bad_wp_page; tsk->min_flt++; - page_map = mem_map + MAP_NR(old_page); + page = mem_map + MAP_NR(old_page); /* * We can avoid the copy if: @@ -624,13 +624,13 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, * in which case we can remove the page * from the swap cache. 
*/ - switch (atomic_read(&page_map->count)) { + switch (page_count(page)) { case 2: - if (!PageSwapCache(page_map)) + if (!PageSwapCache(page)) break; - if (swap_count(page_map->offset) != 1) + if (swap_count(page->offset) != 1) break; - delete_from_swap_cache(page_map); + delete_from_swap_cache(page); /* FallThrough */ case 1: flush_cache_page(vma, address); @@ -652,7 +652,7 @@ end_wp_page: if (!new_page) goto no_new_page; - if (PageReserved(page_map)) + if (PageReserved(page)) ++vma->vm_mm->rss; copy_cow_page(old_page,new_page); flush_page_to_ram(old_page); @@ -661,7 +661,7 @@ end_wp_page: set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); flush_tlb_page(vma, address); unlock_kernel(); - __free_page(page_map); + __free_page(page); return 1; bad_wp_page: @@ -776,7 +776,7 @@ static int do_swap_page(struct task_struct * tsk, if (pte_val(*page_table) != pte_val(entry)) { free_page(pte_page(page)); } else { - if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 && + if (page_count(mem_map + MAP_NR(pte_page(page))) > 1 && !(vma->vm_flags & VM_SHARED)) page = pte_wrprotect(page); ++vma->vm_mm->rss; @@ -861,7 +861,7 @@ static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, entry = mk_pte(page, vma->vm_page_prot); if (write_access) { entry = pte_mkwrite(pte_mkdirty(entry)); - } else if (atomic_read(&mem_map[MAP_NR(page)].count) > 1 && + } else if (page_count(mem_map+MAP_NR(page)) > 1 && !(vma->vm_flags & VM_SHARED)) entry = pte_wrprotect(entry); set_pte(page_table, entry); diff --git a/mm/mmap.c b/mm/mmap.c index 6e5eda00d..e179a2932 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -63,7 +63,7 @@ int vm_enough_memory(long pages) return 1; free = buffermem >> PAGE_SHIFT; - free += page_cache_size; + free += atomic_read(&page_cache_size); free += nr_free_pages; free += nr_swap_pages; free -= (page_cache.min_percent + buffer_mem.min_percent + 2)*num_physpages/100; @@ -728,6 +728,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len) struct vm_area_struct * vma; unsigned long flags, retval; + len = PAGE_ALIGN(len); + if (!len) + return addr; + /* * mlock MCL_FUTURE? 
*/ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8826b9af1..fad87ba27 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -119,33 +119,33 @@ static inline void free_pages_ok(unsigned long map_nr, unsigned long order) spin_unlock_irqrestore(&page_alloc_lock, flags); } -void __free_page(struct page *page) +int __free_page(struct page *page) { - if (!PageReserved(page) && atomic_dec_and_test(&page->count)) { + if (!PageReserved(page) && put_page_testzero(page)) { if (PageSwapCache(page)) - panic ("Freeing swap cache page"); + PAGE_BUG(page); page->flags &= ~(1 << PG_referenced); free_pages_ok(page - mem_map, 0); - return; + return 1; } + return 0; } -void free_pages(unsigned long addr, unsigned long order) +int free_pages(unsigned long addr, unsigned long order) { unsigned long map_nr = MAP_NR(addr); if (map_nr < max_mapnr) { mem_map_t * map = mem_map + map_nr; - if (PageReserved(map)) - return; - if (atomic_dec_and_test(&map->count)) { + if (!PageReserved(map) && put_page_testzero(map)) { if (PageSwapCache(map)) - panic ("Freeing swap cache pages"); + PAGE_BUG(map); map->flags &= ~(1 << PG_referenced); free_pages_ok(map_nr, order); - return; + return 1; } } + return 0; } /* @@ -167,7 +167,7 @@ do { struct free_area_struct * area = free_area+order; \ MARK_USED(map_nr, new_order, area); \ nr_free_pages -= 1 << order; \ EXPAND(ret, map_nr, order, new_order, area); \ - spin_unlock_irqrestore(&page_alloc_lock, flags); \ + spin_unlock_irqrestore(&page_alloc_lock,flags);\ return ADDRESS(map_nr); \ } \ prev = ret; \ @@ -186,7 +186,7 @@ do { unsigned long size = 1 << high; \ index += size; \ map += size; \ } \ - atomic_set(&map->count, 1); \ + set_page_count(map, 1); \ } while (0) int low_on_memory = 0; @@ -321,7 +321,7 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m memset(mem_map, 0, start_mem - (unsigned long) mem_map); do { --p; - atomic_set(&p->count, 0); + set_page_count(p, 0); p->flags = (1 << PG_DMA) | (1 << PG_reserved); init_waitqueue_head(&p->wait); } while (p > mem_map); diff --git a/mm/page_io.c b/mm/page_io.c index 9f5e82446..2226c2c9d 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -47,7 +47,7 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in #ifdef DEBUG_SWAP printk ("DebugVM: %s_swap_page entry %08lx, page %p (count %d), %s\n", (rw == READ) ? "read" : "write", - entry, (char *) page_address(page), atomic_read(&page->count), + entry, (char *) page_address(page), page_count(page), wait ? "wait" : "nowait"); #endif @@ -105,12 +105,12 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in } } if (rw == READ) { - clear_bit(PG_uptodate, &page->flags); + ClearPageUptodate(page); kstat.pswpin++; } else kstat.pswpout++; - atomic_inc(&page->count); + get_page(page); if (p->swap_device) { zones[0] = offset; zones_used = 1; @@ -167,7 +167,7 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in printk("swap_after_unlock_page: lock already cleared\n"); wake_up(&lock_queue); } - atomic_dec(&page->count); + put_page(page); return; } if (!wait) { @@ -182,23 +182,24 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in /* block_size == PAGE_SIZE/zones_used */ brw_page(rw, page, dev, zones, block_size, 0); - + /* Note! For consistency we do all of the logic, * decrementing the page count, and unlocking the page in the * swap lock map - in the IO completion handler. 
*/ - if (!wait) + if (!wait) { return; + } wait_on_page(page); /* This shouldn't happen, but check to be sure. */ - if (atomic_read(&page->count) == 0) + if (page_count(page) == 0) printk(KERN_ERR "rw_swap_page: page unused while waiting!\n"); #ifdef DEBUG_SWAP printk ("DebugVM: %s_swap_page finished on page %p (count %d)\n", (rw == READ) ? "read" : "write", - (char *) page_adddress(page), - atomic_read(&page->count)); + (char *) page_address(page), + page_count(page)); #endif } @@ -238,7 +239,7 @@ void rw_swap_page(int rw, unsigned long entry, char *buf, int wait) struct page *page = mem_map + MAP_NR(buf); if (page->inode && page->inode != &swapper_inode) - panic ("Tried to swap a non-swapper page"); + PAGE_BUG(page); /* * Make sure that we have a swap cache association for this @@ -268,23 +269,27 @@ void rw_swap_page_nocache(int rw, unsigned long entry, char *buffer) struct page *page; page = mem_map + MAP_NR((unsigned long) buffer); - wait_on_page(page); - set_bit(PG_locked, &page->flags); - if (test_and_set_bit(PG_swap_cache, &page->flags)) { - printk ("VM: read_swap_page: page already in swap cache!\n"); - return; - } - if (page->inode) { - printk ("VM: read_swap_page: page already in page cache!\n"); - return; - } + + if (TryLockPage(page)) + PAGE_BUG(page); + if (test_and_set_bit(PG_swap_cache, &page->flags)) + PAGE_BUG(page); + if (page->inode) + PAGE_BUG(page); + get_page(page); /* Protect from shrink_mmap() */ page->inode = &swapper_inode; page->offset = entry; - atomic_inc(&page->count); /* Protect from shrink_mmap() */ rw_swap_page(rw, entry, buffer, 1); - atomic_dec(&page->count); - page->inode = 0; - clear_bit(PG_swap_cache, &page->flags); + + /* + * and now remove it from the pagecache ... + */ + if (TryLockPage(page)) + PAGE_BUG(page); + PageClearSwapCache(page); + remove_inode_page(page); + page_cache_release(page); + UnlockPage(page); } /* diff --git a/mm/swap_state.c b/mm/swap_state.c index 8c5e7176c..21723c1db 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -25,7 +25,31 @@ * ensure that any mistaken dereferences of this structure cause a * kernel oops. 
*/ -struct inode swapper_inode; + +static struct inode_operations swapper_inode_operations = { + NULL, /* default file operations */ + NULL, /* create */ + NULL, /* lookup */ + NULL, /* link */ + NULL, /* unlink */ + NULL, /* symlink */ + NULL, /* mkdir */ + NULL, /* rmdir */ + NULL, /* mknod */ + NULL, /* rename */ + NULL, /* readlink */ + NULL, /* follow_link */ + NULL, /* bmap */ + NULL, /* readpage */ + NULL, /* writepage */ + block_flushpage, /* flushpage */ + NULL, /* truncate */ + NULL, /* permission */ + NULL, /* smap */ + NULL /* revalidate */ +}; + +struct inode swapper_inode = { i_op: &swapper_inode_operations }; #ifdef SWAP_CACHE_INFO unsigned long swap_cache_add_total = 0; @@ -49,20 +73,20 @@ int add_to_swap_cache(struct page *page, unsigned long entry) #endif #ifdef DEBUG_SWAP printk("DebugVM: add_to_swap_cache(%08lx count %d, entry %08lx)\n", - page_address(page), atomic_read(&page->count), entry); + page_address(page), page_count(page), entry); #endif if (PageTestandSetSwapCache(page)) { printk(KERN_ERR "swap_cache: replacing non-empty entry %08lx " - "on page %08lx\n", - page->offset, page_address(page)); + "on page %08lx\n", + page->offset, page_address(page)); return 0; } if (page->inode) { printk(KERN_ERR "swap_cache: replacing page-cached entry " - "on page %08lx\n", page_address(page)); + "on page %08lx\n", page_address(page)); return 0; } - atomic_inc(&page->count); + get_page(page); page->inode = &swapper_inode; page->offset = entry; add_page_to_hash_queue(page, &swapper_inode, entry); @@ -111,7 +135,7 @@ int swap_duplicate(unsigned long entry) result = 1; #ifdef DEBUG_SWAP printk("DebugVM: swap_duplicate(entry %08lx, count now %d)\n", - entry, p->swap_map[offset]); + entry, p->swap_map[offset]); #endif out: return result; @@ -127,7 +151,7 @@ bad_offset: bad_unused: printk(KERN_ERR "swap_duplicate at %8p: entry %08lx, unused page\n", - __builtin_return_address(0), entry); + __builtin_return_address(0), entry); goto out; } @@ -153,7 +177,7 @@ int swap_count(unsigned long entry) retval = p->swap_map[offset]; #ifdef DEBUG_SWAP printk("DebugVM: swap_count(entry %08lx, count %d)\n", - entry, retval); + entry, retval); #endif out: return retval; @@ -163,16 +187,16 @@ bad_entry: goto out; bad_file: printk(KERN_ERR - "swap_count: entry %08lx, nonexistent swap file!\n", entry); + "swap_count: entry %08lx, nonexistent swap file!\n", entry); goto out; bad_offset: printk(KERN_ERR - "swap_count: entry %08lx, offset exceeds max!\n", entry); + "swap_count: entry %08lx, offset exceeds max!\n", entry); goto out; bad_unused: printk(KERN_ERR - "swap_count at %8p: entry %08lx, unused page!\n", - __builtin_return_address(0), entry); + "swap_count at %8p: entry %08lx, unused page!\n", + __builtin_return_address(0), entry); goto out; } @@ -190,18 +214,17 @@ static inline void remove_from_swap_cache(struct page *page) #ifdef DEBUG_SWAP printk("DebugVM: remove_from_swap_cache(%08lx count %d)\n", - page_address(page), atomic_read(&page->count)); + page_address(page), page_count(page)); #endif - PageClearSwapCache (page); + PageClearSwapCache(page); remove_inode_page(page); } - /* * This must be called only on pages that have * been verified to be in the swap cache. 
*/ -void delete_from_swap_cache(struct page *page) +void __delete_from_swap_cache(struct page *page) { long entry = page->offset; @@ -210,13 +233,27 @@ void delete_from_swap_cache(struct page *page) #endif #ifdef DEBUG_SWAP printk("DebugVM: delete_from_swap_cache(%08lx count %d, " - "entry %08lx)\n", - page_address(page), atomic_read(&page->count), entry); + "entry %08lx)\n", + page_address(page), page_count(page), entry); #endif remove_from_swap_cache (page); swap_free (entry); } +/* + * This must be called only on pages that have + * been verified to be in the swap cache. + */ +void delete_from_swap_cache(struct page *page) +{ + lock_page(page); + + __delete_from_swap_cache(page); + + UnlockPage(page); + page_cache_release(page); +} + /* * Perform a free_page(), also freeing any swap cache associated with * this page if it is the last user of the page. @@ -229,18 +266,18 @@ void free_page_and_swap_cache(unsigned long addr) /* * If we are the only user, then free up the swap cache. */ - if (PageSwapCache(page) && !is_page_shared(page)) { + if (PageSwapCache(page) && !is_page_shared(page)) delete_from_swap_cache(page); - } __free_page(page); } /* - * Lookup a swap entry in the swap cache. We need to be careful about - * locked pages. A found page will be returned with its refcount - * incremented. + * Lookup a swap entry in the swap cache. A found page will be returned + * unlocked and with its refcount incremented - we rely on the kernel + * lock getting page table operations atomic even if we drop the page + * lock before returning. */ struct page * lookup_swap_cache(unsigned long entry) @@ -251,23 +288,21 @@ struct page * lookup_swap_cache(unsigned long entry) swap_cache_find_total++; #endif while (1) { - found = find_page(&swapper_inode, entry); + found = find_lock_page(&swapper_inode, entry); if (!found) return 0; if (found->inode != &swapper_inode || !PageSwapCache(found)) goto out_bad; - if (!PageLocked(found)) { #ifdef SWAP_CACHE_INFO - swap_cache_find_success++; + swap_cache_find_success++; #endif - return found; - } - __free_page(found); - __wait_on_page(found); + UnlockPage(found); + return found; } out_bad: printk (KERN_ERR "VM: Found a non-swapper swap page!\n"); + UnlockPage(found); __free_page(found); return 0; } @@ -288,7 +323,7 @@ struct page * read_swap_cache_async(unsigned long entry, int wait) #ifdef DEBUG_SWAP printk("DebugVM: read_swap_cache_async entry %08lx%s\n", - entry, wait ? ", wait" : ""); + entry, wait ? ", wait" : ""); #endif /* * Make sure the swap entry is still in use. 
@@ -319,12 +354,12 @@ struct page * read_swap_cache_async(unsigned long entry, int wait) if (!add_to_swap_cache(new_page, entry)) goto out_free_page; - set_bit(PG_locked, &new_page->flags); + LockPage(new_page); rw_swap_page(READ, entry, (char *) new_page_addr, wait); #ifdef DEBUG_SWAP printk("DebugVM: read_swap_cache_async created " - "entry %08lx at %p\n", - entry, (char *) page_address(new_page)); + "entry %08lx at %p\n", + entry, (char *) page_address(new_page)); #endif return new_page; @@ -335,3 +370,4 @@ out_free_swap: out: return found_page; } + diff --git a/mm/swapfile.c b/mm/swapfile.c index de29f1006..794e39aff 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -192,7 +192,7 @@ static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, return; set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot))); swap_free(entry); - atomic_inc(&mem_map[MAP_NR(page)].count); + get_page(mem_map + MAP_NR(page)); ++vma->vm_mm->rss; } diff --git a/mm/vmscan.c b/mm/vmscan.c index d651e6f94..9ca4988e4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -157,7 +157,7 @@ drop_pte: add_to_swap_cache(page_map, entry); /* We checked we were unlocked way up above, and we have been careful not to stall until here */ - set_bit(PG_locked, &page_map->flags); + LockPage(page_map); /* OK, do a physical asynchronous write to swap. */ rw_swap_page(WRITE, entry, (char *) page, 0); -- cgit v1.2.3
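
The heart of this merge is Ingo Molnar's SMP-threading of the page cache: every hash lookup in mm/filemap.c now runs under the new pagecache_lock spinlock, a reference is taken with get_page() before that lock is dropped, and the per-page lock (lock_page()/TryLockPage()) is reserved for actual IO. The following user-space C sketch illustrates that "look up, pin, drop the lock" discipline; every identifier in it (cache_node, find_get, node_release) is hypothetical, a pthread mutex stands in for pagecache_lock, and a C11 atomic stands in for page->count. It is an illustration of the pattern, not kernel code.

/* User-space analogue of the __find_get_page() discipline above.
 * Build with: cc -std=c11 -pthread sketch.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

#define HASH_SIZE 64

struct cache_node {
	struct cache_node *next_hash;	/* plays the role of page->next_hash */
	unsigned long offset;
	atomic_int refcount;		/* plays the role of page->count */
};

static struct cache_node *hash_table[HASH_SIZE];
static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ pagecache_lock */

/* Find a node and take a reference *before* dropping the lock, exactly
 * as __find_get_page() calls get_page() under pagecache_lock: the pin,
 * not the lock, is what keeps the node alive after we unlock. */
struct cache_node *find_get(unsigned long offset)
{
	struct cache_node *n;

	pthread_mutex_lock(&cache_lock);
	for (n = hash_table[offset % HASH_SIZE]; n; n = n->next_hash)
		if (n->offset == offset) {
			atomic_fetch_add(&n->refcount, 1);
			break;
		}
	pthread_mutex_unlock(&cache_lock);
	return n;	/* pinned (or NULL); safe to sleep on without the lock */
}

/* Drop the reference; the last user frees, like page_cache_release(). */
void node_release(struct cache_node *n)
{
	if (atomic_fetch_sub(&n->refcount, 1) == 1)
		free(n);
}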
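
The new generic_buffer_fdatasync() is built the same two-pass way: do_buffer_fdatasync() walks the range once with writeout_one_page() to queue all the writes, then a second time with waitfor_one_page() to collect completion status, so the device sees one large batch of IO instead of a write-then-wait round trip per page. Below is a sketch of that shape, reusing the cache_node helpers from the previous sketch; start_io and wait_io are hypothetical stand-ins for the two callbacks, and the real kernel walks inode->i_pages and locks each page rather than probing offsets as done here.

typedef int (*node_fn)(struct cache_node *);

static int for_each_in_range(unsigned long start, unsigned long end, node_fn fn)
{
	int ret = 0;
	unsigned long off;

	for (off = start; off < end; off++) {
		struct cache_node *n = find_get(off);
		if (n) {
			ret |= fn(n);		/* start or wait, per pass */
			node_release(n);
		}
	}
	return ret;
}

int range_fdatasync(unsigned long start, unsigned long end,
		    node_fn start_io, node_fn wait_io)
{
	int ret = for_each_in_range(start, end, start_io); /* queue all IO first */
	ret |= for_each_in_range(start, end, wait_io);	   /* then reap results */
	return ret;
}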
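
Finally, the reason add_to_page_cache_unique() exists: once lookups no longer hold the lock across a page allocation, two racing readers could each fail to find a page and each insert one, creating exactly the cache aliases the truncate_inode_pages() comment warns about. Doing the re-check and the insertion inside one critical section closes that window. A sketch under the same assumptions as above (the caller allocates the node with refcount already 1, and frees it itself when 1 is returned):

/* Insert-if-absent under a single hold of cache_lock, as
 * add_to_page_cache_unique() does with pagecache_lock.
 * Returns 1 if an entry for this offset already existed. */
int add_unique(struct cache_node *n, unsigned long offset)
{
	struct cache_node **p = &hash_table[offset % HASH_SIZE];
	struct cache_node *alias;
	int err = 1;

	pthread_mutex_lock(&cache_lock);
	for (alias = *p; alias; alias = alias->next_hash)
		if (alias->offset == offset)
			break;
	if (!alias) {
		n->offset = offset;
		atomic_fetch_add(&n->refcount, 1);	/* the cache's own pin, like
							   get_page() in
							   __add_to_page_cache() */
		n->next_hash = *p;
		*p = n;
		err = 0;
	}
	pthread_mutex_unlock(&cache_lock);
	return err;
}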