Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c     959
-rw-r--r--  mm/memory.c       26
-rw-r--r--  mm/mmap.c          6
-rw-r--r--  mm/page_alloc.c   26
-rw-r--r--  mm/page_io.c      53
-rw-r--r--  mm/swap_state.c  106
-rw-r--r--  mm/swapfile.c      2
-rw-r--r--  mm/vmscan.c        2
8 files changed, 797 insertions, 383 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 455f334f3..4e885758f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1,7 +1,7 @@
/*
* linux/mm/filemap.c
*
- * Copyright (C) 1994, 1995 Linus Torvalds
+ * Copyright (C) 1994-1999 Linus Torvalds
*/
/*
@@ -29,9 +29,12 @@
* though.
*
* Shared mappings now work. 15.8.1995 Bruno.
+ *
+ * finished 'unifying' the page and buffer cache and SMP-threaded the
+ * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
*/
-unsigned long page_cache_size = 0;
+atomic_t page_cache_size = ATOMIC_INIT(0);
struct page * page_hash_table[PAGE_HASH_SIZE];
/*
@@ -50,38 +53,97 @@ static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
static kmem_cache_t *pio_request_cache;
static DECLARE_WAIT_QUEUE_HEAD(pio_wait);
+spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
+
+
static inline void
make_pio_request(struct file *, unsigned long, unsigned long);
+void __add_page_to_hash_queue(struct page * page, struct page **p){
+ atomic_inc(&page_cache_size);
+ if((page->next_hash = *p) != NULL)
+ (*p)->pprev_hash = &page->next_hash;
+ *p = page;
+ page->pprev_hash = p;
+ if (page->buffers)
+ PAGE_BUG(page);
+}
+
+static void remove_page_from_hash_queue(struct page * page)
+{
+ if(page->pprev_hash) {
+ if(page->next_hash)
+ page->next_hash->pprev_hash = page->pprev_hash;
+ *page->pprev_hash = page->next_hash;
+ page->pprev_hash = NULL;
+ }
+ atomic_dec(&page_cache_size);
+}
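
The two helpers above use the pprev trick that later became the kernel's hlist: each node stores the address of the pointer that points at it, so removal needs no list walk and no special case for the head. A minimal, self-contained userspace model of the same chain (hypothetical names, not the kernel code):

#include <assert.h>
#include <stddef.h>

struct node {
	struct node *next;	/* next element in the chain */
	struct node **pprev;	/* address of the pointer pointing at us */
};

static void chain_add(struct node *n, struct node **head)
{
	if ((n->next = *head) != NULL)
		(*head)->pprev = &n->next;
	*head = n;
	n->pprev = head;
}

static void chain_del(struct node *n)
{
	if (n->pprev) {
		if (n->next)
			n->next->pprev = n->pprev;
		*n->pprev = n->next;
		n->pprev = NULL;
	}
}

int main(void)
{
	struct node *head = NULL, a = { NULL, NULL }, b = { NULL, NULL };

	chain_add(&a, &head);
	chain_add(&b, &head);	/* head -> b -> a */
	chain_del(&b);		/* removal without walking the chain */
	assert(head == &a && a.next == NULL);
	return 0;
}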
+
+static void remove_page_from_inode_queue(struct page * page)
+{
+ struct inode * inode = page->inode;
+ struct page *prev, *next;
+
+ inode->i_nrpages--;
+ next = page->next;
+ prev = page->prev;
+ if (inode->i_pages == page)
+ inode->i_pages = next;
+ if (next)
+ next->prev = prev;
+ if (prev)
+ prev->next = next;
+ page->next = NULL;
+ page->prev = NULL;
+}
/*
- * Invalidate the pages of an inode, removing all pages that aren't
- * locked down (those are sure to be up-to-date anyway, so we shouldn't
- * invalidate them).
+ * Remove a page from the page cache and free it. Caller has to make
+ * sure the page is locked and that nobody else uses it - or that usage
+ * is safe.
*/
+void remove_inode_page(struct page *page)
+{
+ if (!PageLocked(page))
+ PAGE_BUG(page);
+
+ spin_lock(&pagecache_lock);
+ remove_page_from_inode_queue(page);
+ remove_page_from_hash_queue(page);
+ page->inode = NULL;
+ spin_unlock(&pagecache_lock);
+}
+
void invalidate_inode_pages(struct inode * inode)
{
struct page ** p;
struct page * page;
+repeat:
+ spin_lock(&pagecache_lock);
p = &inode->i_pages;
while ((page = *p) != NULL) {
- if (PageLocked(page)) {
- p = &page->next;
- continue;
+ get_page(page);
+ if (TryLockPage(page)) {
+ spin_unlock(&pagecache_lock);
+ wait_on_page(page);
+ page_cache_release(page);
+ goto repeat;
}
- inode->i_nrpages--;
- if ((*p = page->next) != NULL)
- (*p)->prev = page->prev;
- page->next = NULL;
- page->prev = NULL;
+ if (page_count(page) != 2)
+ printk("hm, busy page invalidated? (not necesserily a bug)\n");
+
+ remove_page_from_inode_queue(page);
remove_page_from_hash_queue(page);
page->inode = NULL;
+ UnlockPage(page);
+ page_cache_release(page);
page_cache_release(page);
- continue;
+
}
+ spin_unlock(&pagecache_lock);
}
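
The repeat/TryLockPage dance above is the central idiom of this patch: never sleep on a page while holding the pagecache spinlock; instead trylock, and on failure drop the spinlock, wait, and rescan from the top (the reference taken with get_page() keeps the page alive meanwhile). A hedged userspace analogue of that control flow, with C11 atomics standing in for the page lock and sched_yield() for the wait queue (all names hypothetical):

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_bool locked;
	struct obj *next;
};

/* pthread_spin_init(&list_lock, 0) is assumed done at startup */
static pthread_spinlock_t list_lock;	/* plays the pagecache_lock role */
static struct obj *list_head;

/* nonzero if the object was already locked, like TryLockPage() */
static int obj_trylock(struct obj *o)
{
	return atomic_exchange(&o->locked, true);
}

static void obj_wait_unlocked(struct obj *o)
{
	while (atomic_load(&o->locked))
		sched_yield();	/* the kernel sleeps on page->wait instead */
}

static struct obj *grab_first_locked(void)
{
	struct obj *o;
repeat:
	pthread_spin_lock(&list_lock);
	o = list_head;
	if (o && obj_trylock(o)) {
		/* never sleep under the spinlock: drop it, wait for the
		 * object, then rescan - the list may have changed */
		pthread_spin_unlock(&list_lock);
		obj_wait_unlocked(o);
		goto repeat;
	}
	pthread_spin_unlock(&list_lock);
	return o;	/* NULL, or locked and owned by the caller */
}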
-
/*
* Truncate the page cache at a set offset, removing the pages
* that are beyond that offset (and zeroing out partial pages).
@@ -90,55 +152,90 @@ void truncate_inode_pages(struct inode * inode, unsigned long start)
{
struct page ** p;
struct page * page;
+ int partial = 0;
repeat:
+ spin_lock(&pagecache_lock);
p = &inode->i_pages;
while ((page = *p) != NULL) {
unsigned long offset = page->offset;
/* page wholly truncated - free it */
if (offset >= start) {
- if (PageLocked(page)) {
- wait_on_page(page);
- goto repeat;
- }
- inode->i_nrpages--;
- if ((*p = page->next) != NULL)
- (*p)->prev = page->prev;
- page->next = NULL;
- page->prev = NULL;
- remove_page_from_hash_queue(page);
- page->inode = NULL;
+ get_page(page);
+ spin_unlock(&pagecache_lock);
+
+ lock_page(page);
+
+ if (inode->i_op->flushpage)
+ inode->i_op->flushpage(inode, page, 0);
+
+ /*
+ * We remove the page from the page cache
+ * _after_ we have destroyed all buffer-cache
+ * references to it. Otherwise some other process
+ * might think this inode page is not in the
+ * page cache and creates a buffer-cache alias
+ * to it causing all sorts of fun problems ...
+ */
+ remove_inode_page(page);
+
+ UnlockPage(page);
page_cache_release(page);
- continue;
+ page_cache_release(page);
+
+ /*
+ * We have done things without the pagecache lock,
+ * so we'll have to repeat the scan.
+ * It's not possible to deadlock here because
+ * we are guaranteed to make progress. (ie. we have
+ * just removed a page)
+ */
+ goto repeat;
}
p = &page->next;
+ /*
+ * there is only one partial page possible.
+ */
+ if (partial)
+ continue;
+
offset = start - offset;
/* partial truncate, clear end of page */
if (offset < PAGE_CACHE_SIZE) {
- unsigned long address = page_address(page);
+ unsigned long address;
+ get_page(page);
+ spin_unlock(&pagecache_lock);
+
+ lock_page(page);
+ partial = 1;
+
+ address = page_address(page);
memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
flush_page_to_ram(address);
+
+ if (inode->i_op->flushpage)
+ inode->i_op->flushpage(inode, page, offset);
+ /*
+ * we have dropped the spinlock so we have to
+ * restart.
+ */
+ UnlockPage(page);
+ page_cache_release(page);
+ goto repeat;
}
}
+ spin_unlock(&pagecache_lock);
}
-/*
- * Remove a page from the page cache and free it.
- */
-void remove_inode_page(struct page *page)
-{
- remove_page_from_hash_queue(page);
- remove_page_from_inode_queue(page);
- page_cache_release(page);
-}
+extern atomic_t too_many_dirty_buffers;
int shrink_mmap(int priority, int gfp_mask)
{
static unsigned long clock = 0;
unsigned long limit = num_physpages;
struct page * page;
- int count;
+ int count, users;
count = limit >> priority;
@@ -164,15 +261,67 @@ int shrink_mmap(int priority, int gfp_mask)
referenced = test_and_clear_bit(PG_referenced, &page->flags);
- if (PageLocked(page))
+ if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
continue;
- if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
+ /*
+ * Some common cases that we just short-circuit without
+ * getting the locks - we need to re-check this once we
+ * have the lock, but that's fine.
+ */
+ users = page_count(page);
+ if (!users)
+ continue;
+ if (!page->buffers) {
+ if (!page->inode)
+ continue;
+ if (users > 1)
+ continue;
+ }
+
+ /*
+ * ok, now the page looks interesting. Re-check things
+ * and keep the lock.
+ */
+ spin_lock(&pagecache_lock);
+ if (!page->inode && !page->buffers) {
+ spin_unlock(&pagecache_lock);
continue;
+ }
+ if (!page_count(page)) {
+// BUG();
+ spin_unlock(&pagecache_lock);
+ continue;
+ }
+ get_page(page);
+ if (TryLockPage(page)) {
+ spin_unlock(&pagecache_lock);
+ goto put_continue;
+ }
+
+ /*
+ * we keep pagecache_lock locked and unlock it in
+ * each branch, so that the page->inode case doesn't
+ * have to re-grab it. Here comes the 'real' logic
+ * to free memory:
+ */
+
+ /* Is it a buffer page? */
+ if (page->buffers) {
+ kdev_t dev = page->buffers->b_dev;
+ spin_unlock(&pagecache_lock);
+ if (try_to_free_buffers(page))
+ goto made_progress;
+ if (!atomic_read(&too_many_dirty_buffers)) {
+ atomic_set(&too_many_dirty_buffers, 1);
+ balance_dirty(dev);
+ }
+ goto unlock_continue;
+ }
/* We can't free pages unless there's just one user */
- if (atomic_read(&page->count) != 1)
- continue;
+ if (page_count(page) != 2)
+ goto spin_unlock_continue;
count--;
@@ -182,77 +331,180 @@ int shrink_mmap(int priority, int gfp_mask)
* were to be marked referenced..
*/
if (PageSwapCache(page)) {
- if (referenced && swap_count(page->offset) != 1)
- continue;
- delete_from_swap_cache(page);
- return 1;
+ spin_unlock(&pagecache_lock);
+ if (referenced && swap_count(page->offset) != 2)
+ goto unlock_continue;
+ __delete_from_swap_cache(page);
+ page_cache_release(page);
+ goto made_progress;
}
- if (referenced)
- continue;
-
- /* Is it a buffer page? */
- if (page->buffers) {
- if (buffer_under_min())
- continue;
- if (!try_to_free_buffers(page))
- continue;
- return 1;
- }
-
/* is it a page-cache page? */
- if (page->inode) {
- if (pgcache_under_min())
- continue;
- remove_inode_page(page);
- return 1;
- }
+ if (!referenced && page->inode && !pgcache_under_min()) {
+ remove_page_from_inode_queue(page);
+ remove_page_from_hash_queue(page);
+ page->inode = NULL;
+ spin_unlock(&pagecache_lock);
+ page_cache_release(page);
+ goto made_progress;
+ }
+spin_unlock_continue:
+ spin_unlock(&pagecache_lock);
+unlock_continue:
+ UnlockPage(page);
+put_continue:
+ put_page(page);
} while (count > 0);
return 0;
+made_progress:
+ UnlockPage(page);
+ put_page(page);
+ return 1;
+}
+
+static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page)
+{
+ goto inside;
+
+ for (;;) {
+ page = page->next_hash;
+inside:
+ if (!page)
+ goto not_found;
+ if (page->inode != inode)
+ continue;
+ if (page->offset == offset)
+ break;
+ }
+not_found:
+ return page;
}
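
The jump into the middle of the loop only saves one pointer chase on the first iteration; for readability, the same scan in conventional form (a sketch, hypothetical name):

static inline struct page * __find_page_simple(struct inode * inode,
	unsigned long offset, struct page *page)
{
	for (; page; page = page->next_hash)
		if (page->inode == inode && page->offset == offset)
			break;
	return page;	/* NULL if not found */
}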
/*
- * Update a page cache copy, when we're doing a "write()" system call
- * See also "update_vm_cache()".
+ * By the time this is called, the page is locked and
+ * we don't have to worry about any races any more.
+ *
+ * Start the IO..
*/
-void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
+static int writeout_one_page(struct page *page)
{
- unsigned long offset, len;
+ struct buffer_head *bh, *head = page->buffers;
- offset = (pos & ~PAGE_CACHE_MASK);
- pos = pos & PAGE_CACHE_MASK;
- len = PAGE_CACHE_SIZE - offset;
+ bh = head;
do {
- struct page * page;
+ if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
+ continue;
- if (len > count)
- len = count;
- page = find_page(inode, pos);
- if (page) {
- wait_on_page(page);
- memcpy((void *) (offset + page_address(page)), buf, len);
- page_cache_release(page);
- }
- count -= len;
- buf += len;
- len = PAGE_CACHE_SIZE;
- offset = 0;
- pos += PAGE_CACHE_SIZE;
- } while (count);
+ bh->b_flushtime = 0;
+ ll_rw_block(WRITE, 1, &bh);
+ } while ((bh = bh->b_this_page) != head);
+ return 0;
+}
+
+static int waitfor_one_page(struct page *page)
+{
+ int error = 0;
+ struct buffer_head *bh, *head = page->buffers;
+
+ bh = head;
+ do {
+ wait_on_buffer(bh);
+ if (buffer_req(bh) && !buffer_uptodate(bh))
+ error = -EIO;
+ } while ((bh = bh->b_this_page) != head);
+ return error;
+}
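
Both helpers walk the page's buffer ring: b_this_page links every buffer_head on the page into a circular list, so a do/while that stops when it comes back to head visits each buffer exactly once. A self-contained model of that traversal (toy types, not the kernel's):

#include <assert.h>

struct ring {			/* stands in for struct buffer_head */
	int visited;
	struct ring *next;	/* stands in for b_this_page */
};

static int visit_all(struct ring *head)
{
	struct ring *cur = head;
	int n = 0;
	do {
		cur->visited = 1;
		n++;
	} while ((cur = cur->next) != head);
	return n;
}

int main(void)
{
	struct ring a, b, c;

	a.next = &b; b.next = &c; c.next = &a;	/* close the ring */
	a.visited = b.visited = c.visited = 0;
	assert(visit_all(&a) == 3 && c.visited);
	return 0;
}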
+
+static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
+{
+ struct page *next;
+ int retval = 0;
+
+ start &= PAGE_MASK;
+
+ spin_lock(&pagecache_lock);
+ next = inode->i_pages;
+ while (next) {
+ struct page *page = next;
+ next = page->next;
+ if (!page->buffers)
+ continue;
+ if (page->offset >= end)
+ continue;
+ if (page->offset < start)
+ continue;
+
+ get_page(page);
+ spin_unlock(&pagecache_lock);
+ lock_page(page);
+
+ /* The buffers could have been freed while we waited for the page lock */
+ if (page->buffers)
+ retval |= fn(page);
+
+ UnlockPage(page);
+ spin_lock(&pagecache_lock);
+ next = page->next;
+ page_cache_release(page);
+ }
+ spin_unlock(&pagecache_lock);
+
+ return retval;
+}
+
+/*
+ * Two-stage data sync: first start the IO, then go back and
+ * collect the information..
+ */
+int generic_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end)
+{
+ int retval;
+
+ retval = do_buffer_fdatasync(inode, start, end, writeout_one_page);
+ retval |= do_buffer_fdatasync(inode, start, end, waitfor_one_page);
+ return retval;
}
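
As a usage sketch, a filesystem could route its fsync() through this helper to write out and then wait on a whole file; the caller name and the 0..i_size range below are illustrative assumptions, not part of this patch:

/* hypothetical caller - start IO, then collect errors, for the whole file */
static int myfs_fsync(struct file *file, struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;

	return generic_buffer_fdatasync(inode, 0, inode->i_size);
}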
-static inline void add_to_page_cache(struct page * page,
+/*
+ * This adds a page to the page cache, starting out as locked,
+ * owned by us, referenced, but not uptodate and with no errors.
+ */
+static inline void __add_to_page_cache(struct page * page,
struct inode * inode, unsigned long offset,
struct page **hash)
{
- atomic_inc(&page->count);
- page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced);
+ unsigned long flags;
+
+ flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error));
+ page->flags = flags | ((1 << PG_locked) | (1 << PG_referenced));
+ page->owner = (int)current; /* REMOVEME */
+ get_page(page);
page->offset = offset;
add_page_to_inode_queue(inode, page);
__add_page_to_hash_queue(page, hash);
}
+int add_to_page_cache_unique(struct page * page,
+ struct inode * inode, unsigned long offset,
+ struct page **hash)
+{
+ int err;
+ struct page *alias;
+
+ spin_lock(&pagecache_lock);
+ alias = __find_page_nolock(inode, offset, *hash);
+
+ err = 1;
+ if (!alias) {
+ __add_to_page_cache(page,inode,offset,hash);
+ err = 0;
+ }
+
+ spin_unlock(&pagecache_lock);
+ return err;
+}
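
Callers pair this with an optimistic allocation done outside the spinlock and simply retry the lookup if another CPU won the race; condensed from the generic_file_write() path later in this patch (a fragment, not a complete function):

repeat_find:
	page = __find_lock_page(inode, offset, hash);
	if (!page) {
		if (!page_cache) {
			page_cache = page_cache_alloc();
			if (!page_cache)
				return -ENOMEM;
			goto repeat_find;	/* re-check for a racer */
		}
		page = page_cache_entry(page_cache);
		if (add_to_page_cache_unique(page, inode, offset, hash))
			goto repeat_find;	/* lost the race - retry */
		page_cache = 0;		/* our page is in the cache, locked */
	}
	/* here: page is locked and referenced, ready for IO */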
+
/*
* Try to read ahead in the file. "page_cache" is a potentially free page
* that we could use for the cache (if it is 0 we can try to create one,
@@ -275,45 +527,173 @@ static unsigned long try_to_read_ahead(struct file * file,
if (offset >= inode->i_size)
break;
hash = page_hash(inode, offset);
- page = __find_page(inode, offset, *hash);
- if (!page) {
+ page = page_cache_entry(page_cache);
+ if (!add_to_page_cache_unique(page, inode, offset, hash)) {
/*
- * Ok, add the new page to the hash-queues...
+ * We do not have to check the return value here
+ * because it's a readahead.
*/
- page = page_cache_entry(page_cache);
- add_to_page_cache(page, inode, offset, hash);
inode->i_op->readpage(file, page);
page_cache = 0;
+ page_cache_release(page);
}
- page_cache_release(page);
}
return page_cache;
}
/*
- * Wait for IO to complete on a locked page.
+ * Wait for a page to get unlocked.
*
* This must be called with the caller "holding" the page,
* ie with increased "page->count" so that the page won't
* go away during the wait..
*/
-void __wait_on_page(struct page *page)
+void ___wait_on_page(struct page *page)
{
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);
add_wait_queue(&page->wait, &wait);
-repeat:
- tsk->state = TASK_UNINTERRUPTIBLE;
- run_task_queue(&tq_disk);
- if (PageLocked(page)) {
+ do {
+ tsk->state = TASK_UNINTERRUPTIBLE;
+ run_task_queue(&tq_disk);
+ if (!PageLocked(page))
+ break;
schedule();
- goto repeat;
- }
+ } while (PageLocked(page));
tsk->state = TASK_RUNNING;
remove_wait_queue(&page->wait, &wait);
}
+/*
+ * Get an exclusive lock on the page..
+ */
+void lock_page(struct page *page)
+{
+ if (TryLockPage(page)) {
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, current);
+
+ run_task_queue(&tq_disk);
+ add_wait_queue(&page->wait, &wait);
+ tsk->state = TASK_UNINTERRUPTIBLE;
+
+ while (TryLockPage(page)) {
+ run_task_queue(&tq_disk);
+ schedule();
+ tsk->state = TASK_UNINTERRUPTIBLE;
+ }
+
+ remove_wait_queue(&page->wait, &wait);
+ tsk->state = TASK_RUNNING;
+ }
+}
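
lock_page() is the sleeping counterpart of TryLockPage(): keep trying, and between attempts sleep on the page's wait queue, re-setting TASK_UNINTERRUPTIBLE before each retry so a wakeup between the test and the schedule() is not lost. A userspace analogue built on a condition variable instead of the wait queue (a sketch under that substitution, not the kernel implementation):

#include <pthread.h>
#include <stdbool.h>

struct upage {
	bool locked;
	pthread_mutex_t m;
	pthread_cond_t unlocked;
};

#define UPAGE_INIT { false, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER }

static void upage_lock(struct upage *p)
{
	pthread_mutex_lock(&p->m);
	while (p->locked)	/* re-test after every wakeup, as above */
		pthread_cond_wait(&p->unlocked, &p->m);
	p->locked = true;
	pthread_mutex_unlock(&p->m);
}

static void upage_unlock(struct upage *p)
{
	pthread_mutex_lock(&p->m);
	p->locked = false;
	pthread_mutex_unlock(&p->m);
	pthread_cond_broadcast(&p->unlocked);	/* like wake_up(&page->wait) */
}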
+
+
+/*
+ * a rather lightweight function, finding and getting a reference to a
+ * hashed page atomically, waiting for it if it's locked.
+ */
+struct page * __find_get_page (struct inode * inode,
+ unsigned long offset, struct page **hash)
+{
+ struct page *page;
+
+ /*
+ * We scan the hash list read-only. Addition to and removal from
+ * the hash-list needs a held write-lock.
+ */
+repeat:
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(inode, offset, *hash);
+ if (page)
+ get_page(page);
+ spin_unlock(&pagecache_lock);
+
+ /* Found the page, sleep if locked. */
+ if (page && PageLocked(page)) {
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
+
+ add_wait_queue(&page->wait, &wait);
+ tsk->state = TASK_UNINTERRUPTIBLE;
+
+ run_task_queue(&tq_disk);
+ if (PageLocked(page))
+ schedule();
+ tsk->state = TASK_RUNNING;
+ remove_wait_queue(&page->wait, &wait);
+
+ /*
+ * The page might have been unhashed meanwhile. It's
+ * not freed though because we hold a reference to it.
+ * If this is the case then it will be freed _here_,
+ * and we recheck the hash anyway.
+ */
+ page_cache_release(page);
+ goto repeat;
+ }
+ /*
+ * It's not locked so we can return the page and we hold
+ * a reference to it.
+ */
+ return page;
+}
+
+/*
+ * Get the lock to a page atomically.
+ */
+struct page * __find_lock_page (struct inode * inode,
+ unsigned long offset, struct page **hash)
+{
+ int locked;
+ struct page *page;
+
+ /*
+ * We scan the hash list read-only. Addition to and removal from
+ * the hash-list needs a held write-lock.
+ */
+repeat:
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(inode, offset, *hash);
+ locked = 0;
+ if (page) {
+ get_page(page);
+ if (TryLockPage(page))
+ locked = 1;
+ }
+ spin_unlock(&pagecache_lock);
+
+ /* Found the page, sleep if locked. */
+ if (page && locked) {
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
+
+ add_wait_queue(&page->wait, &wait);
+ tsk->state = TASK_UNINTERRUPTIBLE;
+
+ run_task_queue(&tq_disk);
+ if (PageLocked(page))
+ schedule();
+ tsk->state = TASK_RUNNING;
+ remove_wait_queue(&page->wait, &wait);
+
+ /*
+ * The page might have been unhashed meanwhile. It's
+ * not freed though because we hold a reference to it.
+ * If this is the case then it will be freed _here_,
+ * and we recheck the hash anyway.
+ */
+ page_cache_release(page);
+ goto repeat;
+ }
+ /*
+ * It's not locked so we can return the page and we hold
+ * a reference to it.
+ */
+ return page;
+}
+
#if 0
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD
@@ -386,14 +766,14 @@ static void profile_readahead(int async, struct file *filp)
* -------------------
* The read ahead context fields of the "struct file" are the following:
* - f_raend : position of the first byte after the last page we tried to
- * read ahead.
+ * read ahead.
* - f_ramax : current read-ahead maximum size.
* - f_ralen : length of the current IO read block we tried to read-ahead.
* - f_rawin : length of the current read-ahead window.
- * if last read-ahead was synchronous then
- * f_rawin = f_ralen
- * otherwise (was asynchronous)
- * f_rawin = previous value of f_ralen + f_ralen
+ * if last read-ahead was synchronous then
+ * f_rawin = f_ralen
+ * otherwise (was asynchronous)
+ * f_rawin = previous value of f_ralen + f_ralen
*
* Read-ahead limits:
* ------------------
@@ -485,7 +865,7 @@ static inline unsigned long generic_file_readahead(int reada_ok,
* We will later force unplug device in order to force asynchronous read IO.
*/
else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
- ppos <= raend && ppos + filp->f_ralen >= raend) {
+ ppos <= raend && ppos + filp->f_ralen >= raend) {
/*
* Add ONE page to max_ahead in order to try to have about the same IO max size
* as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
@@ -578,6 +958,7 @@ static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descript
struct inode *inode = dentry->d_inode;
size_t pos, pgpos, page_cache;
int reada_ok;
+ int error;
int max_readahead = get_max_readahead(inode);
page_cache = 0;
@@ -633,33 +1014,22 @@ static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descript
* Try to find the data in the page cache..
*/
hash = page_hash(inode, pos & PAGE_CACHE_MASK);
- page = __find_page(inode, pos & PAGE_CACHE_MASK, *hash);
+
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
if (!page)
goto no_cached_page;
-
found_page:
-/*
- * Try to read ahead only if the current page is filled or being filled.
- * Otherwise, if we were reading ahead, decrease max read ahead size to
- * the minimum value.
- * In this context, that seems to may happen only on some read error or if
- * the page has been rewritten.
- */
- if (PageUptodate(page) || PageLocked(page))
- page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
- else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
- filp->f_ramax = MIN_READAHEAD;
-
- wait_on_page(page);
-
- if (!PageUptodate(page))
- goto page_read_error;
+ get_page(page);
+ spin_unlock(&pagecache_lock);
-success:
- /*
- * Ok, we have the page, it's up-to-date and ok,
- * so now we can finally copy it to user space...
- */
+ if (!Page_Uptodate(page))
+ goto page_not_up_to_date;
+page_ok:
+ /*
+ * Ok, we have the page, and it's up-to-date, so
+ * now we can copy it to user space...
+ */
{
unsigned long offset, nr;
@@ -683,75 +1053,77 @@ success:
break;
}
+/*
+ * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
+ */
+page_not_up_to_date:
+ page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
+
+ if (Page_Uptodate(page))
+ goto page_ok;
+
+ /* Get exclusive access to the page ... */
+ lock_page(page);
+ if (Page_Uptodate(page)) {
+ UnlockPage(page);
+ goto page_ok;
+ }
+
+readpage:
+ /* ... and start the actual read. The read will unlock the page. */
+ error = inode->i_op->readpage(filp, page);
+
+ if (!error) {
+ if (Page_Uptodate(page))
+ goto page_ok;
+
+ /* Again, try some read-ahead while waiting for the page to finish.. */
+ page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
+ wait_on_page(page);
+ if (Page_Uptodate(page))
+ goto page_ok;
+ error = -EIO;
+ }
+
+ /* UHHUH! A synchronous read error occurred. Report it */
+ desc->error = error;
+ page_cache_release(page);
+ break;
+
no_cached_page:
/*
* Ok, it wasn't cached, so we need to create a new
* page..
+ *
+ * We get here with the page cache lock held.
*/
if (!page_cache) {
+ spin_unlock(&pagecache_lock);
page_cache = page_cache_alloc();
+ if (!page_cache) {
+ desc->error = -ENOMEM;
+ break;
+ }
+
/*
- * That could have slept, so go around to the
- * very beginning..
+ * Somebody may have added the page while we
+ * dropped the page cache lock. Check for that.
*/
- if (page_cache)
- continue;
- desc->error = -ENOMEM;
- break;
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
+ if (page)
+ goto found_page;
}
/*
* Ok, add the new page to the hash-queues...
*/
page = page_cache_entry(page_cache);
- page_cache = 0;
- add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);
-
- /*
- * Error handling is tricky. If we get a read error,
- * the cached page stays in the cache (but uptodate=0),
- * and the next process that accesses it will try to
- * re-read it. This is needed for NFS etc, where the
- * identity of the reader can decide if we can read the
- * page or not..
- */
-/*
- * We have to read the page.
- * If we were reading ahead, we had previously tried to read this page,
- * That means that the page has probably been removed from the cache before
- * the application process needs it, or has been rewritten.
- * Decrease max readahead size to the minimum value in that situation.
- */
- if (reada_ok && filp->f_ramax > MIN_READAHEAD)
- filp->f_ramax = MIN_READAHEAD;
-
- {
- int error = inode->i_op->readpage(filp, page);
- if (!error)
- goto found_page;
- desc->error = error;
- page_cache_release(page);
- break;
- }
+ __add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);
+ spin_unlock(&pagecache_lock);
-page_read_error:
- /*
- * We found the page, but it wasn't up-to-date.
- * Try to re-read it _once_. We do this synchronously,
- * because this happens only if there were errors.
- */
- {
- int error = inode->i_op->readpage(filp, page);
- if (!error) {
- wait_on_page(page);
- if (PageUptodate(page) && !PageError(page))
- goto success;
- error = -EIO; /* Some unspecified error occurred.. */
- }
- desc->error = error;
- page_cache_release(page);
- break;
- }
+ page_cache = 0;
+ goto readpage;
}
*ppos = pos;
@@ -787,6 +1159,7 @@ ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *
{
ssize_t retval;
+ unlock_kernel();
retval = -EFAULT;
if (access_ok(VERIFY_WRITE, buf, count)) {
retval = 0;
@@ -804,6 +1177,7 @@ ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *
retval = desc.error;
}
}
+ lock_kernel();
return retval;
}
@@ -812,17 +1186,14 @@ static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned
ssize_t written;
unsigned long count = desc->count;
struct file *file = (struct file *) desc->buf;
- struct inode *inode = file->f_dentry->d_inode;
mm_segment_t old_fs;
if (size > count)
size = count;
- down(&inode->i_sem);
old_fs = get_fs();
set_fs(KERNEL_DS);
written = file->f_op->write(file, area, size, &file->f_pos);
set_fs(old_fs);
- up(&inode->i_sem);
if (written < 0) {
desc->error = written;
written = 0;
@@ -878,6 +1249,7 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t cou
if (retval)
goto fput_out;
+ unlock_kernel();
retval = 0;
if (count) {
read_descriptor_t desc;
@@ -887,7 +1259,7 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t cou
ppos = &in_file->f_pos;
if (offset) {
if (get_user(pos, offset))
- goto fput_out;
+ goto fput_out_lock;
ppos = &pos;
}
@@ -904,7 +1276,8 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t cou
put_user(pos, offset);
}
-
+fput_out_lock:
+ lock_kernel();
fput_out:
fput(out_file);
fput_in:
@@ -934,17 +1307,21 @@ static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long
unsigned long offset, reada, i;
struct page * page, **hash;
unsigned long old_page, new_page;
+ int error;
new_page = 0;
offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
- goto no_page;
+ goto no_page_nolock;
+
+ unlock_kernel();
/*
* Do we have something in the page cache already?
*/
hash = page_hash(inode, offset);
- page = __find_page(inode, offset, *hash);
+retry_find:
+ page = __find_get_page(inode, offset, hash);
if (!page)
goto no_cached_page;
@@ -960,15 +1337,17 @@ found_page:
goto failure;
}
- if (PageLocked(page))
- goto page_locked_wait;
- if (!PageUptodate(page))
- goto page_read_error;
+ if (!Page_Uptodate(page)) {
+ lock_page(page);
+ if (!Page_Uptodate(page))
+ goto page_not_uptodate;
+ UnlockPage(page);
+ }
success:
/*
- * Found the page, need to check sharing and possibly
- * copy it over to another page..
+ * Found the page and have a reference on it, need to check sharing
+ * and possibly copy it over to another page..
*/
old_page = page_address(page);
if (!no_share) {
@@ -980,6 +1359,7 @@ success:
page_cache_free(new_page);
flush_page_to_ram(old_page);
+ lock_kernel();
return old_page;
}
@@ -989,6 +1369,7 @@ success:
copy_page(new_page, old_page);
flush_page_to_ram(new_page);
page_cache_release(page);
+ lock_kernel();
return new_page;
no_cached_page:
@@ -1013,7 +1394,7 @@ no_cached_page:
* cache.. The page we just got may be useful if we
* can't share, so don't get rid of it here.
*/
- page = find_page(inode, offset);
+ page = __find_get_page(inode, offset, hash);
if (page)
goto found_page;
@@ -1021,19 +1402,24 @@ no_cached_page:
* Now, create a new page-cache page from the page we got
*/
page = page_cache_entry(new_page);
- new_page = 0;
- add_to_page_cache(page, inode, offset, hash);
+ if (add_to_page_cache_unique(page, inode, offset, hash))
+ goto retry_find;
- if (inode->i_op->readpage(file, page) != 0)
- goto failure;
+ /*
+ * Now it's ours and locked, we can do initial IO to it:
+ */
+ new_page = 0;
- goto found_page;
+page_not_uptodate:
+ error = inode->i_op->readpage(file, page);
-page_locked_wait:
- __wait_on_page(page);
- if (PageUptodate(page))
+ if (!error) {
+ wait_on_page(page);
+ if (PageError(page))
+ goto page_read_error;
goto success;
-
+ }
+
page_read_error:
/*
* Umm, take care of errors if the page isn't up-to-date.
@@ -1041,12 +1427,14 @@ page_read_error:
* because there really aren't any performance issues here
* and we need to check for errors.
*/
- if (inode->i_op->readpage(file, page) != 0)
+ if (!PageLocked(page))
+ PAGE_BUG(page);
+ ClearPageError(page);
+ error = inode->i_op->readpage(file, page);
+ if (error)
goto failure;
wait_on_page(page);
- if (PageError(page))
- goto failure;
- if (PageUptodate(page))
+ if (Page_Uptodate(page))
goto success;
/*
@@ -1058,6 +1446,8 @@ failure:
if (new_page)
page_cache_free(new_page);
no_page:
+ lock_kernel();
+no_page_nolock:
return 0;
}
@@ -1066,12 +1456,13 @@ no_page:
* if the disk is full.
*/
static inline int do_write_page(struct inode * inode, struct file * file,
- const char * page, unsigned long offset)
+ const char * page_addr, unsigned long offset)
{
int retval;
unsigned long size;
loff_t loff = offset;
- mm_segment_t old_fs;
+ int (*writepage) (struct file *, struct page *);
+ struct page * page;
size = offset + PAGE_SIZE;
/* refuse to extend file size.. */
@@ -1083,12 +1474,21 @@ static inline int do_write_page(struct inode * inode, struct file * file,
return -EIO;
}
size -= offset;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
retval = -EIO;
- if (size == file->f_op->write(file, (const char *) page, size, &loff))
- retval = 0;
- set_fs(old_fs);
+ writepage = inode->i_op->writepage;
+ page = mem_map + MAP_NR(page_addr);
+ lock_page(page);
+
+ if (writepage) {
+ retval = writepage(file, page);
+ } else {
+ mm_segment_t old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ if (size == file->f_op->write(file, page_addr, size, &loff))
+ retval = 0;
+ set_fs(old_fs);
+ }
+ UnlockPage(page);
return retval;
}
@@ -1124,9 +1524,7 @@ static int filemap_write_page(struct vm_area_struct * vma,
return 0;
}
- down(&inode->i_sem);
result = do_write_page(inode, file, (const char *) page, offset);
- up(&inode->i_sem);
fput(file);
return result;
}
@@ -1146,7 +1544,8 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
pte_t pte = *ptep;
- unsigned long page;
+ unsigned long pageaddr;
+ struct page *page;
int error;
if (!(flags & MS_INVALIDATE)) {
@@ -1158,8 +1557,9 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
flush_cache_page(vma, address);
set_pte(ptep, pte_mkclean(pte));
flush_tlb_page(vma, address);
- page = pte_page(pte);
- atomic_inc(&page_cache_entry(page)->count);
+ pageaddr = pte_page(pte);
+ page = page_cache_entry(pageaddr);
+ get_page(page);
} else {
if (pte_none(pte))
return 0;
@@ -1170,14 +1570,14 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
swap_free(pte_val(pte));
return 0;
}
- page = pte_page(pte);
+ pageaddr = pte_page(pte);
if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
- page_cache_free(page);
+ page_cache_free(pageaddr);
return 0;
}
}
- error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1);
- page_cache_free(page);
+ error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, pageaddr, 1);
+ page_cache_free(pageaddr);
return error;
}
@@ -1338,10 +1738,7 @@ static int msync_interval(struct vm_area_struct * vma,
struct file * file = vma->vm_file;
if (file) {
struct dentry * dentry = file->f_dentry;
- struct inode * inode = dentry->d_inode;
- down(&inode->i_sem);
error = file_fsync(file, dentry);
- up(&inode->i_sem);
}
}
return error;
@@ -1436,11 +1833,12 @@ generic_file_write(struct file *file, const char *buf,
unsigned long page_cache = 0;
unsigned long written;
long status;
+ int err;
- if (file->f_error) {
- int error = file->f_error;
+ err = file->f_error;
+ if (err) {
file->f_error = 0;
- return error;
+ goto out;
}
written = 0;
@@ -1451,7 +1849,7 @@ generic_file_write(struct file *file, const char *buf,
/*
* Check whether we've reached the file size limit.
*/
- status = -EFBIG;
+ err = -EFBIG;
if (pos >= limit) {
send_sig(SIGXFSZ, current, 0);
goto out;
@@ -1467,6 +1865,8 @@ generic_file_write(struct file *file, const char *buf,
count = limit - pos;
}
+ unlock_kernel();
+
while (count) {
unsigned long bytes, pgpos, offset;
/*
@@ -1480,29 +1880,36 @@ generic_file_write(struct file *file, const char *buf,
bytes = count;
hash = page_hash(inode, pgpos);
- page = __find_page(inode, pgpos, *hash);
+repeat_find:
+ page = __find_lock_page(inode, pgpos, hash);
if (!page) {
if (!page_cache) {
page_cache = page_cache_alloc();
if (page_cache)
- continue;
+ goto repeat_find;
status = -ENOMEM;
break;
}
page = page_cache_entry(page_cache);
- add_to_page_cache(page, inode, pgpos, hash);
+ if (add_to_page_cache_unique(page,inode,pgpos,hash))
+ goto repeat_find;
+
page_cache = 0;
}
- /* Get exclusive IO access to the page.. */
- wait_on_page(page);
- set_bit(PG_locked, &page->flags);
+ /* We have exclusive IO access to the page.. */
+ if (!PageLocked(page)) {
+ PAGE_BUG(page);
+ } else {
+ if (page->owner != (int)current) {
+ PAGE_BUG(page);
+ }
+ }
status = write_one_page(file, page, offset, bytes, buf);
/* Mark it unlocked again and drop the page.. */
- clear_bit(PG_locked, &page->flags);
- wake_up(&page->wait);
+ UnlockPage(page);
page_cache_release(page);
if (status < 0)
@@ -1519,51 +1926,16 @@ generic_file_write(struct file *file, const char *buf,
if (page_cache)
page_cache_free(page_cache);
+
+ err = written ? written : status;
+ lock_kernel();
out:
- return written ? written : status;
+ return err;
}
/*
- * Support routines for directory cacheing using the page cache.
- */
-
-/*
- * Finds the page at the specified offset, installing a new page
- * if requested. The count is incremented and the page is locked.
- *
- * Note: we don't have to worry about races here, as the caller
- * is holding the inode semaphore.
+ * Support routines for directory caching using the page cache.
*/
-unsigned long get_cached_page(struct inode * inode, unsigned long offset,
- int new)
-{
- struct page * page;
- struct page ** hash;
- unsigned long page_cache = 0;
-
- hash = page_hash(inode, offset);
- page = __find_page(inode, offset, *hash);
- if (!page) {
- if (!new)
- goto out;
- page_cache = page_cache_alloc();
- if (!page_cache)
- goto out;
- clear_page(page_cache);
- page = page_cache_entry(page_cache);
- add_to_page_cache(page, inode, offset, hash);
- }
- if (atomic_read(&page->count) != 2)
- printk(KERN_ERR "get_cached_page: page count=%d\n",
- atomic_read(&page->count));
- if (test_bit(PG_locked, &page->flags))
- printk(KERN_ERR "get_cached_page: page already locked!\n");
- set_bit(PG_locked, &page->flags);
- page_cache = page_address(page);
-
-out:
- return page_cache;
-}
/*
* Unlock and free a page.
@@ -1572,13 +1944,10 @@ void put_cached_page(unsigned long addr)
{
struct page * page = page_cache_entry(addr);
- if (!test_bit(PG_locked, &page->flags))
- printk("put_cached_page: page not locked!\n");
- if (atomic_read(&page->count) != 2)
- printk("put_cached_page: page count=%d\n",
- atomic_read(&page->count));
- clear_bit(PG_locked, &page->flags);
- wake_up(&page->wait);
+ UnlockPage(page);
+ if (page_count(page) != 2)
+ panic("put_cached_page: page count=%d\n",
+ page_count(page));
page_cache_release(page);
}
@@ -1607,11 +1976,13 @@ static inline struct pio_request * get_pio_request(void)
static inline void make_pio_request(struct file *file,
unsigned long offset,
- unsigned long page)
+ unsigned long pageaddr)
{
struct pio_request *p;
+ struct page *page;
- atomic_inc(&page_cache_entry(page)->count);
+ page = page_cache_entry(pageaddr);
+ get_page(page);
/*
* We need to allocate without causing any recursive IO in the
@@ -1634,7 +2005,7 @@ static inline void make_pio_request(struct file *file,
p->file = file;
p->offset = offset;
- p->page = page;
+ p->page = pageaddr;
put_pio_request(p);
wake_up(&pio_wait);
@@ -1694,10 +2065,8 @@ int kpiod(void * unused)
dentry = p->file->f_dentry;
inode = dentry->d_inode;
- down(&inode->i_sem);
do_write_page(inode, p->file,
(const char *) p->page, p->offset);
- up(&inode->i_sem);
fput(p->file);
page_cache_free(p->page);
kmem_cache_free(pio_request_cache, p);
diff --git a/mm/memory.c b/mm/memory.c
index ae56831b3..aac203bbb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -272,7 +272,7 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
if (vma->vm_flags & VM_SHARED)
pte = pte_mkclean(pte);
set_pte(dst_pte, pte_mkold(pte));
- atomic_inc(&mem_map[page_nr].count);
+ get_page(mem_map + page_nr);
cont_copy_pte_range: address += PAGE_SIZE;
if (address >= end)
@@ -556,7 +556,7 @@ unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsig
if (MAP_NR(page) >= max_mapnr)
printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
- if (atomic_read(&mem_map[MAP_NR(page)].count) != 1)
+ if (page_count(mem_map + MAP_NR(page)) != 1)
printk("mem_map disagrees with %08lx at %08lx\n",page,address);
pgd = pgd_offset(tsk->mm,address);
pmd = pmd_alloc(pgd, address);
@@ -604,17 +604,17 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
unsigned long address, pte_t *page_table, pte_t pte)
{
unsigned long old_page, new_page;
- struct page * page_map;
+ struct page * page;
new_page = __get_free_page(GFP_USER);
- /* Did swap_out() unmapped the protected page while we slept? */
+ /* Did swap_out() unmap the protected page while we slept? */
if (pte_val(*page_table) != pte_val(pte))
goto end_wp_page;
old_page = pte_page(pte);
if (MAP_NR(old_page) >= max_mapnr)
goto bad_wp_page;
tsk->min_flt++;
- page_map = mem_map + MAP_NR(old_page);
+ page = mem_map + MAP_NR(old_page);
/*
* We can avoid the copy if:
@@ -624,13 +624,13 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
* in which case we can remove the page
* from the swap cache.
*/
- switch (atomic_read(&page_map->count)) {
+ switch (page_count(page)) {
case 2:
- if (!PageSwapCache(page_map))
+ if (!PageSwapCache(page))
break;
- if (swap_count(page_map->offset) != 1)
+ if (swap_count(page->offset) != 1)
break;
- delete_from_swap_cache(page_map);
+ delete_from_swap_cache(page);
/* FallThrough */
case 1:
flush_cache_page(vma, address);
@@ -652,7 +652,7 @@ end_wp_page:
if (!new_page)
goto no_new_page;
- if (PageReserved(page_map))
+ if (PageReserved(page))
++vma->vm_mm->rss;
copy_cow_page(old_page,new_page);
flush_page_to_ram(old_page);
@@ -661,7 +661,7 @@ end_wp_page:
set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
flush_tlb_page(vma, address);
unlock_kernel();
- __free_page(page_map);
+ __free_page(page);
return 1;
bad_wp_page:
@@ -776,7 +776,7 @@ static int do_swap_page(struct task_struct * tsk,
if (pte_val(*page_table) != pte_val(entry)) {
free_page(pte_page(page));
} else {
- if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 &&
+ if (page_count(mem_map + MAP_NR(pte_page(page))) > 1 &&
!(vma->vm_flags & VM_SHARED))
page = pte_wrprotect(page);
++vma->vm_mm->rss;
@@ -861,7 +861,7 @@ static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
entry = mk_pte(page, vma->vm_page_prot);
if (write_access) {
entry = pte_mkwrite(pte_mkdirty(entry));
- } else if (atomic_read(&mem_map[MAP_NR(page)].count) > 1 &&
+ } else if (page_count(mem_map+MAP_NR(page)) > 1 &&
!(vma->vm_flags & VM_SHARED))
entry = pte_wrprotect(entry);
set_pte(page_table, entry);
diff --git a/mm/mmap.c b/mm/mmap.c
index 6e5eda00d..e179a2932 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -63,7 +63,7 @@ int vm_enough_memory(long pages)
return 1;
free = buffermem >> PAGE_SHIFT;
- free += page_cache_size;
+ free += atomic_read(&page_cache_size);
free += nr_free_pages;
free += nr_swap_pages;
free -= (page_cache.min_percent + buffer_mem.min_percent + 2)*num_physpages/100;
@@ -728,6 +728,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
struct vm_area_struct * vma;
unsigned long flags, retval;
+ len = PAGE_ALIGN(len);
+ if (!len)
+ return addr;
+
/*
* mlock MCL_FUTURE?
*/
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8826b9af1..fad87ba27 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -119,33 +119,33 @@ static inline void free_pages_ok(unsigned long map_nr, unsigned long order)
spin_unlock_irqrestore(&page_alloc_lock, flags);
}
-void __free_page(struct page *page)
+int __free_page(struct page *page)
{
- if (!PageReserved(page) && atomic_dec_and_test(&page->count)) {
+ if (!PageReserved(page) && put_page_testzero(page)) {
if (PageSwapCache(page))
- panic ("Freeing swap cache page");
+ PAGE_BUG(page);
page->flags &= ~(1 << PG_referenced);
free_pages_ok(page - mem_map, 0);
- return;
+ return 1;
}
+ return 0;
}
-void free_pages(unsigned long addr, unsigned long order)
+int free_pages(unsigned long addr, unsigned long order)
{
unsigned long map_nr = MAP_NR(addr);
if (map_nr < max_mapnr) {
mem_map_t * map = mem_map + map_nr;
- if (PageReserved(map))
- return;
- if (atomic_dec_and_test(&map->count)) {
+ if (!PageReserved(map) && put_page_testzero(map)) {
if (PageSwapCache(map))
- panic ("Freeing swap cache pages");
+ PAGE_BUG(map);
map->flags &= ~(1 << PG_referenced);
free_pages_ok(map_nr, order);
- return;
+ return 1;
}
}
+ return 0;
}
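
Both free routines now hinge on put_page_testzero(): atomically drop the reference count and report whether this caller took it to zero, so that exactly one path performs the actual free. A C11-atomics model of that primitive (the kernel uses its own atomic_dec_and_test):

#include <stdatomic.h>
#include <stdbool.h>

/* true only for the caller that moves the count from 1 to 0 */
static bool put_testzero(atomic_int *count)
{
	/* fetch_sub returns the old value; old == 1 means we hit zero */
	return atomic_fetch_sub(count, 1) == 1;
}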
/*
@@ -167,7 +167,7 @@ do { struct free_area_struct * area = free_area+order; \
MARK_USED(map_nr, new_order, area); \
nr_free_pages -= 1 << order; \
EXPAND(ret, map_nr, order, new_order, area); \
- spin_unlock_irqrestore(&page_alloc_lock, flags); \
+ spin_unlock_irqrestore(&page_alloc_lock,flags);\
return ADDRESS(map_nr); \
} \
prev = ret; \
@@ -186,7 +186,7 @@ do { unsigned long size = 1 << high; \
index += size; \
map += size; \
} \
- atomic_set(&map->count, 1); \
+ set_page_count(map, 1); \
} while (0)
int low_on_memory = 0;
@@ -321,7 +321,7 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m
memset(mem_map, 0, start_mem - (unsigned long) mem_map);
do {
--p;
- atomic_set(&p->count, 0);
+ set_page_count(p, 0);
p->flags = (1 << PG_DMA) | (1 << PG_reserved);
init_waitqueue_head(&p->wait);
} while (p > mem_map);
diff --git a/mm/page_io.c b/mm/page_io.c
index 9f5e82446..2226c2c9d 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -47,7 +47,7 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in
#ifdef DEBUG_SWAP
printk ("DebugVM: %s_swap_page entry %08lx, page %p (count %d), %s\n",
(rw == READ) ? "read" : "write",
- entry, (char *) page_address(page), atomic_read(&page->count),
+ entry, (char *) page_address(page), page_count(page),
wait ? "wait" : "nowait");
#endif
@@ -105,12 +105,12 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in
}
}
if (rw == READ) {
- clear_bit(PG_uptodate, &page->flags);
+ ClearPageUptodate(page);
kstat.pswpin++;
} else
kstat.pswpout++;
- atomic_inc(&page->count);
+ get_page(page);
if (p->swap_device) {
zones[0] = offset;
zones_used = 1;
@@ -167,7 +167,7 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in
printk("swap_after_unlock_page: lock already cleared\n");
wake_up(&lock_queue);
}
- atomic_dec(&page->count);
+ put_page(page);
return;
}
if (!wait) {
@@ -182,23 +182,24 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in
/* block_size == PAGE_SIZE/zones_used */
brw_page(rw, page, dev, zones, block_size, 0);
-
+
/* Note! For consistency we do all of the logic,
* decrementing the page count, and unlocking the page in the
* swap lock map - in the IO completion handler.
*/
- if (!wait)
+ if (!wait) {
return;
+ }
wait_on_page(page);
/* This shouldn't happen, but check to be sure. */
- if (atomic_read(&page->count) == 0)
+ if (page_count(page) == 0)
printk(KERN_ERR "rw_swap_page: page unused while waiting!\n");
#ifdef DEBUG_SWAP
printk ("DebugVM: %s_swap_page finished on page %p (count %d)\n",
(rw == READ) ? "read" : "write",
- (char *) page_adddress(page),
- atomic_read(&page->count));
+ (char *) page_address(page),
+ page_count(page));
#endif
}
@@ -238,7 +239,7 @@ void rw_swap_page(int rw, unsigned long entry, char *buf, int wait)
struct page *page = mem_map + MAP_NR(buf);
if (page->inode && page->inode != &swapper_inode)
- panic ("Tried to swap a non-swapper page");
+ PAGE_BUG(page);
/*
* Make sure that we have a swap cache association for this
@@ -268,23 +269,27 @@ void rw_swap_page_nocache(int rw, unsigned long entry, char *buffer)
struct page *page;
page = mem_map + MAP_NR((unsigned long) buffer);
- wait_on_page(page);
- set_bit(PG_locked, &page->flags);
- if (test_and_set_bit(PG_swap_cache, &page->flags)) {
- printk ("VM: read_swap_page: page already in swap cache!\n");
- return;
- }
- if (page->inode) {
- printk ("VM: read_swap_page: page already in page cache!\n");
- return;
- }
+
+ if (TryLockPage(page))
+ PAGE_BUG(page);
+ if (test_and_set_bit(PG_swap_cache, &page->flags))
+ PAGE_BUG(page);
+ if (page->inode)
+ PAGE_BUG(page);
+ get_page(page); /* Protect from shrink_mmap() */
page->inode = &swapper_inode;
page->offset = entry;
- atomic_inc(&page->count); /* Protect from shrink_mmap() */
rw_swap_page(rw, entry, buffer, 1);
- atomic_dec(&page->count);
- page->inode = 0;
- clear_bit(PG_swap_cache, &page->flags);
+
+ /*
+ * and now remove it from the pagecache ...
+ */
+ if (TryLockPage(page))
+ PAGE_BUG(page);
+ PageClearSwapCache(page);
+ remove_inode_page(page);
+ page_cache_release(page);
+ UnlockPage(page);
}
/*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8c5e7176c..21723c1db 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -25,7 +25,31 @@
* ensure that any mistaken dereferences of this structure cause a
* kernel oops.
*/
-struct inode swapper_inode;
+
+static struct inode_operations swapper_inode_operations = {
+ NULL, /* default file operations */
+ NULL, /* create */
+ NULL, /* lookup */
+ NULL, /* link */
+ NULL, /* unlink */
+ NULL, /* symlink */
+ NULL, /* mkdir */
+ NULL, /* rmdir */
+ NULL, /* mknod */
+ NULL, /* rename */
+ NULL, /* readlink */
+ NULL, /* follow_link */
+ NULL, /* bmap */
+ NULL, /* readpage */
+ NULL, /* writepage */
+ block_flushpage, /* flushpage */
+ NULL, /* truncate */
+ NULL, /* permission */
+ NULL, /* smap */
+ NULL /* revalidate */
+};
+
+struct inode swapper_inode = { i_op: &swapper_inode_operations };
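
The swapper_inode line uses GCC's labeled-element initializer (the old "field:" spelling of today's ".field ="), which is why only i_op needs naming. The ops table above could be written the same way, since unnamed members are zero-filled; both forms are equivalent here:

/* equivalent form of the table above, using the same GCC extension */
static struct inode_operations swapper_inode_operations = {
	flushpage: block_flushpage,
};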
#ifdef SWAP_CACHE_INFO
unsigned long swap_cache_add_total = 0;
@@ -49,20 +73,20 @@ int add_to_swap_cache(struct page *page, unsigned long entry)
#endif
#ifdef DEBUG_SWAP
printk("DebugVM: add_to_swap_cache(%08lx count %d, entry %08lx)\n",
- page_address(page), atomic_read(&page->count), entry);
+ page_address(page), page_count(page), entry);
#endif
if (PageTestandSetSwapCache(page)) {
printk(KERN_ERR "swap_cache: replacing non-empty entry %08lx "
- "on page %08lx\n",
- page->offset, page_address(page));
+ "on page %08lx\n",
+ page->offset, page_address(page));
return 0;
}
if (page->inode) {
printk(KERN_ERR "swap_cache: replacing page-cached entry "
- "on page %08lx\n", page_address(page));
+ "on page %08lx\n", page_address(page));
return 0;
}
- atomic_inc(&page->count);
+ get_page(page);
page->inode = &swapper_inode;
page->offset = entry;
add_page_to_hash_queue(page, &swapper_inode, entry);
@@ -111,7 +135,7 @@ int swap_duplicate(unsigned long entry)
result = 1;
#ifdef DEBUG_SWAP
printk("DebugVM: swap_duplicate(entry %08lx, count now %d)\n",
- entry, p->swap_map[offset]);
+ entry, p->swap_map[offset]);
#endif
out:
return result;
@@ -127,7 +151,7 @@ bad_offset:
bad_unused:
printk(KERN_ERR
"swap_duplicate at %8p: entry %08lx, unused page\n",
- __builtin_return_address(0), entry);
+ __builtin_return_address(0), entry);
goto out;
}
@@ -153,7 +177,7 @@ int swap_count(unsigned long entry)
retval = p->swap_map[offset];
#ifdef DEBUG_SWAP
printk("DebugVM: swap_count(entry %08lx, count %d)\n",
- entry, retval);
+ entry, retval);
#endif
out:
return retval;
@@ -163,16 +187,16 @@ bad_entry:
goto out;
bad_file:
printk(KERN_ERR
- "swap_count: entry %08lx, nonexistent swap file!\n", entry);
+ "swap_count: entry %08lx, nonexistent swap file!\n", entry);
goto out;
bad_offset:
printk(KERN_ERR
- "swap_count: entry %08lx, offset exceeds max!\n", entry);
+ "swap_count: entry %08lx, offset exceeds max!\n", entry);
goto out;
bad_unused:
printk(KERN_ERR
- "swap_count at %8p: entry %08lx, unused page!\n",
- __builtin_return_address(0), entry);
+ "swap_count at %8p: entry %08lx, unused page!\n",
+ __builtin_return_address(0), entry);
goto out;
}
@@ -190,18 +214,17 @@ static inline void remove_from_swap_cache(struct page *page)
#ifdef DEBUG_SWAP
printk("DebugVM: remove_from_swap_cache(%08lx count %d)\n",
- page_address(page), atomic_read(&page->count));
+ page_address(page), page_count(page));
#endif
- PageClearSwapCache (page);
+ PageClearSwapCache(page);
remove_inode_page(page);
}
-
/*
* This must be called only on pages that have
* been verified to be in the swap cache.
*/
-void delete_from_swap_cache(struct page *page)
+void __delete_from_swap_cache(struct page *page)
{
long entry = page->offset;
@@ -210,13 +233,27 @@ void delete_from_swap_cache(struct page *page)
#endif
#ifdef DEBUG_SWAP
printk("DebugVM: delete_from_swap_cache(%08lx count %d, "
- "entry %08lx)\n",
- page_address(page), atomic_read(&page->count), entry);
+ "entry %08lx)\n",
+ page_address(page), page_count(page), entry);
#endif
remove_from_swap_cache (page);
swap_free (entry);
}
+/*
+ * This must be called only on pages that have
+ * been verified to be in the swap cache.
+ */
+void delete_from_swap_cache(struct page *page)
+{
+ lock_page(page);
+
+ __delete_from_swap_cache(page);
+
+ UnlockPage(page);
+ page_cache_release(page);
+}
+
/*
* Perform a free_page(), also freeing any swap cache associated with
* this page if it is the last user of the page.
@@ -229,18 +266,18 @@ void free_page_and_swap_cache(unsigned long addr)
/*
* If we are the only user, then free up the swap cache.
*/
- if (PageSwapCache(page) && !is_page_shared(page)) {
+ if (PageSwapCache(page) && !is_page_shared(page))
delete_from_swap_cache(page);
- }
__free_page(page);
}
/*
- * Lookup a swap entry in the swap cache. We need to be careful about
- * locked pages. A found page will be returned with its refcount
- * incremented.
+ * Lookup a swap entry in the swap cache. A found page will be returned
+ * unlocked and with its refcount incremented - we rely on the kernel
+ * lock to keep page table operations atomic even if we drop the page
+ * lock before returning.
*/
struct page * lookup_swap_cache(unsigned long entry)
@@ -251,23 +288,21 @@ struct page * lookup_swap_cache(unsigned long entry)
swap_cache_find_total++;
#endif
while (1) {
- found = find_page(&swapper_inode, entry);
+ found = find_lock_page(&swapper_inode, entry);
if (!found)
return 0;
if (found->inode != &swapper_inode || !PageSwapCache(found))
goto out_bad;
- if (!PageLocked(found)) {
#ifdef SWAP_CACHE_INFO
- swap_cache_find_success++;
+ swap_cache_find_success++;
#endif
- return found;
- }
- __free_page(found);
- __wait_on_page(found);
+ UnlockPage(found);
+ return found;
}
out_bad:
printk (KERN_ERR "VM: Found a non-swapper swap page!\n");
+ UnlockPage(found);
__free_page(found);
return 0;
}
@@ -288,7 +323,7 @@ struct page * read_swap_cache_async(unsigned long entry, int wait)
#ifdef DEBUG_SWAP
printk("DebugVM: read_swap_cache_async entry %08lx%s\n",
- entry, wait ? ", wait" : "");
+ entry, wait ? ", wait" : "");
#endif
/*
* Make sure the swap entry is still in use.
@@ -319,12 +354,12 @@ struct page * read_swap_cache_async(unsigned long entry, int wait)
if (!add_to_swap_cache(new_page, entry))
goto out_free_page;
- set_bit(PG_locked, &new_page->flags);
+ LockPage(new_page);
rw_swap_page(READ, entry, (char *) new_page_addr, wait);
#ifdef DEBUG_SWAP
printk("DebugVM: read_swap_cache_async created "
- "entry %08lx at %p\n",
- entry, (char *) page_address(new_page));
+ "entry %08lx at %p\n",
+ entry, (char *) page_address(new_page));
#endif
return new_page;
@@ -335,3 +370,4 @@ out_free_swap:
out:
return found_page;
}
+
diff --git a/mm/swapfile.c b/mm/swapfile.c
index de29f1006..794e39aff 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -192,7 +192,7 @@ static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
return;
set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
- atomic_inc(&mem_map[MAP_NR(page)].count);
+ get_page(mem_map + MAP_NR(page));
++vma->vm_mm->rss;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d651e6f94..9ca4988e4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -157,7 +157,7 @@ drop_pte:
add_to_swap_cache(page_map, entry);
/* We checked we were unlocked way up above, and we
have been careful not to stall until here */
- set_bit(PG_locked, &page_map->flags);
+ LockPage(page_map);
/* OK, do a physical asynchronous write to swap. */
rw_swap_page(WRITE, entry, (char *) page, 0);