author    Ralf Baechle <ralf@linux-mips.org>    1999-06-22 23:05:57 +0000
committer Ralf Baechle <ralf@linux-mips.org>    1999-06-22 23:05:57 +0000
commit    51d3b7814cdccef9188240fe0cbd8d97ff2c7470 (patch)
tree      5cbb01d0323d4f63ade66bdf48ba4a91aaa6df16  /mm/filemap.c
parent    52273a23c9a84336b93a35e4847fc88fac7eb0e4 (diff)
Merge with Linux 2.3.7.
WARNING: 2.3.7 is known to eat filesystems for breakfast and little children for lunch, so if you try this on your machine make backups first ...
Diffstat (limited to 'mm/filemap.c')
-rw-r--r--  mm/filemap.c  959
1 files changed, 664 insertions, 295 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 455f334f3..4e885758f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1,7 +1,7 @@
/*
* linux/mm/filemap.c
*
- * Copyright (C) 1994, 1995 Linus Torvalds
+ * Copyright (C) 1994-1999 Linus Torvalds
*/
/*
@@ -29,9 +29,12 @@
* though.
*
* Shared mappings now work. 15.8.1995 Bruno.
+ *
+ * finished 'unifying' the page and buffer cache and SMP-threaded the
+ * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
*/
-unsigned long page_cache_size = 0;
+atomic_t page_cache_size = ATOMIC_INIT(0);
struct page * page_hash_table[PAGE_HASH_SIZE];
/*
@@ -50,38 +53,97 @@ static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
static kmem_cache_t *pio_request_cache;
static DECLARE_WAIT_QUEUE_HEAD(pio_wait);
+spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
+
+
static inline void
make_pio_request(struct file *, unsigned long, unsigned long);
+void __add_page_to_hash_queue(struct page * page, struct page **p){
+ atomic_inc(&page_cache_size);
+ if((page->next_hash = *p) != NULL)
+ (*p)->pprev_hash = &page->next_hash;
+ *p = page;
+ page->pprev_hash = p;
+ if (page->buffers)
+ PAGE_BUG(page);
+}
+
+static void remove_page_from_hash_queue(struct page * page)
+{
+ if(page->pprev_hash) {
+ if(page->next_hash)
+ page->next_hash->pprev_hash = page->pprev_hash;
+ *page->pprev_hash = page->next_hash;
+ page->pprev_hash = NULL;
+ }
+ atomic_dec(&page_cache_size);
+}
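/*
 * Editor's sketch (not part of the patch): the hash queue above uses a
 * "pprev" pointer-to-pointer back link so a node can unlink itself in O(1)
 * without walking the chain or knowing which bucket head it hangs off.
 * A minimal stand-alone illustration of the same technique; struct node,
 * list_add and list_del are hypothetical names.
 */
#include <stddef.h>

struct node {
	struct node *next;
	struct node **pprev;	/* points at whatever points at us */
};

static void list_add(struct node *n, struct node **head)
{
	if ((n->next = *head) != NULL)
		(*head)->pprev = &n->next;
	*head = n;
	n->pprev = head;
}

static void list_del(struct node *n)
{
	if (n->pprev) {
		if (n->next)
			n->next->pprev = n->pprev;
		*n->pprev = n->next;
		n->pprev = NULL;
	}
}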
+
+static void remove_page_from_inode_queue(struct page * page)
+{
+ struct inode * inode = page->inode;
+ struct page *prev, *next;
+
+ inode->i_nrpages--;
+ next = page->next;
+ prev = page->prev;
+ if (inode->i_pages == page)
+ inode->i_pages = next;
+ if (next)
+ next->prev = prev;
+ if (prev)
+ prev->next = next;
+ page->next = NULL;
+ page->prev = NULL;
+}
/*
- * Invalidate the pages of an inode, removing all pages that aren't
- * locked down (those are sure to be up-to-date anyway, so we shouldn't
- * invalidate them).
+ * Remove a page from the page cache and free it. Caller has to make
+ * sure the page is locked and that nobody else uses it - or that usage
+ * is safe.
*/
+void remove_inode_page(struct page *page)
+{
+ if (!PageLocked(page))
+ PAGE_BUG(page);
+
+ spin_lock(&pagecache_lock);
+ remove_page_from_inode_queue(page);
+ remove_page_from_hash_queue(page);
+ page->inode = NULL;
+ spin_unlock(&pagecache_lock);
+}
+
void invalidate_inode_pages(struct inode * inode)
{
struct page ** p;
struct page * page;
+repeat:
+ spin_lock(&pagecache_lock);
p = &inode->i_pages;
while ((page = *p) != NULL) {
- if (PageLocked(page)) {
- p = &page->next;
- continue;
+ get_page(page);
+ if (TryLockPage(page)) {
+ spin_unlock(&pagecache_lock);
+ wait_on_page(page);
+ page_cache_release(page);
+ goto repeat;
}
- inode->i_nrpages--;
- if ((*p = page->next) != NULL)
- (*p)->prev = page->prev;
- page->next = NULL;
- page->prev = NULL;
+ if (page_count(page) != 2)
+ printk("hm, busy page invalidated? (not necessarily a bug)\n");
+
+ remove_page_from_inode_queue(page);
remove_page_from_hash_queue(page);
page->inode = NULL;
+ UnlockPage(page);
+ page_cache_release(page);
page_cache_release(page);
- continue;
+
}
+ spin_unlock(&pagecache_lock);
}
-
/*
* Truncate the page cache at a set offset, removing the pages
* that are beyond that offset (and zeroing out partial pages).
@@ -90,55 +152,90 @@ void truncate_inode_pages(struct inode * inode, unsigned long start)
{
struct page ** p;
struct page * page;
+ int partial = 0;
repeat:
+ spin_lock(&pagecache_lock);
p = &inode->i_pages;
while ((page = *p) != NULL) {
unsigned long offset = page->offset;
/* page wholly truncated - free it */
if (offset >= start) {
- if (PageLocked(page)) {
- wait_on_page(page);
- goto repeat;
- }
- inode->i_nrpages--;
- if ((*p = page->next) != NULL)
- (*p)->prev = page->prev;
- page->next = NULL;
- page->prev = NULL;
- remove_page_from_hash_queue(page);
- page->inode = NULL;
+ get_page(page);
+ spin_unlock(&pagecache_lock);
+
+ lock_page(page);
+
+ if (inode->i_op->flushpage)
+ inode->i_op->flushpage(inode, page, 0);
+
+ /*
+ * We remove the page from the page cache
+ * _after_ we have destroyed all buffer-cache
+ * references to it. Otherwise some other process
+ * might think this inode page is not in the
+ * page cache and creates a buffer-cache alias
+ * to it causing all sorts of fun problems ...
+ */
+ remove_inode_page(page);
+
+ UnlockPage(page);
page_cache_release(page);
- continue;
+ page_cache_release(page);
+
+ /*
+ * We have done things without the pagecache lock,
+ * so we'll have to repeat the scan.
+ * It's not possible to deadlock here because
+ * we are guaranteed to make progress. (ie. we have
+ * just removed a page)
+ */
+ goto repeat;
}
p = &page->next;
+ /*
+ * there is only one partial page possible.
+ */
+ if (partial)
+ continue;
+
offset = start - offset;
/* partial truncate, clear end of page */
if (offset < PAGE_CACHE_SIZE) {
- unsigned long address = page_address(page);
+ unsigned long address;
+ get_page(page);
+ spin_unlock(&pagecache_lock);
+
+ lock_page(page);
+ partial = 1;
+
+ address = page_address(page);
memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
flush_page_to_ram(address);
+
+ if (inode->i_op->flushpage)
+ inode->i_op->flushpage(inode, page, offset);
+ /*
+ * we have dropped the spinlock so we have to
+ * restart.
+ */
+ UnlockPage(page);
+ page_cache_release(page);
+ goto repeat;
}
}
+ spin_unlock(&pagecache_lock);
}
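/*
 * Editor's sketch (not part of the patch): invalidate_inode_pages() and
 * truncate_inode_pages() above follow the same pattern - hold the spinlock
 * only while scanning the list, and as soon as an entry needs blocking work
 * (locking the page, flushing buffers), take a reference, drop the lock, do
 * the work, then restart the scan from the top.  Progress is guaranteed
 * because each restart has removed or finished one entry.  A user-space
 * analogue with a pthread mutex; all names are hypothetical.
 */
#include <pthread.h>

struct item {
	struct item *next;
	int doomed;			/* needs blocking cleanup */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *item_list;

static void do_blocking_work(struct item *it)
{
	it->doomed = 0;			/* guarantees forward progress */
}

static void prune_all(void)
{
repeat:
	pthread_mutex_lock(&list_lock);
	for (struct item *it = item_list; it; it = it->next) {
		if (!it->doomed)
			continue;
		/* can't block with list_lock held: drop it, work, restart */
		pthread_mutex_unlock(&list_lock);
		do_blocking_work(it);
		goto repeat;
	}
	pthread_mutex_unlock(&list_lock);
}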
-/*
- * Remove a page from the page cache and free it.
- */
-void remove_inode_page(struct page *page)
-{
- remove_page_from_hash_queue(page);
- remove_page_from_inode_queue(page);
- page_cache_release(page);
-}
+extern atomic_t too_many_dirty_buffers;
int shrink_mmap(int priority, int gfp_mask)
{
static unsigned long clock = 0;
unsigned long limit = num_physpages;
struct page * page;
- int count;
+ int count, users;
count = limit >> priority;
@@ -164,15 +261,67 @@ int shrink_mmap(int priority, int gfp_mask)
referenced = test_and_clear_bit(PG_referenced, &page->flags);
- if (PageLocked(page))
+ if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
continue;
- if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
+ /*
+ * Some common cases that we just short-circuit without
+ * getting the locks - we need to re-check this once we
+ * have the lock, but that's fine.
+ */
+ users = page_count(page);
+ if (!users)
+ continue;
+ if (!page->buffers) {
+ if (!page->inode)
+ continue;
+ if (users > 1)
+ continue;
+ }
+
+ /*
+ * ok, now the page looks interesting. Re-check things
+ * and keep the lock.
+ */
+ spin_lock(&pagecache_lock);
+ if (!page->inode && !page->buffers) {
+ spin_unlock(&pagecache_lock);
continue;
+ }
+ if (!page_count(page)) {
+// BUG();
+ spin_unlock(&pagecache_lock);
+ continue;
+ }
+ get_page(page);
+ if (TryLockPage(page)) {
+ spin_unlock(&pagecache_lock);
+ goto put_continue;
+ }
+
+ /*
+ * we keep pagecache_lock locked and unlock it in
+ * each branch, so that the page->inode case doesn't
+ * have to re-grab it. Here comes the 'real' logic
+ * to free memory:
+ */
+
+ /* Is it a buffer page? */
+ if (page->buffers) {
+ kdev_t dev = page->buffers->b_dev;
+ spin_unlock(&pagecache_lock);
+ if (try_to_free_buffers(page))
+ goto made_progress;
+ if (!atomic_read(&too_many_dirty_buffers)) {
+ atomic_set(&too_many_dirty_buffers, 1);
+ balance_dirty(dev);
+ }
+ goto unlock_continue;
+ }
/* We can't free pages unless there's just one user */
- if (atomic_read(&page->count) != 1)
- continue;
+ if (page_count(page) != 2)
+ goto spin_unlock_continue;
count--;
@@ -182,77 +331,180 @@ int shrink_mmap(int priority, int gfp_mask)
* were to be marked referenced..
*/
if (PageSwapCache(page)) {
- if (referenced && swap_count(page->offset) != 1)
- continue;
- delete_from_swap_cache(page);
- return 1;
+ spin_unlock(&pagecache_lock);
+ if (referenced && swap_count(page->offset) != 2)
+ goto unlock_continue;
+ __delete_from_swap_cache(page);
+ page_cache_release(page);
+ goto made_progress;
}
- if (referenced)
- continue;
-
- /* Is it a buffer page? */
- if (page->buffers) {
- if (buffer_under_min())
- continue;
- if (!try_to_free_buffers(page))
- continue;
- return 1;
- }
-
/* is it a page-cache page? */
- if (page->inode) {
- if (pgcache_under_min())
- continue;
- remove_inode_page(page);
- return 1;
- }
+ if (!referenced && page->inode && !pgcache_under_min()) {
+ remove_page_from_inode_queue(page);
+ remove_page_from_hash_queue(page);
+ page->inode = NULL;
+ spin_unlock(&pagecache_lock);
+ page_cache_release(page);
+ goto made_progress;
+ }
+spin_unlock_continue:
+ spin_unlock(&pagecache_lock);
+unlock_continue:
+ UnlockPage(page);
+put_continue:
+ put_page(page);
} while (count > 0);
return 0;
+made_progress:
+ UnlockPage(page);
+ put_page(page);
+ return 1;
+}
+
+static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page)
+{
+ goto inside;
+
+ for (;;) {
+ page = page->next_hash;
+inside:
+ if (!page)
+ goto not_found;
+ if (page->inode != inode)
+ continue;
+ if (page->offset == offset)
+ break;
+ }
+not_found:
+ return page;
}
/*
- * Update a page cache copy, when we're doing a "write()" system call
- * See also "update_vm_cache()".
+ * By the time this is called, the page is locked and
+ * we don't have to worry about any races any more.
+ *
+ * Start the IO..
*/
-void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
+static int writeout_one_page(struct page *page)
{
- unsigned long offset, len;
+ struct buffer_head *bh, *head = page->buffers;
- offset = (pos & ~PAGE_CACHE_MASK);
- pos = pos & PAGE_CACHE_MASK;
- len = PAGE_CACHE_SIZE - offset;
+ bh = head;
do {
- struct page * page;
+ if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
+ continue;
- if (len > count)
- len = count;
- page = find_page(inode, pos);
- if (page) {
- wait_on_page(page);
- memcpy((void *) (offset + page_address(page)), buf, len);
- page_cache_release(page);
- }
- count -= len;
- buf += len;
- len = PAGE_CACHE_SIZE;
- offset = 0;
- pos += PAGE_CACHE_SIZE;
- } while (count);
+ bh->b_flushtime = 0;
+ ll_rw_block(WRITE, 1, &bh);
+ } while ((bh = bh->b_this_page) != head);
+ return 0;
+}
+
+static int waitfor_one_page(struct page *page)
+{
+ int error = 0;
+ struct buffer_head *bh, *head = page->buffers;
+
+ bh = head;
+ do {
+ wait_on_buffer(bh);
+ if (buffer_req(bh) && !buffer_uptodate(bh))
+ error = -EIO;
+ } while ((bh = bh->b_this_page) != head);
+ return error;
+}
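/*
 * Editor's sketch (not part of the patch): the buffers of a page hang off
 * page->buffers as a circular singly-linked ring (b_this_page), which is
 * why writeout_one_page() and waitfor_one_page() are do/while loops that
 * stop when the walk comes back around to the head.  A stand-alone version
 * of that walk; struct ring and visit() are hypothetical names.
 */
struct ring { int data; struct ring *next; };

static int visit(struct ring *r) { return r->data; }

static int walk_ring(struct ring *head)
{
	struct ring *r = head;
	int sum = 0;

	if (!head)
		return 0;
	do {
		sum += visit(r);
	} while ((r = r->next) != head);
	return sum;
}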
+
+static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
+{
+ struct page *next;
+ int retval = 0;
+
+ start &= PAGE_MASK;
+
+ spin_lock(&pagecache_lock);
+ next = inode->i_pages;
+ while (next) {
+ struct page *page = next;
+ next = page->next;
+ if (!page->buffers)
+ continue;
+ if (page->offset >= end)
+ continue;
+ if (page->offset < start)
+ continue;
+
+ get_page(page);
+ spin_unlock(&pagecache_lock);
+ lock_page(page);
+
+ /* The buffers could have been free'd while we waited for the page lock */
+ if (page->buffers)
+ retval |= fn(page);
+
+ UnlockPage(page);
+ spin_lock(&pagecache_lock);
+ next = page->next;
+ page_cache_release(page);
+ }
+ spin_unlock(&pagecache_lock);
+
+ return retval;
+}
+
+/*
+ * Two-stage data sync: first start the IO, then go back and
+ * collect the information..
+ */
+int generic_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end)
+{
+ int retval;
+
+ retval = do_buffer_fdatasync(inode, start, end, writeout_one_page);
+ retval |= do_buffer_fdatasync(inode, start, end, waitfor_one_page);
+ return retval;
}
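/*
 * Editor's sketch (not part of the patch): generic_buffer_fdatasync() makes
 * two passes so that every write is already in flight before anyone waits -
 * the waits then overlap each other's IO instead of serializing one
 * write+wait per page.  The shape of the idea, with hypothetical
 * start_io()/wait_io() callbacks standing in for the per-page functions.
 */
struct req { int started; int error; };

static int start_io(struct req *r) { r->started = 1; return 0; }
static int wait_io(struct req *r)  { return r->started ? r->error : 0; }

static int sync_all(struct req *reqs, int n)
{
	int i, err = 0;

	for (i = 0; i < n; i++)		/* pass 1: queue all the IO */
		err |= start_io(&reqs[i]);
	for (i = 0; i < n; i++)		/* pass 2: collect the results */
		err |= wait_io(&reqs[i]);
	return err;
}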
-static inline void add_to_page_cache(struct page * page,
+/*
+ * This adds a page to the page cache, starting out as locked,
+ * owned by us, referenced, but not uptodate and with no errors.
+ */
+static inline void __add_to_page_cache(struct page * page,
struct inode * inode, unsigned long offset,
struct page **hash)
{
- atomic_inc(&page->count);
- page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced);
+ unsigned long flags;
+
+ flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error));
+ page->flags = flags | ((1 << PG_locked) | (1 << PG_referenced));
+ page->owner = (int)current; /* REMOVEME */
+ get_page(page);
page->offset = offset;
add_page_to_inode_queue(inode, page);
__add_page_to_hash_queue(page, hash);
}
+int add_to_page_cache_unique(struct page * page,
+ struct inode * inode, unsigned long offset,
+ struct page **hash)
+{
+ int err;
+ struct page *alias;
+
+ spin_lock(&pagecache_lock);
+ alias = __find_page_nolock(inode, offset, *hash);
+
+ err = 1;
+ if (!alias) {
+ __add_to_page_cache(page,inode,offset,hash);
+ err = 0;
+ }
+
+ spin_unlock(&pagecache_lock);
+ return err;
+}
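/*
 * Editor's sketch (not part of the patch): add_to_page_cache_unique() is the
 * classic "allocate outside the lock, re-check under the lock" insert - the
 * allocation may block, so it cannot happen with the spinlock held, and the
 * re-check catches anyone who added the same entry while the lock was
 * dropped.  A user-space analogue with a pthread mutex; all names are
 * hypothetical.
 */
#include <pthread.h>
#include <stdlib.h>

struct entry { long key; struct entry *next; };

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *cache;

static struct entry *lookup(long key)
{
	for (struct entry *e = cache; e; e = e->next)
		if (e->key == key)
			return e;
	return NULL;
}

static struct entry *get_entry(long key)
{
	struct entry *new = malloc(sizeof(*new)); /* allocated before taking the lock; may block */
	struct entry *e;

	pthread_mutex_lock(&cache_lock);
	e = lookup(key);		/* somebody may have raced us here */
	if (!e && new) {
		new->key = key;
		new->next = cache;
		cache = new;
		e = new;
		new = NULL;
	}
	pthread_mutex_unlock(&cache_lock);
	free(new);			/* lost the race, or entry already existed */
	return e;
}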
+
/*
* Try to read ahead in the file. "page_cache" is a potentially free page
* that we could use for the cache (if it is 0 we can try to create one,
@@ -275,45 +527,173 @@ static unsigned long try_to_read_ahead(struct file * file,
if (offset >= inode->i_size)
break;
hash = page_hash(inode, offset);
- page = __find_page(inode, offset, *hash);
- if (!page) {
+ page = page_cache_entry(page_cache);
+ if (!add_to_page_cache_unique(page, inode, offset, hash)) {
/*
- * Ok, add the new page to the hash-queues...
+ * We do not have to check the return value here
+ * because it's a readahead.
*/
- page = page_cache_entry(page_cache);
- add_to_page_cache(page, inode, offset, hash);
inode->i_op->readpage(file, page);
page_cache = 0;
+ page_cache_release(page);
}
- page_cache_release(page);
}
return page_cache;
}
/*
- * Wait for IO to complete on a locked page.
+ * Wait for a page to get unlocked.
*
* This must be called with the caller "holding" the page,
* ie with increased "page->count" so that the page won't
* go away during the wait..
*/
-void __wait_on_page(struct page *page)
+void ___wait_on_page(struct page *page)
{
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);
add_wait_queue(&page->wait, &wait);
-repeat:
- tsk->state = TASK_UNINTERRUPTIBLE;
- run_task_queue(&tq_disk);
- if (PageLocked(page)) {
+ do {
+ tsk->state = TASK_UNINTERRUPTIBLE;
+ run_task_queue(&tq_disk);
+ if (!PageLocked(page))
+ break;
schedule();
- goto repeat;
- }
+ } while (PageLocked(page));
tsk->state = TASK_RUNNING;
remove_wait_queue(&page->wait, &wait);
}
+/*
+ * Get an exclusive lock on the page..
+ */
+void lock_page(struct page *page)
+{
+ if (TryLockPage(page)) {
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, current);
+
+ run_task_queue(&tq_disk);
+ add_wait_queue(&page->wait, &wait);
+ tsk->state = TASK_UNINTERRUPTIBLE;
+
+ while (TryLockPage(page)) {
+ run_task_queue(&tq_disk);
+ schedule();
+ tsk->state = TASK_UNINTERRUPTIBLE;
+ }
+
+ remove_wait_queue(&page->wait, &wait);
+ tsk->state = TASK_RUNNING;
+ }
+}
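/*
 * Editor's sketch (not part of the patch): lock_page() above is a sleeping
 * lock built from an atomic try-lock plus the page's wait queue.  The key
 * detail is that the lock bit is re-tested after every wakeup, because
 * another waiter may have grabbed the page first.  A condition-variable
 * analogue of the same loop; struct slot and its functions are hypothetical.
 */
#include <pthread.h>

struct slot {
	int locked;
	pthread_mutex_t m;
	pthread_cond_t c;
};

static void slot_lock(struct slot *s)
{
	pthread_mutex_lock(&s->m);
	while (s->locked)		/* recheck after each wakeup */
		pthread_cond_wait(&s->c, &s->m);
	s->locked = 1;
	pthread_mutex_unlock(&s->m);
}

static void slot_unlock(struct slot *s)
{
	pthread_mutex_lock(&s->m);
	s->locked = 0;
	pthread_cond_signal(&s->c);
	pthread_mutex_unlock(&s->m);
}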
+
+
+/*
+ * a rather lightweight function, finding and getting a reference to a
+ * hashed page atomically, waiting for it if it's locked.
+ */
+struct page * __find_get_page (struct inode * inode,
+ unsigned long offset, struct page **hash)
+{
+ struct page *page;
+
+ /*
+ * We scan the hash list read-only. Addition to and removal from
+ * the hash-list needs a held write-lock.
+ */
+repeat:
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(inode, offset, *hash);
+ if (page)
+ get_page(page);
+ spin_unlock(&pagecache_lock);
+
+ /* Found the page, sleep if locked. */
+ if (page && PageLocked(page)) {
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
+
+ add_wait_queue(&page->wait, &wait);
+ tsk->state = TASK_UNINTERRUPTIBLE;
+
+ run_task_queue(&tq_disk);
+ if (PageLocked(page))
+ schedule();
+ tsk->state = TASK_RUNNING;
+ remove_wait_queue(&page->wait, &wait);
+
+ /*
+ * The page might have been unhashed meanwhile. It's
+ * not freed though because we hold a reference to it.
+ * If this is the case then it will be freed _here_,
+ * and we recheck the hash anyway.
+ */
+ page_cache_release(page);
+ goto repeat;
+ }
+ /*
+ * It's not locked so we can return the page and we hold
+ * a reference to it.
+ */
+ return page;
+}
+
+/*
+ * Get the lock to a page atomically.
+ */
+struct page * __find_lock_page (struct inode * inode,
+ unsigned long offset, struct page **hash)
+{
+ int locked;
+ struct page *page;
+
+ /*
+ * We scan the hash list read-only. Addition to and removal from
+ * the hash-list needs a held write-lock.
+ */
+repeat:
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(inode, offset, *hash);
+ locked = 0;
+ if (page) {
+ get_page(page);
+ if (TryLockPage(page))
+ locked = 1;
+ }
+ spin_unlock(&pagecache_lock);
+
+ /* Found the page, sleep if locked. */
+ if (page && locked) {
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
+
+ add_wait_queue(&page->wait, &wait);
+ tsk->state = TASK_UNINTERRUPTIBLE;
+
+ run_task_queue(&tq_disk);
+ if (PageLocked(page))
+ schedule();
+ tsk->state = TASK_RUNNING;
+ remove_wait_queue(&page->wait, &wait);
+
+ /*
+ * The page might have been unhashed meanwhile. It's
+ * not freed though because we hold a reference to it.
+ * If this is the case then it will be freed _here_,
+ * and we recheck the hash anyway.
+ */
+ page_cache_release(page);
+ goto repeat;
+ }
+ /*
+ * It's not locked so we can return the page and we hold
+ * a reference to it.
+ */
+ return page;
+}
+
#if 0
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD
@@ -386,14 +766,14 @@ static void profile_readahead(int async, struct file *filp)
* -------------------
* The read ahead context fields of the "struct file" are the following:
* - f_raend : position of the first byte after the last page we tried to
- * read ahead.
+ * read ahead.
* - f_ramax : current read-ahead maximum size.
* - f_ralen : length of the current IO read block we tried to read-ahead.
* - f_rawin : length of the current read-ahead window.
- * if last read-ahead was synchronous then
- * f_rawin = f_ralen
- * otherwise (was asynchronous)
- * f_rawin = previous value of f_ralen + f_ralen
+ * if last read-ahead was synchronous then
+ * f_rawin = f_ralen
+ * otherwise (was asynchronous)
+ * f_rawin = previous value of f_ralen + f_ralen
*
* Read-ahead limits:
* ------------------
@@ -485,7 +865,7 @@ static inline unsigned long generic_file_readahead(int reada_ok,
* We will later force unplug device in order to force asynchronous read IO.
*/
else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
- ppos <= raend && ppos + filp->f_ralen >= raend) {
+ ppos <= raend && ppos + filp->f_ralen >= raend) {
/*
* Add ONE page to max_ahead in order to try to have about the same IO max size
* as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
@@ -578,6 +958,7 @@ static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descript
struct inode *inode = dentry->d_inode;
size_t pos, pgpos, page_cache;
int reada_ok;
+ int error;
int max_readahead = get_max_readahead(inode);
page_cache = 0;
@@ -633,33 +1014,22 @@ static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descript
* Try to find the data in the page cache..
*/
hash = page_hash(inode, pos & PAGE_CACHE_MASK);
- page = __find_page(inode, pos & PAGE_CACHE_MASK, *hash);
+
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
if (!page)
goto no_cached_page;
-
found_page:
-/*
- * Try to read ahead only if the current page is filled or being filled.
- * Otherwise, if we were reading ahead, decrease max read ahead size to
- * the minimum value.
- * In this context, that seems to may happen only on some read error or if
- * the page has been rewritten.
- */
- if (PageUptodate(page) || PageLocked(page))
- page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
- else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
- filp->f_ramax = MIN_READAHEAD;
-
- wait_on_page(page);
-
- if (!PageUptodate(page))
- goto page_read_error;
+ get_page(page);
+ spin_unlock(&pagecache_lock);
-success:
- /*
- * Ok, we have the page, it's up-to-date and ok,
- * so now we can finally copy it to user space...
- */
+ if (!Page_Uptodate(page))
+ goto page_not_up_to_date;
+page_ok:
+ /*
+ * Ok, we have the page, and it's up-to-date, so
+ * now we can copy it to user space...
+ */
{
unsigned long offset, nr;
@@ -683,75 +1053,77 @@ success:
break;
}
+/*
+ * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
+ */
+page_not_up_to_date:
+ page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
+
+ if (Page_Uptodate(page))
+ goto page_ok;
+
+ /* Get exclusive access to the page ... */
+ lock_page(page);
+ if (Page_Uptodate(page)) {
+ UnlockPage(page);
+ goto page_ok;
+ }
+
+readpage:
+ /* ... and start the actual read. The read will unlock the page. */
+ error = inode->i_op->readpage(filp, page);
+
+ if (!error) {
+ if (Page_Uptodate(page))
+ goto page_ok;
+
+ /* Again, try some read-ahead while waiting for the page to finish.. */
+ page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
+ wait_on_page(page);
+ if (Page_Uptodate(page))
+ goto page_ok;
+ error = -EIO;
+ }
+
+ /* UHHUH! A synchronous read error occurred. Report it */
+ desc->error = error;
+ page_cache_release(page);
+ break;
+
no_cached_page:
/*
* Ok, it wasn't cached, so we need to create a new
* page..
+ *
+ * We get here with the page cache lock held.
*/
if (!page_cache) {
+ spin_unlock(&pagecache_lock);
page_cache = page_cache_alloc();
+ if (!page_cache) {
+ desc->error = -ENOMEM;
+ break;
+ }
+
/*
- * That could have slept, so go around to the
- * very beginning..
+ * Somebody may have added the page while we
+ * dropped the page cache lock. Check for that.
*/
- if (page_cache)
- continue;
- desc->error = -ENOMEM;
- break;
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
+ if (page)
+ goto found_page;
}
/*
* Ok, add the new page to the hash-queues...
*/
page = page_cache_entry(page_cache);
- page_cache = 0;
- add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);
-
- /*
- * Error handling is tricky. If we get a read error,
- * the cached page stays in the cache (but uptodate=0),
- * and the next process that accesses it will try to
- * re-read it. This is needed for NFS etc, where the
- * identity of the reader can decide if we can read the
- * page or not..
- */
-/*
- * We have to read the page.
- * If we were reading ahead, we had previously tried to read this page,
- * That means that the page has probably been removed from the cache before
- * the application process needs it, or has been rewritten.
- * Decrease max readahead size to the minimum value in that situation.
- */
- if (reada_ok && filp->f_ramax > MIN_READAHEAD)
- filp->f_ramax = MIN_READAHEAD;
-
- {
- int error = inode->i_op->readpage(filp, page);
- if (!error)
- goto found_page;
- desc->error = error;
- page_cache_release(page);
- break;
- }
+ __add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);
+ spin_unlock(&pagecache_lock);
-page_read_error:
- /*
- * We found the page, but it wasn't up-to-date.
- * Try to re-read it _once_. We do this synchronously,
- * because this happens only if there were errors.
- */
- {
- int error = inode->i_op->readpage(filp, page);
- if (!error) {
- wait_on_page(page);
- if (PageUptodate(page) && !PageError(page))
- goto success;
- error = -EIO; /* Some unspecified error occurred.. */
- }
- desc->error = error;
- page_cache_release(page);
- break;
- }
+ page_cache = 0;
+ goto readpage;
}
*ppos = pos;
@@ -787,6 +1159,7 @@ ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *
{
ssize_t retval;
+ unlock_kernel();
retval = -EFAULT;
if (access_ok(VERIFY_WRITE, buf, count)) {
retval = 0;
@@ -804,6 +1177,7 @@ ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *
retval = desc.error;
}
}
+ lock_kernel();
return retval;
}
@@ -812,17 +1186,14 @@ static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned
ssize_t written;
unsigned long count = desc->count;
struct file *file = (struct file *) desc->buf;
- struct inode *inode = file->f_dentry->d_inode;
mm_segment_t old_fs;
if (size > count)
size = count;
- down(&inode->i_sem);
old_fs = get_fs();
set_fs(KERNEL_DS);
written = file->f_op->write(file, area, size, &file->f_pos);
set_fs(old_fs);
- up(&inode->i_sem);
if (written < 0) {
desc->error = written;
written = 0;
@@ -878,6 +1249,7 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t cou
if (retval)
goto fput_out;
+ unlock_kernel();
retval = 0;
if (count) {
read_descriptor_t desc;
@@ -887,7 +1259,7 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t cou
ppos = &in_file->f_pos;
if (offset) {
if (get_user(pos, offset))
- goto fput_out;
+ goto fput_out_lock;
ppos = &pos;
}
@@ -904,7 +1276,8 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t cou
put_user(pos, offset);
}
-
+fput_out_lock:
+ lock_kernel();
fput_out:
fput(out_file);
fput_in:
@@ -934,17 +1307,21 @@ static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long
unsigned long offset, reada, i;
struct page * page, **hash;
unsigned long old_page, new_page;
+ int error;
new_page = 0;
offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
- goto no_page;
+ goto no_page_nolock;
+
+ unlock_kernel();
/*
* Do we have something in the page cache already?
*/
hash = page_hash(inode, offset);
- page = __find_page(inode, offset, *hash);
+retry_find:
+ page = __find_get_page(inode, offset, hash);
if (!page)
goto no_cached_page;
@@ -960,15 +1337,17 @@ found_page:
goto failure;
}
- if (PageLocked(page))
- goto page_locked_wait;
- if (!PageUptodate(page))
- goto page_read_error;
+ if (!Page_Uptodate(page)) {
+ lock_page(page);
+ if (!Page_Uptodate(page))
+ goto page_not_uptodate;
+ UnlockPage(page);
+ }
success:
/*
- * Found the page, need to check sharing and possibly
- * copy it over to another page..
+ * Found the page and have a reference on it, need to check sharing
+ * and possibly copy it over to another page..
*/
old_page = page_address(page);
if (!no_share) {
@@ -980,6 +1359,7 @@ success:
page_cache_free(new_page);
flush_page_to_ram(old_page);
+ lock_kernel();
return old_page;
}
@@ -989,6 +1369,7 @@ success:
copy_page(new_page, old_page);
flush_page_to_ram(new_page);
page_cache_release(page);
+ lock_kernel();
return new_page;
no_cached_page:
@@ -1013,7 +1394,7 @@ no_cached_page:
* cache.. The page we just got may be useful if we
* can't share, so don't get rid of it here.
*/
- page = find_page(inode, offset);
+ page = __find_get_page(inode, offset, hash);
if (page)
goto found_page;
@@ -1021,19 +1402,24 @@ no_cached_page:
* Now, create a new page-cache page from the page we got
*/
page = page_cache_entry(new_page);
- new_page = 0;
- add_to_page_cache(page, inode, offset, hash);
+ if (add_to_page_cache_unique(page, inode, offset, hash))
+ goto retry_find;
- if (inode->i_op->readpage(file, page) != 0)
- goto failure;
+ /*
+ * Now it's ours and locked, we can do initial IO to it:
+ */
+ new_page = 0;
- goto found_page;
+page_not_uptodate:
+ error = inode->i_op->readpage(file, page);
-page_locked_wait:
- __wait_on_page(page);
- if (PageUptodate(page))
+ if (!error) {
+ wait_on_page(page);
+ if (PageError(page))
+ goto page_read_error;
goto success;
-
+ }
+
page_read_error:
/*
* Umm, take care of errors if the page isn't up-to-date.
@@ -1041,12 +1427,14 @@ page_read_error:
* because there really aren't any performance issues here
* and we need to check for errors.
*/
- if (inode->i_op->readpage(file, page) != 0)
+ if (!PageLocked(page))
+ PAGE_BUG(page);
+ ClearPageError(page);
+ error = inode->i_op->readpage(file, page);
+ if (error)
goto failure;
wait_on_page(page);
- if (PageError(page))
- goto failure;
- if (PageUptodate(page))
+ if (Page_Uptodate(page))
goto success;
/*
@@ -1058,6 +1446,8 @@ failure:
if (new_page)
page_cache_free(new_page);
no_page:
+ lock_kernel();
+no_page_nolock:
return 0;
}
@@ -1066,12 +1456,13 @@ no_page:
* if the disk is full.
*/
static inline int do_write_page(struct inode * inode, struct file * file,
- const char * page, unsigned long offset)
+ const char * page_addr, unsigned long offset)
{
int retval;
unsigned long size;
loff_t loff = offset;
- mm_segment_t old_fs;
+ int (*writepage) (struct file *, struct page *);
+ struct page * page;
size = offset + PAGE_SIZE;
/* refuse to extend file size.. */
@@ -1083,12 +1474,21 @@ static inline int do_write_page(struct inode * inode, struct file * file,
return -EIO;
}
size -= offset;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
retval = -EIO;
- if (size == file->f_op->write(file, (const char *) page, size, &loff))
- retval = 0;
- set_fs(old_fs);
+ writepage = inode->i_op->writepage;
+ page = mem_map + MAP_NR(page_addr);
+ lock_page(page);
+
+ if (writepage) {
+ retval = writepage(file, page);
+ } else {
+ mm_segment_t old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ if (size == file->f_op->write(file, page_addr, size, &loff))
+ retval = 0;
+ set_fs(old_fs);
+ }
+ UnlockPage(page);
return retval;
}
@@ -1124,9 +1524,7 @@ static int filemap_write_page(struct vm_area_struct * vma,
return 0;
}
- down(&inode->i_sem);
result = do_write_page(inode, file, (const char *) page, offset);
- up(&inode->i_sem);
fput(file);
return result;
}
@@ -1146,7 +1544,8 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
pte_t pte = *ptep;
- unsigned long page;
+ unsigned long pageaddr;
+ struct page *page;
int error;
if (!(flags & MS_INVALIDATE)) {
@@ -1158,8 +1557,9 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
flush_cache_page(vma, address);
set_pte(ptep, pte_mkclean(pte));
flush_tlb_page(vma, address);
- page = pte_page(pte);
- atomic_inc(&page_cache_entry(page)->count);
+ pageaddr = pte_page(pte);
+ page = page_cache_entry(pageaddr);
+ get_page(page);
} else {
if (pte_none(pte))
return 0;
@@ -1170,14 +1570,14 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
swap_free(pte_val(pte));
return 0;
}
- page = pte_page(pte);
+ pageaddr = pte_page(pte);
if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
- page_cache_free(page);
+ page_cache_free(pageaddr);
return 0;
}
}
- error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1);
- page_cache_free(page);
+ error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, pageaddr, 1);
+ page_cache_free(pageaddr);
return error;
}
@@ -1338,10 +1738,7 @@ static int msync_interval(struct vm_area_struct * vma,
struct file * file = vma->vm_file;
if (file) {
struct dentry * dentry = file->f_dentry;
- struct inode * inode = dentry->d_inode;
- down(&inode->i_sem);
error = file_fsync(file, dentry);
- up(&inode->i_sem);
}
}
return error;
@@ -1436,11 +1833,12 @@ generic_file_write(struct file *file, const char *buf,
unsigned long page_cache = 0;
unsigned long written;
long status;
+ int err;
- if (file->f_error) {
- int error = file->f_error;
+ err = file->f_error;
+ if (err) {
file->f_error = 0;
- return error;
+ goto out;
}
written = 0;
@@ -1451,7 +1849,7 @@ generic_file_write(struct file *file, const char *buf,
/*
* Check whether we've reached the file size limit.
*/
- status = -EFBIG;
+ err = -EFBIG;
if (pos >= limit) {
send_sig(SIGXFSZ, current, 0);
goto out;
@@ -1467,6 +1865,8 @@ generic_file_write(struct file *file, const char *buf,
count = limit - pos;
}
+ unlock_kernel();
+
while (count) {
unsigned long bytes, pgpos, offset;
/*
@@ -1480,29 +1880,36 @@ generic_file_write(struct file *file, const char *buf,
bytes = count;
hash = page_hash(inode, pgpos);
- page = __find_page(inode, pgpos, *hash);
+repeat_find:
+ page = __find_lock_page(inode, pgpos, hash);
if (!page) {
if (!page_cache) {
page_cache = page_cache_alloc();
if (page_cache)
- continue;
+ goto repeat_find;
status = -ENOMEM;
break;
}
page = page_cache_entry(page_cache);
- add_to_page_cache(page, inode, pgpos, hash);
+ if (add_to_page_cache_unique(page,inode,pgpos,hash))
+ goto repeat_find;
+
page_cache = 0;
}
- /* Get exclusive IO access to the page.. */
- wait_on_page(page);
- set_bit(PG_locked, &page->flags);
+ /* We have exclusive IO access to the page.. */
+ if (!PageLocked(page)) {
+ PAGE_BUG(page);
+ } else {
+ if (page->owner != (int)current) {
+ PAGE_BUG(page);
+ }
+ }
status = write_one_page(file, page, offset, bytes, buf);
/* Mark it unlocked again and drop the page.. */
- clear_bit(PG_locked, &page->flags);
- wake_up(&page->wait);
+ UnlockPage(page);
page_cache_release(page);
if (status < 0)
@@ -1519,51 +1926,16 @@ generic_file_write(struct file *file, const char *buf,
if (page_cache)
page_cache_free(page_cache);
+
+ err = written ? written : status;
+ lock_kernel();
out:
- return written ? written : status;
+ return err;
}
/*
- * Support routines for directory cacheing using the page cache.
- */
-
-/*
- * Finds the page at the specified offset, installing a new page
- * if requested. The count is incremented and the page is locked.
- *
- * Note: we don't have to worry about races here, as the caller
- * is holding the inode semaphore.
+ * Support routines for directory caching using the page cache.
*/
-unsigned long get_cached_page(struct inode * inode, unsigned long offset,
- int new)
-{
- struct page * page;
- struct page ** hash;
- unsigned long page_cache = 0;
-
- hash = page_hash(inode, offset);
- page = __find_page(inode, offset, *hash);
- if (!page) {
- if (!new)
- goto out;
- page_cache = page_cache_alloc();
- if (!page_cache)
- goto out;
- clear_page(page_cache);
- page = page_cache_entry(page_cache);
- add_to_page_cache(page, inode, offset, hash);
- }
- if (atomic_read(&page->count) != 2)
- printk(KERN_ERR "get_cached_page: page count=%d\n",
- atomic_read(&page->count));
- if (test_bit(PG_locked, &page->flags))
- printk(KERN_ERR "get_cached_page: page already locked!\n");
- set_bit(PG_locked, &page->flags);
- page_cache = page_address(page);
-
-out:
- return page_cache;
-}
/*
* Unlock and free a page.
@@ -1572,13 +1944,10 @@ void put_cached_page(unsigned long addr)
{
struct page * page = page_cache_entry(addr);
- if (!test_bit(PG_locked, &page->flags))
- printk("put_cached_page: page not locked!\n");
- if (atomic_read(&page->count) != 2)
- printk("put_cached_page: page count=%d\n",
- atomic_read(&page->count));
- clear_bit(PG_locked, &page->flags);
- wake_up(&page->wait);
+ UnlockPage(page);
+ if (page_count(page) != 2)
+ panic("put_cached_page: page count=%d\n",
+ page_count(page));
page_cache_release(page);
}
@@ -1607,11 +1976,13 @@ static inline struct pio_request * get_pio_request(void)
static inline void make_pio_request(struct file *file,
unsigned long offset,
- unsigned long page)
+ unsigned long pageaddr)
{
struct pio_request *p;
+ struct page *page;
- atomic_inc(&page_cache_entry(page)->count);
+ page = page_cache_entry(pageaddr);
+ get_page(page);
/*
* We need to allocate without causing any recursive IO in the
@@ -1634,7 +2005,7 @@ static inline void make_pio_request(struct file *file,
p->file = file;
p->offset = offset;
- p->page = page;
+ p->page = pageaddr;
put_pio_request(p);
wake_up(&pio_wait);
@@ -1694,10 +2065,8 @@ int kpiod(void * unused)
dentry = p->file->f_dentry;
inode = dentry->d_inode;
- down(&inode->i_sem);
do_write_page(inode, p->file,
(const char *) p->page, p->offset);
- up(&inode->i_sem);
fput(p->file);
page_cache_free(p->page);
kmem_cache_free(pio_request_cache, p);