diff options
Diffstat (limited to 'fs/nfs')
-rw-r--r-- | fs/nfs/Makefile | 2 | ||||
-rw-r--r-- | fs/nfs/dir.c | 62 | ||||
-rw-r--r-- | fs/nfs/file.c | 4 | ||||
-rw-r--r-- | fs/nfs/flushd.c | 304 | ||||
-rw-r--r-- | fs/nfs/inode.c | 57 | ||||
-rw-r--r-- | fs/nfs/nfs2xdr.c | 49 | ||||
-rw-r--r-- | fs/nfs/nfsroot.c | 5 | ||||
-rw-r--r-- | fs/nfs/proc.c | 8 | ||||
-rw-r--r-- | fs/nfs/read.c | 10 | ||||
-rw-r--r-- | fs/nfs/write.c | 1667 |
10 files changed, 1665 insertions, 503 deletions
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 3171e8adc..3c8aac510 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -9,7 +9,7 @@ O_TARGET := nfs.o O_OBJS := inode.o file.o read.o write.o dir.o symlink.o proc.o \ - nfs2xdr.o + nfs2xdr.o flushd.o ifdef CONFIG_ROOT_NFS O_OBJS += nfsroot.o mount_clnt.o diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 37b2b682b..3ca240129 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -596,9 +596,12 @@ static int nfs_lookup_revalidate(struct dentry * dentry, int flags) out_valid: return 1; out_bad: - d_drop(dentry); if (!list_empty(&dentry->d_subdirs)) shrink_dcache_parent(dentry); + /* If we have submounts, don't unhash ! */ + if (have_submounts(dentry)) + goto out_valid; + d_drop(dentry); /* Purge readdir caches. */ if (dentry->d_parent->d_inode) { nfs_zap_caches(dentry->d_parent->d_inode); @@ -862,61 +865,6 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry) return error; } - -/* Note: we copy the code from lookup_dentry() here, only: we have to - * omit the directory lock. We are already the owner of the lock when - * we reach here. And "down(&dir->i_sem)" would make us sleep forever - * ('cause WE have the lock) - * - * VERY IMPORTANT: calculate the hash for this dentry!!!!!!!! - * Otherwise the cached lookup DEFINITELY WILL fail. And a new dentry - * is created. Without the DCACHE_NFSFS_RENAMED flag. And with d_count - * == 1. And trouble. - * - * Concerning my choice of the temp name: it is just nice to have - * i_ino part of the temp name, as this offers another check whether - * somebody attempts to remove the "silly renamed" dentry itself. - * Which is something that I consider evil. Your opinion may vary. - * BUT: - * Now that I compute the hash value right, it should be possible to simply - * check for the DCACHE_NFSFS_RENAMED flag in dentry->d_flag instead of - * doing the string compare. - * WHICH MEANS: - * This offers the opportunity to shorten the temp name. Currently, I use - * the hex representation of i_ino + an event counter. This sums up to - * as much as 36 characters for a 64 bit machine, and needs 20 chars on - * a 32 bit machine. - * QUINTESSENCE - * The use of i_ino is simply cosmetic. All we need is a unique temp - * file name for the .nfs files. The event counter seemed to be adequate. - * And as we retry in case such a file already exists, we are guaranteed - * to succeed. - */ - -static -struct dentry *nfs_silly_lookup(struct dentry *parent, char *silly, int slen) -{ - struct qstr sqstr; - struct dentry *sdentry; - struct dentry *res; - - sqstr.name = silly; - sqstr.len = slen; - sqstr.hash = full_name_hash(silly, slen); - sdentry = d_lookup(parent, &sqstr); - if (!sdentry) { - sdentry = d_alloc(parent, &sqstr); - if (sdentry == NULL) - return ERR_PTR(-ENOMEM); - res = nfs_lookup(parent->d_inode, sdentry); - if (res) { - dput(sdentry); - return res; - } - } - return sdentry; -} - static int nfs_sillyrename(struct inode *dir, struct dentry *dentry) { static unsigned int sillycounter = 0; @@ -966,7 +914,7 @@ dentry->d_parent->d_name.name, dentry->d_name.name); dfprintk(VFS, "trying to rename %s to %s\n", dentry->d_name.name, silly); - sdentry = nfs_silly_lookup(dentry->d_parent, silly, slen); + sdentry = lookup_one(silly, dget(dentry->d_parent)); /* * N.B. Better to return EBUSY here ... it could be * dangerous to delete the file while it's in use. diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 9a91bb1ab..32d290c73 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -144,10 +144,10 @@ nfs_fsync(struct file *file, struct dentry *dentry) * If the writer ends up delaying the write, the writer needs to * increment the page use counts until he is done with the page. */ -static int nfs_prepare_write(struct page *page, unsigned offset, unsigned to) +static int nfs_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) { kmap(page); - return 0; + return nfs_flush_incompatible(file, page); } static int nfs_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to) { diff --git a/fs/nfs/flushd.c b/fs/nfs/flushd.c new file mode 100644 index 000000000..d36c3a9ae --- /dev/null +++ b/fs/nfs/flushd.c @@ -0,0 +1,304 @@ +/* + * linux/fs/nfs/flushd.c + * + * For each NFS mount, there is a separate cache object that contains + * a hash table of all clusters. With this cache, an async RPC task + * (`flushd') is associated, which wakes up occasionally to inspect + * its list of dirty buffers. + * (Note that RPC tasks aren't kernel threads. Take a look at the + * rpciod code to understand what they are). + * + * Inside the cache object, we also maintain a count of the current number + * of dirty pages, which may not exceed a certain threshold. + * (FIXME: This threshold should be configurable). + * + * The code is streamlined for what I think is the prevalent case for + * NFS traffic, which is sequential write access without concurrent + * access by different processes. + * + * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de> + * + * Rewritten 6/3/2000 by Trond Myklebust + * Copyright (C) 1999, 2000, Trond Myklebust <trond.myklebust@fys.uio.no> + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/malloc.h> +#include <linux/pagemap.h> +#include <linux/file.h> + +#include <linux/sched.h> + +#include <linux/sunrpc/auth.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/sched.h> + +#include <linux/spinlock.h> + +#include <linux/nfs.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_fs_sb.h> +#include <linux/nfs_flushd.h> +#include <linux/nfs_mount.h> + +/* + * Various constants + */ +#define NFSDBG_FACILITY NFSDBG_PAGECACHE + +/* + * This is the wait queue all cluster daemons sleep on + */ +static struct rpc_wait_queue flushd_queue = RPC_INIT_WAITQ("nfs_flushd"); + +/* + * Spinlock + */ +spinlock_t nfs_flushd_lock = SPIN_LOCK_UNLOCKED; + +/* + * Local function declarations. + */ +static void nfs_flushd(struct rpc_task *); +static void nfs_flushd_exit(struct rpc_task *); + + +int nfs_reqlist_init(struct nfs_server *server) +{ + struct nfs_reqlist *cache; + struct rpc_task *task; + int status = 0; + + dprintk("NFS: writecache_init\n"); + spin_lock(&nfs_flushd_lock); + cache = server->rw_requests; + + if (cache->task) + goto out_unlock; + + /* Create the RPC task */ + status = -ENOMEM; + task = rpc_new_task(server->client, NULL, RPC_TASK_ASYNC); + if (!task) + goto out_unlock; + + task->tk_calldata = server; + + cache->task = task; + + /* Run the task */ + cache->runat = jiffies; + + cache->auth = server->client->cl_auth; + task->tk_action = nfs_flushd; + task->tk_exit = nfs_flushd_exit; + + spin_unlock(&nfs_flushd_lock); + rpc_execute(task); + return 0; + out_unlock: + spin_unlock(&nfs_flushd_lock); + return status; +} + +void nfs_reqlist_exit(struct nfs_server *server) +{ + struct nfs_reqlist *cache; + + cache = server->rw_requests; + if (!cache) + return; + + dprintk("NFS: reqlist_exit (ptr %p rpc %p)\n", cache, cache->task); + while (cache->task || cache->inodes) { + spin_lock(&nfs_flushd_lock); + if (!cache->task) { + spin_unlock(&nfs_flushd_lock); + nfs_reqlist_init(server); + } else { + cache->task->tk_status = -ENOMEM; + rpc_wake_up_task(cache->task); + spin_unlock(&nfs_flushd_lock); + } + interruptible_sleep_on_timeout(&cache->request_wait, 1 * HZ); + } +} + +int nfs_reqlist_alloc(struct nfs_server *server) +{ + struct nfs_reqlist *cache; + if (server->rw_requests) + return 0; + + cache = (struct nfs_reqlist *)kmalloc(sizeof(*cache), GFP_KERNEL); + if (!cache) + return -ENOMEM; + + memset(cache, 0, sizeof(*cache)); + init_waitqueue_head(&cache->request_wait); + server->rw_requests = cache; + + return 0; +} + +void nfs_reqlist_free(struct nfs_server *server) +{ + if (server->rw_requests) { + kfree(server->rw_requests); + server->rw_requests = NULL; + } +} + +void nfs_wake_flushd() +{ + rpc_wake_up_status(&flushd_queue, -ENOMEM); +} + +static void inode_append_flushd(struct inode *inode) +{ + struct nfs_reqlist *cache = NFS_REQUESTLIST(inode); + struct inode **q; + + spin_lock(&nfs_flushd_lock); + if (NFS_FLAGS(inode) & NFS_INO_FLUSH) + goto out; + inode->u.nfs_i.hash_next = NULL; + + q = &cache->inodes; + while (*q) + q = &(*q)->u.nfs_i.hash_next; + *q = inode; + + /* Note: we increase the inode i_count in order to prevent + * it from disappearing when on the flush list + */ + NFS_FLAGS(inode) |= NFS_INO_FLUSH; + inode->i_count++; + out: + spin_unlock(&nfs_flushd_lock); +} + +void inode_remove_flushd(struct inode *inode) +{ + struct nfs_reqlist *cache = NFS_REQUESTLIST(inode); + struct inode **q; + + spin_lock(&nfs_flushd_lock); + if (!(NFS_FLAGS(inode) & NFS_INO_FLUSH)) + goto out; + + q = &cache->inodes; + while (*q && *q != inode) + q = &(*q)->u.nfs_i.hash_next; + if (*q) { + *q = inode->u.nfs_i.hash_next; + NFS_FLAGS(inode) &= ~NFS_INO_FLUSH; + iput(inode); + } + out: + spin_unlock(&nfs_flushd_lock); +} + +void inode_schedule_scan(struct inode *inode, unsigned long time) +{ + struct nfs_reqlist *cache = NFS_REQUESTLIST(inode); + struct rpc_task *task; + unsigned long mintimeout; + + if (time_after(NFS_NEXTSCAN(inode), time)) + NFS_NEXTSCAN(inode) = time; + mintimeout = jiffies + 1 * HZ; + if (time_before(mintimeout, NFS_NEXTSCAN(inode))) + mintimeout = NFS_NEXTSCAN(inode); + inode_append_flushd(inode); + + spin_lock(&nfs_flushd_lock); + task = cache->task; + if (!task) { + spin_unlock(&nfs_flushd_lock); + nfs_reqlist_init(NFS_SERVER(inode)); + } else { + if (time_after(cache->runat, mintimeout)) + rpc_wake_up_task(task); + spin_unlock(&nfs_flushd_lock); + } +} + + +static void +nfs_flushd(struct rpc_task *task) +{ + struct nfs_server *server; + struct nfs_reqlist *cache; + struct inode *inode, *next; + unsigned long delay = jiffies + NFS_WRITEBACK_LOCKDELAY; + int flush = (task->tk_status == -ENOMEM); + + dprintk("NFS: %4d flushd starting\n", task->tk_pid); + server = (struct nfs_server *) task->tk_calldata; + cache = server->rw_requests; + + spin_lock(&nfs_flushd_lock); + next = cache->inodes; + cache->inodes = NULL; + spin_unlock(&nfs_flushd_lock); + + while ((inode = next) != NULL) { + next = next->u.nfs_i.hash_next; + inode->u.nfs_i.hash_next = NULL; + NFS_FLAGS(inode) &= ~NFS_INO_FLUSH; + + if (flush) { + nfs_sync_file(inode, NULL, 0, 0, FLUSH_AGING); + } else if (time_after(jiffies, NFS_NEXTSCAN(inode))) { + NFS_NEXTSCAN(inode) = jiffies + NFS_WRITEBACK_LOCKDELAY; + nfs_flush_timeout(inode, FLUSH_AGING); +#ifdef CONFIG_NFS_V3 + nfs_commit_timeout(inode, FLUSH_AGING); +#endif + } + + if (nfs_have_writebacks(inode)) { + inode_append_flushd(inode); + if (time_after(delay, NFS_NEXTSCAN(inode))) + delay = NFS_NEXTSCAN(inode); + } + iput(inode); + } + + dprintk("NFS: %4d flushd back to sleep\n", task->tk_pid); + if (time_after(jiffies + 1 * HZ, delay)) + delay = 1 * HZ; + else + delay = delay - jiffies; + task->tk_status = 0; + task->tk_action = nfs_flushd; + task->tk_timeout = delay; + cache->runat = jiffies + task->tk_timeout; + + spin_lock(&nfs_flushd_lock); + if (!cache->nr_requests && !cache->inodes) { + cache->task = NULL; + task->tk_action = NULL; + } else + rpc_sleep_on(&flushd_queue, task, NULL, NULL); + spin_unlock(&nfs_flushd_lock); +} + +static void +nfs_flushd_exit(struct rpc_task *task) +{ + struct nfs_server *server; + struct nfs_reqlist *cache; + server = (struct nfs_server *) task->tk_calldata; + cache = server->rw_requests; + + spin_lock(&nfs_flushd_lock); + if (cache->task == task) + cache->task = NULL; + spin_unlock(&nfs_flushd_lock); + wake_up(&cache->request_wait); + rpc_release_task(task); +} + diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 441d62edc..ca7e1b944 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -27,6 +27,7 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/stats.h> #include <linux/nfs_fs.h> +#include <linux/nfs_flushd.h> #include <linux/lockd/bind.h> #include <linux/smp_lock.h> @@ -74,6 +75,12 @@ nfs_read_inode(struct inode * inode) inode->i_rdev = 0; NFS_FILEID(inode) = 0; NFS_FSID(inode) = 0; + INIT_LIST_HEAD(&inode->u.nfs_i.dirty); + INIT_LIST_HEAD(&inode->u.nfs_i.commit); + INIT_LIST_HEAD(&inode->u.nfs_i.writeback); + inode->u.nfs_i.ndirty = 0; + inode->u.nfs_i.ncommit = 0; + inode->u.nfs_i.npages = 0; NFS_CACHEINV(inode); NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); } @@ -92,8 +99,6 @@ nfs_put_inode(struct inode * inode) static void nfs_delete_inode(struct inode * inode) { - int failed; - dprintk("NFS: delete_inode(%x/%ld)\n", inode->i_dev, inode->i_ino); lock_kernel(); @@ -101,29 +106,12 @@ nfs_delete_inode(struct inode * inode) nfs_free_dircache(inode); } else { /* - * Flush out any pending write requests ... + * The following can never actually happen... */ - if (NFS_WRITEBACK(inode) != NULL) { - unsigned long timeout = jiffies + 5*HZ; -#ifdef NFS_DEBUG_VERBOSE -printk("nfs_delete_inode: inode %ld has pending RPC requests\n", inode->i_ino); -#endif - nfs_inval(inode); - while (NFS_WRITEBACK(inode) != NULL && - time_before(jiffies, timeout)) { - current->state = TASK_INTERRUPTIBLE; - schedule_timeout(HZ/10); - } - current->state = TASK_RUNNING; - if (NFS_WRITEBACK(inode) != NULL) - printk("NFS: Arghhh, stuck RPC requests!\n"); + if (nfs_have_writebacks(inode)) { + printk(KERN_ERR "nfs_delete_inode: inode %ld has pending RPC requests\n", inode->i_ino); } } - - failed = nfs_check_failed_request(inode); - if (failed) - printk("NFS: inode %ld had %d failed requests\n", - inode->i_ino, failed); unlock_kernel(); clear_inode(inode); @@ -135,9 +123,18 @@ nfs_put_super(struct super_block *sb) struct nfs_server *server = &sb->u.nfs_sb.s_server; struct rpc_clnt *rpc; + /* + * First get rid of the request flushing daemon. + * Relies on rpc_shutdown_client() waiting on all + * client tasks to finish. + */ + nfs_reqlist_exit(server); + if ((rpc = server->client) != NULL) rpc_shutdown_client(rpc); + nfs_reqlist_free(server); + if (!(server->flags & NFS_MOUNT_NONLM)) lockd_down(); /* release rpc.lockd */ rpciod_down(); /* release rpciod */ @@ -306,6 +303,12 @@ nfs_read_super(struct super_block *sb, void *raw_data, int silent) sb->s_root->d_op = &nfs_dentry_operations; sb->s_root->d_fsdata = root_fh; + /* Fire up the writeback cache */ + if (nfs_reqlist_alloc(server) < 0) { + printk(KERN_NOTICE "NFS: cannot initialize writeback cache.\n"); + goto failure_kill_reqlist; + } + /* We're airborne */ /* Check whether to start the lockd process */ @@ -314,6 +317,8 @@ nfs_read_super(struct super_block *sb, void *raw_data, int silent) return sb; /* Yargs. It didn't work out. */ + failure_kill_reqlist: + nfs_reqlist_exit(server); out_no_root: printk("nfs_read_super: get root inode failed\n"); iput(root_inode); @@ -342,6 +347,7 @@ out_no_xprt: printk(KERN_WARNING "NFS: cannot create RPC transport.\n"); out_free_host: + nfs_reqlist_free(server); kfree(server->hostname); out_unlock: goto out_fail; @@ -440,7 +446,6 @@ nfs_invalidate_inode(struct inode *inode) make_bad_inode(inode); inode->i_mode = save_mode; - nfs_inval(inode); nfs_zap_caches(inode); } @@ -864,7 +869,7 @@ nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) * to look at the size or the mtime the server sends us * too closely, as we're in the middle of modifying them. */ - if (NFS_WRITEBACK(inode)) + if (nfs_have_writebacks(inode)) goto out; if (inode->i_size != fattr->size) { @@ -925,7 +930,7 @@ printk("nfs_refresh_inode: invalidating %ld pages\n", inode->i_nrpages); static DECLARE_FSTYPE(nfs_fs_type, "nfs", nfs_read_super, 0); extern int nfs_init_fhcache(void); -extern int nfs_init_wreqcache(void); +extern int nfs_init_nfspagecache(void); /* * Initialize NFS @@ -939,7 +944,7 @@ init_nfs_fs(void) if (err) return err; - err = nfs_init_wreqcache(); + err = nfs_init_nfspagecache(); if (err) return err; diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index a7e53e6db..5ad2aaa67 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -63,6 +63,7 @@ static int nfs_stat_to_errno(int stat); #define NFS_diropres_sz 1+NFS_fhandle_sz+NFS_fattr_sz #define NFS_readlinkres_sz 1 #define NFS_readres_sz 1+NFS_fattr_sz+1 +#define NFS_writeres_sz NFS_attrstat_sz #define NFS_stat_sz 1 #define NFS_readdirres_sz 1 #define NFS_statfsres_sz 1+NFS_info_sz @@ -273,6 +274,7 @@ nfs_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res) static int nfs_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args) { + unsigned int nr; u32 count = args->count; p = xdr_encode_fhandle(p, args->fh); @@ -282,28 +284,35 @@ nfs_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args) *p++ = htonl(count); req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - req->rq_svec[1].iov_base = (void *) args->buffer; - req->rq_svec[1].iov_len = count; - req->rq_slen += count; - req->rq_snr = 2; + /* Get the number of buffers in the send iovec */ + nr = args->nriov; + + if (nr+2 > MAX_IOVEC) { + printk(KERN_ERR "NFS: Bad number of iov's in xdr_writeargs " + "(nr %d max %d)\n", nr, MAX_IOVEC); + return -EINVAL; + } + + /* Copy the iovec */ + memcpy(req->rq_svec + 1, args->iov, nr * sizeof(struct iovec)); #ifdef NFS_PAD_WRITES /* * Some old servers require that the message length * be a multiple of 4, so we pad it here if needed. */ - count = ((count + 3) & ~3) - count; - if (count) { -#if 0 -printk("nfs_writeargs: padding write, len=%d, slen=%d, pad=%d\n", -req->rq_svec[1].iov_len, req->rq_slen, count); -#endif - req->rq_svec[2].iov_base = (void *) "\0\0\0"; - req->rq_svec[2].iov_len = count; - req->rq_slen += count; - req->rq_snr = 3; + if (count & 3) { + struct iovec *iov = req->rq_svec + nr + 1; + int pad = 4 - (count & 3); + + iov->iov_base = (void *) "\0\0\0"; + iov->iov_len = pad; + count += pad; + nr++; } #endif + req->rq_slen += count; + req->rq_snr += nr; return 0; } @@ -593,6 +602,16 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, u32 *p, void *dummy) } /* + * Decode WRITE reply + */ +static int +nfs_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res) +{ + res->verf->committed = NFS_FILE_SYNC; + return nfs_xdr_attrstat(req, p, res->fattr); +} + +/* * Decode STATFS reply */ static int @@ -678,7 +697,7 @@ static struct rpc_procinfo nfs_procedures[18] = { PROC(readlink, readlinkargs, readlinkres), PROC(read, readargs, readres), PROC(writecache, enc_void, dec_void), - PROC(write, writeargs, attrstat), + PROC(write, writeargs, writeres), PROC(create, createargs, diropres), PROC(remove, diropargs, stat), PROC(rename, renameargs, stat), diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index d9a423f16..a592608be 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -145,6 +145,8 @@ static struct nfs_bool_opts { { "nocto", ~NFS_MOUNT_NOCTO, NFS_MOUNT_NOCTO }, { "ac", ~NFS_MOUNT_NOAC, 0 }, { "noac", ~NFS_MOUNT_NOAC, NFS_MOUNT_NOAC }, + { "lock", ~NFS_MOUNT_NONLM, 0 }, + { "nolock", ~NFS_MOUNT_NONLM, NFS_MOUNT_NONLM }, { NULL, 0, 0 } }; @@ -320,7 +322,7 @@ int __init root_nfs_init(void) * Parse NFS server and directory information passed on the kernel * command line. */ -void __init nfs_root_setup(char *line) +int __init nfs_root_setup(char *line) { ROOT_DEV = MKDEV(UNNAMED_MAJOR, 255); if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { @@ -333,6 +335,7 @@ void __init nfs_root_setup(char *line) sprintf(nfs_root_name, NFS_ROOT, line); } root_nfs_parse_addr(nfs_root_name); + return 1; } __setup("nfsroot=", nfs_root_setup); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index bb55ce6d6..3823c3118 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -111,11 +111,15 @@ nfs_proc_write(struct nfs_server *server, struct nfs_fh *fhandle, int swap, unsigned long offset, unsigned int count, const void *buffer, struct nfs_fattr *fattr) { - struct nfs_writeargs arg = { fhandle, offset, count, buffer }; + struct nfs_writeargs arg = { fhandle, offset, count, 1, 1, + {{(void *) buffer, count}, {0,0}, {0,0}, {0,0}, + {0,0}, {0,0}, {0,0}, {0,0}}}; + struct nfs_writeverf verf; + struct nfs_writeres res = {fattr, &verf, count}; int status; dprintk("NFS call write %d @ %ld\n", count, offset); - status = rpc_call(server->client, NFSPROC_WRITE, &arg, fattr, + status = rpc_call(server->client, NFSPROC_WRITE, &arg, &res, swap? (RPC_TASK_SWAPPER|RPC_TASK_ROOTCREDS) : 0); dprintk("NFS reply read: %d\n", status); return status < 0? status : count; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 717d12bbb..aa17780e5 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -171,6 +171,7 @@ static inline int nfs_readpage_async(struct dentry *dentry, struct inode *inode, struct page *page) { + struct rpc_message msg; unsigned long address; struct nfs_rreq *req; int result = -1, flags; @@ -195,8 +196,13 @@ nfs_readpage_async(struct dentry *dentry, struct inode *inode, /* Start the async call */ dprintk("NFS: executing async READ request.\n"); - result = rpc_do_call(NFS_CLIENT(inode), NFSPROC_READ, - &req->ra_args, &req->ra_res, flags, + + msg.rpc_proc = NFSPROC_READ; + msg.rpc_argp = &req->ra_args; + msg.rpc_resp = &req->ra_res; + msg.rpc_cred = NULL; + + result = rpc_call_async(NFS_CLIENT(inode), &msg, flags, nfs_readpage_result, req); if (result < 0) goto out_free; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5f847bec8..af023a121 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -46,6 +46,7 @@ * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de> */ +#include <linux/config.h> #include <linux/types.h> #include <linux/malloc.h> #include <linux/swap.h> @@ -54,33 +55,126 @@ #include <linux/sunrpc/clnt.h> #include <linux/nfs_fs.h> +#include <linux/nfs_flushd.h> #include <asm/uaccess.h> #include <linux/smp_lock.h> #define NFS_PARANOIA 1 #define NFSDBG_FACILITY NFSDBG_PAGECACHE -static void nfs_wback_begin(struct rpc_task *task); -static void nfs_wback_result(struct rpc_task *task); -static void nfs_cancel_request(struct nfs_wreq *req); +/* + * Spinlock + */ +spinlock_t nfs_wreq_lock = SPIN_LOCK_UNLOCKED; +static unsigned int nfs_nr_requests = 0; /* - * Cache parameters + * Local structures + * + * Valid flags for a dirty buffer */ -#define NFS_WRITEBACK_DELAY (10 * HZ) -#define NFS_WRITEBACK_MAX 64 +#define PG_BUSY 0x0001 /* - * Limit number of delayed writes + * This is the struct where the WRITE/COMMIT arguments go. */ -static int nr_write_requests = 0; -static struct rpc_wait_queue write_queue = RPC_INIT_WAITQ("write_chain"); +struct nfs_write_data { + struct rpc_task task; + struct file *file; + struct rpc_cred *cred; + struct nfs_writeargs args; /* argument struct */ + struct nfs_writeres res; /* result struct */ + struct nfs_fattr fattr; + struct nfs_writeverf verf; + struct list_head pages; /* Coalesced requests we wish to flush */ +}; + +struct nfs_page { + struct list_head wb_hash, /* Inode */ + wb_list, + *wb_list_head; + struct file *wb_file; + struct rpc_cred *wb_cred; + struct page *wb_page; /* page to write out */ + wait_queue_head_t wb_wait; /* wait queue */ + unsigned long wb_timeout; /* when to write/commit */ + unsigned int wb_offset, /* Offset of write */ + wb_bytes, /* Length of request */ + wb_count, /* reference count */ + wb_flags; + struct nfs_writeverf wb_verf; /* Commit cookie */ +}; + +#define NFS_WBACK_BUSY(req) ((req)->wb_flags & PG_BUSY) + +/* + * Local function declarations + */ +static void nfs_writeback_done(struct rpc_task *); +#ifdef CONFIG_NFS_V3 +static void nfs_commit_done(struct rpc_task *); +#endif /* Hack for future NFS swap support */ #ifndef IS_SWAPFILE # define IS_SWAPFILE(inode) (0) #endif +static kmem_cache_t *nfs_page_cachep = NULL; +static kmem_cache_t *nfs_wdata_cachep = NULL; + +static __inline__ struct nfs_page *nfs_page_alloc(void) +{ + struct nfs_page *p; + p = kmem_cache_alloc(nfs_page_cachep, SLAB_KERNEL); + if (p) { + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->wb_hash); + INIT_LIST_HEAD(&p->wb_list); + init_waitqueue_head(&p->wb_wait); + } + return p; +} + +static __inline__ void nfs_page_free(struct nfs_page *p) +{ + kmem_cache_free(nfs_page_cachep, p); +} + +static __inline__ struct nfs_write_data *nfs_writedata_alloc(void) +{ + struct nfs_write_data *p; + p = kmem_cache_alloc(nfs_wdata_cachep, SLAB_NFS); + if (p) { + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); + } + return p; +} + +static __inline__ void nfs_writedata_free(struct nfs_write_data *p) +{ + kmem_cache_free(nfs_wdata_cachep, p); +} + +static void nfs_writedata_release(struct rpc_task *task) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)task->tk_calldata; + rpc_release_task(task); + nfs_writedata_free(wdata); +} + +/* + * This function will be used to simulate weak cache consistency + * under NFSv2 when the NFSv3 attribute patch is included. + * For the moment, we just call nfs_refresh_inode(). + */ +static __inline__ int +nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr) +{ + return nfs_refresh_inode(inode, fattr); +} + /* * Write a page synchronously. * Offset is the data offset within the page. @@ -161,278 +255,770 @@ io_error: } /* - * Append a writeback request to a list + * Write a page to the server. This was supposed to be used for + * NFS swapping only. + * FIXME: Using this for mmap is pointless, breaks asynchronous + * writebacks, and is extremely slow. */ -static inline void -append_write_request(struct nfs_wreq **q, struct nfs_wreq *wreq) +int +nfs_writepage(struct dentry * dentry, struct page *page) { - dprintk("NFS: append_write_request(%p, %p)\n", q, wreq); - rpc_append_list(q, wreq); + struct inode *inode = dentry->d_inode; + unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; + unsigned offset = PAGE_CACHE_SIZE; + int err; + + /* easy case */ + if (page->index < end_index) + goto do_it; + /* things got complicated... */ + offset = inode->i_size & (PAGE_CACHE_SIZE-1); + /* OK, are we completely out? */ + if (page->index >= end_index+1 || !offset) + return -EIO; +do_it: + err = nfs_writepage_sync(dentry, inode, page, 0, offset); + if ( err == offset) return 0; + return err; +} + +/* + * Check whether the file range we want to write to is locked by + * us. + */ +static int +region_locked(struct inode *inode, struct nfs_page *req) +{ + struct file_lock *fl; + unsigned long rqstart, rqend; + + /* Don't optimize writes if we don't use NLM */ + if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) + return 0; + + rqstart = page_offset(req->wb_page) + req->wb_offset; + rqend = rqstart + req->wb_bytes; + for (fl = inode->i_flock; fl; fl = fl->fl_next) { + if (fl->fl_owner == current->files && (fl->fl_flags & FL_POSIX) + && fl->fl_type == F_WRLCK + && fl->fl_start <= rqstart && rqend <= fl->fl_end) { + return 1; + } + } + + return 0; +} + +static inline struct nfs_page * +nfs_inode_wb_entry(struct list_head *head) +{ + return list_entry(head, struct nfs_page, wb_hash); } /* - * Remove a writeback request from a list + * Insert a write request into an inode */ static inline void -remove_write_request(struct nfs_wreq **q, struct nfs_wreq *wreq) +nfs_inode_add_request(struct inode *inode, struct nfs_page *req) { - dprintk("NFS: remove_write_request(%p, %p)\n", q, wreq); - rpc_remove_list(q, wreq); + if (!list_empty(&req->wb_hash)) + return; + if (!NFS_WBACK_BUSY(req)) + printk(KERN_ERR "NFS: unlocked request attempted hashed!\n"); + inode->u.nfs_i.npages++; + list_add(&req->wb_hash, &inode->u.nfs_i.writeback); + req->wb_count++; } /* - * Find a non-busy write request for a given page to - * try to combine with. + * Insert a write request into an inode */ -static inline struct nfs_wreq * -find_write_request(struct inode *inode, struct page *page) +static inline void +nfs_inode_remove_request(struct nfs_page *req) { - pid_t pid = current->pid; - struct nfs_wreq *head, *req; + struct inode *inode; + spin_lock(&nfs_wreq_lock); + if (list_empty(&req->wb_hash)) { + spin_unlock(&nfs_wreq_lock); + return; + } + if (!NFS_WBACK_BUSY(req)) + printk(KERN_ERR "NFS: unlocked request attempted unhashed!\n"); + inode = req->wb_file->f_dentry->d_inode; + list_del(&req->wb_hash); + INIT_LIST_HEAD(&req->wb_hash); + inode->u.nfs_i.npages--; + if ((inode->u.nfs_i.npages == 0) != list_empty(&inode->u.nfs_i.writeback)) + printk(KERN_ERR "NFS: desynchronized value of nfs_i.npages.\n"); + if (!nfs_have_writebacks(inode)) + inode_remove_flushd(inode); + spin_unlock(&nfs_wreq_lock); + nfs_release_request(req); +} - dprintk("NFS: find_write_request(%x/%ld, %p)\n", - inode->i_dev, inode->i_ino, page); - if (!(req = head = NFS_WRITEBACK(inode))) - return NULL; - do { - /* - * We can't combine with canceled requests or - * requests that have already been started.. - */ - if (req->wb_flags & (NFS_WRITE_CANCELLED | NFS_WRITE_INPROGRESS)) +/* + * Find a request + */ +static inline struct nfs_page * +_nfs_find_request(struct inode *inode, struct page *page) +{ + struct list_head *head, *next; + + head = &inode->u.nfs_i.writeback; + next = head->next; + while (next != head) { + struct nfs_page *req = nfs_inode_wb_entry(next); + next = next->next; + if (page_index(req->wb_page) != page_index(page)) continue; + req->wb_count++; + return req; + } + return NULL; +} - if (req->wb_page == page && req->wb_pid == pid) - return req; +struct nfs_page * +nfs_find_request(struct inode *inode, struct page *page) +{ + struct nfs_page *req; - /* - * Ehh, don't keep too many tasks queued.. - */ - rpc_wake_up_task(&req->wb_task); + spin_lock(&nfs_wreq_lock); + req = _nfs_find_request(inode, page); + spin_unlock(&nfs_wreq_lock); + return req; +} - } while ((req = WB_NEXT(req)) != head); - return NULL; +static inline struct nfs_page * +nfs_list_entry(struct list_head *head) +{ + return list_entry(head, struct nfs_page, wb_list); } /* - * Find and release all failed requests for this inode. + * Insert a write request into a sorted list */ -int -nfs_check_failed_request(struct inode * inode) +static inline void +nfs_list_add_request(struct nfs_page *req, struct list_head *head) { - /* FIXME! */ - return 0; + struct list_head *prev; + + if (!list_empty(&req->wb_list)) { + printk(KERN_ERR "NFS: Add to list failed!\n"); + return; + } + if (list_empty(&req->wb_hash)) { + printk(KERN_ERR "NFS: Unhashed request attempted added to a list!\n"); + return; + } + if (!NFS_WBACK_BUSY(req)) + printk(KERN_ERR "NFS: unlocked request attempted added to list!\n"); + prev = head->prev; + while (prev != head) { + struct nfs_page *p = nfs_list_entry(prev); + if (page_index(p->wb_page) < page_index(req->wb_page)) + break; + prev = prev->prev; + } + list_add(&req->wb_list, prev); + req->wb_list_head = head; } /* - * Try to merge adjacent write requests. This works only for requests - * issued by the same user. + * Insert a write request into an inode */ -static inline int -update_write_request(struct nfs_wreq *req, unsigned int first, - unsigned int bytes) +static inline void +nfs_list_remove_request(struct nfs_page *req) { - unsigned int rqfirst = req->wb_offset, - rqlast = rqfirst + req->wb_bytes, - last = first + bytes; + if (list_empty(&req->wb_list)) + return; + if (!NFS_WBACK_BUSY(req)) + printk(KERN_ERR "NFS: unlocked request attempted removed from list!\n"); + list_del(&req->wb_list); + INIT_LIST_HEAD(&req->wb_list); + req->wb_list_head = NULL; +} - dprintk("nfs: trying to update write request %p\n", req); +/* + * Add a request to the inode's dirty list. + */ +static inline void +nfs_mark_request_dirty(struct nfs_page *req) +{ + struct inode *inode = req->wb_file->f_dentry->d_inode; - /* not contiguous? */ - if (rqlast < first || last < rqfirst) - return 0; + spin_lock(&nfs_wreq_lock); + if (list_empty(&req->wb_list)) { + nfs_list_add_request(req, &inode->u.nfs_i.dirty); + inode->u.nfs_i.ndirty++; + } + spin_unlock(&nfs_wreq_lock); + /* + * NB: the call to inode_schedule_scan() must lie outside the + * spinlock since it can run flushd(). + */ + inode_schedule_scan(inode, req->wb_timeout); +} - if (first < rqfirst) - rqfirst = first; - if (rqlast < last) - rqlast = last; +/* + * Check if a request is dirty + */ +static inline int +nfs_dirty_request(struct nfs_page *req) +{ + struct inode *inode = req->wb_file->f_dentry->d_inode; + return !list_empty(&req->wb_list) && req->wb_list_head == &inode->u.nfs_i.dirty; +} - req->wb_offset = rqfirst; - req->wb_bytes = rqlast - rqfirst; - req->wb_count++; +#ifdef CONFIG_NFS_V3 +/* + * Add a request to the inode's commit list. + */ +static inline void +nfs_mark_request_commit(struct nfs_page *req) +{ + struct inode *inode = req->wb_file->f_dentry->d_inode; - return 1; + spin_lock(&nfs_wreq_lock); + if (list_empty(&req->wb_list)) { + nfs_list_add_request(req, &inode->u.nfs_i.commit); + inode->u.nfs_i.ncommit++; + } + spin_unlock(&nfs_wreq_lock); + /* + * NB: the call to inode_schedule_scan() must lie outside the + * spinlock since it can run flushd(). + */ + inode_schedule_scan(inode, req->wb_timeout); } +#endif -static kmem_cache_t *nfs_wreq_cachep; - -int nfs_init_wreqcache(void) +/* + * Lock the page of an asynchronous request + */ +static inline int +nfs_lock_request(struct nfs_page *req) { - nfs_wreq_cachep = kmem_cache_create("nfs_wreq", - sizeof(struct nfs_wreq), - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (nfs_wreq_cachep == NULL) - return -ENOMEM; - return 0; + if (NFS_WBACK_BUSY(req)) + return 0; + req->wb_count++; + req->wb_flags |= PG_BUSY; + return 1; } static inline void -free_write_request(struct nfs_wreq * req) +nfs_unlock_request(struct nfs_page *req) { - if (!--req->wb_count) - kmem_cache_free(nfs_wreq_cachep, req); + if (!NFS_WBACK_BUSY(req)) { + printk(KERN_ERR "NFS: Invalid unlock attempted\n"); + return; + } + req->wb_flags &= ~PG_BUSY; + wake_up(&req->wb_wait); + nfs_release_request(req); } /* - * Create and initialize a writeback request + * Create a write request. + * Page must be locked by the caller. This makes sure we never create + * two different requests for the same page, and avoids possible deadlock + * when we reach the hard limit on the number of dirty pages. */ -static inline struct nfs_wreq * -create_write_request(struct file * file, struct page *page, unsigned int offset, unsigned int bytes) +static struct nfs_page * +nfs_create_request(struct inode *inode, struct file *file, struct page *page, + unsigned int offset, unsigned int count) { - struct dentry *dentry = file->f_dentry; - struct inode *inode = dentry->d_inode; - struct rpc_clnt *clnt = NFS_CLIENT(inode); - struct nfs_wreq *wreq; - struct rpc_task *task; + struct nfs_reqlist *cache = NFS_REQUESTLIST(inode); + struct nfs_page *req = NULL; + long timeout; - dprintk("NFS: create_write_request(%s/%s, %ld+%d)\n", - dentry->d_parent->d_name.name, dentry->d_name.name, - (page->index << PAGE_CACHE_SHIFT) + offset, bytes); - - /* FIXME: Enforce hard limit on number of concurrent writes? */ - wreq = kmem_cache_alloc(nfs_wreq_cachep, SLAB_KERNEL); - if (!wreq) - goto out_fail; - memset(wreq, 0, sizeof(*wreq)); + /* Deal with hard/soft limits. + */ + do { + /* If we're over the soft limit, flush out old requests */ + if (nfs_nr_requests >= MAX_REQUEST_SOFT) + nfs_wb_file(inode, file); + + /* If we're still over the soft limit, wake up some requests */ + if (nfs_nr_requests >= MAX_REQUEST_SOFT) { + dprintk("NFS: hit soft limit (%d requests)\n", + nfs_nr_requests); + if (!cache->task) + nfs_reqlist_init(NFS_SERVER(inode)); + nfs_wake_flushd(); + } - task = &wreq->wb_task; - rpc_init_task(task, clnt, nfs_wback_result, RPC_TASK_NFSWRITE); - task->tk_calldata = wreq; - task->tk_action = nfs_wback_begin; + /* If we haven't reached the hard limit yet, + * try to allocate the request struct */ + if (nfs_nr_requests < MAX_REQUEST_HARD) { + req = nfs_page_alloc(); + if (req != NULL) + break; + } - rpcauth_lookupcred(task); /* Obtain user creds */ - if (task->tk_status < 0) - goto out_req; + /* We're over the hard limit. Wait for better times */ + dprintk("NFS: create_request sleeping (total %d pid %d)\n", + nfs_nr_requests, current->pid); + + timeout = 1 * HZ; + if (NFS_SERVER(inode)->flags & NFS_MOUNT_INTR) { + interruptible_sleep_on_timeout(&cache->request_wait, + timeout); + if (signalled()) + break; + } else + sleep_on_timeout(&cache->request_wait, timeout); + + dprintk("NFS: create_request waking up (tot %d pid %d)\n", + nfs_nr_requests, current->pid); + } while (!req); + if (!req) + return NULL; - /* Put the task on inode's writeback request list. */ + /* Initialize the request struct. Initially, we assume a + * long write-back delay. This will be adjusted in + * update_nfs_request below if the region is not locked. */ + req->wb_page = page; + atomic_inc(&page->count); + req->wb_offset = offset; + req->wb_bytes = count; + /* If the region is locked, adjust the timeout */ + if (region_locked(inode, req)) + req->wb_timeout = jiffies + NFS_WRITEBACK_LOCKDELAY; + else + req->wb_timeout = jiffies + NFS_WRITEBACK_DELAY; + req->wb_file = file; + req->wb_cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0); get_file(file); - wreq->wb_file = file; - wreq->wb_pid = current->pid; - wreq->wb_page = page; - init_waitqueue_head(&wreq->wb_wait); - wreq->wb_offset = offset; - wreq->wb_bytes = bytes; - wreq->wb_count = 2; /* One for the IO, one for us */ + req->wb_count = 1; - kmap(page); - append_write_request(&NFS_WRITEBACK(inode), wreq); + /* register request's existence */ + cache->nr_requests++; + nfs_nr_requests++; + return req; +} - if (nr_write_requests++ > NFS_WRITEBACK_MAX*3/4) - rpc_wake_up_next(&write_queue); - return wreq; +/* + * Release all resources associated with a write request after it + * has been committed to stable storage + * + * Note: Should always be called with the spinlock held! + */ +void +nfs_release_request(struct nfs_page *req) +{ + struct inode *inode = req->wb_file->f_dentry->d_inode; + struct nfs_reqlist *cache = NFS_REQUESTLIST(inode); + struct page *page = req->wb_page; + + spin_lock(&nfs_wreq_lock); + if (--req->wb_count) { + spin_unlock(&nfs_wreq_lock); + return; + } + spin_unlock(&nfs_wreq_lock); -out_req: - rpc_release_task(task); - kmem_cache_free(nfs_wreq_cachep, wreq); -out_fail: - return NULL; + if (!list_empty(&req->wb_list)) { + printk(KERN_ERR "NFS: Request released while still on a list!\n"); + nfs_list_remove_request(req); + } + if (!list_empty(&req->wb_hash)) { + printk(KERN_ERR "NFS: Request released while still hashed!\n"); + nfs_inode_remove_request(req); + } + if (NFS_WBACK_BUSY(req)) + printk(KERN_ERR "NFS: Request released while still locked!\n"); + + rpcauth_releasecred(NFS_CLIENT(inode)->cl_auth, req->wb_cred); + fput(req->wb_file); + page_cache_release(page); + nfs_page_free(req); + /* wake up anyone waiting to allocate a request */ + cache->nr_requests--; + nfs_nr_requests--; + wake_up(&cache->request_wait); } /* - * Schedule a writeback RPC call. - * If the server is congested, don't add to our backlog of queued - * requests but call it synchronously. - * The function returns whether we should wait for the thing or not. + * Wait for a request to complete. * - * FIXME: Here we could walk the inode's lock list to see whether the - * page we're currently writing to has been write-locked by the caller. - * If it is, we could schedule an async write request with a long - * delay in order to avoid writing back the page until the lock is - * released. + * Interruptible by signals only if mounted with intr flag. */ -static inline int -schedule_write_request(struct nfs_wreq *req, int sync) +static int +nfs_wait_on_request(struct nfs_page *req) { - struct rpc_task *task = &req->wb_task; - struct file *file = req->wb_file; - struct dentry *dentry = file->f_dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = req->wb_file->f_dentry->d_inode; + struct rpc_clnt *clnt = NFS_CLIENT(inode); + int retval; - if (NFS_CONGESTED(inode) || nr_write_requests >= NFS_WRITEBACK_MAX) - sync = 1; - - if (sync) { - sigset_t oldmask; - struct rpc_clnt *clnt = NFS_CLIENT(inode); - dprintk("NFS: %4d schedule_write_request (sync)\n", - task->tk_pid); - /* Page is already locked */ - rpc_clnt_sigmask(clnt, &oldmask); - rpc_execute(task); - rpc_clnt_sigunmask(clnt, &oldmask); - } else { - dprintk("NFS: %4d schedule_write_request (async)\n", - task->tk_pid); - task->tk_flags |= RPC_TASK_ASYNC; - task->tk_timeout = NFS_WRITEBACK_DELAY; - rpc_sleep_on(&write_queue, task, NULL, NULL); + if (!NFS_WBACK_BUSY(req)) + return 0; + req->wb_count++; + retval = nfs_wait_event(clnt, req->wb_wait, !NFS_WBACK_BUSY(req)); + nfs_release_request(req); + return retval; +} + +/* + * Wait for a request to complete. + * + * Interruptible by signals only if mounted with intr flag. + */ +static int +nfs_wait_on_requests(struct inode *inode, struct file *file, unsigned long start, unsigned int count) +{ + struct list_head *p, *head; + unsigned long idx_start, idx_end; + unsigned int pages = 0; + int error; + + idx_start = start >> PAGE_CACHE_SHIFT; + if (count == 0) + idx_end = ~0; + else { + unsigned long idx_count = (count-1) >> PAGE_CACHE_SHIFT; + idx_end = idx_start + idx_count; } + spin_lock(&nfs_wreq_lock); + head = &inode->u.nfs_i.writeback; + p = head->next; + while (p != head) { + unsigned long pg_idx; + struct nfs_page *req = nfs_inode_wb_entry(p); + + p = p->next; + + if (file && req->wb_file != file) + continue; + + pg_idx = page_index(req->wb_page); + if (pg_idx < idx_start || pg_idx > idx_end) + continue; - return sync; + if (!NFS_WBACK_BUSY(req)) + continue; + req->wb_count++; + spin_unlock(&nfs_wreq_lock); + error = nfs_wait_on_request(req); + nfs_release_request(req); + if (error < 0) + return error; + spin_lock(&nfs_wreq_lock); + p = head->next; + pages++; + } + spin_unlock(&nfs_wreq_lock); + return pages; } /* - * Wait for request to complete. + * Scan cluster for dirty pages and send as many of them to the + * server as possible. */ static int -wait_on_write_request(struct nfs_wreq *req) +nfs_scan_list_timeout(struct list_head *head, struct list_head *dst, struct inode *inode) { - struct file *file = req->wb_file; - struct dentry *dentry = file->f_dentry; - struct inode *inode = dentry->d_inode; - struct rpc_clnt *clnt = NFS_CLIENT(inode); - DECLARE_WAITQUEUE(wait, current); - sigset_t oldmask; - int retval; + struct list_head *p; + struct nfs_page *req; + int pages = 0; + + p = head->next; + while (p != head) { + req = nfs_list_entry(p); + p = p->next; + if (time_after(req->wb_timeout, jiffies)) { + if (time_after(NFS_NEXTSCAN(inode), req->wb_timeout)) + NFS_NEXTSCAN(inode) = req->wb_timeout; + continue; + } + if (!nfs_lock_request(req)) + continue; + nfs_list_remove_request(req); + nfs_list_add_request(req, dst); + pages++; + } + return pages; +} + +static int +nfs_scan_dirty_timeout(struct inode *inode, struct list_head *dst) +{ + int pages; + spin_lock(&nfs_wreq_lock); + pages = nfs_scan_list_timeout(&inode->u.nfs_i.dirty, dst, inode); + inode->u.nfs_i.ndirty -= pages; + if ((inode->u.nfs_i.ndirty == 0) != list_empty(&inode->u.nfs_i.dirty)) + printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n"); + spin_unlock(&nfs_wreq_lock); + return pages; +} - /* Make sure it's started.. */ - if (!WB_INPROGRESS(req)) - rpc_wake_up_task(&req->wb_task); +#ifdef CONFIG_NFS_V3 +static int +nfs_scan_commit_timeout(struct inode *inode, struct list_head *dst) +{ + int pages; + spin_lock(&nfs_wreq_lock); + pages = nfs_scan_list_timeout(&inode->u.nfs_i.commit, dst, inode); + inode->u.nfs_i.ncommit -= pages; + if ((inode->u.nfs_i.ncommit == 0) != list_empty(&inode->u.nfs_i.commit)) + printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n"); + spin_unlock(&nfs_wreq_lock); + return pages; +} +#endif + +static int +nfs_scan_list(struct list_head *src, struct list_head *dst, struct file *file, unsigned long start, unsigned int count) +{ + struct list_head *p; + struct nfs_page *req; + unsigned long idx_start, idx_end; + int pages; + + pages = 0; + idx_start = start >> PAGE_CACHE_SHIFT; + if (count == 0) + idx_end = ~0; + else + idx_end = idx_start + ((count-1) >> PAGE_CACHE_SHIFT); + p = src->next; + while (p != src) { + unsigned long pg_idx; + + req = nfs_list_entry(p); + p = p->next; + + if (file && req->wb_file != file) + continue; + + pg_idx = page_index(req->wb_page); + if (pg_idx < idx_start || pg_idx > idx_end) + continue; + + if (!nfs_lock_request(req)) + continue; + nfs_list_remove_request(req); + nfs_list_add_request(req, dst); + pages++; + } + return pages; +} + +static int +nfs_scan_dirty(struct inode *inode, struct list_head *dst, struct file *file, unsigned long start, unsigned int count) +{ + int pages; + spin_lock(&nfs_wreq_lock); + pages = nfs_scan_list(&inode->u.nfs_i.dirty, dst, file, start, count); + inode->u.nfs_i.ndirty -= pages; + if ((inode->u.nfs_i.ndirty == 0) != list_empty(&inode->u.nfs_i.dirty)) + printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n"); + spin_unlock(&nfs_wreq_lock); + return pages; +} + +#ifdef CONFIG_NFS_V3 +static int +nfs_scan_commit(struct inode *inode, struct list_head *dst, struct file *file, unsigned long start, unsigned int count) +{ + int pages; + spin_lock(&nfs_wreq_lock); + pages = nfs_scan_list(&inode->u.nfs_i.commit, dst, file, start, count); + inode->u.nfs_i.ncommit -= pages; + if ((inode->u.nfs_i.ncommit == 0) != list_empty(&inode->u.nfs_i.commit)) + printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n"); + spin_unlock(&nfs_wreq_lock); + return pages; +} +#endif + + +static int +coalesce_requests(struct list_head *src, struct list_head *dst, unsigned int maxpages) +{ + struct nfs_page *req = NULL; + unsigned int pages = 0; + + while (!list_empty(src)) { + struct nfs_page *prev = req; + + req = nfs_list_entry(src->next); + if (prev) { + if (req->wb_file != prev->wb_file) + break; + + if (page_index(req->wb_page) != page_index(prev->wb_page)+1) + break; + + if (req->wb_offset != 0) + break; + } + nfs_list_remove_request(req); + nfs_list_add_request(req, dst); + pages++; + if (req->wb_offset + req->wb_bytes != PAGE_CACHE_SIZE) + break; + if (pages >= maxpages) + break; + } + return pages; +} + +/* + * Try to update any existing write request, or create one if there is none. + * In order to match, the request's credentials must match those of + * the calling process. + * + * Note: Should always be called with the Page Lock held! + */ +static struct nfs_page * +nfs_update_request(struct file* file, struct page *page, + unsigned long offset, unsigned int bytes) +{ + struct inode *inode = file->f_dentry->d_inode; + struct nfs_page *req, *new = NULL; + unsigned long rqend, end; + + end = offset + bytes; - rpc_clnt_sigmask(clnt, &oldmask); - add_wait_queue(&req->wb_wait, &wait); for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - retval = 0; - if (req->wb_flags & NFS_WRITE_COMPLETE) + /* Loop over all inode entries and see if we find + * A request for the page we wish to update + */ + spin_lock(&nfs_wreq_lock); + req = _nfs_find_request(inode, page); + if (req) { + if (!nfs_lock_request(req)) { + spin_unlock(&nfs_wreq_lock); + nfs_wait_on_request(req); + nfs_release_request(req); + continue; + } + spin_unlock(&nfs_wreq_lock); + if (new) + nfs_release_request(new); break; - retval = -ERESTARTSYS; - if (signalled()) + } + + req = new; + if (req) { + nfs_lock_request(req); + nfs_inode_add_request(inode, req); + spin_unlock(&nfs_wreq_lock); + nfs_mark_request_dirty(req); break; - schedule(); + } + spin_unlock(&nfs_wreq_lock); + + /* Create the request. It's safe to sleep in this call because + * we only get here if the page is locked. + */ + new = nfs_create_request(inode, file, page, offset, bytes); + if (!new) + return ERR_PTR(-ENOMEM); + } + + /* We have a request for our page. + * If the creds don't match, or the + * page addresses don't match, + * tell the caller to wait on the conflicting + * request. + */ + rqend = req->wb_offset + req->wb_bytes; + if (req->wb_file != file + || req->wb_page != page + || !nfs_dirty_request(req) + || offset > rqend || end < req->wb_offset) { + nfs_unlock_request(req); + nfs_release_request(req); + return ERR_PTR(-EBUSY); + } + + /* Okay, the request matches. Update the region */ + if (offset < req->wb_offset) { + req->wb_offset = offset; + req->wb_bytes = rqend - req->wb_offset; } - remove_wait_queue(&req->wb_wait, &wait); - current->state = TASK_RUNNING; - rpc_clnt_sigunmask(clnt, &oldmask); - return retval; + + if (end > rqend) + req->wb_bytes = end - req->wb_offset; + + nfs_unlock_request(req); + + return req; } /* - * Write a page to the server. This will be used for NFS swapping only - * (for now), and we currently do this synchronously only. + * This is the strategy routine for NFS. + * It is called by nfs_updatepage whenever the user wrote up to the end + * of a page. + * + * We always try to submit a set of requests in parallel so that the + * server's write code can gather writes. This is mainly for the benefit + * of NFSv2. + * + * We never submit more requests than we think the remote can handle. + * For UDP sockets, we make sure we don't exceed the congestion window; + * for TCP, we limit the number of requests to 8. + * + * NFS_STRATEGY_PAGES gives the minimum number of requests for NFSv2 that + * should be sent out in one go. This is for the benefit of NFSv2 servers + * that perform write gathering. + * + * FIXME: Different servers may have different sweet spots. + * Record the average congestion window in server struct? */ -int -nfs_writepage(struct dentry * dentry, struct page *page) +#define NFS_STRATEGY_PAGES 8 +static void +nfs_strategy(struct file *file) { - struct inode *inode = dentry->d_inode; - unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; - unsigned offset = PAGE_CACHE_SIZE; - int err; + struct inode *inode = file->f_dentry->d_inode; + unsigned int dirty, wpages; + + dirty = inode->u.nfs_i.ndirty; + wpages = NFS_SERVER(inode)->wsize >> PAGE_CACHE_SHIFT; +#ifdef CONFIG_NFS_V3 + if (NFS_PROTO(inode)->version == 2) { + if (dirty >= NFS_STRATEGY_PAGES * wpages) + nfs_flush_file(inode, file, 0, 0, 0); + } else { + if (dirty >= wpages) + nfs_flush_file(inode, file, 0, 0, 0); + } +#else + if (dirty >= NFS_STRATEGY_PAGES * wpages) + nfs_flush_file(inode, file, 0, 0, 0); +#endif + /* + * If we're running out of requests, flush out everything + * in order to reduce memory useage... + */ + if (nfs_nr_requests > MAX_REQUEST_SOFT) + nfs_wb_file(inode, file); +} - /* easy case */ - if (page->index < end_index) - goto do_it; - /* things got complicated... */ - offset = inode->i_size & (PAGE_CACHE_SIZE-1); - /* OK, are we completely out? */ - if (page->index >= end_index+1 || !offset) - return -EIO; -do_it: - err = nfs_writepage_sync(dentry, inode, page, 0, offset); - if ( err == offset) return 0; - return err; +int +nfs_flush_incompatible(struct file *file, struct page *page) +{ + struct inode *inode = file->f_dentry->d_inode; + struct nfs_page *req; + int status = 0; + /* + * Look for a request corresponding to this page. If there + * is one, and it belongs to another file, we flush it out + * before we try to copy anything into the page. Do this + * due to the lack of an ACCESS-type call in NFSv2. + * Also do the same if we find a request from an existing + * dropped page. + */ + req = nfs_find_request(inode,page); + if (req) { + if (req->wb_file != file || req->wb_page != page) + status = nfs_wb_page(inode, page); + nfs_release_request(req); + } + return (status < 0) ? status : 0; } /* @@ -446,27 +1032,13 @@ nfs_updatepage(struct file *file, struct page *page, unsigned long offset, unsig { struct dentry *dentry = file->f_dentry; struct inode *inode = dentry->d_inode; - struct nfs_wreq *req; + struct nfs_page *req; int synchronous = file->f_flags & O_SYNC; - int retval; + int status = 0; - dprintk("NFS: nfs_updatepage(%s/%s %d@%ld)\n", + dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld)\n", dentry->d_parent->d_name.name, dentry->d_name.name, - count, (page->index << PAGE_CACHE_SHIFT) +offset); - - /* - * Try to find a corresponding request on the writeback queue. - * If there is one, we can be sure that this request is not - * yet being processed, because we hold a lock on the page. - * - * If the request was created by us, update it. Otherwise, - * transfer the page lock and flush out the dirty page now. - * After returning, generic_file_write will wait on the - * page and retry the update. - */ - req = find_write_request(inode, page); - if (req && req->wb_file == file && update_write_request(req, offset, count)) - goto updated; + count, page_offset(page) +offset); /* * If wsize is smaller than page size, update and write @@ -475,241 +1047,542 @@ nfs_updatepage(struct file *file, struct page *page, unsigned long offset, unsig if (NFS_SERVER(inode)->wsize < PAGE_SIZE) return nfs_writepage_sync(dentry, inode, page, offset, count); - /* Create the write request. */ - req = create_write_request(file, page, offset, count); - if (!req) - return -ENOBUFS; - /* - * Ok, there's another user of this page with the new request.. - * The IO completion will then free the page and the dentry. + * Try to find an NFS request corresponding to this page + * and update it. + * If the existing request cannot be updated, we must flush + * it out now. */ - get_page(page); - - /* Schedule request */ - synchronous = schedule_write_request(req, synchronous); + do { + req = nfs_update_request(file, page, offset, count); + status = (IS_ERR(req)) ? PTR_ERR(req) : 0; + if (status != -EBUSY) + break; + /* Request could not be updated. Flush it out and try again */ + status = nfs_wb_page(inode, page); + } while (status >= 0); + if (status < 0) + goto done; -updated: - if (req->wb_bytes == PAGE_SIZE) + if (req->wb_bytes == PAGE_CACHE_SIZE) SetPageUptodate(page); - retval = 0; + status = 0; if (synchronous) { - int status = wait_on_write_request(req); - if (status) { - nfs_cancel_request(req); - retval = status; - } else { - status = req->wb_status; - if (status < 0) - retval = status; - } + int error; - if (retval < 0) - ClearPageUptodate(page); + error = nfs_sync_file(inode, file, page_offset(page) + offset, count, FLUSH_SYNC|FLUSH_STABLE); + if (error < 0 || (error = file->f_error) < 0) + status = error; + file->f_error = 0; + } else { + /* If we wrote past the end of the page. + * Call the strategy routine so it can send out a bunch + * of requests. + */ + if (req->wb_offset == 0 && req->wb_bytes == PAGE_CACHE_SIZE) + nfs_strategy(file); } - - free_write_request(req); - return retval; + nfs_release_request(req); +done: + dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", + status, inode->i_size); + if (status < 0) + clear_bit(PG_uptodate, &page->flags); + return status; } /* - * Cancel a write request. We always mark it cancelled, - * but if it's already in progress there's no point in - * calling rpc_exit, and we don't want to overwrite the - * tk_status field. - */ + * Set up the argument/result storage required for the RPC call. + */ static void -nfs_cancel_request(struct nfs_wreq *req) +nfs_write_rpcsetup(struct list_head *head, struct nfs_write_data *data) { - req->wb_flags |= NFS_WRITE_CANCELLED; - if (!WB_INPROGRESS(req)) { - rpc_exit(&req->wb_task, 0); - rpc_wake_up_task(&req->wb_task); + struct nfs_page *req; + struct iovec *iov; + unsigned int count; + + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + iov = data->args.iov; + count = 0; + while (!list_empty(head)) { + struct nfs_page *req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_list_add_request(req, &data->pages); + iov->iov_base = (void *)(kmap(req->wb_page) + req->wb_offset); + iov->iov_len = req->wb_bytes; + count += req->wb_bytes; + iov++; + data->args.nriov++; } + req = nfs_list_entry(data->pages.next); + data->file = req->wb_file; + data->cred = req->wb_cred; + data->args.fh = NFS_FH(req->wb_file->f_dentry); + data->args.offset = page_offset(req->wb_page) + req->wb_offset; + data->args.count = count; + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.verf = &data->verf; } + /* - * Cancel all writeback requests, both pending and in progress. + * Create an RPC task for the given write request and kick it. + * The page must have been locked by the caller. + * + * It may happen that the page we're passed is not marked dirty. + * This is the case if nfs_updatepage detects a conflicting request + * that has been written but not committed. */ -static void -nfs_cancel_dirty(struct inode *inode, pid_t pid) +static int +nfs_flush_one(struct list_head *head, struct file *file, int how) { - struct nfs_wreq *head, *req; + struct dentry *dentry = file->f_dentry; + struct inode *inode = dentry->d_inode; + struct rpc_clnt *clnt = NFS_CLIENT(inode); + struct nfs_write_data *data; + struct rpc_task *task; + struct rpc_message msg; + int flags, + async = !(how & FLUSH_SYNC), + stable = (how & FLUSH_STABLE); + sigset_t oldset; + + + data = nfs_writedata_alloc(); + if (!data) + goto out_bad; + task = &data->task; + + /* Set the initial flags for the task. */ + flags = (async) ? RPC_TASK_ASYNC : 0; + + /* Set up the argument struct */ + nfs_write_rpcsetup(head, data); + if (stable) { + if (!inode->u.nfs_i.ncommit) + data->args.stable = NFS_FILE_SYNC; + else + data->args.stable = NFS_DATA_SYNC; + } else + data->args.stable = NFS_UNSTABLE; + + /* Finalize the task. */ + rpc_init_task(task, clnt, nfs_writeback_done, flags); + task->tk_calldata = data; + +#ifdef CONFIG_NFS_V3 + msg.rpc_proc = (NFS_PROTO(inode)->version == 3) ? NFS3PROC_WRITE : NFSPROC_WRITE; +#else + msg.rpc_proc = NFSPROC_WRITE; +#endif + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + msg.rpc_cred = data->cred; + + dprintk("NFS: %4d initiated write call (req %s/%s count %d nriov %d)\n", + task->tk_pid, + dentry->d_parent->d_name.name, + dentry->d_name.name, + data->args.count, data->args.nriov); + + rpc_clnt_sigmask(clnt, &oldset); + rpc_call_setup(task, &msg, 0); + rpc_execute(task); + rpc_clnt_sigunmask(clnt, &oldset); + return 0; + out_bad: + while (!list_empty(head)) { + struct nfs_page *req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_mark_request_dirty(req); + nfs_unlock_request(req); + } + return -ENOMEM; +} - req = head = NFS_WRITEBACK(inode); - while (req != NULL) { - if (pid == 0 || req->wb_pid == pid) - nfs_cancel_request(req); - if ((req = WB_NEXT(req)) == head) +static int +nfs_flush_list(struct inode *inode, struct list_head *head, int how) +{ + LIST_HEAD(one_request); + struct nfs_page *req; + int error = 0; + unsigned int pages = 0, + wpages = NFS_SERVER(inode)->wsize >> PAGE_CACHE_SHIFT; + + while (!list_empty(head)) { + pages += coalesce_requests(head, &one_request, wpages); + req = nfs_list_entry(one_request.next); + error = nfs_flush_one(&one_request, req->wb_file, how); + if (error < 0) break; } + if (error >= 0) + return pages; + + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_mark_request_dirty(req); + nfs_unlock_request(req); + } + return error; } + /* - * If we're waiting on somebody else's request - * we need to increment the counter during the - * wait so that the request doesn't disappear - * from under us during the wait.. + * This function is called when the WRITE call is complete. */ -static int FASTCALL(wait_on_other_req(struct nfs_wreq *)); -static int wait_on_other_req(struct nfs_wreq *req) +static void +nfs_writeback_done(struct rpc_task *task) { - int retval; - req->wb_count++; - retval = wait_on_write_request(req); - free_write_request(req); - return retval; -} + struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; + struct nfs_writeargs *argp = &data->args; + struct nfs_writeres *resp = &data->res; + struct dentry *dentry = data->file->f_dentry; + struct inode *inode = dentry->d_inode; + struct nfs_page *req; + + dprintk("NFS: %4d nfs_writeback_done (status %d)\n", + task->tk_pid, task->tk_status); + + /* We can't handle that yet but we check for it nevertheless */ + if (resp->count < argp->count && task->tk_status >= 0) { + static unsigned long complain = 0; + if (time_before(complain, jiffies)) { + printk(KERN_WARNING + "NFS: Server wrote less than requested.\n"); + complain = jiffies + 300 * HZ; + } + /* Can't do anything about it right now except throw + * an error. */ + task->tk_status = -EIO; + } +#ifdef CONFIG_NFS_V3 + if (resp->verf->committed < argp->stable && task->tk_status >= 0) { + /* We tried a write call, but the server did not + * commit data to stable storage even though we + * requested it. + */ + static unsigned long complain = 0; + + if (time_before(complain, jiffies)) { + printk(KERN_NOTICE "NFS: faulty NFSv3 server %s:" + " (committed = %d) != (stable = %d)\n", + NFS_SERVER(inode)->hostname, + resp->verf->committed, argp->stable); + complain = jiffies + 300 * HZ; + } + } +#endif -/* - * This writes back a set of requests according to the condition. - * - * If this ever gets much more convoluted, use a fn pointer for - * the condition.. - */ -#define NFS_WB(inode, cond) { int retval = 0 ; \ - do { \ - struct nfs_wreq *req = NFS_WRITEBACK(inode); \ - struct nfs_wreq *head = req; \ - if (!req) break; \ - for (;;) { \ - if (!(req->wb_flags & NFS_WRITE_COMPLETE)) \ - if (cond) break; \ - req = WB_NEXT(req); \ - if (req == head) goto out; \ - } \ - retval = wait_on_other_req(req); \ - } while (!retval); \ -out: return retval; \ -} + /* Update attributes as result of writeback. */ + if (task->tk_status >= 0) + nfs_write_attributes(inode, resp->fattr); -int -nfs_wb_all(struct inode *inode) -{ - NFS_WB(inode, 1); + while (!list_empty(&data->pages)) { + req = nfs_list_entry(data->pages.next); + nfs_list_remove_request(req); + + kunmap(req->wb_page); + + dprintk("NFS: write (%s/%s %d@%Ld)", + req->wb_file->f_dentry->d_parent->d_name.name, + req->wb_file->f_dentry->d_name.name, + req->wb_bytes, + page_offset(req->wb_page) + req->wb_offset); + + if (task->tk_status < 0) { + req->wb_file->f_error = task->tk_status; + nfs_inode_remove_request(req); + dprintk(", error = %d\n", task->tk_status); + goto next; + } + +#ifdef CONFIG_NFS_V3 + if (resp->verf->committed != NFS_UNSTABLE) { + nfs_inode_remove_request(req); + dprintk(" OK\n"); + goto next; + } + memcpy(&req->wb_verf, resp->verf, sizeof(req->wb_verf)); + req->wb_timeout = jiffies + NFS_COMMIT_DELAY; + nfs_mark_request_commit(req); + dprintk(" marked for commit\n"); +#else + nfs_inode_remove_request(req); +#endif + next: + nfs_unlock_request(req); + } + nfs_writedata_release(task); } + +#ifdef CONFIG_NFS_V3 /* - * Write back all requests on one page - we do this before reading it. + * Set up the argument/result storage required for the RPC call. */ -int -nfs_wb_page(struct inode *inode, struct page *page) +static void +nfs_commit_rpcsetup(struct list_head *head, struct nfs_write_data *data) { - NFS_WB(inode, req->wb_page == page); + struct nfs_page *req; + struct dentry *dentry; + struct inode *inode; + unsigned long start, end, len; + + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + end = 0; + start = ~0; + req = nfs_list_entry(head->next); + data->file = req->wb_file; + data->cred = req->wb_cred; + dentry = data->file->f_dentry; + inode = dentry->d_inode; + while (!list_empty(head)) { + struct nfs_page *req; + unsigned long rqstart, rqend; + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_list_add_request(req, &data->pages); + rqstart = page_offset(req->wb_page) + req->wb_offset; + rqend = rqstart + req->wb_bytes; + if (rqstart < start) + start = rqstart; + if (rqend > end) + end = rqend; + } + data->args.fh = NFS_FH(dentry); + data->args.offset = start; + len = end - start; + if (end >= inode->i_size || len > (~((u32)0) >> 1)) + len = 0; + data->res.count = data->args.count = (u32)len; + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; } /* - * Write back all pending writes from one file descriptor.. + * Commit dirty pages */ -int -nfs_wb_file(struct inode *inode, struct file *file) -{ - NFS_WB(inode, req->wb_file == file); -} - -void -nfs_inval(struct inode *inode) +static int +nfs_commit_list(struct list_head *head, int how) { - nfs_cancel_dirty(inode,0); + struct rpc_message msg; + struct file *file; + struct rpc_clnt *clnt; + struct nfs_write_data *data; + struct rpc_task *task; + struct nfs_page *req; + int flags, + async = !(how & FLUSH_SYNC); + sigset_t oldset; + + data = nfs_writedata_alloc(); + + if (!data) + goto out_bad; + task = &data->task; + + flags = (async) ? RPC_TASK_ASYNC : 0; + + /* Set up the argument struct */ + nfs_commit_rpcsetup(head, data); + req = nfs_list_entry(data->pages.next); + file = req->wb_file; + clnt = NFS_CLIENT(file->f_dentry->d_inode); + + rpc_init_task(task, clnt, nfs_commit_done, flags); + task->tk_calldata = data; + + msg.rpc_proc = NFS3PROC_COMMIT; + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + msg.rpc_cred = data->cred; + + dprintk("NFS: %4d initiated commit call\n", task->tk_pid); + rpc_clnt_sigmask(clnt, &oldset); + rpc_call_setup(task, &msg, 0); + rpc_execute(task); + rpc_clnt_sigunmask(clnt, &oldset); + return 0; + out_bad: + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_mark_request_commit(req); + nfs_unlock_request(req); + } + return -ENOMEM; } /* - * The following procedures make up the writeback finite state machinery: - * - * 1. Try to lock the page if not yet locked by us, - * set up the RPC call info, and pass to the call FSM. + * COMMIT call returned */ static void -nfs_wback_begin(struct rpc_task *task) +nfs_commit_done(struct rpc_task *task) { - struct nfs_wreq *req = (struct nfs_wreq *) task->tk_calldata; - struct page *page = req->wb_page; - struct file *file = req->wb_file; - struct dentry *dentry = file->f_dentry; - - dprintk("NFS: %4d nfs_wback_begin (%s/%s, status=%d flags=%x)\n", - task->tk_pid, dentry->d_parent->d_name.name, - dentry->d_name.name, task->tk_status, req->wb_flags); + struct nfs_write_data *data = (struct nfs_write_data *)task->tk_calldata; + struct nfs_writeres *resp = &data->res; + struct nfs_page *req; + struct dentry *dentry = data->file->f_dentry; + struct inode *inode = dentry->d_inode; - task->tk_status = 0; + dprintk("NFS: %4d nfs_commit_done (status %d)\n", + task->tk_pid, task->tk_status); + + nfs_refresh_inode(inode, resp->fattr); + while (!list_empty(&data->pages)) { + req = nfs_list_entry(data->pages.next); + nfs_list_remove_request(req); + + dprintk("NFS: commit (%s/%s %d@%ld)", + req->wb_file->f_dentry->d_parent->d_name.name, + req->wb_file->f_dentry->d_name.name, + req->wb_bytes, + page_offset(req->wb_page) + req->wb_offset); + if (task->tk_status < 0) { + req->wb_file->f_error = task->tk_status; + nfs_inode_remove_request(req); + dprintk(", error = %d\n", task->tk_status); + goto next; + } - /* Setup the task struct for a writeback call */ - req->wb_flags |= NFS_WRITE_INPROGRESS; - req->wb_args.fh = NFS_FH(dentry); - req->wb_args.offset = (page->index << PAGE_CACHE_SHIFT) + req->wb_offset; - req->wb_args.count = req->wb_bytes; - req->wb_args.buffer = (void *) (page_address(page) + req->wb_offset); + /* Okay, COMMIT succeeded, apparently. Check the verifier + * returned by the server against all stored verfs. */ + if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { + /* We have a match */ + nfs_inode_remove_request(req); + dprintk(" OK\n"); + goto next; + } + /* We have a mismatch. Write the page again */ + dprintk(" mismatch\n"); + nfs_mark_request_dirty(req); + next: + nfs_unlock_request(req); + } + nfs_writedata_release(task); +} +#endif - rpc_call_setup(task, NFSPROC_WRITE, &req->wb_args, &req->wb_fattr, 0); +int nfs_flush_file(struct inode *inode, struct file *file, unsigned long start, + unsigned int count, int how) +{ + LIST_HEAD(head); + int pages, + error = 0; + + pages = nfs_scan_dirty(inode, &head, file, start, count); + if (pages) + error = nfs_flush_list(inode, &head, how); + if (error < 0) + return error; + return pages; +} - return; +int nfs_flush_timeout(struct inode *inode, int how) +{ + LIST_HEAD(head); + int pages, + error = 0; + + pages = nfs_scan_dirty_timeout(inode, &head); + if (pages) + error = nfs_flush_list(inode, &head, how); + if (error < 0) + return error; + return pages; } -/* - * 2. Collect the result - */ -static void -nfs_wback_result(struct rpc_task *task) +#ifdef CONFIG_NFS_V3 +int nfs_commit_file(struct inode *inode, struct file *file, unsigned long start, + unsigned int count, int how) { - struct nfs_wreq *req = (struct nfs_wreq *) task->tk_calldata; - struct file *file = req->wb_file; - struct page *page = req->wb_page; - int status = task->tk_status; - struct dentry *dentry = file->f_dentry; - struct inode *inode = dentry->d_inode; + LIST_HEAD(head); + int pages, + error = 0; + + pages = nfs_scan_commit(inode, &head, file, start, count); + if (pages) + error = nfs_commit_list(&head, how); + if (error < 0) + return error; + return pages; +} - dprintk("NFS: %4d nfs_wback_result (%s/%s, status=%d, flags=%x)\n", - task->tk_pid, dentry->d_parent->d_name.name, - dentry->d_name.name, status, req->wb_flags); - - /* Set the WRITE_COMPLETE flag, but leave WRITE_INPROGRESS set */ - req->wb_flags |= NFS_WRITE_COMPLETE; - req->wb_status = status; - - if (status < 0) { - req->wb_flags |= NFS_WRITE_INVALIDATE; - file->f_error = status; - } else if (!WB_CANCELLED(req)) { - struct nfs_fattr *fattr = &req->wb_fattr; - /* Update attributes as result of writeback. - * Beware: when UDP replies arrive out of order, we - * may end up overwriting a previous, bigger file size. - * - * When the file size shrinks we cancel all pending - * writebacks. - */ - if (fattr->mtime.seconds >= inode->i_mtime) { - if (fattr->size < inode->i_size) - fattr->size = inode->i_size; - - /* possible Solaris 2.5 server bug workaround */ - if (inode->i_ino == fattr->fileid) { - /* - * We expect these values to change, and - * don't want to invalidate the caches. - */ - inode->i_size = fattr->size; - inode->i_mtime = fattr->mtime.seconds; - nfs_refresh_inode(inode, fattr); - } - else - printk("nfs_wback_result: inode %ld, got %u?\n", - inode->i_ino, fattr->fileid); - } +int nfs_commit_timeout(struct inode *inode, int how) +{ + LIST_HEAD(head); + int pages, + error = 0; + + pages = nfs_scan_commit_timeout(inode, &head); + if (pages) { + pages += nfs_scan_commit(inode, &head, NULL, 0, 0); + error = nfs_commit_list(&head, how); } + if (error < 0) + return error; + return pages; +} +#endif - rpc_release_task(task); +int nfs_sync_file(struct inode *inode, struct file *file, unsigned long start, + unsigned int count, int how) +{ + int error, + wait; - if (WB_INVALIDATE(req)) - ClearPageUptodate(page); + wait = how & FLUSH_WAIT; + how &= ~FLUSH_WAIT; - kunmap(page); - __free_page(page); - remove_write_request(&NFS_WRITEBACK(inode), req); - nr_write_requests--; - fput(req->wb_file); + if (!inode && file) + inode = file->f_dentry->d_inode; - wake_up(&req->wb_wait); - free_write_request(req); + do { + error = 0; + if (wait) + error = nfs_wait_on_requests(inode, file, start, count); + if (error == 0) + error = nfs_flush_file(inode, file, start, count, how); +#ifdef CONFIG_NFS_V3 + if (error == 0) + error = nfs_commit_file(inode, file, start, count, how); +#endif + } while (error > 0); + return error; +} + +int nfs_init_nfspagecache(void) +{ + nfs_page_cachep = kmem_cache_create("nfs_page", + sizeof(struct nfs_page), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (nfs_page_cachep == NULL) + return -ENOMEM; + + nfs_wdata_cachep = kmem_cache_create("nfs_write_data", + sizeof(struct nfs_write_data), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (nfs_wdata_cachep == NULL) + return -ENOMEM; + + return 0; +} + +void nfs_destroy_nfspagecache(void) +{ + if (kmem_cache_destroy(nfs_page_cachep)) + printk(KERN_INFO "nfs_page: not all structures were freed\n"); + if (kmem_cache_destroy(nfs_wdata_cachep)) + printk(KERN_INFO "nfs_write_data: not all structures were freed\n"); } + |