author    Ralf Baechle <ralf@linux-mips.org>    2000-03-27 23:54:12 +0000
committer Ralf Baechle <ralf@linux-mips.org>    2000-03-27 23:54:12 +0000
commit    d3e71cb08747743fce908122bab08b479eb403a5 (patch)
tree      cbec6948fdbdee9af81cf3ecfb504070d2745d7b /fs/nfs
parent    fe7ff1706e323d0e5ed83972960a1ecc1ee538b3 (diff)
Merge with Linux 2.3.99-pre3.
Diffstat (limited to 'fs/nfs')
-rw-r--r-- fs/nfs/Makefile  |    2
-rw-r--r-- fs/nfs/dir.c     |   62
-rw-r--r-- fs/nfs/file.c    |    4
-rw-r--r-- fs/nfs/flushd.c  |  304
-rw-r--r-- fs/nfs/inode.c   |   57
-rw-r--r-- fs/nfs/nfs2xdr.c |   49
-rw-r--r-- fs/nfs/nfsroot.c |    5
-rw-r--r-- fs/nfs/proc.c    |    8
-rw-r--r-- fs/nfs/read.c    |   10
-rw-r--r-- fs/nfs/write.c   | 1667
10 files changed, 1665 insertions, 503 deletions
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 3171e8adc..3c8aac510 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -9,7 +9,7 @@
O_TARGET := nfs.o
O_OBJS := inode.o file.o read.o write.o dir.o symlink.o proc.o \
- nfs2xdr.o
+ nfs2xdr.o flushd.o
ifdef CONFIG_ROOT_NFS
O_OBJS += nfsroot.o mount_clnt.o
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 37b2b682b..3ca240129 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -596,9 +596,12 @@ static int nfs_lookup_revalidate(struct dentry * dentry, int flags)
out_valid:
return 1;
out_bad:
- d_drop(dentry);
if (!list_empty(&dentry->d_subdirs))
shrink_dcache_parent(dentry);
+ /* If we have submounts, don't unhash ! */
+ if (have_submounts(dentry))
+ goto out_valid;
+ d_drop(dentry);
/* Purge readdir caches. */
if (dentry->d_parent->d_inode) {
nfs_zap_caches(dentry->d_parent->d_inode);
@@ -862,61 +865,6 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
return error;
}
-
-/* Note: we copy the code from lookup_dentry() here, only: we have to
- * omit the directory lock. We are already the owner of the lock when
- * we reach here. And "down(&dir->i_sem)" would make us sleep forever
- * ('cause WE have the lock)
- *
- * VERY IMPORTANT: calculate the hash for this dentry!!!!!!!!
- * Otherwise the cached lookup DEFINITELY WILL fail. And a new dentry
- * is created. Without the DCACHE_NFSFS_RENAMED flag. And with d_count
- * == 1. And trouble.
- *
- * Concerning my choice of the temp name: it is just nice to have
- * i_ino part of the temp name, as this offers another check whether
- * somebody attempts to remove the "silly renamed" dentry itself.
- * Which is something that I consider evil. Your opinion may vary.
- * BUT:
- * Now that I compute the hash value right, it should be possible to simply
- * check for the DCACHE_NFSFS_RENAMED flag in dentry->d_flag instead of
- * doing the string compare.
- * WHICH MEANS:
- * This offers the opportunity to shorten the temp name. Currently, I use
- * the hex representation of i_ino + an event counter. This sums up to
- * as much as 36 characters for a 64 bit machine, and needs 20 chars on
- * a 32 bit machine.
- * QUINTESSENCE
- * The use of i_ino is simply cosmetic. All we need is a unique temp
- * file name for the .nfs files. The event counter seemed to be adequate.
- * And as we retry in case such a file already exists, we are guaranteed
- * to succeed.
- */
-
-static
-struct dentry *nfs_silly_lookup(struct dentry *parent, char *silly, int slen)
-{
- struct qstr sqstr;
- struct dentry *sdentry;
- struct dentry *res;
-
- sqstr.name = silly;
- sqstr.len = slen;
- sqstr.hash = full_name_hash(silly, slen);
- sdentry = d_lookup(parent, &sqstr);
- if (!sdentry) {
- sdentry = d_alloc(parent, &sqstr);
- if (sdentry == NULL)
- return ERR_PTR(-ENOMEM);
- res = nfs_lookup(parent->d_inode, sdentry);
- if (res) {
- dput(sdentry);
- return res;
- }
- }
- return sdentry;
-}
-
static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
{
static unsigned int sillycounter = 0;
@@ -966,7 +914,7 @@ dentry->d_parent->d_name.name, dentry->d_name.name);
dfprintk(VFS, "trying to rename %s to %s\n",
dentry->d_name.name, silly);
- sdentry = nfs_silly_lookup(dentry->d_parent, silly, slen);
+ sdentry = lookup_one(silly, dget(dentry->d_parent));
/*
* N.B. Better to return EBUSY here ... it could be
* dangerous to delete the file while it's in use.
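For illustration only (not part of the patch): the removed comment above describes the temporary-name scheme that nfs_sillyrename still uses, the hex inode number plus an event counter. A minimal userspace sketch of that retry loop, where the exact format string and the name_exists() helper are invented stand-ins for the kernel's dcache lookup:

#include <stdio.h>
#include <string.h>

/* Stand-in for the dcache/lookup collision check. */
static int name_exists(const char *name)
{
	static const char *taken = ".nfs0000beef.0000";
	return strcmp(name, taken) == 0;
}

int main(void)
{
	unsigned long i_ino = 0xbeef;	/* inode number of the busy file */
	unsigned int counter = 0;	/* the "sillycounter" event counter */
	char silly[32];

	do {
		snprintf(silly, sizeof(silly), ".nfs%08lx.%04x",
			 i_ino, counter++);
	} while (name_exists(silly));	/* retry until the name is unique */

	printf("rename to %s\n", silly);
	return 0;
}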
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 9a91bb1ab..32d290c73 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -144,10 +144,10 @@ nfs_fsync(struct file *file, struct dentry *dentry)
* If the writer ends up delaying the write, the writer needs to
* increment the page use counts until he is done with the page.
*/
-static int nfs_prepare_write(struct page *page, unsigned offset, unsigned to)
+static int nfs_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
{
kmap(page);
- return 0;
+ return nfs_flush_incompatible(file, page);
}
static int nfs_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to)
{
diff --git a/fs/nfs/flushd.c b/fs/nfs/flushd.c
new file mode 100644
index 000000000..d36c3a9ae
--- /dev/null
+++ b/fs/nfs/flushd.c
@@ -0,0 +1,304 @@
+/*
+ * linux/fs/nfs/flushd.c
+ *
+ * For each NFS mount, there is a separate cache object that contains
+ * a hash table of all clusters. With this cache, an async RPC task
+ * (`flushd') is associated, which wakes up occasionally to inspect
+ * its list of dirty buffers.
+ * (Note that RPC tasks aren't kernel threads. Take a look at the
+ * rpciod code to understand what they are).
+ *
+ * Inside the cache object, we also maintain a count of the current number
+ * of dirty pages, which may not exceed a certain threshold.
+ * (FIXME: This threshold should be configurable).
+ *
+ * The code is streamlined for what I think is the prevalent case for
+ * NFS traffic, which is sequential write access without concurrent
+ * access by different processes.
+ *
+ * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de>
+ *
+ * Rewritten 6/3/2000 by Trond Myklebust
+ * Copyright (C) 1999, 2000, Trond Myklebust <trond.myklebust@fys.uio.no>
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/malloc.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+
+#include <linux/sched.h>
+
+#include <linux/sunrpc/auth.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/sched.h>
+
+#include <linux/spinlock.h>
+
+#include <linux/nfs.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/nfs_flushd.h>
+#include <linux/nfs_mount.h>
+
+/*
+ * Various constants
+ */
+#define NFSDBG_FACILITY NFSDBG_PAGECACHE
+
+/*
+ * This is the wait queue all cluster daemons sleep on
+ */
+static struct rpc_wait_queue flushd_queue = RPC_INIT_WAITQ("nfs_flushd");
+
+/*
+ * Spinlock
+ */
+spinlock_t nfs_flushd_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Local function declarations.
+ */
+static void nfs_flushd(struct rpc_task *);
+static void nfs_flushd_exit(struct rpc_task *);
+
+
+int nfs_reqlist_init(struct nfs_server *server)
+{
+ struct nfs_reqlist *cache;
+ struct rpc_task *task;
+ int status = 0;
+
+ dprintk("NFS: writecache_init\n");
+ spin_lock(&nfs_flushd_lock);
+ cache = server->rw_requests;
+
+ if (cache->task)
+ goto out_unlock;
+
+ /* Create the RPC task */
+ status = -ENOMEM;
+ task = rpc_new_task(server->client, NULL, RPC_TASK_ASYNC);
+ if (!task)
+ goto out_unlock;
+
+ task->tk_calldata = server;
+
+ cache->task = task;
+
+ /* Run the task */
+ cache->runat = jiffies;
+
+ cache->auth = server->client->cl_auth;
+ task->tk_action = nfs_flushd;
+ task->tk_exit = nfs_flushd_exit;
+
+ spin_unlock(&nfs_flushd_lock);
+ rpc_execute(task);
+ return 0;
+ out_unlock:
+ spin_unlock(&nfs_flushd_lock);
+ return status;
+}
+
+void nfs_reqlist_exit(struct nfs_server *server)
+{
+ struct nfs_reqlist *cache;
+
+ cache = server->rw_requests;
+ if (!cache)
+ return;
+
+ dprintk("NFS: reqlist_exit (ptr %p rpc %p)\n", cache, cache->task);
+ while (cache->task || cache->inodes) {
+ spin_lock(&nfs_flushd_lock);
+ if (!cache->task) {
+ spin_unlock(&nfs_flushd_lock);
+ nfs_reqlist_init(server);
+ } else {
+ cache->task->tk_status = -ENOMEM;
+ rpc_wake_up_task(cache->task);
+ spin_unlock(&nfs_flushd_lock);
+ }
+ interruptible_sleep_on_timeout(&cache->request_wait, 1 * HZ);
+ }
+}
+
+int nfs_reqlist_alloc(struct nfs_server *server)
+{
+ struct nfs_reqlist *cache;
+ if (server->rw_requests)
+ return 0;
+
+ cache = (struct nfs_reqlist *)kmalloc(sizeof(*cache), GFP_KERNEL);
+ if (!cache)
+ return -ENOMEM;
+
+ memset(cache, 0, sizeof(*cache));
+ init_waitqueue_head(&cache->request_wait);
+ server->rw_requests = cache;
+
+ return 0;
+}
+
+void nfs_reqlist_free(struct nfs_server *server)
+{
+ if (server->rw_requests) {
+ kfree(server->rw_requests);
+ server->rw_requests = NULL;
+ }
+}
+
+void nfs_wake_flushd()
+{
+ rpc_wake_up_status(&flushd_queue, -ENOMEM);
+}
+
+static void inode_append_flushd(struct inode *inode)
+{
+ struct nfs_reqlist *cache = NFS_REQUESTLIST(inode);
+ struct inode **q;
+
+ spin_lock(&nfs_flushd_lock);
+ if (NFS_FLAGS(inode) & NFS_INO_FLUSH)
+ goto out;
+ inode->u.nfs_i.hash_next = NULL;
+
+ q = &cache->inodes;
+ while (*q)
+ q = &(*q)->u.nfs_i.hash_next;
+ *q = inode;
+
+ /* Note: we increase the inode i_count in order to prevent
+ * it from disappearing when on the flush list
+ */
+ NFS_FLAGS(inode) |= NFS_INO_FLUSH;
+ inode->i_count++;
+ out:
+ spin_unlock(&nfs_flushd_lock);
+}
+
+void inode_remove_flushd(struct inode *inode)
+{
+ struct nfs_reqlist *cache = NFS_REQUESTLIST(inode);
+ struct inode **q;
+
+ spin_lock(&nfs_flushd_lock);
+ if (!(NFS_FLAGS(inode) & NFS_INO_FLUSH))
+ goto out;
+
+ q = &cache->inodes;
+ while (*q && *q != inode)
+ q = &(*q)->u.nfs_i.hash_next;
+ if (*q) {
+ *q = inode->u.nfs_i.hash_next;
+ NFS_FLAGS(inode) &= ~NFS_INO_FLUSH;
+ iput(inode);
+ }
+ out:
+ spin_unlock(&nfs_flushd_lock);
+}
+
+void inode_schedule_scan(struct inode *inode, unsigned long time)
+{
+ struct nfs_reqlist *cache = NFS_REQUESTLIST(inode);
+ struct rpc_task *task;
+ unsigned long mintimeout;
+
+ if (time_after(NFS_NEXTSCAN(inode), time))
+ NFS_NEXTSCAN(inode) = time;
+ mintimeout = jiffies + 1 * HZ;
+ if (time_before(mintimeout, NFS_NEXTSCAN(inode)))
+ mintimeout = NFS_NEXTSCAN(inode);
+ inode_append_flushd(inode);
+
+ spin_lock(&nfs_flushd_lock);
+ task = cache->task;
+ if (!task) {
+ spin_unlock(&nfs_flushd_lock);
+ nfs_reqlist_init(NFS_SERVER(inode));
+ } else {
+ if (time_after(cache->runat, mintimeout))
+ rpc_wake_up_task(task);
+ spin_unlock(&nfs_flushd_lock);
+ }
+}
+
+
+static void
+nfs_flushd(struct rpc_task *task)
+{
+ struct nfs_server *server;
+ struct nfs_reqlist *cache;
+ struct inode *inode, *next;
+ unsigned long delay = jiffies + NFS_WRITEBACK_LOCKDELAY;
+ int flush = (task->tk_status == -ENOMEM);
+
+ dprintk("NFS: %4d flushd starting\n", task->tk_pid);
+ server = (struct nfs_server *) task->tk_calldata;
+ cache = server->rw_requests;
+
+ spin_lock(&nfs_flushd_lock);
+ next = cache->inodes;
+ cache->inodes = NULL;
+ spin_unlock(&nfs_flushd_lock);
+
+ while ((inode = next) != NULL) {
+ next = next->u.nfs_i.hash_next;
+ inode->u.nfs_i.hash_next = NULL;
+ NFS_FLAGS(inode) &= ~NFS_INO_FLUSH;
+
+ if (flush) {
+ nfs_sync_file(inode, NULL, 0, 0, FLUSH_AGING);
+ } else if (time_after(jiffies, NFS_NEXTSCAN(inode))) {
+ NFS_NEXTSCAN(inode) = jiffies + NFS_WRITEBACK_LOCKDELAY;
+ nfs_flush_timeout(inode, FLUSH_AGING);
+#ifdef CONFIG_NFS_V3
+ nfs_commit_timeout(inode, FLUSH_AGING);
+#endif
+ }
+
+ if (nfs_have_writebacks(inode)) {
+ inode_append_flushd(inode);
+ if (time_after(delay, NFS_NEXTSCAN(inode)))
+ delay = NFS_NEXTSCAN(inode);
+ }
+ iput(inode);
+ }
+
+ dprintk("NFS: %4d flushd back to sleep\n", task->tk_pid);
+ if (time_after(jiffies + 1 * HZ, delay))
+ delay = 1 * HZ;
+ else
+ delay = delay - jiffies;
+ task->tk_status = 0;
+ task->tk_action = nfs_flushd;
+ task->tk_timeout = delay;
+ cache->runat = jiffies + task->tk_timeout;
+
+ spin_lock(&nfs_flushd_lock);
+ if (!cache->nr_requests && !cache->inodes) {
+ cache->task = NULL;
+ task->tk_action = NULL;
+ } else
+ rpc_sleep_on(&flushd_queue, task, NULL, NULL);
+ spin_unlock(&nfs_flushd_lock);
+}
+
+static void
+nfs_flushd_exit(struct rpc_task *task)
+{
+ struct nfs_server *server;
+ struct nfs_reqlist *cache;
+ server = (struct nfs_server *) task->tk_calldata;
+ cache = server->rw_requests;
+
+ spin_lock(&nfs_flushd_lock);
+ if (cache->task == task)
+ cache->task = NULL;
+ spin_unlock(&nfs_flushd_lock);
+ wake_up(&cache->request_wait);
+ rpc_release_task(task);
+}
+
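To summarize the scan loop above: each flushd pass detaches the whole inode list, flushes inodes whose scan time has passed, re-queues those that still have writebacks, and sleeps for the shortest remaining delay, but at least one tick. A compact userspace model under those assumptions (inode_model and flushd_pass are invented, and 60 ticks stands in for NFS_WRITEBACK_LOCKDELAY):

#include <stdio.h>

struct inode_model {
	int id;
	unsigned long nextscan;		/* when this inode is due for a flush */
	int dirty;			/* still has writebacks pending? */
	struct inode_model *next;	/* models the hash_next chain */
};

/* One flushd pass: returns how long to sleep before the next pass. */
static unsigned long flushd_pass(struct inode_model **list, unsigned long now)
{
	unsigned long delay = now + 60;		/* LOCKDELAY stand-in */
	struct inode_model *in = *list, *next;

	*list = NULL;				/* detach the whole list */
	while (in) {
		next = in->next;
		in->next = NULL;
		if (now >= in->nextscan) {
			printf("flushing inode %d\n", in->id);
			in->dirty = 0;		/* pretend the flush finished */
		}
		if (in->dirty) {		/* still dirty: re-queue it */
			in->next = *list;
			*list = in;
			if (in->nextscan < delay)
				delay = in->nextscan;
		}
		in = next;
	}
	return delay > now + 1 ? delay - now : 1;	/* sleep >= one tick */
}

int main(void)
{
	struct inode_model b = { 2, 75, 1, NULL };
	struct inode_model a = { 1, 10, 1, &b };
	struct inode_model *list = &a;

	printf("sleep %lu ticks\n", flushd_pass(&list, 50));
	return 0;
}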
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 441d62edc..ca7e1b944 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -27,6 +27,7 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/stats.h>
#include <linux/nfs_fs.h>
+#include <linux/nfs_flushd.h>
#include <linux/lockd/bind.h>
#include <linux/smp_lock.h>
@@ -74,6 +75,12 @@ nfs_read_inode(struct inode * inode)
inode->i_rdev = 0;
NFS_FILEID(inode) = 0;
NFS_FSID(inode) = 0;
+ INIT_LIST_HEAD(&inode->u.nfs_i.dirty);
+ INIT_LIST_HEAD(&inode->u.nfs_i.commit);
+ INIT_LIST_HEAD(&inode->u.nfs_i.writeback);
+ inode->u.nfs_i.ndirty = 0;
+ inode->u.nfs_i.ncommit = 0;
+ inode->u.nfs_i.npages = 0;
NFS_CACHEINV(inode);
NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
}
@@ -92,8 +99,6 @@ nfs_put_inode(struct inode * inode)
static void
nfs_delete_inode(struct inode * inode)
{
- int failed;
-
dprintk("NFS: delete_inode(%x/%ld)\n", inode->i_dev, inode->i_ino);
lock_kernel();
@@ -101,29 +106,12 @@ nfs_delete_inode(struct inode * inode)
nfs_free_dircache(inode);
} else {
/*
- * Flush out any pending write requests ...
+ * The following can never actually happen...
*/
- if (NFS_WRITEBACK(inode) != NULL) {
- unsigned long timeout = jiffies + 5*HZ;
-#ifdef NFS_DEBUG_VERBOSE
-printk("nfs_delete_inode: inode %ld has pending RPC requests\n", inode->i_ino);
-#endif
- nfs_inval(inode);
- while (NFS_WRITEBACK(inode) != NULL &&
- time_before(jiffies, timeout)) {
- current->state = TASK_INTERRUPTIBLE;
- schedule_timeout(HZ/10);
- }
- current->state = TASK_RUNNING;
- if (NFS_WRITEBACK(inode) != NULL)
- printk("NFS: Arghhh, stuck RPC requests!\n");
+ if (nfs_have_writebacks(inode)) {
+ printk(KERN_ERR "nfs_delete_inode: inode %ld has pending RPC requests\n", inode->i_ino);
}
}
-
- failed = nfs_check_failed_request(inode);
- if (failed)
- printk("NFS: inode %ld had %d failed requests\n",
- inode->i_ino, failed);
unlock_kernel();
clear_inode(inode);
@@ -135,9 +123,18 @@ nfs_put_super(struct super_block *sb)
struct nfs_server *server = &sb->u.nfs_sb.s_server;
struct rpc_clnt *rpc;
+ /*
+ * First get rid of the request flushing daemon.
+ * Relies on rpc_shutdown_client() waiting on all
+ * client tasks to finish.
+ */
+ nfs_reqlist_exit(server);
+
if ((rpc = server->client) != NULL)
rpc_shutdown_client(rpc);
+ nfs_reqlist_free(server);
+
if (!(server->flags & NFS_MOUNT_NONLM))
lockd_down(); /* release rpc.lockd */
rpciod_down(); /* release rpciod */
@@ -306,6 +303,12 @@ nfs_read_super(struct super_block *sb, void *raw_data, int silent)
sb->s_root->d_op = &nfs_dentry_operations;
sb->s_root->d_fsdata = root_fh;
+ /* Fire up the writeback cache */
+ if (nfs_reqlist_alloc(server) < 0) {
+ printk(KERN_NOTICE "NFS: cannot initialize writeback cache.\n");
+ goto failure_kill_reqlist;
+ }
+
/* We're airborne */
/* Check whether to start the lockd process */
@@ -314,6 +317,8 @@ nfs_read_super(struct super_block *sb, void *raw_data, int silent)
return sb;
/* Yargs. It didn't work out. */
+ failure_kill_reqlist:
+ nfs_reqlist_exit(server);
out_no_root:
printk("nfs_read_super: get root inode failed\n");
iput(root_inode);
@@ -342,6 +347,7 @@ out_no_xprt:
printk(KERN_WARNING "NFS: cannot create RPC transport.\n");
out_free_host:
+ nfs_reqlist_free(server);
kfree(server->hostname);
out_unlock:
goto out_fail;
@@ -440,7 +446,6 @@ nfs_invalidate_inode(struct inode *inode)
make_bad_inode(inode);
inode->i_mode = save_mode;
- nfs_inval(inode);
nfs_zap_caches(inode);
}
@@ -864,7 +869,7 @@ nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
* to look at the size or the mtime the server sends us
* too closely, as we're in the middle of modifying them.
*/
- if (NFS_WRITEBACK(inode))
+ if (nfs_have_writebacks(inode))
goto out;
if (inode->i_size != fattr->size) {
@@ -925,7 +930,7 @@ printk("nfs_refresh_inode: invalidating %ld pages\n", inode->i_nrpages);
static DECLARE_FSTYPE(nfs_fs_type, "nfs", nfs_read_super, 0);
extern int nfs_init_fhcache(void);
-extern int nfs_init_wreqcache(void);
+extern int nfs_init_nfspagecache(void);
/*
* Initialize NFS
@@ -939,7 +944,7 @@ init_nfs_fs(void)
if (err)
return err;
- err = nfs_init_wreqcache();
+ err = nfs_init_nfspagecache();
if (err)
return err;
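The comment in nfs_put_super() above points at an ordering constraint: stop the flush daemon first, because rpc_shutdown_client() waits for all client tasks (including flushd's), and free the request list only after the client is gone. A trivial sketch that just records that order (all three functions are empty stand-ins):

#include <stdio.h>

static void reqlist_exit(void)    { puts("1. stop flush daemon"); }
static void client_shutdown(void) { puts("2. shut down RPC client (waits for its tasks)"); }
static void reqlist_free(void)    { puts("3. free request-list structure"); }

int main(void)
{
	/* Freeing the request list first would leave the daemon's RPC
	 * task pointing at freed memory; shutting the client down first
	 * would leave flushd with no way to run to completion. */
	reqlist_exit();
	client_shutdown();
	reqlist_free();
	return 0;
}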
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index a7e53e6db..5ad2aaa67 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -63,6 +63,7 @@ static int nfs_stat_to_errno(int stat);
#define NFS_diropres_sz 1+NFS_fhandle_sz+NFS_fattr_sz
#define NFS_readlinkres_sz 1
#define NFS_readres_sz 1+NFS_fattr_sz+1
+#define NFS_writeres_sz NFS_attrstat_sz
#define NFS_stat_sz 1
#define NFS_readdirres_sz 1
#define NFS_statfsres_sz 1+NFS_info_sz
@@ -273,6 +274,7 @@ nfs_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res)
static int
nfs_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
{
+ unsigned int nr;
u32 count = args->count;
p = xdr_encode_fhandle(p, args->fh);
@@ -282,28 +284,35 @@ nfs_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
*p++ = htonl(count);
req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
- req->rq_svec[1].iov_base = (void *) args->buffer;
- req->rq_svec[1].iov_len = count;
- req->rq_slen += count;
- req->rq_snr = 2;
+ /* Get the number of buffers in the send iovec */
+ nr = args->nriov;
+
+ if (nr+2 > MAX_IOVEC) {
+ printk(KERN_ERR "NFS: Bad number of iov's in xdr_writeargs "
+ "(nr %d max %d)\n", nr, MAX_IOVEC);
+ return -EINVAL;
+ }
+
+ /* Copy the iovec */
+ memcpy(req->rq_svec + 1, args->iov, nr * sizeof(struct iovec));
#ifdef NFS_PAD_WRITES
/*
* Some old servers require that the message length
* be a multiple of 4, so we pad it here if needed.
*/
- count = ((count + 3) & ~3) - count;
- if (count) {
-#if 0
-printk("nfs_writeargs: padding write, len=%d, slen=%d, pad=%d\n",
-req->rq_svec[1].iov_len, req->rq_slen, count);
-#endif
- req->rq_svec[2].iov_base = (void *) "\0\0\0";
- req->rq_svec[2].iov_len = count;
- req->rq_slen += count;
- req->rq_snr = 3;
+ if (count & 3) {
+ struct iovec *iov = req->rq_svec + nr + 1;
+ int pad = 4 - (count & 3);
+
+ iov->iov_base = (void *) "\0\0\0";
+ iov->iov_len = pad;
+ count += pad;
+ nr++;
}
#endif
+ req->rq_slen += count;
+ req->rq_snr += nr;
return 0;
}
@@ -593,6 +602,16 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, u32 *p, void *dummy)
}
/*
+ * Decode WRITE reply
+ */
+static int
+nfs_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
+{
+ res->verf->committed = NFS_FILE_SYNC;
+ return nfs_xdr_attrstat(req, p, res->fattr);
+}
+
+/*
* Decode STATFS reply
*/
static int
@@ -678,7 +697,7 @@ static struct rpc_procinfo nfs_procedures[18] = {
PROC(readlink, readlinkargs, readlinkres),
PROC(read, readargs, readres),
PROC(writecache, enc_void, dec_void),
- PROC(write, writeargs, attrstat),
+ PROC(write, writeargs, writeres),
PROC(create, createargs, diropres),
PROC(remove, diropargs, stat),
PROC(rename, renameargs, stat),
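The reworked NFS_PAD_WRITES branch rounds a write up to a 4-byte boundary by appending one extra iovec that points at a static run of zero bytes. A standalone sketch of the same calculation (MAX_IOVEC and pad_write() are simplified stand-ins for the RPC request fields):

#include <stdio.h>
#include <sys/uio.h>

#define MAX_IOVEC 10

/* Append a 1-3 byte zero pad if 'count' is not a multiple of 4.
 * Returns the padded length; *nr is bumped if a pad iovec was added. */
static unsigned int pad_write(struct iovec *iov, unsigned int *nr,
			      unsigned int count)
{
	static char zeroes[3];

	if (count & 3) {
		unsigned int pad = 4 - (count & 3);

		iov[*nr].iov_base = zeroes;
		iov[*nr].iov_len = pad;
		(*nr)++;
		count += pad;
	}
	return count;
}

int main(void)
{
	struct iovec iov[MAX_IOVEC];
	static char data[5] = "abcd";	/* 5 bytes incl. NUL: needs 3 pad */
	unsigned int nr = 0;
	unsigned int padded;

	iov[nr].iov_base = data;
	iov[nr].iov_len = sizeof(data);
	nr++;

	padded = pad_write(iov, &nr, sizeof(data));
	printf("padded %zu -> %u bytes in %u iovecs\n",
	       sizeof(data), padded, nr);
	return 0;
}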
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index d9a423f16..a592608be 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -145,6 +145,8 @@ static struct nfs_bool_opts {
{ "nocto", ~NFS_MOUNT_NOCTO, NFS_MOUNT_NOCTO },
{ "ac", ~NFS_MOUNT_NOAC, 0 },
{ "noac", ~NFS_MOUNT_NOAC, NFS_MOUNT_NOAC },
+ { "lock", ~NFS_MOUNT_NONLM, 0 },
+ { "nolock", ~NFS_MOUNT_NONLM, NFS_MOUNT_NONLM },
{ NULL, 0, 0 }
};
@@ -320,7 +322,7 @@ int __init root_nfs_init(void)
* Parse NFS server and directory information passed on the kernel
* command line.
*/
-void __init nfs_root_setup(char *line)
+int __init nfs_root_setup(char *line)
{
ROOT_DEV = MKDEV(UNNAMED_MAJOR, 255);
if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) {
@@ -333,6 +335,7 @@ void __init nfs_root_setup(char *line)
sprintf(nfs_root_name, NFS_ROOT, line);
}
root_nfs_parse_addr(nfs_root_name);
+ return 1;
}
__setup("nfsroot=", nfs_root_setup);
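The nfs_bool_opts table that gains the lock/nolock rows works by AND-ing out the affected flag and OR-ing in the new value, so each boolean option is one row over the same bit. A userspace sketch of that table-driven parsing; the 0x0200 flag value is illustrative, not taken from the patch:

#include <stdio.h>
#include <string.h>

#define NFS_MOUNT_NONLM	0x0200	/* illustrative bit value */

struct bool_opt {
	const char *name;
	unsigned int and_mask;	/* clears the affected bit(s) */
	unsigned int or_mask;	/* sets the new value */
};

static const struct bool_opt opts[] = {
	{ "lock",   ~NFS_MOUNT_NONLM, 0 },
	{ "nolock", ~NFS_MOUNT_NONLM, NFS_MOUNT_NONLM },
	{ NULL, 0, 0 }
};

static unsigned int apply_opt(unsigned int flags, const char *name)
{
	const struct bool_opt *p;

	for (p = opts; p->name; p++)
		if (!strcmp(p->name, name))
			return (flags & p->and_mask) | p->or_mask;
	return flags;	/* unknown option: leave flags alone */
}

int main(void)
{
	unsigned int flags = 0;

	flags = apply_opt(flags, "nolock");
	printf("flags after nolock: 0x%04x\n", flags);
	flags = apply_opt(flags, "lock");
	printf("flags after lock:   0x%04x\n", flags);
	return 0;
}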
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index bb55ce6d6..3823c3118 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -111,11 +111,15 @@ nfs_proc_write(struct nfs_server *server, struct nfs_fh *fhandle, int swap,
unsigned long offset, unsigned int count,
const void *buffer, struct nfs_fattr *fattr)
{
- struct nfs_writeargs arg = { fhandle, offset, count, buffer };
+ struct nfs_writeargs arg = { fhandle, offset, count, 1, 1,
+ {{(void *) buffer, count}, {0,0}, {0,0}, {0,0},
+ {0,0}, {0,0}, {0,0}, {0,0}}};
+ struct nfs_writeverf verf;
+ struct nfs_writeres res = {fattr, &verf, count};
int status;
dprintk("NFS call write %d @ %ld\n", count, offset);
- status = rpc_call(server->client, NFSPROC_WRITE, &arg, fattr,
+ status = rpc_call(server->client, NFSPROC_WRITE, &arg, &res,
swap? (RPC_TASK_SWAPPER|RPC_TASK_ROOTCREDS) : 0);
dprintk("NFS reply read: %d\n", status);
return status < 0? status : count;
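The positional initializer above fills the reworked nfs_writeargs: one valid entry in a fixed eight-slot iovec array, with nriov recording how many slots are in use. A loose model of that shape and of how a consumer walks it (writeargs_model is invented; only the offset/count/nriov/iov fields mirror the patch):

#include <stdio.h>
#include <sys/uio.h>

struct writeargs_model {
	unsigned long offset;	/* file offset of the write */
	unsigned int count;	/* total bytes */
	unsigned int nriov;	/* how many iov slots are valid */
	struct iovec iov[8];	/* fixed-size scatter list */
};

int main(void)
{
	static char buffer[512];
	struct writeargs_model arg = {
		0, sizeof(buffer), 1,
		{ { buffer, sizeof(buffer) } }	/* remaining slots zeroed */
	};
	unsigned int i, total = 0;

	/* A consumer (the RPC send path) only looks at iov[0..nriov). */
	for (i = 0; i < arg.nriov; i++)
		total += arg.iov[i].iov_len;
	printf("sending %u of %u bytes\n", total, arg.count);
	return 0;
}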
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 717d12bbb..aa17780e5 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -171,6 +171,7 @@ static inline int
nfs_readpage_async(struct dentry *dentry, struct inode *inode,
struct page *page)
{
+ struct rpc_message msg;
unsigned long address;
struct nfs_rreq *req;
int result = -1, flags;
@@ -195,8 +196,13 @@ nfs_readpage_async(struct dentry *dentry, struct inode *inode,
/* Start the async call */
dprintk("NFS: executing async READ request.\n");
- result = rpc_do_call(NFS_CLIENT(inode), NFSPROC_READ,
- &req->ra_args, &req->ra_res, flags,
+
+ msg.rpc_proc = NFSPROC_READ;
+ msg.rpc_argp = &req->ra_args;
+ msg.rpc_resp = &req->ra_res;
+ msg.rpc_cred = NULL;
+
+ result = rpc_call_async(NFS_CLIENT(inode), &msg, flags,
nfs_readpage_result, req);
if (result < 0)
goto out_free;
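Replacing rpc_do_call() with rpc_call_async() bundles the procedure number, argument, result, and credential pointers into one rpc_message and leaves completion to a callback. A tiny model of that calling convention, with invented names throughout and completion simulated inline:

#include <stdio.h>

struct msg_model {
	int proc;	/* procedure number, e.g. READ */
	void *argp;	/* argument struct */
	void *resp;	/* result struct */
	void *cred;	/* NULL: use default credentials */
};

typedef void (*done_fn)(void *calldata, int status);

/* Models rpc_call_async(): queue the call, return at once, and let the
 * callback run on completion.  Here completion is simulated inline. */
static int call_async(const struct msg_model *msg, done_fn done, void *calldata)
{
	printf("dispatching proc %d\n", msg->proc);
	done(calldata, 0);	/* pretend the reply arrived */
	return 0;
}

static void read_done(void *calldata, int status)
{
	printf("read request %p finished, status %d\n", calldata, status);
}

int main(void)
{
	int args = 0, res = 0;
	struct msg_model msg = { 6 /* READ */, &args, &res, NULL };

	return call_async(&msg, read_done, &msg);
}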
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5f847bec8..af023a121 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -46,6 +46,7 @@
* Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de>
*/
+#include <linux/config.h>
#include <linux/types.h>
#include <linux/malloc.h>
#include <linux/swap.h>
@@ -54,33 +55,126 @@
#include <linux/sunrpc/clnt.h>
#include <linux/nfs_fs.h>
+#include <linux/nfs_flushd.h>
#include <asm/uaccess.h>
#include <linux/smp_lock.h>
#define NFS_PARANOIA 1
#define NFSDBG_FACILITY NFSDBG_PAGECACHE
-static void nfs_wback_begin(struct rpc_task *task);
-static void nfs_wback_result(struct rpc_task *task);
-static void nfs_cancel_request(struct nfs_wreq *req);
+/*
+ * Spinlock
+ */
+spinlock_t nfs_wreq_lock = SPIN_LOCK_UNLOCKED;
+static unsigned int nfs_nr_requests = 0;
/*
- * Cache parameters
+ * Local structures
+ *
+ * Valid flags for a dirty buffer
*/
-#define NFS_WRITEBACK_DELAY (10 * HZ)
-#define NFS_WRITEBACK_MAX 64
+#define PG_BUSY 0x0001
/*
- * Limit number of delayed writes
+ * This is the struct where the WRITE/COMMIT arguments go.
*/
-static int nr_write_requests = 0;
-static struct rpc_wait_queue write_queue = RPC_INIT_WAITQ("write_chain");
+struct nfs_write_data {
+ struct rpc_task task;
+ struct file *file;
+ struct rpc_cred *cred;
+ struct nfs_writeargs args; /* argument struct */
+ struct nfs_writeres res; /* result struct */
+ struct nfs_fattr fattr;
+ struct nfs_writeverf verf;
+ struct list_head pages; /* Coalesced requests we wish to flush */
+};
+
+struct nfs_page {
+ struct list_head wb_hash, /* Inode */
+ wb_list,
+ *wb_list_head;
+ struct file *wb_file;
+ struct rpc_cred *wb_cred;
+ struct page *wb_page; /* page to write out */
+ wait_queue_head_t wb_wait; /* wait queue */
+ unsigned long wb_timeout; /* when to write/commit */
+ unsigned int wb_offset, /* Offset of write */
+ wb_bytes, /* Length of request */
+ wb_count, /* reference count */
+ wb_flags;
+ struct nfs_writeverf wb_verf; /* Commit cookie */
+};
+
+#define NFS_WBACK_BUSY(req) ((req)->wb_flags & PG_BUSY)
+
+/*
+ * Local function declarations
+ */
+static void nfs_writeback_done(struct rpc_task *);
+#ifdef CONFIG_NFS_V3
+static void nfs_commit_done(struct rpc_task *);
+#endif
/* Hack for future NFS swap support */
#ifndef IS_SWAPFILE
# define IS_SWAPFILE(inode) (0)
#endif
+static kmem_cache_t *nfs_page_cachep = NULL;
+static kmem_cache_t *nfs_wdata_cachep = NULL;
+
+static __inline__ struct nfs_page *nfs_page_alloc(void)
+{
+ struct nfs_page *p;
+ p = kmem_cache_alloc(nfs_page_cachep, SLAB_KERNEL);
+ if (p) {
+ memset(p, 0, sizeof(*p));
+ INIT_LIST_HEAD(&p->wb_hash);
+ INIT_LIST_HEAD(&p->wb_list);
+ init_waitqueue_head(&p->wb_wait);
+ }
+ return p;
+}
+
+static __inline__ void nfs_page_free(struct nfs_page *p)
+{
+ kmem_cache_free(nfs_page_cachep, p);
+}
+
+static __inline__ struct nfs_write_data *nfs_writedata_alloc(void)
+{
+ struct nfs_write_data *p;
+ p = kmem_cache_alloc(nfs_wdata_cachep, SLAB_NFS);
+ if (p) {
+ memset(p, 0, sizeof(*p));
+ INIT_LIST_HEAD(&p->pages);
+ }
+ return p;
+}
+
+static __inline__ void nfs_writedata_free(struct nfs_write_data *p)
+{
+ kmem_cache_free(nfs_wdata_cachep, p);
+}
+
+static void nfs_writedata_release(struct rpc_task *task)
+{
+ struct nfs_write_data *wdata = (struct nfs_write_data *)task->tk_calldata;
+ rpc_release_task(task);
+ nfs_writedata_free(wdata);
+}
+
+/*
+ * This function will be used to simulate weak cache consistency
+ * under NFSv2 when the NFSv3 attribute patch is included.
+ * For the moment, we just call nfs_refresh_inode().
+ */
+static __inline__ int
+nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr)
+{
+ return nfs_refresh_inode(inode, fattr);
+}
+
/*
* Write a page synchronously.
* Offset is the data offset within the page.
@@ -161,278 +255,770 @@ io_error:
}
/*
- * Append a writeback request to a list
+ * Write a page to the server. This was supposed to be used for
+ * NFS swapping only.
+ * FIXME: Using this for mmap is pointless, breaks asynchronous
+ * writebacks, and is extremely slow.
*/
-static inline void
-append_write_request(struct nfs_wreq **q, struct nfs_wreq *wreq)
+int
+nfs_writepage(struct dentry * dentry, struct page *page)
{
- dprintk("NFS: append_write_request(%p, %p)\n", q, wreq);
- rpc_append_list(q, wreq);
+ struct inode *inode = dentry->d_inode;
+ unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ unsigned offset = PAGE_CACHE_SIZE;
+ int err;
+
+ /* easy case */
+ if (page->index < end_index)
+ goto do_it;
+ /* things got complicated... */
+ offset = inode->i_size & (PAGE_CACHE_SIZE-1);
+ /* OK, are we completely out? */
+ if (page->index >= end_index+1 || !offset)
+ return -EIO;
+do_it:
+ err = nfs_writepage_sync(dentry, inode, page, 0, offset);
+ if ( err == offset) return 0;
+ return err;
+}
+
+/*
+ * Check whether the file range we want to write to is locked by
+ * us.
+ */
+static int
+region_locked(struct inode *inode, struct nfs_page *req)
+{
+ struct file_lock *fl;
+ unsigned long rqstart, rqend;
+
+ /* Don't optimize writes if we don't use NLM */
+ if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)
+ return 0;
+
+ rqstart = page_offset(req->wb_page) + req->wb_offset;
+ rqend = rqstart + req->wb_bytes;
+ for (fl = inode->i_flock; fl; fl = fl->fl_next) {
+ if (fl->fl_owner == current->files && (fl->fl_flags & FL_POSIX)
+ && fl->fl_type == F_WRLCK
+ && fl->fl_start <= rqstart && rqend <= fl->fl_end) {
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static inline struct nfs_page *
+nfs_inode_wb_entry(struct list_head *head)
+{
+ return list_entry(head, struct nfs_page, wb_hash);
}
/*
- * Remove a writeback request from a list
+ * Insert a write request into an inode
*/
static inline void
-remove_write_request(struct nfs_wreq **q, struct nfs_wreq *wreq)
+nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
{
- dprintk("NFS: remove_write_request(%p, %p)\n", q, wreq);
- rpc_remove_list(q, wreq);
+ if (!list_empty(&req->wb_hash))
+ return;
+ if (!NFS_WBACK_BUSY(req))
+ printk(KERN_ERR "NFS: unlocked request attempted hashed!\n");
+ inode->u.nfs_i.npages++;
+ list_add(&req->wb_hash, &inode->u.nfs_i.writeback);
+ req->wb_count++;
}
/*
- * Find a non-busy write request for a given page to
- * try to combine with.
+ * Insert a write request into an inode
*/
-static inline struct nfs_wreq *
-find_write_request(struct inode *inode, struct page *page)
+static inline void
+nfs_inode_remove_request(struct nfs_page *req)
{
- pid_t pid = current->pid;
- struct nfs_wreq *head, *req;
+ struct inode *inode;
+ spin_lock(&nfs_wreq_lock);
+ if (list_empty(&req->wb_hash)) {
+ spin_unlock(&nfs_wreq_lock);
+ return;
+ }
+ if (!NFS_WBACK_BUSY(req))
+ printk(KERN_ERR "NFS: unlocked request attempted unhashed!\n");
+ inode = req->wb_file->f_dentry->d_inode;
+ list_del(&req->wb_hash);
+ INIT_LIST_HEAD(&req->wb_hash);
+ inode->u.nfs_i.npages--;
+ if ((inode->u.nfs_i.npages == 0) != list_empty(&inode->u.nfs_i.writeback))
+ printk(KERN_ERR "NFS: desynchronized value of nfs_i.npages.\n");
+ if (!nfs_have_writebacks(inode))
+ inode_remove_flushd(inode);
+ spin_unlock(&nfs_wreq_lock);
+ nfs_release_request(req);
+}
- dprintk("NFS: find_write_request(%x/%ld, %p)\n",
- inode->i_dev, inode->i_ino, page);
- if (!(req = head = NFS_WRITEBACK(inode)))
- return NULL;
- do {
- /*
- * We can't combine with canceled requests or
- * requests that have already been started..
- */
- if (req->wb_flags & (NFS_WRITE_CANCELLED | NFS_WRITE_INPROGRESS))
+/*
+ * Find a request
+ */
+static inline struct nfs_page *
+_nfs_find_request(struct inode *inode, struct page *page)
+{
+ struct list_head *head, *next;
+
+ head = &inode->u.nfs_i.writeback;
+ next = head->next;
+ while (next != head) {
+ struct nfs_page *req = nfs_inode_wb_entry(next);
+ next = next->next;
+ if (page_index(req->wb_page) != page_index(page))
continue;
+ req->wb_count++;
+ return req;
+ }
+ return NULL;
+}
- if (req->wb_page == page && req->wb_pid == pid)
- return req;
+struct nfs_page *
+nfs_find_request(struct inode *inode, struct page *page)
+{
+ struct nfs_page *req;
- /*
- * Ehh, don't keep too many tasks queued..
- */
- rpc_wake_up_task(&req->wb_task);
+ spin_lock(&nfs_wreq_lock);
+ req = _nfs_find_request(inode, page);
+ spin_unlock(&nfs_wreq_lock);
+ return req;
+}
- } while ((req = WB_NEXT(req)) != head);
- return NULL;
+static inline struct nfs_page *
+nfs_list_entry(struct list_head *head)
+{
+ return list_entry(head, struct nfs_page, wb_list);
}
/*
- * Find and release all failed requests for this inode.
+ * Insert a write request into a sorted list
*/
-int
-nfs_check_failed_request(struct inode * inode)
+static inline void
+nfs_list_add_request(struct nfs_page *req, struct list_head *head)
{
- /* FIXME! */
- return 0;
+ struct list_head *prev;
+
+ if (!list_empty(&req->wb_list)) {
+ printk(KERN_ERR "NFS: Add to list failed!\n");
+ return;
+ }
+ if (list_empty(&req->wb_hash)) {
+ printk(KERN_ERR "NFS: Unhashed request attempted added to a list!\n");
+ return;
+ }
+ if (!NFS_WBACK_BUSY(req))
+ printk(KERN_ERR "NFS: unlocked request attempted added to list!\n");
+ prev = head->prev;
+ while (prev != head) {
+ struct nfs_page *p = nfs_list_entry(prev);
+ if (page_index(p->wb_page) < page_index(req->wb_page))
+ break;
+ prev = prev->prev;
+ }
+ list_add(&req->wb_list, prev);
+ req->wb_list_head = head;
}
/*
- * Try to merge adjacent write requests. This works only for requests
- * issued by the same user.
+ * Insert a write request into an inode
*/
-static inline int
-update_write_request(struct nfs_wreq *req, unsigned int first,
- unsigned int bytes)
+static inline void
+nfs_list_remove_request(struct nfs_page *req)
{
- unsigned int rqfirst = req->wb_offset,
- rqlast = rqfirst + req->wb_bytes,
- last = first + bytes;
+ if (list_empty(&req->wb_list))
+ return;
+ if (!NFS_WBACK_BUSY(req))
+ printk(KERN_ERR "NFS: unlocked request attempted removed from list!\n");
+ list_del(&req->wb_list);
+ INIT_LIST_HEAD(&req->wb_list);
+ req->wb_list_head = NULL;
+}
- dprintk("nfs: trying to update write request %p\n", req);
+/*
+ * Add a request to the inode's dirty list.
+ */
+static inline void
+nfs_mark_request_dirty(struct nfs_page *req)
+{
+ struct inode *inode = req->wb_file->f_dentry->d_inode;
- /* not contiguous? */
- if (rqlast < first || last < rqfirst)
- return 0;
+ spin_lock(&nfs_wreq_lock);
+ if (list_empty(&req->wb_list)) {
+ nfs_list_add_request(req, &inode->u.nfs_i.dirty);
+ inode->u.nfs_i.ndirty++;
+ }
+ spin_unlock(&nfs_wreq_lock);
+ /*
+ * NB: the call to inode_schedule_scan() must lie outside the
+ * spinlock since it can run flushd().
+ */
+ inode_schedule_scan(inode, req->wb_timeout);
+}
- if (first < rqfirst)
- rqfirst = first;
- if (rqlast < last)
- rqlast = last;
+/*
+ * Check if a request is dirty
+ */
+static inline int
+nfs_dirty_request(struct nfs_page *req)
+{
+ struct inode *inode = req->wb_file->f_dentry->d_inode;
+ return !list_empty(&req->wb_list) && req->wb_list_head == &inode->u.nfs_i.dirty;
+}
- req->wb_offset = rqfirst;
- req->wb_bytes = rqlast - rqfirst;
- req->wb_count++;
+#ifdef CONFIG_NFS_V3
+/*
+ * Add a request to the inode's commit list.
+ */
+static inline void
+nfs_mark_request_commit(struct nfs_page *req)
+{
+ struct inode *inode = req->wb_file->f_dentry->d_inode;
- return 1;
+ spin_lock(&nfs_wreq_lock);
+ if (list_empty(&req->wb_list)) {
+ nfs_list_add_request(req, &inode->u.nfs_i.commit);
+ inode->u.nfs_i.ncommit++;
+ }
+ spin_unlock(&nfs_wreq_lock);
+ /*
+ * NB: the call to inode_schedule_scan() must lie outside the
+ * spinlock since it can run flushd().
+ */
+ inode_schedule_scan(inode, req->wb_timeout);
}
+#endif
-static kmem_cache_t *nfs_wreq_cachep;
-
-int nfs_init_wreqcache(void)
+/*
+ * Lock the page of an asynchronous request
+ */
+static inline int
+nfs_lock_request(struct nfs_page *req)
{
- nfs_wreq_cachep = kmem_cache_create("nfs_wreq",
- sizeof(struct nfs_wreq),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (nfs_wreq_cachep == NULL)
- return -ENOMEM;
- return 0;
+ if (NFS_WBACK_BUSY(req))
+ return 0;
+ req->wb_count++;
+ req->wb_flags |= PG_BUSY;
+ return 1;
}
static inline void
-free_write_request(struct nfs_wreq * req)
+nfs_unlock_request(struct nfs_page *req)
{
- if (!--req->wb_count)
- kmem_cache_free(nfs_wreq_cachep, req);
+ if (!NFS_WBACK_BUSY(req)) {
+ printk(KERN_ERR "NFS: Invalid unlock attempted\n");
+ return;
+ }
+ req->wb_flags &= ~PG_BUSY;
+ wake_up(&req->wb_wait);
+ nfs_release_request(req);
}
/*
- * Create and initialize a writeback request
+ * Create a write request.
+ * Page must be locked by the caller. This makes sure we never create
+ * two different requests for the same page, and avoids possible deadlock
+ * when we reach the hard limit on the number of dirty pages.
*/
-static inline struct nfs_wreq *
-create_write_request(struct file * file, struct page *page, unsigned int offset, unsigned int bytes)
+static struct nfs_page *
+nfs_create_request(struct inode *inode, struct file *file, struct page *page,
+ unsigned int offset, unsigned int count)
{
- struct dentry *dentry = file->f_dentry;
- struct inode *inode = dentry->d_inode;
- struct rpc_clnt *clnt = NFS_CLIENT(inode);
- struct nfs_wreq *wreq;
- struct rpc_task *task;
+ struct nfs_reqlist *cache = NFS_REQUESTLIST(inode);
+ struct nfs_page *req = NULL;
+ long timeout;
- dprintk("NFS: create_write_request(%s/%s, %ld+%d)\n",
- dentry->d_parent->d_name.name, dentry->d_name.name,
- (page->index << PAGE_CACHE_SHIFT) + offset, bytes);
-
- /* FIXME: Enforce hard limit on number of concurrent writes? */
- wreq = kmem_cache_alloc(nfs_wreq_cachep, SLAB_KERNEL);
- if (!wreq)
- goto out_fail;
- memset(wreq, 0, sizeof(*wreq));
+ /* Deal with hard/soft limits.
+ */
+ do {
+ /* If we're over the soft limit, flush out old requests */
+ if (nfs_nr_requests >= MAX_REQUEST_SOFT)
+ nfs_wb_file(inode, file);
+
+ /* If we're still over the soft limit, wake up some requests */
+ if (nfs_nr_requests >= MAX_REQUEST_SOFT) {
+ dprintk("NFS: hit soft limit (%d requests)\n",
+ nfs_nr_requests);
+ if (!cache->task)
+ nfs_reqlist_init(NFS_SERVER(inode));
+ nfs_wake_flushd();
+ }
- task = &wreq->wb_task;
- rpc_init_task(task, clnt, nfs_wback_result, RPC_TASK_NFSWRITE);
- task->tk_calldata = wreq;
- task->tk_action = nfs_wback_begin;
+ /* If we haven't reached the hard limit yet,
+ * try to allocate the request struct */
+ if (nfs_nr_requests < MAX_REQUEST_HARD) {
+ req = nfs_page_alloc();
+ if (req != NULL)
+ break;
+ }
- rpcauth_lookupcred(task); /* Obtain user creds */
- if (task->tk_status < 0)
- goto out_req;
+ /* We're over the hard limit. Wait for better times */
+ dprintk("NFS: create_request sleeping (total %d pid %d)\n",
+ nfs_nr_requests, current->pid);
+
+ timeout = 1 * HZ;
+ if (NFS_SERVER(inode)->flags & NFS_MOUNT_INTR) {
+ interruptible_sleep_on_timeout(&cache->request_wait,
+ timeout);
+ if (signalled())
+ break;
+ } else
+ sleep_on_timeout(&cache->request_wait, timeout);
+
+ dprintk("NFS: create_request waking up (tot %d pid %d)\n",
+ nfs_nr_requests, current->pid);
+ } while (!req);
+ if (!req)
+ return NULL;
- /* Put the task on inode's writeback request list. */
+ /* Initialize the request struct. Initially, we assume a
+ * long write-back delay. This will be adjusted in
+ * update_nfs_request below if the region is not locked. */
+ req->wb_page = page;
+ atomic_inc(&page->count);
+ req->wb_offset = offset;
+ req->wb_bytes = count;
+ /* If the region is locked, adjust the timeout */
+ if (region_locked(inode, req))
+ req->wb_timeout = jiffies + NFS_WRITEBACK_LOCKDELAY;
+ else
+ req->wb_timeout = jiffies + NFS_WRITEBACK_DELAY;
+ req->wb_file = file;
+ req->wb_cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0);
get_file(file);
- wreq->wb_file = file;
- wreq->wb_pid = current->pid;
- wreq->wb_page = page;
- init_waitqueue_head(&wreq->wb_wait);
- wreq->wb_offset = offset;
- wreq->wb_bytes = bytes;
- wreq->wb_count = 2; /* One for the IO, one for us */
+ req->wb_count = 1;
- kmap(page);
- append_write_request(&NFS_WRITEBACK(inode), wreq);
+ /* register request's existence */
+ cache->nr_requests++;
+ nfs_nr_requests++;
+ return req;
+}
- if (nr_write_requests++ > NFS_WRITEBACK_MAX*3/4)
- rpc_wake_up_next(&write_queue);
- return wreq;
+/*
+ * Release all resources associated with a write request after it
+ * has been committed to stable storage
+ *
+ * Note: Should always be called with the spinlock held!
+ */
+void
+nfs_release_request(struct nfs_page *req)
+{
+ struct inode *inode = req->wb_file->f_dentry->d_inode;
+ struct nfs_reqlist *cache = NFS_REQUESTLIST(inode);
+ struct page *page = req->wb_page;
+
+ spin_lock(&nfs_wreq_lock);
+ if (--req->wb_count) {
+ spin_unlock(&nfs_wreq_lock);
+ return;
+ }
+ spin_unlock(&nfs_wreq_lock);
-out_req:
- rpc_release_task(task);
- kmem_cache_free(nfs_wreq_cachep, wreq);
-out_fail:
- return NULL;
+ if (!list_empty(&req->wb_list)) {
+ printk(KERN_ERR "NFS: Request released while still on a list!\n");
+ nfs_list_remove_request(req);
+ }
+ if (!list_empty(&req->wb_hash)) {
+ printk(KERN_ERR "NFS: Request released while still hashed!\n");
+ nfs_inode_remove_request(req);
+ }
+ if (NFS_WBACK_BUSY(req))
+ printk(KERN_ERR "NFS: Request released while still locked!\n");
+
+ rpcauth_releasecred(NFS_CLIENT(inode)->cl_auth, req->wb_cred);
+ fput(req->wb_file);
+ page_cache_release(page);
+ nfs_page_free(req);
+ /* wake up anyone waiting to allocate a request */
+ cache->nr_requests--;
+ nfs_nr_requests--;
+ wake_up(&cache->request_wait);
}
/*
- * Schedule a writeback RPC call.
- * If the server is congested, don't add to our backlog of queued
- * requests but call it synchronously.
- * The function returns whether we should wait for the thing or not.
+ * Wait for a request to complete.
*
- * FIXME: Here we could walk the inode's lock list to see whether the
- * page we're currently writing to has been write-locked by the caller.
- * If it is, we could schedule an async write request with a long
- * delay in order to avoid writing back the page until the lock is
- * released.
+ * Interruptible by signals only if mounted with intr flag.
*/
-static inline int
-schedule_write_request(struct nfs_wreq *req, int sync)
+static int
+nfs_wait_on_request(struct nfs_page *req)
{
- struct rpc_task *task = &req->wb_task;
- struct file *file = req->wb_file;
- struct dentry *dentry = file->f_dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = req->wb_file->f_dentry->d_inode;
+ struct rpc_clnt *clnt = NFS_CLIENT(inode);
+ int retval;
- if (NFS_CONGESTED(inode) || nr_write_requests >= NFS_WRITEBACK_MAX)
- sync = 1;
-
- if (sync) {
- sigset_t oldmask;
- struct rpc_clnt *clnt = NFS_CLIENT(inode);
- dprintk("NFS: %4d schedule_write_request (sync)\n",
- task->tk_pid);
- /* Page is already locked */
- rpc_clnt_sigmask(clnt, &oldmask);
- rpc_execute(task);
- rpc_clnt_sigunmask(clnt, &oldmask);
- } else {
- dprintk("NFS: %4d schedule_write_request (async)\n",
- task->tk_pid);
- task->tk_flags |= RPC_TASK_ASYNC;
- task->tk_timeout = NFS_WRITEBACK_DELAY;
- rpc_sleep_on(&write_queue, task, NULL, NULL);
+ if (!NFS_WBACK_BUSY(req))
+ return 0;
+ req->wb_count++;
+ retval = nfs_wait_event(clnt, req->wb_wait, !NFS_WBACK_BUSY(req));
+ nfs_release_request(req);
+ return retval;
+}
+
+/*
+ * Wait for a request to complete.
+ *
+ * Interruptible by signals only if mounted with intr flag.
+ */
+static int
+nfs_wait_on_requests(struct inode *inode, struct file *file, unsigned long start, unsigned int count)
+{
+ struct list_head *p, *head;
+ unsigned long idx_start, idx_end;
+ unsigned int pages = 0;
+ int error;
+
+ idx_start = start >> PAGE_CACHE_SHIFT;
+ if (count == 0)
+ idx_end = ~0;
+ else {
+ unsigned long idx_count = (count-1) >> PAGE_CACHE_SHIFT;
+ idx_end = idx_start + idx_count;
}
+ spin_lock(&nfs_wreq_lock);
+ head = &inode->u.nfs_i.writeback;
+ p = head->next;
+ while (p != head) {
+ unsigned long pg_idx;
+ struct nfs_page *req = nfs_inode_wb_entry(p);
+
+ p = p->next;
+
+ if (file && req->wb_file != file)
+ continue;
+
+ pg_idx = page_index(req->wb_page);
+ if (pg_idx < idx_start || pg_idx > idx_end)
+ continue;
- return sync;
+ if (!NFS_WBACK_BUSY(req))
+ continue;
+ req->wb_count++;
+ spin_unlock(&nfs_wreq_lock);
+ error = nfs_wait_on_request(req);
+ nfs_release_request(req);
+ if (error < 0)
+ return error;
+ spin_lock(&nfs_wreq_lock);
+ p = head->next;
+ pages++;
+ }
+ spin_unlock(&nfs_wreq_lock);
+ return pages;
}
/*
- * Wait for request to complete.
+ * Scan cluster for dirty pages and send as many of them to the
+ * server as possible.
*/
static int
-wait_on_write_request(struct nfs_wreq *req)
+nfs_scan_list_timeout(struct list_head *head, struct list_head *dst, struct inode *inode)
{
- struct file *file = req->wb_file;
- struct dentry *dentry = file->f_dentry;
- struct inode *inode = dentry->d_inode;
- struct rpc_clnt *clnt = NFS_CLIENT(inode);
- DECLARE_WAITQUEUE(wait, current);
- sigset_t oldmask;
- int retval;
+ struct list_head *p;
+ struct nfs_page *req;
+ int pages = 0;
+
+ p = head->next;
+ while (p != head) {
+ req = nfs_list_entry(p);
+ p = p->next;
+ if (time_after(req->wb_timeout, jiffies)) {
+ if (time_after(NFS_NEXTSCAN(inode), req->wb_timeout))
+ NFS_NEXTSCAN(inode) = req->wb_timeout;
+ continue;
+ }
+ if (!nfs_lock_request(req))
+ continue;
+ nfs_list_remove_request(req);
+ nfs_list_add_request(req, dst);
+ pages++;
+ }
+ return pages;
+}
+
+static int
+nfs_scan_dirty_timeout(struct inode *inode, struct list_head *dst)
+{
+ int pages;
+ spin_lock(&nfs_wreq_lock);
+ pages = nfs_scan_list_timeout(&inode->u.nfs_i.dirty, dst, inode);
+ inode->u.nfs_i.ndirty -= pages;
+ if ((inode->u.nfs_i.ndirty == 0) != list_empty(&inode->u.nfs_i.dirty))
+ printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n");
+ spin_unlock(&nfs_wreq_lock);
+ return pages;
+}
- /* Make sure it's started.. */
- if (!WB_INPROGRESS(req))
- rpc_wake_up_task(&req->wb_task);
+#ifdef CONFIG_NFS_V3
+static int
+nfs_scan_commit_timeout(struct inode *inode, struct list_head *dst)
+{
+ int pages;
+ spin_lock(&nfs_wreq_lock);
+ pages = nfs_scan_list_timeout(&inode->u.nfs_i.commit, dst, inode);
+ inode->u.nfs_i.ncommit -= pages;
+ if ((inode->u.nfs_i.ncommit == 0) != list_empty(&inode->u.nfs_i.commit))
+ printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
+ spin_unlock(&nfs_wreq_lock);
+ return pages;
+}
+#endif
+
+static int
+nfs_scan_list(struct list_head *src, struct list_head *dst, struct file *file, unsigned long start, unsigned int count)
+{
+ struct list_head *p;
+ struct nfs_page *req;
+ unsigned long idx_start, idx_end;
+ int pages;
+
+ pages = 0;
+ idx_start = start >> PAGE_CACHE_SHIFT;
+ if (count == 0)
+ idx_end = ~0;
+ else
+ idx_end = idx_start + ((count-1) >> PAGE_CACHE_SHIFT);
+ p = src->next;
+ while (p != src) {
+ unsigned long pg_idx;
+
+ req = nfs_list_entry(p);
+ p = p->next;
+
+ if (file && req->wb_file != file)
+ continue;
+
+ pg_idx = page_index(req->wb_page);
+ if (pg_idx < idx_start || pg_idx > idx_end)
+ continue;
+
+ if (!nfs_lock_request(req))
+ continue;
+ nfs_list_remove_request(req);
+ nfs_list_add_request(req, dst);
+ pages++;
+ }
+ return pages;
+}
+
+static int
+nfs_scan_dirty(struct inode *inode, struct list_head *dst, struct file *file, unsigned long start, unsigned int count)
+{
+ int pages;
+ spin_lock(&nfs_wreq_lock);
+ pages = nfs_scan_list(&inode->u.nfs_i.dirty, dst, file, start, count);
+ inode->u.nfs_i.ndirty -= pages;
+ if ((inode->u.nfs_i.ndirty == 0) != list_empty(&inode->u.nfs_i.dirty))
+ printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n");
+ spin_unlock(&nfs_wreq_lock);
+ return pages;
+}
+
+#ifdef CONFIG_NFS_V3
+static int
+nfs_scan_commit(struct inode *inode, struct list_head *dst, struct file *file, unsigned long start, unsigned int count)
+{
+ int pages;
+ spin_lock(&nfs_wreq_lock);
+ pages = nfs_scan_list(&inode->u.nfs_i.commit, dst, file, start, count);
+ inode->u.nfs_i.ncommit -= pages;
+ if ((inode->u.nfs_i.ncommit == 0) != list_empty(&inode->u.nfs_i.commit))
+ printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
+ spin_unlock(&nfs_wreq_lock);
+ return pages;
+}
+#endif
+
+
+static int
+coalesce_requests(struct list_head *src, struct list_head *dst, unsigned int maxpages)
+{
+ struct nfs_page *req = NULL;
+ unsigned int pages = 0;
+
+ while (!list_empty(src)) {
+ struct nfs_page *prev = req;
+
+ req = nfs_list_entry(src->next);
+ if (prev) {
+ if (req->wb_file != prev->wb_file)
+ break;
+
+ if (page_index(req->wb_page) != page_index(prev->wb_page)+1)
+ break;
+
+ if (req->wb_offset != 0)
+ break;
+ }
+ nfs_list_remove_request(req);
+ nfs_list_add_request(req, dst);
+ pages++;
+ if (req->wb_offset + req->wb_bytes != PAGE_CACHE_SIZE)
+ break;
+ if (pages >= maxpages)
+ break;
+ }
+ return pages;
+}
+
+/*
+ * Try to update any existing write request, or create one if there is none.
+ * In order to match, the request's credentials must match those of
+ * the calling process.
+ *
+ * Note: Should always be called with the Page Lock held!
+ */
+static struct nfs_page *
+nfs_update_request(struct file* file, struct page *page,
+ unsigned long offset, unsigned int bytes)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct nfs_page *req, *new = NULL;
+ unsigned long rqend, end;
+
+ end = offset + bytes;
- rpc_clnt_sigmask(clnt, &oldmask);
- add_wait_queue(&req->wb_wait, &wait);
for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- retval = 0;
- if (req->wb_flags & NFS_WRITE_COMPLETE)
+ /* Loop over all inode entries and see if we find
+ * a request for the page we wish to update
+ */
+ spin_lock(&nfs_wreq_lock);
+ req = _nfs_find_request(inode, page);
+ if (req) {
+ if (!nfs_lock_request(req)) {
+ spin_unlock(&nfs_wreq_lock);
+ nfs_wait_on_request(req);
+ nfs_release_request(req);
+ continue;
+ }
+ spin_unlock(&nfs_wreq_lock);
+ if (new)
+ nfs_release_request(new);
break;
- retval = -ERESTARTSYS;
- if (signalled())
+ }
+
+ req = new;
+ if (req) {
+ nfs_lock_request(req);
+ nfs_inode_add_request(inode, req);
+ spin_unlock(&nfs_wreq_lock);
+ nfs_mark_request_dirty(req);
break;
- schedule();
+ }
+ spin_unlock(&nfs_wreq_lock);
+
+ /* Create the request. It's safe to sleep in this call because
+ * we only get here if the page is locked.
+ */
+ new = nfs_create_request(inode, file, page, offset, bytes);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* We have a request for our page.
+ * If the creds don't match, or the
+ * page addresses don't match,
+ * tell the caller to wait on the conflicting
+ * request.
+ */
+ rqend = req->wb_offset + req->wb_bytes;
+ if (req->wb_file != file
+ || req->wb_page != page
+ || !nfs_dirty_request(req)
+ || offset > rqend || end < req->wb_offset) {
+ nfs_unlock_request(req);
+ nfs_release_request(req);
+ return ERR_PTR(-EBUSY);
+ }
+
+ /* Okay, the request matches. Update the region */
+ if (offset < req->wb_offset) {
+ req->wb_offset = offset;
+ req->wb_bytes = rqend - req->wb_offset;
}
- remove_wait_queue(&req->wb_wait, &wait);
- current->state = TASK_RUNNING;
- rpc_clnt_sigunmask(clnt, &oldmask);
- return retval;
+
+ if (end > rqend)
+ req->wb_bytes = end - req->wb_offset;
+
+ nfs_unlock_request(req);
+
+ return req;
}
/*
- * Write a page to the server. This will be used for NFS swapping only
- * (for now), and we currently do this synchronously only.
+ * This is the strategy routine for NFS.
+ * It is called by nfs_updatepage whenever the user wrote up to the end
+ * of a page.
+ *
+ * We always try to submit a set of requests in parallel so that the
+ * server's write code can gather writes. This is mainly for the benefit
+ * of NFSv2.
+ *
+ * We never submit more requests than we think the remote can handle.
+ * For UDP sockets, we make sure we don't exceed the congestion window;
+ * for TCP, we limit the number of requests to 8.
+ *
+ * NFS_STRATEGY_PAGES gives the minimum number of requests for NFSv2 that
+ * should be sent out in one go. This is for the benefit of NFSv2 servers
+ * that perform write gathering.
+ *
+ * FIXME: Different servers may have different sweet spots.
+ * Record the average congestion window in server struct?
*/
-int
-nfs_writepage(struct dentry * dentry, struct page *page)
+#define NFS_STRATEGY_PAGES 8
+static void
+nfs_strategy(struct file *file)
{
- struct inode *inode = dentry->d_inode;
- unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
- unsigned offset = PAGE_CACHE_SIZE;
- int err;
+ struct inode *inode = file->f_dentry->d_inode;
+ unsigned int dirty, wpages;
+
+ dirty = inode->u.nfs_i.ndirty;
+ wpages = NFS_SERVER(inode)->wsize >> PAGE_CACHE_SHIFT;
+#ifdef CONFIG_NFS_V3
+ if (NFS_PROTO(inode)->version == 2) {
+ if (dirty >= NFS_STRATEGY_PAGES * wpages)
+ nfs_flush_file(inode, file, 0, 0, 0);
+ } else {
+ if (dirty >= wpages)
+ nfs_flush_file(inode, file, 0, 0, 0);
+ }
+#else
+ if (dirty >= NFS_STRATEGY_PAGES * wpages)
+ nfs_flush_file(inode, file, 0, 0, 0);
+#endif
+ /*
+ * If we're running out of requests, flush out everything
+ * in order to reduce memory usage...
+ */
+ if (nfs_nr_requests > MAX_REQUEST_SOFT)
+ nfs_wb_file(inode, file);
+}
- /* easy case */
- if (page->index < end_index)
- goto do_it;
- /* things got complicated... */
- offset = inode->i_size & (PAGE_CACHE_SIZE-1);
- /* OK, are we completely out? */
- if (page->index >= end_index+1 || !offset)
- return -EIO;
-do_it:
- err = nfs_writepage_sync(dentry, inode, page, 0, offset);
- if ( err == offset) return 0;
- return err;
+int
+nfs_flush_incompatible(struct file *file, struct page *page)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct nfs_page *req;
+ int status = 0;
+ /*
+ * Look for a request corresponding to this page. If there
+ * is one, and it belongs to another file, we flush it out
+ * before we try to copy anything into the page. Do this
+ * due to the lack of an ACCESS-type call in NFSv2.
+ * Also do the same if we find a request from an existing
+ * dropped page.
+ */
+ req = nfs_find_request(inode,page);
+ if (req) {
+ if (req->wb_file != file || req->wb_page != page)
+ status = nfs_wb_page(inode, page);
+ nfs_release_request(req);
+ }
+ return (status < 0) ? status : 0;
}
/*
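One detail worth spelling out from the hunk above: coalesce_requests() merges requests into a single WRITE only while they come from the same file, land on consecutive pages, start at offset 0, and fill their page to the end, capped at maxpages per call. A standalone sketch of that merging rule (req_model and the constants are invented):

#include <stdio.h>

struct req_model {
	int file;		/* originating open file */
	unsigned long index;	/* page index */
	unsigned int offset;	/* offset of data within the page */
	unsigned int bytes;	/* length of data */
};

#define PAGE_SIZE_MODEL 4096
#define MAXPAGES 2

/* Count how many requests starting at reqs[0] fit into one WRITE. */
static unsigned int coalesce(const struct req_model *reqs, unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n && i < MAXPAGES; i++) {
		if (i > 0) {
			if (reqs[i].file != reqs[0].file)
				break;			/* different file */
			if (reqs[i].index != reqs[i - 1].index + 1)
				break;			/* not consecutive */
			if (reqs[i].offset != 0)
				break;			/* hole at page start */
		}
		if (reqs[i].offset + reqs[i].bytes != PAGE_SIZE_MODEL) {
			i++;		/* partial tail page ends the run */
			break;
		}
	}
	return i;
}

int main(void)
{
	struct req_model reqs[] = {
		{ 1, 10, 0, PAGE_SIZE_MODEL },
		{ 1, 11, 0, PAGE_SIZE_MODEL },
		{ 1, 12, 0, 100 },	/* would merge, but MAXPAGES caps us */
	};

	printf("%u requests in first WRITE\n", coalesce(reqs, 3));
	return 0;
}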
@@ -446,27 +1032,13 @@ nfs_updatepage(struct file *file, struct page *page, unsigned long offset, unsig
{
struct dentry *dentry = file->f_dentry;
struct inode *inode = dentry->d_inode;
- struct nfs_wreq *req;
+ struct nfs_page *req;
int synchronous = file->f_flags & O_SYNC;
- int retval;
+ int status = 0;
- dprintk("NFS: nfs_updatepage(%s/%s %d@%ld)\n",
+ dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- count, (page->index << PAGE_CACHE_SHIFT) +offset);
-
- /*
- * Try to find a corresponding request on the writeback queue.
- * If there is one, we can be sure that this request is not
- * yet being processed, because we hold a lock on the page.
- *
- * If the request was created by us, update it. Otherwise,
- * transfer the page lock and flush out the dirty page now.
- * After returning, generic_file_write will wait on the
- * page and retry the update.
- */
- req = find_write_request(inode, page);
- if (req && req->wb_file == file && update_write_request(req, offset, count))
- goto updated;
+ count, page_offset(page) +offset);
/*
* If wsize is smaller than page size, update and write
@@ -475,241 +1047,542 @@ nfs_updatepage(struct file *file, struct page *page, unsigned long offset, unsig
if (NFS_SERVER(inode)->wsize < PAGE_SIZE)
return nfs_writepage_sync(dentry, inode, page, offset, count);
- /* Create the write request. */
- req = create_write_request(file, page, offset, count);
- if (!req)
- return -ENOBUFS;
-
/*
- * Ok, there's another user of this page with the new request..
- * The IO completion will then free the page and the dentry.
+ * Try to find an NFS request corresponding to this page
+ * and update it.
+ * If the existing request cannot be updated, we must flush
+ * it out now.
*/
- get_page(page);
-
- /* Schedule request */
- synchronous = schedule_write_request(req, synchronous);
+ do {
+ req = nfs_update_request(file, page, offset, count);
+ status = (IS_ERR(req)) ? PTR_ERR(req) : 0;
+ if (status != -EBUSY)
+ break;
+ /* Request could not be updated. Flush it out and try again */
+ status = nfs_wb_page(inode, page);
+ } while (status >= 0);
+ if (status < 0)
+ goto done;
-updated:
- if (req->wb_bytes == PAGE_SIZE)
+ if (req->wb_bytes == PAGE_CACHE_SIZE)
SetPageUptodate(page);
- retval = 0;
+ status = 0;
if (synchronous) {
- int status = wait_on_write_request(req);
- if (status) {
- nfs_cancel_request(req);
- retval = status;
- } else {
- status = req->wb_status;
- if (status < 0)
- retval = status;
- }
+ int error;
- if (retval < 0)
- ClearPageUptodate(page);
+ error = nfs_sync_file(inode, file, page_offset(page) + offset, count, FLUSH_SYNC|FLUSH_STABLE);
+ if (error < 0 || (error = file->f_error) < 0)
+ status = error;
+ file->f_error = 0;
+ } else {
+ /* If we wrote past the end of the page,
+ * call the strategy routine so it can send out a bunch
+ * of requests.
+ */
+ if (req->wb_offset == 0 && req->wb_bytes == PAGE_CACHE_SIZE)
+ nfs_strategy(file);
}
-
- free_write_request(req);
- return retval;
+ nfs_release_request(req);
+done:
+ dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n",
+ status, inode->i_size);
+ if (status < 0)
+ clear_bit(PG_uptodate, &page->flags);
+ return status;
}
/*
- * Cancel a write request. We always mark it cancelled,
- * but if it's already in progress there's no point in
- * calling rpc_exit, and we don't want to overwrite the
- * tk_status field.
- */
+ * Set up the argument/result storage required for the RPC call.
+ */
static void
-nfs_cancel_request(struct nfs_wreq *req)
+nfs_write_rpcsetup(struct list_head *head, struct nfs_write_data *data)
{
- req->wb_flags |= NFS_WRITE_CANCELLED;
- if (!WB_INPROGRESS(req)) {
- rpc_exit(&req->wb_task, 0);
- rpc_wake_up_task(&req->wb_task);
+ struct nfs_page *req;
+ struct iovec *iov;
+ unsigned int count;
+
+ /* Set up the RPC argument and reply structs
+ * NB: take care not to mess about with data->commit et al. */
+
+ iov = data->args.iov;
+ count = 0;
+ while (!list_empty(head)) {
+ struct nfs_page *req = nfs_list_entry(head->next);
+ nfs_list_remove_request(req);
+ nfs_list_add_request(req, &data->pages);
+ iov->iov_base = (void *)(kmap(req->wb_page) + req->wb_offset);
+ iov->iov_len = req->wb_bytes;
+ count += req->wb_bytes;
+ iov++;
+ data->args.nriov++;
}
+ req = nfs_list_entry(data->pages.next);
+ data->file = req->wb_file;
+ data->cred = req->wb_cred;
+ data->args.fh = NFS_FH(req->wb_file->f_dentry);
+ data->args.offset = page_offset(req->wb_page) + req->wb_offset;
+ data->args.count = count;
+ data->res.fattr = &data->fattr;
+ data->res.count = count;
+ data->res.verf = &data->verf;
}
+
/*
- * Cancel all writeback requests, both pending and in progress.
+ * Create an RPC task for the given write request and kick it.
+ * The page must have been locked by the caller.
+ *
+ * It may happen that the page we're passed is not marked dirty.
+ * This is the case if nfs_updatepage detects a conflicting request
+ * that has been written but not committed.
*/
-static void
-nfs_cancel_dirty(struct inode *inode, pid_t pid)
+static int
+nfs_flush_one(struct list_head *head, struct file *file, int how)
{
- struct nfs_wreq *head, *req;
+ struct dentry *dentry = file->f_dentry;
+ struct inode *inode = dentry->d_inode;
+ struct rpc_clnt *clnt = NFS_CLIENT(inode);
+ struct nfs_write_data *data;
+ struct rpc_task *task;
+ struct rpc_message msg;
+ int flags,
+ async = !(how & FLUSH_SYNC),
+ stable = (how & FLUSH_STABLE);
+ sigset_t oldset;
+
+ data = nfs_writedata_alloc();
+ if (!data)
+ goto out_bad;
+ task = &data->task;
+
+ /* Set the initial flags for the task. */
+ flags = (async) ? RPC_TASK_ASYNC : 0;
+
+ /* Set up the argument struct */
+ nfs_write_rpcsetup(head, data);
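+	/* Pick the NFSv3 stability level for this WRITE:
+	 * FILE_SYNC forces data and metadata to stable storage before
+	 * the server replies, DATA_SYNC forces the data only, and
+	 * UNSTABLE lets the server cache it until a later COMMIT.
+	 * If requests are already awaiting a commit (ncommit), DATA_SYNC
+	 * is presumably preferred so a single COMMIT can cover them all.
+	 */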
+ if (stable) {
+ if (!inode->u.nfs_i.ncommit)
+ data->args.stable = NFS_FILE_SYNC;
+ else
+ data->args.stable = NFS_DATA_SYNC;
+ } else
+ data->args.stable = NFS_UNSTABLE;
+
+ /* Finalize the task. */
+ rpc_init_task(task, clnt, nfs_writeback_done, flags);
+ task->tk_calldata = data;
+
+#ifdef CONFIG_NFS_V3
+ msg.rpc_proc = (NFS_PROTO(inode)->version == 3) ? NFS3PROC_WRITE : NFSPROC_WRITE;
+#else
+ msg.rpc_proc = NFSPROC_WRITE;
+#endif
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ msg.rpc_cred = data->cred;
+
+ dprintk("NFS: %4d initiated write call (req %s/%s count %d nriov %d)\n",
+ task->tk_pid,
+ dentry->d_parent->d_name.name,
+ dentry->d_name.name,
+ data->args.count, data->args.nriov);
+
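+	/* Block signals around task setup and dispatch, following the
+	 * RPC client's notion of which signals may interrupt a call.
+	 */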
+ rpc_clnt_sigmask(clnt, &oldset);
+ rpc_call_setup(task, &msg, 0);
+ rpc_execute(task);
+ rpc_clnt_sigunmask(clnt, &oldset);
+ return 0;
+ out_bad:
+ while (!list_empty(head)) {
+ struct nfs_page *req = nfs_list_entry(head->next);
+ nfs_list_remove_request(req);
+ nfs_mark_request_dirty(req);
+ nfs_unlock_request(req);
+ }
+ return -ENOMEM;
+}
- req = head = NFS_WRITEBACK(inode);
- while (req != NULL) {
- if (pid == 0 || req->wb_pid == pid)
- nfs_cancel_request(req);
- if ((req = WB_NEXT(req)) == head)
+static int
+nfs_flush_list(struct inode *inode, struct list_head *head, int how)
+{
+ LIST_HEAD(one_request);
+ struct nfs_page *req;
+ int error = 0;
+ unsigned int pages = 0,
+ wpages = NFS_SERVER(inode)->wsize >> PAGE_CACHE_SHIFT;
+
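+	/* Peel off up to wpages requests at a time into one_request and
+	 * send them out as a single WRITE call; wpages is the number of
+	 * page cache pages that fit in the server's wsize.
+	 */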
+ while (!list_empty(head)) {
+ pages += coalesce_requests(head, &one_request, wpages);
+ req = nfs_list_entry(one_request.next);
+ error = nfs_flush_one(&one_request, req->wb_file, how);
+ if (error < 0)
break;
}
+ if (error >= 0)
+ return pages;
+
+ while (!list_empty(head)) {
+ req = nfs_list_entry(head->next);
+ nfs_list_remove_request(req);
+ nfs_mark_request_dirty(req);
+ nfs_unlock_request(req);
+ }
+ return error;
}
+
/*
- * If we're waiting on somebody else's request
- * we need to increment the counter during the
- * wait so that the request doesn't disappear
- * from under us during the wait..
+ * This function is called when the WRITE call is complete.
*/
-static int FASTCALL(wait_on_other_req(struct nfs_wreq *));
-static int wait_on_other_req(struct nfs_wreq *req)
+static void
+nfs_writeback_done(struct rpc_task *task)
{
- int retval;
- req->wb_count++;
- retval = wait_on_write_request(req);
- free_write_request(req);
- return retval;
-}
+ struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata;
+ struct nfs_writeargs *argp = &data->args;
+ struct nfs_writeres *resp = &data->res;
+ struct dentry *dentry = data->file->f_dentry;
+ struct inode *inode = dentry->d_inode;
+ struct nfs_page *req;
+
+ dprintk("NFS: %4d nfs_writeback_done (status %d)\n",
+ task->tk_pid, task->tk_status);
+
+	/* We can't handle a short write yet, but we check for it nevertheless */
+ if (resp->count < argp->count && task->tk_status >= 0) {
+ static unsigned long complain = 0;
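+		/* Rate-limit this warning to once every five minutes. */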
+ if (time_before(complain, jiffies)) {
+ printk(KERN_WARNING
+ "NFS: Server wrote less than requested.\n");
+ complain = jiffies + 300 * HZ;
+ }
+ /* Can't do anything about it right now except throw
+ * an error. */
+ task->tk_status = -EIO;
+ }
+#ifdef CONFIG_NFS_V3
+ if (resp->verf->committed < argp->stable && task->tk_status >= 0) {
+ /* We tried a write call, but the server did not
+ * commit data to stable storage even though we
+ * requested it.
+ */
+ static unsigned long complain = 0;
+
+ if (time_before(complain, jiffies)) {
+ printk(KERN_NOTICE "NFS: faulty NFSv3 server %s:"
+ " (committed = %d) != (stable = %d)\n",
+ NFS_SERVER(inode)->hostname,
+ resp->verf->committed, argp->stable);
+ complain = jiffies + 300 * HZ;
+ }
+ }
+#endif
-/*
- * This writes back a set of requests according to the condition.
- *
- * If this ever gets much more convoluted, use a fn pointer for
- * the condition..
- */
-#define NFS_WB(inode, cond) { int retval = 0 ; \
- do { \
- struct nfs_wreq *req = NFS_WRITEBACK(inode); \
- struct nfs_wreq *head = req; \
- if (!req) break; \
- for (;;) { \
- if (!(req->wb_flags & NFS_WRITE_COMPLETE)) \
- if (cond) break; \
- req = WB_NEXT(req); \
- if (req == head) goto out; \
- } \
- retval = wait_on_other_req(req); \
- } while (!retval); \
-out: return retval; \
-}
+ /* Update attributes as result of writeback. */
+ if (task->tk_status >= 0)
+ nfs_write_attributes(inode, resp->fattr);
-int
-nfs_wb_all(struct inode *inode)
-{
- NFS_WB(inode, 1);
+ while (!list_empty(&data->pages)) {
+ req = nfs_list_entry(data->pages.next);
+ nfs_list_remove_request(req);
+
+ kunmap(req->wb_page);
+
+ dprintk("NFS: write (%s/%s %d@%Ld)",
+ req->wb_file->f_dentry->d_parent->d_name.name,
+ req->wb_file->f_dentry->d_name.name,
+ req->wb_bytes,
+ page_offset(req->wb_page) + req->wb_offset);
+
+ if (task->tk_status < 0) {
+ req->wb_file->f_error = task->tk_status;
+ nfs_inode_remove_request(req);
+ dprintk(", error = %d\n", task->tk_status);
+ goto next;
+ }
+
+#ifdef CONFIG_NFS_V3
+ if (resp->verf->committed != NFS_UNSTABLE) {
+ nfs_inode_remove_request(req);
+ dprintk(" OK\n");
+ goto next;
+ }
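+		/* The server replied UNSTABLE: remember its write verifier
+		 * so a later COMMIT can detect a server reboot, and queue
+		 * the request for a deferred commit.
+		 */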
+ memcpy(&req->wb_verf, resp->verf, sizeof(req->wb_verf));
+ req->wb_timeout = jiffies + NFS_COMMIT_DELAY;
+ nfs_mark_request_commit(req);
+ dprintk(" marked for commit\n");
+#else
+ nfs_inode_remove_request(req);
+#endif
+ next:
+ nfs_unlock_request(req);
+ }
+ nfs_writedata_release(task);
}
+
+#ifdef CONFIG_NFS_V3
/*
- * Write back all requests on one page - we do this before reading it.
+ * Set up the argument/result storage required for the RPC call.
*/
-int
-nfs_wb_page(struct inode *inode, struct page *page)
+static void
+nfs_commit_rpcsetup(struct list_head *head, struct nfs_write_data *data)
{
- NFS_WB(inode, req->wb_page == page);
+ struct nfs_page *req;
+ struct dentry *dentry;
+ struct inode *inode;
+ unsigned long start, end, len;
+
+	/* Set up the RPC argument and reply structs.
+	 * NB: take care not to mess about with data->commit et al. */
+
+ end = 0;
+ start = ~0;
+ req = nfs_list_entry(head->next);
+ data->file = req->wb_file;
+ data->cred = req->wb_cred;
+ dentry = data->file->f_dentry;
+ inode = dentry->d_inode;
+ while (!list_empty(head)) {
+ struct nfs_page *req;
+ unsigned long rqstart, rqend;
+ req = nfs_list_entry(head->next);
+ nfs_list_remove_request(req);
+ nfs_list_add_request(req, &data->pages);
+ rqstart = page_offset(req->wb_page) + req->wb_offset;
+ rqend = rqstart + req->wb_bytes;
+ if (rqstart < start)
+ start = rqstart;
+ if (rqend > end)
+ end = rqend;
+ }
+ data->args.fh = NFS_FH(dentry);
+ data->args.offset = start;
+ len = end - start;
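+	/* In an NFSv3 COMMIT, a count of 0 means "commit everything from
+	 * offset to the end of the file", so fall back to that when the
+	 * range reaches EOF or would not fit in the count field.
+	 */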
+ if (end >= inode->i_size || len > (~((u32)0) >> 1))
+ len = 0;
+ data->res.count = data->args.count = (u32)len;
+ data->res.fattr = &data->fattr;
+ data->res.verf = &data->verf;
}
/*
- * Write back all pending writes from one file descriptor..
+ * Commit dirty pages
*/
-int
-nfs_wb_file(struct inode *inode, struct file *file)
-{
- NFS_WB(inode, req->wb_file == file);
-}
-
-void
-nfs_inval(struct inode *inode)
+static int
+nfs_commit_list(struct list_head *head, int how)
{
- nfs_cancel_dirty(inode,0);
+ struct rpc_message msg;
+ struct file *file;
+ struct rpc_clnt *clnt;
+ struct nfs_write_data *data;
+ struct rpc_task *task;
+ struct nfs_page *req;
+ int flags,
+ async = !(how & FLUSH_SYNC);
+ sigset_t oldset;
+
+ data = nfs_writedata_alloc();
+
+ if (!data)
+ goto out_bad;
+ task = &data->task;
+
+ flags = (async) ? RPC_TASK_ASYNC : 0;
+
+ /* Set up the argument struct */
+ nfs_commit_rpcsetup(head, data);
+ req = nfs_list_entry(data->pages.next);
+ file = req->wb_file;
+ clnt = NFS_CLIENT(file->f_dentry->d_inode);
+
+ rpc_init_task(task, clnt, nfs_commit_done, flags);
+ task->tk_calldata = data;
+
+ msg.rpc_proc = NFS3PROC_COMMIT;
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ msg.rpc_cred = data->cred;
+
+ dprintk("NFS: %4d initiated commit call\n", task->tk_pid);
+ rpc_clnt_sigmask(clnt, &oldset);
+ rpc_call_setup(task, &msg, 0);
+ rpc_execute(task);
+ rpc_clnt_sigunmask(clnt, &oldset);
+ return 0;
+ out_bad:
+ while (!list_empty(head)) {
+ req = nfs_list_entry(head->next);
+ nfs_list_remove_request(req);
+ nfs_mark_request_commit(req);
+ nfs_unlock_request(req);
+ }
+ return -ENOMEM;
}
/*
- * The following procedures make up the writeback finite state machinery:
- *
- * 1. Try to lock the page if not yet locked by us,
- * set up the RPC call info, and pass to the call FSM.
+ * COMMIT call returned
*/
static void
-nfs_wback_begin(struct rpc_task *task)
+nfs_commit_done(struct rpc_task *task)
{
- struct nfs_wreq *req = (struct nfs_wreq *) task->tk_calldata;
- struct page *page = req->wb_page;
- struct file *file = req->wb_file;
- struct dentry *dentry = file->f_dentry;
-
- dprintk("NFS: %4d nfs_wback_begin (%s/%s, status=%d flags=%x)\n",
- task->tk_pid, dentry->d_parent->d_name.name,
- dentry->d_name.name, task->tk_status, req->wb_flags);
+ struct nfs_write_data *data = (struct nfs_write_data *)task->tk_calldata;
+ struct nfs_writeres *resp = &data->res;
+ struct nfs_page *req;
+ struct dentry *dentry = data->file->f_dentry;
+ struct inode *inode = dentry->d_inode;
- task->tk_status = 0;
+ dprintk("NFS: %4d nfs_commit_done (status %d)\n",
+ task->tk_pid, task->tk_status);
+
+ nfs_refresh_inode(inode, resp->fattr);
+ while (!list_empty(&data->pages)) {
+ req = nfs_list_entry(data->pages.next);
+ nfs_list_remove_request(req);
+
+		dprintk("NFS: commit (%s/%s %d@%Ld)",
+ req->wb_file->f_dentry->d_parent->d_name.name,
+ req->wb_file->f_dentry->d_name.name,
+ req->wb_bytes,
+ page_offset(req->wb_page) + req->wb_offset);
+ if (task->tk_status < 0) {
+ req->wb_file->f_error = task->tk_status;
+ nfs_inode_remove_request(req);
+ dprintk(", error = %d\n", task->tk_status);
+ goto next;
+ }
- /* Setup the task struct for a writeback call */
- req->wb_flags |= NFS_WRITE_INPROGRESS;
- req->wb_args.fh = NFS_FH(dentry);
- req->wb_args.offset = (page->index << PAGE_CACHE_SHIFT) + req->wb_offset;
- req->wb_args.count = req->wb_bytes;
- req->wb_args.buffer = (void *) (page_address(page) + req->wb_offset);
+ /* Okay, COMMIT succeeded, apparently. Check the verifier
+ * returned by the server against all stored verfs. */
+ if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) {
+ /* We have a match */
+ nfs_inode_remove_request(req);
+ dprintk(" OK\n");
+ goto next;
+ }
+ /* We have a mismatch. Write the page again */
+ dprintk(" mismatch\n");
+ nfs_mark_request_dirty(req);
+ next:
+ nfs_unlock_request(req);
+ }
+ nfs_writedata_release(task);
+}
+#endif
- rpc_call_setup(task, NFSPROC_WRITE, &req->wb_args, &req->wb_fattr, 0);
+int nfs_flush_file(struct inode *inode, struct file *file, unsigned long start,
+ unsigned int count, int how)
+{
+ LIST_HEAD(head);
+ int pages,
+ error = 0;
+
+ pages = nfs_scan_dirty(inode, &head, file, start, count);
+ if (pages)
+ error = nfs_flush_list(inode, &head, how);
+ if (error < 0)
+ return error;
+ return pages;
+}
- return;
+int nfs_flush_timeout(struct inode *inode, int how)
+{
+ LIST_HEAD(head);
+ int pages,
+ error = 0;
+
+ pages = nfs_scan_dirty_timeout(inode, &head);
+ if (pages)
+ error = nfs_flush_list(inode, &head, how);
+ if (error < 0)
+ return error;
+ return pages;
}
-/*
- * 2. Collect the result
- */
-static void
-nfs_wback_result(struct rpc_task *task)
+#ifdef CONFIG_NFS_V3
+int nfs_commit_file(struct inode *inode, struct file *file, unsigned long start,
+ unsigned int count, int how)
{
- struct nfs_wreq *req = (struct nfs_wreq *) task->tk_calldata;
- struct file *file = req->wb_file;
- struct page *page = req->wb_page;
- int status = task->tk_status;
- struct dentry *dentry = file->f_dentry;
- struct inode *inode = dentry->d_inode;
+ LIST_HEAD(head);
+ int pages,
+ error = 0;
+
+ pages = nfs_scan_commit(inode, &head, file, start, count);
+ if (pages)
+ error = nfs_commit_list(&head, how);
+ if (error < 0)
+ return error;
+ return pages;
+}
- dprintk("NFS: %4d nfs_wback_result (%s/%s, status=%d, flags=%x)\n",
- task->tk_pid, dentry->d_parent->d_name.name,
- dentry->d_name.name, status, req->wb_flags);
-
- /* Set the WRITE_COMPLETE flag, but leave WRITE_INPROGRESS set */
- req->wb_flags |= NFS_WRITE_COMPLETE;
- req->wb_status = status;
-
- if (status < 0) {
- req->wb_flags |= NFS_WRITE_INVALIDATE;
- file->f_error = status;
- } else if (!WB_CANCELLED(req)) {
- struct nfs_fattr *fattr = &req->wb_fattr;
- /* Update attributes as result of writeback.
- * Beware: when UDP replies arrive out of order, we
- * may end up overwriting a previous, bigger file size.
- *
- * When the file size shrinks we cancel all pending
- * writebacks.
- */
- if (fattr->mtime.seconds >= inode->i_mtime) {
- if (fattr->size < inode->i_size)
- fattr->size = inode->i_size;
-
- /* possible Solaris 2.5 server bug workaround */
- if (inode->i_ino == fattr->fileid) {
- /*
- * We expect these values to change, and
- * don't want to invalidate the caches.
- */
- inode->i_size = fattr->size;
- inode->i_mtime = fattr->mtime.seconds;
- nfs_refresh_inode(inode, fattr);
- }
- else
- printk("nfs_wback_result: inode %ld, got %u?\n",
- inode->i_ino, fattr->fileid);
- }
+int nfs_commit_timeout(struct inode *inode, int how)
+{
+ LIST_HEAD(head);
+ int pages,
+ error = 0;
+
+ pages = nfs_scan_commit_timeout(inode, &head);
+ if (pages) {
+ pages += nfs_scan_commit(inode, &head, NULL, 0, 0);
+ error = nfs_commit_list(&head, how);
}
+ if (error < 0)
+ return error;
+ return pages;
+}
+#endif
- rpc_release_task(task);
+int nfs_sync_file(struct inode *inode, struct file *file, unsigned long start,
+ unsigned int count, int how)
+{
+ int error,
+ wait;
- if (WB_INVALIDATE(req))
- ClearPageUptodate(page);
+ wait = how & FLUSH_WAIT;
+ how &= ~FLUSH_WAIT;
- kunmap(page);
- __free_page(page);
- remove_write_request(&NFS_WRITEBACK(inode), req);
- nr_write_requests--;
- fput(req->wb_file);
+ if (!inode && file)
+ inode = file->f_dentry->d_inode;
- wake_up(&req->wb_wait);
- free_write_request(req);
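+	/* Each pass optionally waits for outstanding requests, flushes
+	 * dirty pages, and (for NFSv3) commits unstable writes. The
+	 * helpers return the number of pages they processed, so loop
+	 * until there is no more work (0) or an error (< 0).
+	 */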
+ do {
+ error = 0;
+ if (wait)
+ error = nfs_wait_on_requests(inode, file, start, count);
+ if (error == 0)
+ error = nfs_flush_file(inode, file, start, count, how);
+#ifdef CONFIG_NFS_V3
+ if (error == 0)
+ error = nfs_commit_file(inode, file, start, count, how);
+#endif
+ } while (error > 0);
+ return error;
+}
+
+int nfs_init_nfspagecache(void)
+{
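+	/* Slab caches for nfs_page and nfs_write_data structures;
+	 * SLAB_HWCACHE_ALIGN aligns the objects on hardware cache lines.
+	 */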
+ nfs_page_cachep = kmem_cache_create("nfs_page",
+ sizeof(struct nfs_page),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (nfs_page_cachep == NULL)
+ return -ENOMEM;
+
+ nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
+ sizeof(struct nfs_write_data),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (nfs_wdata_cachep == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void nfs_destroy_nfspagecache(void)
+{
+ if (kmem_cache_destroy(nfs_page_cachep))
+ printk(KERN_INFO "nfs_page: not all structures were freed\n");
+ if (kmem_cache_destroy(nfs_wdata_cachep))
+ printk(KERN_INFO "nfs_write_data: not all structures were freed\n");
}
+