/* * linux/fs/nfs/write.c * * Writing file data over NFS. * * We do it like this: When a (user) process wishes to write data to an * NFS file, a write request is allocated that contains the RPC task data * plus some info on the page to be written, and added to the inode's * write chain. If the process writes past the end of the page, an async * RPC call to write the page is scheduled immediately; otherwise, the call * is delayed for a few seconds. * * Just like readahead, no async I/O is performed if wsize < PAGE_SIZE. * * Write requests are kept on the inode's writeback list. Each entry in * that list references the page (portion) to be written. When the * cache timeout has expired, the RPC task is woken up, and tries to * lock the page. As soon as it manages to do so, the request is moved * from the writeback list to the writelock list. * * Note: we must make sure never to confuse the inode passed in the * write_page request with the one in page->inode. As far as I understand * it, these are different when doing a swap-out. * * To understand everything that goes on here and in the NFS read code, * one should be aware that a page is locked in exactly one of the following * cases: * * - A write request is in progress. * - A user process is in generic_file_write/nfs_update_page * - A user process is in generic_file_read * * Also note that because of the way pages are invalidated in * nfs_revalidate_inode, the following assertions hold: * * - If a page is dirty, there will be no read requests (a page will * not be re-read unless invalidated by nfs_revalidate_inode). * - If the page is not uptodate, there will be no pending write * requests, and no process will be in nfs_update_page. * * FIXME: Interaction with the vmscan routines is not optimal yet. * Either vmscan must be made nfs-savvy, or we need a different page * reclaim concept that supports something like FS-independent * buffer_heads with a b_ops-> field. 
*
 * Copyright (C) 1996, 1997, Olaf Kirch
 */

/*
 * NOTE(review): the nine #include directives below carry no header names
 * in this copy of the file.  They look like they were stripped by whatever
 * produced this text and have to be restored before the file can build.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include

#define NFS_PARANOIA 1
#define NFSDBG_FACILITY		NFSDBG_PAGECACHE

/* RPC task callbacks and the cancel helper are defined further down. */
static void			nfs_wback_begin(struct rpc_task *task);
static void			nfs_wback_result(struct rpc_task *task);
static void			nfs_cancel_request(struct nfs_wreq *req);

/*
 * Cache parameters:
 * NFS_WRITEBACK_DELAY is used as the RPC task timeout for asynchronous
 * writes, NFS_WRITEBACK_MAX is the threshold on nr_write_requests above
 * which new writes are forced to be synchronous.
 */
#define NFS_WRITEBACK_DELAY	(10 * HZ)
#define NFS_WRITEBACK_MAX	64

/*
 * Limit number of delayed writes.
 * nr_write_requests is incremented when a request is created and
 * decremented when its result has been collected; asynchronous tasks
 * sleep on write_queue until they are woken or their timeout expires.
 */
static int			nr_write_requests = 0;
static struct rpc_wait_queue	write_queue = RPC_INIT_WAITQ("write_chain");

/* Hack for future NFS swap support */
#ifndef IS_SWAPFILE
# define IS_SWAPFILE(inode)	(0)
#endif

/*
 * Write part of a page to the server synchronously.
 *
 * dentry/inode identify the file, page holds the data, offset is the
 * data offset within the page and count is the number of bytes to write.
 * The data is handed to nfs_proc_write() in chunks of at most the
 * server's wsize, with the kernel lock held for the whole call.
 *
 * Returns the number of bytes accounted as written if at least one chunk
 * went through, otherwise the result of the last nfs_proc_write() call
 * (negative on error).  On an I/O error the page is marked not uptodate.
 */
static int
nfs_writepage_sync(struct dentry *dentry, struct inode *inode,
		struct page *page, unsigned long offset, unsigned int count)
{
	unsigned int	wsize = NFS_SERVER(inode)->wsize;
	int		result, refresh = 0, written = 0;
	u8		*buffer;
	struct nfs_fattr fattr;

	lock_kernel();
	dprintk("NFS:      nfs_writepage_sync(%s/%s %d@%ld)\n",
		dentry->d_parent->d_name.name, dentry->d_name.name,
		count, page->offset + offset);

	/* From here on "offset" is the absolute file offset, not the in-page one. */
	buffer = (u8 *) page_address(page) + offset;
	offset += page->offset;

	do {
		/* Last (or only) chunk: shrink the transfer size, except for swap files. */
		if (count < wsize && !IS_SWAPFILE(inode))
			wsize = count;

		result = nfs_proc_write(NFS_DSERVER(dentry), NFS_FH(dentry),
					IS_SWAPFILE(inode), offset, wsize,
					buffer, &fattr);

		if (result < 0) {
			/* Must mark the page invalid after I/O error */
			ClearPageUptodate(page);
			goto io_error;
		}
		/*
		 * NOTE(review): a short write is only logged; the bookkeeping
		 * below still advances by wsize, not by result.
		 */
		if (result != wsize)
			printk("NFS: short write, wsize=%u, result=%d\n",
			wsize, result);
		refresh = 1;
		buffer  += wsize;
		offset  += wsize;
		written += wsize;
		count   -= wsize;
		/*
		 * If we've extended the file, update the inode
		 * now so we don't invalidate the cache.
		 */
		if (offset > inode->i_size)
			inode->i_size = offset;
	} while (count);

io_error:
	/* Note: we don't refresh if the call failed (fattr invalid) */
	if (refresh && result >= 0) {
		/* See comments in nfs_wback_result */
		/* N.B. I don't think this is right -- sync writes in order */
		if (fattr.size < inode->i_size)
			fattr.size = inode->i_size;
		if (fattr.mtime.seconds < inode->i_mtime)
			printk("nfs_writepage_sync: prior time??\n");
		/* Solaris 2.5 server seems to send garbled
		 * fattrs occasionally */
		if (inode->i_ino == fattr.fileid) {
			/*
			 * We expect the mtime value to change, and
			 * don't want to invalidate the caches.
			 */
			inode->i_mtime = fattr.mtime.seconds;
			nfs_refresh_inode(inode, &fattr);
		}
		else
			printk("nfs_writepage_sync: inode %ld, got %u?\n",
				inode->i_ino, fattr.fileid);
	}

	unlock_kernel();
	/* A partial success reports the bytes written and hides the later error. */
	return written? written : result;
}

/*
 * Append a writeback request to a list.
 * Thin wrapper around rpc_append_list() that adds a debug trace.
 */
static inline void
append_write_request(struct nfs_wreq **q, struct nfs_wreq *wreq)
{
	dprintk("NFS:      append_write_request(%p, %p)\n", q, wreq);
	rpc_append_list(q, wreq);
}

/*
 * Remove a writeback request from a list.
 * Thin wrapper around rpc_remove_list() that adds a debug trace.
 */
static inline void
remove_write_request(struct nfs_wreq **q, struct nfs_wreq *wreq)
{
	dprintk("NFS:      remove_write_request(%p, %p)\n", q, wreq);
	rpc_remove_list(q, wreq);
}

/*
 * Find a non-busy write request for a given page to
 * try to combine with.
 *
 * Walks the inode's writeback list, starting at NFS_WRITEBACK(inode) and
 * following WB_NEXT() until it leads back to the head.  Returns the first
 * request that is neither cancelled nor in progress and that belongs to
 * this page and to the current process, or NULL if there is none.
 * Side effect: every other eligible request passed on the way has its
 * RPC task woken up.
 */
static inline struct nfs_wreq *
find_write_request(struct inode *inode, struct page *page)
{
	pid_t pid = current->pid;
	struct nfs_wreq	*head, *req;

	dprintk("NFS:      find_write_request(%x/%ld, %p)\n",
				inode->i_dev, inode->i_ino, page);
	if (!(req = head = NFS_WRITEBACK(inode)))
		return NULL;
	do {
		/*
		 * We can't combine with canceled requests or
		 * requests that have already been started..
		 * ("continue" in a do-while still evaluates the loop
		 * condition, so the walk advances to the next request.)
		 */
		if (req->wb_flags & (NFS_WRITE_CANCELLED | NFS_WRITE_INPROGRESS))
			continue;

		if (req->wb_page == page && req->wb_pid == pid)
			return req;

		/*
		 * Ehh, don't keep too many tasks queued..
		 */
		rpc_wake_up_task(&req->wb_task);

	} while ((req = WB_NEXT(req)) != head);
	return NULL;
}

/*
 * Find and release all failed requests for this inode.
 * Not implemented yet: the function is a stub that always returns 0.
 */
int
nfs_check_failed_request(struct inode * inode)
{
	/* FIXME! */
	return 0;
}

/*
 * Try to merge adjacent write requests. This works only for requests
 * issued by the same user.
 *
 * first/bytes describe the new byte range within the page.  If it
 * overlaps or touches the range already covered by req, req is grown to
 * the union of both ranges, its reference count is bumped on behalf of
 * the caller, and 1 is returned.  Otherwise req is left alone and 0 is
 * returned.
 */
static inline int
update_write_request(struct nfs_wreq *req, unsigned int first,
			unsigned int bytes)
{
	unsigned int	rqfirst = req->wb_offset,
			rqlast = rqfirst + req->wb_bytes,
			last = first + bytes;

	dprintk("nfs:      trying to update write request %p\n", req);

	/* not contiguous? */
	if (rqlast < first || last < rqfirst)
		return 0;

	/* Extend the existing range in whichever direction is needed. */
	if (first < rqfirst)
		rqfirst = first;
	if (rqlast < last)
		rqlast = last;

	req->wb_offset = rqfirst;
	req->wb_bytes  = rqlast - rqfirst;
	req->wb_count++;

	return 1;
}

/* Slab cache all struct nfs_wreq objects are allocated from. */
static kmem_cache_t *nfs_wreq_cachep;

/*
 * Create the "nfs_wreq" slab cache.
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
int nfs_init_wreqcache(void)
{
	nfs_wreq_cachep = kmem_cache_create("nfs_wreq",
					    sizeof(struct nfs_wreq),
					    0, SLAB_HWCACHE_ALIGN,
					    NULL, NULL);
	if (nfs_wreq_cachep == NULL)
		return -ENOMEM;
	return 0;
}

/*
 * Drop one reference on a write request; the request is returned to the
 * slab cache when the count reaches zero.
 */
static inline void
free_write_request(struct nfs_wreq * req)
{
	if (!--req->wb_count)
		kmem_cache_free(nfs_wreq_cachep, req);
}

/*
 * Create and initialize a writeback request
 *
 * Allocates a request for bytes [offset, offset + bytes) of page on
 * behalf of file, sets up its embedded RPC task, takes a reference on
 * the file and appends the request to the inode's writeback list.
 * The request starts with a reference count of two.
 *
 * Returns the new request, or NULL if the allocation failed or the
 * credential lookup left a negative status in the task.
 */
static inline struct nfs_wreq *
create_write_request(struct file * file, struct page *page,
		unsigned int offset, unsigned int bytes)
{
	struct dentry	*dentry = file->f_dentry;
	struct inode	*inode = dentry->d_inode;
	struct rpc_clnt	*clnt = NFS_CLIENT(inode);
	struct nfs_wreq *wreq;
	struct rpc_task	*task;

	dprintk("NFS:      create_write_request(%s/%s, %ld+%d)\n",
		dentry->d_parent->d_name.name, dentry->d_name.name,
		page->offset + offset, bytes);

	/* FIXME: Enforce hard limit on number of concurrent writes? */
	wreq = kmem_cache_alloc(nfs_wreq_cachep, SLAB_KERNEL);
	if (!wreq)
		goto out_fail;
	memset(wreq, 0, sizeof(*wreq));

	/* The RPC task lives inside the request and points back at it. */
	task = &wreq->wb_task;
	rpc_init_task(task, clnt, nfs_wback_result, RPC_TASK_NFSWRITE);
	task->tk_calldata = wreq;
	task->tk_action = nfs_wback_begin;

	rpcauth_lookupcred(task);	/* Obtain user creds */
	if (task->tk_status < 0)
		goto out_req;

	/* Put the task on inode's writeback request list. */
	get_file(file);
	wreq->wb_file = file;
	wreq->wb_pid    = current->pid;
	wreq->wb_page   = page;
	init_waitqueue_head(&wreq->wb_wait);
	wreq->wb_offset = offset;
	wreq->wb_bytes  = bytes;
	wreq->wb_count	= 2;		/* One for the IO, one for us */

	append_write_request(&NFS_WRITEBACK(inode), wreq);

	/* Past three quarters of the limit: start pushing out a queued write. */
	if (nr_write_requests++ > NFS_WRITEBACK_MAX*3/4)
		rpc_wake_up_next(&write_queue);

	return wreq;

out_req:
	rpc_release_task(task);
	kmem_cache_free(nfs_wreq_cachep, wreq);
out_fail:
	return NULL;
}

/*
 * Schedule a writeback RPC call.
 * If the server is congested, don't add to our backlog of queued
 * requests but call it synchronously.
 * The function returns whether we should wait for the thing or not.
 *
 * FIXME: Here we could walk the inode's lock list to see whether the
 * page we're currently writing to has been write-locked by the caller.
 * If it is, we could schedule an async write request with a long
 * delay in order to avoid writing back the page until the lock is
 * released.
*/
/*
 * req is the request to start and sync is non-zero when the caller asked
 * for a synchronous write.  The write is forced to be synchronous when
 * NFS_CONGESTED(inode) is true or nr_write_requests has reached
 * NFS_WRITEBACK_MAX.
 *
 * Synchronous: the task is run via rpc_execute() between
 * rpc_clnt_sigmask() and rpc_clnt_sigunmask().
 * Asynchronous: the task is flagged RPC_TASK_ASYNC, given a timeout of
 * NFS_WRITEBACK_DELAY and put to sleep on write_queue.
 *
 * Returns the (possibly upgraded) sync flag.
 */
static inline int
schedule_write_request(struct nfs_wreq *req, int sync)
{
	struct rpc_task	*task = &req->wb_task;
	struct file	*file = req->wb_file;
	struct dentry	*dentry = file->f_dentry;
	struct inode	*inode = dentry->d_inode;

	if (NFS_CONGESTED(inode) || nr_write_requests >= NFS_WRITEBACK_MAX)
		sync = 1;

	if (sync) {
		sigset_t	oldmask;
		struct rpc_clnt *clnt = NFS_CLIENT(inode);
		dprintk("NFS: %4d schedule_write_request (sync)\n",
					task->tk_pid);
		/* Page is already locked */
		rpc_clnt_sigmask(clnt, &oldmask);
		rpc_execute(task);
		rpc_clnt_sigunmask(clnt, &oldmask);
	} else {
		dprintk("NFS: %4d schedule_write_request (async)\n",
					task->tk_pid);
		task->tk_flags |= RPC_TASK_ASYNC;
		task->tk_timeout = NFS_WRITEBACK_DELAY;
		rpc_sleep_on(&write_queue, task, NULL, NULL);
	}

	return sync;
}

/*
 * Wait for request to complete.
 *
 * Wakes the request's RPC task if it has not started yet, then sleeps
 * interruptibly on the request's wait queue until NFS_WRITE_COMPLETE is
 * set in wb_flags.  The signal mask is adjusted through
 * rpc_clnt_sigmask()/rpc_clnt_sigunmask() for the duration of the wait.
 *
 * Returns 0 once the request has completed, or -ERESTARTSYS if
 * signalled() reports a pending signal first.  The caller must hold a
 * reference on req for the duration of the call.
 */
static int
wait_on_write_request(struct nfs_wreq *req)
{
	struct file		*file = req->wb_file;
	struct dentry		*dentry = file->f_dentry;
	struct inode		*inode = dentry->d_inode;
	struct rpc_clnt		*clnt = NFS_CLIENT(inode);
	DECLARE_WAITQUEUE(wait, current);
	sigset_t		oldmask;
	int retval;

	/* Make sure it's started.. */
	if (!WB_INPROGRESS(req))
		rpc_wake_up_task(&req->wb_task);

	rpc_clnt_sigmask(clnt, &oldmask);
	add_wait_queue(&req->wb_wait, &wait);
	for (;;) {
		/*
		 * The task state is set before the flag is tested; presumably
		 * so that a wake_up() arriving between the test and
		 * schedule() is not lost.
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		retval = 0;
		if (req->wb_flags & NFS_WRITE_COMPLETE)
			break;
		retval = -ERESTARTSYS;
		if (signalled())
			break;
		schedule();
	}
	remove_wait_queue(&req->wb_wait, &wait);
	current->state = TASK_RUNNING;
	rpc_clnt_sigunmask(clnt, &oldmask);
	return retval;
}

/*
 * Write a page to the server. This will be used for NFS swapping only
 * (for now), and we currently do this synchronously only.
 *
 * Writes the whole page (offset 0, PAGE_SIZE bytes) and returns whatever
 * nfs_writepage_sync() returns.
 */
int
nfs_writepage(struct file * file, struct page *page)
{
	struct dentry *dentry = file->f_dentry;
	return nfs_writepage_sync(dentry, dentry->d_inode, page, 0, PAGE_SIZE);
}

/*
 * Update and possibly write a cached page of an NFS file.
*
 * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad
 * things with a page scheduled for an RPC call (e.g. invalidate it).
 */
/*
 * file is the file being written, page the cached page that now holds the
 * new data, offset the start of the new data within the page and count
 * its length in bytes.
 *
 * Returns count on success.  Returns a negative value when a synchronous
 * wait was interrupted or the write request finished with a negative
 * status, and -ENOBUFS when no write request could be created.  If the
 * server's wsize is smaller than PAGE_SIZE the result of
 * nfs_writepage_sync() is returned instead.
 */
int
nfs_updatepage(struct file *file, struct page *page, unsigned long offset, unsigned int count)
{
	struct dentry	*dentry = file->f_dentry;
	struct inode	*inode = dentry->d_inode;
	struct nfs_wreq	*req;
	int		synchronous = file->f_flags & O_SYNC;
	int		retval;

	dprintk("NFS:      nfs_updatepage(%s/%s %d@%ld)\n",
		dentry->d_parent->d_name.name, dentry->d_name.name,
		count, page->offset+offset);

	/*
	 * Try to find a corresponding request on the writeback queue.
	 * If there is one, we can be sure that this request is not
	 * yet being processed, because we hold a lock on the page.
	 *
	 * If the request was created by us, update it. Otherwise,
	 * transfer the page lock and flush out the dirty page now.
	 * After returning, generic_file_write will wait on the
	 * page and retry the update.
	 */
	req = find_write_request(inode, page);
	/* A successful merge leaves us holding an extra reference on req. */
	if (req && req->wb_file == file && update_write_request(req, offset, count))
		goto updated;

	/*
	 * If wsize is smaller than page size, update and write
	 * page synchronously.
	 */
	if (NFS_SERVER(inode)->wsize < PAGE_SIZE)
		return nfs_writepage_sync(dentry, inode, page, offset, count);

	/* Create the write request. */
	req = create_write_request(file, page, offset, count);
	if (!req)
		return -ENOBUFS;

	/*
	 * Ok, there's another user of this page with the new request..
	 * The IO completion will then free the page and the dentry.
	 */
	get_page(page);

	/* Schedule request; this may upgrade the write to a synchronous one. */
	synchronous = schedule_write_request(req, synchronous);

updated:
	/* A request covering the entire page makes the cached copy valid. */
	if (req->wb_bytes == PAGE_SIZE)
		SetPageUptodate(page);

	retval = count;
	if (synchronous) {
		int status = wait_on_write_request(req);
		if (status) {
			/* The wait was interrupted: give up on the request. */
			nfs_cancel_request(req);
			retval = status;
		} else {
			status = req->wb_status;
			if (status < 0)
				retval = status;
		}

		if (retval < 0)
			ClearPageUptodate(page);
	}

	/* Drop the reference taken for us on either path to "updated". */
	free_write_request(req);
	return retval;
}

/*
 * Cancel a write request.
We always mark it cancelled,
 * but if it's already in progress there's no point in
 * calling rpc_exit, and we don't want to overwrite the
 * tk_status field.
 */
static void
nfs_cancel_request(struct nfs_wreq *req)
{
	req->wb_flags |= NFS_WRITE_CANCELLED;
	if (!WB_INPROGRESS(req)) {
		/* Not started yet: make the task exit with status 0 and run it. */
		rpc_exit(&req->wb_task, 0);
		rpc_wake_up_task(&req->wb_task);
	}
}

/*
 * Cancel all writeback requests, both pending and in progress.
 *
 * Walks the inode's writeback list once and cancels every request whose
 * wb_pid matches pid; a pid of 0 matches every request.
 */
static void
nfs_cancel_dirty(struct inode *inode, pid_t pid)
{
	struct nfs_wreq *head, *req;

	req = head = NFS_WRITEBACK(inode);
	while (req != NULL) {
		if (pid == 0 || req->wb_pid == pid)
			nfs_cancel_request(req);
		/* Stop once the walk is back at the head of the list. */
		if ((req = WB_NEXT(req)) == head)
			break;
	}
}

/*
 * If we're waiting on somebody else's request
 * we need to increment the counter during the
 * wait so that the request doesn't disappear
 * from under us during the wait..
 *
 * Returns the result of wait_on_write_request().
 */
static int FASTCALL(wait_on_other_req(struct nfs_wreq *));
static int wait_on_other_req(struct nfs_wreq *req)
{
	int retval;
	req->wb_count++;
	retval = wait_on_write_request(req);
	free_write_request(req);
	return retval;
}

/*
 * This writes back a set of requests according to the condition.
 *
 * If this ever gets much more convoluted, use a fn pointer for
 * the condition..
 *
 * The macro expands to a complete function body.  It repeatedly scans the
 * inode's writeback list for the first request that is not yet complete
 * and for which "cond" is true, and waits for it with
 * wait_on_other_req().  It returns 0 when the list is empty or a full
 * pass finds no such request, and the non-zero result of the wait
 * otherwise.  "cond" is evaluated inside the loop and may refer to the
 * local variable "req".
 */
#define NFS_WB(inode, cond) { int retval = 0 ; \
	do { \
		struct nfs_wreq *req = NFS_WRITEBACK(inode); \
		struct nfs_wreq *head = req; \
		if (!req) break; \
		for (;;) { \
			if (!(req->wb_flags & NFS_WRITE_COMPLETE)) \
				if (cond) break; \
			req = WB_NEXT(req); \
			if (req == head) goto out; \
		} \
		retval = wait_on_other_req(req); \
	} while (!retval); \
	out: return retval; \
}

/*
 * Wait for every request on the inode's writeback list.
 * Returns 0, or the error from an interrupted wait.
 */
int
nfs_wb_all(struct inode *inode)
{
	NFS_WB(inode, 1);
}

/*
 * Write back all requests on one page - we do this before reading it.
 * Returns 0, or the error from an interrupted wait.
 */
int
nfs_wb_page(struct inode *inode, struct page *page)
{
	NFS_WB(inode, req->wb_page == page);
}

/*
 * Write back all pending writes from one file descriptor..
*/
/*
 * Waits for every not-yet-complete request on the inode's writeback list
 * whose wb_file is the given file.  Returns 0, or the error from an
 * interrupted wait.
 */
int
nfs_wb_file(struct inode *inode, struct file *file)
{
	NFS_WB(inode, req->wb_file == file);
}

/*
 * Cancel every write request on the inode's writeback list (a pid of 0
 * makes nfs_cancel_dirty() match all requests).
 */
void
nfs_inval(struct inode *inode)
{
	nfs_cancel_dirty(inode,0);
}

/*
 * The following procedures make up the writeback finite state machinery:
 *
 * 1.	Try to lock the page if not yet locked by us,
 *	set up the RPC call info, and pass to the call FSM.
 *
 * NOTE(review): no page locking is visible in this function; as written
 * it only marks the request as in progress, fills in the write arguments
 * from the request and its page, and sets up an NFSPROC_WRITE call.
 */
static void
nfs_wback_begin(struct rpc_task *task)
{
	struct nfs_wreq	*req = (struct nfs_wreq *) task->tk_calldata;
	struct page	*page = req->wb_page;
	struct file	*file = req->wb_file;
	struct dentry	*dentry = file->f_dentry;

	dprintk("NFS: %4d nfs_wback_begin (%s/%s, status=%d flags=%x)\n",
		task->tk_pid, dentry->d_parent->d_name.name,
		dentry->d_name.name, task->tk_status, req->wb_flags);

	task->tk_status = 0;

	/* Setup the task struct for a writeback call */
	req->wb_flags |= NFS_WRITE_INPROGRESS;
	req->wb_args.fh     = NFS_FH(dentry);
	req->wb_args.offset = page->offset + req->wb_offset;
	req->wb_args.count  = req->wb_bytes;
	req->wb_args.buffer = (void *) (page_address(page) + req->wb_offset);

	rpc_call_setup(task, NFSPROC_WRITE, &req->wb_args, &req->wb_fattr, 0);

	return;
}

/*
 * 2.	Collect the result
 *
 * Records the task status in the request and marks it complete.  On
 * success, and unless the request was cancelled, the inode attributes
 * are refreshed from the returned fattr.  On failure the error is
 * recorded in the file and the page is marked not uptodate.  Finally the
 * function drops the page and file references held for the request,
 * unlinks the request from the inode's writeback list, wakes any waiters
 * and drops the I/O reference on the request.
 */
static void
nfs_wback_result(struct rpc_task *task)
{
	struct nfs_wreq *req = (struct nfs_wreq *) task->tk_calldata;
	struct file	*file = req->wb_file;
	struct page	*page = req->wb_page;
	int		status = task->tk_status;
	struct dentry	*dentry = file->f_dentry;
	struct inode	*inode = dentry->d_inode;

	dprintk("NFS: %4d nfs_wback_result (%s/%s, status=%d, flags=%x)\n",
		task->tk_pid, dentry->d_parent->d_name.name,
		dentry->d_name.name, status, req->wb_flags);

	/* Set the WRITE_COMPLETE flag, but leave WRITE_INPROGRESS set */
	req->wb_flags |= NFS_WRITE_COMPLETE;
	req->wb_status = status;

	if (status < 0) {
		req->wb_flags |= NFS_WRITE_INVALIDATE;
		file->f_error = status;
	} else if (!WB_CANCELLED(req)) {
		struct nfs_fattr *fattr = &req->wb_fattr;
		/* Update attributes as result of writeback.
		 * Beware: when UDP replies arrive out of order, we
		 * may end up overwriting a previous, bigger file size.
		 *
		 * When the file size shrinks we cancel all pending
		 * writebacks.
		 *
		 * NOTE(review): no cancellation is visible here; a smaller
		 * size from the server is simply raised to inode->i_size.
		 */
		if (fattr->mtime.seconds >= inode->i_mtime) {
			if (fattr->size < inode->i_size)
				fattr->size = inode->i_size;

			/* possible Solaris 2.5 server bug workaround */
			if (inode->i_ino == fattr->fileid) {
				/*
				 * We expect these values to change, and
				 * don't want to invalidate the caches.
				 */
				inode->i_size  = fattr->size;
				inode->i_mtime = fattr->mtime.seconds;
				nfs_refresh_inode(inode, fattr);
			}
			else
				printk("nfs_wback_result: inode %ld, got %u?\n",
					inode->i_ino, fattr->fileid);
		}
	}

	rpc_release_task(task);

	if (WB_INVALIDATE(req))
		ClearPageUptodate(page);

	/* Balances the get_page() done when the request was queued. */
	__free_page(page);
	remove_write_request(&NFS_WRITEBACK(inode), req);
	nr_write_requests--;
	/* Balances the get_file() in create_write_request(). */
	fput(req->wb_file);

	wake_up(&req->wb_wait);
	/* Drop the reference that was held for the I/O. */
	free_write_request(req);
}