author     Ralf Baechle <ralf@linux-mips.org>    2000-02-23 00:40:54 +0000
committer  Ralf Baechle <ralf@linux-mips.org>    2000-02-23 00:40:54 +0000
commit     529c593ece216e4aaffd36bd940cb94f1fa63129
tree       78f1c0b805f5656aa7b0417a043c5346f700a2cf  /fs/buffer.c
parent     0bd079751d25808d1972baee5c4eaa1db2227257
Merge with 2.3.43. I ignored all modifications to the qlogicisp.c
driver due to the Origin A64 hacks.
Diffstat (limited to 'fs/buffer.c')

-rw-r--r--  fs/buffer.c | 771
1 file changed, 439 insertions(+), 332 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index b2bc7670f..9113c07cc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -94,6 +94,7 @@ static struct bh_free_head free_list[NR_SIZES];
 kmem_cache_t *bh_cachep;
 
 static int grow_buffers(int size);
+static void __refile_buffer(struct buffer_head *);
 
 /* This is used by some architectures to estimate available memory. */
 atomic_t buffermem_pages = ATOMIC_INIT(0);
@@ -277,11 +278,14 @@ repeat:
 
 void sync_dev(kdev_t dev)
 {
-	sync_buffers(dev, 0);
 	sync_supers(dev);
 	sync_inodes(dev);
-	sync_buffers(dev, 0);
 	DQUOT_SYNC(dev);
+	/* sync all the dirty buffers out to disk only _after_ all the
+	   high level layers finished generated buffer dirty data
+	   (or we'll return with some buffer still dirty on the blockdevice
+	   so breaking the semantics of this call) */
+	sync_buffers(dev, 0);
 	/*
 	 * FIXME(eric) we need to sync the physical devices here.
 	 * This is because some (scsi) controllers have huge amounts of
@@ -323,7 +327,9 @@ int file_fsync(struct file *filp, struct dentry *dentry)
 	struct inode * inode = dentry->d_inode;
 	struct super_block * sb;
 	kdev_t dev;
+	int ret;
 
+	lock_kernel();
 	/* sync the inode to buffers */
 	write_inode_now(inode);
 
@@ -335,7 +341,9 @@ int file_fsync(struct file *filp, struct dentry *dentry)
 
 	/* .. finally sync the buffers to disk */
 	dev = inode->i_dev;
-	return sync_buffers(dev, 1);
+	ret = sync_buffers(dev, 1);
+	unlock_kernel();
+	return ret;
 }
 
 asmlinkage long sys_fsync(unsigned int fd)
@@ -345,7 +353,6 @@ asmlinkage long sys_fsync(unsigned int fd)
 	struct inode * inode;
 	int err;
 
-	lock_kernel();
 	err = -EBADF;
 	file = fget(fd);
 	if (!file)
@@ -371,7 +378,6 @@ asmlinkage long sys_fsync(unsigned int fd)
 out_putf:
 	fput(file);
 out:
-	unlock_kernel();
 	return err;
 }
 
@@ -382,7 +388,6 @@ asmlinkage long sys_fdatasync(unsigned int fd)
 	struct inode * inode;
 	int err;
 
-	lock_kernel();
 	err = -EBADF;
 	file = fget(fd);
 	if (!file)
@@ -408,44 +413,9 @@ asmlinkage long sys_fdatasync(unsigned int fd)
 out_putf:
 	fput(file);
 out:
-	unlock_kernel();
 	return err;
 }
 
-void invalidate_buffers(kdev_t dev)
-{
-	int nlist;
-
-	spin_lock(&lru_list_lock);
-	for(nlist = 0; nlist < NR_LIST; nlist++) {
-		struct buffer_head * bh;
-		int i;
-	retry:
-		bh = lru_list[nlist];
-		if (!bh)
-			continue;
-		for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
-			if (bh->b_dev != dev)
-				continue;
-			if (buffer_locked(bh)) {
-				atomic_inc(&bh->b_count);
-				spin_unlock(&lru_list_lock);
-				wait_on_buffer(bh);
-				spin_lock(&lru_list_lock);
-				atomic_dec(&bh->b_count);
-				goto retry;
-			}
-			if (atomic_read(&bh->b_count))
-				continue;
-			clear_bit(BH_Protected, &bh->b_state);
-			clear_bit(BH_Uptodate, &bh->b_state);
-			clear_bit(BH_Dirty, &bh->b_state);
-			clear_bit(BH_Req, &bh->b_state);
-		}
-	}
-	spin_unlock(&lru_list_lock);
-}
-
 /* After several hours of tedious analysis, the following hash
  * function won. Do not mess with it...
  *	-DaveM */
@@ -464,10 +434,12 @@ static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **
 
 static __inline__ void __hash_unlink(struct buffer_head *bh)
 {
-	if (bh->b_next)
-		bh->b_next->b_pprev = bh->b_pprev;
-	*(bh->b_pprev) = bh->b_next;
-	bh->b_pprev = NULL;
+	if (bh->b_pprev) {
+		if (bh->b_next)
+			bh->b_next->b_pprev = bh->b_pprev;
+		*(bh->b_pprev) = bh->b_next;
+		bh->b_pprev = NULL;
+	}
 }
 
 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
@@ -514,17 +486,12 @@ static void __remove_from_free_list(struct buffer_head * bh, int index)
 	bh->b_next_free = bh->b_prev_free = NULL;
 }
 
-/* The following two functions must operate atomically
- * because they control the visibility of a buffer head
- * to the rest of the kernel.
- */
-static __inline__ void __remove_from_queues(struct buffer_head *bh)
+/* must be called with both the hash_table_lock and the lru_list_lock
+   held */
+static void __remove_from_queues(struct buffer_head *bh)
 {
-	write_lock(&hash_table_lock);
-	if (bh->b_pprev)
-		__hash_unlink(bh);
+	__hash_unlink(bh);
 	__remove_from_lru_list(bh, bh->b_list);
-	write_unlock(&hash_table_lock);
 }
 
 static void insert_into_queues(struct buffer_head *bh)
@@ -547,6 +514,8 @@ static void put_last_free(struct buffer_head * bh)
 	struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
 	struct buffer_head **bhp = &head->list;
 
+	bh->b_state = 0;
+
 	spin_lock(&head->lock);
 	bh->b_dev = B_FREE;
 	if(!*bhp) {
@@ -604,11 +573,73 @@ unsigned int get_hardblocksize(kdev_t dev)
 	return 0;
 }
 
+/* If invalidate_buffers() will trash dirty buffers, it means some kind
+   of fs corruption is going on. Trashing dirty data always imply losing
+   information that was supposed to be just stored on the physical layer
+   by the user.
+
+   Thus invalidate_buffers in general usage is not allwowed to trash dirty
+   buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved.
+
+   NOTE: In the case where the user removed a removable-media-disk even if
+   there's still dirty data not synced on disk (due a bug in the device driver
+   or due an error of the user), by not destroying the dirty buffers we could
+   generate corruption also on the next media inserted, thus a parameter is
+   necessary to handle this case in the most safe way possible (trying
+   to not corrupt also the new disk inserted with the data belonging to
+   the old now corrupted disk). Also for the ramdisk the natural thing
+   to do in order to release the ramdisk memory is to destroy dirty buffers.
+
+   These are two special cases. Normal usage imply the device driver
+   to issue a sync on the device (without waiting I/O completation) and
+   then an invalidate_buffers call that doesn't trashes dirty buffers. */
+void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
+{
+	int i, nlist, slept;
+	struct buffer_head * bh, * bh_next;
+
+ retry:
+	slept = 0;
+	spin_lock(&lru_list_lock);
+	for(nlist = 0; nlist < NR_LIST; nlist++) {
+		bh = lru_list[nlist];
+		if (!bh)
+			continue;
+		for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
+			bh_next = bh->b_next_free;
+			if (bh->b_dev != dev)
+				continue;
+			if (buffer_locked(bh)) {
+				atomic_inc(&bh->b_count);
+				spin_unlock(&lru_list_lock);
+				wait_on_buffer(bh);
+				slept = 1;
+				spin_lock(&lru_list_lock);
+				atomic_dec(&bh->b_count);
+			}
+
+			write_lock(&hash_table_lock);
+			if (!atomic_read(&bh->b_count) &&
+			    (destroy_dirty_buffers || !buffer_dirty(bh))) {
+				__remove_from_queues(bh);
+				put_last_free(bh);
+			}
+			write_unlock(&hash_table_lock);
+			if (slept)
+				goto out;
+		}
+	}
+out:
+	spin_unlock(&lru_list_lock);
+	if (slept)
+		goto retry;
+}
+
 void set_blocksize(kdev_t dev, int size)
 {
 	extern int *blksize_size[];
-	int i, nlist;
-	struct buffer_head * bh, *bhnext;
+	int i, nlist, slept;
+	struct buffer_head * bh, * bh_next;
 
 	if (!blksize_size[MAJOR(dev)])
 		return;
@@ -626,41 +657,53 @@ void set_blocksize(kdev_t dev, int size)
 	sync_buffers(dev, 2);
 	blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 
-	/* We need to be quite careful how we do this - we are moving entries
-	 * around on the free list, and we can get in a loop if we are not careful.
-	 */
+ retry:
+	slept = 0;
+	spin_lock(&lru_list_lock);
 	for(nlist = 0; nlist < NR_LIST; nlist++) {
-	repeat:
-		spin_lock(&lru_list_lock);
 		bh = lru_list[nlist];
-		for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
-			if(!bh)
-				break;
-
-			bhnext = bh->b_next_free;
-			if (bh->b_dev != dev)
-				continue;
-			if (bh->b_size == size)
-				continue;
+		if (!bh)
+			continue;
+		for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
+			bh_next = bh->b_next_free;
+			if (bh->b_dev != dev || bh->b_size == size)
+				continue;
 			if (buffer_locked(bh)) {
 				atomic_inc(&bh->b_count);
 				spin_unlock(&lru_list_lock);
 				wait_on_buffer(bh);
+				slept = 1;
+				spin_lock(&lru_list_lock);
 				atomic_dec(&bh->b_count);
-				goto repeat;
-			}
-			if (bh->b_dev == dev && bh->b_size != size) {
-				clear_bit(BH_Dirty, &bh->b_state);
-				clear_bit(BH_Uptodate, &bh->b_state);
-				clear_bit(BH_Req, &bh->b_state);
 			}
-			if (atomic_read(&bh->b_count) == 0) {
+
+			write_lock(&hash_table_lock);
+			if (!atomic_read(&bh->b_count)) {
+				if (buffer_dirty(bh))
+					printk(KERN_WARNING
+					       "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
+					       kdevname(dev), bh->b_blocknr, bh->b_size);
 				__remove_from_queues(bh);
 				put_last_free(bh);
+			} else {
+				if (atomic_set_buffer_clean(bh))
+					__refile_buffer(bh);
+				clear_bit(BH_Uptodate, &bh->b_state);
+				printk(KERN_WARNING
+				       "set_blocksize: "
+				       "b_count %d, dev %s, block %lu, from %p\n",
+				       atomic_read(&bh->b_count), bdevname(bh->b_dev),
+				       bh->b_blocknr, __builtin_return_address(0));
 			}
+			write_unlock(&hash_table_lock);
+			if (slept)
+				goto out;
 		}
-		spin_unlock(&lru_list_lock);
 	}
+ out:
+	spin_unlock(&lru_list_lock);
+	if (slept)
+		goto retry;
 }
 
 /*
@@ -785,30 +828,31 @@ repeat:
 		atomic_set(&bh->b_count, 1);
 	}
 	spin_unlock(&free_list[isize].lock);
-	if (!bh)
-		goto refill;
-	/* OK, FINALLY we know that this buffer is the only one of its kind,
-	 * we hold a reference (b_count>0), it is unlocked, and it is clean.
+	/*
+	 * OK, FINALLY we know that this buffer is the only one of
+	 * its kind, we hold a reference (b_count>0), it is unlocked,
+	 * and it is clean.
 	 */
-	init_buffer(bh, end_buffer_io_sync, NULL);
-	bh->b_dev = dev;
-	bh->b_blocknr = block;
-	bh->b_state = 1 << BH_Mapped;
+	if (bh) {
+		init_buffer(bh, end_buffer_io_sync, NULL);
+		bh->b_dev = dev;
+		bh->b_blocknr = block;
+		bh->b_state = 1 << BH_Mapped;
 
-	/* Insert the buffer into the regular lists */
-	insert_into_queues(bh);
-	goto out;
+		/* Insert the buffer into the regular lists */
+		insert_into_queues(bh);
+	out:
+		touch_buffer(bh);
+		return bh;
+	}
 
 	/*
 	 * If we block while refilling the free list, somebody may
 	 * create the buffer first ... search the hashes again.
 	 */
-refill:
 	refill_freelist(size);
 	goto repeat;
-out:
-	return bh;
 }
 
 /* -1 -> no need to flush
@@ -820,11 +864,13 @@ static int balance_dirty_state(kdev_t dev)
 
 	dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
 	tot = nr_free_buffer_pages();
-	hard_dirty_limit = tot * bdf_prm.b_un.nfract / 100;
-	soft_dirty_limit = hard_dirty_limit >> 1;
+	tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
 
-	if (dirty > soft_dirty_limit)
-	{
+	dirty *= 200;
+	soft_dirty_limit = tot * bdf_prm.b_un.nfract;
+	hard_dirty_limit = soft_dirty_limit * 2;
+
+	if (dirty > soft_dirty_limit) {
 		if (dirty > hard_dirty_limit)
 			return 1;
 		return 0;
@@ -848,29 +894,39 @@ void balance_dirty(kdev_t dev)
 	wakeup_bdflush(state);
 }
 
-static inline void __mark_dirty(struct buffer_head *bh, int flag)
+static __inline__ void __mark_dirty(struct buffer_head *bh, int flag)
 {
 	bh->b_flushtime = jiffies +
 		(flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
-	clear_bit(BH_New, &bh->b_state);
 	refile_buffer(bh);
 }
 
+/* atomic version, the user must call balance_dirty() by hand
+   as soon as it become possible to block */
 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
 {
-	__mark_dirty(bh, flag);
+	if (!atomic_set_buffer_dirty(bh))
+		__mark_dirty(bh, flag);
+}
+
+void mark_buffer_dirty(struct buffer_head *bh, int flag)
+{
+	__mark_buffer_dirty(bh, flag);
+	balance_dirty(bh->b_dev);
 }
 
 /*
  * A buffer may need to be moved from one buffer list to another
  * (e.g. in case it is not shared any more). Handle this.
  */
-static __inline__ void __refile_buffer(struct buffer_head *bh)
+static void __refile_buffer(struct buffer_head *bh)
 {
 	int dispose = BUF_CLEAN;
 	if (buffer_locked(bh))
 		dispose = BUF_LOCKED;
 	if (buffer_dirty(bh))
 		dispose = BUF_DIRTY;
+	if (buffer_protected(bh))
+		dispose = BUF_PROTECTED;
 	if (dispose != bh->b_list) {
 		__remove_from_lru_list(bh, bh->b_list);
 		bh->b_list = dispose;
@@ -890,8 +946,6 @@ void refile_buffer(struct buffer_head *bh)
  */
 void __brelse(struct buffer_head * buf)
 {
-	touch_buffer(buf);
-
 	if (atomic_read(&buf->b_count)) {
 		atomic_dec(&buf->b_count);
 		return;
@@ -912,12 +966,10 @@ void __bforget(struct buffer_head * buf)
 	write_lock(&hash_table_lock);
 	if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
 		goto in_use;
-	if (buf->b_pprev)
-		__hash_unlink(buf);
+	__hash_unlink(buf);
 	write_unlock(&hash_table_lock);
 	__remove_from_lru_list(buf, buf->b_list);
 	spin_unlock(&lru_list_lock);
-	buf->b_state = 0;
 	put_last_free(buf);
 	return;
 
@@ -1218,13 +1270,13 @@ static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], i
 
 static void unmap_buffer(struct buffer_head * bh)
 {
-	if (buffer_mapped(bh))
-	{
+	if (buffer_mapped(bh)) {
 		mark_buffer_clean(bh);
 		wait_on_buffer(bh);
 		clear_bit(BH_Uptodate, &bh->b_state);
 		clear_bit(BH_Mapped, &bh->b_state);
 		clear_bit(BH_Req, &bh->b_state);
+		clear_bit(BH_New, &bh->b_state);
 	}
 }
 
@@ -1303,30 +1355,25 @@ static void create_empty_buffers(struct page *page, struct inode *inode, unsigne
 
 static void unmap_underlying_metadata(struct buffer_head * bh)
 {
-#if 0
-	if (buffer_new(bh)) {
-		struct buffer_head *old_bh;
-
-		old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
-		if (old_bh) {
-			unmap_buffer(old_bh);
-			/* Here we could run brelse or bforget. We use
-			   bforget because it will try to put the buffer
-			   in the freelist. */
-			__bforget(old_bh);
-		}
+	struct buffer_head *old_bh;
+
+	old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
+	if (old_bh) {
+		unmap_buffer(old_bh);
+		/* Here we could run brelse or bforget. We use
+		   bforget because it will try to put the buffer
+		   in the freelist. */
+		__bforget(old_bh);
 	}
-#endif
 }
 
 /*
  * block_write_full_page() is SMP-safe - currently it's still
  * being called with the kernel lock held, but the code is ready.
 */
-int block_write_full_page(struct dentry *dentry, struct page *page)
+static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
 {
-	struct inode *inode = dentry->d_inode;
-	int err, i;
+	int err, i, need_balance_dirty = 0;
 	unsigned long block;
 	struct buffer_head *bh, *head;
 
@@ -1337,17 +1384,11 @@ int block_write_full_page(struct dentry *dentry, struct page *page)
 		create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
 	head = page->buffers;
 
-	/* The page cache is now PAGE_CACHE_SIZE aligned, period. We handle old a.out
-	 * and others via unaligned private mappings.
-	 */
 	block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
 
 	bh = head;
 	i = 0;
 	do {
-		if (!bh)
-			BUG();
-
 		/*
 		 * If the buffer isn't up-to-date, we can't be sure
 		 * that the buffer has been initialized with the proper
@@ -1358,18 +1399,25 @@
 		 */
 		bh->b_end_io = end_buffer_io_sync;
 		if (!buffer_mapped(bh)) {
-			err = inode->i_op->get_block(inode, block, bh, 1);
+			err = get_block(inode, block, bh, 1);
 			if (err)
 				goto out;
-			unmap_underlying_metadata(bh);
+			if (buffer_new(bh))
+				unmap_underlying_metadata(bh);
 		}
 		set_bit(BH_Uptodate, &bh->b_state);
-		mark_buffer_dirty(bh,0);
+		if (!atomic_set_buffer_dirty(bh)) {
+			__mark_dirty(bh, 0);
+			need_balance_dirty = 1;
+		}
 
 		bh = bh->b_this_page;
 		block++;
 	} while (bh != head);
 
+	if (need_balance_dirty)
+		balance_dirty(bh->b_dev);
+
 	SetPageUptodate(page);
 	return 0;
 out:
@@ -1377,13 +1425,12 @@ out:
 	ClearPageUptodate(page);
 	return err;
 }
 
-int block_write_zero_range(struct inode *inode, struct page *page,
-			   unsigned zerofrom, unsigned from, unsigned to,
-			   const char * buf)
+static int __block_prepare_write(struct inode *inode, struct page *page,
+		unsigned from, unsigned to, get_block_t *get_block)
 {
-	unsigned zeroto = 0, block_start, block_end;
+	unsigned block_start, block_end;
 	unsigned long block;
-	int err = 0, partial = 0, need_balance_dirty = 0;
+	int err = 0;
 	unsigned blocksize, bbits;
 	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
 	char *kaddr = (char *)kmap(page);
@@ -1396,35 +1443,31 @@
 	bbits = inode->i_sb->s_blocksize_bits;
 	block = page->index << (PAGE_CACHE_SHIFT - bbits);
 
-	/*
-	 * First pass - map what needs to be mapped, initiate reads
-	 * on the boundaries if needed (i.e. if block is partially covered
-	 * _and_ is not up-to-date _and_ is not new).
-	 */
 	for(bh = head, block_start = 0; bh != head || !block_start;
 	    block++, block_start=block_end, bh = bh->b_this_page) {
 		if (!bh)
 			BUG();
 		block_end = block_start+blocksize;
-		if (block_end <= zerofrom)
+		if (block_end <= from)
 			continue;
 		if (block_start >= to)
 			break;
 		bh->b_end_io = end_buffer_io_sync;
 		if (!buffer_mapped(bh)) {
-			err = inode->i_op->get_block(inode, block, bh, 1);
+			err = get_block(inode, block, bh, 1);
 			if (err)
 				goto out;
-			unmap_underlying_metadata(bh);
-		}
-		if (buffer_new(bh)) {
-			zeroto = block_end;
-			if (block_start < zerofrom)
-				zerofrom = block_start;
-			continue;
+			if (buffer_new(bh)) {
+				unmap_underlying_metadata(bh);
+				if (block_end > to)
+					memset(kaddr+to, 0, block_end-to);
+				if (block_start < from)
+					memset(kaddr+block_start, 0, from-block_start);
+				continue;
+			}
 		}
 		if (!buffer_uptodate(bh) &&
-		     (block_start < zerofrom || block_end > to)) {
+		     (block_start < from || block_end > to)) {
 			ll_rw_block(READ, 1, &bh);
 			*wait_bh++=bh;
 		}
@@ -1438,44 +1481,31 @@
 		if (!buffer_uptodate(*wait_bh))
 			goto out;
 	}
-	/*
-	 * Now we can copy the data.
-	 */
-	if (zerofrom < from)
-		memset(kaddr+zerofrom, 0, from-zerofrom);
-	if (from < to)
-		err = copy_from_user(kaddr+from, buf, to-from);
-	if (to < zeroto)
-		memset(kaddr+to, 0, zeroto-to);
-	else
-		zeroto = to;
-	if (err < 0)
-		goto out;
-	/*
-	 * Second pass: check if all out-of-range blocks are up-to-date
-	 * and mark the rest up-to-date and dirty.
-	 *
-	 * NOTE! This also does a direct dirty balace check,
-	 * rather than relying on bdflush just waking up every
-	 * once in a while.  This is to catch (and slow down)
-	 * the processes that write tons of buffer..
-	 *
-	 * Note how we do NOT want to do this in the full block
-	 * case: full pages are flushed not by the people who
-	 * dirtied them, but by people who need memory. And we
-	 * should not penalize them for somebody else writing
-	 * lots of dirty pages.
-	 */
-	for(bh = head, block_start = 0;
+	return 0;
+out:
+	return err;
+}
+
+static int __block_commit_write(struct inode *inode, struct page *page,
+		unsigned from, unsigned to)
+{
+	unsigned block_start, block_end;
+	int partial = 0, need_balance_dirty = 0;
+	unsigned blocksize;
+	struct buffer_head *bh, *head;
+
+	blocksize = inode->i_sb->s_blocksize;
+
+	for(bh = head = page->buffers, block_start = 0;
 	    bh != head || !block_start;
 	    block_start=block_end, bh = bh->b_this_page) {
 		block_end = block_start + blocksize;
-		if (block_end <= zerofrom || block_start >= zeroto) {
+		if (block_end <= from || block_start >= to) {
 			if (!buffer_uptodate(bh))
 				partial = 1;
 		} else {
 			set_bit(BH_Uptodate, &bh->b_state);
-			if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
+			if (!atomic_set_buffer_dirty(bh)) {
 				__mark_dirty(bh, 0);
 				need_balance_dirty = 1;
 			}
@@ -1492,51 +1522,202 @@
 	 */
 	if (!partial)
 		SetPageUptodate(page);
-	kunmap(page);
 	return 0;
-out:
-	ClearPageUptodate(page);
-	kunmap(page);
-	return err;
 }
 
-int block_write_partial_page(struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf)
+/*
+ * Generic "read page" function for block devices that have the normal
+ * get_block functionality. This is most of the block device filesystems.
+ * Reads the page asynchronously --- the unlock_buffer() and
+ * mark_buffer_uptodate() functions propagate buffer state into the
+ * page struct once IO has completed.
+ */
+static inline int __block_read_full_page(struct inode *inode, struct page *page,
+		get_block_t *get_block)
 {
-	struct inode *inode = file->f_dentry->d_inode;
-	int err;
+	unsigned long iblock;
+	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+	unsigned int blocksize, blocks;
+	unsigned long kaddr = 0;
+	int nr, i;
 
 	if (!PageLocked(page))
-		BUG();
-	if (offset < 0 || offset >= PAGE_SIZE)
-		BUG();
-	if (bytes+offset < 0 || bytes+offset > PAGE_SIZE)
-		BUG();
+		PAGE_BUG(page);
+	blocksize = inode->i_sb->s_blocksize;
+	if (!page->buffers)
+		create_empty_buffers(page, inode, blocksize);
+	head = page->buffers;
 
-	err = block_write_range(inode, page, offset, bytes, buf);
-	return err ? err : bytes;
+	blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
+	iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+	bh = head;
+	nr = 0;
+	i = 0;
+
+	do {
+		if (buffer_uptodate(bh))
+			continue;
+
+		if (!buffer_mapped(bh)) {
+			get_block(inode, iblock, bh, 0);
+			if (!buffer_mapped(bh)) {
+				if (!kaddr)
+					kaddr = kmap(page);
+				memset((char *)(kaddr + i*blocksize), 0, blocksize);
+				set_bit(BH_Uptodate, &bh->b_state);
+				continue;
+			}
+		}
+
+		init_buffer(bh, end_buffer_io_async, NULL);
+		atomic_inc(&bh->b_count);
+		arr[nr] = bh;
+		nr++;
+	} while (i++, iblock++, (bh = bh->b_this_page) != head);
+
+	++current->maj_flt;
+	if (nr) {
+		if (Page_Uptodate(page))
+			BUG();
+		ll_rw_block(READ, nr, arr);
+	} else {
+		/*
+		 * all buffers are uptodate - we can set the page
+		 * uptodate as well.
+		 */
+		SetPageUptodate(page);
+		UnlockPage(page);
+	}
+	if (kaddr)
+		kunmap(page);
+	return 0;
 }
 
 /*
  * For moronic filesystems that do not allow holes in file.
- * we allow offset==PAGE_SIZE, bytes==0
+ * We may have to extend the file.
 */
-int block_write_cont_page(struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf)
+int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = (struct inode*)mapping->host;
+	struct page *new_page;
+	unsigned long pgpos;
+	long status;
+	unsigned zerofrom;
+	unsigned blocksize = inode->i_sb->s_blocksize;
+	char *kaddr;
+
+	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
+		status = -ENOMEM;
+		new_page = grab_cache_page(mapping, pgpos);
+		if (!new_page)
+			goto out;
+		/* we might sleep */
+		if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
+			UnlockPage(new_page);
+			page_cache_release(new_page);
+			continue;
+		}
+		zerofrom = *bytes & ~PAGE_CACHE_MASK;
+		if (zerofrom & (blocksize-1)) {
+			*bytes |= (blocksize-1);
+			(*bytes)++;
+		}
+		status = __block_prepare_write(inode, new_page, zerofrom,
+						PAGE_CACHE_SIZE, get_block);
+		if (status)
+			goto out_unmap;
+		kaddr = (char*)page_address(page);
+		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
+		__block_commit_write(inode, new_page, zerofrom, to);
+		kunmap(new_page);
+		UnlockPage(new_page);
+		page_cache_release(new_page);
+	}
+
+	if (page->index < pgpos) {
+		/* completely inside the area */
+		zerofrom = offset;
+	} else {
+		/* page covers the boundary, find the boundary offset */
+		zerofrom = *bytes & ~PAGE_CACHE_MASK;
+
+		/* if we will expand the thing last block will be filled */
+		if (to > zerofrom && (zerofrom & (blocksize-1))) {
+			*bytes |= (blocksize-1);
+			(*bytes)++;
+		}
+
+		/* starting below the boundary? Nothing to zero out */
+		if (offset <= zerofrom)
+			zerofrom = offset;
+	}
+	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
+	if (status)
+		goto out1;
+	kaddr = (char*)page_address(page);
+	if (zerofrom < offset) {
+		memset(kaddr+zerofrom, 0, offset-zerofrom);
+		__block_commit_write(inode, page, zerofrom, offset);
+	}
+	return 0;
+out1:
+	ClearPageUptodate(page);
+	kunmap(page);
+	return status;
+
+out_unmap:
+	ClearPageUptodate(new_page);
+	kunmap(new_page);
+	UnlockPage(new_page);
+	page_cache_release(new_page);
+out:
+	return status;
+}
+
+int block_prepare_write(struct page *page, unsigned from, unsigned to,
+			get_block_t *get_block)
 {
-	struct inode *inode = file->f_dentry->d_inode;
-	int err;
-	unsigned zerofrom = offset;
+	struct inode *inode = (struct inode*)page->mapping->host;
+	int err = __block_prepare_write(inode, page, from, to, get_block);
+	if (err) {
+		ClearPageUptodate(page);
+		kunmap(page);
+	}
+	return err;
+}
 
-	if (page->index > (inode->i_size >> PAGE_CACHE_SHIFT))
-		zerofrom = 0;
-	else if (page->index == (inode->i_size >> PAGE_CACHE_SHIFT) &&
-		 offset > (inode->i_size & ~PAGE_CACHE_MASK))
-		zerofrom = inode->i_size & ~PAGE_CACHE_MASK;
-	err = block_write_zero_range(inode, page, zerofrom,offset,offset+bytes,
-				     buf);
-	return err ? err : bytes;
+int generic_commit_write(struct file *file, struct page *page,
+		unsigned from, unsigned to)
+{
+	__block_commit_write((struct inode*)page->mapping->host,page,from,to);
+	kunmap(page);
+	return 0;
 }
 
+int block_write_full_page(struct page *page, get_block_t *get_block)
+{
+	struct inode *inode = (struct inode*)page->mapping->host;
+	return __block_write_full_page(inode, page, get_block);
+}
+
+int block_read_full_page(struct page *page, get_block_t *get_block)
+{
+	struct inode *inode = (struct inode*)page->mapping->host;
+	return __block_read_full_page(inode, page, get_block);
+}
+
+int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
+{
+	struct buffer_head tmp;
+	struct inode *inode = (struct inode*)mapping->host;
+	tmp.b_state = 0;
+	tmp.b_blocknr = 0;
+	get_block(inode, block, &tmp, 0);
+	return tmp.b_blocknr;
+}
 
 /*
  * IO completion routine for a buffer_head being used for kiobuf IO: we
@@ -1814,93 +1995,22 @@ int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
 	return 0;
 }
 
-/*
- * Generic "read page" function for block devices that have the normal
- * get_block functionality. This is most of the block device filesystems.
- * Reads the page asynchronously --- the unlock_buffer() and
- * mark_buffer_uptodate() functions propagate buffer state into the
- * page struct once IO has completed.
- */
-static inline int __block_read_full_page(struct inode *inode, struct page *page)
-{
-	unsigned long iblock;
-	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
-	unsigned int blocksize, blocks;
-	unsigned long kaddr = 0;
-	int nr, i;
-
-	if (!PageLocked(page))
-		PAGE_BUG(page);
-	blocksize = inode->i_sb->s_blocksize;
-	if (!page->buffers)
-		create_empty_buffers(page, inode, blocksize);
-	head = page->buffers;
-
-	blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
-	iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
-	bh = head;
-	nr = 0;
-	i = 0;
-
-	do {
-		if (buffer_uptodate(bh))
-			continue;
-
-		if (!buffer_mapped(bh)) {
-			inode->i_op->get_block(inode, iblock, bh, 0);
-			if (!buffer_mapped(bh)) {
-				if (!kaddr)
-					kaddr = kmap(page);
-				memset((char *)(kaddr + i*blocksize), 0, blocksize);
-				set_bit(BH_Uptodate, &bh->b_state);
-				continue;
-			}
-		}
-
-		init_buffer(bh, end_buffer_io_async, NULL);
-		atomic_inc(&bh->b_count);
-		arr[nr] = bh;
-		nr++;
-	} while (i++, iblock++, (bh = bh->b_this_page) != head);
-
-	++current->maj_flt;
-	if (nr) {
-		if (Page_Uptodate(page))
-			BUG();
-		ll_rw_block(READ, nr, arr);
-	} else {
-		/*
-		 * all buffers are uptodate - we can set the page
-		 * uptodate as well.
-		 */
-		SetPageUptodate(page);
-		UnlockPage(page);
-	}
-	if (kaddr)
-		kunmap(page);
-	return 0;
-}
-
-int block_read_full_page(struct dentry *dentry, struct page *page)
-{
-	return __block_read_full_page(dentry->d_inode, page);
-}
-
 int block_symlink(struct inode *inode, const char *symname, int len)
 {
-	struct page *page = grab_cache_page(&inode->i_data, 0);
-	mm_segment_t fs;
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page = grab_cache_page(mapping, 0);
 	int err = -ENOMEM;
+	char *kaddr;
 
 	if (!page)
 		goto fail;
-	fs = get_fs();
-	set_fs(KERNEL_DS);
-	err = block_write_range(inode, page, 0, len-1, symname);
-	set_fs(fs);
-	inode->i_size = len-1;
+	err = mapping->a_ops->prepare_write(page, 0, len-1);
 	if (err)
-		goto fail_write;
+		goto fail_map;
+	kaddr = (char*)page_address(page);
+	memcpy(kaddr, symname, len-1);
+	mapping->a_ops->commit_write(NULL, page, 0, len-1);
+	inode->i_size = len-1;
 	/*
 	 * Notice that we are _not_ going to block here - end of page is
	 * unmapped, so this will only try to map the rest of page, see
@@ -1908,14 +2018,15 @@
 	 * ->i_size will be enough for everything) and zero it out.
 	 * OTOH it's obviously correct and should make the page up-to-date.
 	 */
-	err = __block_read_full_page(inode, page);
+	err = mapping->a_ops->readpage(NULL, page);
 	wait_on_page(page);
 	page_cache_release(page);
 	if (err < 0)
 		goto fail;
 	mark_inode_dirty(inode);
 	return 0;
-fail_write:
+fail_map:
+	inode->i_size = len-1;
 	UnlockPage(page);
 	page_cache_release(page);
 fail:
@@ -2000,7 +2111,7 @@ out:
  */
 int try_to_free_buffers(struct page * page)
 {
-	struct buffer_head * tmp, * bh = page->buffers;
+	struct buffer_head * tmp, * p, * bh = page->buffers;
 	int index = BUFSIZE_INDEX(bh->b_size);
 	int ret;
 
@@ -2009,7 +2120,7 @@
 	spin_lock(&free_list[index].lock);
 	tmp = bh;
 	do {
-		struct buffer_head * p = tmp;
+		p = tmp;
 
 		tmp = tmp->b_this_page;
 		if (buffer_busy(p))
@@ -2025,13 +2136,10 @@
 		/* The buffer can be either on the regular
 		 * queues or on the free list..
 		 */
-		if (p->b_dev == B_FREE) {
+		if (p->b_dev != B_FREE)
+			__remove_from_queues(p);
+		else
 			__remove_from_free_list(p, index);
-		} else {
-			if (p->b_pprev)
-				__hash_unlink(p);
-			__remove_from_lru_list(p, p->b_list);
-		}
 		__put_unused_buffer_head(p);
 	} while (tmp != bh);
 	spin_unlock(&unused_list_lock);
@@ -2051,7 +2159,8 @@ out:
 
 busy_buffer_page:
 	/* Uhhuh, start writeback so that we don't end up with all dirty pages */
-	wakeup_bdflush(0);
+	if (buffer_dirty(p))
+		wakeup_bdflush(0);
 	ret = 0;
 	goto out;
 }
@@ -2065,7 +2174,7 @@ void show_buffers(void)
 	int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
 	int protected = 0;
 	int nlist;
-	static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY" };
+	static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
 #endif
 
 	printk("Buffer memory: %6dkB\n",
@@ -2091,10 +2200,16 @@ void show_buffers(void)
 				used++, lastused = found;
 			bh = bh->b_next_free;
 		} while (bh != lru_list[nlist]);
-		printk("%8s: %d buffers, %d used (last=%d), "
+		{
+			int tmp = nr_buffers_type[nlist];
+			if (found != tmp)
+				printk("%9s: BUG -> found %d, reported %d\n",
+				       buf_types[nlist], found, tmp);
+		}
+		printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
 		       "%d locked, %d protected, %d dirty\n",
-		       buf_types[nlist], found, used, lastused,
-		       locked, protected, dirty);
+		       buf_types[nlist], found, size_buffers_type[nlist]>>10,
+		       used, lastused, locked, protected, dirty);
 	}
 	spin_unlock(&lru_list_lock);
 #endif
@@ -2184,8 +2299,7 @@ void wakeup_bdflush(int block)
 	if (current == bdflush_tsk)
 		return;
 
-	if (!block)
-	{
+	if (!block) {
 		wake_up_process(bdflush_tsk);
 		return;
 	}
@@ -2210,7 +2324,7 @@ void wakeup_bdflush(int block)
    as all dirty buffers lives _only_ in the DIRTY lru list.
   As we never browse the LOCKED and CLEAN lru lists they are infact
   completly useless. */
-static void flush_dirty_buffers(int check_flushtime)
+static int flush_dirty_buffers(int check_flushtime)
 {
 	struct buffer_head * bh, *next;
 	int flushed = 0, i;
@@ -2220,29 +2334,24 @@
 	bh = lru_list[BUF_DIRTY];
 	if (!bh)
 		goto out_unlock;
-	for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next)
-	{
+	for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
 		next = bh->b_next_free;
 
-		if (!buffer_dirty(bh))
-		{
+		if (!buffer_dirty(bh)) {
 			__refile_buffer(bh);
 			continue;
 		}
 		if (buffer_locked(bh))
 			continue;
 
-		if (check_flushtime)
-		{
-			/* The dirty lru list is chronogical ordered so
+		if (check_flushtime) {
+			/* The dirty lru list is chronologically ordered so
			   if the current bh is not yet timed out,
			   then also all the following bhs
			   will be too young. */
 			if (time_before(jiffies, bh->b_flushtime))
 				goto out_unlock;
-		}
-		else
-		{
+		} else {
 			if (++flushed > bdf_prm.b_un.ndirty)
 				goto out_unlock;
 		}
@@ -2259,6 +2368,8 @@
 	}
out_unlock:
 	spin_unlock(&lru_list_lock);
+
+	return flushed;
 }
 
 /*
@@ -2342,6 +2453,7 @@ asmlinkage long sys_bdflush(int func, long data)
  */
 int bdflush(void * unused)
 {
+	int flushed;
 	/*
	 * We have a bare-bones task_struct, and really should fill
	 * in a few more things so "top" and /proc/2/{exe,root,cwd}
@@ -2363,7 +2475,7 @@ int bdflush(void * unused)
 	for (;;) {
 		CHECK_EMERGENCY_SYNC
 
-		flush_dirty_buffers(0);
+		flushed = flush_dirty_buffers(0);
 
 		/* If wakeup_bdflush will wakeup us
		   after our bdflush_done wakeup, then
@@ -2378,10 +2490,10 @@ int bdflush(void * unused)
 		/*
		 * If there are still a lot of dirty buffers around,
		 * skip the sleep and flush some more. Otherwise, we
-		 * sleep for a while.
+		 * go to sleep waiting a wakeup.
		 */
-		if (balance_dirty_state(NODEV) < 0)
-			schedule_timeout(5*HZ);
+		if (!flushed || balance_dirty_state(NODEV) < 0)
+			schedule();
 		/* Remember to mark us as running otherwise
		   the next schedule will block. */
 		__set_current_state(TASK_RUNNING);
@@ -2413,24 +2525,19 @@ int kupdate(void * unused)
 	for (;;) {
 		/* update interval */
 		interval = bdf_prm.b_un.interval;
-		if (interval)
-		{
+		if (interval) {
 			tsk->state = TASK_INTERRUPTIBLE;
 			schedule_timeout(interval);
-		}
-		else
-		{
+		} else {
 		stop_kupdate:
 			tsk->state = TASK_STOPPED;
 			schedule(); /* wait for SIGCONT */
 		}
 		/* check for sigstop */
-		if (signal_pending(tsk))
-		{
+		if (signal_pending(tsk)) {
 			int stopped = 0;
 			spin_lock_irq(&tsk->sigmask_lock);
-			if (sigismember(&tsk->signal, SIGSTOP))
-			{
+			if (sigismember(&tsk->signal, SIGSTOP)) {
 				sigdelset(&tsk->signal, SIGSTOP);
 				stopped = 1;
 			}
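A note on using the new interface. The patch replaces the dentry-based
block_write_full_page()/block_read_full_page() entry points with helpers
parameterized by a get_block_t callback, so the generic code no longer
reaches through i_op->get_block. Below is a hedged sketch of how a simple
filesystem of this vintage might sit on top of the helpers. The myfs_*
names and the trivial 1:1 block mapping are hypothetical, and the exact
address_space_operations member signatures moved around during 2.3.x, so
treat the wrappers as illustrative rather than as code from this commit.

static int myfs_get_block(struct inode *inode, long iblock,
			  struct buffer_head *bh_result, int create)
{
	/* Toy mapping: logical block i lives at physical block i.  A real
	 * filesystem would look the block up (and, when create != 0,
	 * allocate it, setting BH_New so __block_prepare_write() zeroes
	 * the parts of the block the caller will not overwrite). */
	bh_result->b_dev = inode->i_dev;
	bh_result->b_blocknr = iblock;
	bh_result->b_state |= (1UL << BH_Mapped);
	return 0;
}

static int myfs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, myfs_get_block);
}

static int myfs_prepare_write(struct page *page, unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, myfs_get_block);
}

static int myfs_bmap(struct address_space *mapping, long block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}

static struct address_space_operations myfs_aops = {
	readpage:	myfs_readpage,
	prepare_write:	myfs_prepare_write,
	commit_write:	generic_commit_write,	/* exported by the patch */
	bmap:		myfs_bmap,
};

The page-cache write path can then drive each write as a
prepare_write/copy/commit_write triple, which is exactly the sequence
block_symlink() performs by hand in the patch above.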
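The long comment above __invalidate_buffers() distinguishes two intended
callers. Here is a hedged sketch of both; the mydrv_/myrd_ hooks are
hypothetical, and fsync_dev() is used for brevity even though the comment
envisages a sync that does not wait for I/O completion.

/* Removable media change: push dirty buffers toward the old medium,
 * then drop only the clean ones, so stale dirty data cannot end up on
 * the next disk (destroy_dirty_buffers == 0). */
static void mydrv_media_change(kdev_t dev)
{
	fsync_dev(dev);
	__invalidate_buffers(dev, 0);
}

/* Ramdisk release: the buffer cache is the backing store itself, so
 * dirty buffers must be destroyed for the memory to really go away. */
static void myrd_release(kdev_t dev)
{
	__invalidate_buffers(dev, 1);
}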
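The rewritten balance_dirty_state() replaces tot * nfract / 100 (and its
halving) with a comparison of dirty * 200 against tot * nfract: the soft
limit now sits at nfract/2 percent of the unprotected buffer pages and the
hard limit at nfract percent, without the old integer-division truncation.
A worked example in ordinary userspace C, taking nfract = 40 purely for
illustration (not a claim about the kernel's default):

#include <stdio.h>

int main(void)
{
	unsigned long tot = 10000;	/* free buffer pages, minus BUF_PROTECTED */
	unsigned long nfract = 40;	/* illustrative tuning value only */
	unsigned long soft = tot * nfract;	/* compared against dirty*200 */
	unsigned long dirty;

	for (dirty = 1000; dirty <= 5000; dirty += 1000) {
		unsigned long scaled = dirty * 200;
		int state = scaled > 2 * soft ? 1 : (scaled > soft ? 0 : -1);
		/* prints -1 up to 20% dirty, 0 above that up to 40%, 1 beyond */
		printf("dirty=%5lu (%2lu%%) -> state %2d\n",
		       dirty, dirty * 100 / tot, state);
	}
	return 0;
}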
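Finally, cont_prepare_write() takes over from block_write_cont_page() for
filesystems that cannot represent holes: the caller hands in a pointer to
its count of initialized bytes, and every page between that mark and the
write target is zero-filled before the write proceeds. A minimal
hypothetical caller follows; nohole_get_block would have the same shape as
myfs_get_block above, and a real filesystem keeps the byte count per inode
rather than in a global.

static int nohole_get_block(struct inode *, long, struct buffer_head *, int);

static unsigned long nohole_bytes;	/* per-inode state in a real fs */

static int nohole_prepare_write(struct page *page, unsigned from, unsigned to)
{
	/* zero-fills from nohole_bytes up to this write (rounded out to
	 * block boundaries) before __block_prepare_write() runs here */
	return cont_prepare_write(page, from, to, nohole_get_block,
				  &nohole_bytes);
}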