author | Ralf Baechle <ralf@linux-mips.org> | 1999-07-05 23:09:37 +0000 |
---|---|---|
committer | Ralf Baechle <ralf@linux-mips.org> | 1999-07-05 23:09:37 +0000 |
commit | aba344fdfed81b2c03d6114c54cfd73a486aa10b (patch) | |
tree | d032d8430bf1234c3ecc6f6330d6de6e887e5963 /fs/buffer.c | |
parent | 40c138bfc6d37dbff5339f84575db1e3cec6e34e (diff) | |
Merge with Linux 2.3.9.
Diffstat (limited to 'fs/buffer.c')
-rw-r--r-- | fs/buffer.c | 1243 |
1 file changed, 565 insertions, 678 deletions
diff --git a/fs/buffer.c b/fs/buffer.c index 9ffb8556a..108b385ea 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -24,6 +24,8 @@ * - RMK */ +/* Thread it... -DaveM */ + #include <linux/sched.h> #include <linux/fs.h> #include <linux/malloc.h> @@ -57,31 +59,39 @@ static char buffersize_index[65] = #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this number of unused buffer heads */ -/* - * Hash table mask.. +/* Anti-deadlock ordering: + * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock */ -static unsigned long bh_hash_mask = 0; - -static int grow_buffers(int size); -static struct buffer_head ** hash_table; -static struct buffer_head * lru_list[NR_LIST] = {NULL, }; -static struct buffer_head * free_list[NR_SIZES] = {NULL, }; +/* + * Hash table gook.. + */ +static unsigned int bh_hash_mask = 0; +static unsigned int bh_hash_shift = 0; +static struct buffer_head **hash_table; +static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED; -static kmem_cache_t *bh_cachep; +static struct buffer_head *lru_list[NR_LIST]; +static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED; +static int nr_buffers_type[NR_LIST] = {0,}; static struct buffer_head * unused_list = NULL; -static struct buffer_head * reuse_list = NULL; +static int nr_unused_buffer_heads = 0; +static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED; static DECLARE_WAIT_QUEUE_HEAD(buffer_wait); -static int nr_buffers = 0; -static int nr_buffers_type[NR_LIST] = {0,}; -static int nr_buffer_heads = 0; -static int nr_unused_buffer_heads = 0; -static int nr_hashed_buffers = 0; +struct bh_free_head { + struct buffer_head *list; + spinlock_t lock; +}; +static struct bh_free_head free_list[NR_SIZES]; + +static kmem_cache_t *bh_cachep; + +static int grow_buffers(int size); /* This is used by some architectures to estimate available memory. */ -int buffermem = 0; +atomic_t buffermem = ATOMIC_INIT(0); /* Here is the parameter block for the bdflush process. If you add or * remove any of the parameters, make sure to update kernel/sysctl.c. @@ -131,7 +141,7 @@ void __wait_on_buffer(struct buffer_head * bh) struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); - bh->b_count++; + atomic_inc(&bh->b_count); add_wait_queue(&bh->b_wait, &wait); repeat: tsk->state = TASK_UNINTERRUPTIBLE; @@ -142,7 +152,7 @@ repeat: } tsk->state = TASK_RUNNING; remove_wait_queue(&bh->b_wait, &wait); - bh->b_count--; + atomic_dec(&bh->b_count); } /* Call sync_buffers with wait!=0 to ensure that the call does not @@ -167,17 +177,19 @@ static int sync_buffers(kdev_t dev, int wait) */ do { retry = 0; -repeat: + /* We search all lists as a failsafe mechanism, not because we expect * there to be dirty buffers on any of the other lists. */ +repeat: + spin_lock(&lru_list_lock); bh = lru_list[BUF_DIRTY]; if (!bh) goto repeat2; + for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) { - if (bh->b_list != BUF_DIRTY) - goto repeat; next = bh->b_next_free; + if (!lru_list[BUF_DIRTY]) break; if (dev && bh->b_dev != dev) @@ -190,7 +202,10 @@ repeat: retry = 1; continue; } + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); wait_on_buffer (bh); + atomic_dec(&bh->b_count); goto repeat; } @@ -209,30 +224,24 @@ repeat: if (!buffer_dirty(bh) || pass >= 2) continue; - /* Don't bother about locked buffers. - * - * XXX We checked if it was locked above and there is no - * XXX way we could have slept in between. 
-DaveM - */ - if (buffer_locked(bh)) - continue; - bh->b_count++; - next->b_count++; + atomic_inc(&bh->b_count); bh->b_flushtime = 0; + spin_unlock(&lru_list_lock); ll_rw_block(WRITE, 1, &bh); - bh->b_count--; - next->b_count--; + atomic_dec(&bh->b_count); retry = 1; + goto repeat; } repeat2: bh = lru_list[BUF_LOCKED]; - if (!bh) + if (!bh) { + spin_unlock(&lru_list_lock); break; + } for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) { - if (bh->b_list != BUF_LOCKED) - goto repeat2; next = bh->b_next_free; + if (!lru_list[BUF_LOCKED]) break; if (dev && bh->b_dev != dev) @@ -245,10 +254,15 @@ repeat: retry = 1; continue; } + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); wait_on_buffer (bh); + spin_lock(&lru_list_lock); + atomic_dec(&bh->b_count); goto repeat2; } } + spin_unlock(&lru_list_lock); /* If we are waiting for the sync to succeed, and if any dirty * blocks were written, then repeat; on the second pass, only @@ -282,17 +296,19 @@ void sync_dev(kdev_t dev) int fsync_dev(kdev_t dev) { sync_buffers(dev, 0); + + lock_kernel(); sync_supers(dev); sync_inodes(dev); DQUOT_SYNC(dev); + unlock_kernel(); + return sync_buffers(dev, 1); } asmlinkage int sys_sync(void) { - lock_kernel(); fsync_dev(0); - unlock_kernel(); return 0; } @@ -396,19 +412,28 @@ out: void invalidate_buffers(kdev_t dev) { - int i; int nlist; - struct buffer_head * bh; + spin_lock(&lru_list_lock); for(nlist = 0; nlist < NR_LIST; nlist++) { + struct buffer_head * bh; + int i; + retry: bh = lru_list[nlist]; + if (!bh) + continue; for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) { if (bh->b_dev != dev) continue; - wait_on_buffer(bh); - if (bh->b_dev != dev) - continue; - if (bh->b_count) + if (buffer_locked(bh)) { + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + spin_lock(&lru_list_lock); + atomic_dec(&bh->b_count); + goto retry; + } + if (atomic_read(&bh->b_count)) continue; bh->b_flushtime = 0; clear_bit(BH_Protected, &bh->b_state); @@ -417,157 +442,119 @@ void invalidate_buffers(kdev_t dev) clear_bit(BH_Req, &bh->b_state); } } + spin_unlock(&lru_list_lock); } -#define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^block)) & bh_hash_mask) -#define hash(dev,block) hash_table[_hashfn(dev,block)] +/* After several hours of tedious analysis, the following hash + * function won. Do not mess with it... 
-DaveM + */ +#define _hashfn(dev,block) \ + ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \ + (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12)))) +#define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)] -static void insert_into_hash_list(struct buffer_head * bh) +static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head) { - bh->b_next = NULL; - bh->b_pprev = NULL; - if (bh->b_dev) { - struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr); - struct buffer_head *next = *bhp; - - if (next) { - bh->b_next = next; - next->b_pprev = &bh->b_next; - } - *bhp = bh; - bh->b_pprev = bhp; - nr_hashed_buffers++; - } + if ((bh->b_next = *head) != NULL) + bh->b_next->b_pprev = &bh->b_next; + *head = bh; + bh->b_pprev = head; } -static void remove_from_hash_queue(struct buffer_head * bh) +static __inline__ void __hash_unlink(struct buffer_head *bh) { - struct buffer_head **pprev = bh->b_pprev; - if (pprev) { - struct buffer_head * next = bh->b_next; - if (next) { - next->b_pprev = pprev; - bh->b_next = NULL; - } - *pprev = next; - bh->b_pprev = NULL; - nr_hashed_buffers--; - } + if (bh->b_next) + bh->b_next->b_pprev = bh->b_pprev; + *(bh->b_pprev) = bh->b_next; + bh->b_pprev = NULL; } -static void insert_into_lru_list(struct buffer_head * bh) +static void __insert_into_lru_list(struct buffer_head * bh, int blist) { - struct buffer_head **bhp = &lru_list[bh->b_list]; - - if (bh->b_dev == B_FREE) - BUG(); + struct buffer_head **bhp = &lru_list[blist]; if(!*bhp) { *bhp = bh; bh->b_prev_free = bh; } - - if (bh->b_next_free) - panic("VFS: buffer LRU pointers corrupted"); - bh->b_next_free = *bhp; bh->b_prev_free = (*bhp)->b_prev_free; (*bhp)->b_prev_free->b_next_free = bh; (*bhp)->b_prev_free = bh; - - nr_buffers++; - nr_buffers_type[bh->b_list]++; + nr_buffers_type[blist]++; } -static void remove_from_lru_list(struct buffer_head * bh) +static void __remove_from_lru_list(struct buffer_head * bh, int blist) { - if (!(bh->b_prev_free) || !(bh->b_next_free)) - return; - - if (bh->b_dev == B_FREE) { - printk("LRU list corrupted"); - *(int*)0 = 0; + if (bh->b_prev_free || bh->b_next_free) { + bh->b_prev_free->b_next_free = bh->b_next_free; + bh->b_next_free->b_prev_free = bh->b_prev_free; + if (lru_list[blist] == bh) + lru_list[blist] = bh->b_next_free; + if (lru_list[blist] == bh) + lru_list[blist] = NULL; + bh->b_next_free = bh->b_prev_free = NULL; + nr_buffers_type[blist]--; } - bh->b_prev_free->b_next_free = bh->b_next_free; - bh->b_next_free->b_prev_free = bh->b_prev_free; - - if (lru_list[bh->b_list] == bh) - lru_list[bh->b_list] = bh->b_next_free; - if (lru_list[bh->b_list] == bh) - lru_list[bh->b_list] = NULL; - bh->b_next_free = bh->b_prev_free = NULL; - - nr_buffers--; - nr_buffers_type[bh->b_list]--; } -static void remove_from_free_list(struct buffer_head * bh) +static void __remove_from_free_list(struct buffer_head * bh, int index) { - int isize = BUFSIZE_INDEX(bh->b_size); - if (!(bh->b_prev_free) || !(bh->b_next_free)) - panic("VFS: Free block list corrupted"); - if(bh->b_dev != B_FREE) - panic("Free list corrupted"); - if(!free_list[isize]) - panic("Free list empty"); if(bh->b_next_free == bh) - free_list[isize] = NULL; + free_list[index].list = NULL; else { bh->b_prev_free->b_next_free = bh->b_next_free; bh->b_next_free->b_prev_free = bh->b_prev_free; - if (free_list[isize] == bh) - free_list[isize] = bh->b_next_free; + if (free_list[index].list == bh) + free_list[index].list = bh->b_next_free; } 
bh->b_next_free = bh->b_prev_free = NULL; } -static void remove_from_queues(struct buffer_head * bh) +/* The following two functions must operate atomically + * because they control the visibility of a buffer head + * to the rest of the kernel. + */ +static __inline__ void __remove_from_queues(struct buffer_head *bh) { - if (bh->b_dev == B_FREE) - BUG(); - remove_from_hash_queue(bh); - remove_from_lru_list(bh); + write_lock(&hash_table_lock); + if (bh->b_pprev) + __hash_unlink(bh); + __remove_from_lru_list(bh, bh->b_list); + write_unlock(&hash_table_lock); } -static void put_last_free(struct buffer_head * bh) +static void insert_into_queues(struct buffer_head *bh) { - if (bh) { - struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)]; - - if (bh->b_count) - BUG(); - - bh->b_dev = B_FREE; /* So it is obvious we are on the free list. */ - - /* Add to back of free list. */ - if(!*bhp) { - *bhp = bh; - bh->b_prev_free = bh; - } - - bh->b_next_free = *bhp; - bh->b_prev_free = (*bhp)->b_prev_free; - (*bhp)->b_prev_free->b_next_free = bh; - (*bhp)->b_prev_free = bh; - } + struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr); + + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + __hash_link(bh, head); + __insert_into_lru_list(bh, bh->b_list); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); } -struct buffer_head * find_buffer(kdev_t dev, int block, int size) -{ - struct buffer_head * next; +/* This function must only run if there are no other + * references _anywhere_ to this buffer head. + */ +static void put_last_free(struct buffer_head * bh) +{ + struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)]; + struct buffer_head **bhp = &head->list; - next = hash(dev,block); - for (;;) { - struct buffer_head *tmp = next; - if (!next) - break; - next = tmp->b_next; - if (tmp->b_blocknr != block || tmp->b_size != size || tmp->b_dev != dev) - continue; - next = tmp; - break; + spin_lock(&head->lock); + bh->b_dev = B_FREE; + if(!*bhp) { + *bhp = bh; + bh->b_prev_free = bh; } - return next; + bh->b_next_free = *bhp; + bh->b_prev_free = (*bhp)->b_prev_free; + (*bhp)->b_prev_free->b_next_free = bh; + (*bhp)->b_prev_free = bh; + spin_unlock(&head->lock); } /* @@ -579,10 +566,19 @@ struct buffer_head * find_buffer(kdev_t dev, int block, int size) */ struct buffer_head * get_hash_table(kdev_t dev, int block, int size) { - struct buffer_head * bh; - bh = find_buffer(dev,block,size); + struct buffer_head **head = &hash(dev, block); + struct buffer_head *bh; + + read_lock(&hash_table_lock); + for(bh = *head; bh; bh = bh->b_next) + if (bh->b_blocknr == block && + bh->b_size == size && + bh->b_dev == dev) + break; if (bh) - bh->b_count++; + atomic_inc(&bh->b_count); + read_unlock(&hash_table_lock); + return bh; } @@ -631,6 +627,8 @@ void set_blocksize(kdev_t dev, int size) * around on the free list, and we can get in a loop if we are not careful. 
*/ for(nlist = 0; nlist < NR_LIST; nlist++) { + repeat: + spin_lock(&lru_list_lock); bh = lru_list[nlist]; for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) { if(!bh) @@ -641,21 +639,25 @@ void set_blocksize(kdev_t dev, int size) continue; if (bh->b_size == size) continue; - bhnext->b_count++; - bh->b_count++; - wait_on_buffer(bh); - bhnext->b_count--; + if (buffer_locked(bh)) { + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + atomic_dec(&bh->b_count); + goto repeat; + } if (bh->b_dev == dev && bh->b_size != size) { clear_bit(BH_Dirty, &bh->b_state); clear_bit(BH_Uptodate, &bh->b_state); clear_bit(BH_Req, &bh->b_state); bh->b_flushtime = 0; } - if (--bh->b_count) - continue; - remove_from_queues(bh); - put_last_free(bh); + if (atomic_read(&bh->b_count) == 0) { + __remove_from_queues(bh); + put_last_free(bh); + } } + spin_unlock(&lru_list_lock); } } @@ -671,13 +673,10 @@ static void refill_freelist(int size) } } -void init_buffer(struct buffer_head *bh, kdev_t dev, int block, - bh_end_io_t *handler, void *dev_id) +void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *dev_id) { bh->b_list = BUF_CLEAN; bh->b_flushtime = 0; - bh->b_dev = dev; - bh->b_blocknr = block; bh->b_end_io = handler; bh->b_dev_id = dev_id; } @@ -688,6 +687,92 @@ static void end_buffer_io_sync(struct buffer_head *bh, int uptodate) unlock_buffer(bh); } +static void end_buffer_io_bad(struct buffer_head *bh, int uptodate) +{ + mark_buffer_uptodate(bh, uptodate); + unlock_buffer(bh); + BUG(); +} + +static void end_buffer_io_async(struct buffer_head * bh, int uptodate) +{ + static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED; + unsigned long flags; + struct buffer_head *tmp; + struct page *page; + int free; + + mark_buffer_uptodate(bh, uptodate); + + /* This is a temporary buffer used for page I/O. */ + page = mem_map + MAP_NR(bh->b_data); + + if (!uptodate) + SetPageError(page); + + /* + * Be _very_ careful from here on. Bad things can happen if + * two buffer heads end IO at almost the same time and both + * decide that the page is now completely done. + * + * Async buffer_heads are here only as labels for IO, and get + * thrown away once the IO for this page is complete. IO is + * deemed complete once all buffers have been visited + * (b_count==0) and are now unlocked. We must make sure that + * only the _last_ buffer that decrements its count is the one + * that free's the page.. + */ + spin_lock_irqsave(&page_uptodate_lock, flags); + unlock_buffer(bh); + atomic_dec(&bh->b_count); + tmp = bh->b_this_page; + while (tmp != bh) { + if (atomic_read(&tmp->b_count) && + (tmp->b_end_io == end_buffer_io_async)) + goto still_busy; + tmp = tmp->b_this_page; + } + + /* OK, the async IO on this page is complete. */ + spin_unlock_irqrestore(&page_uptodate_lock, flags); + + /* + * if none of the buffers had errors then we can set the + * page uptodate: + */ + if (!PageError(page)) + SetPageUptodate(page); + + /* + * Run the hooks that have to be done when a page I/O has completed. + * + * Note - we need to test the flags before we unlock the page, but + * we must not actually free the page until after the unlock! 
+ */ + if (test_and_clear_bit(PG_decr_after, &page->flags)) + atomic_dec(&nr_async_pages); + + if (test_and_clear_bit(PG_free_swap_after, &page->flags)) + swap_free(page->offset); + + free = test_and_clear_bit(PG_free_after, &page->flags); + + if (page->owner != -1) + PAGE_BUG(page); + page->owner = (int)current; + UnlockPage(page); + + if (free) + __free_page(page); + + return; + +still_busy: + spin_unlock_irqrestore(&page_uptodate_lock, flags); + return; +} + + /* * Ok, this is getblk, and it isn't very clear, again to hinder * race-conditions. Most of the code is seldom used, (ie repeating), @@ -713,22 +798,26 @@ repeat: } isize = BUFSIZE_INDEX(size); -get_free: - bh = free_list[isize]; + spin_lock(&free_list[isize].lock); + bh = free_list[isize].list; + if (bh) { + __remove_from_free_list(bh, isize); + atomic_set(&bh->b_count, 1); + } + spin_unlock(&free_list[isize].lock); if (!bh) goto refill; - remove_from_free_list(bh); /* OK, FINALLY we know that this buffer is the only one of its kind, - * and that it's unused (b_count=0), unlocked, and clean. + * we hold a reference (b_count>0), it is unlocked, and it is clean. */ - init_buffer(bh, dev, block, end_buffer_io_sync, NULL); - bh->b_count = 1; - bh->b_state = 0; + init_buffer(bh, end_buffer_io_sync, NULL); + bh->b_dev = dev; + bh->b_blocknr = block; + bh->b_state = 1 << BH_Mapped; /* Insert the buffer into the regular lists */ - insert_into_lru_list(bh); - insert_into_hash_list(bh); + insert_into_queues(bh); goto out; /* @@ -737,24 +826,12 @@ get_free: */ refill: refill_freelist(size); - if (!find_buffer(dev,block,size)) - goto get_free; goto repeat; out: return bh; } /* - * Put a buffer into the appropriate list, without side-effects. - */ -static void file_buffer(struct buffer_head *bh, int list) -{ - remove_from_lru_list(bh); - bh->b_list = list; - insert_into_lru_list(bh); -} - -/* * if a new dirty buffer is created we need to balance bdflush. * * in the future we might want to make bdflush aware of different @@ -783,6 +860,7 @@ void balance_dirty(kdev_t dev) static inline void __mark_dirty(struct buffer_head *bh, int flag) { bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer); + clear_bit(BH_New, &bh->b_state); refile_buffer(bh); } @@ -791,34 +869,29 @@ void __mark_buffer_dirty(struct buffer_head *bh, int flag) __mark_dirty(bh, flag); } -void __atomic_mark_buffer_dirty(struct buffer_head *bh, int flag) -{ - lock_kernel(); - __mark_dirty(bh, flag); - unlock_kernel(); -} - /* * A buffer may need to be moved from one buffer list to another * (e.g. in case it is not shared any more). Handle this. 
*/ -void refile_buffer(struct buffer_head * buf) +static __inline__ void __refile_buffer(struct buffer_head *bh) { - int dispose; - - if (buf->b_dev == B_FREE) { - printk("Attempt to refile free buffer\n"); - return; - } - - dispose = BUF_CLEAN; - if (buffer_locked(buf)) + int dispose = BUF_CLEAN; + if (buffer_locked(bh)) dispose = BUF_LOCKED; - if (buffer_dirty(buf)) + if (buffer_dirty(bh)) dispose = BUF_DIRTY; + if (dispose != bh->b_list) { + __remove_from_lru_list(bh, bh->b_list); + bh->b_list = dispose; + __insert_into_lru_list(bh, dispose); + } +} - if (dispose != buf->b_list) - file_buffer(buf, dispose); +void refile_buffer(struct buffer_head *bh) +{ + spin_lock(&lru_list_lock); + __refile_buffer(bh); + spin_unlock(&lru_list_lock); } /* @@ -828,9 +901,8 @@ void __brelse(struct buffer_head * buf) { touch_buffer(buf); - if (buf->b_count) { - buf->b_count--; - wake_up(&buffer_wait); + if (atomic_read(&buf->b_count)) { + atomic_dec(&buf->b_count); return; } printk("VFS: brelse: Trying to free free buffer\n"); @@ -844,14 +916,21 @@ void __brelse(struct buffer_head * buf) */ void __bforget(struct buffer_head * buf) { - if (buf->b_count != 1 || buffer_locked(buf)) { - __brelse(buf); - return; + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + if (atomic_read(&buf->b_count) != 1 || buffer_locked(buf)) { + touch_buffer(buf); + atomic_dec(&buf->b_count); + } else { + atomic_set(&buf->b_count, 0); + buf->b_state = 0; + if (buf->b_pprev) + __hash_unlink(buf); + __remove_from_lru_list(buf, buf->b_list); + put_last_free(buf); } - buf->b_count = 0; - buf->b_state = 0; - remove_from_queues(buf); - put_last_free(buf); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); } /* @@ -941,49 +1020,25 @@ struct buffer_head * breada(kdev_t dev, int block, int bufsize, /* * Note: the caller should wake up the buffer_wait list if needed. */ -static void put_unused_buffer_head(struct buffer_head * bh) +static __inline__ void __put_unused_buffer_head(struct buffer_head * bh) { if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { - nr_buffer_heads--; kmem_cache_free(bh_cachep, bh); - return; + } else { + bh->b_blocknr = -1; + init_waitqueue_head(&bh->b_wait); + nr_unused_buffer_heads++; + bh->b_next_free = unused_list; + bh->b_this_page = NULL; + unused_list = bh; } - -// memset(bh, 0, sizeof(*bh)); - bh->b_blocknr = -1; - init_waitqueue_head(&bh->b_wait); - nr_unused_buffer_heads++; - bh->b_next_free = unused_list; - unused_list = bh; } -/* - * We can't put completed temporary IO buffer_heads directly onto the - * unused_list when they become unlocked, since the device driver - * end_request routines still expect access to the buffer_head's - * fields after the final unlock. So, the device driver puts them on - * the reuse_list instead once IO completes, and we recover these to - * the unused_list here. - * - * Note that we don't do a wakeup here, but return a flag indicating - * whether we got any buffer heads. A task ready to sleep can check - * the returned value, and any tasks already sleeping will have been - * awakened when the buffer heads were added to the reuse list. 
- */ -static inline int recover_reusable_buffer_heads(void) +static void put_unused_buffer_head(struct buffer_head *bh) { - struct buffer_head *head = xchg(&reuse_list, NULL); - int found = 0; - - if (head) { - do { - struct buffer_head *bh = head; - head = head->b_next_free; - put_unused_buffer_head(bh); - } while (head); - found = 1; - } - return found; + spin_lock(&unused_list_lock); + __put_unused_buffer_head(bh); + spin_unlock(&unused_list_lock); } /* @@ -995,13 +1050,15 @@ static struct buffer_head * get_unused_buffer_head(int async) { struct buffer_head * bh; - recover_reusable_buffer_heads(); + spin_lock(&unused_list_lock); if (nr_unused_buffer_heads > NR_RESERVED) { bh = unused_list; unused_list = bh->b_next_free; nr_unused_buffer_heads--; + spin_unlock(&unused_list_lock); return bh; } + spin_unlock(&unused_list_lock); /* This is critical. We can't swap out pages to get * more buffer heads, because the swap-out may need @@ -1010,20 +1067,23 @@ static struct buffer_head * get_unused_buffer_head(int async) if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) { memset(bh, 0, sizeof(*bh)); init_waitqueue_head(&bh->b_wait); - nr_buffer_heads++; return bh; } /* * If we need an async buffer, use the reserved buffer heads. */ - if (async && unused_list) { - bh = unused_list; - unused_list = bh->b_next_free; - nr_unused_buffer_heads--; - return bh; + if (async) { + spin_lock(&unused_list_lock); + if (unused_list) { + bh = unused_list; + unused_list = bh->b_next_free; + nr_unused_buffer_heads--; + spin_unlock(&unused_list_lock); + return bh; + } + spin_unlock(&unused_list_lock); } - #if 0 /* * (Pending further analysis ...) @@ -1035,7 +1095,6 @@ static struct buffer_head * get_unused_buffer_head(int async) (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) { memset(bh, 0, sizeof(*bh)); init_waitqueue_head(&bh->b_wait); - nr_buffer_heads++; return bh; } #endif @@ -1052,8 +1111,7 @@ static struct buffer_head * get_unused_buffer_head(int async) * from ordinary buffer allocations, and only async requests are allowed * to sleep waiting for buffer heads. */ -static struct buffer_head * create_buffers(unsigned long page, - unsigned long size, int async) +static struct buffer_head * create_buffers(unsigned long page, unsigned long size, int async) { DECLARE_WAITQUEUE(wait, current); struct buffer_head *bh, *head; @@ -1073,11 +1131,14 @@ try_again: bh->b_state = 0; bh->b_next_free = NULL; - bh->b_count = 0; + bh->b_pprev = NULL; + atomic_set(&bh->b_count, 0); bh->b_size = size; bh->b_data = (char *) (page+offset); - bh->b_list = 0; + bh->b_list = BUF_CLEAN; + bh->b_flushtime = 0; + bh->b_end_io = end_buffer_io_bad; } return head; /* @@ -1118,115 +1179,16 @@ no_grow: */ add_wait_queue(&buffer_wait, &wait); current->state = TASK_UNINTERRUPTIBLE; - if (!recover_reusable_buffer_heads()) + if (nr_unused_buffer_heads < MAX_BUF_PER_PAGE) { + current->policy |= SCHED_YIELD; schedule(); + } remove_wait_queue(&buffer_wait, &wait); current->state = TASK_RUNNING; goto try_again; } -/* Run the hooks that have to be done when a page I/O has completed. 
*/ -static inline void after_unlock_page (struct page * page) -{ - if (test_and_clear_bit(PG_decr_after, &page->flags)) { - atomic_dec(&nr_async_pages); -#ifdef DEBUG_SWAP - printk ("DebugVM: Finished IO on page %p, nr_async_pages %d\n", - (char *) page_address(page), - atomic_read(&nr_async_pages)); -#endif - } - if (test_and_clear_bit(PG_swap_unlock_after, &page->flags)) - swap_after_unlock_page(page->offset); - if (test_and_clear_bit(PG_free_after, &page->flags)) - __free_page(page); -} - -/* - * Free all temporary buffers belonging to a page. - * This needs to be called with interrupts disabled. - */ -static inline void free_async_buffers (struct buffer_head * bh) -{ - struct buffer_head *tmp, *tail; - - /* - * Link all the buffers into the b_next_free list, - * so we only have to do one xchg() operation ... - */ - tail = bh; - while ((tmp = tail->b_this_page) != bh) { - tail->b_next_free = tmp; - tail = tmp; - }; - - /* Update the reuse list */ - tail->b_next_free = xchg(&reuse_list, NULL); - reuse_list = bh; - - /* Wake up any waiters ... */ - wake_up(&buffer_wait); -} - -static void end_buffer_io_async(struct buffer_head * bh, int uptodate) -{ - unsigned long flags; - struct buffer_head *tmp; - struct page *page; - - mark_buffer_uptodate(bh, uptodate); - - /* This is a temporary buffer used for page I/O. */ - page = mem_map + MAP_NR(bh->b_data); - - if (!uptodate) - SetPageError(page); - - /* - * Be _very_ careful from here on. Bad things can happen if - * two buffer heads end IO at almost the same time and both - * decide that the page is now completely done. - * - * Async buffer_heads are here only as labels for IO, and get - * thrown away once the IO for this page is complete. IO is - * deemed complete once all buffers have been visited - * (b_count==0) and are now unlocked. We must make sure that - * only the _last_ buffer that decrements its count is the one - * that free's the page.. - */ - save_flags(flags); - cli(); - unlock_buffer(bh); - tmp = bh->b_this_page; - while (tmp != bh) { - if (buffer_locked(tmp)) - goto still_busy; - tmp = tmp->b_this_page; - } - - /* OK, the async IO on this page is complete. */ - restore_flags(flags); - - after_unlock_page(page); - /* - * if none of the buffers had errors then we can set the - * page uptodate: - */ - if (!PageError(page)) - SetPageUptodate(page); - if (page->owner != -1) - PAGE_BUG(page); - page->owner = (int)current; - UnlockPage(page); - - return; - -still_busy: - restore_flags(flags); - return; -} - -static int create_page_buffers (int rw, struct page *page, kdev_t dev, int b[], int size, int bmap) +static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap) { struct buffer_head *head, *bh, *tail; int block; @@ -1240,9 +1202,7 @@ static int create_page_buffers (int rw, struct page *page, kdev_t dev, int b[], * They show up in the buffer hash table and are registered in * page->buffers. */ - lock_kernel(); head = create_buffers(page_address(page), size, 1); - unlock_kernel(); if (page->buffers) BUG(); if (!head) @@ -1252,7 +1212,9 @@ static int create_page_buffers (int rw, struct page *page, kdev_t dev, int b[], block = *(b++); tail = bh; - init_buffer(bh, dev, block, end_buffer_io_async, NULL); + init_buffer(bh, end_buffer_io_async, NULL); + bh->b_dev = dev; + bh->b_blocknr = block; /* * When we use bmap, we define block zero to represent @@ -1261,9 +1223,11 @@ static int create_page_buffers (int rw, struct page *page, kdev_t dev, int b[], * two cases. 
*/ if (bmap && !block) { - set_bit(BH_Uptodate, &bh->b_state); memset(bh->b_data, 0, size); + set_bit(BH_Uptodate, &bh->b_state); + continue; } + set_bit(BH_Mapped, &bh->b_state); } tail->b_this_page = head; get_page(page); @@ -1287,7 +1251,6 @@ int block_flushpage(struct inode *inode, struct page *page, unsigned long offset BUG(); if (!page->buffers) return 0; - lock_kernel(); head = page->buffers; bh = head; @@ -1299,14 +1262,16 @@ int block_flushpage(struct inode *inode, struct page *page, unsigned long offset * is this block fully flushed? */ if (offset <= curr_off) { - if (bh->b_blocknr) { - bh->b_count++; + if (buffer_mapped(bh)) { + atomic_inc(&bh->b_count); wait_on_buffer(bh); if (bh->b_dev == B_FREE) BUG(); mark_buffer_clean(bh); + clear_bit(BH_Uptodate, &bh->b_state); + clear_bit(BH_Mapped, &bh->b_state); bh->b_blocknr = 0; - bh->b_count--; + atomic_dec(&bh->b_count); } } curr_off = next_off; @@ -1318,22 +1283,24 @@ int block_flushpage(struct inode *inode, struct page *page, unsigned long offset * the 'final' flushpage. We have invalidated the bmap * cached value unconditionally, so real IO is not * possible anymore. + * + * If the free doesn't work out, the buffers can be + * left around - they just turn into anonymous buffers + * instead. */ - if (!offset) - try_to_free_buffers(page); + if (!offset) { + if (!try_to_free_buffers(page)) + atomic_add(PAGE_CACHE_SIZE, &buffermem); + } - unlock_kernel(); return 0; } -static void create_empty_buffers (struct page *page, - struct inode *inode, unsigned long blocksize) +static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize) { struct buffer_head *bh, *head, *tail; - lock_kernel(); head = create_buffers(page_address(page), blocksize, 1); - unlock_kernel(); if (page->buffers) BUG(); @@ -1341,6 +1308,7 @@ static void create_empty_buffers (struct page *page, do { bh->b_dev = inode->i_dev; bh->b_blocknr = 0; + bh->b_end_io = end_buffer_io_bad; tail = bh; bh = bh->b_this_page; } while (bh); @@ -1353,12 +1321,12 @@ static void create_empty_buffers (struct page *page, * block_write_full_page() is SMP-safe - currently it's still * being called with the kernel lock held, but the code is ready. */ -int block_write_full_page (struct file *file, struct page *page, fs_getblock_t fs_get_block) +int block_write_full_page(struct file *file, struct page *page) { struct dentry *dentry = file->f_dentry; struct inode *inode = dentry->d_inode; - int err, created, i; - unsigned long block, phys, offset; + int err, i; + unsigned long block, offset; struct buffer_head *bh, *head; if (!PageLocked(page)) @@ -1381,23 +1349,22 @@ int block_write_full_page (struct file *file, struct page *page, fs_getblock_t f if (!bh) BUG(); - if (!bh->b_blocknr) { - err = -EIO; - phys = fs_get_block (inode, block, 1, &err, &created); - if (!phys) + /* + * If the buffer isn't up-to-date, we can't be sure + * that the buffer has been initialized with the proper + * block number information etc.. 
+ * + * Leave it to the low-level FS to make all those + * decisions (block #0 may actually be a valid block) + */ + bh->b_end_io = end_buffer_io_sync; + if (!buffer_mapped(bh)) { + err = inode->i_op->get_block(inode, block, bh, 1); + if (err) goto out; - - init_buffer(bh, inode->i_dev, phys, end_buffer_io_sync, NULL); - bh->b_state = (1<<BH_Uptodate); - } else { - /* - * block already exists, just mark it uptodate and - * dirty: - */ - bh->b_end_io = end_buffer_io_sync; - set_bit(BH_Uptodate, &bh->b_state); } - atomic_mark_buffer_dirty(bh,0); + set_bit(BH_Uptodate, &bh->b_state); + mark_buffer_dirty(bh,0); bh = bh->b_this_page; block++; @@ -1410,15 +1377,15 @@ out: return err; } -int block_write_partial_page (struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf, fs_getblock_t fs_get_block) +int block_write_partial_page(struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf) { struct dentry *dentry = file->f_dentry; struct inode *inode = dentry->d_inode; unsigned long block; - int err, created, partial; + int err, partial; unsigned long blocksize, start_block, end_block; unsigned long start_offset, start_bytes, end_bytes; - unsigned long bbits, phys, blocks, i, len; + unsigned long bbits, blocks, i, len; struct buffer_head *bh, *head; char * target_buf; @@ -1469,46 +1436,35 @@ int block_write_partial_page (struct file *file, struct page *page, unsigned lon partial = 1; goto skip; } - if (!bh->b_blocknr) { - err = -EIO; - phys = fs_get_block (inode, block, 1, &err, &created); - if (!phys) - goto out; - init_buffer(bh, inode->i_dev, phys, end_buffer_io_sync, NULL); + /* + * If the buffer is not up-to-date, we need to ask the low-level + * FS to do something for us (we used to have assumptions about + * the meaning of b_blocknr etc, that's bad). + * + * If "update" is set, that means that the low-level FS should + * try to make sure that the block is up-to-date because we're + * not going to fill it completely. + */ + bh->b_end_io = end_buffer_io_sync; + if (!buffer_mapped(bh)) { + err = inode->i_op->get_block(inode, block, bh, 1); + if (err) + goto out; + } - /* - * if partially written block which has contents on - * disk, then we have to read it first. - * We also rely on the fact that filesystem holes - * cannot be written. 
- */ - if (start_offset || (end_bytes && (i == end_block))) { - if (created) { - memset(bh->b_data, 0, bh->b_size); - } else { - bh->b_state = 0; - ll_rw_block(READ, 1, &bh); - lock_kernel(); - wait_on_buffer(bh); - unlock_kernel(); - err = -EIO; - if (!buffer_uptodate(bh)) - goto out; - } + if (!buffer_uptodate(bh) && (start_offset || (end_bytes && (i == end_block)))) { + if (buffer_new(bh)) { + memset(bh->b_data, 0, bh->b_size); + } else { + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + err = -EIO; + if (!buffer_uptodate(bh)) + goto out; } - - bh->b_state = (1<<BH_Uptodate); - } else { - /* - * block already exists, just mark it uptodate: - */ - bh->b_end_io = end_buffer_io_sync; - set_bit(BH_Uptodate, &bh->b_state); - created = 0; } - err = -EFAULT; len = blocksize; if (start_offset) { len = start_bytes; @@ -1517,8 +1473,7 @@ int block_write_partial_page (struct file *file, struct page *page, unsigned lon len = end_bytes; end_bytes = 0; } - if (copy_from_user(target_buf, buf, len)) - goto out; + err = copy_from_user(target_buf, buf, len); target_buf += len; buf += len; @@ -1538,12 +1493,18 @@ int block_write_partial_page (struct file *file, struct page *page, unsigned lon * should not penalize them for somebody else writing * lots of dirty pages. */ + set_bit(BH_Uptodate, &bh->b_state); if (!test_and_set_bit(BH_Dirty, &bh->b_state)) { - __atomic_mark_buffer_dirty(bh, bdf_prm.b_un.age_buffer); + __mark_dirty(bh, 0); if (too_many_dirty_buffers) balance_dirty(bh->b_dev); } + if (err) { + err = -EFAULT; + goto out; + } + skip: i++; block++; @@ -1572,6 +1533,9 @@ out: * * brw_page() is SMP-safe, although it's being called with the * kernel lock held - but the code is ready. + * + * FIXME: we need a swapper_inode->get_block function to remove + * some of the bmap kludges and interface ugliness here. */ int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap) { @@ -1600,7 +1564,7 @@ int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap) do { block = *(b++); - if (fresh && (bh->b_count != 0)) + if (fresh && (atomic_read(&bh->b_count) != 0)) BUG(); if (rw == READ) { if (!fresh) @@ -1613,6 +1577,7 @@ int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap) BUG(); if (!buffer_uptodate(bh)) { arr[nr++] = bh; + atomic_inc(&bh->b_count); } } } else { /* WRITE */ @@ -1625,8 +1590,9 @@ int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap) BUG(); } set_bit(BH_Uptodate, &bh->b_state); - atomic_mark_buffer_dirty(bh, 0); + set_bit(BH_Dirty, &bh->b_state); arr[nr++] = bh; + atomic_inc(&bh->b_count); } bh = bh->b_this_page; } while (bh != head); @@ -1649,30 +1615,7 @@ int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap) } /* - * This is called by end_request() when I/O has completed. - */ -void mark_buffer_uptodate(struct buffer_head * bh, int on) -{ - if (on) { - struct buffer_head *tmp = bh; - struct page *page; - set_bit(BH_Uptodate, &bh->b_state); - /* If a page has buffers and all these buffers are uptodate, - * then the page is uptodate. */ - do { - if (!test_bit(BH_Uptodate, &tmp->b_state)) - return; - tmp=tmp->b_this_page; - } while (tmp && tmp != bh); - page = mem_map + MAP_NR(bh->b_data); - SetPageUptodate(page); - return; - } - clear_bit(BH_Uptodate, &bh->b_state); -} - -/* - * Generic "readpage" function for block devices that have the normal + * Generic "read page" function for block devices that have the normal * bmap functionality. 
This is most of the block device filesystems. * Reads the page asynchronously --- the unlock_buffer() and * mark_buffer_uptodate() functions propagate buffer state into the @@ -1682,7 +1625,7 @@ int block_read_full_page(struct file * file, struct page * page) { struct dentry *dentry = file->f_dentry; struct inode *inode = dentry->d_inode; - unsigned long iblock, phys_block; + unsigned long iblock; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; unsigned int blocksize, blocks; int nr; @@ -1700,33 +1643,25 @@ int block_read_full_page(struct file * file, struct page * page) head = page->buffers; bh = head; nr = 0; + do { - phys_block = bh->b_blocknr; - /* - * important, we have to retry buffers that already have - * their bnr cached but had an IO error! - */ - if (!buffer_uptodate(bh)) { - phys_block = inode->i_op->bmap(inode, iblock); - /* - * this is safe to do because we hold the page lock: - */ - if (phys_block) { - init_buffer(bh, inode->i_dev, phys_block, - end_buffer_io_async, NULL); - arr[nr] = bh; - nr++; - } else { - /* - * filesystem 'hole' represents zero-contents: - */ + if (buffer_uptodate(bh)) + continue; + + if (!buffer_mapped(bh)) { + inode->i_op->get_block(inode, iblock, bh, 0); + if (!buffer_mapped(bh)) { memset(bh->b_data, 0, blocksize); set_bit(BH_Uptodate, &bh->b_state); + continue; } } - iblock++; - bh = bh->b_this_page; - } while (bh != head); + + init_buffer(bh, end_buffer_io_async, NULL); + atomic_inc(&bh->b_count); + arr[nr] = bh; + nr++; + } while (iblock++, (bh = bh->b_this_page) != head); ++current->maj_flt; if (nr) { @@ -1770,8 +1705,9 @@ static int grow_buffers(int size) } isize = BUFSIZE_INDEX(size); - insert_point = free_list[isize]; + spin_lock(&free_list[isize].lock); + insert_point = free_list[isize].list; tmp = bh; while (1) { if (insert_point) { @@ -1790,9 +1726,11 @@ static int grow_buffers(int size) break; } tmp->b_this_page = bh; - free_list[isize] = bh; + free_list[isize].list = bh; + spin_unlock(&free_list[isize].lock); + mem_map[MAP_NR(page)].buffers = bh; - buffermem += PAGE_SIZE; + atomic_add(PAGE_SIZE, &buffermem); return 1; } @@ -1800,7 +1738,7 @@ static int grow_buffers(int size) * Can the buffer be thrown out? */ #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected)) -#define buffer_busy(bh) ((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS)) +#define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS)) /* * try_to_free_buffers() checks if all the buffers on this particular page @@ -1808,90 +1746,70 @@ static int grow_buffers(int size) * * Wake up bdflush() if this fails - if we're running low on memory due * to dirty buffers, we need to flush them out as quickly as possible. + * + * NOTE: There are quite a number of ways that threads of control can + * obtain a reference to a buffer head within a page. So we must + * lock out all of these paths to cleanly toss the page. 
*/ int try_to_free_buffers(struct page * page) { struct buffer_head * tmp, * bh = page->buffers; + int index = BUFSIZE_INDEX(bh->b_size); + int ret; + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + spin_lock(&free_list[index].lock); tmp = bh; do { struct buffer_head * p = tmp; tmp = tmp->b_this_page; - if (!buffer_busy(p)) - continue; - - too_many_dirty_buffers = 1; - wakeup_bdflush(0); - return 0; + if (buffer_busy(p)) + goto busy_buffer_page; } while (tmp != bh); + spin_lock(&unused_list_lock); tmp = bh; do { struct buffer_head * p = tmp; tmp = tmp->b_this_page; - /* The buffer can be either on the regular queues or on the free list.. */ - if (p->b_dev == B_FREE) - remove_from_free_list(p); - else - remove_from_queues(p); - - put_unused_buffer_head(p); + /* The buffer can be either on the regular + * queues or on the free list.. + */ + if (p->b_dev == B_FREE) { + __remove_from_free_list(p, index); + } else { + if (p->b_pprev) + __hash_unlink(p); + __remove_from_lru_list(p, p->b_list); + } + __put_unused_buffer_head(p); } while (tmp != bh); + spin_unlock(&unused_list_lock); /* Wake up anyone waiting for buffer heads */ wake_up(&buffer_wait); /* And free the page */ page->buffers = NULL; - if (__free_page(page)) { - buffermem -= PAGE_SIZE; - return 1; - } - return 0; -} - -/* ================== Debugging =================== */ - -void show_buffers(void) -{ - struct buffer_head * bh; - int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0; - int protected = 0; - int nlist; - static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"}; - - printk("Buffer memory: %6dkB\n",buffermem>>10); - printk("Buffer heads: %6d\n",nr_buffer_heads); - printk("Buffer blocks: %6d\n",nr_buffers); - printk("Buffer hashed: %6d\n",nr_hashed_buffers); - - for(nlist = 0; nlist < NR_LIST; nlist++) { - found = locked = dirty = used = lastused = protected = 0; - bh = lru_list[nlist]; - if(!bh) continue; - - do { - found++; - if (buffer_locked(bh)) - locked++; - if (buffer_protected(bh)) - protected++; - if (buffer_dirty(bh)) - dirty++; - if (bh->b_count) - used++, lastused = found; - bh = bh->b_next_free; - } while (bh != lru_list[nlist]); - printk("%8s: %d buffers, %d used (last=%d), " - "%d locked, %d protected, %d dirty\n", - buf_types[nlist], found, used, lastused, - locked, protected, dirty); - }; + __free_page(page); + ret = 1; +out: + spin_unlock(&free_list[index].lock); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + return ret; + +busy_buffer_page: + /* Uhhuh, start writeback so that we don't end up with all dirty pages */ + too_many_dirty_buffers = 1; + wakeup_bdflush(0); + ret = 0; + goto out; } - /* ===================== Init ======================= */ /* @@ -1901,31 +1819,53 @@ void show_buffers(void) */ void __init buffer_init(unsigned long memory_size) { - int order; + int order, i; unsigned int nr_hash; - /* we need to guess at the right sort of size for a buffer cache. - the heuristic from working with large databases and getting - fsync times (ext2) manageable, is the following */ - - memory_size >>= 22; - for (order = 5; (1UL << order) < memory_size; order++); + /* The buffer cache hash table is less important these days, + * trim it a bit. 
+ */ + memory_size >>= 14; + memory_size *= sizeof(struct buffer_head *); + for (order = 0; (PAGE_SIZE << order) < memory_size; order++) + ; /* try to allocate something until we get it or we're asking for something that is really too small */ do { - nr_hash = (1UL << order) * PAGE_SIZE / - sizeof(struct buffer_head *); + unsigned long tmp; + + nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *); + bh_hash_mask = (nr_hash - 1); + + tmp = nr_hash; + bh_hash_shift = 0; + while((tmp >>= 1UL) != 0UL) + bh_hash_shift++; + hash_table = (struct buffer_head **) __get_free_pages(GFP_ATOMIC, order); - } while (hash_table == NULL && --order > 4); - printk("buffer-cache hash table entries: %d (order: %d, %ld bytes)\n", nr_hash, order, (1UL<<order) * PAGE_SIZE); - + } while (hash_table == NULL && --order > 0); + printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n", + nr_hash, order, (1UL<<order) * PAGE_SIZE); + if (!hash_table) panic("Failed to allocate buffer hash table\n"); - memset(hash_table, 0, nr_hash * sizeof(struct buffer_head *)); - bh_hash_mask = nr_hash-1; + + /* Setup hash chains. */ + for(i = 0; i < nr_hash; i++) + hash_table[i] = NULL; + + /* Setup free lists. */ + for(i = 0; i < NR_SIZES; i++) { + free_list[i].list = NULL; + free_list[i].lock = SPIN_LOCK_UNLOCKED; + } + + /* Setup lru lists. */ + for(i = 0; i < NR_LIST; i++) + lru_list[i] = NULL; bh_cachep = kmem_cache_create("buffer_head", sizeof(struct buffer_head), @@ -1933,21 +1873,6 @@ void __init buffer_init(unsigned long memory_size) SLAB_HWCACHE_ALIGN, NULL, NULL); if(!bh_cachep) panic("Cannot create buffer head SLAB cache\n"); - /* - * Allocate the reserved buffer heads. - */ - while (nr_buffer_heads < NR_RESERVED) { - struct buffer_head * bh; - - bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC); - if (!bh) - break; - put_unused_buffer_head(bh); - nr_buffer_heads++; - } - - lru_list[BUF_CLEAN] = 0; - grow_buffers(BLOCK_SIZE); } @@ -1983,70 +1908,49 @@ void wakeup_bdflush(int wait) static int sync_old_buffers(void) { - int i; - int ndirty, nwritten; int nlist; - int ncount; - struct buffer_head * bh, *next; + lock_kernel(); sync_supers(0); sync_inodes(0); + unlock_kernel(); - ncount = 0; -#ifdef DEBUG - for(nlist = 0; nlist < NR_LIST; nlist++) -#else - for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) -#endif - { - ndirty = 0; - nwritten = 0; + for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) { + struct buffer_head *bh; repeat: - + spin_lock(&lru_list_lock); bh = lru_list[nlist]; - if(bh) - for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) { - /* We may have stalled while waiting for I/O to complete. */ - if(bh->b_list != nlist) goto repeat; - next = bh->b_next_free; - if(!lru_list[nlist]) { - printk("Dirty list empty %d\n", i); - break; - } - - /* Clean buffer on dirty list? Refile it */ - if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh)) { - refile_buffer(bh); - continue; - } - - /* Unlocked buffer on locked list? Refile it */ - if (nlist == BUF_LOCKED && !buffer_locked(bh)) { - refile_buffer(bh); - continue; - } + if(bh) { + struct buffer_head *next; + int i; + for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) { + next = bh->b_next_free; + + /* If the buffer is not on the proper list, + * then refile it. 
+ */ + if ((nlist == BUF_DIRTY && + (!buffer_dirty(bh) && !buffer_locked(bh))) || + (nlist == BUF_LOCKED && !buffer_locked(bh))) { + __refile_buffer(bh); + continue; + } - if (buffer_locked(bh) || !buffer_dirty(bh)) - continue; - ndirty++; - nwritten++; - next->b_count++; - bh->b_count++; - bh->b_flushtime = 0; -#ifdef DEBUG - if(nlist != BUF_DIRTY) ncount++; -#endif - ll_rw_block(WRITE, 1, &bh); - bh->b_count--; - next->b_count--; - } + if (buffer_locked(bh) || !buffer_dirty(bh)) + continue; + + /* OK, now we are committed to write it out. */ + bh->b_flushtime = 0; + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + ll_rw_block(WRITE, 1, &bh); + atomic_dec(&bh->b_count); + goto repeat; + } + } + spin_unlock(&lru_list_lock); } run_task_queue(&tq_disk); -#ifdef DEBUG - if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount); - printk("Wrote %d/%d buffers\n", nwritten, ndirty); -#endif - run_task_queue(&tq_disk); return 0; } @@ -2060,7 +1964,6 @@ asmlinkage int sys_bdflush(int func, long data) { int i, error = -EPERM; - lock_kernel(); if (!capable(CAP_SYS_ADMIN)) goto out; @@ -2092,7 +1995,6 @@ asmlinkage int sys_bdflush(int func, long data) */ error = 0; out: - unlock_kernel(); return error; } @@ -2114,52 +2016,37 @@ int bdflush(void * unused) sprintf(current->comm, "kflushd"); bdflush_tsk = current; - /* - * As a kernel thread we want to tamper with system buffers - * and other internals and thus be subject to the SMP locking - * rules. (On a uniprocessor box this does nothing). - */ - lock_kernel(); - for (;;) { int nlist; CHECK_EMERGENCY_SYNC - for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) - { - int nr; - int written = 0; + for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) { + int nr, major, written = 0; struct buffer_head *next; - int major; repeat: + spin_lock(&lru_list_lock); next = lru_list[nlist]; nr = nr_buffers_type[nlist]; - while (nr-- > 0) { struct buffer_head *bh = next; - /* We may have stalled while waiting for I/O to complete. */ - if (next->b_list != nlist) - goto repeat; + next = next->b_next_free; - /* Clean buffer on dirty list? Refile it */ - if (nlist == BUF_DIRTY && !buffer_dirty(bh)) { - refile_buffer(bh); - continue; - } - - /* Unlocked buffer on locked list? Refile it */ - if (nlist == BUF_LOCKED && !buffer_locked(bh)) { - refile_buffer(bh); + /* If the buffer is not on the correct list, + * then refile it. + */ + if ((nlist == BUF_DIRTY && + (!buffer_dirty(bh) && !buffer_locked(bh))) || + (nlist == BUF_LOCKED && !buffer_locked(bh))) { + __refile_buffer(bh); continue; } - /* - * If we aren't in panic mode, don't write out too much - * at a time. Also, don't write out buffers we don't really - * have to write out yet.. + /* If we aren't in panic mode, don't write out too much + * at a time. Also, don't write out buffers we don't + * really have to write out yet.. */ if (!too_many_dirty_buffers) { if (written > bdf_prm.b_un.ndirty) @@ -2172,9 +2059,6 @@ int bdflush(void * unused) continue; major = MAJOR(bh->b_dev); - if (next) - next->b_count++; - bh->b_count++; written++; bh->b_flushtime = 0; @@ -2182,18 +2066,18 @@ int bdflush(void * unused) * For the loop major we can try to do asynchronous writes, * but we have to guarantee that we're making some progress.. 
*/ + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); if (major == LOOP_MAJOR && written > 1) { ll_rw_block(WRITEA, 1, &bh); if (buffer_dirty(bh)) --written; } else ll_rw_block(WRITE, 1, &bh); - - bh->b_count--; - if (next) - next->b_count--; - wake_up(&buffer_wait); + atomic_dec(&bh->b_count); + goto repeat; } + spin_unlock(&lru_list_lock); } run_task_queue(&tq_disk); wake_up(&bdflush_done); @@ -2206,7 +2090,10 @@ int bdflush(void * unused) */ if (!too_many_dirty_buffers || nr_buffers_type[BUF_DIRTY] < bdf_prm.b_un.ndirty) { too_many_dirty_buffers = 0; - sleep_on_timeout(&bdflush_wait, 5*HZ); + spin_lock_irq(¤t->sigmask_lock); + flush_signals(current); + spin_unlock_irq(¤t->sigmask_lock); + interruptible_sleep_on_timeout(&bdflush_wait, 5*HZ); } } } |
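
The new locking comment near the top of the patch spells out the anti-deadlock rule for the buffer cache: lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock, and functions such as try_to_free_buffers() take the locks in exactly that order and drop them in reverse. The sketch below is a minimal user-space illustration of that discipline using plain pthread mutexes; it is not kernel code, and the mutex names only mirror the spinlocks the patch introduces.

```c
/*
 * Minimal user-space sketch of the patch's anti-deadlock ordering:
 *
 *	lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
 *
 * NOT kernel code -- the names only mirror the locks added by the patch.
 * Build with:  cc -pthread lockorder.c
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lru_list_lock    = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t hash_table_lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t free_list_lock   = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t unused_list_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Any path that needs several of these locks acquires them in the documented
 * order and releases them in reverse (compare try_to_free_buffers() in the
 * diff, which nests lru_list_lock -> hash_table_lock -> free_list lock ->
 * unused_list_lock).  Because no thread ever waits for an "earlier" lock
 * while holding a "later" one, the classic AB/BA deadlock cannot occur.
 */
static void *toss_a_page(void *arg)
{
	long id = (long) arg;

	pthread_mutex_lock(&lru_list_lock);
	pthread_mutex_lock(&hash_table_lock);
	pthread_mutex_lock(&free_list_lock);
	pthread_mutex_lock(&unused_list_lock);

	printf("thread %ld: holds all four buffer-cache locks, in order\n", id);

	pthread_mutex_unlock(&unused_list_lock);
	pthread_mutex_unlock(&free_list_lock);
	pthread_mutex_unlock(&hash_table_lock);
	pthread_mutex_unlock(&lru_list_lock);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, toss_a_page, (void *) 1L);
	pthread_create(&t2, NULL, toss_a_page, (void *) 2L);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}
```

In the patch itself, paths that must drop lru_list_lock in order to sleep (wait_on_buffer) first pin the buffer with atomic_inc(&bh->b_count) and then restart from the top of the list — the pattern visible in sync_buffers(), invalidate_buffers() and set_blocksize() above.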