author    Ralf Baechle <ralf@linux-mips.org>    1999-07-05 23:09:37 +0000
committer Ralf Baechle <ralf@linux-mips.org>    1999-07-05 23:09:37 +0000
commit    aba344fdfed81b2c03d6114c54cfd73a486aa10b
tree      d032d8430bf1234c3ecc6f6330d6de6e887e5963    /fs/buffer.c
parent    40c138bfc6d37dbff5339f84575db1e3cec6e34e
Merge with Linux 2.3.9.
Diffstat (limited to 'fs/buffer.c')
-rw-r--r--    fs/buffer.c    1243
1 file changed, 565 insertions(+), 678 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index 9ffb8556a..108b385ea 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -24,6 +24,8 @@
* - RMK
*/
+/* Thread it... -DaveM */
+
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/malloc.h>
@@ -57,31 +59,39 @@ static char buffersize_index[65] =
#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
number of unused buffer heads */
-/*
- * Hash table mask..
+/* Anti-deadlock ordering:
+ * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
*/
-static unsigned long bh_hash_mask = 0;
-
-static int grow_buffers(int size);
-static struct buffer_head ** hash_table;
-static struct buffer_head * lru_list[NR_LIST] = {NULL, };
-static struct buffer_head * free_list[NR_SIZES] = {NULL, };
+/*
+ * Hash table gook..
+ */
+static unsigned int bh_hash_mask = 0;
+static unsigned int bh_hash_shift = 0;
+static struct buffer_head **hash_table;
+static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
-static kmem_cache_t *bh_cachep;
+static struct buffer_head *lru_list[NR_LIST];
+static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
+static int nr_buffers_type[NR_LIST] = {0,};
static struct buffer_head * unused_list = NULL;
-static struct buffer_head * reuse_list = NULL;
+static int nr_unused_buffer_heads = 0;
+static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
-static int nr_buffers = 0;
-static int nr_buffers_type[NR_LIST] = {0,};
-static int nr_buffer_heads = 0;
-static int nr_unused_buffer_heads = 0;
-static int nr_hashed_buffers = 0;
+struct bh_free_head {
+ struct buffer_head *list;
+ spinlock_t lock;
+};
+static struct bh_free_head free_list[NR_SIZES];
+
+static kmem_cache_t *bh_cachep;
+
+static int grow_buffers(int size);
/* This is used by some architectures to estimate available memory. */
-int buffermem = 0;
+atomic_t buffermem = ATOMIC_INIT(0);
/* Here is the parameter block for the bdflush process. If you add or
* remove any of the parameters, make sure to update kernel/sysctl.c.
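The ordering comment introduced above is the whole deadlock-avoidance story for the new locks: any path that needs more than one of them takes them in the documented order and releases them in reverse. As a userspace analogue only (pthread mutexes standing in for the spinlocks and the hash rwlock; toss_page_sketch() is an invented name that mirrors the nesting used by try_to_free_buffers() later in this patch, not code from the patch itself):

#include <pthread.h>
#include <stdio.h>

/* Stand-ins for the four buffer-cache locks; the rule is the one documented
 * in the patch: lru_list > hash_table > free_list > unused_list. */
static pthread_mutex_t lru_list_lock    = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t hash_table_lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t free_list_lock   = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t unused_list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Widest lock first, narrowest last, released in reverse order. */
static void toss_page_sketch(void)
{
        pthread_mutex_lock(&lru_list_lock);
        pthread_mutex_lock(&hash_table_lock);
        pthread_mutex_lock(&free_list_lock);
        pthread_mutex_lock(&unused_list_lock);
        /* ... unlink the page's buffer heads from hash, LRU and free lists ... */
        pthread_mutex_unlock(&unused_list_lock);
        pthread_mutex_unlock(&free_list_lock);
        pthread_mutex_unlock(&hash_table_lock);
        pthread_mutex_unlock(&lru_list_lock);
}

int main(void)
{
        toss_page_sketch();
        puts("locks taken and released in hierarchy order");
        return 0;
}

Paths that need only one of the locks (put_last_free() and its per-size free list lock, for instance) simply take that lock on its own.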
@@ -131,7 +141,7 @@ void __wait_on_buffer(struct buffer_head * bh)
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);
- bh->b_count++;
+ atomic_inc(&bh->b_count);
add_wait_queue(&bh->b_wait, &wait);
repeat:
tsk->state = TASK_UNINTERRUPTIBLE;
@@ -142,7 +152,7 @@ repeat:
}
tsk->state = TASK_RUNNING;
remove_wait_queue(&bh->b_wait, &wait);
- bh->b_count--;
+ atomic_dec(&bh->b_count);
}
/* Call sync_buffers with wait!=0 to ensure that the call does not
@@ -167,17 +177,19 @@ static int sync_buffers(kdev_t dev, int wait)
*/
do {
retry = 0;
-repeat:
+
/* We search all lists as a failsafe mechanism, not because we expect
* there to be dirty buffers on any of the other lists.
*/
+repeat:
+ spin_lock(&lru_list_lock);
bh = lru_list[BUF_DIRTY];
if (!bh)
goto repeat2;
+
for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
- if (bh->b_list != BUF_DIRTY)
- goto repeat;
next = bh->b_next_free;
+
if (!lru_list[BUF_DIRTY])
break;
if (dev && bh->b_dev != dev)
@@ -190,7 +202,10 @@ repeat:
retry = 1;
continue;
}
+ atomic_inc(&bh->b_count);
+ spin_unlock(&lru_list_lock);
wait_on_buffer (bh);
+ atomic_dec(&bh->b_count);
goto repeat;
}
@@ -209,30 +224,24 @@ repeat:
if (!buffer_dirty(bh) || pass >= 2)
continue;
- /* Don't bother about locked buffers.
- *
- * XXX We checked if it was locked above and there is no
- * XXX way we could have slept in between. -DaveM
- */
- if (buffer_locked(bh))
- continue;
- bh->b_count++;
- next->b_count++;
+ atomic_inc(&bh->b_count);
bh->b_flushtime = 0;
+ spin_unlock(&lru_list_lock);
ll_rw_block(WRITE, 1, &bh);
- bh->b_count--;
- next->b_count--;
+ atomic_dec(&bh->b_count);
retry = 1;
+ goto repeat;
}
repeat2:
bh = lru_list[BUF_LOCKED];
- if (!bh)
+ if (!bh) {
+ spin_unlock(&lru_list_lock);
break;
+ }
for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
- if (bh->b_list != BUF_LOCKED)
- goto repeat2;
next = bh->b_next_free;
+
if (!lru_list[BUF_LOCKED])
break;
if (dev && bh->b_dev != dev)
@@ -245,10 +254,15 @@ repeat:
retry = 1;
continue;
}
+ atomic_inc(&bh->b_count);
+ spin_unlock(&lru_list_lock);
wait_on_buffer (bh);
+ spin_lock(&lru_list_lock);
+ atomic_dec(&bh->b_count);
goto repeat2;
}
}
+ spin_unlock(&lru_list_lock);
/* If we are waiting for the sync to succeed, and if any dirty
* blocks were written, then repeat; on the second pass, only
@@ -282,17 +296,19 @@ void sync_dev(kdev_t dev)
int fsync_dev(kdev_t dev)
{
sync_buffers(dev, 0);
+
+ lock_kernel();
sync_supers(dev);
sync_inodes(dev);
DQUOT_SYNC(dev);
+ unlock_kernel();
+
return sync_buffers(dev, 1);
}
asmlinkage int sys_sync(void)
{
- lock_kernel();
fsync_dev(0);
- unlock_kernel();
return 0;
}
@@ -396,19 +412,28 @@ out:
void invalidate_buffers(kdev_t dev)
{
- int i;
int nlist;
- struct buffer_head * bh;
+ spin_lock(&lru_list_lock);
for(nlist = 0; nlist < NR_LIST; nlist++) {
+ struct buffer_head * bh;
+ int i;
+ retry:
bh = lru_list[nlist];
+ if (!bh)
+ continue;
for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
if (bh->b_dev != dev)
continue;
- wait_on_buffer(bh);
- if (bh->b_dev != dev)
- continue;
- if (bh->b_count)
+ if (buffer_locked(bh)) {
+ atomic_inc(&bh->b_count);
+ spin_unlock(&lru_list_lock);
+ wait_on_buffer(bh);
+ spin_lock(&lru_list_lock);
+ atomic_dec(&bh->b_count);
+ goto retry;
+ }
+ if (atomic_read(&bh->b_count))
continue;
bh->b_flushtime = 0;
clear_bit(BH_Protected, &bh->b_state);
@@ -417,157 +442,119 @@ void invalidate_buffers(kdev_t dev)
clear_bit(BH_Req, &bh->b_state);
}
}
+ spin_unlock(&lru_list_lock);
}
-#define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^block)) & bh_hash_mask)
-#define hash(dev,block) hash_table[_hashfn(dev,block)]
+/* After several hours of tedious analysis, the following hash
+ * function won. Do not mess with it... -DaveM
+ */
+#define _hashfn(dev,block) \
+ ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
+ (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
+#define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
-static void insert_into_hash_list(struct buffer_head * bh)
+static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
{
- bh->b_next = NULL;
- bh->b_pprev = NULL;
- if (bh->b_dev) {
- struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr);
- struct buffer_head *next = *bhp;
-
- if (next) {
- bh->b_next = next;
- next->b_pprev = &bh->b_next;
- }
- *bhp = bh;
- bh->b_pprev = bhp;
- nr_hashed_buffers++;
- }
+ if ((bh->b_next = *head) != NULL)
+ bh->b_next->b_pprev = &bh->b_next;
+ *head = bh;
+ bh->b_pprev = head;
}
-static void remove_from_hash_queue(struct buffer_head * bh)
+static __inline__ void __hash_unlink(struct buffer_head *bh)
{
- struct buffer_head **pprev = bh->b_pprev;
- if (pprev) {
- struct buffer_head * next = bh->b_next;
- if (next) {
- next->b_pprev = pprev;
- bh->b_next = NULL;
- }
- *pprev = next;
- bh->b_pprev = NULL;
- nr_hashed_buffers--;
- }
+ if (bh->b_next)
+ bh->b_next->b_pprev = bh->b_pprev;
+ *(bh->b_pprev) = bh->b_next;
+ bh->b_pprev = NULL;
}
-static void insert_into_lru_list(struct buffer_head * bh)
+static void __insert_into_lru_list(struct buffer_head * bh, int blist)
{
- struct buffer_head **bhp = &lru_list[bh->b_list];
-
- if (bh->b_dev == B_FREE)
- BUG();
+ struct buffer_head **bhp = &lru_list[blist];
if(!*bhp) {
*bhp = bh;
bh->b_prev_free = bh;
}
-
- if (bh->b_next_free)
- panic("VFS: buffer LRU pointers corrupted");
-
bh->b_next_free = *bhp;
bh->b_prev_free = (*bhp)->b_prev_free;
(*bhp)->b_prev_free->b_next_free = bh;
(*bhp)->b_prev_free = bh;
-
- nr_buffers++;
- nr_buffers_type[bh->b_list]++;
+ nr_buffers_type[blist]++;
}
-static void remove_from_lru_list(struct buffer_head * bh)
+static void __remove_from_lru_list(struct buffer_head * bh, int blist)
{
- if (!(bh->b_prev_free) || !(bh->b_next_free))
- return;
-
- if (bh->b_dev == B_FREE) {
- printk("LRU list corrupted");
- *(int*)0 = 0;
+ if (bh->b_prev_free || bh->b_next_free) {
+ bh->b_prev_free->b_next_free = bh->b_next_free;
+ bh->b_next_free->b_prev_free = bh->b_prev_free;
+ if (lru_list[blist] == bh)
+ lru_list[blist] = bh->b_next_free;
+ if (lru_list[blist] == bh)
+ lru_list[blist] = NULL;
+ bh->b_next_free = bh->b_prev_free = NULL;
+ nr_buffers_type[blist]--;
}
- bh->b_prev_free->b_next_free = bh->b_next_free;
- bh->b_next_free->b_prev_free = bh->b_prev_free;
-
- if (lru_list[bh->b_list] == bh)
- lru_list[bh->b_list] = bh->b_next_free;
- if (lru_list[bh->b_list] == bh)
- lru_list[bh->b_list] = NULL;
- bh->b_next_free = bh->b_prev_free = NULL;
-
- nr_buffers--;
- nr_buffers_type[bh->b_list]--;
}
-static void remove_from_free_list(struct buffer_head * bh)
+static void __remove_from_free_list(struct buffer_head * bh, int index)
{
- int isize = BUFSIZE_INDEX(bh->b_size);
- if (!(bh->b_prev_free) || !(bh->b_next_free))
- panic("VFS: Free block list corrupted");
- if(bh->b_dev != B_FREE)
- panic("Free list corrupted");
- if(!free_list[isize])
- panic("Free list empty");
if(bh->b_next_free == bh)
- free_list[isize] = NULL;
+ free_list[index].list = NULL;
else {
bh->b_prev_free->b_next_free = bh->b_next_free;
bh->b_next_free->b_prev_free = bh->b_prev_free;
- if (free_list[isize] == bh)
- free_list[isize] = bh->b_next_free;
+ if (free_list[index].list == bh)
+ free_list[index].list = bh->b_next_free;
}
bh->b_next_free = bh->b_prev_free = NULL;
}
-static void remove_from_queues(struct buffer_head * bh)
+/* The following two functions must operate atomically
+ * because they control the visibility of a buffer head
+ * to the rest of the kernel.
+ */
+static __inline__ void __remove_from_queues(struct buffer_head *bh)
{
- if (bh->b_dev == B_FREE)
- BUG();
- remove_from_hash_queue(bh);
- remove_from_lru_list(bh);
+ write_lock(&hash_table_lock);
+ if (bh->b_pprev)
+ __hash_unlink(bh);
+ __remove_from_lru_list(bh, bh->b_list);
+ write_unlock(&hash_table_lock);
}
-static void put_last_free(struct buffer_head * bh)
+static void insert_into_queues(struct buffer_head *bh)
{
- if (bh) {
- struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];
-
- if (bh->b_count)
- BUG();
-
- bh->b_dev = B_FREE; /* So it is obvious we are on the free list. */
-
- /* Add to back of free list. */
- if(!*bhp) {
- *bhp = bh;
- bh->b_prev_free = bh;
- }
-
- bh->b_next_free = *bhp;
- bh->b_prev_free = (*bhp)->b_prev_free;
- (*bhp)->b_prev_free->b_next_free = bh;
- (*bhp)->b_prev_free = bh;
- }
+ struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
+
+ spin_lock(&lru_list_lock);
+ write_lock(&hash_table_lock);
+ __hash_link(bh, head);
+ __insert_into_lru_list(bh, bh->b_list);
+ write_unlock(&hash_table_lock);
+ spin_unlock(&lru_list_lock);
}
-struct buffer_head * find_buffer(kdev_t dev, int block, int size)
-{
- struct buffer_head * next;
+/* This function must only run if there are no other
+ * references _anywhere_ to this buffer head.
+ */
+static void put_last_free(struct buffer_head * bh)
+{
+ struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
+ struct buffer_head **bhp = &head->list;
- next = hash(dev,block);
- for (;;) {
- struct buffer_head *tmp = next;
- if (!next)
- break;
- next = tmp->b_next;
- if (tmp->b_blocknr != block || tmp->b_size != size || tmp->b_dev != dev)
- continue;
- next = tmp;
- break;
+ spin_lock(&head->lock);
+ bh->b_dev = B_FREE;
+ if(!*bhp) {
+ *bhp = bh;
+ bh->b_prev_free = bh;
}
- return next;
+ bh->b_next_free = *bhp;
+ bh->b_prev_free = (*bhp)->b_prev_free;
+ (*bhp)->b_prev_free->b_next_free = bh;
+ (*bhp)->b_prev_free = bh;
+ spin_unlock(&head->lock);
}
/*
@@ -579,10 +566,19 @@ struct buffer_head * find_buffer(kdev_t dev, int block, int size)
*/
struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
{
- struct buffer_head * bh;
- bh = find_buffer(dev,block,size);
+ struct buffer_head **head = &hash(dev, block);
+ struct buffer_head *bh;
+
+ read_lock(&hash_table_lock);
+ for(bh = *head; bh; bh = bh->b_next)
+ if (bh->b_blocknr == block &&
+ bh->b_size == size &&
+ bh->b_dev == dev)
+ break;
if (bh)
- bh->b_count++;
+ atomic_inc(&bh->b_count);
+ read_unlock(&hash_table_lock);
+
return bh;
}
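For reference, the bucket computation behind the new hash() macro can be tried outside the kernel. The sketch below folds _hashfn()/hash() from this patch into a plain function; bh_hash_shift = 12 (a 4096-entry table) is just an example value here, in the kernel it is derived from the table size chosen in buffer_init().

#include <stdio.h>

static unsigned int bh_hash_shift = 12;                 /* log2(number of buckets), example value */
static unsigned int bh_hash_mask  = (1 << 12) - 1;

/* Same mixing as the patch's _hashfn(), masked down to a bucket index. */
static unsigned int hash_index(unsigned int dev, unsigned int block)
{
        unsigned int h = ((dev << (bh_hash_shift - 6)) ^ (dev << (bh_hash_shift - 9))) ^
                         ((block << (bh_hash_shift - 6)) ^ (block >> 13) ^
                          (block << (bh_hash_shift - 12)));
        return h & bh_hash_mask;
}

int main(void)
{
        /* 0x0301 is the classic device number for /dev/hda1; block 1234 is arbitrary. */
        printf("bucket = %u\n", hash_index(0x0301, 1234));
        return 0;
}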
@@ -631,6 +627,8 @@ void set_blocksize(kdev_t dev, int size)
* around on the free list, and we can get in a loop if we are not careful.
*/
for(nlist = 0; nlist < NR_LIST; nlist++) {
+ repeat:
+ spin_lock(&lru_list_lock);
bh = lru_list[nlist];
for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
if(!bh)
@@ -641,21 +639,25 @@ void set_blocksize(kdev_t dev, int size)
continue;
if (bh->b_size == size)
continue;
- bhnext->b_count++;
- bh->b_count++;
- wait_on_buffer(bh);
- bhnext->b_count--;
+ if (buffer_locked(bh)) {
+ atomic_inc(&bh->b_count);
+ spin_unlock(&lru_list_lock);
+ wait_on_buffer(bh);
+ atomic_dec(&bh->b_count);
+ goto repeat;
+ }
if (bh->b_dev == dev && bh->b_size != size) {
clear_bit(BH_Dirty, &bh->b_state);
clear_bit(BH_Uptodate, &bh->b_state);
clear_bit(BH_Req, &bh->b_state);
bh->b_flushtime = 0;
}
- if (--bh->b_count)
- continue;
- remove_from_queues(bh);
- put_last_free(bh);
+ if (atomic_read(&bh->b_count) == 0) {
+ __remove_from_queues(bh);
+ put_last_free(bh);
+ }
}
+ spin_unlock(&lru_list_lock);
}
}
@@ -671,13 +673,10 @@ static void refill_freelist(int size)
}
}
-void init_buffer(struct buffer_head *bh, kdev_t dev, int block,
- bh_end_io_t *handler, void *dev_id)
+void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *dev_id)
{
bh->b_list = BUF_CLEAN;
bh->b_flushtime = 0;
- bh->b_dev = dev;
- bh->b_blocknr = block;
bh->b_end_io = handler;
bh->b_dev_id = dev_id;
}
@@ -688,6 +687,92 @@ static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
unlock_buffer(bh);
}
+static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
+{
+ mark_buffer_uptodate(bh, uptodate);
+ unlock_buffer(bh);
+ BUG();
+}
+
+static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
+{
+ static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
+ unsigned long flags;
+ struct buffer_head *tmp;
+ struct page *page;
+ int free;
+
+ mark_buffer_uptodate(bh, uptodate);
+
+ /* This is a temporary buffer used for page I/O. */
+ page = mem_map + MAP_NR(bh->b_data);
+
+ if (!uptodate)
+ SetPageError(page);
+
+ /*
+ * Be _very_ careful from here on. Bad things can happen if
+ * two buffer heads end IO at almost the same time and both
+ * decide that the page is now completely done.
+ *
+ * Async buffer_heads are here only as labels for IO, and get
+ * thrown away once the IO for this page is complete. IO is
+ * deemed complete once all buffers have been visited
+ * (b_count==0) and are now unlocked. We must make sure that
+ * only the _last_ buffer that decrements its count is the one
+ * that free's the page..
+ */
+ spin_lock_irqsave(&page_uptodate_lock, flags);
+ unlock_buffer(bh);
+ atomic_dec(&bh->b_count);
+ tmp = bh->b_this_page;
+ while (tmp != bh) {
+ if (atomic_read(&tmp->b_count) &&
+ (tmp->b_end_io == end_buffer_io_async))
+ goto still_busy;
+ tmp = tmp->b_this_page;
+ }
+
+ /* OK, the async IO on this page is complete. */
+ spin_unlock_irqrestore(&page_uptodate_lock, flags);
+
+ /*
+ * if none of the buffers had errors then we can set the
+ * page uptodate:
+ */
+ if (!PageError(page))
+ SetPageUptodate(page);
+
+ /*
+ * Run the hooks that have to be done when a page I/O has completed.
+ *
+ * Note - we need to test the flags before we unlock the page, but
+ * we must not actually free the page until after the unlock!
+ */
+ if (test_and_clear_bit(PG_decr_after, &page->flags))
+ atomic_dec(&nr_async_pages);
+
+ if (test_and_clear_bit(PG_free_swap_after, &page->flags))
+ swap_free(page->offset);
+
+ free = test_and_clear_bit(PG_free_after, &page->flags);
+
+ if (page->owner != -1)
+ PAGE_BUG(page);
+ page->owner = (int)current;
+ UnlockPage(page);
+
+ if (free)
+ __free_page(page);
+
+ return;
+
+still_busy:
+ spin_unlock_irqrestore(&page_uptodate_lock, flags);
+ return;
+}
+
+
/*
* Ok, this is getblk, and it isn't very clear, again to hinder
* race-conditions. Most of the code is seldom used, (ie repeating),
@@ -713,22 +798,26 @@ repeat:
}
isize = BUFSIZE_INDEX(size);
-get_free:
- bh = free_list[isize];
+ spin_lock(&free_list[isize].lock);
+ bh = free_list[isize].list;
+ if (bh) {
+ __remove_from_free_list(bh, isize);
+ atomic_set(&bh->b_count, 1);
+ }
+ spin_unlock(&free_list[isize].lock);
if (!bh)
goto refill;
- remove_from_free_list(bh);
/* OK, FINALLY we know that this buffer is the only one of its kind,
- * and that it's unused (b_count=0), unlocked, and clean.
+ * we hold a reference (b_count>0), it is unlocked, and it is clean.
*/
- init_buffer(bh, dev, block, end_buffer_io_sync, NULL);
- bh->b_count = 1;
- bh->b_state = 0;
+ init_buffer(bh, end_buffer_io_sync, NULL);
+ bh->b_dev = dev;
+ bh->b_blocknr = block;
+ bh->b_state = 1 << BH_Mapped;
/* Insert the buffer into the regular lists */
- insert_into_lru_list(bh);
- insert_into_hash_list(bh);
+ insert_into_queues(bh);
goto out;
/*
@@ -737,24 +826,12 @@ get_free:
*/
refill:
refill_freelist(size);
- if (!find_buffer(dev,block,size))
- goto get_free;
goto repeat;
out:
return bh;
}
/*
- * Put a buffer into the appropriate list, without side-effects.
- */
-static void file_buffer(struct buffer_head *bh, int list)
-{
- remove_from_lru_list(bh);
- bh->b_list = list;
- insert_into_lru_list(bh);
-}
-
-/*
* if a new dirty buffer is created we need to balance bdflush.
*
* in the future we might want to make bdflush aware of different
@@ -783,6 +860,7 @@ void balance_dirty(kdev_t dev)
static inline void __mark_dirty(struct buffer_head *bh, int flag)
{
bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
+ clear_bit(BH_New, &bh->b_state);
refile_buffer(bh);
}
@@ -791,34 +869,29 @@ void __mark_buffer_dirty(struct buffer_head *bh, int flag)
__mark_dirty(bh, flag);
}
-void __atomic_mark_buffer_dirty(struct buffer_head *bh, int flag)
-{
- lock_kernel();
- __mark_dirty(bh, flag);
- unlock_kernel();
-}
-
/*
* A buffer may need to be moved from one buffer list to another
* (e.g. in case it is not shared any more). Handle this.
*/
-void refile_buffer(struct buffer_head * buf)
+static __inline__ void __refile_buffer(struct buffer_head *bh)
{
- int dispose;
-
- if (buf->b_dev == B_FREE) {
- printk("Attempt to refile free buffer\n");
- return;
- }
-
- dispose = BUF_CLEAN;
- if (buffer_locked(buf))
+ int dispose = BUF_CLEAN;
+ if (buffer_locked(bh))
dispose = BUF_LOCKED;
- if (buffer_dirty(buf))
+ if (buffer_dirty(bh))
dispose = BUF_DIRTY;
+ if (dispose != bh->b_list) {
+ __remove_from_lru_list(bh, bh->b_list);
+ bh->b_list = dispose;
+ __insert_into_lru_list(bh, dispose);
+ }
+}
- if (dispose != buf->b_list)
- file_buffer(buf, dispose);
+void refile_buffer(struct buffer_head *bh)
+{
+ spin_lock(&lru_list_lock);
+ __refile_buffer(bh);
+ spin_unlock(&lru_list_lock);
}
/*
@@ -828,9 +901,8 @@ void __brelse(struct buffer_head * buf)
{
touch_buffer(buf);
- if (buf->b_count) {
- buf->b_count--;
- wake_up(&buffer_wait);
+ if (atomic_read(&buf->b_count)) {
+ atomic_dec(&buf->b_count);
return;
}
printk("VFS: brelse: Trying to free free buffer\n");
@@ -844,14 +916,21 @@ void __brelse(struct buffer_head * buf)
*/
void __bforget(struct buffer_head * buf)
{
- if (buf->b_count != 1 || buffer_locked(buf)) {
- __brelse(buf);
- return;
+ spin_lock(&lru_list_lock);
+ write_lock(&hash_table_lock);
+ if (atomic_read(&buf->b_count) != 1 || buffer_locked(buf)) {
+ touch_buffer(buf);
+ atomic_dec(&buf->b_count);
+ } else {
+ atomic_set(&buf->b_count, 0);
+ buf->b_state = 0;
+ if (buf->b_pprev)
+ __hash_unlink(buf);
+ __remove_from_lru_list(buf, buf->b_list);
+ put_last_free(buf);
}
- buf->b_count = 0;
- buf->b_state = 0;
- remove_from_queues(buf);
- put_last_free(buf);
+ write_unlock(&hash_table_lock);
+ spin_unlock(&lru_list_lock);
}
/*
@@ -941,49 +1020,25 @@ struct buffer_head * breada(kdev_t dev, int block, int bufsize,
/*
* Note: the caller should wake up the buffer_wait list if needed.
*/
-static void put_unused_buffer_head(struct buffer_head * bh)
+static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
{
if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
- nr_buffer_heads--;
kmem_cache_free(bh_cachep, bh);
- return;
+ } else {
+ bh->b_blocknr = -1;
+ init_waitqueue_head(&bh->b_wait);
+ nr_unused_buffer_heads++;
+ bh->b_next_free = unused_list;
+ bh->b_this_page = NULL;
+ unused_list = bh;
}
-
-// memset(bh, 0, sizeof(*bh));
- bh->b_blocknr = -1;
- init_waitqueue_head(&bh->b_wait);
- nr_unused_buffer_heads++;
- bh->b_next_free = unused_list;
- unused_list = bh;
}
-/*
- * We can't put completed temporary IO buffer_heads directly onto the
- * unused_list when they become unlocked, since the device driver
- * end_request routines still expect access to the buffer_head's
- * fields after the final unlock. So, the device driver puts them on
- * the reuse_list instead once IO completes, and we recover these to
- * the unused_list here.
- *
- * Note that we don't do a wakeup here, but return a flag indicating
- * whether we got any buffer heads. A task ready to sleep can check
- * the returned value, and any tasks already sleeping will have been
- * awakened when the buffer heads were added to the reuse list.
- */
-static inline int recover_reusable_buffer_heads(void)
+static void put_unused_buffer_head(struct buffer_head *bh)
{
- struct buffer_head *head = xchg(&reuse_list, NULL);
- int found = 0;
-
- if (head) {
- do {
- struct buffer_head *bh = head;
- head = head->b_next_free;
- put_unused_buffer_head(bh);
- } while (head);
- found = 1;
- }
- return found;
+ spin_lock(&unused_list_lock);
+ __put_unused_buffer_head(bh);
+ spin_unlock(&unused_list_lock);
}
/*
@@ -995,13 +1050,15 @@ static struct buffer_head * get_unused_buffer_head(int async)
{
struct buffer_head * bh;
- recover_reusable_buffer_heads();
+ spin_lock(&unused_list_lock);
if (nr_unused_buffer_heads > NR_RESERVED) {
bh = unused_list;
unused_list = bh->b_next_free;
nr_unused_buffer_heads--;
+ spin_unlock(&unused_list_lock);
return bh;
}
+ spin_unlock(&unused_list_lock);
/* This is critical. We can't swap out pages to get
* more buffer heads, because the swap-out may need
@@ -1010,20 +1067,23 @@ static struct buffer_head * get_unused_buffer_head(int async)
if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
memset(bh, 0, sizeof(*bh));
init_waitqueue_head(&bh->b_wait);
- nr_buffer_heads++;
return bh;
}
/*
* If we need an async buffer, use the reserved buffer heads.
*/
- if (async && unused_list) {
- bh = unused_list;
- unused_list = bh->b_next_free;
- nr_unused_buffer_heads--;
- return bh;
+ if (async) {
+ spin_lock(&unused_list_lock);
+ if (unused_list) {
+ bh = unused_list;
+ unused_list = bh->b_next_free;
+ nr_unused_buffer_heads--;
+ spin_unlock(&unused_list_lock);
+ return bh;
+ }
+ spin_unlock(&unused_list_lock);
}
-
#if 0
/*
* (Pending further analysis ...)
@@ -1035,7 +1095,6 @@ static struct buffer_head * get_unused_buffer_head(int async)
(bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
memset(bh, 0, sizeof(*bh));
init_waitqueue_head(&bh->b_wait);
- nr_buffer_heads++;
return bh;
}
#endif
@@ -1052,8 +1111,7 @@ static struct buffer_head * get_unused_buffer_head(int async)
* from ordinary buffer allocations, and only async requests are allowed
* to sleep waiting for buffer heads.
*/
-static struct buffer_head * create_buffers(unsigned long page,
- unsigned long size, int async)
+static struct buffer_head * create_buffers(unsigned long page, unsigned long size, int async)
{
DECLARE_WAITQUEUE(wait, current);
struct buffer_head *bh, *head;
@@ -1073,11 +1131,14 @@ try_again:
bh->b_state = 0;
bh->b_next_free = NULL;
- bh->b_count = 0;
+ bh->b_pprev = NULL;
+ atomic_set(&bh->b_count, 0);
bh->b_size = size;
bh->b_data = (char *) (page+offset);
- bh->b_list = 0;
+ bh->b_list = BUF_CLEAN;
+ bh->b_flushtime = 0;
+ bh->b_end_io = end_buffer_io_bad;
}
return head;
/*
@@ -1118,115 +1179,16 @@ no_grow:
*/
add_wait_queue(&buffer_wait, &wait);
current->state = TASK_UNINTERRUPTIBLE;
- if (!recover_reusable_buffer_heads())
+ if (nr_unused_buffer_heads < MAX_BUF_PER_PAGE) {
+ current->policy |= SCHED_YIELD;
schedule();
+ }
remove_wait_queue(&buffer_wait, &wait);
current->state = TASK_RUNNING;
goto try_again;
}
-/* Run the hooks that have to be done when a page I/O has completed. */
-static inline void after_unlock_page (struct page * page)
-{
- if (test_and_clear_bit(PG_decr_after, &page->flags)) {
- atomic_dec(&nr_async_pages);
-#ifdef DEBUG_SWAP
- printk ("DebugVM: Finished IO on page %p, nr_async_pages %d\n",
- (char *) page_address(page),
- atomic_read(&nr_async_pages));
-#endif
- }
- if (test_and_clear_bit(PG_swap_unlock_after, &page->flags))
- swap_after_unlock_page(page->offset);
- if (test_and_clear_bit(PG_free_after, &page->flags))
- __free_page(page);
-}
-
-/*
- * Free all temporary buffers belonging to a page.
- * This needs to be called with interrupts disabled.
- */
-static inline void free_async_buffers (struct buffer_head * bh)
-{
- struct buffer_head *tmp, *tail;
-
- /*
- * Link all the buffers into the b_next_free list,
- * so we only have to do one xchg() operation ...
- */
- tail = bh;
- while ((tmp = tail->b_this_page) != bh) {
- tail->b_next_free = tmp;
- tail = tmp;
- };
-
- /* Update the reuse list */
- tail->b_next_free = xchg(&reuse_list, NULL);
- reuse_list = bh;
-
- /* Wake up any waiters ... */
- wake_up(&buffer_wait);
-}
-
-static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
-{
- unsigned long flags;
- struct buffer_head *tmp;
- struct page *page;
-
- mark_buffer_uptodate(bh, uptodate);
-
- /* This is a temporary buffer used for page I/O. */
- page = mem_map + MAP_NR(bh->b_data);
-
- if (!uptodate)
- SetPageError(page);
-
- /*
- * Be _very_ careful from here on. Bad things can happen if
- * two buffer heads end IO at almost the same time and both
- * decide that the page is now completely done.
- *
- * Async buffer_heads are here only as labels for IO, and get
- * thrown away once the IO for this page is complete. IO is
- * deemed complete once all buffers have been visited
- * (b_count==0) and are now unlocked. We must make sure that
- * only the _last_ buffer that decrements its count is the one
- * that free's the page..
- */
- save_flags(flags);
- cli();
- unlock_buffer(bh);
- tmp = bh->b_this_page;
- while (tmp != bh) {
- if (buffer_locked(tmp))
- goto still_busy;
- tmp = tmp->b_this_page;
- }
-
- /* OK, the async IO on this page is complete. */
- restore_flags(flags);
-
- after_unlock_page(page);
- /*
- * if none of the buffers had errors then we can set the
- * page uptodate:
- */
- if (!PageError(page))
- SetPageUptodate(page);
- if (page->owner != -1)
- PAGE_BUG(page);
- page->owner = (int)current;
- UnlockPage(page);
-
- return;
-
-still_busy:
- restore_flags(flags);
- return;
-}
-
-static int create_page_buffers (int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
+static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
{
struct buffer_head *head, *bh, *tail;
int block;
@@ -1240,9 +1202,7 @@ static int create_page_buffers (int rw, struct page *page, kdev_t dev, int b[],
* They show up in the buffer hash table and are registered in
* page->buffers.
*/
- lock_kernel();
head = create_buffers(page_address(page), size, 1);
- unlock_kernel();
if (page->buffers)
BUG();
if (!head)
@@ -1252,7 +1212,9 @@ static int create_page_buffers (int rw, struct page *page, kdev_t dev, int b[],
block = *(b++);
tail = bh;
- init_buffer(bh, dev, block, end_buffer_io_async, NULL);
+ init_buffer(bh, end_buffer_io_async, NULL);
+ bh->b_dev = dev;
+ bh->b_blocknr = block;
/*
* When we use bmap, we define block zero to represent
@@ -1261,9 +1223,11 @@ static int create_page_buffers (int rw, struct page *page, kdev_t dev, int b[],
* two cases.
*/
if (bmap && !block) {
- set_bit(BH_Uptodate, &bh->b_state);
memset(bh->b_data, 0, size);
+ set_bit(BH_Uptodate, &bh->b_state);
+ continue;
}
+ set_bit(BH_Mapped, &bh->b_state);
}
tail->b_this_page = head;
get_page(page);
@@ -1287,7 +1251,6 @@ int block_flushpage(struct inode *inode, struct page *page, unsigned long offset
BUG();
if (!page->buffers)
return 0;
- lock_kernel();
head = page->buffers;
bh = head;
@@ -1299,14 +1262,16 @@ int block_flushpage(struct inode *inode, struct page *page, unsigned long offset
* is this block fully flushed?
*/
if (offset <= curr_off) {
- if (bh->b_blocknr) {
- bh->b_count++;
+ if (buffer_mapped(bh)) {
+ atomic_inc(&bh->b_count);
wait_on_buffer(bh);
if (bh->b_dev == B_FREE)
BUG();
mark_buffer_clean(bh);
+ clear_bit(BH_Uptodate, &bh->b_state);
+ clear_bit(BH_Mapped, &bh->b_state);
bh->b_blocknr = 0;
- bh->b_count--;
+ atomic_dec(&bh->b_count);
}
}
curr_off = next_off;
@@ -1318,22 +1283,24 @@ int block_flushpage(struct inode *inode, struct page *page, unsigned long offset
* the 'final' flushpage. We have invalidated the bmap
* cached value unconditionally, so real IO is not
* possible anymore.
+ *
+ * If the free doesn't work out, the buffers can be
+ * left around - they just turn into anonymous buffers
+ * instead.
*/
- if (!offset)
- try_to_free_buffers(page);
+ if (!offset) {
+ if (!try_to_free_buffers(page))
+ atomic_add(PAGE_CACHE_SIZE, &buffermem);
+ }
- unlock_kernel();
return 0;
}
-static void create_empty_buffers (struct page *page,
- struct inode *inode, unsigned long blocksize)
+static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
{
struct buffer_head *bh, *head, *tail;
- lock_kernel();
head = create_buffers(page_address(page), blocksize, 1);
- unlock_kernel();
if (page->buffers)
BUG();
@@ -1341,6 +1308,7 @@ static void create_empty_buffers (struct page *page,
do {
bh->b_dev = inode->i_dev;
bh->b_blocknr = 0;
+ bh->b_end_io = end_buffer_io_bad;
tail = bh;
bh = bh->b_this_page;
} while (bh);
@@ -1353,12 +1321,12 @@ static void create_empty_buffers (struct page *page,
* block_write_full_page() is SMP-safe - currently it's still
* being called with the kernel lock held, but the code is ready.
*/
-int block_write_full_page (struct file *file, struct page *page, fs_getblock_t fs_get_block)
+int block_write_full_page(struct file *file, struct page *page)
{
struct dentry *dentry = file->f_dentry;
struct inode *inode = dentry->d_inode;
- int err, created, i;
- unsigned long block, phys, offset;
+ int err, i;
+ unsigned long block, offset;
struct buffer_head *bh, *head;
if (!PageLocked(page))
@@ -1381,23 +1349,22 @@ int block_write_full_page (struct file *file, struct page *page, fs_getblock_t f
if (!bh)
BUG();
- if (!bh->b_blocknr) {
- err = -EIO;
- phys = fs_get_block (inode, block, 1, &err, &created);
- if (!phys)
+ /*
+ * If the buffer isn't up-to-date, we can't be sure
+ * that the buffer has been initialized with the proper
+ * block number information etc..
+ *
+ * Leave it to the low-level FS to make all those
+ * decisions (block #0 may actually be a valid block)
+ */
+ bh->b_end_io = end_buffer_io_sync;
+ if (!buffer_mapped(bh)) {
+ err = inode->i_op->get_block(inode, block, bh, 1);
+ if (err)
goto out;
-
- init_buffer(bh, inode->i_dev, phys, end_buffer_io_sync, NULL);
- bh->b_state = (1<<BH_Uptodate);
- } else {
- /*
- * block already exists, just mark it uptodate and
- * dirty:
- */
- bh->b_end_io = end_buffer_io_sync;
- set_bit(BH_Uptodate, &bh->b_state);
}
- atomic_mark_buffer_dirty(bh,0);
+ set_bit(BH_Uptodate, &bh->b_state);
+ mark_buffer_dirty(bh,0);
bh = bh->b_this_page;
block++;
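Only the caller side of the new interface is shown by this patch: if the buffer is not BH_Mapped, ask the filesystem via inode->i_op->get_block() and let it fill in the mapping. A rough, hypothetical sketch of the filesystem side (the examplefs_* names and helpers are invented for illustration and are not part of this patch or of any real filesystem):

static int examplefs_get_block(struct inode *inode, unsigned long iblock,
                               struct buffer_head *bh, int create)
{
        /* Hypothetical mapping helper: logical block -> physical block, 0 = hole. */
        unsigned long phys = examplefs_lookup(inode, iblock);

        if (!phys) {
                if (!create)
                        return 0;               /* hole: leave bh unmapped, reader zero-fills */
                phys = examplefs_allocate(inode, iblock);       /* hypothetical allocator */
                if (!phys)
                        return -ENOSPC;
                set_bit(BH_New, &bh->b_state);  /* fresh block: nothing useful on disk yet */
        }
        bh->b_dev = inode->i_dev;
        bh->b_blocknr = phys;
        set_bit(BH_Mapped, &bh->b_state);
        return 0;
}

BH_New is what block_write_partial_page() below tests with buffer_new() to choose between zero-filling and reading the block, and BH_Mapped is what block_read_full_page() tests to recognize holes.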
@@ -1410,15 +1377,15 @@ out:
return err;
}
-int block_write_partial_page (struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf, fs_getblock_t fs_get_block)
+int block_write_partial_page(struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf)
{
struct dentry *dentry = file->f_dentry;
struct inode *inode = dentry->d_inode;
unsigned long block;
- int err, created, partial;
+ int err, partial;
unsigned long blocksize, start_block, end_block;
unsigned long start_offset, start_bytes, end_bytes;
- unsigned long bbits, phys, blocks, i, len;
+ unsigned long bbits, blocks, i, len;
struct buffer_head *bh, *head;
char * target_buf;
@@ -1469,46 +1436,35 @@ int block_write_partial_page (struct file *file, struct page *page, unsigned lon
partial = 1;
goto skip;
}
- if (!bh->b_blocknr) {
- err = -EIO;
- phys = fs_get_block (inode, block, 1, &err, &created);
- if (!phys)
- goto out;
- init_buffer(bh, inode->i_dev, phys, end_buffer_io_sync, NULL);
+ /*
+ * If the buffer is not up-to-date, we need to ask the low-level
+ * FS to do something for us (we used to have assumptions about
+ * the meaning of b_blocknr etc, that's bad).
+ *
+ * If "update" is set, that means that the low-level FS should
+ * try to make sure that the block is up-to-date because we're
+ * not going to fill it completely.
+ */
+ bh->b_end_io = end_buffer_io_sync;
+ if (!buffer_mapped(bh)) {
+ err = inode->i_op->get_block(inode, block, bh, 1);
+ if (err)
+ goto out;
+ }
- /*
- * if partially written block which has contents on
- * disk, then we have to read it first.
- * We also rely on the fact that filesystem holes
- * cannot be written.
- */
- if (start_offset || (end_bytes && (i == end_block))) {
- if (created) {
- memset(bh->b_data, 0, bh->b_size);
- } else {
- bh->b_state = 0;
- ll_rw_block(READ, 1, &bh);
- lock_kernel();
- wait_on_buffer(bh);
- unlock_kernel();
- err = -EIO;
- if (!buffer_uptodate(bh))
- goto out;
- }
+ if (!buffer_uptodate(bh) && (start_offset || (end_bytes && (i == end_block)))) {
+ if (buffer_new(bh)) {
+ memset(bh->b_data, 0, bh->b_size);
+ } else {
+ ll_rw_block(READ, 1, &bh);
+ wait_on_buffer(bh);
+ err = -EIO;
+ if (!buffer_uptodate(bh))
+ goto out;
}
-
- bh->b_state = (1<<BH_Uptodate);
- } else {
- /*
- * block already exists, just mark it uptodate:
- */
- bh->b_end_io = end_buffer_io_sync;
- set_bit(BH_Uptodate, &bh->b_state);
- created = 0;
}
- err = -EFAULT;
len = blocksize;
if (start_offset) {
len = start_bytes;
@@ -1517,8 +1473,7 @@ int block_write_partial_page (struct file *file, struct page *page, unsigned lon
len = end_bytes;
end_bytes = 0;
}
- if (copy_from_user(target_buf, buf, len))
- goto out;
+ err = copy_from_user(target_buf, buf, len);
target_buf += len;
buf += len;
@@ -1538,12 +1493,18 @@ int block_write_partial_page (struct file *file, struct page *page, unsigned lon
* should not penalize them for somebody else writing
* lots of dirty pages.
*/
+ set_bit(BH_Uptodate, &bh->b_state);
if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
- __atomic_mark_buffer_dirty(bh, bdf_prm.b_un.age_buffer);
+ __mark_dirty(bh, 0);
if (too_many_dirty_buffers)
balance_dirty(bh->b_dev);
}
+ if (err) {
+ err = -EFAULT;
+ goto out;
+ }
+
skip:
i++;
block++;
@@ -1572,6 +1533,9 @@ out:
*
* brw_page() is SMP-safe, although it's being called with the
* kernel lock held - but the code is ready.
+ *
+ * FIXME: we need a swapper_inode->get_block function to remove
+ * some of the bmap kludges and interface ugliness here.
*/
int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
{
@@ -1600,7 +1564,7 @@ int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
do {
block = *(b++);
- if (fresh && (bh->b_count != 0))
+ if (fresh && (atomic_read(&bh->b_count) != 0))
BUG();
if (rw == READ) {
if (!fresh)
@@ -1613,6 +1577,7 @@ int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
BUG();
if (!buffer_uptodate(bh)) {
arr[nr++] = bh;
+ atomic_inc(&bh->b_count);
}
}
} else { /* WRITE */
@@ -1625,8 +1590,9 @@ int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
BUG();
}
set_bit(BH_Uptodate, &bh->b_state);
- atomic_mark_buffer_dirty(bh, 0);
+ set_bit(BH_Dirty, &bh->b_state);
arr[nr++] = bh;
+ atomic_inc(&bh->b_count);
}
bh = bh->b_this_page;
} while (bh != head);
@@ -1649,30 +1615,7 @@ int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
}
/*
- * This is called by end_request() when I/O has completed.
- */
-void mark_buffer_uptodate(struct buffer_head * bh, int on)
-{
- if (on) {
- struct buffer_head *tmp = bh;
- struct page *page;
- set_bit(BH_Uptodate, &bh->b_state);
- /* If a page has buffers and all these buffers are uptodate,
- * then the page is uptodate. */
- do {
- if (!test_bit(BH_Uptodate, &tmp->b_state))
- return;
- tmp=tmp->b_this_page;
- } while (tmp && tmp != bh);
- page = mem_map + MAP_NR(bh->b_data);
- SetPageUptodate(page);
- return;
- }
- clear_bit(BH_Uptodate, &bh->b_state);
-}
-
-/*
- * Generic "readpage" function for block devices that have the normal
+ * Generic "read page" function for block devices that have the normal
* bmap functionality. This is most of the block device filesystems.
* Reads the page asynchronously --- the unlock_buffer() and
* mark_buffer_uptodate() functions propagate buffer state into the
@@ -1682,7 +1625,7 @@ int block_read_full_page(struct file * file, struct page * page)
{
struct dentry *dentry = file->f_dentry;
struct inode *inode = dentry->d_inode;
- unsigned long iblock, phys_block;
+ unsigned long iblock;
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
unsigned int blocksize, blocks;
int nr;
@@ -1700,33 +1643,25 @@ int block_read_full_page(struct file * file, struct page * page)
head = page->buffers;
bh = head;
nr = 0;
+
do {
- phys_block = bh->b_blocknr;
- /*
- * important, we have to retry buffers that already have
- * their bnr cached but had an IO error!
- */
- if (!buffer_uptodate(bh)) {
- phys_block = inode->i_op->bmap(inode, iblock);
- /*
- * this is safe to do because we hold the page lock:
- */
- if (phys_block) {
- init_buffer(bh, inode->i_dev, phys_block,
- end_buffer_io_async, NULL);
- arr[nr] = bh;
- nr++;
- } else {
- /*
- * filesystem 'hole' represents zero-contents:
- */
+ if (buffer_uptodate(bh))
+ continue;
+
+ if (!buffer_mapped(bh)) {
+ inode->i_op->get_block(inode, iblock, bh, 0);
+ if (!buffer_mapped(bh)) {
memset(bh->b_data, 0, blocksize);
set_bit(BH_Uptodate, &bh->b_state);
+ continue;
}
}
- iblock++;
- bh = bh->b_this_page;
- } while (bh != head);
+
+ init_buffer(bh, end_buffer_io_async, NULL);
+ atomic_inc(&bh->b_count);
+ arr[nr] = bh;
+ nr++;
+ } while (iblock++, (bh = bh->b_this_page) != head);
++current->maj_flt;
if (nr) {
@@ -1770,8 +1705,9 @@ static int grow_buffers(int size)
}
isize = BUFSIZE_INDEX(size);
- insert_point = free_list[isize];
+ spin_lock(&free_list[isize].lock);
+ insert_point = free_list[isize].list;
tmp = bh;
while (1) {
if (insert_point) {
@@ -1790,9 +1726,11 @@ static int grow_buffers(int size)
break;
}
tmp->b_this_page = bh;
- free_list[isize] = bh;
+ free_list[isize].list = bh;
+ spin_unlock(&free_list[isize].lock);
+
mem_map[MAP_NR(page)].buffers = bh;
- buffermem += PAGE_SIZE;
+ atomic_add(PAGE_SIZE, &buffermem);
return 1;
}
@@ -1800,7 +1738,7 @@ static int grow_buffers(int size)
* Can the buffer be thrown out?
*/
#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
-#define buffer_busy(bh) ((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))
+#define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
/*
* try_to_free_buffers() checks if all the buffers on this particular page
@@ -1808,90 +1746,70 @@ static int grow_buffers(int size)
*
* Wake up bdflush() if this fails - if we're running low on memory due
* to dirty buffers, we need to flush them out as quickly as possible.
+ *
+ * NOTE: There are quite a number of ways that threads of control can
+ * obtain a reference to a buffer head within a page. So we must
+ * lock out all of these paths to cleanly toss the page.
*/
int try_to_free_buffers(struct page * page)
{
struct buffer_head * tmp, * bh = page->buffers;
+ int index = BUFSIZE_INDEX(bh->b_size);
+ int ret;
+ spin_lock(&lru_list_lock);
+ write_lock(&hash_table_lock);
+ spin_lock(&free_list[index].lock);
tmp = bh;
do {
struct buffer_head * p = tmp;
tmp = tmp->b_this_page;
- if (!buffer_busy(p))
- continue;
-
- too_many_dirty_buffers = 1;
- wakeup_bdflush(0);
- return 0;
+ if (buffer_busy(p))
+ goto busy_buffer_page;
} while (tmp != bh);
+ spin_lock(&unused_list_lock);
tmp = bh;
do {
struct buffer_head * p = tmp;
tmp = tmp->b_this_page;
- /* The buffer can be either on the regular queues or on the free list.. */
- if (p->b_dev == B_FREE)
- remove_from_free_list(p);
- else
- remove_from_queues(p);
-
- put_unused_buffer_head(p);
+ /* The buffer can be either on the regular
+ * queues or on the free list..
+ */
+ if (p->b_dev == B_FREE) {
+ __remove_from_free_list(p, index);
+ } else {
+ if (p->b_pprev)
+ __hash_unlink(p);
+ __remove_from_lru_list(p, p->b_list);
+ }
+ __put_unused_buffer_head(p);
} while (tmp != bh);
+ spin_unlock(&unused_list_lock);
/* Wake up anyone waiting for buffer heads */
wake_up(&buffer_wait);
/* And free the page */
page->buffers = NULL;
- if (__free_page(page)) {
- buffermem -= PAGE_SIZE;
- return 1;
- }
- return 0;
-}
-
-/* ================== Debugging =================== */
-
-void show_buffers(void)
-{
- struct buffer_head * bh;
- int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
- int protected = 0;
- int nlist;
- static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};
-
- printk("Buffer memory: %6dkB\n",buffermem>>10);
- printk("Buffer heads: %6d\n",nr_buffer_heads);
- printk("Buffer blocks: %6d\n",nr_buffers);
- printk("Buffer hashed: %6d\n",nr_hashed_buffers);
-
- for(nlist = 0; nlist < NR_LIST; nlist++) {
- found = locked = dirty = used = lastused = protected = 0;
- bh = lru_list[nlist];
- if(!bh) continue;
-
- do {
- found++;
- if (buffer_locked(bh))
- locked++;
- if (buffer_protected(bh))
- protected++;
- if (buffer_dirty(bh))
- dirty++;
- if (bh->b_count)
- used++, lastused = found;
- bh = bh->b_next_free;
- } while (bh != lru_list[nlist]);
- printk("%8s: %d buffers, %d used (last=%d), "
- "%d locked, %d protected, %d dirty\n",
- buf_types[nlist], found, used, lastused,
- locked, protected, dirty);
- };
+ __free_page(page);
+ ret = 1;
+out:
+ spin_unlock(&free_list[index].lock);
+ write_unlock(&hash_table_lock);
+ spin_unlock(&lru_list_lock);
+ return ret;
+
+busy_buffer_page:
+ /* Uhhuh, start writeback so that we don't end up with all dirty pages */
+ too_many_dirty_buffers = 1;
+ wakeup_bdflush(0);
+ ret = 0;
+ goto out;
}
-
/* ===================== Init ======================= */
/*
@@ -1901,31 +1819,53 @@ void show_buffers(void)
*/
void __init buffer_init(unsigned long memory_size)
{
- int order;
+ int order, i;
unsigned int nr_hash;
- /* we need to guess at the right sort of size for a buffer cache.
- the heuristic from working with large databases and getting
- fsync times (ext2) manageable, is the following */
-
- memory_size >>= 22;
- for (order = 5; (1UL << order) < memory_size; order++);
+ /* The buffer cache hash table is less important these days,
+ * trim it a bit.
+ */
+ memory_size >>= 14;
+ memory_size *= sizeof(struct buffer_head *);
+ for (order = 0; (PAGE_SIZE << order) < memory_size; order++)
+ ;
/* try to allocate something until we get it or we're asking
for something that is really too small */
do {
- nr_hash = (1UL << order) * PAGE_SIZE /
- sizeof(struct buffer_head *);
+ unsigned long tmp;
+
+ nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
+ bh_hash_mask = (nr_hash - 1);
+
+ tmp = nr_hash;
+ bh_hash_shift = 0;
+ while((tmp >>= 1UL) != 0UL)
+ bh_hash_shift++;
+
hash_table = (struct buffer_head **)
__get_free_pages(GFP_ATOMIC, order);
- } while (hash_table == NULL && --order > 4);
- printk("buffer-cache hash table entries: %d (order: %d, %ld bytes)\n", nr_hash, order, (1UL<<order) * PAGE_SIZE);
-
+ } while (hash_table == NULL && --order > 0);
+ printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
+ nr_hash, order, (1UL<<order) * PAGE_SIZE);
+
if (!hash_table)
panic("Failed to allocate buffer hash table\n");
- memset(hash_table, 0, nr_hash * sizeof(struct buffer_head *));
- bh_hash_mask = nr_hash-1;
+
+ /* Setup hash chains. */
+ for(i = 0; i < nr_hash; i++)
+ hash_table[i] = NULL;
+
+ /* Setup free lists. */
+ for(i = 0; i < NR_SIZES; i++) {
+ free_list[i].list = NULL;
+ free_list[i].lock = SPIN_LOCK_UNLOCKED;
+ }
+
+ /* Setup lru lists. */
+ for(i = 0; i < NR_LIST; i++)
+ lru_list[i] = NULL;
bh_cachep = kmem_cache_create("buffer_head",
sizeof(struct buffer_head),
@@ -1933,21 +1873,6 @@ void __init buffer_init(unsigned long memory_size)
SLAB_HWCACHE_ALIGN, NULL, NULL);
if(!bh_cachep)
panic("Cannot create buffer head SLAB cache\n");
- /*
- * Allocate the reserved buffer heads.
- */
- while (nr_buffer_heads < NR_RESERVED) {
- struct buffer_head * bh;
-
- bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
- if (!bh)
- break;
- put_unused_buffer_head(bh);
- nr_buffer_heads++;
- }
-
- lru_list[BUF_CLEAN] = 0;
- grow_buffers(BLOCK_SIZE);
}
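The hash-table sizing introduced in buffer_init() above is easy to sanity-check: shift memory_size down by 14, scale by the pointer size, round up to a power-of-two number of pages, and derive mask and shift from the resulting entry count. A stand-alone rerun of that arithmetic, assuming 4 KB pages and using sizeof(void *) in place of sizeof(struct buffer_head *); 64 MB of RAM is just an example figure, and works out to a 4096-entry table, bh_hash_mask = 0xfff, bh_hash_shift = 12:

#include <stdio.h>

#define PAGE_SIZE 4096UL        /* assumed page size */

int main(void)
{
        unsigned long memory_size = 64UL << 20;         /* example: 64 MB of RAM */
        unsigned long tmp;
        unsigned int order, nr_hash, shift;

        memory_size >>= 14;
        memory_size *= sizeof(void *);
        for (order = 0; (PAGE_SIZE << order) < memory_size; order++)
                ;
        nr_hash = (PAGE_SIZE << order) / sizeof(void *);
        for (shift = 0, tmp = nr_hash; (tmp >>= 1) != 0; shift++)
                ;
        printf("order=%u nr_hash=%u mask=%#x shift=%u\n",
               order, nr_hash, nr_hash - 1, shift);
        return 0;
}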
@@ -1983,70 +1908,49 @@ void wakeup_bdflush(int wait)
static int sync_old_buffers(void)
{
- int i;
- int ndirty, nwritten;
int nlist;
- int ncount;
- struct buffer_head * bh, *next;
+ lock_kernel();
sync_supers(0);
sync_inodes(0);
+ unlock_kernel();
- ncount = 0;
-#ifdef DEBUG
- for(nlist = 0; nlist < NR_LIST; nlist++)
-#else
- for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++)
-#endif
- {
- ndirty = 0;
- nwritten = 0;
+ for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
+ struct buffer_head *bh;
repeat:
-
+ spin_lock(&lru_list_lock);
bh = lru_list[nlist];
- if(bh)
- for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
- /* We may have stalled while waiting for I/O to complete. */
- if(bh->b_list != nlist) goto repeat;
- next = bh->b_next_free;
- if(!lru_list[nlist]) {
- printk("Dirty list empty %d\n", i);
- break;
- }
-
- /* Clean buffer on dirty list? Refile it */
- if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh)) {
- refile_buffer(bh);
- continue;
- }
-
- /* Unlocked buffer on locked list? Refile it */
- if (nlist == BUF_LOCKED && !buffer_locked(bh)) {
- refile_buffer(bh);
- continue;
- }
+ if(bh) {
+ struct buffer_head *next;
+ int i;
+ for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
+ next = bh->b_next_free;
+
+ /* If the buffer is not on the proper list,
+ * then refile it.
+ */
+ if ((nlist == BUF_DIRTY &&
+ (!buffer_dirty(bh) && !buffer_locked(bh))) ||
+ (nlist == BUF_LOCKED && !buffer_locked(bh))) {
+ __refile_buffer(bh);
+ continue;
+ }
- if (buffer_locked(bh) || !buffer_dirty(bh))
- continue;
- ndirty++;
- nwritten++;
- next->b_count++;
- bh->b_count++;
- bh->b_flushtime = 0;
-#ifdef DEBUG
- if(nlist != BUF_DIRTY) ncount++;
-#endif
- ll_rw_block(WRITE, 1, &bh);
- bh->b_count--;
- next->b_count--;
- }
+ if (buffer_locked(bh) || !buffer_dirty(bh))
+ continue;
+
+ /* OK, now we are committed to write it out. */
+ bh->b_flushtime = 0;
+ atomic_inc(&bh->b_count);
+ spin_unlock(&lru_list_lock);
+ ll_rw_block(WRITE, 1, &bh);
+ atomic_dec(&bh->b_count);
+ goto repeat;
+ }
+ }
+ spin_unlock(&lru_list_lock);
}
run_task_queue(&tq_disk);
-#ifdef DEBUG
- if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
- printk("Wrote %d/%d buffers\n", nwritten, ndirty);
-#endif
- run_task_queue(&tq_disk);
return 0;
}
@@ -2060,7 +1964,6 @@ asmlinkage int sys_bdflush(int func, long data)
{
int i, error = -EPERM;
- lock_kernel();
if (!capable(CAP_SYS_ADMIN))
goto out;
@@ -2092,7 +1995,6 @@ asmlinkage int sys_bdflush(int func, long data)
*/
error = 0;
out:
- unlock_kernel();
return error;
}
@@ -2114,52 +2016,37 @@ int bdflush(void * unused)
sprintf(current->comm, "kflushd");
bdflush_tsk = current;
- /*
- * As a kernel thread we want to tamper with system buffers
- * and other internals and thus be subject to the SMP locking
- * rules. (On a uniprocessor box this does nothing).
- */
- lock_kernel();
-
for (;;) {
int nlist;
CHECK_EMERGENCY_SYNC
- for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++)
- {
- int nr;
- int written = 0;
+ for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
+ int nr, major, written = 0;
struct buffer_head *next;
- int major;
repeat:
+ spin_lock(&lru_list_lock);
next = lru_list[nlist];
nr = nr_buffers_type[nlist];
-
while (nr-- > 0) {
struct buffer_head *bh = next;
- /* We may have stalled while waiting for I/O to complete. */
- if (next->b_list != nlist)
- goto repeat;
+
next = next->b_next_free;
- /* Clean buffer on dirty list? Refile it */
- if (nlist == BUF_DIRTY && !buffer_dirty(bh)) {
- refile_buffer(bh);
- continue;
- }
-
- /* Unlocked buffer on locked list? Refile it */
- if (nlist == BUF_LOCKED && !buffer_locked(bh)) {
- refile_buffer(bh);
+ /* If the buffer is not on the correct list,
+ * then refile it.
+ */
+ if ((nlist == BUF_DIRTY &&
+ (!buffer_dirty(bh) && !buffer_locked(bh))) ||
+ (nlist == BUF_LOCKED && !buffer_locked(bh))) {
+ __refile_buffer(bh);
continue;
}
- /*
- * If we aren't in panic mode, don't write out too much
- * at a time. Also, don't write out buffers we don't really
- * have to write out yet..
+ /* If we aren't in panic mode, don't write out too much
+ * at a time. Also, don't write out buffers we don't
+ * really have to write out yet..
*/
if (!too_many_dirty_buffers) {
if (written > bdf_prm.b_un.ndirty)
@@ -2172,9 +2059,6 @@ int bdflush(void * unused)
continue;
major = MAJOR(bh->b_dev);
- if (next)
- next->b_count++;
- bh->b_count++;
written++;
bh->b_flushtime = 0;
@@ -2182,18 +2066,18 @@ int bdflush(void * unused)
* For the loop major we can try to do asynchronous writes,
* but we have to guarantee that we're making some progress..
*/
+ atomic_inc(&bh->b_count);
+ spin_unlock(&lru_list_lock);
if (major == LOOP_MAJOR && written > 1) {
ll_rw_block(WRITEA, 1, &bh);
if (buffer_dirty(bh))
--written;
} else
ll_rw_block(WRITE, 1, &bh);
-
- bh->b_count--;
- if (next)
- next->b_count--;
- wake_up(&buffer_wait);
+ atomic_dec(&bh->b_count);
+ goto repeat;
}
+ spin_unlock(&lru_list_lock);
}
run_task_queue(&tq_disk);
wake_up(&bdflush_done);
@@ -2206,7 +2090,10 @@ int bdflush(void * unused)
*/
if (!too_many_dirty_buffers || nr_buffers_type[BUF_DIRTY] < bdf_prm.b_un.ndirty) {
too_many_dirty_buffers = 0;
- sleep_on_timeout(&bdflush_wait, 5*HZ);
+ spin_lock_irq(&current->sigmask_lock);
+ flush_signals(current);
+ spin_unlock_irq(&current->sigmask_lock);
+ interruptible_sleep_on_timeout(&bdflush_wait, 5*HZ);
}
}
}