Diffstat (limited to 'fs/ext2/inode.c')
-rw-r--r--  fs/ext2/inode.c  545
1 file changed, 272 insertions(+), 273 deletions(-)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index c7234e7b5..678eb4d28 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -18,6 +18,8 @@
* David S. Miller (davem@caip.rutgers.edu), 1995
* 64-bit file support on 64-bit platforms by Jakub Jelinek
* (jj@sunsite.ms.mff.cuni.cz)
+ *
+ * Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000
*/
#include <linux/fs.h>
@@ -26,8 +28,6 @@
#include <linux/sched.h>
#include <linux/highuid.h>
-
-
static int ext2_update_inode(struct inode * inode, int do_sync);
/*
@@ -64,23 +64,18 @@ no_delete:
clear_inode(inode); /* We must guarantee clearing of inode... */
}
-/*
- * ext2_discard_prealloc and ext2_alloc_block are atomic wrt. the
- * superblock in the same manner as are ext2_free_blocks and
- * ext2_new_block. We just wait on the super rather than locking it
- * here, since ext2_new_block will do the necessary locking and we
- * can't block until then.
- */
void ext2_discard_prealloc (struct inode * inode)
{
#ifdef EXT2_PREALLOCATE
- unsigned short total;
-
lock_kernel();
+ /* Writer: ->i_prealloc* */
if (inode->u.ext2_i.i_prealloc_count) {
- total = inode->u.ext2_i.i_prealloc_count;
+ unsigned short total = inode->u.ext2_i.i_prealloc_count;
+ unsigned long block = inode->u.ext2_i.i_prealloc_block;
inode->u.ext2_i.i_prealloc_count = 0;
- ext2_free_blocks (inode, inode->u.ext2_i.i_prealloc_block, total);
+ inode->u.ext2_i.i_prealloc_block = 0;
+ /* Writer: end */
+ ext2_free_blocks (inode, block, total);
}
unlock_kernel();
#endif
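
The reordering above is one of the race fixes: the prealloc window is snapshotted into locals and cleared before ext2_free_blocks() is called, because the freeing path may sleep and sleeping drops the BKL, so the window must already be invalid by the time anyone else can run. A minimal standalone sketch of the idiom (names here are hypothetical, not kernel API):

	struct prealloc_window { unsigned long block; unsigned short count; };

	static void discard_window(struct prealloc_window *w)
	{
		if (w->count) {
			unsigned short count = w->count;
			unsigned long block = w->block;
			/* invalidate the window before anything that may sleep */
			w->count = 0;
			w->block = 0;
			free_blocks(block, count);	/* hypothetical, may sleep */
		}
	}
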
@@ -93,22 +88,26 @@ static int ext2_alloc_block (struct inode * inode, unsigned long goal, int *err)
#endif
unsigned long result;
- wait_on_super (inode->i_sb);
#ifdef EXT2_PREALLOCATE
+ /* Writer: ->i_prealloc* */
if (inode->u.ext2_i.i_prealloc_count &&
(goal == inode->u.ext2_i.i_prealloc_block ||
goal + 1 == inode->u.ext2_i.i_prealloc_block))
{
result = inode->u.ext2_i.i_prealloc_block++;
inode->u.ext2_i.i_prealloc_count--;
+ /* Writer: end */
+#ifdef EXT2FS_DEBUG
ext2_debug ("preallocation hit (%lu/%lu).\n",
++alloc_hits, ++alloc_attempts);
-
+#endif
} else {
ext2_discard_prealloc (inode);
+#ifdef EXT2FS_DEBUG
ext2_debug ("preallocation miss (%lu/%lu).\n",
alloc_hits, ++alloc_attempts);
+#endif
if (S_ISREG(inode->i_mode))
result = ext2_new_block (inode, goal,
&inode->u.ext2_i.i_prealloc_count,
@@ -299,307 +298,307 @@ no_block:
return p;
}
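
The context above is the tail of ext2_get_branch(), which fills an array of Indirect triples describing the path from the inode to the block. The Indirect type and the verify_chain() helper are added by this patch outside the quoted hunks; from the way they are used below, they read roughly like this (paraphrase, not a quoted hunk):

	typedef struct {
		u32	*p;		/* address of the block pointer */
		u32	key;		/* value of *p sampled when the chain was read */
		struct buffer_head *bh;	/* buffer the pointer lives in; NULL for i_data */
	} Indirect;

	/* the chain is still valid iff no sampled pointer changed under us */
	static inline int verify_chain(Indirect *from, Indirect *to)
	{
		while (from <= to && from->key == *from->p)
			from++;
		return (from > to);
	}
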
-static struct buffer_head * inode_getblk (struct inode * inode, int nr,
- int new_block, int * err, int metadata, long *phys, int *new)
+/**
+ * ext2_find_near - find a place for allocation with sufficient locality
+ * @inode: owner
+ * @ind: descriptor of indirect block.
+ *
+ * This function returns the preferred place for block allocation.
+ * It is used when the heuristic for sequential allocation fails.
+ * Rules are:
+ * + if there is a block to the left of our position - allocate near it.
+ * + if pointer will live in indirect block - allocate near that block.
+ * + if pointer will live in inode - allocate in the same cylinder group.
+ * Caller must make sure that @ind is valid and will stay that way.
+ */
+
+static inline unsigned long ext2_find_near(struct inode *inode, Indirect *ind)
{
- u32 * p;
- int tmp, goal = 0;
- struct buffer_head * result;
- int blocksize = inode->i_sb->s_blocksize;
+ u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext2_i.i_data;
+ u32 *p;
- p = inode->u.ext2_i.i_data + nr;
-repeat:
- tmp = le32_to_cpu(*p);
- if (tmp) {
- if (metadata) {
- result = getblk (inode->i_dev, tmp, blocksize);
- if (tmp == le32_to_cpu(*p))
- return result;
- brelse (result);
- goto repeat;
- } else {
- *phys = tmp;
- return NULL;
- }
- }
+ /* Try to find previous block */
+ for (p = ind->p - 1; p >= start; p--)
+ if (*p)
+ return le32_to_cpu(*p);
- if (inode->u.ext2_i.i_next_alloc_block == new_block)
- goal = inode->u.ext2_i.i_next_alloc_goal;
+ /* No such thing, so let's try location of indirect block */
+ if (ind->bh)
+ return ind->bh->b_blocknr;
- ext2_debug ("hint = %d,", goal);
+ /*
+	 * Is it going to be referred to from the inode itself? OK, just put it into
+ * the same cylinder group then.
+ */
+ return (inode->u.ext2_i.i_block_group *
+ EXT2_BLOCKS_PER_GROUP(inode->i_sb)) +
+ le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_first_data_block);
+}
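
The last rule amounts to "first block of the inode's own cylinder group". A hedged illustration of that arithmetic, with made-up geometry:

	/* illustration only - geometry is hypothetical, not read from a superblock */
	static unsigned long group_first_block(unsigned long group)
	{
		const unsigned long blocks_per_group = 8192;	/* assumed */
		const unsigned long first_data_block = 1;	/* assumed: 1k blocks */

		return group * blocks_per_group + first_data_block;
		/* e.g. group 3: 3 * 8192 + 1 == 24577 */
	}
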
- if (!goal) {
- for (tmp = nr - 1; tmp >= 0; tmp--) {
- if (inode->u.ext2_i.i_data[tmp]) {
- goal = le32_to_cpu(inode->u.ext2_i.i_data[tmp]);
- break;
- }
- }
- if (!goal)
- goal = (inode->u.ext2_i.i_block_group *
- EXT2_BLOCKS_PER_GROUP(inode->i_sb)) +
- le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_first_data_block);
- }
+/**
+ * ext2_find_goal - find a preferred place for allocation.
+ * @inode: owner
+ * @block: block we want
+ * @chain: chain of indirect blocks
+ * @partial: pointer to the last triple within a chain.
+ *
+ * This function returns the preferred place for block allocation.
+ */
- ext2_debug ("goal = %d.\n", goal);
-
- tmp = ext2_alloc_block (inode, goal, err);
- if (!tmp)
- return NULL;
-
- if (metadata) {
- result = getblk (inode->i_dev, tmp, blocksize);
- if (!buffer_uptodate(result))
- wait_on_buffer(result);
- memset(result->b_data, 0, blocksize);
- mark_buffer_uptodate(result, 1);
- mark_buffer_dirty(result, 1);
- if (*p) {
- ext2_free_blocks (inode, tmp, 1);
- bforget (result);
- goto repeat;
- }
- } else {
- if (*p) {
- /*
- * Nobody is allowed to change block allocation
- * state from under us:
- */
- ext2_error (inode->i_sb, "block_getblk",
- "data block filled under us");
- BUG();
- ext2_free_blocks (inode, tmp, 1);
- goto repeat;
- }
- *phys = tmp;
- result = NULL;
- *err = 0;
- *new = 1;
- }
- *p = cpu_to_le32(tmp);
+static inline unsigned long ext2_find_goal(struct inode *inode,
+ long block,
+ Indirect chain[4],
+ Indirect *partial)
+{
+ unsigned long goal = 0;
- inode->u.ext2_i.i_next_alloc_block = new_block;
- inode->u.ext2_i.i_next_alloc_goal = tmp;
- inode->i_ctime = CURRENT_TIME;
- inode->i_blocks += blocksize/512;
- if (IS_SYNC(inode) || inode->u.ext2_i.i_osync)
- ext2_sync_inode (inode);
- else
- mark_inode_dirty(inode);
- return result;
+ /* Writer: ->i_next_alloc* */
+ if (block == inode->u.ext2_i.i_next_alloc_block + 1) {
+ inode->u.ext2_i.i_next_alloc_block++;
+ inode->u.ext2_i.i_next_alloc_goal++;
+ }
+ /* Writer: end */
+ /* Reader: pointers, ->i_next_alloc* */
+ if (verify_chain(chain, partial)) {
+ /*
+ * try the heuristic for sequential allocation,
+ * failing that at least try to get decent locality.
+ */
+ if (block == inode->u.ext2_i.i_next_alloc_block)
+ goal = inode->u.ext2_i.i_next_alloc_goal;
+ if (!goal)
+ goal = ext2_find_near(inode, partial);
+ }
+ /* Reader: end */
+ return goal;
}
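
The Writer/Reader markers delimit the accesses that must stay consistent under the BKL; the heuristic itself is small enough to model in isolation. A sketch with hypothetical names, locking ignored:

	struct alloc_hint {
		long		next_block;	/* models ->i_next_alloc_block */
		unsigned long	next_goal;	/* models ->i_next_alloc_goal */
	};

	static unsigned long pick_goal(struct alloc_hint *h, long block,
				       unsigned long fallback)
	{
		if (block == h->next_block + 1) {	/* sequential append? */
			h->next_block++;
			h->next_goal++;		/* aim right after the last allocation */
		}
		if (block == h->next_block && h->next_goal)
			return h->next_goal;
		return fallback;		/* e.g. what ext2_find_near() returned */
	}
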
-/*
- * metadata / data
- * possibly create / access
- * can fail due to: - not present
- * - out of space
+/**
+ * ext2_alloc_branch - allocate and set up a chain of blocks.
+ * @inode: owner
+ * @num: depth of the chain (number of blocks to allocate)
+ * @offsets: offsets (in the blocks) to store the pointers to next.
+ * @branch: place to store the chain in.
*
- * NULL return in the data case is mandatory.
+ * This function allocates @num blocks, zeroes out all but the last one,
+ * links them into a chain and (if we are synchronous) writes them to disk.
+ * In other words, it prepares a branch that can be spliced onto the
+ * inode. It stores the information about that chain in branch[], in
+ * the same format as ext2_get_branch() would do. We call it after we
+ * have read the existing part of the chain; partial points to the last
+ * triple of that (the one with zero ->key). Upon exit we have the same
+ * picture as after a successful ext2_get_block(), except that in one
+ * place the chain is disconnected - *branch->p is still zero (we did
+ * not set the last link), but branch->key contains the number that
+ * should be placed into *branch->p to fill that gap.
+ *
+ * If allocation fails we free all blocks we've allocated (and forget
+ * their buffer_heads) and return the error value from the failed
+ * ext2_alloc_block() (normally -ENOSPC). Otherwise we set the chain
+ * as described above and return 0.
*/
-static struct buffer_head * block_getblk (struct inode * inode,
- struct buffer_head * bh, int nr,
- int new_block, int * err, int metadata, long *phys, int *new)
+
+static int ext2_alloc_branch(struct inode *inode,
+ int num,
+ unsigned long goal,
+ int *offsets,
+ Indirect *branch)
{
- int tmp, goal = 0;
- u32 * p;
- struct buffer_head * result;
int blocksize = inode->i_sb->s_blocksize;
+ int n = 0;
+ int err;
+ int i;
+ int parent = ext2_alloc_block(inode, goal, &err);
- result = NULL;
- if (!bh)
- goto out;
- if (!buffer_uptodate(bh)) {
- ll_rw_block (READ, 1, &bh);
- wait_on_buffer (bh);
+ branch[0].key = cpu_to_le32(parent);
+ if (parent) for (n = 1; n < num; n++) {
+ struct buffer_head *bh;
+ /* Allocate the next block */
+ int nr = ext2_alloc_block(inode, parent, &err);
+ if (!nr)
+ break;
+ branch[n].key = cpu_to_le32(nr);
+ /*
+ * Get buffer_head for parent block, zero it out and set
+ * the pointer to new one, then send parent to disk.
+ */
+ bh = getblk(inode->i_dev, parent, blocksize);
if (!buffer_uptodate(bh))
- goto out;
- }
- p = (u32 *) bh->b_data + nr;
-repeat:
- tmp = le32_to_cpu(*p);
- if (tmp) {
- if (metadata) {
- result = getblk (bh->b_dev, tmp, blocksize);
- if (tmp == le32_to_cpu(*p))
- goto out;
- brelse (result);
- goto repeat;
- } else {
- *phys = tmp;
- /* result == NULL */
- goto out;
+ wait_on_buffer(bh);
+ memset(bh->b_data, 0, blocksize);
+ branch[n].bh = bh;
+ branch[n].p = (u32*) bh->b_data + offsets[n];
+ *branch[n].p = branch[n].key;
+ mark_buffer_uptodate(bh, 1);
+ mark_buffer_dirty(bh, 1);
+ if (IS_SYNC(inode) || inode->u.ext2_i.i_osync) {
+ ll_rw_block (WRITE, 1, &bh);
+ wait_on_buffer (bh);
}
+ parent = nr;
}
+ if (n == num)
+ return 0;
- if (inode->u.ext2_i.i_next_alloc_block == new_block)
- goal = inode->u.ext2_i.i_next_alloc_goal;
- if (!goal) {
- for (tmp = nr - 1; tmp >= 0; tmp--) {
- if (le32_to_cpu(((u32 *) bh->b_data)[tmp])) {
- goal = le32_to_cpu(((u32 *)bh->b_data)[tmp]);
- break;
- }
- }
- if (!goal)
- goal = bh->b_blocknr;
- }
- tmp = ext2_alloc_block (inode, goal, err);
- if (!tmp)
- goto out;
- if (metadata) {
- result = getblk (bh->b_dev, tmp, blocksize);
- if (!buffer_uptodate(result))
- wait_on_buffer(result);
- memset(result->b_data, 0, inode->i_sb->s_blocksize);
- mark_buffer_uptodate(result, 1);
- mark_buffer_dirty(result, 1);
- if (*p) {
- ext2_free_blocks (inode, tmp, 1);
- bforget (result);
- goto repeat;
- }
- } else {
- if (*p) {
- /*
- * Nobody is allowed to change block allocation
- * state from under us:
- */
- ext2_error (inode->i_sb, "block_getblk",
- "data block filled under us");
- BUG();
- ext2_free_blocks (inode, tmp, 1);
- goto repeat;
+ /* Allocation failed, free what we already allocated */
+ for (i = 1; i < n; i++)
+ bforget(branch[i].bh);
+ for (i = 0; i < n; i++)
+ ext2_free_blocks(inode, le32_to_cpu(branch[i].key), 1);
+ return err;
+}
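
To make the branch[] contract concrete: suppose the IND pointer in i_data is still zero, so ext2_get_branch() stopped at chain[0] and this function is entered with num == 2. The block numbers below are made up:

	/* after a successful ext2_alloc_branch(inode, 2, goal, offsets, chain):
	 *
	 *	chain[0].key = cpu_to_le32(5000)   new indirect block
	 *	chain[0].p   = &i_data[12]         still zero - the missing link
	 *	chain[1].key = cpu_to_le32(5001)   new data block
	 *	chain[1].bh  = buffer of block 5000, zeroed out
	 *	chain[1].p   = slot offsets[1] in that buffer, already filled
	 *
	 * only *chain[0].p is left unset; ext2_splice_branch() will fill it.
	 */
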
+
+/**
+ * ext2_splice_branch - splice the allocated branch onto inode.
+ * @inode: owner
+ * @block: (logical) number of block we are adding
+ * @chain: chain of indirect blocks (with a missing link - see
+ * ext2_alloc_branch)
+ * @where: location of missing link
+ * @num: number of blocks we are adding
+ *
+ * This function verifies that the chain (up to the missing link) has not
+ * changed, fills the missing link and does all housekeeping needed in
+ * inode (->i_blocks, etc.). In case of success we end up with the full
+ * chain to the new block and return 0. Otherwise (== chain had been changed)
+ * we free the new blocks (forgetting their buffer_heads, indeed) and
+ * return -EAGAIN.
+ */
+
+static inline int ext2_splice_branch(struct inode *inode,
+ long block,
+ Indirect chain[4],
+ Indirect *where,
+ int num)
+{
+ int i;
+
+ /* Verify that place we are splicing to is still there and vacant */
+
+ /* Writer: pointers, ->i_next_alloc*, ->i_blocks */
+ if (!verify_chain(chain, where-1) || *where->p)
+ /* Writer: end */
+ goto changed;
+
+ /* That's it */
+
+ *where->p = where->key;
+ inode->u.ext2_i.i_next_alloc_block = block;
+ inode->u.ext2_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key);
+ inode->i_blocks += num * inode->i_sb->s_blocksize/512;
+
+ /* Writer: end */
+
+ /* We are done with atomic stuff, now do the rest of housekeeping */
+
+ inode->i_ctime = CURRENT_TIME;
+
+ /* had we spliced it onto indirect block? */
+ if (where->bh) {
+ mark_buffer_dirty(where->bh, 1);
+ if (IS_SYNC(inode) || inode->u.ext2_i.i_osync) {
+ ll_rw_block (WRITE, 1, &where->bh);
+ wait_on_buffer(where->bh);
}
- *phys = tmp;
- *new = 1;
- }
- *p = le32_to_cpu(tmp);
- mark_buffer_dirty(bh, 1);
- if (IS_SYNC(inode) || inode->u.ext2_i.i_osync) {
- ll_rw_block (WRITE, 1, &bh);
- wait_on_buffer (bh);
}
- inode->i_ctime = CURRENT_TIME;
- inode->i_blocks += blocksize/512;
- mark_inode_dirty(inode);
- inode->u.ext2_i.i_next_alloc_block = new_block;
- inode->u.ext2_i.i_next_alloc_goal = tmp;
- *err = 0;
-out:
- brelse (bh);
- return result;
+
+ if (IS_SYNC(inode) || inode->u.ext2_i.i_osync)
+ ext2_sync_inode (inode);
+ else
+ mark_inode_dirty(inode);
+ return 0;
+
+changed:
+ for (i = 1; i < num; i++)
+ bforget(where[i].bh);
+ for (i = 0; i < num; i++)
+ ext2_free_blocks(inode, le32_to_cpu(where[i].key), 1);
+ return -EAGAIN;
}
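
Two details above are easy to misread; spelling them out (annotation, not part of the patch):

	/* unwind on 'changed', mirroring ext2_alloc_branch():
	 *   bforget() starts at where[1] - where[0].bh is the pre-existing
	 *   parent buffer (NULL when the pointer lives in i_data) and stays;
	 *   ext2_free_blocks() drops all num new blocks, where[0].key included.
	 *
	 * accounting: i_blocks is in 512-byte sectors, so splicing a 2-block
	 * branch on a 1024-byte-block fs adds 2 * 1024 / 512 == 4.
	 */
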
+/*
+ * Allocation strategy is simple: if we have to allocate something, we will
+ * have to go the whole way to the leaf. So let's do it before attaching anything
+ * to tree, set linkage between the newborn blocks, write them if sync is
+ * required, recheck the path, free and repeat if check fails, otherwise
+ * set the last missing link (that will protect us from any truncate-generated
+ * removals - all blocks on the path are immune now) and possibly force the
+ * write on the parent block.
+ * That has a nice additional property: no special recovery from the failed
+ * allocations is needed - we simply release blocks and do not touch anything
+ * reachable from inode.
+ */
+
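
The goto-based body below implements what is, flattened out, a simple retry loop (annotation sketch, error paths elided):

	/* equivalent control flow of ext2_get_block(), errors elided:
	 *
	 *	for (;;) {
	 *		partial = ext2_get_branch(inode, depth, offsets, chain, &err);
	 *		if (!partial)
	 *			break;			// all blocks already present
	 *		if (!create)
	 *			break;			// plain lookup, report a hole
	 *		ext2_alloc_branch(...);		// build a detached branch
	 *		if (ext2_splice_branch(...) == 0)
	 *			break;			// linked in - done
	 *		// chain changed under us: splice already freed the new
	 *		// branch; forget what we read and go around again
	 *	}
	 */
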
static int ext2_get_block(struct inode *inode, long iblock, struct buffer_head *bh_result, int create)
{
- int ret, err, new;
- struct buffer_head *bh;
- unsigned long phys;
+ int err = -EIO;
int offsets[4];
- int *p;
Indirect chain[4];
Indirect *partial;
- int depth;
+ unsigned long goal;
+ int left;
+ int depth = ext2_block_to_path(inode, iblock, offsets);
- depth = ext2_block_to_path(inode, iblock, offsets);
if (depth == 0)
- goto abort;
+ goto out;
lock_kernel();
+reread:
partial = ext2_get_branch(inode, depth, offsets, chain, &err);
+ /* Simplest case - block found, no allocation needed */
if (!partial) {
- unlock_kernel();
- for (partial = chain + depth - 1; partial > chain; partial--)
- brelse(partial->bh);
+got_it:
bh_result->b_dev = inode->i_dev;
bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key);
bh_result->b_state |= (1UL << BH_Mapped);
- return 0;
+ /* Clean up and exit */
+ partial = chain+depth-1; /* the whole chain */
+ goto cleanup;
}
- while (partial > chain) {
- brelse(partial->bh);
- partial--;
- }
-
- if (!create) {
+ /* Next simple case - plain lookup or failed read of indirect block */
+ if (!create || err == -EIO) {
+cleanup:
+ while (partial > chain) {
+ brelse(partial->bh);
+ partial--;
+ }
unlock_kernel();
- return 0;
+out:
+ return err;
}
- err = -EIO;
- new = 0;
- ret = 0;
- bh = NULL;
-
/*
- * If this is a sequential block allocation, set the next_alloc_block
- * to this block now so that all the indblock and data block
- * allocations use the same goal zone
+	 * An indirect block might be removed by truncate while we were
+ * reading it. Handling of that case (forget what we've got and
+ * reread) is taken out of the main path.
*/
+ if (err == -EAGAIN)
+ goto changed;
- ext2_debug ("block %lu, next %lu, goal %lu.\n", iblock,
- inode->u.ext2_i.i_next_alloc_block,
- inode->u.ext2_i.i_next_alloc_goal);
-
- if (iblock == inode->u.ext2_i.i_next_alloc_block + 1) {
- inode->u.ext2_i.i_next_alloc_block++;
- inode->u.ext2_i.i_next_alloc_goal++;
- }
+ goal = ext2_find_goal(inode, iblock, chain, partial);
+ if (!goal)
+ goto changed;
- err = 0;
+ left = (chain + depth) - partial;
+ err = ext2_alloc_branch(inode, left, goal,
+ offsets+(partial-chain), partial);
+ if (err)
+ goto cleanup;
- /*
- * ok, these macros clean the logic up a bit and make
- * it much more readable:
- */
-#define GET_INODE_DATABLOCK(x) \
- inode_getblk(inode, x, iblock, &err, 0, &phys, &new)
-#define GET_INODE_PTR(x) \
- inode_getblk(inode, x, iblock, &err, 1, NULL, NULL)
-#define GET_INDIRECT_DATABLOCK(x) \
- block_getblk (inode, bh, x, iblock, &err, 0, &phys, &new);
-#define GET_INDIRECT_PTR(x) \
- block_getblk (inode, bh, x, iblock, &err, 1, NULL, NULL);
-
- p = offsets;
- if (depth == 1) {
- bh = GET_INODE_DATABLOCK(*p);
- goto out;
- }
- bh = GET_INODE_PTR(*p);
- switch (depth) {
- default: /* case 4: */
- bh = GET_INDIRECT_PTR(*++p);
- case 3:
- bh = GET_INDIRECT_PTR(*++p);
- case 2:
- bh = GET_INDIRECT_DATABLOCK(*++p);
- }
+ if (ext2_splice_branch(inode, iblock, chain, partial, left) < 0)
+ goto changed;
-#undef GET_INODE_DATABLOCK
-#undef GET_INODE_PTR
-#undef GET_INDIRECT_DATABLOCK
-#undef GET_INDIRECT_PTR
+ bh_result->b_state |= (1UL << BH_New);
+ goto got_it;
-out:
- if (bh)
- BUG(); // temporary debugging check
- if (err)
- goto abort;
- if (!phys)
- BUG(); // must not happen either
-
- bh_result->b_dev = inode->i_dev;
- bh_result->b_blocknr = phys;
- bh_result->b_state |= (1UL << BH_Mapped); /* safe */
- if (new)
- bh_result->b_state |= (1UL << BH_New);
- unlock_kernel();
-abort:
- return err;
+changed:
+ while (partial > chain) {
+ bforget(partial->bh);
+ partial--;
+ }
+ goto reread;
}
struct buffer_head * ext2_getblk(struct inode * inode, long block, int create, int * err)
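
The line above opens ext2_getblk(), which this patch rewrites elsewhere to drive the new interface: ext2_get_block() reports its result through the buffer_head it is handed, so a caller that wants an actual buffer can pass in a dummy one and inspect its state afterwards. A sketch of that calling pattern (paraphrased from the rest of the patch, not a quoted hunk):

	struct buffer_head *example_getblk(struct inode *inode, long block,
					   int create, int *err)
	{
		struct buffer_head dummy;

		dummy.b_state = 0;
		dummy.b_blocknr = -1000;		/* poison value */
		*err = ext2_get_block(inode, block, &dummy, create);
		if (!*err && buffer_mapped(&dummy)) {
			struct buffer_head *bh;
			bh = getblk(dummy.b_dev, dummy.b_blocknr,
				    inode->i_sb->s_blocksize);
			if (buffer_new(&dummy)) {	/* BH_New: fresh block, zero it */
				memset(bh->b_data, 0, inode->i_sb->s_blocksize);
				mark_buffer_uptodate(bh, 1);
				mark_buffer_dirty(bh, 1);
			}
			return bh;
		}
		return NULL;
	}
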
@@ -833,7 +832,7 @@ void ext2_read_inode (struct inode * inode)
inode->i_attr_flags = 0;
if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL) {
inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS;
- inode->i_flags |= MS_SYNCHRONOUS;
+ inode->i_flags |= S_SYNC;
}
if (inode->u.ext2_i.i_flags & EXT2_APPEND_FL) {
inode->i_attr_flags |= ATTR_FLAG_APPEND;
@@ -845,7 +844,7 @@ void ext2_read_inode (struct inode * inode)
}
if (inode->u.ext2_i.i_flags & EXT2_NOATIME_FL) {
inode->i_attr_flags |= ATTR_FLAG_NOATIME;
- inode->i_flags |= MS_NOATIME;
+ inode->i_flags |= S_NOATIME;
}
return;
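
The substitutions in these hunks are not cosmetic: MS_SYNCHRONOUS and MS_NOATIME are mount (superblock) flags, while i_flags holds the per-inode S_* bits. The IS_SYNC()/IS_NOATIME() predicates then honour both levels, roughly like this (sketch from memory, not the exact 2.4 fs.h text):

	#define IS_SYNC(inode)    (((inode)->i_sb->s_flags & MS_SYNCHRONOUS) || \
	                           ((inode)->i_flags & S_SYNC))
	#define IS_NOATIME(inode) (((inode)->i_sb->s_flags & MS_NOATIME) || \
	                           ((inode)->i_flags & S_NOATIME))
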
@@ -999,17 +998,17 @@ int ext2_notify_change(struct dentry *dentry, struct iattr *iattr)
flags = iattr->ia_attr_flags;
if (flags & ATTR_FLAG_SYNCRONOUS) {
- inode->i_flags |= MS_SYNCHRONOUS;
+ inode->i_flags |= S_SYNC;
inode->u.ext2_i.i_flags |= EXT2_SYNC_FL;
} else {
- inode->i_flags &= ~MS_SYNCHRONOUS;
+ inode->i_flags &= ~S_SYNC;
inode->u.ext2_i.i_flags &= ~EXT2_SYNC_FL;
}
if (flags & ATTR_FLAG_NOATIME) {
- inode->i_flags |= MS_NOATIME;
+ inode->i_flags |= S_NOATIME;
inode->u.ext2_i.i_flags |= EXT2_NOATIME_FL;
} else {
- inode->i_flags &= ~MS_NOATIME;
+ inode->i_flags &= ~S_NOATIME;
inode->u.ext2_i.i_flags &= ~EXT2_NOATIME_FL;
}
if (flags & ATTR_FLAG_APPEND) {