Diffstat (limited to 'fs')
-rw-r--r--    fs/autofs4/root.c      2
-rw-r--r--    fs/buffer.c           47
-rw-r--r--    fs/dcache.c           13
-rw-r--r--    fs/exec.c             16
-rw-r--r--    fs/ext2/ialloc.c       2
-rw-r--r--    fs/ext2/inode.c      545
-rw-r--r--    fs/ext2/ioctl.c        8
-rw-r--r--    fs/namei.c             2
-rw-r--r--    fs/nfs/inode.c         4
-rw-r--r--    fs/stat.c              3
-rw-r--r--    fs/umsdos/emd.c       20
-rw-r--r--    fs/umsdos/inode.c     26
-rw-r--r--    fs/umsdos/ioctl.c     10
-rw-r--r--    fs/umsdos/namei.c     13
14 files changed, 371 insertions, 340 deletions
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index f3c3e9e12..72b5e1b27 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -400,7 +400,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
spin_unlock(&dcache_lock);
return -ENOTEMPTY;
}
- list_del(&dentry->d_hash);
+ list_del_init(&dentry->d_hash);
spin_unlock(&dcache_lock);
dput(ino->dentry);
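The list_del() → list_del_init() conversion in this hunk repeats throughout the commit (fs/dcache.c below replaces several open-coded list_del() + INIT_LIST_HEAD() pairs the same way). Plain list_del() leaves the removed entry's next/prev pointers aimed into the old list, so a later list_empty() test on that entry reads stale pointers; list_del_init() relinks the entry to itself. A minimal userspace sketch of the two primitives, modeled on include/linux/list.h:

#include <assert.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *e, struct list_head *head)
{
	e->next = head->next;
	e->prev = head;
	head->next->prev = e;
	head->next = e;
}

static void list_del(struct list_head *e)      /* unlink; pointers go stale */
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
}

static void list_del_init(struct list_head *e) /* unlink and self-link */
{
	list_del(e);
	INIT_LIST_HEAD(e);
}

static int list_empty(struct list_head *h) { return h->next == h; }

int main(void)
{
	struct list_head hash_chain, dentry;

	INIT_LIST_HEAD(&hash_chain);
	list_add(&dentry, &hash_chain);

	list_del_init(&dentry);
	/* After plain list_del() this test would read stale pointers;
	 * after list_del_init() an unhashed entry is safely detectable. */
	assert(list_empty(&dentry));
	printf("entry is cleanly self-linked after removal\n");
	return 0;
}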
diff --git a/fs/buffer.c b/fs/buffer.c
index 341dfe591..ad0a04e68 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -482,16 +482,12 @@ static void __remove_from_queues(struct buffer_head *bh)
__remove_from_lru_list(bh, bh->b_list);
}
-static void insert_into_queues(struct buffer_head *bh)
+static void __insert_into_queues(struct buffer_head *bh)
{
struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
- spin_lock(&lru_list_lock);
- write_lock(&hash_table_lock);
__hash_link(bh, head);
__insert_into_lru_list(bh, bh->b_list);
- write_unlock(&hash_table_lock);
- spin_unlock(&lru_list_lock);
}
/* This function must only run if there are no other
@@ -524,19 +520,27 @@ static void put_last_free(struct buffer_head * bh)
* will force it bad). This shouldn't really happen currently, but
* the code is ready.
*/
-struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
+static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
{
- struct buffer_head **head = &hash(dev, block);
- struct buffer_head *bh;
+ struct buffer_head *bh = hash(dev, block);
- read_lock(&hash_table_lock);
- for(bh = *head; bh; bh = bh->b_next)
+ for (; bh; bh = bh->b_next)
if (bh->b_blocknr == block &&
bh->b_size == size &&
bh->b_dev == dev)
break;
if (bh)
atomic_inc(&bh->b_count);
+
+ return bh;
+}
+
+struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
+{
+ struct buffer_head *bh;
+
+ read_lock(&hash_table_lock);
+ bh = __get_hash_table(dev, block, size);
read_unlock(&hash_table_lock);
return bh;
@@ -804,7 +808,9 @@ struct buffer_head * getblk(kdev_t dev, int block, int size)
int isize;
repeat:
- bh = get_hash_table(dev, block, size);
+ spin_lock(&lru_list_lock);
+ write_lock(&hash_table_lock);
+ bh = __get_hash_table(dev, block, size);
if (bh)
goto out;
@@ -829,9 +835,10 @@ repeat:
bh->b_state = 1 << BH_Mapped;
/* Insert the buffer into the regular lists */
- insert_into_queues(bh);
+ __insert_into_queues(bh);
out:
- touch_buffer(bh);
+ write_unlock(&hash_table_lock);
+ spin_unlock(&lru_list_lock);
return bh;
}
@@ -839,6 +846,8 @@ repeat:
* If we block while refilling the free list, somebody may
* create the buffer first ... search the hashes again.
*/
+ write_unlock(&hash_table_lock);
+ spin_unlock(&lru_list_lock);
refill_freelist(size);
goto repeat;
}
@@ -2118,6 +2127,11 @@ out:
*
* This all is required so that we can free up memory
* later.
+ *
+ * Wait:
+ * 0 - no wait (this does not get called - see try_to_free_buffers below)
+ * 1 - start IO for dirty buffers
+ * 2 - wait for completion of locked buffers
*/
static void sync_page_buffers(struct buffer_head *bh, int wait)
{
@@ -2127,7 +2141,7 @@ static void sync_page_buffers(struct buffer_head *bh, int wait)
struct buffer_head *p = tmp;
tmp = tmp->b_this_page;
if (buffer_locked(p)) {
- if (wait)
+ if (wait > 1)
__wait_on_buffer(p);
} else if (buffer_dirty(p))
ll_rw_block(WRITE, 1, &p);
@@ -2200,8 +2214,9 @@ busy_buffer_page:
/* Uhhuh, start writeback so that we don't end up with all dirty pages */
spin_unlock(&free_list[index].lock);
write_unlock(&hash_table_lock);
- spin_unlock(&lru_list_lock);
- sync_page_buffers(bh, wait);
+ spin_unlock(&lru_list_lock);
+ if (wait)
+ sync_page_buffers(bh, wait);
return 0;
}
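The buffer.c changes close a lookup/insert race in getblk(): get_hash_table() used to take and drop hash_table_lock, and insert_into_queues() took it again, leaving a window in which two callers could both miss in the hash and insert duplicate buffers for the same (dev, block). Both become lock-free __-prefixed helpers, and getblk() now holds lru_list_lock and hash_table_lock across the whole lookup-or-insert sequence, dropping them only before the blocking refill_freelist() call and retrying. A sketch of the pattern under a single hypothetical mutex (names are illustrative, not the kernel's):

#include <pthread.h>
#include <stdlib.h>

struct buf { int dev, block; struct buf *next; };

static struct buf *hash_head;                 /* one bucket, for brevity */
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* __lookup: caller must hold table_lock (the __get_hash_table analogue) */
static struct buf *__lookup(int dev, int block)
{
	struct buf *b;
	for (b = hash_head; b; b = b->next)
		if (b->dev == dev && b->block == block)
			return b;
	return NULL;
}

/* Lookup and insert share one critical section, so a concurrent caller
 * can never miss and insert a duplicate for the same (dev, block). */
struct buf *get_or_insert(int dev, int block)
{
	pthread_mutex_lock(&table_lock);
	struct buf *b = __lookup(dev, block);
	if (!b) {
		b = calloc(1, sizeof(*b));
		if (b) {
			b->dev = dev;
			b->block = block;
			b->next = hash_head;
			hash_head = b;
		}
	}
	pthread_mutex_unlock(&table_lock);
	return b;
}

As in the real getblk(), anything that can block must happen outside the critical section, which is why the failure path in the hunk above unlocks both locks before refill_freelist() and redoes the locked lookup via goto repeat.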
diff --git a/fs/dcache.c b/fs/dcache.c
index 9be3e8cdc..e2bcbe6a3 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -81,8 +81,7 @@ static inline void dentry_iput(struct dentry * dentry)
struct inode *inode = dentry->d_inode;
if (inode) {
dentry->d_inode = NULL;
- list_del(&dentry->d_alias);
- INIT_LIST_HEAD(&dentry->d_alias);
+ list_del_init(&dentry->d_alias);
spin_unlock(&dcache_lock);
if (dentry->d_op && dentry->d_op->d_iput)
dentry->d_op->d_iput(dentry, inode);
@@ -153,7 +152,7 @@ repeat:
return;
unhash_it:
- list_del(&dentry->d_hash);
+ list_del_init(&dentry->d_hash);
kill_it: {
struct dentry *parent;
@@ -218,8 +217,7 @@ int d_invalidate(struct dentry * dentry)
}
}
- list_del(&dentry->d_hash);
- INIT_LIST_HEAD(&dentry->d_hash);
+ list_del_init(&dentry->d_hash);
spin_unlock(&dcache_lock);
return 0;
}
@@ -307,7 +305,7 @@ static inline void prune_one_dentry(struct dentry * dentry)
{
struct dentry * parent;
- list_del(&dentry->d_hash);
+ list_del_init(&dentry->d_hash);
list_del(&dentry->d_child);
dentry_iput(dentry);
parent = dentry->d_parent;
@@ -342,8 +340,7 @@ void prune_dcache(int count)
if (tmp == &dentry_unused)
break;
dentry_stat.nr_unused--;
- list_del(tmp);
- INIT_LIST_HEAD(tmp);
+ list_del_init(tmp);
dentry = list_entry(tmp, struct dentry, d_lru);
/* Unused dentry with a count? */
diff --git a/fs/exec.c b/fs/exec.c
index f7745f9de..d162f8852 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -320,9 +320,11 @@ int setup_arg_pages(struct linux_binprm *bprm)
}
for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
- if (bprm->page[i]) {
+ struct page *page = bprm->page[i];
+ if (page) {
+ bprm->page[i] = NULL;
current->mm->rss++;
- put_dirty_page(current,bprm->page[i],stack_base);
+ put_dirty_page(current,page,stack_base);
}
stack_base += PAGE_SIZE;
}
@@ -873,11 +875,11 @@ out:
if (bprm.file)
fput(bprm.file);
- /* Assumes that free_page() can take a NULL argument. */
- /* I hope this is ok for all architectures */
- for (i = 0 ; i < MAX_ARG_PAGES ; i++)
- if (bprm.page[i])
- __free_page(bprm.page[i]);
+ for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
+ struct page * page = bprm.page[i];
+ if (page)
+ __free_page(page);
+ }
return retval;
}
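Both exec.c hunks tighten page ownership: setup_arg_pages() now clears bprm->page[i] before handing the page to put_dirty_page(), so the cleanup loop in the second hunk can never free a page whose ownership has already been transferred to the new mm. A small userspace sketch of the transfer-then-NULL pattern (all names hypothetical):

#include <stdlib.h>

#define MAX_ARG_PAGES 32

/* Hands the page to its new owner (stand-in for put_dirty_page()). */
static void transfer(void **new_owner, void *page)
{
	*new_owner = page;
}

void hand_off(void *pages[MAX_ARG_PAGES], void *dest[MAX_ARG_PAGES])
{
	for (int i = 0; i < MAX_ARG_PAGES; i++) {
		void *page = pages[i];
		if (page) {
			pages[i] = NULL;        /* source array no longer owns it */
			transfer(&dest[i], page);
		}
	}
}

/* Error-path cleanup stays a dumb loop: anything still in the source
 * array is ours to free; anything handed off is already NULL here. */
void cleanup(void *pages[MAX_ARG_PAGES])
{
	for (int i = 0; i < MAX_ARG_PAGES; i++)
		free(pages[i]);                 /* free(NULL) is a no-op */
}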
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 3c95ccd70..c99f7f2c8 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -453,7 +453,7 @@ repeat:
inode->u.ext2_i.i_dtime = 0;
inode->u.ext2_i.i_block_group = i;
if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL)
- inode->i_flags |= MS_SYNCHRONOUS;
+ inode->i_flags |= S_SYNC;
insert_inode_hash(inode);
inode->i_generation = event++;
mark_inode_dirty(inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index c7234e7b5..678eb4d28 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -18,6 +18,8 @@
* David S. Miller (davem@caip.rutgers.edu), 1995
* 64-bit file support on 64-bit platforms by Jakub Jelinek
* (jj@sunsite.ms.mff.cuni.cz)
+ *
+ * Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000
*/
#include <linux/fs.h>
@@ -26,8 +28,6 @@
#include <linux/sched.h>
#include <linux/highuid.h>
-
-
static int ext2_update_inode(struct inode * inode, int do_sync);
/*
@@ -64,23 +64,18 @@ no_delete:
clear_inode(inode); /* We must guarantee clearing of inode... */
}
-/*
- * ext2_discard_prealloc and ext2_alloc_block are atomic wrt. the
- * superblock in the same manner as are ext2_free_blocks and
- * ext2_new_block. We just wait on the super rather than locking it
- * here, since ext2_new_block will do the necessary locking and we
- * can't block until then.
- */
void ext2_discard_prealloc (struct inode * inode)
{
#ifdef EXT2_PREALLOCATE
- unsigned short total;
-
lock_kernel();
+ /* Writer: ->i_prealloc* */
if (inode->u.ext2_i.i_prealloc_count) {
- total = inode->u.ext2_i.i_prealloc_count;
+ unsigned short total = inode->u.ext2_i.i_prealloc_count;
+ unsigned long block = inode->u.ext2_i.i_prealloc_block;
inode->u.ext2_i.i_prealloc_count = 0;
- ext2_free_blocks (inode, inode->u.ext2_i.i_prealloc_block, total);
+ inode->u.ext2_i.i_prealloc_block = 0;
+ /* Writer: end */
+ ext2_free_blocks (inode, block, total);
}
unlock_kernel();
#endif
@@ -93,22 +88,26 @@ static int ext2_alloc_block (struct inode * inode, unsigned long goal, int *err)
#endif
unsigned long result;
- wait_on_super (inode->i_sb);
#ifdef EXT2_PREALLOCATE
+ /* Writer: ->i_prealloc* */
if (inode->u.ext2_i.i_prealloc_count &&
(goal == inode->u.ext2_i.i_prealloc_block ||
goal + 1 == inode->u.ext2_i.i_prealloc_block))
{
result = inode->u.ext2_i.i_prealloc_block++;
inode->u.ext2_i.i_prealloc_count--;
+ /* Writer: end */
+#ifdef EXT2FS_DEBUG
ext2_debug ("preallocation hit (%lu/%lu).\n",
++alloc_hits, ++alloc_attempts);
-
+#endif
} else {
ext2_discard_prealloc (inode);
+#ifdef EXT2FS_DEBUG
ext2_debug ("preallocation miss (%lu/%lu).\n",
alloc_hits, ++alloc_attempts);
+#endif
if (S_ISREG(inode->i_mode))
result = ext2_new_block (inode, goal,
&inode->u.ext2_i.i_prealloc_count,
@@ -299,307 +298,307 @@ no_block:
return p;
}
-static struct buffer_head * inode_getblk (struct inode * inode, int nr,
- int new_block, int * err, int metadata, long *phys, int *new)
+/**
+ * ext2_find_near - find a place for allocation with sufficient locality
+ * @inode: owner
+ * @ind: descriptor of indirect block.
+ *
+ * This function returns the preferred place for block allocation.
+ * It is used when heuristic for sequential allocation fails.
+ * Rules are:
+ * + if there is a block to the left of our position - allocate near it.
+ * + if pointer will live in indirect block - allocate near that block.
+ * + if pointer will live in inode - allocate in the same cylinder group.
+ * Caller must make sure that @ind is valid and will stay that way.
+ */
+
+static inline unsigned long ext2_find_near(struct inode *inode, Indirect *ind)
{
- u32 * p;
- int tmp, goal = 0;
- struct buffer_head * result;
- int blocksize = inode->i_sb->s_blocksize;
+ u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext2_i.i_data;
+ u32 *p;
- p = inode->u.ext2_i.i_data + nr;
-repeat:
- tmp = le32_to_cpu(*p);
- if (tmp) {
- if (metadata) {
- result = getblk (inode->i_dev, tmp, blocksize);
- if (tmp == le32_to_cpu(*p))
- return result;
- brelse (result);
- goto repeat;
- } else {
- *phys = tmp;
- return NULL;
- }
- }
+ /* Try to find previous block */
+ for (p = ind->p - 1; p >= start; p--)
+ if (*p)
+ return le32_to_cpu(*p);
- if (inode->u.ext2_i.i_next_alloc_block == new_block)
- goal = inode->u.ext2_i.i_next_alloc_goal;
+ /* No such thing, so let's try location of indirect block */
+ if (ind->bh)
+ return ind->bh->b_blocknr;
- ext2_debug ("hint = %d,", goal);
+ /*
+ * Is it going to be referred to from the inode itself? OK, just put it into
+ * the same cylinder group then.
+ */
+ return (inode->u.ext2_i.i_block_group *
+ EXT2_BLOCKS_PER_GROUP(inode->i_sb)) +
+ le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_first_data_block);
+}
- if (!goal) {
- for (tmp = nr - 1; tmp >= 0; tmp--) {
- if (inode->u.ext2_i.i_data[tmp]) {
- goal = le32_to_cpu(inode->u.ext2_i.i_data[tmp]);
- break;
- }
- }
- if (!goal)
- goal = (inode->u.ext2_i.i_block_group *
- EXT2_BLOCKS_PER_GROUP(inode->i_sb)) +
- le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_first_data_block);
- }
+/**
+ * ext2_find_goal - find a preferred place for allocation.
+ * @inode: owner
+ * @block: block we want
+ * @chain: chain of indirect blocks
+ * @partial: pointer to the last triple within a chain.
+ *
+ * This function returns the preferred place for block allocation.
+ */
- ext2_debug ("goal = %d.\n", goal);
-
- tmp = ext2_alloc_block (inode, goal, err);
- if (!tmp)
- return NULL;
-
- if (metadata) {
- result = getblk (inode->i_dev, tmp, blocksize);
- if (!buffer_uptodate(result))
- wait_on_buffer(result);
- memset(result->b_data, 0, blocksize);
- mark_buffer_uptodate(result, 1);
- mark_buffer_dirty(result, 1);
- if (*p) {
- ext2_free_blocks (inode, tmp, 1);
- bforget (result);
- goto repeat;
- }
- } else {
- if (*p) {
- /*
- * Nobody is allowed to change block allocation
- * state from under us:
- */
- ext2_error (inode->i_sb, "block_getblk",
- "data block filled under us");
- BUG();
- ext2_free_blocks (inode, tmp, 1);
- goto repeat;
- }
- *phys = tmp;
- result = NULL;
- *err = 0;
- *new = 1;
- }
- *p = cpu_to_le32(tmp);
+static inline unsigned long ext2_find_goal(struct inode *inode,
+ long block,
+ Indirect chain[4],
+ Indirect *partial)
+{
+ unsigned long goal = 0;
- inode->u.ext2_i.i_next_alloc_block = new_block;
- inode->u.ext2_i.i_next_alloc_goal = tmp;
- inode->i_ctime = CURRENT_TIME;
- inode->i_blocks += blocksize/512;
- if (IS_SYNC(inode) || inode->u.ext2_i.i_osync)
- ext2_sync_inode (inode);
- else
- mark_inode_dirty(inode);
- return result;
+ /* Writer: ->i_next_alloc* */
+ if (block == inode->u.ext2_i.i_next_alloc_block + 1) {
+ inode->u.ext2_i.i_next_alloc_block++;
+ inode->u.ext2_i.i_next_alloc_goal++;
+ }
+ /* Writer: end */
+ /* Reader: pointers, ->i_next_alloc* */
+ if (verify_chain(chain, partial)) {
+ /*
+ * try the heuristic for sequential allocation,
+ * failing that at least try to get decent locality.
+ */
+ if (block == inode->u.ext2_i.i_next_alloc_block)
+ goal = inode->u.ext2_i.i_next_alloc_goal;
+ if (!goal)
+ goal = ext2_find_near(inode, partial);
+ }
+ /* Reader: end */
+ return goal;
}
-/*
- * metadata / data
- * possibly create / access
- * can fail due to: - not present
- * - out of space
+/**
+ * ext2_alloc_branch - allocate and set up a chain of blocks.
+ * @inode: owner
+ * @num: depth of the chain (number of blocks to allocate)
+ * @offsets: offsets (in the blocks) to store the pointers to next.
+ * @branch: place to store the chain in.
*
- * NULL return in the data case is mandatory.
+ * This function allocates @num blocks, zeroes out all but the last one,
+ * links them into chain and (if we are synchronous) writes them to disk.
+ * In other words, it prepares a branch that can be spliced onto the
+ * inode. It stores the information about that chain in the branch[], in
+ * the same format as ext2_get_branch() would. We call it after we have
+ * read the existing part of the chain, with partial pointing to the last
+ * triple of that chain (the one with a zero ->key). Upon exit we have the same
+ * picture as after a successful ext2_get_block(), except that in one
+ * place the chain is disconnected - *branch->p is still zero (we did not
+ * set the last link), but branch->key contains the number that should
+ * be placed into *branch->p to fill that gap.
+ *
+ * If allocation fails we free all blocks we've allocated (and forget
+ * their buffer_heads) and return the error value from the failed
+ * ext2_alloc_block() (normally -ENOSPC). Otherwise we set the chain
+ * as described above and return 0.
*/
-static struct buffer_head * block_getblk (struct inode * inode,
- struct buffer_head * bh, int nr,
- int new_block, int * err, int metadata, long *phys, int *new)
+
+static int ext2_alloc_branch(struct inode *inode,
+ int num,
+ unsigned long goal,
+ int *offsets,
+ Indirect *branch)
{
- int tmp, goal = 0;
- u32 * p;
- struct buffer_head * result;
int blocksize = inode->i_sb->s_blocksize;
+ int n = 0;
+ int err;
+ int i;
+ int parent = ext2_alloc_block(inode, goal, &err);
- result = NULL;
- if (!bh)
- goto out;
- if (!buffer_uptodate(bh)) {
- ll_rw_block (READ, 1, &bh);
- wait_on_buffer (bh);
+ branch[0].key = cpu_to_le32(parent);
+ if (parent) for (n = 1; n < num; n++) {
+ struct buffer_head *bh;
+ /* Allocate the next block */
+ int nr = ext2_alloc_block(inode, parent, &err);
+ if (!nr)
+ break;
+ branch[n].key = cpu_to_le32(nr);
+ /*
+ * Get buffer_head for parent block, zero it out and set
+ * the pointer to new one, then send parent to disk.
+ */
+ bh = getblk(inode->i_dev, parent, blocksize);
if (!buffer_uptodate(bh))
- goto out;
- }
- p = (u32 *) bh->b_data + nr;
-repeat:
- tmp = le32_to_cpu(*p);
- if (tmp) {
- if (metadata) {
- result = getblk (bh->b_dev, tmp, blocksize);
- if (tmp == le32_to_cpu(*p))
- goto out;
- brelse (result);
- goto repeat;
- } else {
- *phys = tmp;
- /* result == NULL */
- goto out;
+ wait_on_buffer(bh);
+ memset(bh->b_data, 0, blocksize);
+ branch[n].bh = bh;
+ branch[n].p = (u32*) bh->b_data + offsets[n];
+ *branch[n].p = branch[n].key;
+ mark_buffer_uptodate(bh, 1);
+ mark_buffer_dirty(bh, 1);
+ if (IS_SYNC(inode) || inode->u.ext2_i.i_osync) {
+ ll_rw_block (WRITE, 1, &bh);
+ wait_on_buffer (bh);
}
+ parent = nr;
}
+ if (n == num)
+ return 0;
- if (inode->u.ext2_i.i_next_alloc_block == new_block)
- goal = inode->u.ext2_i.i_next_alloc_goal;
- if (!goal) {
- for (tmp = nr - 1; tmp >= 0; tmp--) {
- if (le32_to_cpu(((u32 *) bh->b_data)[tmp])) {
- goal = le32_to_cpu(((u32 *)bh->b_data)[tmp]);
- break;
- }
- }
- if (!goal)
- goal = bh->b_blocknr;
- }
- tmp = ext2_alloc_block (inode, goal, err);
- if (!tmp)
- goto out;
- if (metadata) {
- result = getblk (bh->b_dev, tmp, blocksize);
- if (!buffer_uptodate(result))
- wait_on_buffer(result);
- memset(result->b_data, 0, inode->i_sb->s_blocksize);
- mark_buffer_uptodate(result, 1);
- mark_buffer_dirty(result, 1);
- if (*p) {
- ext2_free_blocks (inode, tmp, 1);
- bforget (result);
- goto repeat;
- }
- } else {
- if (*p) {
- /*
- * Nobody is allowed to change block allocation
- * state from under us:
- */
- ext2_error (inode->i_sb, "block_getblk",
- "data block filled under us");
- BUG();
- ext2_free_blocks (inode, tmp, 1);
- goto repeat;
+ /* Allocation failed, free what we already allocated */
+ for (i = 1; i < n; i++)
+ bforget(branch[i].bh);
+ for (i = 0; i < n; i++)
+ ext2_free_blocks(inode, le32_to_cpu(branch[i].key), 1);
+ return err;
+}
+
+/**
+ * ext2_splice_branch - splice the allocated branch onto inode.
+ * @inode: owner
+ * @block: (logical) number of block we are adding
+ * @chain: chain of indirect blocks (with a missing link - see
+ * ext2_alloc_branch)
+ * @where: location of missing link
+ * @num: number of blocks we are adding
+ *
+ * This function verifies that the chain (up to the missing link) has not
+ * changed, fills the missing link and does all housekeeping needed in
+ * inode (->i_blocks, etc.). In case of success we end up with the full
+ * chain to new block and return 0. Otherwise (== chain had been changed)
+ * we free the new blocks (forgetting their buffer_heads, indeed) and
+ * return -EAGAIN.
+ */
+
+static inline int ext2_splice_branch(struct inode *inode,
+ long block,
+ Indirect chain[4],
+ Indirect *where,
+ int num)
+{
+ int i;
+
+ /* Verify that place we are splicing to is still there and vacant */
+
+ /* Writer: pointers, ->i_next_alloc*, ->i_blocks */
+ if (!verify_chain(chain, where-1) || *where->p)
+ /* Writer: end */
+ goto changed;
+
+ /* That's it */
+
+ *where->p = where->key;
+ inode->u.ext2_i.i_next_alloc_block = block;
+ inode->u.ext2_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key);
+ inode->i_blocks += num * inode->i_sb->s_blocksize/512;
+
+ /* Writer: end */
+
+ /* We are done with atomic stuff, now do the rest of housekeeping */
+
+ inode->i_ctime = CURRENT_TIME;
+
+ /* had we spliced it onto indirect block? */
+ if (where->bh) {
+ mark_buffer_dirty(where->bh, 1);
+ if (IS_SYNC(inode) || inode->u.ext2_i.i_osync) {
+ ll_rw_block (WRITE, 1, &where->bh);
+ wait_on_buffer(where->bh);
}
- *phys = tmp;
- *new = 1;
- }
- *p = le32_to_cpu(tmp);
- mark_buffer_dirty(bh, 1);
- if (IS_SYNC(inode) || inode->u.ext2_i.i_osync) {
- ll_rw_block (WRITE, 1, &bh);
- wait_on_buffer (bh);
}
- inode->i_ctime = CURRENT_TIME;
- inode->i_blocks += blocksize/512;
- mark_inode_dirty(inode);
- inode->u.ext2_i.i_next_alloc_block = new_block;
- inode->u.ext2_i.i_next_alloc_goal = tmp;
- *err = 0;
-out:
- brelse (bh);
- return result;
+
+ if (IS_SYNC(inode) || inode->u.ext2_i.i_osync)
+ ext2_sync_inode (inode);
+ else
+ mark_inode_dirty(inode);
+ return 0;
+
+changed:
+ for (i = 1; i < num; i++)
+ bforget(where[i].bh);
+ for (i = 0; i < num; i++)
+ ext2_free_blocks(inode, le32_to_cpu(where[i].key), 1);
+ return -EAGAIN;
}
+/*
+ * Allocation strategy is simple: if we have to allocate something, we will
+ * have to go the whole way to leaf. So let's do it before attaching anything
+ * to tree, set linkage between the newborn blocks, write them if sync is
+ * required, recheck the path, free and repeat if check fails, otherwise
+ * set the last missing link (that will protect us from any truncate-generated
+ * removals - all blocks on the path are immune now) and possibly force the
+ * write on the parent block.
+ * That has a nice additional property: no special recovery from the failed
+ * allocations is needed - we simply release blocks and do not touch anything
+ * reachable from inode.
+ */
+
static int ext2_get_block(struct inode *inode, long iblock, struct buffer_head *bh_result, int create)
{
- int ret, err, new;
- struct buffer_head *bh;
- unsigned long phys;
+ int err = -EIO;
int offsets[4];
- int *p;
Indirect chain[4];
Indirect *partial;
- int depth;
+ unsigned long goal;
+ int left;
+ int depth = ext2_block_to_path(inode, iblock, offsets);
- depth = ext2_block_to_path(inode, iblock, offsets);
if (depth == 0)
- goto abort;
+ goto out;
lock_kernel();
+reread:
partial = ext2_get_branch(inode, depth, offsets, chain, &err);
+ /* Simplest case - block found, no allocation needed */
if (!partial) {
- unlock_kernel();
- for (partial = chain + depth - 1; partial > chain; partial--)
- brelse(partial->bh);
+got_it:
bh_result->b_dev = inode->i_dev;
bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key);
bh_result->b_state |= (1UL << BH_Mapped);
- return 0;
+ /* Clean up and exit */
+ partial = chain+depth-1; /* the whole chain */
+ goto cleanup;
}
- while (partial > chain) {
- brelse(partial->bh);
- partial--;
- }
-
- if (!create) {
+ /* Next simple case - plain lookup or failed read of indirect block */
+ if (!create || err == -EIO) {
+cleanup:
+ while (partial > chain) {
+ brelse(partial->bh);
+ partial--;
+ }
unlock_kernel();
- return 0;
+out:
+ return err;
}
- err = -EIO;
- new = 0;
- ret = 0;
- bh = NULL;
-
/*
- * If this is a sequential block allocation, set the next_alloc_block
- * to this block now so that all the indblock and data block
- * allocations use the same goal zone
+ * Indirect block might be removed by truncate while we were
+ * reading it. Handling of that case (forget what we've got and
+ * reread) is taken out of the main path.
*/
+ if (err == -EAGAIN)
+ goto changed;
- ext2_debug ("block %lu, next %lu, goal %lu.\n", iblock,
- inode->u.ext2_i.i_next_alloc_block,
- inode->u.ext2_i.i_next_alloc_goal);
-
- if (iblock == inode->u.ext2_i.i_next_alloc_block + 1) {
- inode->u.ext2_i.i_next_alloc_block++;
- inode->u.ext2_i.i_next_alloc_goal++;
- }
+ goal = ext2_find_goal(inode, iblock, chain, partial);
+ if (!goal)
+ goto changed;
- err = 0;
+ left = (chain + depth) - partial;
+ err = ext2_alloc_branch(inode, left, goal,
+ offsets+(partial-chain), partial);
+ if (err)
+ goto cleanup;
- /*
- * ok, these macros clean the logic up a bit and make
- * it much more readable:
- */
-#define GET_INODE_DATABLOCK(x) \
- inode_getblk(inode, x, iblock, &err, 0, &phys, &new)
-#define GET_INODE_PTR(x) \
- inode_getblk(inode, x, iblock, &err, 1, NULL, NULL)
-#define GET_INDIRECT_DATABLOCK(x) \
- block_getblk (inode, bh, x, iblock, &err, 0, &phys, &new);
-#define GET_INDIRECT_PTR(x) \
- block_getblk (inode, bh, x, iblock, &err, 1, NULL, NULL);
-
- p = offsets;
- if (depth == 1) {
- bh = GET_INODE_DATABLOCK(*p);
- goto out;
- }
- bh = GET_INODE_PTR(*p);
- switch (depth) {
- default: /* case 4: */
- bh = GET_INDIRECT_PTR(*++p);
- case 3:
- bh = GET_INDIRECT_PTR(*++p);
- case 2:
- bh = GET_INDIRECT_DATABLOCK(*++p);
- }
+ if (ext2_splice_branch(inode, iblock, chain, partial, left) < 0)
+ goto changed;
-#undef GET_INODE_DATABLOCK
-#undef GET_INODE_PTR
-#undef GET_INDIRECT_DATABLOCK
-#undef GET_INDIRECT_PTR
+ bh_result->b_state |= (1UL << BH_New);
+ goto got_it;
-out:
- if (bh)
- BUG(); // temporary debugging check
- if (err)
- goto abort;
- if (!phys)
- BUG(); // must not happen either
-
- bh_result->b_dev = inode->i_dev;
- bh_result->b_blocknr = phys;
- bh_result->b_state |= (1UL << BH_Mapped); /* safe */
- if (new)
- bh_result->b_state |= (1UL << BH_New);
- unlock_kernel();
-abort:
- return err;
+changed:
+ while (partial > chain) {
+ bforget(partial->bh);
+ partial--;
+ }
+ goto reread;
}
struct buffer_head * ext2_getblk(struct inode * inode, long block, int create, int * err)
@@ -833,7 +832,7 @@ void ext2_read_inode (struct inode * inode)
inode->i_attr_flags = 0;
if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL) {
inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS;
- inode->i_flags |= MS_SYNCHRONOUS;
+ inode->i_flags |= S_SYNC;
}
if (inode->u.ext2_i.i_flags & EXT2_APPEND_FL) {
inode->i_attr_flags |= ATTR_FLAG_APPEND;
@@ -845,7 +844,7 @@ void ext2_read_inode (struct inode * inode)
}
if (inode->u.ext2_i.i_flags & EXT2_NOATIME_FL) {
inode->i_attr_flags |= ATTR_FLAG_NOATIME;
- inode->i_flags |= MS_NOATIME;
+ inode->i_flags |= S_NOATIME;
}
return;
@@ -999,17 +998,17 @@ int ext2_notify_change(struct dentry *dentry, struct iattr *iattr)
flags = iattr->ia_attr_flags;
if (flags & ATTR_FLAG_SYNCRONOUS) {
- inode->i_flags |= MS_SYNCHRONOUS;
+ inode->i_flags |= S_SYNC;
inode->u.ext2_i.i_flags |= EXT2_SYNC_FL;
} else {
- inode->i_flags &= ~MS_SYNCHRONOUS;
+ inode->i_flags &= ~S_SYNC;
inode->u.ext2_i.i_flags &= ~EXT2_SYNC_FL;
}
if (flags & ATTR_FLAG_NOATIME) {
- inode->i_flags |= MS_NOATIME;
+ inode->i_flags |= S_NOATIME;
inode->u.ext2_i.i_flags |= EXT2_NOATIME_FL;
} else {
- inode->i_flags &= ~MS_NOATIME;
+ inode->i_flags &= ~S_NOATIME;
inode->u.ext2_i.i_flags &= ~EXT2_NOATIME_FL;
}
if (flags & ATTR_FLAG_APPEND) {
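The rewritten ext2_get_block() and its helpers lean on the Indirect triple and on verify_chain(), both defined earlier in fs/ext2/inode.c and not visible in these hunks. Reconstructed from how the diff uses them - ->key caches the on-disk block number, ->p points at the slot holding it, ->bh is the buffer containing that slot (NULL when the slot lives in the inode itself) - they look roughly like this; a sketch for reading the patch, not a quote of the file (u32 is the kernel's 32-bit type):

typedef struct {
	u32 *p;                  /* where the block number is stored */
	u32 key;                 /* cached copy of *p (little-endian) */
	struct buffer_head *bh;  /* buffer holding *p; NULL if in inode */
} Indirect;

/* A chain link is still valid iff the cached key matches what the slot
 * holds now; a concurrent truncate changing *p invalidates the chain. */
static inline int verify_chain(Indirect *from, Indirect *to)
{
	while (from <= to && from->key == *from->p)
		from++;
	return (from > to);
}

verify_chain() is what lets ext2_find_goal() and ext2_splice_branch() detect a concurrent truncate and take the changed:/reread path instead of splicing new blocks onto a stale chain.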
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 0b456820a..6528f1f74 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -53,9 +53,9 @@ int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
inode->u.ext2_i.i_flags = flags;
if (flags & EXT2_SYNC_FL)
- inode->i_flags |= MS_SYNCHRONOUS;
+ inode->i_flags |= S_SYNC;
else
- inode->i_flags &= ~MS_SYNCHRONOUS;
+ inode->i_flags &= ~S_SYNC;
if (flags & EXT2_APPEND_FL)
inode->i_flags |= S_APPEND;
else
@@ -65,9 +65,9 @@ int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
else
inode->i_flags &= ~S_IMMUTABLE;
if (flags & EXT2_NOATIME_FL)
- inode->i_flags |= MS_NOATIME;
+ inode->i_flags |= S_NOATIME;
else
- inode->i_flags &= ~MS_NOATIME;
+ inode->i_flags &= ~S_NOATIME;
inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
return 0;
diff --git a/fs/namei.c b/fs/namei.c
index 55658cd6b..97d9e2d22 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1739,7 +1739,7 @@ int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
if (error)
return error;
/* The following d_move() should become unconditional */
- if (!(old_dir->i_sb->s_flags & MS_ODD_RENAME)) {
+ if (!(old_dir->i_sb->s_type->fs_flags & FS_ODD_RENAME)) {
d_move(old_dentry, new_dentry);
}
return 0;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 993dcc1a4..db7c110e5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -275,8 +275,6 @@ nfs_read_super(struct super_block *sb, void *raw_data, int silent)
if (srvaddr.sin_addr.s_addr == INADDR_ANY)
goto out_no_remote;
- sb->s_flags |= MS_ODD_RENAME; /* This should go away */
-
sb->s_magic = NFS_SUPER_MAGIC;
sb->s_op = &nfs_sops;
sb->s_blocksize_bits = 0;
@@ -1164,7 +1162,7 @@ out_changed:
/*
* File system information
*/
-static DECLARE_FSTYPE(nfs_fs_type, "nfs", nfs_read_super, 0);
+static DECLARE_FSTYPE(nfs_fs_type, "nfs", nfs_read_super, FS_ODD_RENAME);
extern int nfs_init_fhcache(void);
extern void nfs_destroy_fhcache(void);
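The namei.c and nfs/inode.c hunks move the rename quirk from a per-mount flag (sb->s_flags, MS_ODD_RENAME) to a per-filesystem-type flag (fs_type->fs_flags, FS_ODD_RENAME) set once in the DECLARE_FSTYPE() line. A simplified model of why that is the right home for it (field names follow the 2.4 structures; the rest is illustrative):

#define FS_ODD_RENAME 0x01   /* illustrative value */

struct file_system_type {
	const char *name;
	int fs_flags;            /* properties of the fs *type* */
};

struct super_block {
	struct file_system_type *s_type;
	int s_flags;             /* per-mount state (MS_RDONLY, ...) */
};

/* Every NFS mount has the odd rename semantics, so the test follows
 * s_type instead of requiring each mount to copy a flag at read_super
 * time - which is exactly the copy the nfs hunk deletes. */
static int needs_odd_rename(struct super_block *sb)
{
	return sb->s_type->fs_flags & FS_ODD_RENAME;
}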
diff --git a/fs/stat.c b/fs/stat.c
index 3ca996ed1..18d1f7a88 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -269,6 +269,9 @@ static long cp_new_stat64(struct inode * inode, struct stat64 * statbuf)
memset(&tmp, 0, sizeof(tmp));
tmp.st_dev = kdev_t_to_nr(inode->i_dev);
tmp.st_ino = inode->i_ino;
+#ifdef STAT64_HAS_BROKEN_ST_INO
+ tmp.__st_ino = inode->i_ino;
+#endif
tmp.st_mode = inode->i_mode;
tmp.st_nlink = inode->i_nlink;
tmp.st_uid = inode->i_uid;
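The STAT64_HAS_BROKEN_ST_INO hunk exists because on some 32-bit ABIs the first struct stat64 layout declared st_ino with the wrong width, so the ABI keeps a second, correctly placed field and libc reads that one instead. Filling both keeps old and new userspace happy. Roughly (a hypothetical layout for illustration, not any architecture's exact struct):

struct stat64_sketch {
#ifdef STAT64_HAS_BROKEN_ST_INO
	unsigned long      __st_ino;   /* the field a broken-ABI libc reads */
#endif
	/* ... other fields ... */
	unsigned long long st_ino;     /* filled unconditionally */
};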
diff --git a/fs/umsdos/emd.c b/fs/umsdos/emd.c
index 84e8ca99f..9e4ace724 100644
--- a/fs/umsdos/emd.c
+++ b/fs/umsdos/emd.c
@@ -22,6 +22,7 @@
static void copy_entry(struct umsdos_dirent *p, struct umsdos_dirent *q)
{
p->name_len = q->name_len;
+ p->name[p->name_len]='\0';
p->flags = q->flags;
p->nlink = le16_to_cpu (q->nlink);
/* FIXME -- 32bit UID/GID issues */
@@ -119,6 +120,7 @@ int umsdos_emd_dir_readentry (struct dentry *demd, loff_t *pos, struct umsdos_di
struct umsdos_dirent *p;
int offs = *pos & ~PAGE_CACHE_MASK;
int recsize;
+ int ret = 0;
page = read_cache_page(mapping, *pos>>PAGE_CACHE_SHIFT,
(filler_t*)mapping->a_ops->readpage, NULL);
@@ -128,6 +130,15 @@ int umsdos_emd_dir_readentry (struct dentry *demd, loff_t *pos, struct umsdos_di
if (!Page_Uptodate(page))
goto async_fail;
p = (struct umsdos_dirent*)((char*)kmap(page)+offs);
+
+ /* if this is an invalid entry (invalid name length), ignore it */
+ if( p->name_len > UMSDOS_MAXNAME )
+ {
+ printk (KERN_WARNING "Ignoring invalid EMD entry with size %d\n", entry->name_len);
+ p->name_len = 0;
+ ret = -ENAMETOOLONG; /* notify umssync(8) code that something is wrong */
+ }
+
recsize = umsdos_evalrecsize(p->name_len);
if (offs + recsize > PAGE_CACHE_SIZE) {
struct page *page2;
@@ -157,7 +168,7 @@ int umsdos_emd_dir_readentry (struct dentry *demd, loff_t *pos, struct umsdos_di
kunmap(page);
page_cache_release(page);
*pos += recsize;
- return 0;
+ return ret;
async_fail:
page_cache_release(page);
page = ERR_PTR(-EIO);
@@ -172,7 +183,7 @@ sync_fail:
*
* Note: the caller must hold a lock on the parent directory.
*/
-static int umsdos_writeentry (struct dentry *parent, struct umsdos_info *info,
+int umsdos_writeentry (struct dentry *parent, struct umsdos_info *info,
int free_entry)
{
struct inode *dir = parent->d_inode;
@@ -266,7 +277,7 @@ static int umsdos_writeentry (struct dentry *parent, struct umsdos_info *info,
goto out_unlock;
} else {
ret = mapping->a_ops->prepare_write(NULL,page,offs,
- info->recsize);
+ offs + info->recsize);
if (ret)
goto out_unlock;
p->name_len = entry->name_len;
@@ -281,7 +292,7 @@ static int umsdos_writeentry (struct dentry *parent, struct umsdos_info *info,
p->mode = cpu_to_le16(entry->mode);
memcpy(p->spare,entry->spare,((char*)p+info->recsize)-p->spare);
ret = mapping->a_ops->commit_write(NULL,page,offs,
- info->recsize);
+ offs + info->recsize);
if (ret)
goto out_unlock;
}
@@ -373,6 +384,7 @@ static int umsdos_find (struct dentry *demd, struct umsdos_info *info)
if (page) {
kunmap(page);
page_cache_release(page);
+ page = NULL;
}
if (pos >= emd_dir->i_size) {
info->f_pos = empty.posok;
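The emd.c changes harden EMD record parsing: copy_entry() now NUL-terminates the name it copies, and umsdos_emd_dir_readentry() refuses a record whose name_len field exceeds UMSDOS_MAXNAME, neutralizing it and returning -ENAMETOOLONG so the ioctl path (fs/umsdos/ioctl.c below) can rewrite the bad record in place. The shape of that validation, as an illustrative userspace sketch (constants are stand-ins):

#include <stdio.h>

#define UMSDOS_MAXNAME 220   /* stand-in for the real limit */

struct dirent_rec {
	unsigned char name_len;
	char name[UMSDOS_MAXNAME + 1];
};

int read_record(struct dirent_rec *p)
{
	if (p->name_len > UMSDOS_MAXNAME) {
		fprintf(stderr, "ignoring invalid EMD entry, size %d\n",
			p->name_len);
		p->name_len = 0;            /* neutralize the in-core copy */
		return -1;                  /* caller may rewrite it on disk */
	}
	p->name[p->name_len] = '\0';        /* same hardening as copy_entry() */
	return 0;
}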
diff --git a/fs/umsdos/inode.c b/fs/umsdos/inode.c
index bb48b6bf2..12b34e849 100644
--- a/fs/umsdos/inode.c
+++ b/fs/umsdos/inode.c
@@ -38,8 +38,7 @@ void UMSDOS_put_inode (struct inode *inode)
,atomic_read(&inode->i_count)));
if (inode == pseudo_root) {
- printk (KERN_ERR "Umsdos: Oops releasing pseudo_root."
- " Notify jacques@solucorp.qc.ca\n");
+ printk (KERN_ERR "Umsdos: debug: releasing pseudo_root - ino=%lu count=%d\n", inode->i_ino, atomic_read(&inode->i_count));
}
if (atomic_read(&inode->i_count) == 1)
@@ -203,15 +202,15 @@ dentry->d_parent->d_name.name, dentry->d_name.name, inode->u.umsdos_i.i_patched)
ret = PTR_ERR(demd);
if (IS_ERR(demd))
goto out;
- ret = -EPERM;
- if (!demd->d_inode) {
- printk(KERN_WARNING
+ ret = 0;
+ /* don't do anything if directory is not promoted to umsdos yet */
+ if (!demd->d_inode) {
+ Printk((KERN_DEBUG
"UMSDOS_notify_change: no EMD file %s/%s\n",
- demd->d_parent->d_name.name, demd->d_name.name);
+ demd->d_parent->d_name.name, demd->d_name.name));
goto out_dput;
}
- ret = 0;
/* don't do anything if this is the EMD itself */
if (inode == demd->d_inode)
goto out_dput;
@@ -295,10 +294,19 @@ static struct super_operations umsdos_sops =
put_inode: UMSDOS_put_inode,
delete_inode: fat_delete_inode,
put_super: UMSDOS_put_super,
- statfs: fat_statfs,
+ statfs: UMSDOS_statfs,
clear_inode: fat_clear_inode,
};
+int UMSDOS_statfs(struct super_block *sb,struct statfs *buf)
+{
+ int ret;
+ ret = fat_statfs (sb, buf);
+ if (!ret)
+ buf->f_namelen = UMSDOS_MAXNAME;
+ return ret;
+}
+
/*
* Read the super block of an Extended MS-DOS FS.
*/
@@ -317,7 +325,7 @@ struct super_block *UMSDOS_read_super (struct super_block *sb, void *data,
if (!res)
goto out_fail;
- printk (KERN_INFO "UMSDOS 0.86 "
+ printk (KERN_INFO "UMSDOS 0.86i "
"(compatibility level %d.%d, fast msdos)\n",
UMSDOS_VERSION, UMSDOS_RELEASE);
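The new UMSDOS_statfs() wrapper delegates to fat_statfs() and then overrides f_namelen, since UMSDOS supports longer names than the underlying FAT volume reports. From userspace the effect is visible through statfs(2) - a quick check, assuming a hypothetical mount point:

#include <stdio.h>
#include <sys/vfs.h>

int main(void)
{
	struct statfs st;

	/* /mnt/umsdos is an assumed mount point for illustration */
	if (statfs("/mnt/umsdos", &st) == 0)
		printf("max filename length: %ld\n", (long) st.f_namelen);
	return 0;
}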
diff --git a/fs/umsdos/ioctl.c b/fs/umsdos/ioctl.c
index 17d38c2dd..8731f8ecd 100644
--- a/fs/umsdos/ioctl.c
+++ b/fs/umsdos/ioctl.c
@@ -177,6 +177,16 @@ dentry->d_parent->d_name.name, dentry->d_name.name, cmd, data_ptr));
struct umsdos_info info;
ret = umsdos_emd_dir_readentry (demd, &pos, &entry);
+
+ if (ret == -ENAMETOOLONG) {
+ printk (KERN_INFO "Fixing EMD entry with invalid size -- zeroing out\n");
+ memset (&info, 0, sizeof (info));
+ info.f_pos = f_pos;
+ info.recsize = UMSDOS_REC_SIZE;
+ ret = umsdos_writeentry (dentry, &info, 1);
+ continue;
+ }
+
if (ret)
break;
if (entry.name_len <= 0)
diff --git a/fs/umsdos/namei.c b/fs/umsdos/namei.c
index 4102ce6e0..c29387f3c 100644
--- a/fs/umsdos/namei.c
+++ b/fs/umsdos/namei.c
@@ -335,19 +335,6 @@ static void umsdos_ren_init (struct umsdos_info *new_info,
new_info->entry.nlink = old_info->entry.nlink;
}
-#ifdef OBSOLETE
-#define chkstk() \
-if (STACK_MAGIC != *(unsigned long *)current->kernel_stack_page){\
- printk(KERN_ALERT "UMSDOS: %s magic %x != %lx ligne %d\n" \
- , current->comm,STACK_MAGIC \
- ,*(unsigned long *)current->kernel_stack_page \
- ,__LINE__); \
-}
-
-#undef chkstk
-#define chkstk() do { } while (0);
-#endif
-
/*
* Rename a file (move) in the file system.
*/