Merge with Linux 2.3.23. The new bootmem stuff has broken various platforms.

At this time I've only verified that IP22 support compiles and IP27 actually works.
author: Ralf Baechle <ralf@linux-mips.org> 2000-01-27 01:05:20 +0000
committer: Ralf Baechle <ralf@linux-mips.org> 2000-01-27 01:05:20 +0000
commit: 546db14ee74118296f425f3b91634fb767d67290 (patch)
tree: 22b613a3da8d4bf663eec5e155af01b87fdf9094 /fs
parent: 1e25e41c4f5474e14452094492dbc169b800e4c8 (diff)
22 files changed, 863 insertions, 720 deletions
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index cc72f4e18..ca5d8e8cb 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -270,7 +270,6 @@ static inline int do_load_aout_binary(struct linux_binprm * bprm, struct pt_regs
 	unsigned long fd_offset;
 	unsigned long rlim;
 	int retval;
-	static unsigned long error_time=0;
 
 	ex = *((struct exec *) bprm->buf);		/* exec-header */
 	if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
@@ -282,29 +281,6 @@ static inline int do_load_aout_binary(struct linux_binprm * bprm, struct pt_regs
 
 	fd_offset = N_TXTOFF(ex);
 
-#ifdef __i386__
-	if (N_MAGIC(ex) == ZMAGIC && fd_offset != BLOCK_SIZE) {
-		if((jiffies-error_time) >5)
-		{
-			printk(KERN_NOTICE "N_TXTOFF != BLOCK_SIZE. See a.out.h.\n");
-			error_time=jiffies;
-		}
-		return -ENOEXEC;
-	}
-
-	if (N_MAGIC(ex) == ZMAGIC && ex.a_text &&
-	    bprm->dentry->d_inode->i_op &&
-	    bprm->dentry->d_inode->i_op->get_block &&
-	    (fd_offset < bprm->dentry->d_inode->i_sb->s_blocksize)) {
-		if((jiffies-error_time) >5)
-		{
-			printk(KERN_NOTICE "N_TXTOFF < BLOCK_SIZE. Please convert binary.\n");
-			error_time=jiffies;
-		}
-		return -ENOEXEC;
-	}
-#endif
-
 	/* Check initial limits. This avoids letting people circumvent
 	 * size limits imposed on them by creating programs with large
 	 * arrays in the data or bss.
@@ -364,26 +340,32 @@ static inline int do_load_aout_binary(struct linux_binprm * bprm, struct pt_regs
 		flush_icache_range((unsigned long) 0,
 				   (unsigned long) ex.a_text+ex.a_data);
 	} else {
+		static unsigned long error_time, error_time2;
 		if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
-		    (N_MAGIC(ex) != NMAGIC))
+		    (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ)
+		{
 			printk(KERN_NOTICE "executable not page aligned\n");
+			error_time2 = jiffies;
+		}
 
 		fd = open_dentry(bprm->dentry, O_RDONLY);
 		if (fd < 0)
 			return fd;
 		file = fget(fd);
 
-		if ((fd_offset & ~PAGE_MASK) != 0) {
+		if ((fd_offset & ~PAGE_MASK) != 0 &&
+		    (jiffies-error_time) > 5*HZ)
+		{
 			printk(KERN_WARNING 
 			       "fd_offset is not page aligned. Please convert program: %s\n",
-			       file->f_dentry->d_name.name
-			       );
+			       file->f_dentry->d_name.name);
+			error_time = jiffies;
 		}
 
 		if (!file->f_op || !file->f_op->mmap || ((fd_offset & ~PAGE_MASK) != 0)) {
 			fput(file);
 			sys_close(fd);
-			do_brk(0, ex.a_text+ex.a_data);
+			do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
 			read_exec(bprm->dentry, fd_offset,
 				  (char *) N_TXTADDR(ex), ex.a_text+ex.a_data, 0);
 			flush_icache_range((unsigned long) N_TXTADDR(ex),
@@ -493,12 +475,6 @@ do_load_aout_library(int fd)
 		goto out_putf;
 	}
 
-	if (N_MAGIC(ex) == ZMAGIC && N_TXTOFF(ex) &&
-	    (N_TXTOFF(ex) < inode->i_sb->s_blocksize)) {
-		printk("N_TXTOFF < BLOCK_SIZE. Please convert library\n");
-		goto out_putf;
-	}
-
 	if (N_FLAGS(ex))
 		goto out_putf;
 
@@ -508,14 +484,17 @@ do_load_aout_library(int fd)
 	start_addr =  ex.a_entry & 0xfffff000;
 
 	if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
-		printk(KERN_WARNING 
-		       "N_TXTOFF is not page aligned. Please convert library: %s\n",
-		       file->f_dentry->d_name.name
-		       );
-		
-		do_mmap(NULL, start_addr & PAGE_MASK, ex.a_text + ex.a_data + ex.a_bss,
-			PROT_READ | PROT_WRITE | PROT_EXEC,
-			MAP_FIXED| MAP_PRIVATE, 0);
+		static unsigned long error_time;
+
+		if ((jiffies-error_time) > 5*HZ)
+		{
+			printk(KERN_WARNING 
+			       "N_TXTOFF is not page aligned. Please convert library: %s\n",
+			       file->f_dentry->d_name.name);
+			error_time = jiffies;
+		}
+
+		do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
 		
 		read_exec(file->f_dentry, N_TXTOFF(ex),
 			  (char *)start_addr, ex.a_text + ex.a_data, 0);
diff --git a/fs/buffer.c b/fs/buffer.c
index c43c54a36..39dd880f8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -26,6 +26,8 @@
 
 /* Thread it... -DaveM */
 
+/* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
+
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/malloc.h>
@@ -76,6 +78,7 @@ static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
 static struct buffer_head *lru_list[NR_LIST];
 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
 static int nr_buffers_type[NR_LIST] = {0,};
+static unsigned long size_buffers_type[NR_LIST] = {0,};
 
 static struct buffer_head * unused_list = NULL;
 static int nr_unused_buffer_heads = 0;
@@ -93,7 +96,7 @@ static kmem_cache_t *bh_cachep;
 static int grow_buffers(int size);
 
 /* This is used by some architectures to estimate available memory. */
-atomic_t buffermem = ATOMIC_INIT(0);
+atomic_t buffermem_pages = ATOMIC_INIT(0);
 
 /* Here is the parameter block for the bdflush process. If you add or
  * remove any of the parameters, make sure to update kernel/sysctl.c.
@@ -114,18 +117,18 @@ union bdflush_param {
 				each time we call refill */
 		int nref_dirt; /* Dirty buffer threshold for activating bdflush
 				  when trying to refill buffers. */
-		int dummy1;    /* unused */
+		int interval; /* jiffies delay between kupdate flushes */
 		int age_buffer;  /* Time for normal buffer to age before we flush it */
 		int age_super;  /* Time for superblock to age before we flush it */
 		int dummy2;    /* unused */
 		int dummy3;    /* unused */
 	} b_un;
 	unsigned int data[N_PARAM];
-} bdf_prm = {{40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2}};
+} bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
 
 /* These are the min and max parameter values that we will allow to be assigned */
 int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   1*HZ, 1, 1};
-int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,1000, 6000*HZ, 6000*HZ, 2047, 5};
+int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
 
 void wakeup_bdflush(int);
 
@@ -482,6 +485,7 @@ static void __insert_into_lru_list(struct buffer_head * bh, int blist)
 	(*bhp)->b_prev_free->b_next_free = bh;
 	(*bhp)->b_prev_free = bh;
 	nr_buffers_type[blist]++;
+	size_buffers_type[blist] += bh->b_size;
 }
 
 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
@@ -495,6 +499,7 @@ static void __remove_from_lru_list(struct buffer_head * bh, int blist)
 			lru_list[blist] = NULL;
 		bh->b_next_free = bh->b_prev_free = NULL;
 		nr_buffers_type[blist]--;
+		size_buffers_type[blist] -= bh->b_size;
 	}
 }
 
@@ -813,6 +818,27 @@ out:
 	return bh;
 }
 
+/* Returns: -1 -> no need to flush
+             0 -> async flush
+             1 -> sync flush (wait for I/O completion) */
+static int balance_dirty_state(kdev_t dev)	/* dev is not referenced in the body */
+{
+	unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
+
+	dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;	/* summed b_size of the BUF_DIRTY list, shifted down to pages */
+	tot = nr_lru_pages + nr_free_pages + nr_free_highpages;
+	hard_dirty_limit = tot * bdf_prm.b_un.nfract / 100;	/* nfract percent of tot */
+	soft_dirty_limit = hard_dirty_limit >> 1;	/* half of the hard limit */
+
+	if (dirty > soft_dirty_limit)
+	{
+		if (dirty > hard_dirty_limit)
+			return 1;	/* above the hard limit */
+		return 0;	/* between the soft and the hard limit */
+	}
+	return -1;
+}
+
 /*
  * if a new dirty buffer is created we need to balance bdflush.
  *
@@ -820,23 +846,13 @@ out:
  * pressures on different devices - thus the (currently unused)
  * 'dev' parameter.
  */
-static int too_many_dirty_buffers;
-
 void balance_dirty(kdev_t dev)
 {
-	int dirty = nr_buffers_type[BUF_DIRTY];
-	int ndirty = bdf_prm.b_un.ndirty;
-
-	if (dirty > ndirty) {
-		if (dirty > 2*ndirty) {
-			too_many_dirty_buffers = 1;
-			wakeup_bdflush(1);
-			return;
-		}
-		wakeup_bdflush(0);
-	}
-	too_many_dirty_buffers = 0;
-	return;
+	int state = balance_dirty_state(dev);
+
+	if (state < 0)
+		return;
+	wakeup_bdflush(state);
 }
 
 static inline void __mark_dirty(struct buffer_head *bh, int flag)
@@ -1250,7 +1266,7 @@ int block_flushpage(struct inode *inode, struct page *page, unsigned long offset
 	 */
 	if (!offset) {
 		if (!try_to_free_buffers(page)) {
-			atomic_add(PAGE_CACHE_SIZE, &buffermem);
+			atomic_inc(&buffermem_pages);
 			return 0;
 		}
 	}
@@ -1364,6 +1380,7 @@ int block_write_partial_page(struct file *file, struct page *page, unsigned long
 	unsigned long bbits, blocks, i, len;
 	struct buffer_head *bh, *head;
 	char * target_buf;
+	int need_balance_dirty;
 
 	target_buf = (char *)page_address(page) + offset;
 
@@ -1403,6 +1420,7 @@ int block_write_partial_page(struct file *file, struct page *page, unsigned long
 	i = 0;
 	bh = head;
 	partial = 0;
+	need_balance_dirty = 0;
 	do {
 		if (!bh)
 			BUG();
@@ -1473,8 +1491,7 @@ int block_write_partial_page(struct file *file, struct page *page, unsigned long
 		set_bit(BH_Uptodate, &bh->b_state);
 		if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
 			__mark_dirty(bh, 0);
-			if (too_many_dirty_buffers)
-				balance_dirty(bh->b_dev);
+			need_balance_dirty = 1;
 		}
 
 		if (err) {
@@ -1488,6 +1505,9 @@ skip:
 		bh = bh->b_this_page;
 	} while (bh != head);
 
+	if (need_balance_dirty)
+		balance_dirty(bh->b_dev);
+
 	/*
 	 * is this a partial write that happened to make all buffers
 	 * uptodate then we can optimize away a bogus readpage() for
@@ -1519,6 +1539,7 @@ int block_write_cont_page(struct file *file, struct page *page, unsigned long of
 	struct buffer_head *bh, *head;
 	char * target_buf, *target_data;
 	unsigned long data_offset = offset;
+	int need_balance_dirty;
 
 	offset = inode->i_size - page->offset;
 	if (page->offset>inode->i_size)
@@ -1566,6 +1587,7 @@ int block_write_cont_page(struct file *file, struct page *page, unsigned long of
 	i = 0;
 	bh = head;
 	partial = 0;
+	need_balance_dirty = 0;
 	do {
 		if (!bh)
 			BUG();
@@ -1644,8 +1666,7 @@ int block_write_cont_page(struct file *file, struct page *page, unsigned long of
 		set_bit(BH_Uptodate, &bh->b_state);
 		if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
 			__mark_dirty(bh, 0);
-			if (too_many_dirty_buffers)
-				balance_dirty(bh->b_dev);
+			need_balance_dirty = 1;
 		}
 
 		if (err) {
@@ -1659,6 +1680,9 @@ skip:
 		bh = bh->b_this_page;
 	} while (bh != head);
 
+	if (need_balance_dirty)
+		balance_dirty(bh->b_dev);
+
 	/*
 	 * is this a partial write that happened to make all buffers
 	 * uptodate then we can optimize away a bogus readpage() for
@@ -1809,12 +1833,12 @@ int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
 		dprintk ("iobuf %d %d %d\n", offset, length, size);
 
 		for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
-			page = iobuf->pagelist[pageind];
 			map  = iobuf->maplist[pageind];
-			if (map && PageBIGMEM(map)) {
+			if (map && PageHighMem(map)) {
 				err = -EIO;
 				goto error;
 			}
+			page = page_address(map);
 
 			while (length > 0) {
 				blocknr = b[bufind++];
@@ -2090,7 +2114,7 @@ static int grow_buffers(int size)
 	page_map = mem_map + MAP_NR(page);
 	page_map->buffers = bh;
 	lru_cache_add(page_map);
-	atomic_add(PAGE_SIZE, &buffermem);
+	atomic_inc(&buffermem_pages);
 	return 1;
 
 no_buffer_head:
@@ -2168,12 +2192,53 @@ out:
 
 busy_buffer_page:
 	/* Uhhuh, start writeback so that we don't end up with all dirty pages */
-	too_many_dirty_buffers = 1;
 	wakeup_bdflush(0);
 	ret = 0;
 	goto out;
 }
 
+/* ================== Debugging =================== */
+
+void show_buffers(void)
+{
+	struct buffer_head * bh;
+	int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
+	int protected = 0;
+	int nlist;
+	static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY" };
+
+	printk("Buffer memory:   %6dkB\n",
+			atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));	/* page count -> kB */
+
+#ifdef __SMP__ /* trylock does nothing on UP and so we could deadlock */
+	if (!spin_trylock(&lru_list_lock))
+		return;	/* lock is busy: skip the per-list report instead of waiting */
+	for(nlist = 0; nlist < NR_LIST; nlist++) {
+		found = locked = dirty = used = lastused = protected = 0;	/* counters are per list */
+		bh = lru_list[nlist];
+		if(!bh) continue;	/* empty list: nothing is printed for it */
+
+		do {
+			found++;
+			if (buffer_locked(bh))
+				locked++;
+			if (buffer_protected(bh))
+				protected++;
+			if (buffer_dirty(bh))
+				dirty++;
+			if (atomic_read(&bh->b_count))
+				used++, lastused = found;	/* lastused: position of the last buffer seen with a nonzero b_count */
+			bh = bh->b_next_free;
+		} while (bh != lru_list[nlist]);	/* stop once the walk is back at the list head */
+		printk("%8s: %d buffers, %d used (last=%d), "
+		       "%d locked, %d protected, %d dirty\n",
+		       buf_types[nlist], found, used, lastused,
+		       locked, protected, dirty);
+	}
+	spin_unlock(&lru_list_lock);
+#endif
+}
+
 /* ===================== Init ======================= */
 
 /*
@@ -2181,7 +2246,7 @@ busy_buffer_page:
  * Use gfp() for the hash table to decrease TLB misses, use
  * SLAB cache for buffer heads.
  */
-void __init buffer_init(unsigned long memory_size)
+void __init buffer_init(unsigned long mempages)
 {
 	int order, i;
 	unsigned int nr_hash;
@@ -2189,9 +2254,11 @@ void __init buffer_init(unsigned long memory_size)
 	/* The buffer cache hash table is less important these days,
 	 * trim it a bit.
 	 */
-	memory_size >>= 14;
-	memory_size *= sizeof(struct buffer_head *);
-	for (order = 0; (PAGE_SIZE << order) < memory_size; order++)
+	mempages >>= 14;
+
+	mempages *= sizeof(struct buffer_head *);
+
+	for (order = 0; (1 << order) < mempages; order++)
 		;
 
 	/* try to allocate something until we get it or we're asking
@@ -2246,21 +2313,92 @@ void __init buffer_init(unsigned long memory_size)
  * response to dirty buffers.  Once this process is activated, we write back
  * a limited number of buffers to the disks and then go back to sleep again.
  */
-static DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
 struct task_struct *bdflush_tsk = 0;
 
-void wakeup_bdflush(int wait)
+void wakeup_bdflush(int block)
 {
+	DECLARE_WAITQUEUE(wait, current);
+
 	if (current == bdflush_tsk)
 		return;
-	if (wait)
-		run_task_queue(&tq_disk);
-	wake_up(&bdflush_wait);
-	if (wait)
-		sleep_on(&bdflush_done);
+
+	if (!block)
+	{
+		wake_up_process(bdflush_tsk);
+		return;
+	}
+
+	/* kflushd can wakeup us before we have a chance to
+	   go to sleep so we must be smart in handling
+	   this wakeup event from kflushd to avoid deadlocking in SMP
+	   (we are not holding any lock anymore in these two paths). */
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	add_wait_queue(&bdflush_done, &wait);
+
+	wake_up_process(bdflush_tsk);
+	schedule();
+
+	remove_wait_queue(&bdflush_done, &wait);
+	__set_current_state(TASK_RUNNING);
 }
 
+/* This is the _only_ function that deals with flushing async writes
+   to disk.
+   NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
+   as all dirty buffers live _only_ in the DIRTY lru list.
+   As we never browse the LOCKED and CLEAN lru lists they are in fact
+   completely useless. */
+static void flush_dirty_buffers(int check_flushtime)
+{
+	struct buffer_head * bh, *next;
+	int flushed = 0, i;	/* flushed is not reset when the scan restarts */
+
+ restart:
+	spin_lock(&lru_list_lock);
+	bh = lru_list[BUF_DIRTY];
+	if (!bh)
+		goto out_unlock;
+	for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next)
+	{
+		next = bh->b_next_free;	/* fetched before bh may be refiled below */
+
+		if (!buffer_dirty(bh))
+		{
+			__refile_buffer(bh);
+			continue;
+		}
+		if (buffer_locked(bh))
+			continue;	/* locked buffers are skipped, not waited for */
+
+		if (check_flushtime)
+		{
+			/* The dirty lru list is chronologically ordered so
+			   if the current bh is not yet timed out,
+			   then also all the following bhs
+			   will be too young. */
+			if (time_before(jiffies, bh->b_flushtime))
+				goto out_unlock;
+		}
+		else
+		{
+			if (++flushed > bdf_prm.b_un.ndirty)	/* cap on buffers written per call */
+				goto out_unlock;
+		}
+
+		/* OK, now we are committed to write it out. */
+		atomic_inc(&bh->b_count);	/* b_count is raised across the unlocked write */
+		spin_unlock(&lru_list_lock);
+		ll_rw_block(WRITE, 1, &bh);
+		atomic_dec(&bh->b_count);
+
+		if (current->need_resched)
+			schedule();
+		goto restart;	/* the lock was dropped, so rescan from the list head */
+	}
+ out_unlock:
+	spin_unlock(&lru_list_lock);
+}
 
 /* 
  * Here we attempt to write back old buffers.  We also try to flush inodes 
@@ -2272,47 +2410,13 @@ void wakeup_bdflush(int wait)
 
 static int sync_old_buffers(void)
 {
-	int nlist;
-
 	lock_kernel();
 	sync_supers(0);
 	sync_inodes(0);
 	unlock_kernel();
 
-	for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
-		struct buffer_head *bh;
-	repeat:
-		spin_lock(&lru_list_lock);
-		bh = lru_list[nlist];
-		if(bh) {
-			struct buffer_head *next;
-			int i;
-			for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
-				next = bh->b_next_free;
-
-				/* If the buffer is not on the proper list,
-				 * then refile it.
-				 */
-				if ((nlist == BUF_DIRTY &&
-				     (!buffer_dirty(bh) && !buffer_locked(bh))) ||
-				    (nlist == BUF_LOCKED && !buffer_locked(bh))) {
-					__refile_buffer(bh);
-					continue;
-				}
-				 
-				if (buffer_locked(bh) || !buffer_dirty(bh))
-					continue;
-
-				/* OK, now we are committed to write it out. */
-				atomic_inc(&bh->b_count);
-				spin_unlock(&lru_list_lock);
-				ll_rw_block(WRITE, 1, &bh);
-				atomic_dec(&bh->b_count);
-				goto repeat;
-			}
-		}
-		spin_unlock(&lru_list_lock);
-	}
+	flush_dirty_buffers(1);
+	/* must really sync all the active I/O request to disk here */
 	run_task_queue(&tq_disk);
 	return 0;
 }
@@ -2328,6 +2432,10 @@ asmlinkage long sys_bdflush(int func, long data)
 		return -EPERM;
 
 	if (func == 1) {
+		/* do_exit directly and let kupdate to do its work alone. */
+		do_exit(0);
+#if 0 /* left here as it's the only example of lazy-mm-stuff used from
+	 a syscall that doesn't care about the current mm context. */
 		int error;
 		struct mm_struct *user_mm;
 
@@ -2341,6 +2449,7 @@ asmlinkage long sys_bdflush(int func, long data)
 		error = sync_old_buffers();
 		end_lazy_tlb(user_mm);
 		return error;
+#endif
 	}
 
 	/* Basically func 1 means read param 1, 2 means write param 1, etc */
@@ -2383,85 +2492,103 @@ int bdflush(void * unused)
 	sprintf(current->comm, "kflushd");
 	bdflush_tsk = current;
 
-	for (;;) {
-		int nlist;
+	/* avoid getting signals */
+	spin_lock_irq(&current->sigmask_lock);
+	flush_signals(current);
+	sigfillset(&current->blocked);
+	recalc_sigpending(current);
+	spin_unlock_irq(&current->sigmask_lock);
 
+	for (;;) {
 		CHECK_EMERGENCY_SYNC
 
-		for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
-			int nr, major, written = 0;
-			struct buffer_head *next;
-
-		repeat:
-			spin_lock(&lru_list_lock);
-			next = lru_list[nlist];
-			nr = nr_buffers_type[nlist];
-			while (nr-- > 0) {
-				struct buffer_head *bh = next;
-
-				next = next->b_next_free;
-					
-				/* If the buffer is not on the correct list,
-				 * then refile it.
-				 */
-				if ((nlist == BUF_DIRTY &&
-				     (!buffer_dirty(bh) && !buffer_locked(bh))) ||
-				    (nlist == BUF_LOCKED && !buffer_locked(bh))) {
-					__refile_buffer(bh);
-					continue;
-				}
-
-				/* If we aren't in panic mode, don't write out too much
-				 * at a time. Also, don't write out buffers we don't
-				 * really have to write out yet..
-				 */
-				if (!too_many_dirty_buffers) {
-					if (written > bdf_prm.b_un.ndirty)
-						break;
-					if (time_before(jiffies, bh->b_flushtime))
-						continue;
-				}
-
-				if (buffer_locked(bh) || !buffer_dirty(bh))
-					 continue;
-
-				major = MAJOR(bh->b_dev);
-				written++;
-
-				/*
-				 * For the loop major we can try to do asynchronous writes,
-				 * but we have to guarantee that we're making some progress..
-				 */
-				atomic_inc(&bh->b_count);
-				spin_unlock(&lru_list_lock);
-				ll_rw_block(WRITE, 1, &bh);
-				atomic_dec(&bh->b_count);
-				goto repeat;
-			}
-			spin_unlock(&lru_list_lock);
-		}
-		run_task_queue(&tq_disk);
+		flush_dirty_buffers(0);
+
+		/* If wakeup_bdflush will wakeup us
+		   after our bdflush_done wakeup, then
+		   we must make sure to not sleep
+		   in schedule_timeout otherwise
+		   wakeup_bdflush may wait for our
+		   bdflush_done wakeup that would never arrive
+		   (as we would be sleeping) and so it would
+		   deadlock in SMP. */
+		__set_current_state(TASK_INTERRUPTIBLE);
 		wake_up(&bdflush_done);
-		
 		/*
 		 * If there are still a lot of dirty buffers around,
 		 * skip the sleep and flush some more. Otherwise, we
-		 * sleep for a while and mark us as not being in panic
-		 * mode..
+		 * sleep for a while.
 		 */
-		if (!too_many_dirty_buffers || nr_buffers_type[BUF_DIRTY] < bdf_prm.b_un.ndirty) {
-			too_many_dirty_buffers = 0;
-			spin_lock_irq(&current->sigmask_lock);
-			flush_signals(current);
-			spin_unlock_irq(&current->sigmask_lock);
-			interruptible_sleep_on_timeout(&bdflush_wait, 5*HZ);
+		if (balance_dirty_state(NODEV) < 0)
+			schedule_timeout(5*HZ);
+		/* Remember to mark us as running otherwise
+		   the next schedule will block. */
+		__set_current_state(TASK_RUNNING);
+	}
+}
+
+/*
+ * This is the kernel update daemon. It used to live in userspace,
+ * but since it needs to run safely we want it unkillable by mistake.
+ * You don't need to change your userspace configuration since
+ * the userspace `update` will do_exit(0) at the first sys_bdflush().
+ */
+int kupdate(void * unused) 
+{
+	struct task_struct * tsk = current;
+	int interval;
+
+	tsk->session = 1;
+	tsk->pgrp = 1;
+	strcpy(tsk->comm, "kupdate");
+
+	/* sigstop and sigcont will stop and wakeup kupdate */
+	spin_lock_irq(&tsk->sigmask_lock);
+	sigfillset(&tsk->blocked);
+	siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));	/* tsk == current here */
+	recalc_sigpending(tsk);
+	spin_unlock_irq(&tsk->sigmask_lock);
+
+	for (;;) {
+		/* update interval */
+		interval = bdf_prm.b_un.interval;	/* re-read on every pass */
+		if (interval)
+		{
+			tsk->state = TASK_INTERRUPTIBLE;
+			schedule_timeout(interval);
+		}
+		else
+		{
+		stop_kupdate:
+			tsk->state = TASK_STOPPED;	/* reached when interval is 0 or a SIGSTOP was consumed below */
+			schedule(); /* wait for SIGCONT */
 		}
+		/* check for sigstop */
+		if (signal_pending(tsk))
+		{
+			int stopped = 0;
+			spin_lock_irq(&tsk->sigmask_lock);
+			if (sigismember(&tsk->signal, SIGSTOP))
+			{
+				sigdelset(&tsk->signal, SIGSTOP);	/* consume the pending SIGSTOP */
+				stopped = 1;
+			}
+			recalc_sigpending(tsk);
+			spin_unlock_irq(&tsk->sigmask_lock);
+			if (stopped)
+				goto stop_kupdate;
+		}
+#ifdef DEBUG
+		printk("kupdate() activated...\n");
+#endif
+		sync_old_buffers();	/* one flush pass per wakeup */
 	}
 }
 
 static int __init bdflush_init(void)
 {
 	kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	kernel_thread(kupdate, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 	return 0;
 }
 
diff --git a/fs/dcache.c b/fs/dcache.c
index ef45eba7d..b6f7a7203 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -169,6 +169,11 @@ out:
 int d_invalidate(struct dentry * dentry)
 {
 	/*
+	 * If it's already been dropped, return OK.
+	 */
+	if (list_empty(&dentry->d_hash))
+		return 0;
+	/*
 	 * Check whether to do a partial shrink_dcache
 	 * to get rid of unused child entries.
 	 */
@@ -415,7 +420,7 @@ int shrink_dcache_memory(int priority, unsigned int gfp_mask)
 		unlock_kernel();
 		/* FIXME: kmem_cache_shrink here should tell us
 		   the number of pages freed, and it should
-		   work in a __GFP_DMA/__GFP_BIGMEM behaviour
+		   work in a __GFP_DMA/__GFP_HIGHMEM behaviour
 		   to free only the interesting pages in
 		   function of the needs of the current allocation. */
 		kmem_cache_shrink(dentry_cache);
diff --git a/fs/exec.c b/fs/exec.c
index b3f31fd0a..dea4f0712 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -31,6 +31,8 @@
 #include <linux/fcntl.h>
 #include <linux/smp_lock.h>
 #include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -212,20 +214,42 @@ int copy_strings(int argc,char ** argv, struct linux_binprm *bprm)
 		/* XXX: add architecture specific overflow check here. */ 
 
 		pos = bprm->p;
-		while (len>0) {
-			char *pag;
+		while (len > 0) {
+			char *kaddr;
+			int i, new, err;
+			struct page *page;
 			int offset, bytes_to_copy;
 
 			offset = pos % PAGE_SIZE;
-			if (!(pag = (char *) bprm->page[pos/PAGE_SIZE]) &&
-			    !(pag = (char *) bprm->page[pos/PAGE_SIZE] =
-			      (unsigned long *) get_free_page(GFP_USER))) 
-				return -ENOMEM; 
+			i = pos/PAGE_SIZE;
+			page = bprm->page[i];
+			new = 0;
+			if (!page) {
+				/*
+				 * Cannot yet use highmem page because
+				 * we cannot sleep with a kmap held.
+				 */
+				page = __get_pages(GFP_USER, 0);
+				bprm->page[i] = page;
+				if (!page)
+					return -ENOMEM;
+				new = 1;
+			}
+			kaddr = (char *)kmap(page, KM_WRITE);
 
+			if (new && offset)
+				memset(kaddr, 0, offset);
 			bytes_to_copy = PAGE_SIZE - offset;
-			if (bytes_to_copy > len)
+			if (bytes_to_copy > len) {
 				bytes_to_copy = len;
-			if (copy_from_user(pag + offset, str, bytes_to_copy)) 
+				if (new)
+					memset(kaddr+offset+len, 0, PAGE_SIZE-offset-len);
+			}
+			err = copy_from_user(kaddr + offset, str, bytes_to_copy);
+			flush_page_to_ram(page);
+			kunmap((unsigned long)kaddr, KM_WRITE);
+
+			if (err)
 				return -EFAULT; 
 
 			pos += bytes_to_copy;
@@ -276,7 +300,9 @@ int setup_arg_pages(struct linux_binprm *bprm)
 		mpnt->vm_offset = 0;
 		mpnt->vm_file = NULL;
 		mpnt->vm_private_data = (void *) 0;
+		vmlist_modify_lock(current->mm);
 		insert_vm_struct(current->mm, mpnt);
+		vmlist_modify_unlock(current->mm);
 		current->mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
 	} 
 
@@ -467,6 +493,11 @@ int flush_old_exec(struct linux_binprm * bprm)
 	    permission(bprm->dentry->d_inode,MAY_READ))
 		current->dumpable = 0;
 
+	/* An exec changes our domain. We are no longer part of the thread
+	   group */
+	   
+	current->self_exec_id++;
+			
 	flush_signal_handlers(current);
 	flush_old_files(current->files);
 
@@ -640,14 +671,22 @@ void remove_arg_zero(struct linux_binprm *bprm)
 {
 	if (bprm->argc) {
 		unsigned long offset;
-		char * page;
+		char * kaddr;
+		struct page *page;
+
 		offset = bprm->p % PAGE_SIZE;
-		page = (char*)bprm->page[bprm->p/PAGE_SIZE];
-		while(bprm->p++,*(page+offset++))
-			if(offset==PAGE_SIZE){
-				offset=0;
-				page = (char*)bprm->page[bprm->p/PAGE_SIZE];
-			}
+		goto inside;
+
+		while (bprm->p++, *(kaddr+offset++)) {
+			if (offset != PAGE_SIZE)
+				continue;
+			offset = 0;
+			kunmap((unsigned long)kaddr, KM_WRITE);
+inside:
+			page = bprm->page[bprm->p/PAGE_SIZE];
+			kaddr = (char *)kmap(page, KM_WRITE);
+		}
+		kunmap((unsigned long)kaddr, KM_WRITE);
 		bprm->argc--;
 	}
 }
@@ -676,8 +715,8 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 		bprm->dentry = NULL;
 
 	        bprm_loader.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
-	        for (i=0 ; i<MAX_ARG_PAGES ; i++)       /* clear page-table */
-                    bprm_loader.page[i] = 0;
+	        for (i = 0 ; i < MAX_ARG_PAGES ; i++)	/* clear page-table */
+                    bprm_loader.page[i] = NULL;
 
 		dentry = open_namei(dynloader[0], 0, 0);
 		retval = PTR_ERR(dentry);
@@ -793,8 +832,9 @@ out:
 
 	/* Assumes that free_page() can take a NULL argument. */ 
 	/* I hope this is ok for all architectures */ 
-	for (i=0 ; i<MAX_ARG_PAGES ; i++)
-		free_page(bprm.page[i]);
+	for (i = 0 ; i < MAX_ARG_PAGES ; i++)
+		if (bprm.page[i])
+			__free_page(bprm.page[i]);
 
 	return retval;
 }
diff --git a/fs/file.c b/fs/file.c
index fd33dc8b8..d62fb3ef3 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -16,7 +16,7 @@
 
 
 /*
- * Allocate an fd array, using get_free_page() if possible.
+ * Allocate an fd array, using __get_free_page() if possible.
  * Note: the array isn't cleared at allocation time.
  */
 struct file ** alloc_fd_array(int num)
@@ -129,7 +129,7 @@ out:
 }
 
 /*
- * Allocate an fdset array, using get_free_page() if possible.
+ * Allocate an fdset array, using __get_free_page() if possible.
  * Note: the array isn't cleared at allocation time.
  */
 fd_set * alloc_fdset(int num)
diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c
index 62410ca26..6fb9c1633 100644
--- a/fs/hpfs/anode.c
+++ b/fs/hpfs/anode.c
@@ -293,10 +293,10 @@ void hpfs_remove_btree(struct super_block *s, struct bplus_header *btree)
 	if (!level) return;
 	if (s->s_hpfs_chk)
 		if (hpfs_stop_cycles(s, ano, &c1, &c2, "hpfs_remove_btree #2")) return;
+	brelse(bh);
 	hpfs_free_sectors(s, ano, 1);
 	oano = ano;
 	ano = anode->up;
-	brelse(bh);
 	if (--level) {
 		anode = hpfs_map_anode(s, ano, &bh);
 		btree1 = &anode->btree;
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 8af35847d..36e665c32 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -20,7 +20,34 @@ int hpfs_dir_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-int hpfs_readdir(struct file *filp, void * dirent, filldir_t filldir)
+/* This is slow, but it's not used often */
+
+loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
+{
+	loff_t new_off = off + (whence == 1 ? filp->f_pos : 0);	/* NOTE(review): only whence == 1 is relative; every other value is treated as absolute - confirm intended */
+	loff_t pos;
+	struct quad_buffer_head qbh;
+	struct inode *i = filp->f_dentry->d_inode;
+	struct super_block *s = filp->f_dentry->d_sb;
+	/*printk("dir lseek\n");*/
+	if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok;	/* these offsets are accepted without walking the directory */
+	hpfs_lock_inode(i);
+	pos = ((loff_t) hpfs_de_as_down_as_possible(s, i->i_hpfs_dno) << 4) + 1;
+	while (pos != new_off) {	/* step through positions until the requested one is reached */
+		if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh);
+		else goto fail;
+		if (pos == 12) goto fail;	/* pos became 12 before new_off was reached */
+	}
+	hpfs_unlock_inode(i);
+	ok:
+	return filp->f_pos = new_off;
+	fail:
+	hpfs_unlock_inode(i);
+	/*printk("illegal lseek: %016llx\n", new_off);*/
+	return -ESPIPE;
+}
+
+int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct quad_buffer_head qbh;
@@ -54,11 +81,11 @@ int hpfs_readdir(struct file *filp, void * dirent, filldir_t filldir)
 		if (e) return -EFSERROR;
 	}
 	lc = inode->i_sb->s_hpfs_lowercase;
-	if (filp->f_pos == -2) { /* diff -r requires this (note, that diff -r */
-		filp->f_pos = -3; /* also fails on msdos filesystem in 2.0) */
+	if (filp->f_pos == 12) { /* diff -r requires this (note, that diff -r */
+		filp->f_pos = 13; /* also fails on msdos filesystem in 2.0) */
 		return 0;
 	}
-	if (filp->f_pos == -3) return -ENOENT;
+	if (filp->f_pos == 13) return -ENOENT;
 	
 	hpfs_lock_inode(inode);
 	
@@ -72,7 +99,7 @@ int hpfs_readdir(struct file *filp, void * dirent, filldir_t filldir)
 				hpfs_unlock_inode(inode);
 				return -EFSERROR;
 			}
-		if (filp->f_pos == -2) {
+		if (filp->f_pos == 12) {
 			hpfs_unlock_inode(inode);
 			return 0;
 		}
@@ -86,9 +113,9 @@ int hpfs_readdir(struct file *filp, void * dirent, filldir_t filldir)
 				hpfs_unlock_inode(inode);
 				return 0;
 			}
-			filp->f_pos = -1;
+			filp->f_pos = 11;
 		}
-		if (filp->f_pos == -1) {
+		if (filp->f_pos == 11) {
 			if (filldir(dirent, "..", 2, filp->f_pos, inode->i_hpfs_parent_dir) < 0) {
 				hpfs_unlock_inode(inode);
 				return 0;
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index e4b4bbc91..d1ca8e3e6 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -539,7 +539,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
 				brelse(bh);
 			}
 			i->i_hpfs_dno = down;
-			for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, (loff_t) -2);
+			for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, (loff_t) 12);
 			return;
 		}
 		if (!(dnode = hpfs_map_dnode(i->i_sb, up, &qbh))) return;
@@ -876,7 +876,7 @@ struct hpfs_dirent *map_pos_dirent(struct inode *inode, loff_t *posp,
 	hpfs_brelse4(&qbh0);
 	
 	bail:
-	*posp = -2;
+	*posp = 12;
 	return de;
 }
 
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index fbb1f2f6c..066ce5c28 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -58,91 +58,96 @@ void hpfs_truncate(struct inode *i)
 	hpfs_write_inode(i);
 }
 
-int hpfs_getblk_block(struct inode *inode, long block, int create, int *err, int *created)
+int hpfs_get_block(struct inode *inode, long iblock, struct buffer_head *bh_result, int create)
 {
-	int add;
-	int sec = 0;
-	down(&inode->i_sem);
-	if (err) *err = 0;
-	if (created) *created = 0;
-	if (!inode->i_blocks) {
-		hpfs_error(inode->i_sb, "hpfs_get_block: inode %08x has no blocks", inode->i_ino);
-		if (err) *err = -EFSERROR;
-		up(&inode->i_sem);
+	secno s;
+	if (iblock < inode->i_blocks - 1) {
+		s = hpfs_bmap(inode, iblock);
+		bh_result->b_dev = inode->i_dev;
+		bh_result->b_blocknr = s;
+		bh_result->b_state |= (1UL << BH_Mapped);
 		return 0;
 	}
-	if (block < ((add = inode->i_blocks - 1))) {
-		int bm;
-		if (!(bm = hpfs_bmap(inode, block))) {
-			hpfs_error(inode->i_sb, "hpfs_get_block: cound not bmap block %08x, inode %08x, size %08x", (int)block, inode->i_ino, (int)inode->i_size);
-			*err = -EFSERROR;
-		}
-		up(&inode->i_sem);
-		return bm;
-	}
-	if (!create) {
-		if (err) *err = -EFBIG;
-		up(&inode->i_sem);
-		return 0;
+	if (!create) return 0;
+	if (iblock > inode->i_blocks - 1) {
+		//hpfs_error(inode->i_sb, "hpfs_get_block beyond file end (requested %08x, inode size %08x", (int)iblock, (int)inode->i_blocks - 1);
+		printk("HPFS: could not write beyond file end. This is known bug.\n");
+		return -EFSERROR;
 	}
-	if (created) *created = 1;
-	while (add <= block) {
-		if ((sec = hpfs_add_sector_to_btree(inode->i_sb, inode->i_ino, 1, add)) == -1) {
-			if (err) *err = -ENOSPC;
-			hpfs_truncate_btree(inode->i_sb, inode->i_ino, 1, inode->i_blocks - 1);
-			return 0;
-		} /* FIXME: clear block */
-		add++;
+	if ((s = hpfs_add_sector_to_btree(inode->i_sb, inode->i_ino, 1, inode->i_blocks - 1)) == -1) {
+		hpfs_truncate_btree(inode->i_sb, inode->i_ino, 1, inode->i_blocks - 1);
+		return -ENOSPC;
 	}
-	inode->i_blocks = add + 1;
-	up(&inode->i_sem);
-	return sec;
+	inode->i_blocks++;
+	bh_result->b_dev = inode->i_dev;
+	bh_result->b_blocknr = s;
+	bh_result->b_state |= (1UL << BH_Mapped) | (1UL << BH_New);
+	return 0;
 }
 
-/* copied from ext2fs */
-static int hpfs_get_block(struct inode *inode, unsigned long block, struct buffer_head *bh, int update)
+static int hpfs_write_partial_page(struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf)
 {
-	if (!bh->b_blocknr) {
-		int error, created;
-		unsigned long blocknr;
-
-		blocknr = hpfs_getblk_block(inode, block, 1, &error, &created);
-		if (!blocknr) {
-			if (!error)
-				error = -ENOSPC;
-			return error;
+	struct dentry *dentry = file->f_dentry;
+	struct inode *inode = dentry->d_inode;
+	struct page *new_page, **hash;
+	unsigned long pgpos;
+	unsigned long page_cache = 0;
+	long status;
+
+	printk("- off: %08x\n", (int)page->offset);
+	pgpos = (inode->i_blocks - 1) * 512 & PAGE_CACHE_MASK;
+	while (pgpos < page->offset) {
+long pgp = pgpos;
+		printk("pgpos: %08x, bl: %d\n", (int)pgpos, (int)inode->i_blocks);
+		hash = page_hash(inode, pgpos);
+repeat_find:	new_page = __find_lock_page(inode, pgpos, hash);
+		if (!new_page) {
+			if (!page_cache) {
+				page_cache = page_cache_alloc();
+				if (page_cache)
+					goto repeat_find;
+				status = -ENOMEM;
+				goto out;
+			}
+			new_page = page_cache_entry(page_cache);
+			if (add_to_page_cache_unique(new_page,inode,pgpos,hash))
+				goto repeat_find;
+			page_cache = 0;
 		}
-
-		bh->b_dev = inode->i_dev;
-		bh->b_blocknr = blocknr;
-
-		if (!update)
-			return 0;
-
-		if (created) {
-			memset(bh->b_data, 0, bh->b_size);
-			set_bit(BH_Uptodate, &bh->b_state);
-			return 0;
+		printk("A\n");
+		status = block_write_cont_page(file, new_page, PAGE_SIZE, 0, NULL);
+		printk("B\n");
+		UnlockPage(new_page);
+		page_cache_release(new_page);
+		if (status < 0)
+			goto out;
+		pgpos = (inode->i_blocks - 1) * 512 & PAGE_CACHE_MASK;
+		printk("pgpos2: %08x, bl: %d\n", (int)pgpos, (int)inode->i_blocks);
+		if (pgpos == pgp) {
+			status = -1;
+			printk("ERROR\n");
+			goto out;
 		}
 	}
-
-	if (!update)
-		return 0;
-
-	lock_kernel();
-	ll_rw_block(READ, 1, &bh);
-	wait_on_buffer(bh);
-	unlock_kernel();
-
-	return buffer_uptodate(bh) ? 0 : -EIO;
+	//if ((status = block_write_cont_page(file, page, PAGE_SIZE, 0, NULL)) < 0) goto out;
+	printk("C\n");
+	status = block_write_cont_page(file, page, offset, bytes, buf);
+	printk("D\n");
+out:
+	printk("O\n");
+	if (page_cache)
+		page_cache_free(page_cache);
+	printk("E\n");
+	return status;
 }
 
+
 ssize_t hpfs_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
 {
 	ssize_t retval;
 
 	retval = generic_file_write(file, buf, count,
-				    ppos, block_write_partial_page);
+				    ppos, /*hpfs_write_partial_page*/block_write_partial_page);
 	if (retval > 0) {
 		struct inode *inode = file->f_dentry->d_inode;
 		inode->i_mtime = CURRENT_TIME;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 050b63597..9ae4a67da 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -226,6 +226,7 @@ void hpfs_set_dentry_operations(struct dentry *);
 
 int hpfs_dir_read(struct file *, char *, size_t, loff_t *);
 int hpfs_dir_release(struct inode *, struct file *);
+loff_t hpfs_dir_lseek(struct file *, loff_t, int);
 int hpfs_readdir(struct file *, void *, filldir_t);
 struct dentry *hpfs_lookup(struct inode *, struct dentry *);
 
@@ -258,9 +259,8 @@ int hpfs_open(struct inode *, struct file *);
 int hpfs_file_fsync(struct file *, struct dentry *);
 secno hpfs_bmap(struct inode *, unsigned);
 void hpfs_truncate(struct inode *);
-ssize_t hpfs_file_read(struct file *, char *, size_t, loff_t *);
-ssize_t hpfs_file_write(struct file *, const char *, size_t, loff_t *);
-int hpfs_writepage (struct file *, struct page *);
+int hpfs_get_block(struct inode *inode, long iblock, struct buffer_head *bh_result, int create);
+ssize_t hpfs_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos);
 
 /* inode.c */
 
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index efc776218..d79e55814 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -41,11 +41,9 @@ static const struct inode_operations hpfs_file_iops =
 	NULL,				/* rename */
 	NULL,				/* readlink */
 	NULL,				/* follow_link */
-	(int (*)(struct inode *, int))
-#warning Someone needs to code up hpfs_get_block properly... -DaveM
-	&hpfs_bmap,			/* get_block */
+	&hpfs_get_block,		/* get_block */
 	block_read_full_page,		/* readpage */
-	hpfs_writepage,			/* writepage */
+	block_write_full_page,		/* writepage */
 	block_flushpage,		/* flushpage */
 	hpfs_truncate,			/* truncate */
 	NULL,				/* permission */
@@ -55,7 +53,7 @@ static const struct inode_operations hpfs_file_iops =
 
 static const struct file_operations hpfs_dir_ops =
 {
-	NULL,				/* lseek - default */
+	hpfs_dir_lseek,			/* lseek */
 	hpfs_dir_read,			/* read */
 	NULL,				/* write - bad */
 	hpfs_readdir,			/* readdir */
diff --git a/fs/inode.c b/fs/inode.c
index 55eddfde8..f03295d5c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -89,6 +89,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 		memset(inode, 0, sizeof(*inode));
 		init_waitqueue_head(&inode->i_wait);
 		INIT_LIST_HEAD(&inode->i_hash);
+		INIT_LIST_HEAD(&inode->i_pages);
 		INIT_LIST_HEAD(&inode->i_dentry);
 		sema_init(&inode->i_sem, 1);
 		spin_lock_init(&inode->i_shared_lock);
@@ -401,7 +402,7 @@ int shrink_icache_memory(int priority, int gfp_mask)
 		prune_icache(count);
 		/* FIXME: kmem_cache_shrink here should tell us
 		   the number of pages freed, and it should
-		   work in a __GFP_DMA/__GFP_BIGMEM behaviour
+		   work in a __GFP_DMA/__GFP_HIGHMEM behaviour
 		   to free only the interesting pages in
 		   function of the needs of the current allocation. */
 		kmem_cache_shrink(inode_cachep);
@@ -429,7 +430,7 @@ static inline void __iget(struct inode * inode)
  * by hand after calling find_inode now! This simplify iunique and won't
  * add any additional branch in the common code.
  */
-static struct inode * find_inode(struct super_block * sb, unsigned long ino, struct list_head *head)
+static struct inode * find_inode(struct super_block * sb, unsigned long ino, struct list_head *head, find_inode_t find_actor, void *opaque)
 {
 	struct list_head *tmp;
 	struct inode * inode;
@@ -445,6 +446,8 @@ static struct inode * find_inode(struct super_block * sb, unsigned long ino, str
 			continue;
 		if (inode->i_ino != ino)
 			continue;
+		if (find_actor && !find_actor(inode, ino, opaque))
+			continue;
 		break;
 	}
 	return inode;
@@ -504,7 +507,7 @@ struct inode * get_empty_inode(void)
  * We no longer cache the sb_flags in i_flags - see fs.h
  *	-- rmk@arm.uk.linux.org
  */
-static struct inode * get_new_inode(struct super_block *sb, unsigned long ino, struct list_head *head)
+static struct inode * get_new_inode(struct super_block *sb, unsigned long ino, struct list_head *head, find_inode_t find_actor, void *opaque)
 {
 	struct inode * inode;
 
@@ -514,7 +517,7 @@ static struct inode * get_new_inode(struct super_block *sb, unsigned long ino, s
 
 		spin_lock(&inode_lock);
 		/* We released the lock, so.. */
-		old = find_inode(sb, ino, head);
+		old = find_inode(sb, ino, head, find_actor, opaque);
 		if (!old)
 		{
 			list_add(&inode->i_list, &inode_in_use);
@@ -570,7 +573,7 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
 retry:
 	if (counter > max_reserved) {
 		head = inode_hashtable + hash(sb,counter);
-		inode = find_inode(sb, res = counter++, head);
+		inode = find_inode(sb, res = counter++, head, NULL, NULL);
 		if (!inode) {
 			spin_unlock(&inode_lock);
 			return res;
@@ -595,13 +598,13 @@ struct inode *igrab(struct inode *inode)
 	return inode;
 }
 
-struct inode *iget(struct super_block *sb, unsigned long ino)
+struct inode *iget4(struct super_block *sb, unsigned long ino, find_inode_t find_actor, void *opaque)
 {
 	struct list_head * head = inode_hashtable + hash(sb,ino);
 	struct inode * inode;
 
 	spin_lock(&inode_lock);
-	inode = find_inode(sb, ino, head);
+	inode = find_inode(sb, ino, head, find_actor, opaque);
 	if (inode) {
 		__iget(inode);
 		spin_unlock(&inode_lock);
@@ -614,7 +617,7 @@ struct inode *iget(struct super_block *sb, unsigned long ino)
 	 * get_new_inode() will do the right thing, re-trying the search
 	 * in case it had to block at any point.
 	 */
-	return get_new_inode(sb, ino, head);
+	return get_new_inode(sb, ino, head, find_actor, opaque);
 }
 
 void insert_inode_hash(struct inode *inode)
diff --git a/fs/iobuf.c b/fs/iobuf.c
index b46a13bfd..eaabf2f7c 100644
--- a/fs/iobuf.c
+++ b/fs/iobuf.c
@@ -50,7 +50,6 @@ int alloc_kiovec(int nr, struct kiobuf **bufp)
 		init_waitqueue_head(&iobuf->wait_queue);
 		iobuf->end_io = simple_wakeup_kiobuf;
 		iobuf->array_len = KIO_STATIC_PAGES;
-		iobuf->pagelist  = iobuf->page_array;
 		iobuf->maplist   = iobuf->map_array;
 		*bufp++ = iobuf;
 	}
@@ -65,50 +64,35 @@ void free_kiovec(int nr, struct kiobuf **bufp)
 	
 	for (i = 0; i < nr; i++) {
 		iobuf = bufp[i];
-		if (iobuf->array_len > KIO_STATIC_PAGES) {
-			kfree (iobuf->pagelist);
+		if (iobuf->array_len > KIO_STATIC_PAGES)
 			kfree (iobuf->maplist);
-		}
 		kmem_cache_free(kiobuf_cachep, bufp[i]);
 	}
 }
 
 int expand_kiobuf(struct kiobuf *iobuf, int wanted)
 {
-	unsigned long *	pagelist;
 	struct page ** maplist;
 	
 	if (iobuf->array_len >= wanted)
 		return 0;
 	
-	pagelist = (unsigned long *) 
-		kmalloc(wanted * sizeof(unsigned long), GFP_KERNEL);
-	if (!pagelist)
-		return -ENOMEM;
-	
 	maplist = (struct page **) 
 		kmalloc(wanted * sizeof(struct page **), GFP_KERNEL);
-	if (!maplist) {
-		kfree(pagelist);
+	if (!maplist)
 		return -ENOMEM;
-	}
 
 	/* Did it grow while we waited? */
 	if (iobuf->array_len >= wanted) {
-		kfree(pagelist);
 		kfree(maplist);
 		return 0;
 	}
 	
-	memcpy (pagelist, iobuf->pagelist, wanted * sizeof(unsigned long));
 	memcpy (maplist,  iobuf->maplist,   wanted * sizeof(struct page **));
 
-	if (iobuf->array_len > KIO_STATIC_PAGES) {
-		kfree (iobuf->pagelist);
+	if (iobuf->array_len > KIO_STATIC_PAGES)
 		kfree (iobuf->maplist);
-	}
 	
-	iobuf->pagelist  = pagelist;
 	iobuf->maplist   = maplist;
 	iobuf->array_len = wanted;
 	return 0;
diff --git a/fs/minix/truncate.c b/fs/minix/truncate.c
index f26aa086c..70b01dc20 100644
--- a/fs/minix/truncate.c
+++ b/fs/minix/truncate.c
@@ -33,7 +33,7 @@
  */
 
 #define DATA_BUFFER_USED(bh) \
-	(atomic_read(&bh->b_count) || buffer_locked(bh))
+	(atomic_read(&bh->b_count) > 1 || buffer_locked(bh))
 
 /*
  * The functions for minix V1 fs truncation.
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 6b52b2d54..b7ec225ac 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -308,8 +308,7 @@ static struct page *try_to_get_dirent_page(struct file *file, __u32 cookie, int
 	struct nfs_readdirres rd_res;
 	struct dentry *dentry = file->f_dentry;
 	struct inode *inode = dentry->d_inode;
-	struct page *page, **hash;
-	unsigned long page_cache;
+	struct page *page, **hash, *page_cache;
 	long offset;
 	__u32 *cookiep;
 
@@ -341,14 +340,14 @@ repeat:
 		goto unlock_out;
 	}
 
-	page = page_cache_entry(page_cache);
+	page = page_cache;
 	if (add_to_page_cache_unique(page, inode, offset, hash)) {
 		page_cache_release(page);
 		goto repeat;
 	}
 
 	rd_args.fh = NFS_FH(dentry);
-	rd_res.buffer = (char *)page_cache;
+	rd_res.buffer = (char *)page_address(page_cache);
 	rd_res.bufsiz = PAGE_CACHE_SIZE;
 	rd_res.cookie = *cookiep;
 	do {
@@ -533,13 +532,15 @@ static inline int nfs_dentry_force_reval(struct dentry *dentry, int flags)
  * If mtime is close to present time, we revalidate
  * more often.
  */
+#define NFS_REVALIDATE_NEGATIVE (1 * HZ)
 static inline int nfs_neg_need_reval(struct dentry *dentry)
 {
-	unsigned long timeout = 30 * HZ;
-	long diff = CURRENT_TIME - dentry->d_parent->d_inode->i_mtime;
+	struct inode *dir = dentry->d_parent->d_inode;
+	unsigned long timeout = NFS_ATTRTIMEO(dir);
+	long diff = CURRENT_TIME - dir->i_mtime;
 
-	if (diff < 5*60)
-		timeout = 1 * HZ;
+	if (diff < 5*60 && timeout > NFS_REVALIDATE_NEGATIVE)
+		timeout = NFS_REVALIDATE_NEGATIVE;
 
 	return time_after(jiffies, dentry->d_time + timeout);
 }
@@ -581,12 +582,14 @@ static int nfs_lookup_revalidate(struct dentry * dentry, int flags)
 		goto out_bad;
 	}
 
-	if (IS_ROOT(dentry))
-		goto out_valid;
-
 	if (!nfs_dentry_force_reval(dentry, flags))
 		goto out_valid;
 
+	if (IS_ROOT(dentry)) {
+		__nfs_revalidate_inode(NFS_DSERVER(dentry), dentry);
+		goto out_valid_renew;
+	}
+
 	/*
 	 * Do a new lookup and check the dentry attributes.
 	 */
@@ -596,32 +599,29 @@ static int nfs_lookup_revalidate(struct dentry * dentry, int flags)
 		goto out_bad;
 
 	/* Inode number matches? */
-	if (fattr.fileid != inode->i_ino)
+	if (NFS_FSID(inode) != fattr.fsid ||
+	    NFS_FILEID(inode) != fattr.fileid)
 		goto out_bad;
 
 	/* Filehandle matches? */
-	if (memcmp(dentry->d_fsdata, &fhandle, sizeof(struct nfs_fh))) {
-		if (!list_empty(&dentry->d_subdirs))
-			shrink_dcache_parent(dentry);
-		if (dentry->d_count < 2)
-			goto out_bad;
-	}
+	if (memcmp(dentry->d_fsdata, &fhandle, sizeof(struct nfs_fh)))
+		goto out_bad;
 
 	/* Ok, remeber that we successfully checked it.. */
-	nfs_renew_times(dentry);
 	nfs_refresh_inode(inode, &fattr);
 
+ out_valid_renew:
+	nfs_renew_times(dentry);
 out_valid:
 	return 1;
 out_bad:
+	d_drop(dentry);
+	if (!list_empty(&dentry->d_subdirs))
+		shrink_dcache_parent(dentry);
 	/* Purge readdir caches. */
 	if (dentry->d_parent->d_inode) {
-		invalidate_inode_pages(dentry->d_parent->d_inode);
-		nfs_flush_dircache(dentry->d_parent->d_inode);
-	}
-	if (inode && S_ISDIR(inode->i_mode)) {
-		invalidate_inode_pages(inode);
-		nfs_flush_dircache(inode);
+		nfs_zap_caches(dentry->d_parent->d_inode);
+		NFS_CACHEINV(dentry->d_parent->d_inode);
 	}
 	return 0;
 }
@@ -649,21 +649,6 @@ static void nfs_dentry_delete(struct dentry *dentry)
 				dentry->d_name.name, error);
 	}
 
-#ifdef NFS_PARANOIA
-	/*
-	 * Sanity check: if the dentry has been unhashed and the
-	 * inode still has users, we could have problems ...
-	 */
-	if (list_empty(&dentry->d_hash) && dentry->d_inode) {
-		struct inode *inode = dentry->d_inode;
-		int max_count = (S_ISDIR(inode->i_mode) ? 1 : inode->i_nlink);
-		if (inode->i_count > max_count) {
-printk("nfs_dentry_delete: %s/%s: ino=%ld, count=%d, nlink=%d\n",
-dentry->d_parent->d_name.name, dentry->d_name.name,
-inode->i_ino, inode->i_count, inode->i_nlink);
-		}
-	}
-#endif
 }
 
 static kmem_cache_t *nfs_fh_cachep;
@@ -750,14 +735,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry)
 		error = -EACCES;
 		inode = nfs_fhget(dentry, &fhandle, &fattr);
 		if (inode) {
-#ifdef NFS_PARANOIA
-if (inode->i_count > (S_ISDIR(inode->i_mode) ? 1 : inode->i_nlink)) {
-printk("nfs_lookup: %s/%s ino=%ld in use, count=%d, nlink=%d\n",
-dentry->d_parent->d_name.name, dentry->d_name.name,
-inode->i_ino, inode->i_count, inode->i_nlink);
-show_dentry(&inode->i_dentry);
-}
-#endif
 	    no_entry:
 			d_add(dentry, inode);
 			nfs_renew_times(dentry);
@@ -779,14 +756,6 @@ static int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
 
 	inode = nfs_fhget(dentry, fhandle, fattr);
 	if (inode) {
-#ifdef NFS_PARANOIA
-if (inode->i_count > (S_ISDIR(inode->i_mode) ? 1 : inode->i_nlink)) {
-printk("nfs_instantiate: %s/%s ino=%ld in use, count=%d, nlink=%d\n",
-dentry->d_parent->d_name.name, dentry->d_name.name,
-inode->i_ino, inode->i_count, inode->i_nlink);
-show_dentry(&inode->i_dentry);
-}
-#endif
 		d_instantiate(dentry, inode);
 		nfs_renew_times(dentry);
 		error = 0;
@@ -803,16 +772,15 @@ show_dentry(&inode->i_dentry);
 static int nfs_create(struct inode *dir, struct dentry *dentry, int mode)
 {
 	int error;
-	struct nfs_sattr sattr;
+	struct iattr attr;
 	struct nfs_fattr fattr;
 	struct nfs_fh fhandle;
 
 	dfprintk(VFS, "NFS: create(%x/%ld, %s\n",
 		dir->i_dev, dir->i_ino, dentry->d_name.name);
 
-	sattr.mode = mode;
-	sattr.uid = sattr.gid = sattr.size = (unsigned) -1;
-	sattr.atime.seconds = sattr.mtime.seconds = (unsigned) -1;
+	attr.ia_mode = mode;
+	attr.ia_valid = ATTR_MODE;
 
 	/*
 	 * Invalidate the dir cache before the operation to avoid a race.
@@ -820,7 +788,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode)
 	invalidate_inode_pages(dir);
 	nfs_flush_dircache(dir);
 	error = nfs_proc_create(NFS_SERVER(dir), NFS_FH(dentry->d_parent),
-			dentry->d_name.name, &sattr, &fhandle, &fattr);
+			dentry->d_name.name, &attr, &fhandle, &fattr);
 	if (!error)
 		error = nfs_instantiate(dentry, &fhandle, &fattr);
 	if (error)
@@ -834,23 +802,25 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode)
 static int nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, int rdev)
 {
 	int error;
-	struct nfs_sattr sattr;
+	struct iattr attr;
 	struct nfs_fattr fattr;
 	struct nfs_fh fhandle;
 
 	dfprintk(VFS, "NFS: mknod(%x/%ld, %s\n",
 		dir->i_dev, dir->i_ino, dentry->d_name.name);
 
-	sattr.mode = mode;
-	sattr.uid = sattr.gid = sattr.size = (unsigned) -1;
-	if (S_ISCHR(mode) || S_ISBLK(mode))
-		sattr.size = rdev; /* get out your barf bag */
-	sattr.atime.seconds = sattr.mtime.seconds = (unsigned) -1;
+	attr.ia_mode = mode;
+	attr.ia_valid = ATTR_MODE;
+	/* FIXME: move this to a special nfs_proc_mknod() */
+	if (S_ISCHR(mode) || S_ISBLK(mode)) {
+		attr.ia_size = rdev; /* get out your barf bag */
+		attr.ia_valid |= ATTR_SIZE;
+	}
 
 	invalidate_inode_pages(dir);
 	nfs_flush_dircache(dir);
 	error = nfs_proc_create(NFS_SERVER(dir), NFS_FH(dentry->d_parent),
-				dentry->d_name.name, &sattr, &fhandle, &fattr);
+				dentry->d_name.name, &attr, &fhandle, &fattr);
 	if (!error)
 		error = nfs_instantiate(dentry, &fhandle, &fattr);
 	if (error)
@@ -864,16 +834,15 @@ static int nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, int rde
 static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	int error;
-	struct nfs_sattr sattr;
+	struct iattr attr;
 	struct nfs_fattr fattr;
 	struct nfs_fh fhandle;
 
 	dfprintk(VFS, "NFS: mkdir(%x/%ld, %s\n",
 		dir->i_dev, dir->i_ino, dentry->d_name.name);
 
-	sattr.mode = mode | S_IFDIR;
-	sattr.uid = sattr.gid = sattr.size = (unsigned) -1;
-	sattr.atime.seconds = sattr.mtime.seconds = (unsigned) -1;
+	attr.ia_valid = ATTR_MODE;
+	attr.ia_mode = mode | S_IFDIR;
 
 	/*
 	 * Always drop the dentry, we can't always depend on
@@ -885,7 +854,7 @@ static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	invalidate_inode_pages(dir);
 	nfs_flush_dircache(dir);
 	error = nfs_proc_mkdir(NFS_DSERVER(dentry), NFS_FH(dentry->d_parent),
-				dentry->d_name.name, &sattr, &fhandle, &fattr);
+				dentry->d_name.name, &attr, &fhandle, &fattr);
 	if (!error)
 		dir->i_nlink++;
 	return error;
@@ -898,13 +867,6 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 	dfprintk(VFS, "NFS: rmdir(%x/%ld, %s\n",
 		dir->i_dev, dir->i_ino, dentry->d_name.name);
 
-#ifdef NFS_PARANOIA
-if (dentry->d_inode->i_count > 1)
-printk("nfs_rmdir: %s/%s inode busy?? i_count=%d, i_nlink=%d\n",
-dentry->d_parent->d_name.name, dentry->d_name.name,
-dentry->d_inode->i_count, dentry->d_inode->i_nlink);
-#endif
-
 	invalidate_inode_pages(dir);
 	nfs_flush_dircache(dir);
 	error = nfs_proc_rmdir(NFS_SERVER(dir), NFS_FH(dentry->d_parent),
@@ -1082,12 +1044,6 @@ dentry->d_parent->d_name.name, dentry->d_name.name, dentry->d_count);
 #endif
 		goto out;
 	}
-#ifdef NFS_PARANOIA
-if (inode && inode->i_count > inode->i_nlink)
-printk("nfs_safe_remove: %s/%s inode busy?? i_count=%d, i_nlink=%d\n",
-dentry->d_parent->d_name.name, dentry->d_name.name,
-inode->i_count, inode->i_nlink);
-#endif
 	/*
 	 * Unhash the dentry while we remove the file ...
 	 */
@@ -1141,7 +1097,7 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
 static int
 nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 {
-	struct nfs_sattr sattr;
+	struct iattr attr;
 	int error;
 
 	dfprintk(VFS, "NFS: symlink(%x/%ld, %s, %s)\n",
@@ -1160,9 +1116,8 @@ dentry->d_parent->d_name.name, dentry->d_name.name);
 	 * Fill in the sattr for the call.
  	 * Note: SunOS 4.1.2 crashes if the mode isn't initialized!
 	 */
-	sattr.mode = S_IFLNK | S_IRWXUGO;
-	sattr.uid = sattr.gid = sattr.size = (unsigned) -1;
-	sattr.atime.seconds = sattr.mtime.seconds = (unsigned) -1;
+	attr.ia_valid = ATTR_MODE;
+	attr.ia_mode = S_IFLNK | S_IRWXUGO;
 
 	/*
 	 * Drop the dentry in advance to force a new lookup.
@@ -1173,7 +1128,7 @@ dentry->d_parent->d_name.name, dentry->d_name.name);
 	invalidate_inode_pages(dir);
 	nfs_flush_dircache(dir);
 	error = nfs_proc_symlink(NFS_SERVER(dir), NFS_FH(dentry->d_parent),
-				dentry->d_name.name, symname, &sattr);
+				dentry->d_name.name, symname, &attr);
 	if (!error) {
 		nfs_renew_times(dentry->d_parent);
 	} else if (error == -EEXIST) {
@@ -1332,13 +1287,6 @@ do_rename:
 	 * To prevent any new references to the target during the rename,
 	 * we unhash the dentry and free the inode in advance.
 	 */
-#ifdef NFS_PARANOIA
-if (new_inode && 
-    new_inode->i_count > (S_ISDIR(new_inode->i_mode) ? 1 : new_inode->i_nlink))
-printk("nfs_rename: %s/%s inode busy?? i_count=%d, i_nlink=%d\n",
-new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
-new_inode->i_count, new_inode->i_nlink);
-#endif
 	if (!list_empty(&new_dentry->d_hash)) {
 		d_drop(new_dentry);
 		rehash = update;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 5421cebf9..ab1e51485 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -37,7 +37,7 @@
 #define NFS_PARANOIA 1
 
 static struct inode * __nfs_fhget(struct super_block *, struct nfs_fattr *);
-static void nfs_zap_caches(struct inode *);
+void nfs_zap_caches(struct inode *);
 static void nfs_invalidate_inode(struct inode *);
 
 static void nfs_read_inode(struct inode *);
@@ -78,6 +78,8 @@ nfs_read_inode(struct inode * inode)
 	inode->i_mode = 0;
 	inode->i_rdev = 0;
 	inode->i_op = NULL;
+	NFS_FILEID(inode) = 0;
+	NFS_FSID(inode) = 0;
 	NFS_CACHEINV(inode);
 	NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
 }
@@ -415,13 +417,15 @@ restart:
 		dprintk("nfs_free_dentries: found %s/%s, d_count=%d, hashed=%d\n",
 			dentry->d_parent->d_name.name, dentry->d_name.name,
 			dentry->d_count, !list_empty(&dentry->d_hash));
+		if (!list_empty(&dentry->d_subdirs))
+			shrink_dcache_parent(dentry);
 		if (!dentry->d_count) {
 			dget(dentry);
 			d_drop(dentry);
 			dput(dentry);
 			goto restart;
 		}
-		if (!list_empty(&dentry->d_hash))
+		if (list_empty(&dentry->d_hash))
 			unhashed++;
 	}
 	return unhashed;
@@ -430,7 +434,7 @@ restart:
 /*
  * Invalidate the local caches
  */
-static void
+void
 nfs_zap_caches(struct inode *inode)
 {
 	NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
@@ -466,6 +470,8 @@ nfs_fill_inode(struct inode *inode, struct nfs_fattr *fattr)
 	 * do this once. (We don't allow inodes to change types.)
 	 */
 	if (inode->i_mode == 0) {
+		NFS_FILEID(inode) = fattr->fileid;
+		NFS_FSID(inode) = fattr->fsid;
 		inode->i_mode = fattr->mode;
 		if (S_ISREG(inode->i_mode))
 			inode->i_op = &nfs_file_inode_operations;
@@ -487,6 +493,54 @@ nfs_fill_inode(struct inode *inode, struct nfs_fattr *fattr)
 }
 
 /*
+ * In NFSv3 we can have 64-bit inode numbers. In order to support
+ * this, and re-exported directories (also seen in NFSv2), we are
+ * forced to allow two different inodes to have the same i_ino; this
+ * actor tells them apart by matching fsid and fileid from the fattr.
+ */
+static int
+nfs_find_actor(struct inode *inode, unsigned long ino, void *opaque)
+{
+	struct nfs_fattr *fattr = (struct nfs_fattr *)opaque;
+	if (NFS_FSID(inode) != fattr->fsid)
+		return 0;
+	if (NFS_FILEID(inode) != fattr->fileid)
+		return 0;
+	return 1;
+}
+
+static int
+nfs_inode_is_stale(struct inode *inode, struct nfs_fattr *fattr)
+{
+	int unhashed;
+	int is_stale = 0;
+
+	if (inode->i_mode &&
+	    (fattr->mode & S_IFMT) != (inode->i_mode & S_IFMT))
+		is_stale = 1;
+
+	if (is_bad_inode(inode))
+		is_stale = 1;
+
+	/* Free up unused cached dentries whether or not the inode
+	 * seemed stale; the number left unhashed is tested below.
+	 */
+	unhashed = nfs_free_dentries(inode);
+
+	/* Assume we're holding an i_count
+	 *
+	 * NB: sockets sometimes have volatile file handles
+	 *     don't invalidate their inodes even if all dentries are
+	 *     unhashed.
+	 */
+	if (unhashed && inode->i_count == unhashed + 1
+	    && !S_ISSOCK(inode->i_mode) && !S_ISFIFO(inode->i_mode))
+		is_stale = 1;
+
+	return is_stale;
+}
+
+/*
  * This is our own version of iget that looks up inodes by file handle
  * instead of inode number.  We use this technique instead of using
  * the vfs read_inode function because there is no way to pass the
@@ -545,54 +599,40 @@ nfs_fhget(struct dentry *dentry, struct nfs_fh *fhandle,
 static struct inode *
 __nfs_fhget(struct super_block *sb, struct nfs_fattr *fattr)
 {
-	struct inode *inode;
-	int max_count, stale_inode, unhashed = 0;
+	struct inode *inode = NULL;
+	unsigned long ino;
 
-retry:
-	inode = iget(sb, fattr->fileid);
-	if (!inode)
+	if (!fattr->nlink) {
+		printk("NFS: Buggy server - nlink == 0!\n");
 		goto out_no_inode;
-	/* N.B. This should be impossible ... */
-	if (inode->i_ino != fattr->fileid)
-		goto out_bad_id;
+	}
 
-	/*
-	 * Check for busy inodes, and attempt to get rid of any
-	 * unused local references. If successful, we release the
-	 * inode and try again.
-	 *
-	 * Note that the busy test uses the values in the fattr,
-	 * as the inode may have become a different object.
-	 * (We can probably handle modes changes here, too.)
-	 */
-	stale_inode = inode->i_mode &&
-		      ((fattr->mode ^ inode->i_mode) & S_IFMT);
-	stale_inode |= inode->i_count && inode->i_count == unhashed;
-	max_count = S_ISDIR(fattr->mode) ? 1 : fattr->nlink;
-	if (stale_inode || inode->i_count > max_count + unhashed) {
-		dprintk("__nfs_fhget: inode %ld busy, i_count=%d, i_nlink=%d\n",
-			inode->i_ino, inode->i_count, inode->i_nlink);
-		unhashed = nfs_free_dentries(inode);
-		if (stale_inode || inode->i_count > max_count + unhashed) {
-			printk("__nfs_fhget: inode %ld still busy, i_count=%d\n",
-				inode->i_ino, inode->i_count);
-			if (!list_empty(&inode->i_dentry)) {
-				struct dentry *dentry;
-				dentry = list_entry(inode->i_dentry.next,
-						 struct dentry, d_alias);
-				printk("__nfs_fhget: killing %s/%s filehandle\n",
-					dentry->d_parent->d_name.name,
-					dentry->d_name.name);
-				memset(dentry->d_fsdata, 0,
-					sizeof(struct nfs_fh));
-			}
-			remove_inode_hash(inode);
-			nfs_invalidate_inode(inode);
-			unhashed = 0;
-		}
+	ino = fattr->fileid;
+
+	while((inode = iget4(sb, ino, nfs_find_actor, fattr)) != NULL) {
+
+		/*
+		 * Check for busy inodes, and attempt to get rid of any
+		 * unused local references. If successful, we release the
+		 * inode and try again.
+		 *
+		 * Note that the busy test uses the values in the fattr,
+		 * as the inode may have become a different object.
+		 * (We can probably handle modes changes here, too.)
+		 */
+		if (!nfs_inode_is_stale(inode,fattr))
+			break;
+
+		dprintk("__nfs_fhget: inode %ld still busy, i_count=%d\n",
+		       inode->i_ino, inode->i_count);
+		nfs_zap_caches(inode);
+		remove_inode_hash(inode);
 		iput(inode);
-		goto retry;
 	}
+
+	if (!inode)
+		goto out_no_inode;
+
 	nfs_fill_inode(inode, fattr);
 	dprintk("NFS: __nfs_fhget(%x/%ld ct=%d)\n",
 		inode->i_dev, inode->i_ino, inode->i_count);
@@ -603,18 +643,14 @@ out:
 out_no_inode:
 	printk("__nfs_fhget: iget failed\n");
 	goto out;
-out_bad_id:
-	printk("__nfs_fhget: unexpected inode from iget\n");
-	goto out;
 }
 
 int
 nfs_notify_change(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
-	int error;
-	struct nfs_sattr sattr;
 	struct nfs_fattr fattr;
+	int error;
 
 	/*
 	 * Make sure the inode is up-to-date.
@@ -627,54 +663,29 @@ printk("nfs_notify_change: revalidate failed, error=%d\n", error);
 		goto out;
 	}
 
-	sattr.mode = (u32) -1;
-	if (attr->ia_valid & ATTR_MODE) 
-		sattr.mode = attr->ia_mode;
-
-	sattr.uid = (u32) -1;
-	if (attr->ia_valid & ATTR_UID)
-		sattr.uid = attr->ia_uid;
-
-	sattr.gid = (u32) -1;
-	if (attr->ia_valid & ATTR_GID)
-		sattr.gid = attr->ia_gid;
-
-	sattr.size = (u32) -1;
-	if ((attr->ia_valid & ATTR_SIZE) && S_ISREG(inode->i_mode))
-		sattr.size = attr->ia_size;
-
-	sattr.mtime.seconds = sattr.mtime.useconds = (u32) -1;
-	if (attr->ia_valid & ATTR_MTIME) {
-		sattr.mtime.seconds = attr->ia_mtime;
-		sattr.mtime.useconds = 0;
-	}
-
-	sattr.atime.seconds = sattr.atime.useconds = (u32) -1;
-	if (attr->ia_valid & ATTR_ATIME) {
-		sattr.atime.seconds = attr->ia_atime;
-		sattr.atime.useconds = 0;
-	}
+	if (!S_ISREG(inode->i_mode))
+		attr->ia_valid &= ~ATTR_SIZE;
 
 	error = nfs_wb_all(inode);
 	if (error)
 		goto out;
 
 	error = nfs_proc_setattr(NFS_DSERVER(dentry), NFS_FH(dentry),
-				&sattr, &fattr);
+				&fattr, attr);
 	if (error)
 		goto out;
 	/*
 	 * If we changed the size or mtime, update the inode
 	 * now to avoid invalidating the page cache.
 	 */
-	if (sattr.size != (u32) -1) {
-		if (sattr.size != fattr.size)
-			printk("nfs_notify_change: sattr=%d, fattr=%d??\n",
-				sattr.size, fattr.size);
-		inode->i_size  = sattr.size;
+	if (attr->ia_valid & ATTR_SIZE) {
+		if (attr->ia_size != fattr.size)
+			printk("nfs_notify_change: attr=%ld, fattr=%d??\n",
+				attr->ia_size, fattr.size);
+		inode->i_size  = attr->ia_size;
 		inode->i_mtime = fattr.mtime.seconds;
 	}
-	if (sattr.mtime.seconds != (u32) -1)
+	if (attr->ia_valid & ATTR_MTIME)
 		inode->i_mtime = fattr.mtime.seconds;
 	error = nfs_refresh_inode(inode, &fattr);
 out:
@@ -682,6 +693,34 @@ out:
 }
 
 /*
+ * Wait until `flag' (NFS_INO_LOCKED or NFS_INO_REVALIDATING) is cleared.
+ * Returns 0, or -ERESTARTSYS if signalled on an interruptible mount.
+ */
+int
+nfs_wait_on_inode(struct inode *inode, int flag)
+{
+	struct task_struct	*tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+	int			intr, error = 0;
+
+	intr = NFS_SERVER(inode)->flags & NFS_MOUNT_INTR;
+	add_wait_queue(&inode->i_wait, &wait);
+	for (;;) {
+		set_task_state(tsk, (intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE));
+		error = 0;
+		if (!(NFS_FLAGS(inode) & flag))
+			break;
+		error = -ERESTARTSYS;
+		if (intr && signalled())
+			break;
+		schedule();
+	}
+	set_task_state(tsk, TASK_RUNNING);
+	remove_wait_queue(&inode->i_wait, &wait);
+	return error;
+}
+
+/*
  * Externally visible revalidation function
  */
 int
@@ -711,7 +750,7 @@ int nfs_release(struct inode *inode, struct file *filp)
  * the cached attributes have to be refreshed.
  */
 int
-_nfs_revalidate_inode(struct nfs_server *server, struct dentry *dentry)
+__nfs_revalidate_inode(struct nfs_server *server, struct dentry *dentry)
 {
 	struct inode	*inode = dentry->d_inode;
 	int		 status = 0;
@@ -720,6 +759,19 @@ _nfs_revalidate_inode(struct nfs_server *server, struct dentry *dentry)
 	dfprintk(PAGECACHE, "NFS: revalidating %s/%s, ino=%ld\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
 		inode->i_ino);
+
+	if (!inode || is_bad_inode(inode))
+		return -ESTALE;
+
+	while (NFS_REVALIDATING(inode)) {
+		status = nfs_wait_on_inode(inode, NFS_INO_REVALIDATING);
+		if (status < 0)
+			return status;
+		if (time_before(jiffies,NFS_READTIME(inode)+NFS_ATTRTIMEO(inode)))
+			return 0;
+	}
+	NFS_FLAGS(inode) |= NFS_INO_REVALIDATING;
+
 	status = nfs_proc_getattr(server, NFS_FH(dentry), &fattr);
 	if (status) {
 		int error;
@@ -759,6 +811,8 @@ _nfs_revalidate_inode(struct nfs_server *server, struct dentry *dentry)
 	dfprintk(PAGECACHE, "NFS: %s/%s revalidation complete\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 out:
+	NFS_FLAGS(inode) &= ~NFS_INO_REVALIDATING;
+	wake_up(&inode->i_wait);
 	return status;
 }
 
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 1bc7d3d37..a7e53e6db 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -118,19 +118,35 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
 	return p;
 }
 
+
+#define SATTR(p, attr, flag, field) \
+        *p++ = (attr->ia_valid & flag) ? htonl(attr->field) : ~(u32) 0
 static inline u32 *
-xdr_encode_sattr(u32 *p, struct nfs_sattr *sattr)
+xdr_encode_sattr(u32 *p, struct iattr *attr)
 {
-	*p++ = htonl(sattr->mode);
-	*p++ = htonl(sattr->uid);
-	*p++ = htonl(sattr->gid);
-	*p++ = htonl(sattr->size);
-	*p++ = htonl(sattr->atime.seconds);
-	*p++ = htonl(sattr->atime.useconds);
-	*p++ = htonl(sattr->mtime.seconds);
-	*p++ = htonl(sattr->mtime.useconds);
-	return p;
+	SATTR(p, attr, ATTR_MODE, ia_mode);
+	SATTR(p, attr, ATTR_UID, ia_uid);
+	SATTR(p, attr, ATTR_GID, ia_gid);
+	SATTR(p, attr, ATTR_SIZE, ia_size);
+
+	if (attr->ia_valid & (ATTR_ATIME|ATTR_ATIME_SET)) {
+		*p++ = htonl(attr->ia_atime);
+		*p++ = 0;
+	} else {
+		*p++ = ~(u32) 0;
+		*p++ = ~(u32) 0;
+	}
+
+	if (attr->ia_valid & (ATTR_MTIME|ATTR_MTIME_SET)) {
+		*p++ = htonl(attr->ia_mtime);
+		*p++ = 0;
+	} else {
+		*p++ = ~(u32) 0;	
+		*p++ = ~(u32) 0;
+	}
+  	return p;
 }
+#undef SATTR
 
 /*
  * NFS encode functions
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 3b48b326a..bb55ce6d6 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -65,7 +65,7 @@ nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
 
 int
 nfs_proc_setattr(struct nfs_server *server, struct nfs_fh *fhandle,
-			struct nfs_sattr *sattr, struct nfs_fattr *fattr)
+			struct nfs_fattr *fattr, struct iattr *sattr)
 {
 	struct nfs_sattrargs	arg = { fhandle, sattr };
 	int	status;
@@ -123,7 +123,7 @@ nfs_proc_write(struct nfs_server *server, struct nfs_fh *fhandle, int swap,
 
 int
 nfs_proc_create(struct nfs_server *server, struct nfs_fh *dir,
-			const char *name, struct nfs_sattr *sattr,
+			const char *name, struct iattr *sattr,
 			struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
 	struct nfs_createargs	arg = { dir, name, sattr };
@@ -178,7 +178,7 @@ nfs_proc_link(struct nfs_server *server, struct nfs_fh *fhandle,
 int
 nfs_proc_symlink(struct nfs_server *server, struct nfs_fh *dir,
 			const char *name, const char *path,
-			struct nfs_sattr *sattr)
+			struct iattr *sattr)
 {
 	struct nfs_symlinkargs	arg = { dir, name, path, sattr };
 	int			status;
@@ -191,7 +191,7 @@ nfs_proc_symlink(struct nfs_server *server, struct nfs_fh *dir,
 
 int
 nfs_proc_mkdir(struct nfs_server *server, struct nfs_fh *dir,
-			const char *name, struct nfs_sattr *sattr,
+			const char *name, struct iattr *sattr,
 			struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
 	struct nfs_createargs	arg = { dir, name, sattr };
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 6cd892740..6b0d0f05b 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -59,8 +59,7 @@ struct inode_operations nfs_symlink_inode_operations = {
 static struct page *try_to_get_symlink_page(struct dentry *dentry, struct inode *inode)
 {
 	struct nfs_readlinkargs rl_args;
-	struct page *page, **hash;
-	unsigned long page_cache;
+	struct page *page, **hash, *page_cache;
 
 	page = NULL;
 	page_cache = page_cache_alloc();
@@ -75,7 +74,7 @@ repeat:
 		goto unlock_out;
 	}
 
-	page = page_cache_entry(page_cache);
+	page = page_cache;
 	if (add_to_page_cache_unique(page, inode, 0, hash)) {
 		page_cache_release(page);
 		goto repeat;
@@ -86,7 +85,7 @@ repeat:
 	 * XDR response verification will NULL terminate it.
 	 */
 	rl_args.fh = NFS_FH(dentry);
-	rl_args.buffer = (const void *)page_cache;
+	rl_args.buffer = (const void *)page_address(page_cache);
 	if (rpc_call(NFS_CLIENT(inode), NFSPROC_READLINK,
 		     &rl_args, NULL, 0) < 0)
 		goto error;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d7f8ad9dd..249abd8cd 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -38,6 +38,7 @@
  *
  * aeb@cwi.nl        :  /proc/partitions
  *
+ *
  * Alan Cox	     :  security fixes.
  *			<Alan.Cox@linux.org>
  *
@@ -45,11 +46,6 @@
  *
  * Gerhard Wichert   :  added BIGMEM support
  * Siemens AG           <Gerhard.Wichert@pdb.siemens.de>
- *
- * Chuck Lever       :  safe handling of task_struct
- *                      <cel@monkey.org>
- *
- * Andrea Arcangeli  :	SMP race/security fixes.
  */
 
 #include <linux/types.h>
@@ -71,7 +67,6 @@
 #include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/signal.h>
-#include <linux/smp_lock.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -365,16 +360,24 @@ static int get_meminfo(char * buffer)
 	struct sysinfo i;
 	int len;
 
+/*
+ * Convert a page count to kilobytes for display.
+ */
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+
 	si_meminfo(&i);
 	si_swapinfo(&i);
 	len = sprintf(buffer, "        total:    used:    free:  shared: buffers:  cached:\n"
-		"Mem:  %8lu %8lu %8lu %8lu %8lu %8lu\n"
+		"Mem:  %8lu %8lu %8lu %8lu %8lu %8u\n"
 		"Swap: %8lu %8lu %8lu\n",
-		i.totalram, i.totalram-i.freeram, i.freeram, i.sharedram, i.bufferram, (unsigned long) atomic_read(&page_cache_size)*PAGE_SIZE,
-		i.totalswap, i.totalswap-i.freeswap, i.freeswap);
+		K(i.totalram), K(i.totalram-i.freeram), K(i.freeram),
+		K(i.sharedram), K(i.bufferram),
+		K(atomic_read(&page_cache_size)), K(i.totalswap),
+		K(i.totalswap-i.freeswap), K(i.freeswap));
 	/*
-	 * Tagged format, for easy grepping and expansion. The above will go away
-	 * eventually, once the tools have been updated.
+	 * Tagged format, for easy grepping and expansion.
+	 * The above will go away eventually, once the tools
+	 * have been updated.
 	 */
 	return len + sprintf(buffer+len,
 		"MemTotal:  %8lu kB\n"
@@ -382,19 +385,20 @@ static int get_meminfo(char * buffer)
 		"MemShared: %8lu kB\n"
 		"Buffers:   %8lu kB\n"
 		"Cached:    %8u kB\n"
-		"BigTotal:  %8lu kB\n"
-		"BigFree:   %8lu kB\n"
+		"HighTotal: %8lu kB\n"
+		"HighFree:  %8lu kB\n"
 		"SwapTotal: %8lu kB\n"
 		"SwapFree:  %8lu kB\n",
-		i.totalram >> 10,
-		i.freeram >> 10,
-		i.sharedram >> 10,
-		i.bufferram >> 10,
-		atomic_read(&page_cache_size) << (PAGE_SHIFT - 10),
-		i.totalbig >> 10,
-		i.freebig >> 10,
-		i.totalswap >> 10,
-		i.freeswap >> 10);
+		K(i.totalram),
+		K(i.freeram),
+		K(i.sharedram),
+		K(i.bufferram),
+		K(atomic_read(&page_cache_size)),
+		K(i.totalhigh),
+		K(i.freehigh),
+		K(i.totalswap),
+		K(i.freeswap));
+#undef K
 }
 
 static int get_version(char * buffer)
@@ -412,69 +416,68 @@ static int get_cmdline(char * buffer)
 	return sprintf(buffer, "%s\n", saved_command_line);
 }
 
-static unsigned long get_phys_addr(struct mm_struct * mm, unsigned long ptr)
+static struct page * get_phys_addr(struct mm_struct * mm, unsigned long ptr)
 {
-	pgd_t *page_dir;
-	pmd_t *page_middle;
+	pgd_t *pgd;
+	pmd_t *pmd;
 	pte_t pte;
 
 	if (ptr >= TASK_SIZE)
 		return 0;
-	page_dir = pgd_offset(mm,ptr);
-	if (pgd_none(*page_dir))
+	pgd = pgd_offset(mm,ptr);
+	if (pgd_none(*pgd))
 		return 0;
-	if (pgd_bad(*page_dir)) {
-		printk("bad page directory entry %08lx\n", pgd_val(*page_dir));
-		pgd_clear(page_dir);
+	if (pgd_bad(*pgd)) {
+		pgd_ERROR(*pgd);
+		pgd_clear(pgd);
 		return 0;
 	}
-	page_middle = pmd_offset(page_dir,ptr);
-	if (pmd_none(*page_middle))
+	pmd = pmd_offset(pgd,ptr);
+	if (pmd_none(*pmd))
 		return 0;
-	if (pmd_bad(*page_middle)) {
-		printk("bad page middle entry %08lx\n", pmd_val(*page_middle));
-		pmd_clear(page_middle);
+	if (pmd_bad(*pmd)) {
+		pmd_ERROR(*pmd);
+		pmd_clear(pmd);
 		return 0;
 	}
-	pte = *pte_offset(page_middle,ptr);
+	pte = *pte_offset(pmd,ptr);
 	if (!pte_present(pte))
 		return 0;
-	return pte_page(pte) + (ptr & ~PAGE_MASK);
+	return pte_page(pte);
 }
 
-#include <linux/bigmem.h>
-
 static int get_array(struct mm_struct *mm, unsigned long start, unsigned long end, char * buffer)
 {
-	unsigned long addr;
+	struct page *page;
+	unsigned long kaddr;
 	int size = 0, result = 0;
 	char c;
 
 	if (start >= end)
 		return result;
 	for (;;) {
-		addr = get_phys_addr(mm, start);
-		if (!addr)
+		page = get_phys_addr(mm, start);
+		if (!page)
 			return result;
-		addr = kmap(addr, KM_READ);
+		kaddr = kmap(page, KM_READ) + (start & ~PAGE_MASK);
 		do {
-			c = *(char *) addr;
+			c = *(char *) kaddr;
 			if (!c)
 				result = size;
 			if (size < PAGE_SIZE)
 				buffer[size++] = c;
 			else {
-				kunmap(addr, KM_READ);
+				kunmap(kaddr, KM_READ);
 				return result;
 			}
-			addr++;
+			kaddr++;
 			start++;
 			if (!c && start >= end) {
-				kunmap(addr, KM_READ);
+				kunmap(kaddr, KM_READ);
 				return result;
 			}
-		} while (addr & ~PAGE_MASK);
-		kunmap(addr-1, KM_READ);
+		} while (kaddr & ~PAGE_MASK);
+		kunmap(kaddr, KM_READ);
 	}
 	return result;
 }
@@ -483,9 +486,7 @@ static struct mm_struct *get_mm(int pid)
 {
 	struct task_struct *p;
 	struct mm_struct *mm = NULL;
-
-	/* need kernel lock to avoid the tsk->mm to go away under us */
-	lock_kernel();
+	
 	read_lock(&tasklist_lock);
 	p = find_task_by_pid(pid);
 	if (p)
@@ -493,10 +494,10 @@ static struct mm_struct *get_mm(int pid)
 	if (mm)
 		atomic_inc(&mm->mm_users);
 	read_unlock(&tasklist_lock);
-	unlock_kernel();
 	return mm;
 }
 
+
 static int get_env(int pid, char * buffer)
 {
 	struct mm_struct *mm = get_mm(pid);
@@ -859,9 +860,6 @@ static inline char * task_mem(struct mm_struct *mm, char *buffer)
 	return buffer;
 }
 
-/*
- * These next two assume that the task's sigmask_lock is held by the caller.
- */
 static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
 				    sigset_t *catch)
 {
@@ -914,115 +912,77 @@ extern inline char *task_cap(struct task_struct *p, char *buffer)
 			    cap_t(p->cap_effective));
 }
 
-/*
- * This is somewhat safer than it was before.  However...
- *
- * Embedded pointers in the task structure may reference data that
- * can be changed or that is no longer valid after the tasklist
- * lock is released, or that isn't even protected by the tasklist
- * lock.  Eg. tsk->tty, tsk->sig, and tsk->p_pptr can change after
- * we make our own copy of the task structure.  This doesn't matter
- * unless we are trying to use the pointed-to data as an address.
- * So there are still a few safety issues to be addressed here.
- */
+
 static int get_status(int pid, char * buffer)
 {
 	char * orig = buffer;
 	struct task_struct *tsk;
 	struct mm_struct *mm = NULL;
 
-	/*
-	 * We lock the whole kernel here because p->files and p->mm are still
-	 * protected by the global kernel lock.
-	 */
-	lock_kernel();
-
 	read_lock(&tasklist_lock);
 	tsk = find_task_by_pid(pid);
-	if (tsk) {
+	if (tsk)
 		mm = tsk->mm;
-		if (mm)
-			atomic_inc(&mm->mm_users);
-
-		buffer = task_name(tsk, buffer);
-		buffer = task_state(tsk, buffer);
-
-		spin_lock_irq(&tsk->sigmask_lock);
-		buffer = task_sig(tsk, buffer);
-		spin_unlock_irq(&tsk->sigmask_lock);
-
-		buffer = task_cap(tsk, buffer);
-	}
-	read_unlock(&tasklist_lock);
-
-	unlock_kernel();
-
-	/*
-	 * We can't hold the tasklist_lock and jiggle the mmap_sem --
-	 * that can result in a deadlock.
-	 */
-	if (mm) {
+	if (mm)
+		atomic_inc(&mm->mm_users);
+	read_unlock(&tasklist_lock);	/* FIXME!! This should be done after the last use */
+	if (!tsk)
+		return 0;
+	buffer = task_name(tsk, buffer);
+	buffer = task_state(tsk, buffer);
+	if (mm)
 		buffer = task_mem(mm, buffer);
+	buffer = task_sig(tsk, buffer);
+	buffer = task_cap(tsk, buffer);
+	if (mm)
 		mmput(mm);
-	}
-
-	/*
-	 * (buffer - orig) will be zero on an error exit.
-	 */
 	return buffer - orig;
 }
 
 static int get_stat(int pid, char * buffer)
 {
 	struct task_struct *tsk;
-	struct mm_struct *mm;
+	struct mm_struct *mm = NULL;
 	unsigned long vsize, eip, esp, wchan;
 	long priority, nice;
-	pid_t ppid = 0;
+	int tty_pgrp;
 	sigset_t sigign, sigcatch;
 	char state;
-	int res = 0;
-	unsigned int tty_device;
-	int tty_pgrp;
+	int res;
 
 	read_lock(&tasklist_lock);
 	tsk = find_task_by_pid(pid);
-	if (!tsk)
-		goto out_unlock;
-	/* avoid the task list to go away under us (security) */
-	get_page(MAP_NR(tsk) + mem_map);
-	ppid = tsk->p_pptr->pid;
-	read_unlock(&tasklist_lock);
-
-	/* we need the big kernel lock to avoid tsk->mm and tsk->tty
-	   to change under us */
-	lock_kernel();
-	mm = tsk->mm;
+	if (tsk)
+		mm = tsk->mm;
 	if (mm)
 		atomic_inc(&mm->mm_users);
-	tty_device = tsk->tty ? kdev_t_to_nr(tsk->tty->device) : 0;
-	tty_pgrp = tsk->tty ? tsk->tty->pgrp : -1;
-	unlock_kernel();
-
-	spin_lock_irq(&tsk->sigmask_lock);
-	collect_sigign_sigcatch(tsk, &sigign, &sigcatch);
-	spin_unlock_irq(&tsk->sigmask_lock);
-
-	eip = KSTK_EIP(tsk);
-	esp = KSTK_ESP(tsk);
-	wchan = get_wchan(tsk);
-
+	read_unlock(&tasklist_lock);	/* FIXME!! This should be done after the last use */
+	if (!tsk)
+		return 0;
 	state = *get_task_state(tsk);
 	vsize = eip = esp = 0;
-	if (mm)
-	{
+	if (mm) {
 		struct vm_area_struct *vma;
 		down(&mm->mmap_sem);
-		for (vma = mm->mmap; vma; vma = vma->vm_next)
+		vma = mm->mmap;
+		while (vma) {
 			vsize += vma->vm_end - vma->vm_start;
+			vma = vma->vm_next;
+		}
+		eip = KSTK_EIP(tsk);
+		esp = KSTK_ESP(tsk);
 		up(&mm->mmap_sem);
 	}
 
+	wchan = get_wchan(tsk);
+
+	collect_sigign_sigcatch(tsk, &sigign, &sigcatch);
+
+	if (tsk->tty)
+		tty_pgrp = tsk->tty->pgrp;
+	else
+		tty_pgrp = -1;
+
 	/* scale priority and nice values from timeslices to -20..20 */
 	/* to make it look like a "normal" Unix priority/nice value  */
 	priority = tsk->counter;
@@ -1036,10 +996,10 @@ static int get_stat(int pid, char * buffer)
 		pid,
 		tsk->comm,
 		state,
-		ppid,
+		tsk->p_pptr->pid,
 		tsk->pgrp,
 		tsk->session,
-		tty_device,
+	        tsk->tty ? kdev_t_to_nr(tsk->tty->device) : 0,
 		tty_pgrp,
 		tsk->flags,
 		tsk->min_flt,
@@ -1076,16 +1036,9 @@ static int get_stat(int pid, char * buffer)
 		tsk->cnswap,
 		tsk->exit_signal,
 		tsk->processor);
-
 	if (mm)
 		mmput(mm);
-	free_task_struct(tsk);
 	return res;
-
-out_unlock:
-	read_unlock(&tasklist_lock);
-	unlock_kernel();
-	return 0;
 }
 		
 static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size,
@@ -1097,7 +1050,7 @@ static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned
 	if (pmd_none(*pmd))
 		return;
 	if (pmd_bad(*pmd)) {
-		printk("statm_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
+		pmd_ERROR(*pmd);
 		pmd_clear(pmd);
 		return;
 	}
@@ -1135,7 +1088,7 @@ static inline void statm_pmd_range(pgd_t * pgd, unsigned long address, unsigned
 	if (pgd_none(*pgd))
 		return;
 	if (pgd_bad(*pgd)) {
-		printk("statm_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
+		pgd_ERROR(*pgd);
 		pgd_clear(pgd);
 		return;
 	}
@@ -1233,11 +1186,11 @@ static ssize_t read_maps (int pid, struct file * file, char * buf,
 			  size_t count, loff_t *ppos)
 {
 	struct task_struct *p;
-	struct mm_struct *mm = NULL;
 	struct vm_area_struct * map, * next;
 	char * destptr = buf, * buffer;
 	loff_t lineno;
 	ssize_t column, i;
+	int volatile_task;
 	long retval;
 
 	/*
@@ -1249,30 +1202,24 @@ static ssize_t read_maps (int pid, struct file * file, char * buf,
 		goto out;
 
 	retval = -EINVAL;
-	lock_kernel();
 	read_lock(&tasklist_lock);
 	p = find_task_by_pid(pid);
-	if (p) {
-		mm = p->mm;
-		if (mm)
-			atomic_inc(&mm->mm_users);
-	}
-	read_unlock(&tasklist_lock);
-	unlock_kernel();
+	read_unlock(&tasklist_lock);	/* FIXME!! This should be done after the last use */
 	if (!p)
 		goto freepage_out;
 
-	/* nothing to map */
-	if (!mm || count == 0)
+	if (!p->mm || count == 0)
 		goto getlen_out;
 
+	/* Check whether the mmaps could change if we sleep */
+	volatile_task = (p != current || atomic_read(&p->mm->mm_users) > 1);
+
 	/* decode f_pos */
 	lineno = *ppos >> MAPS_LINE_SHIFT;
 	column = *ppos & (MAPS_LINE_LENGTH-1);
 
-	down(&mm->mmap_sem);
-	/* quickly go to line "lineno" */
-	for (map = mm->mmap, i = 0; map && (i < lineno); map = map->vm_next, i++)
+	/* quickly go to line lineno */
+	for (map = p->mm->mmap, i = 0; map && (i < lineno); map = map->vm_next, i++)
 		continue;
 
 	for ( ; map ; map = next ) {
@@ -1343,13 +1290,17 @@ static ssize_t read_maps (int pid, struct file * file, char * buf,
 		/* done? */
 		if (count == 0)
 			break;
+
+		/* By writing to user space, we might have slept.
+		 * Stop the loop, to avoid a race condition.
+		 */
+		if (volatile_task)
+			break;
 	}
-	up(&mm->mmap_sem);
 
 	/* encode f_pos */
 	*ppos = (lineno << MAPS_LINE_SHIFT) + column;
 
-	mmput(mm);
 getlen_out:
 	retval = destptr - buf;
 
@@ -1362,31 +1313,28 @@ out:
 #ifdef __SMP__
 static int get_pidcpu(int pid, char * buffer)
 {
-	struct task_struct * tsk;
+	struct task_struct * tsk = current ;
 	int i, len = 0;
 
-	/*
-	 * Hold the tasklist_lock to guarantee that the task_struct
-	 * address will remain valid while we examine its contents.
-	 */
 	read_lock(&tasklist_lock);
-	tsk = find_task_by_pid(pid);
-	if (tsk)
-		get_page(MAP_NR(tsk) + mem_map);
-	read_unlock(&tasklist_lock);
-	if (tsk) {
-		len = sprintf(buffer,
-			"cpu  %lu %lu\n",
-			HZ_TO_STD(tsk->times.tms_utime),
-			HZ_TO_STD(tsk->times.tms_stime));
-		
-		for (i = 0 ; i < smp_num_cpus; i++)
-			len += sprintf(buffer + len, "cpu%d %lu %lu\n",
-				i,
-				HZ_TO_STD(tsk->per_cpu_utime[cpu_logical_map(i)]),
-				HZ_TO_STD(tsk->per_cpu_stime[cpu_logical_map(i)]));
-		free_task_struct(tsk);
-	}
+	if (pid != tsk->pid)
+		tsk = find_task_by_pid(pid);
+	read_unlock(&tasklist_lock);	/* FIXME!! This should be done after the last use */
+
+	if (tsk == NULL)
+		return 0;
+
+	len = sprintf(buffer,
+		"cpu  %lu %lu\n",
+		HZ_TO_STD(tsk->times.tms_utime),
+		HZ_TO_STD(tsk->times.tms_stime));
+
+	for (i = 0 ; i < smp_num_cpus; i++)
+		len += sprintf(buffer + len, "cpu%d %lu %lu\n",
+			i,
+			HZ_TO_STD(tsk->per_cpu_utime[cpu_logical_map(i)]),
+			HZ_TO_STD(tsk->per_cpu_stime[cpu_logical_map(i)]));
+
 	return len;
 }
 #endif
@@ -1519,6 +1467,12 @@ static int process_unauthorized(int type, int pid)
 	int ok = 0;
 		
 	read_lock(&tasklist_lock);
+	
+	/*
+	 *	Grab the lock, find the task, save the uid and
+	 *	check it still has an mm (i.e. it's not dead)
+	 */
+	
 	p = find_task_by_pid(pid);
 	if (p) {
 		euid=p->euid;
@@ -1526,7 +1480,9 @@ static int process_unauthorized(int type, int pid)
 		if(!cap_issubset(p->cap_permitted, current->cap_permitted))
 			ok=0;			
 	}
+		
 	read_unlock(&tasklist_lock);
+
 	if (!p)
 		return 1;
 
diff --git a/fs/proc/mem.c b/fs/proc/mem.c
index f9fcb0970..90cd79722 100644
--- a/fs/proc/mem.c
+++ b/fs/proc/mem.c
@@ -10,7 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/proc_fs.h>
-#include <linux/bigmem.h>
+#include <linux/highmem.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -79,9 +79,10 @@ static ssize_t mem_read(struct file * file, char * buf,
 	pgd_t *page_dir;
 	pmd_t *page_middle;
 	pte_t pte;
-	char * page;
+	struct page * page;
 	struct task_struct * tsk;
 	unsigned long addr;
+	unsigned long maddr; /* temporary mapped address */
 	char *tmp;
 	ssize_t scount, i;
 
@@ -102,7 +103,7 @@ static ssize_t mem_read(struct file * file, char * buf,
 		if (pgd_none(*page_dir))
 			break;
 		if (pgd_bad(*page_dir)) {
-			printk("Bad page dir entry %08lx\n", pgd_val(*page_dir));
+			pgd_ERROR(*page_dir);
 			pgd_clear(page_dir);
 			break;
 		}
@@ -110,20 +111,20 @@ static ssize_t mem_read(struct file * file, char * buf,
 		if (pmd_none(*page_middle))
 			break;
 		if (pmd_bad(*page_middle)) {
-			printk("Bad page middle entry %08lx\n", pmd_val(*page_middle));
+			pmd_ERROR(*page_middle);
 			pmd_clear(page_middle);
 			break;
 		}
 		pte = *pte_offset(page_middle,addr);
 		if (!pte_present(pte))
 			break;
-		page = (char *) pte_page(pte) + (addr & ~PAGE_MASK);
+		page = pte_page(pte);
 		i = PAGE_SIZE-(addr & ~PAGE_MASK);
 		if (i > scount)
 			i = scount;
-		page = (char *) kmap((unsigned long) page, KM_READ);
-		copy_to_user(tmp, page, i);
-		kunmap((unsigned long) page, KM_READ);
+		maddr = kmap(page, KM_READ);
+		copy_to_user(tmp, (char *)maddr + (addr & ~PAGE_MASK), i);
+		kunmap(maddr, KM_READ);
 		addr += i;
 		tmp += i;
 		scount -= i;
@@ -141,9 +142,10 @@ static ssize_t mem_write(struct file * file, char * buf,
 	pgd_t *page_dir;
 	pmd_t *page_middle;
 	pte_t pte;
-	char * page;
+	struct page * page;
 	struct task_struct * tsk;
 	unsigned long addr;
+	unsigned long maddr; /* temporary mapped address */
 	char *tmp;
 	long i;
 
@@ -159,7 +161,7 @@ static ssize_t mem_write(struct file * file, char * buf,
 		if (pgd_none(*page_dir))
 			break;
 		if (pgd_bad(*page_dir)) {
-			printk("Bad page dir entry %08lx\n", pgd_val(*page_dir));
+			pgd_ERROR(*page_dir);
 			pgd_clear(page_dir);
 			break;
 		}
@@ -167,7 +169,7 @@ static ssize_t mem_write(struct file * file, char * buf,
 		if (pmd_none(*page_middle))
 			break;
 		if (pmd_bad(*page_middle)) {
-			printk("Bad page middle entry %08lx\n", pmd_val(*page_middle));
+			pmd_ERROR(*page_middle);
 			pmd_clear(page_middle);
 			break;
 		}
@@ -176,13 +178,13 @@ static ssize_t mem_write(struct file * file, char * buf,
 			break;
 		if (!pte_write(pte))
 			break;
-		page = (char *) pte_page(pte) + (addr & ~PAGE_MASK);
+		page = pte_page(pte);
 		i = PAGE_SIZE-(addr & ~PAGE_MASK);
 		if (i > count)
 			i = count;
-		page = (unsigned long) kmap((unsigned long) page, KM_WRITE);
-		copy_from_user(page, tmp, i);
-		kunmap((unsigned long) page, KM_WRITE);
+		maddr = kmap(page, KM_WRITE);
+		copy_from_user((char *)maddr + (addr & ~PAGE_MASK), tmp, i);
+		kunmap(maddr, KM_WRITE);
 		addr += i;
 		tmp += i;
 		count -= i;
@@ -248,14 +250,14 @@ int mem_mmap(struct file * file, struct vm_area_struct * vma)
 		if (pgd_none(*src_dir))
 			return -EINVAL;
 		if (pgd_bad(*src_dir)) {
-			printk("Bad source page dir entry %08lx\n", pgd_val(*src_dir));
+			pgd_ERROR(*src_dir);
 			return -EINVAL;
 		}
 		src_middle = pmd_offset(src_dir, stmp);
 		if (pmd_none(*src_middle))
 			return -EINVAL;
 		if (pmd_bad(*src_middle)) {
-			printk("Bad source page middle entry %08lx\n", pmd_val(*src_middle));
+			pmd_ERROR(*src_middle);
 			return -EINVAL;
 		}
 		src_table = pte_offset(src_middle, stmp);
@@ -301,9 +303,9 @@ int mem_mmap(struct file * file, struct vm_area_struct * vma)
 
 		set_pte(src_table, pte_mkdirty(*src_table));
 		set_pte(dest_table, *src_table);
-		mapnr = MAP_NR(pte_page(*src_table));
+		mapnr = pte_pagenr(*src_table);
 		if (mapnr < max_mapnr)
-			get_page(mem_map + MAP_NR(pte_page(*src_table)));
+			get_page(mem_map + pte_pagenr(*src_table));
 
 		stmp += PAGE_SIZE;
 		dtmp += PAGE_SIZE;
diff --git a/fs/super.c b/fs/super.c
index 693017eee..3b58d13cc 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -135,7 +135,7 @@ out:
 	return lptr;
 }
 
-static void remove_vfsmnt(kdev_t dev)
+void remove_vfsmnt(kdev_t dev)
 {
 	struct vfsmount *lptr, *tofree;
 
@@ -508,7 +508,7 @@ out:
 /*
  * Find a super_block with no device assigned.
  */
-static struct super_block *get_empty_super(void)
+struct super_block *get_empty_super(void)
 {
 	struct super_block *s;
author	Ralf Baechle <ralf@linux-mips.org>	2000-01-27 01:05:20 +0000
committer	Ralf Baechle <ralf@linux-mips.org>	2000-01-27 01:05:20 +0000
commit	546db14ee74118296f425f3b91634fb767d67290 (patch)
tree	22b613a3da8d4bf663eec5e155af01b87fdf9094 /fs
parent	1e25e41c4f5474e14452094492dbc169b800e4c8 (diff)