Diffstat (limited to 'kernel/sched.c')
 kernel/sched.c | 1492 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 1095 insertions(+), 397 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 93003dfc1..8f88f88a3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2,16 +2,17 @@
* linux/kernel/sched.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * 1996-04-21 Modified by Ulrich Windl to make NTP work
*/
/*
* 'sched.c' is the main kernel file. It contains scheduling primitives
* (sleep_on, wakeup, schedule etc) as well as a number of simple system
- * call functions (type getpid(), which just extracts a field from
+ * call functions (type getpid()), which just extract a field from
* current-task
*/
-#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/timer.h>
@@ -26,20 +27,23 @@
#include <linux/tqueue.h>
#include <linux/resource.h>
#include <linux/mm.h>
+#include <linux/smp.h>
#include <asm/system.h>
#include <asm/io.h>
-#include <asm/segment.h>
+#include <asm/uaccess.h>
#include <asm/pgtable.h>
-
-#define TIMER_IRQ 0
+#include <asm/mmu_context.h>
#include <linux/timex.h>
/*
* kernel variables
*/
-long tick = 1000000 / HZ; /* timer interrupt period */
+
+int securelevel = 0; /* system security level */
+
+long tick = (1000000 + HZ/2) / HZ; /* timer interrupt period */
volatile struct timeval xtime; /* The current time */
int tickadj = 500/HZ; /* microsecs */
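
A note on the rounded tick: (1000000 + HZ/2) / HZ is 1000000/HZ rounded to the nearest microsecond, and the time_freq initializer further down pre-loads the NTP frequency offset with the leftover remainder so the clock stays correct on average. Working the arithmetic for two common HZ values (a sketch; SHIFT_USEC scaling as in <linux/timex.h>):

    /* HZ = 100:  tick = (1000000 + 50)  / 100  = 10000 us/tick (exact)      */
    /* HZ = 1024: tick = (1000000 + 512) / 1024 = 977 us/tick = 1000448 us/s */
    /*            (1000000 + 512) % 1024 - 512  = -448, so time_freq slews   */
    /*            the extra 448 us/s back out                                */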
@@ -50,17 +54,19 @@ DECLARE_TASK_QUEUE(tq_scheduler);
/*
* phase-lock loop variables
*/
-int time_status = TIME_BAD; /* clock synchronization status */
-long time_offset = 0; /* time adjustment (us) */
-long time_constant = 0; /* pll time constant */
-long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
-long time_precision = 1; /* clock precision (us) */
-long time_maxerror = 0x70000000;/* maximum error */
-long time_esterror = 0x70000000;/* estimated error */
-long time_phase = 0; /* phase offset (scaled us) */
-long time_freq = 0; /* frequency offset (scaled ppm) */
-long time_adj = 0; /* tick adjust (scaled 1 / HZ) */
-long time_reftime = 0; /* time at last adjustment (s) */
+/* TIME_ERROR prevents overwriting the CMOS clock */
+int time_state = TIME_ERROR; /* clock synchronization status */
+int time_status = STA_UNSYNC; /* clock status bits */
+long time_offset = 0; /* time adjustment (us) */
+long time_constant = 2; /* pll time constant */
+long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
+long time_precision = 1; /* clock precision (us) */
+long time_maxerror = MAXPHASE; /* maximum error (us) */
+long time_esterror = MAXPHASE; /* estimated error (us) */
+long time_phase = 0; /* phase offset (scaled us) */
+long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC; /* frequency offset (scaled ppm) */
+long time_adj = 0; /* tick adjust (scaled 1 / HZ) */
+long time_reftime = 0; /* time at last adjustment (s) */
long time_adjust = 0;
long time_adjust_step = 0;
@@ -69,132 +75,354 @@ int need_resched = 0;
unsigned long event = 0;
extern int _setitimer(int, struct itimerval *, struct itimerval *);
-unsigned long * prof_buffer = NULL;
+unsigned int * prof_buffer = NULL;
unsigned long prof_len = 0;
+unsigned long prof_shift = 0;
#define _S(nr) (1<<((nr)-1))
extern void mem_use(void);
-extern int timer_interrupt(void);
-
+#ifdef __mips__
+unsigned long init_kernel_stack[2048] = { STACK_MAGIC, };
+unsigned long init_user_stack[2048] = { STACK_MAGIC, };
+#else
unsigned long init_kernel_stack[1024] = { STACK_MAGIC, };
unsigned long init_user_stack[1024] = { STACK_MAGIC, };
+#endif
static struct vm_area_struct init_mmap = INIT_MMAP;
+static struct fs_struct init_fs = INIT_FS;
+static struct files_struct init_files = INIT_FILES;
+static struct signal_struct init_signals = INIT_SIGNALS;
+
+struct mm_struct init_mm = INIT_MM;
struct task_struct init_task = INIT_TASK;
unsigned long volatile jiffies=0;
-struct task_struct *current = &init_task;
+struct task_struct *current_set[NR_CPUS];
struct task_struct *last_task_used_math = NULL;
struct task_struct * task[NR_TASKS] = {&init_task, };
struct kernel_stat kstat = { 0 };
-unsigned long itimer_ticks = 0;
-unsigned long itimer_next = ~0;
+static inline void add_to_runqueue(struct task_struct * p)
+{
+#ifdef __SMP__
+ int cpu=smp_processor_id();
+#endif
+#if 1 /* sanity tests */
+ if (p->next_run || p->prev_run) {
+ printk("task already on run-queue\n");
+ return;
+ }
+#endif
+ if (p->counter > current->counter + 3)
+ need_resched = 1;
+ nr_running++;
+ (p->prev_run = init_task.prev_run)->next_run = p;
+ p->next_run = &init_task;
+ init_task.prev_run = p;
+#ifdef __SMP__
+ /* this is safe only if called with cli()*/
+ while(set_bit(31,&smp_process_available))
+ {
+ while(test_bit(31,&smp_process_available))
+ {
+ if(clear_bit(cpu,&smp_invalidate_needed))
+ {
+ local_flush_tlb();
+ set_bit(cpu,&cpu_callin_map[0]);
+ }
+ }
+ }
+ smp_process_available++;
+ clear_bit(31,&smp_process_available);
+ if ((0!=p->pid) && smp_threads_ready)
+ {
+ int i;
+ for (i=0;i<smp_num_cpus;i++)
+ {
+ if (0==current_set[cpu_logical_map[i]]->pid)
+ {
+ smp_message_pass(cpu_logical_map[i], MSG_RESCHEDULE, 0L, 0);
+ break;
+ }
+ }
+ }
+#endif
+}
+
+static inline void del_from_runqueue(struct task_struct * p)
+{
+ struct task_struct *next = p->next_run;
+ struct task_struct *prev = p->prev_run;
+
+#if 1 /* sanity tests */
+ if (!next || !prev) {
+ printk("task not on run-queue\n");
+ return;
+ }
+#endif
+ if (p == &init_task) {
+ static int nr = 0;
+ if (nr < 5) {
+ nr++;
+ printk("idle task may not sleep\n");
+ }
+ return;
+ }
+ nr_running--;
+ next->prev_run = prev;
+ prev->next_run = next;
+ p->next_run = NULL;
+ p->prev_run = NULL;
+}
+
+static inline void move_last_runqueue(struct task_struct * p)
+{
+ struct task_struct *next = p->next_run;
+ struct task_struct *prev = p->prev_run;
+
+ /* remove from list */
+ next->prev_run = prev;
+ prev->next_run = next;
+ /* add back to list */
+ p->next_run = &init_task;
+ prev = init_task.prev_run;
+ init_task.prev_run = p;
+ p->prev_run = prev;
+ prev->next_run = p;
+}
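
For orientation, the three helpers above maintain one circular, doubly linked run queue threaded through next_run/prev_run, with init_task serving as both the anchor node and the idle task. A sketch of the invariant they preserve (not part of the patch):

    /*
     *   init_task <-> p1 <-> p2 <-> ... <-> pN <-> (back to init_task)
     *
     * Empty queue:  init_task.next_run == init_task.prev_run == &init_task
     * add_to_runqueue()  links p in just before init_task (at prev_run);
     * del_from_runqueue() unlinks p and NULLs its pointers, which is also
     * why "!p->next_run" doubles as an "is p queued?" test elsewhere.
     */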
+
+/*
+ * Wake up a process. Put it on the run-queue if it's not
+ * already there. The "current" process is always on the
+ * run-queue (except when the actual re-schedule is in
+ * progress), and as such you're allowed to do the simpler
+ * "current->state = TASK_RUNNING" to mark yourself runnable
+ * without the overhead of this.
+ */
+inline void wake_up_process(struct task_struct * p)
+{
+ unsigned long flags;
+
+ save_flags(flags);
+ cli();
+ p->state = TASK_RUNNING;
+ if (!p->next_run)
+ add_to_runqueue(p);
+ restore_flags(flags);
+}
+
+static void process_timeout(unsigned long __data)
+{
+ struct task_struct * p = (struct task_struct *) __data;
+
+ p->timeout = 0;
+ wake_up_process(p);
+}
+
+/*
+ * This is the function that decides how desirable a process is..
+ * You can weigh different processes against each other depending
+ * on what CPU they've run on lately etc to try to handle cache
+ * and TLB miss penalties.
+ *
+ * Return values:
+ * -1000: never select this
+ * 0: out of time, recalculate counters (but it might still be
+ * selected)
+ * +ve: "goodness" value (the larger, the better)
+ * +1000: realtime process, select this.
+ */
+static inline int goodness(struct task_struct * p, struct task_struct * prev, int this_cpu)
+{
+ int weight;
+
+#ifdef __SMP__
+ /* We are not permitted to run a task someone else is running */
+ if (p->processor != NO_PROC_ID)
+ return -1000;
+#ifdef PAST_2_0
+ /* This process is locked to a processor group */
+	if (p->processor_mask && !(p->processor_mask & (1<<this_cpu)))
+ return -1000;
+#endif
+#endif
+
+ /*
+ * Realtime process, select the first one on the
+ * runqueue (taking priorities within processes
+ * into account).
+ */
+ if (p->policy != SCHED_OTHER)
+ return 1000 + p->rt_priority;
+
+ /*
+ * Give the process a first-approximation goodness value
+ * according to the number of clock-ticks it has left.
+ *
+ * Don't do any other calculations if the time slice is
+ * over..
+ */
+ weight = p->counter;
+ if (weight) {
+
+#ifdef __SMP__
+ /* Give a largish advantage to the same processor... */
+ /* (this is equivalent to penalizing other processors) */
+ if (p->last_processor == this_cpu)
+ weight += PROC_CHANGE_PENALTY;
+#endif
+
+ /* .. and a slight advantage to the current process */
+ if (p == prev)
+ weight += 1;
+ }
+
+ return weight;
+}
/*
* 'schedule()' is the scheduler function. It's a very simple and nice
* scheduler: it's not perfect, but certainly works for most things.
- * The one thing you might take a look at is the signal-handler code here.
+ *
+ * The goto is "interesting".
*
* NOTE!! Task 0 is the 'idle' task, which gets called when no other
* tasks can run. It can not be killed, and it cannot sleep. The 'state'
* information in task[0] is never used.
- *
- * The "confuse_gcc" goto is used only to get better assembly code..
- * Dijkstra probably hates me.
*/
asmlinkage void schedule(void)
{
int c;
struct task_struct * p;
- struct task_struct * next;
- unsigned long ticks;
+ struct task_struct * prev, * next;
+ unsigned long timeout = 0;
+ int this_cpu=smp_processor_id();
/* check alarm, wake up any interruptible tasks that have got a signal */
- if (intr_count) {
- printk("Aiee: scheduling in interrupt\n");
+ if (intr_count)
+ goto scheduling_in_interrupt;
+
+ if (bh_active & bh_mask) {
+ intr_count = 1;
+ do_bottom_half();
intr_count = 0;
}
+
run_task_queue(&tq_scheduler);
- cli();
- ticks = itimer_ticks;
- itimer_ticks = 0;
- itimer_next = ~0;
- sti();
+
need_resched = 0;
- nr_running = 0;
- p = &init_task;
- for (;;) {
- if ((p = p->next_task) == &init_task)
- goto confuse_gcc1;
- if (ticks && p->it_real_value) {
- if (p->it_real_value <= ticks) {
- send_sig(SIGALRM, p, 1);
- if (!p->it_real_incr) {
- p->it_real_value = 0;
- goto end_itimer;
- }
- do {
- p->it_real_value += p->it_real_incr;
- } while (p->it_real_value <= ticks);
+ prev = current;
+ cli();
+ /* move an exhausted RR process to be last.. */
+ if (!prev->counter && prev->policy == SCHED_RR) {
+ prev->counter = prev->priority;
+ move_last_runqueue(prev);
+ }
+ switch (prev->state) {
+ case TASK_INTERRUPTIBLE:
+ if (prev->signal & ~prev->blocked)
+ goto makerunnable;
+ timeout = prev->timeout;
+ if (timeout && (timeout <= jiffies)) {
+ prev->timeout = 0;
+ timeout = 0;
+ makerunnable:
+ prev->state = TASK_RUNNING;
+ break;
}
- p->it_real_value -= ticks;
- if (p->it_real_value < itimer_next)
- itimer_next = p->it_real_value;
- }
-end_itimer:
- if (p->state != TASK_INTERRUPTIBLE)
- continue;
- if (p->signal & ~p->blocked) {
- p->state = TASK_RUNNING;
- continue;
- }
- if (p->timeout && p->timeout <= jiffies) {
- p->timeout = 0;
- p->state = TASK_RUNNING;
- }
+ default:
+ del_from_runqueue(prev);
+ case TASK_RUNNING:
}
-confuse_gcc1:
+ p = init_task.next_run;
+ sti();
+
+#ifdef __SMP__
+ /*
+ * This is safe as we do not permit re-entry of schedule()
+ */
+ prev->processor = NO_PROC_ID;
+#define idle_task (task[cpu_number_map[this_cpu]])
+#else
+#define idle_task (&init_task)
+#endif
+/*
+ * Note! there may appear new tasks on the run-queue during this, as
+ * interrupts are enabled. However, they will be put on front of the
+ * list, so our list starting at "p" is essentially fixed.
+ */
/* this is the scheduler proper: */
-#if 0
- /* give processes that go to sleep a bit higher priority.. */
- /* This depends on the values for TASK_XXX */
- /* This gives smoother scheduling for some things, but */
- /* can be very unfair under some circumstances, so.. */
- if (TASK_UNINTERRUPTIBLE >= (unsigned) current->state &&
- current->counter < current->priority*2) {
- ++current->counter;
- }
-#endif
c = -1000;
- next = p = &init_task;
- for (;;) {
- if ((p = p->next_task) == &init_task)
- goto confuse_gcc2;
- if (p->state == TASK_RUNNING) {
- nr_running++;
- if (p->counter > c)
- c = p->counter, next = p;
- }
+ next = idle_task;
+ while (p != &init_task) {
+ int weight = goodness(p, prev, this_cpu);
+ if (weight > c)
+ c = weight, next = p;
+ p = p->next_run;
}
-confuse_gcc2:
+
+ /* if all runnable processes have "counter == 0", re-calculate counters */
if (!c) {
for_each_task(p)
p->counter = (p->counter >> 1) + p->priority;
}
- if (current == next)
- return;
- kstat.context_swtch++;
+#ifdef __SMP__
+ /*
+ * Allocate process to CPU
+ */
+
+ next->processor = this_cpu;
+ next->last_processor = this_cpu;
+#endif
+#ifdef __SMP_PROF__
+ /* mark processor running an idle thread */
+ if (0==next->pid)
+ set_bit(this_cpu,&smp_idle_map);
+ else
+ clear_bit(this_cpu,&smp_idle_map);
+#endif
+ if (prev != next) {
+ struct timer_list timer;
+
+ kstat.context_swtch++;
+ if (timeout) {
+ init_timer(&timer);
+ timer.expires = timeout;
+ timer.data = (unsigned long) prev;
+ timer.function = process_timeout;
+ add_timer(&timer);
+ }
+
+ get_mmu_context(next);
+ switch_to(prev,next);
+ if (timeout)
+ del_timer(&timer);
+ }
+ return;
- switch_to(next);
+scheduling_in_interrupt:
+ printk("Aiee: scheduling in interrupt %p\n",
+ return_address());
+/*
+ * System is probably fucked up anyway beyond a safe landing; prevent
+ * messages on the screen from scrolling away.
+ */
+	while(1);
}
+#ifndef __alpha__
+
+/*
+ * For backwards compatibility? This can be done in libc so Alpha
+ * and all newer ports shouldn't need it.
+ */
asmlinkage int sys_pause(void)
{
current->state = TASK_INTERRUPTIBLE;
@@ -202,6 +430,8 @@ asmlinkage int sys_pause(void)
return -ERESTARTNOHAND;
}
+#endif
+
/*
* wake_up doesn't wake up stopped processes - they have to be awakened
* with signals or similar.
@@ -212,70 +442,139 @@ asmlinkage int sys_pause(void)
*/
void wake_up(struct wait_queue **q)
{
- struct wait_queue *tmp;
- struct task_struct * p;
+ struct wait_queue *next;
+ struct wait_queue *head;
- if (!q || !(tmp = *q))
+ if (!q || !(next = *q))
return;
- do {
- if ((p = tmp->task) != NULL) {
+ head = WAIT_QUEUE_HEAD(q);
+ while (next != head) {
+ struct task_struct *p = next->task;
+ next = next->next;
+ if (p != NULL) {
if ((p->state == TASK_UNINTERRUPTIBLE) ||
- (p->state == TASK_INTERRUPTIBLE)) {
- p->state = TASK_RUNNING;
- if (p->counter > current->counter + 3)
- need_resched = 1;
- }
+ (p->state == TASK_INTERRUPTIBLE))
+ wake_up_process(p);
}
- if (!tmp->next) {
- printk("wait_queue is bad (eip = %p)\n",
- __builtin_return_address(0));
- printk(" q = %p\n",q);
- printk(" *q = %p\n",*q);
- printk(" tmp = %p\n",tmp);
- break;
- }
- tmp = tmp->next;
- } while (tmp != *q);
+ if (!next)
+ goto bad;
+ }
+ return;
+bad:
+ printk("wait_queue is bad (eip = %p)\n",
+ __builtin_return_address(0));
+ printk(" q = %p\n",q);
+ printk(" *q = %p\n",*q);
}
void wake_up_interruptible(struct wait_queue **q)
{
- struct wait_queue *tmp;
- struct task_struct * p;
+ struct wait_queue *next;
+ struct wait_queue *head;
- if (!q || !(tmp = *q))
+ if (!q || !(next = *q))
return;
- do {
- if ((p = tmp->task) != NULL) {
- if (p->state == TASK_INTERRUPTIBLE) {
- p->state = TASK_RUNNING;
- if (p->counter > current->counter + 3)
- need_resched = 1;
- }
+ head = WAIT_QUEUE_HEAD(q);
+ while (next != head) {
+ struct task_struct *p = next->task;
+ next = next->next;
+ if (p != NULL) {
+ if (p->state == TASK_INTERRUPTIBLE)
+ wake_up_process(p);
}
- if (!tmp->next) {
- printk("wait_queue is bad (eip = %p)\n",
- __builtin_return_address(0));
- printk(" q = %p\n",q);
- printk(" *q = %p\n",*q);
- printk(" tmp = %p\n",tmp);
- break;
- }
- tmp = tmp->next;
- } while (tmp != *q);
+ if (!next)
+ goto bad;
+ }
+ return;
+bad:
+ printk("wait_queue is bad (eip = %p)\n",
+ __builtin_return_address(0));
+ printk(" q = %p\n",q);
+ printk(" *q = %p\n",*q);
+}
+
+/*
+ * Semaphores are implemented using a two-way counter:
+ * The "count" variable is decremented for each process
+ * that tries to sleep, while the "waiting" variable is
+ * incremented _while_ the process is sleeping on that
+ * semaphore.
+ *
+ * Notably, the inline "up()" and "down()" functions can
+ * efficiently test if they need to do any extra work (up
+ * needs to do something only if count was negative before
+ * the increment operation).
+ */
+static inline void normalize_semaphore(struct semaphore *sem)
+{
+ atomic_add(xchg(&sem->waiting,0), &sem->count);
+}
+
+/*
+ * When __up() is called, the count was negative before
+ * incrementing it, and we need to wake up somebody. In
+ * most cases "waiting" will be positive, and the normalization
+ * will allow things to continue. However, if somebody has
+ * /just/ done a down(), it may be that count was negative
+ * without waiting being positive (or in the generic case
+ * "count is more negative than waiting is positive"), and
+ * the waiter needs to check this itself (see __down).
+ *
+ * Note that these functions are only called when there is
+ * contention on the lock, and as such all this is the
+ * "non-critical" part of the whole semaphore business. The
+ * critical part is the inline stuff in <asm/semaphore.h>
+ * where we want to avoid any extra jumps and calls.
+ */
+void __up(struct semaphore *sem)
+{
+ normalize_semaphore(sem);
+ wake_up(&sem->wait);
}
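
The "critical part" referred to above is not in this patch; as a rough C rendering of what the per-architecture inline fast paths in <asm/semaphore.h> do (the real versions are atomic assembly, and the atomic_*_return names here are illustrative only):

    extern inline void down(struct semaphore * sem)
    {
        /* atomically decrement; enter the slow path only on contention */
        if (atomic_dec_return(&sem->count) < 0)
            __down(sem);
    }

    extern inline void up(struct semaphore * sem)
    {
        /* somebody can be waiting only if count was negative */
        if (atomic_inc_return(&sem->count) <= 0)
            __up(sem);
    }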
void __down(struct semaphore * sem)
{
- struct wait_queue wait = { current, NULL };
+ struct task_struct *tsk = current;
+ struct wait_queue wait = { tsk, NULL };
+
+ /*
+ * The order here is important. We add ourselves to the
+ * wait queues and mark ourselves sleeping _first_. That
+ * way, if a "up()" comes in here, we'll either get
+ * woken up (up happens after the wait queues are set up)
+ * OR we'll have "waiting > 0".
+ */
+ tsk->state = TASK_UNINTERRUPTIBLE;
add_wait_queue(&sem->wait, &wait);
- current->state = TASK_UNINTERRUPTIBLE;
- while (sem->count <= 0) {
- schedule();
- current->state = TASK_UNINTERRUPTIBLE;
+ atomic_inc(&sem->waiting);
+
+ /*
+ * Ok, we're set up. The only race here is really that
+ * an "up()" might have incremented count before we got
+ * here, so we check "count+waiting". If that is larger
+ * than zero, we shouldn't sleep, but re-try the lock.
+ */
+ if (sem->count+sem->waiting <= 0) {
+ /*
+ * If "count+waiting" <= 0, we have to wait
+ * for a up(), which will normalize the count.
+ * Remember, at this point we have decremented
+ * count, and incremented up, so if count is
+ * zero or positive we need to return to re-try
+ * the lock. It _may_ be that both count and
+ * waiting is zero and that it is still locked,
+ * but we still want to re-try the lock in that
+ * case to make count go negative again so that
+ * the optimized "up()" wake_up sequence works.
+ */
+ do {
+ schedule();
+ tsk->state = TASK_UNINTERRUPTIBLE;
+ } while (sem->count < 0);
}
- current->state = TASK_RUNNING;
+ tsk->state = TASK_RUNNING;
remove_wait_queue(&sem->wait, &wait);
+ normalize_semaphore(sem);
}
static inline void __sleep_on(struct wait_queue **p, int state)
@@ -288,11 +587,13 @@ static inline void __sleep_on(struct wait_queue **p, int state)
if (current == task[0])
panic("task[0] trying to sleep");
current->state = state;
- add_wait_queue(p, &wait);
save_flags(flags);
+ cli();
+ __add_wait_queue(p, &wait);
sti();
schedule();
- remove_wait_queue(p, &wait);
+ cli();
+ __remove_wait_queue(p, &wait);
restore_flags(flags);
}
@@ -311,7 +612,7 @@ void sleep_on(struct wait_queue **p)
* and the sorting routine counts on this..
*/
static struct timer_list timer_head = { &timer_head, &timer_head, ~0, 0, NULL };
-#define SLOW_BUT_DEBUGGING_TIMERS 1
+#define SLOW_BUT_DEBUGGING_TIMERS 0
void add_timer(struct timer_list * timer)
{
@@ -326,7 +627,6 @@ void add_timer(struct timer_list * timer)
}
#endif
p = &timer_head;
- timer->expires += jiffies;
save_flags(flags);
cli();
do {
@@ -341,42 +641,66 @@ void add_timer(struct timer_list * timer)
int del_timer(struct timer_list * timer)
{
- unsigned long flags;
-#if SLOW_BUT_DEBUGGING_TIMERS
- struct timer_list * p;
-
- p = &timer_head;
- save_flags(flags);
- cli();
- while ((p = p->next) != &timer_head) {
- if (p == timer) {
- timer->next->prev = timer->prev;
- timer->prev->next = timer->next;
+ int ret = 0;
+ if (timer->next) {
+ unsigned long flags;
+ struct timer_list * next;
+ save_flags(flags);
+ cli();
+ if ((next = timer->next) != NULL) {
+ (next->prev = timer->prev)->next = next;
timer->next = timer->prev = NULL;
- restore_flags(flags);
- timer->expires -= jiffies;
- return 1;
+ ret = 1;
}
+ restore_flags(flags);
}
- if (timer->next || timer->prev)
- printk("del_timer() called from %p with timer not initialized\n",
- __builtin_return_address(0));
- restore_flags(flags);
- return 0;
-#else
- save_flags(flags);
+ return ret;
+}
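
Note that this patch also removes the "timer->expires += jiffies" line from add_timer() (see the deletion above), so expires is now an absolute jiffies value supplied by the caller, exactly as schedule() does with its process_timeout timer. A hedged usage sketch (the handler name and cookie are hypothetical):

    struct timer_list my_timer;

    init_timer(&my_timer);
    my_timer.expires  = jiffies + 10*HZ;      /* absolute: ~10 s from now */
    my_timer.function = my_timeout_handler;   /* void (*)(unsigned long)  */
    my_timer.data     = (unsigned long) cookie;
    add_timer(&my_timer);
    /* ... later: returns 1 if still pending, 0 if it already ran */
    del_timer(&my_timer);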
+
+static inline void run_timer_list(void)
+{
+ struct timer_list * timer;
+
cli();
- if (timer->next) {
+ while ((timer = timer_head.next) != &timer_head && timer->expires <= jiffies) {
+ void (*fn)(unsigned long) = timer->function;
+ unsigned long data = timer->data;
timer->next->prev = timer->prev;
timer->prev->next = timer->next;
timer->next = timer->prev = NULL;
- restore_flags(flags);
- timer->expires -= jiffies;
- return 1;
+ sti();
+ fn(data);
+ cli();
}
- restore_flags(flags);
- return 0;
-#endif
+ sti();
+}
+
+static inline void run_old_timers(void)
+{
+ struct timer_struct *tp;
+ unsigned long mask;
+
+ for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) {
+ if (mask > timer_active)
+ break;
+ if (!(mask & timer_active))
+ continue;
+ if (tp->expires > jiffies)
+ continue;
+ timer_active &= ~mask;
+ tp->fn();
+ sti();
+ }
+}
+
+void tqueue_bh(void)
+{
+ run_task_queue(&tq_timer);
+}
+
+void immediate_bh(void)
+{
+ run_task_queue(&tq_immediate);
}
unsigned long timer_active = 0;
@@ -403,21 +727,25 @@ static unsigned long count_active_tasks(void)
(*p)->state == TASK_UNINTERRUPTIBLE ||
(*p)->state == TASK_SWAPPING))
nr += FIXED_1;
+#ifdef __SMP__
+ nr-=(smp_num_cpus-1)*FIXED_1;
+#endif
return nr;
}
-static inline void calc_load(void)
+static inline void calc_load(unsigned long ticks)
{
unsigned long active_tasks; /* fixed-point */
static int count = LOAD_FREQ;
- if (count-- > 0)
- return;
- count = LOAD_FREQ;
- active_tasks = count_active_tasks();
- CALC_LOAD(avenrun[0], EXP_1, active_tasks);
- CALC_LOAD(avenrun[1], EXP_5, active_tasks);
- CALC_LOAD(avenrun[2], EXP_15, active_tasks);
+ count -= ticks;
+ if (count < 0) {
+ count += LOAD_FREQ;
+ active_tasks = count_active_tasks();
+ CALC_LOAD(avenrun[0], EXP_1, active_tasks);
+ CALC_LOAD(avenrun[1], EXP_5, active_tasks);
+ CALC_LOAD(avenrun[2], EXP_15, active_tasks);
+ }
}
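
CALC_LOAD keeps an exponentially decaying average in fixed point; a sketch of the macro's arithmetic (FSHIFT/FIXED_1/EXP_* live in <linux/sched.h>; the concrete decay values are assumptions here):

    /*
     * With FIXED_1 == (1 << FSHIFT):
     *
     *     CALC_LOAD(load, exp, n):
     *         load = (load * exp + n * (FIXED_1 - exp)) >> FSHIFT;
     *
     * i.e. load' = load*e^(-5/60) + n*(1 - e^(-5/60)) for the 1-minute
     * average (EXP_1 ~ FIXED_1 * e^(-5/60)), sampled every LOAD_FREQ
     * worth of ticks - which is why calc_load() now takes "ticks".
     */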
/*
@@ -428,138 +756,138 @@ static inline void calc_load(void)
* They were originally developed for SUN and DEC kernels.
* All the kudos should go to Dave for this stuff.
*
- * These were ported to Linux by Philip Gladstone.
*/
static void second_overflow(void)
{
- long ltemp;
-
- /* Bump the maxerror field */
- time_maxerror = (0x70000000-time_maxerror < time_tolerance) ?
- 0x70000000 : (time_maxerror + time_tolerance);
-
- /* Run the PLL */
- if (time_offset < 0) {
- ltemp = (-(time_offset+1) >> (SHIFT_KG + time_constant)) + 1;
- time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
- time_offset += (time_adj * HZ) >> (SHIFT_SCALE - SHIFT_UPDATE);
- time_adj = - time_adj;
- } else if (time_offset > 0) {
- ltemp = ((time_offset-1) >> (SHIFT_KG + time_constant)) + 1;
- time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
- time_offset -= (time_adj * HZ) >> (SHIFT_SCALE - SHIFT_UPDATE);
- } else {
- time_adj = 0;
- }
-
- time_adj += (time_freq >> (SHIFT_KF + SHIFT_HZ - SHIFT_SCALE))
- + FINETUNE;
-
- /* Handle the leap second stuff */
- switch (time_status) {
- case TIME_INS:
- /* ugly divide should be replaced */
- if (xtime.tv_sec % 86400 == 0) {
- xtime.tv_sec--; /* !! */
- time_status = TIME_OOP;
- printk("Clock: inserting leap second 23:59:60 UTC\n");
- }
- break;
-
- case TIME_DEL:
- /* ugly divide should be replaced */
- if (xtime.tv_sec % 86400 == 86399) {
- xtime.tv_sec++;
- time_status = TIME_OK;
- printk("Clock: deleting leap second 23:59:59 UTC\n");
- }
- break;
-
- case TIME_OOP:
- time_status = TIME_OK;
- break;
+ long ltemp;
+
+ /* Bump the maxerror field */
+ time_maxerror += time_tolerance >> SHIFT_USEC;
+ if ( time_maxerror > MAXPHASE )
+ time_maxerror = MAXPHASE;
+
+ /*
+ * Leap second processing. If in leap-insert state at
+ * the end of the day, the system clock is set back one
+ * second; if in leap-delete state, the system clock is
+ * set ahead one second. The microtime() routine or
+ * external clock driver will ensure that reported time
+ * is always monotonic. The ugly divides should be
+ * replaced.
+ */
+ switch (time_state) {
+
+ case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
+ break;
+
+ case TIME_INS:
+ if (xtime.tv_sec % 86400 == 0) {
+ xtime.tv_sec--;
+ time_state = TIME_OOP;
+ printk("Clock: inserting leap second 23:59:60 UTC\n");
}
-}
-
-/*
- * disregard lost ticks for now.. We don't care enough.
- */
-static void timer_bh(void * unused)
-{
- unsigned long mask;
- struct timer_struct *tp;
- struct timer_list * timer;
+ break;
- cli();
- while ((timer = timer_head.next) != &timer_head && timer->expires < jiffies) {
- void (*fn)(unsigned long) = timer->function;
- unsigned long data = timer->data;
- timer->next->prev = timer->prev;
- timer->prev->next = timer->next;
- timer->next = timer->prev = NULL;
- sti();
- fn(data);
- cli();
- }
- sti();
-
- for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) {
- if (mask > timer_active)
- break;
- if (!(mask & timer_active))
- continue;
- if (tp->expires > jiffies)
- continue;
- timer_active &= ~mask;
- tp->fn();
- sti();
+ case TIME_DEL:
+ if ((xtime.tv_sec + 1) % 86400 == 0) {
+ xtime.tv_sec++;
+ time_state = TIME_WAIT;
+ printk("Clock: deleting leap second 23:59:59 UTC\n");
}
+ break;
+
+ case TIME_OOP:
+ time_state = TIME_WAIT;
+ break;
+
+ case TIME_WAIT:
+ if (!(time_status & (STA_INS | STA_DEL)))
+ time_state = TIME_OK;
+ }
+
+ /*
+ * Compute the phase adjustment for the next second. In
+ * PLL mode, the offset is reduced by a fixed factor
+ * times the time constant. In FLL mode the offset is
+ * used directly. In either mode, the maximum phase
+ * adjustment for each second is clamped so as to spread
+ * the adjustment over not more than the number of
+ * seconds between updates.
+ */
+ if (time_offset < 0) {
+ ltemp = -time_offset;
+ if (!(time_status & STA_FLL))
+ ltemp >>= SHIFT_KG + time_constant;
+ if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+ ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
+ time_offset += ltemp;
+ time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+ } else {
+ ltemp = time_offset;
+ if (!(time_status & STA_FLL))
+ ltemp >>= SHIFT_KG + time_constant;
+ if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+ ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
+ time_offset -= ltemp;
+ time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+ }
+
+ /*
+ * Compute the frequency estimate and additional phase
+ * adjustment due to frequency error for the next
+ * second. When the PPS signal is engaged, gnaw on the
+ * watchdog counter and update the frequency computed by
+ * the pll and the PPS signal.
+ */
+ pps_valid++;
+ if (pps_valid == PPS_VALID) {
+ pps_jitter = MAXTIME;
+ pps_stabil = MAXFREQ;
+ time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+ STA_PPSWANDER | STA_PPSERROR);
+ }
+ ltemp = time_freq + pps_freq;
+ if (ltemp < 0)
+ time_adj -= -ltemp >>
+ (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+ else
+ time_adj += ltemp >>
+ (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+
+#if HZ == 100
+ /* compensate for (HZ==100) != 128. Add 25% to get 125; => only 3% error */
+ if (time_adj < 0)
+ time_adj -= -time_adj >> 2;
+ else
+ time_adj += time_adj >> 2;
+#endif
}
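
To make the phase-adjustment clamp above concrete, a worked example (constants from <linux/timex.h> assumed: SHIFT_KG == 6, SHIFT_UPDATE == 12, MAXPHASE == 512000, MINSEC == 16):

    /*
     * PLL mode, time_constant == 2, offset +100 ms:
     *
     *   ltemp = time_offset >> (SHIFT_KG + time_constant)   ; >> 8
     *         ~ 390 us of the offset consumed per second
     *   clamp = (MAXPHASE / MINSEC) << SHIFT_UPDATE         ; 32000 us/s
     *
     * so even a full 512 ms phase error is spread over at least
     * MINSEC (16) seconds rather than being stepped in at once.
     */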
-void tqueue_bh(void * unused)
-{
- run_task_queue(&tq_timer);
-}
-
-void immediate_bh(void * unused)
-{
- run_task_queue(&tq_immediate);
-}
-
-/*
- * The int argument is really a (struct pt_regs *), in case the
- * interrupt wants to know from where it was called. The timer
- * irq uses this to decide if it should update the user or system
- * times.
- */
-static void do_timer(int irq, struct pt_regs * regs)
+/* in the NTP reference this is called "hardclock()" */
+static void update_wall_time_one_tick(void)
{
- unsigned long mask;
- struct timer_struct *tp;
- /* last time the cmos clock got updated */
- static long last_rtc_update=0;
- extern int set_rtc_mmss(unsigned long);
-
- long ltemp, psecs;
-
- /* Advance the phase, once it gets to one microsecond, then
+ /*
+	 * Advance the phase; once it gets to one microsecond, then
* advance the tick more.
*/
time_phase += time_adj;
- if (time_phase < -FINEUSEC) {
- ltemp = -time_phase >> SHIFT_SCALE;
+ if (time_phase <= -FINEUSEC) {
+ long ltemp = -time_phase >> SHIFT_SCALE;
time_phase += ltemp << SHIFT_SCALE;
xtime.tv_usec += tick + time_adjust_step - ltemp;
}
- else if (time_phase > FINEUSEC) {
- ltemp = time_phase >> SHIFT_SCALE;
+ else if (time_phase >= FINEUSEC) {
+ long ltemp = time_phase >> SHIFT_SCALE;
time_phase -= ltemp << SHIFT_SCALE;
xtime.tv_usec += tick + time_adjust_step + ltemp;
} else
xtime.tv_usec += tick + time_adjust_step;
- if (time_adjust)
- {
+ if (time_adjust) {
/* We are doing an adjtime thing.
*
* Modify the value of the tick for next time.
@@ -570,123 +898,240 @@ static void do_timer(int irq, struct pt_regs * regs)
* in the range -tickadj .. +tickadj
*/
if (time_adjust > tickadj)
- time_adjust_step = tickadj;
+ time_adjust_step = tickadj;
else if (time_adjust < -tickadj)
- time_adjust_step = -tickadj;
+ time_adjust_step = -tickadj;
else
- time_adjust_step = time_adjust;
+ time_adjust_step = time_adjust;
/* Reduce by this step the amount of time left */
time_adjust -= time_adjust_step;
}
else
time_adjust_step = 0;
+}
+
+/*
+ * Using a loop looks inefficient, but "ticks" is
+ * usually just one (we shouldn't be losing ticks,
+ * we're doing this this way mainly for interrupt
+ * latency reasons, not because we think we'll
+ * have lots of lost timer ticks
+ */
+static void update_wall_time(unsigned long ticks)
+{
+ do {
+ ticks--;
+ update_wall_time_one_tick();
+ } while (ticks);
if (xtime.tv_usec >= 1000000) {
xtime.tv_usec -= 1000000;
xtime.tv_sec++;
second_overflow();
}
+}
- /* If we have an externally synchronized Linux clock, then update
- * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
- * called as close as possible to 500 ms before the new second starts.
- */
- if (time_status != TIME_BAD && xtime.tv_sec > last_rtc_update + 660 &&
- xtime.tv_usec > 500000 - (tick >> 1) &&
- xtime.tv_usec < 500000 + (tick >> 1))
- if (set_rtc_mmss(xtime.tv_sec) == 0)
- last_rtc_update = xtime.tv_sec;
- else
- last_rtc_update = xtime.tv_sec - 600; /* do it again in 60 s */
-
- jiffies++;
- calc_load();
- if (user_mode(regs)) {
- current->utime++;
- if (current != task[0]) {
- if (current->priority < 15)
- kstat.cpu_nice++;
- else
- kstat.cpu_user++;
+static inline void do_process_times(struct task_struct *p,
+ unsigned long user, unsigned long system)
+{
+ long psecs;
+
+ p->utime += user;
+ p->stime += system;
+
+ psecs = (p->stime + p->utime) / HZ;
+ if (psecs > p->rlim[RLIMIT_CPU].rlim_cur) {
+ /* Send SIGXCPU every second.. */
+ if (psecs * HZ == p->stime + p->utime)
+ send_sig(SIGXCPU, p, 1);
+ /* and SIGKILL when we go over max.. */
+ if (psecs > p->rlim[RLIMIT_CPU].rlim_max)
+ send_sig(SIGKILL, p, 1);
+ }
+}
+
+static inline void do_it_virt(struct task_struct * p, unsigned long ticks)
+{
+ unsigned long it_virt = p->it_virt_value;
+
+ if (it_virt) {
+ if (it_virt <= ticks) {
+ it_virt = ticks + p->it_virt_incr;
+ send_sig(SIGVTALRM, p, 1);
}
- /* Update ITIMER_VIRT for current task if not in a system call */
- if (current->it_virt_value && !(--current->it_virt_value)) {
- current->it_virt_value = current->it_virt_incr;
- send_sig(SIGVTALRM,current,1);
+ p->it_virt_value = it_virt - ticks;
+ }
+}
+
+static inline void do_it_prof(struct task_struct * p, unsigned long ticks)
+{
+ unsigned long it_prof = p->it_prof_value;
+
+ if (it_prof) {
+ if (it_prof <= ticks) {
+ it_prof = ticks + p->it_prof_incr;
+ send_sig(SIGPROF, p, 1);
}
- } else {
- current->stime++;
- if(current != task[0])
- kstat.cpu_system++;
-#ifdef CONFIG_PROFILE
- if (prof_buffer && current != task[0]) {
- extern int _stext;
- unsigned long eip = regs->eip - (unsigned long) &_stext;
- eip >>= CONFIG_PROFILE_SHIFT;
- if (eip < prof_len)
- prof_buffer[eip]++;
+ p->it_prof_value = it_prof - ticks;
+ }
+}
+
+static __inline__ void update_one_process(struct task_struct *p,
+ unsigned long ticks, unsigned long user, unsigned long system)
+{
+ do_process_times(p, user, system);
+ do_it_virt(p, user);
+ do_it_prof(p, ticks);
+}
+
+static void update_process_times(unsigned long ticks, unsigned long system)
+{
+#ifndef __SMP__
+ struct task_struct * p = current;
+ unsigned long user = ticks - system;
+ if (p->pid) {
+ p->counter -= ticks;
+ if (p->counter < 0) {
+ p->counter = 0;
+ need_resched = 1;
}
-#endif
+ if (p->priority < DEF_PRIORITY)
+ kstat.cpu_nice += user;
+ else
+ kstat.cpu_user += user;
+ kstat.cpu_system += system;
}
- /*
- * check the cpu time limit on the process.
- */
- if ((current->rlim[RLIMIT_CPU].rlim_max != RLIM_INFINITY) &&
- (((current->stime + current->utime) / HZ) >= current->rlim[RLIMIT_CPU].rlim_max))
- send_sig(SIGKILL, current, 1);
- if ((current->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) &&
- (((current->stime + current->utime) % HZ) == 0)) {
- psecs = (current->stime + current->utime) / HZ;
- /* send when equal */
- if (psecs == current->rlim[RLIMIT_CPU].rlim_cur)
- send_sig(SIGXCPU, current, 1);
- /* and every five seconds thereafter. */
- else if ((psecs > current->rlim[RLIMIT_CPU].rlim_cur) &&
- ((psecs - current->rlim[RLIMIT_CPU].rlim_cur) % 5) == 0)
- send_sig(SIGXCPU, current, 1);
- }
-
- if (current != task[0] && 0 > --current->counter) {
- current->counter = 0;
- need_resched = 1;
+ update_one_process(p, ticks, user, system);
+#else
+ int cpu,j;
+ cpu = smp_processor_id();
+ for (j=0;j<smp_num_cpus;j++)
+ {
+ int i = cpu_logical_map[j];
+ struct task_struct *p;
+
+#ifdef __SMP_PROF__
+ if (test_bit(i,&smp_idle_map))
+ smp_idle_count[i]++;
+#endif
+ p = current_set[i];
+ /*
+ * Do we have a real process?
+ */
+ if (p->pid) {
+ /* assume user-mode process */
+ unsigned long utime = ticks;
+ unsigned long stime = 0;
+ if (cpu == i) {
+ utime = ticks-system;
+ stime = system;
+ } else if (smp_proc_in_lock[j]) {
+ utime = 0;
+ stime = ticks;
+ }
+ update_one_process(p, ticks, utime, stime);
+
+ if (p->priority < DEF_PRIORITY)
+ kstat.cpu_nice += utime;
+ else
+ kstat.cpu_user += utime;
+ kstat.cpu_system += stime;
+
+ p->counter -= ticks;
+ if (p->counter >= 0)
+ continue;
+ p->counter = 0;
+ } else {
+ /*
+ * Idle processor found, do we have anything
+ * we could run?
+ */
+ if (!(0x7fffffff & smp_process_available))
+ continue;
+ }
+ /* Ok, we should reschedule, do the magic */
+ if (i==cpu)
+ need_resched = 1;
+ else
+ smp_message_pass(i, MSG_RESCHEDULE, 0L, 0);
}
- /* Update ITIMER_PROF for the current task */
- if (current->it_prof_value && !(--current->it_prof_value)) {
- current->it_prof_value = current->it_prof_incr;
- send_sig(SIGPROF,current,1);
+#endif
+}
+
+static unsigned long lost_ticks = 0;
+static unsigned long lost_ticks_system = 0;
+
+static inline void update_times(void)
+{
+ unsigned long ticks;
+
+ ticks = xchg(&lost_ticks, 0);
+
+ if (ticks) {
+ unsigned long system;
+
+ system = xchg(&lost_ticks_system, 0);
+ calc_load(ticks);
+ update_wall_time(ticks);
+ update_process_times(ticks, system);
}
- for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) {
- if (mask > timer_active)
- break;
- if (!(mask & timer_active))
- continue;
- if (tp->expires > jiffies)
- continue;
- mark_bh(TIMER_BH);
+}
+
+static void timer_bh(void)
+{
+ update_times();
+ run_old_timers();
+ run_timer_list();
+}
+
+void do_timer(struct pt_regs * regs)
+{
+ (*(unsigned long *)&jiffies)++;
+ lost_ticks++;
+ mark_bh(TIMER_BH);
+ if (!user_mode(regs)) {
+ lost_ticks_system++;
+ if (prof_buffer && current->pid) {
+ extern int _stext;
+ unsigned long ip = instruction_pointer(regs);
+ ip -= (unsigned long) &_stext;
+ ip >>= prof_shift;
+ if (ip < prof_len)
+ prof_buffer[ip]++;
+ }
}
- cli();
- itimer_ticks++;
- if (itimer_ticks > itimer_next)
- need_resched = 1;
- if (timer_head.next->expires < jiffies)
- mark_bh(TIMER_BH);
- if (tq_timer != &tq_last)
+ if (tq_timer)
mark_bh(TQUEUE_BH);
- sti();
}
-asmlinkage int sys_alarm(long seconds)
+#ifndef __alpha__
+
+/*
+ * For backwards compatibility? This can be done in libc so Alpha
+ * and all newer ports shouldn't need it.
+ */
+asmlinkage unsigned int sys_alarm(unsigned int seconds)
{
struct itimerval it_new, it_old;
+ unsigned int oldalarm;
it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
it_new.it_value.tv_sec = seconds;
it_new.it_value.tv_usec = 0;
_setitimer(ITIMER_REAL, &it_new, &it_old);
- return(it_old.it_value.tv_sec + (it_old.it_value.tv_usec / 1000000));
+ oldalarm = it_old.it_value.tv_sec;
+ /* ehhh.. We can't return 0 if we have an alarm pending.. */
+ /* And we'd better return too much than too little anyway */
+ if (it_old.it_value.tv_usec)
+ oldalarm++;
+ return oldalarm;
}
+/*
+ * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
+ * should be moved into arch/i386 instead?
+ */
asmlinkage int sys_getpid(void)
{
return current->pid;
@@ -717,47 +1162,286 @@ asmlinkage int sys_getegid(void)
return current->egid;
}
-asmlinkage int sys_nice(long increment)
+/*
+ * This has been replaced by sys_setpriority. Maybe it should be
+ * moved into the arch dependent tree for those ports that require
+ * it for backward compatibility?
+ */
+asmlinkage int sys_nice(int increment)
{
- int newprio;
-
- if (increment < 0 && !suser())
- return -EPERM;
+ unsigned long newprio;
+ int increase = 0;
+
+ newprio = increment;
+ if (increment < 0) {
+ if (!suser())
+ return -EPERM;
+ newprio = -increment;
+ increase = 1;
+ }
+ if (newprio > 40)
+ newprio = 40;
+ /*
+ * do a "normalization" of the priority (traditionally
+	 * unix nice values are -20..20; linux doesn't really
+	 * use that kind of thing, but uses the length of the
+	 * timeslice instead, default 150 msec). The rounding is
+ * why we want to avoid negative values.
+ */
+ newprio = (newprio * DEF_PRIORITY + 10) / 20;
+ increment = newprio;
+ if (increase)
+ increment = -increment;
newprio = current->priority - increment;
- if (newprio < 1)
+ if ((signed) newprio < 1)
newprio = 1;
- if (newprio > 35)
- newprio = 35;
+ if (newprio > DEF_PRIORITY*2)
+ newprio = DEF_PRIORITY*2;
current->priority = newprio;
return 0;
}
+#endif
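
Working the normalization through once (assuming DEF_PRIORITY == 20 and the default priority of 20):

    /*
     *   nice(+10): newprio  = (10*20 + 10) / 20 = 10
     *              priority = 20 - 10 = 10      (half the timeslice)
     *   nice(-10): suser() required; increment becomes -10,
     *              priority = 20 + 10 = 30
     *   result is clamped to 1 .. DEF_PRIORITY*2 (1..40)
     */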
+
+static struct task_struct *find_process_by_pid(pid_t pid)
+{
+ struct task_struct *p;
+
+ p = current;
+ if (pid) {
+ for_each_task(p) {
+ if (p->pid == pid)
+ goto found;
+ }
+ p = NULL;
+ }
+found:
+ return p;
+}
+
+static int setscheduler(pid_t pid, int policy,
+ struct sched_param *param)
+{
+ struct sched_param lp;
+ struct task_struct *p;
+
+ if (!param || pid < 0)
+ return -EINVAL;
+
+ if (copy_from_user(&lp, param, sizeof(struct sched_param)))
+ return -EFAULT;
+
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ if (policy < 0)
+ policy = p->policy;
+ else if (policy != SCHED_FIFO && policy != SCHED_RR &&
+ policy != SCHED_OTHER)
+ return -EINVAL;
+
+ /*
+ * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
+ * priority for SCHED_OTHER is 0.
+ */
+ if (lp.sched_priority < 0 || lp.sched_priority > 99)
+ return -EINVAL;
+ if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
+ return -EINVAL;
+
+ if ((policy == SCHED_FIFO || policy == SCHED_RR) && !suser())
+ return -EPERM;
+ if ((current->euid != p->euid) && (current->euid != p->uid) &&
+ !suser())
+ return -EPERM;
+
+ p->policy = policy;
+ p->rt_priority = lp.sched_priority;
+ cli();
+ if (p->next_run)
+ move_last_runqueue(p);
+ sti();
+ schedule();
+
+ return 0;
+}
+
+asmlinkage int sys_sched_setscheduler(pid_t pid, int policy,
+ struct sched_param *param)
+{
+ return setscheduler(pid, policy, param);
+}
+
+asmlinkage int sys_sched_setparam(pid_t pid, struct sched_param *param)
+{
+ return setscheduler(pid, -1, param);
+}
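
From user space these two entry points back the POSIX.1b scheduling API; a minimal sketch of a caller, assuming the usual libc wrappers (the helper itself is hypothetical):

    #include <sched.h>

    /* make pid a fixed-priority real-time process */
    static int make_realtime(pid_t pid)
    {
        struct sched_param sp;

        sp.sched_priority = 50;   /* 1..99 for SCHED_FIFO and SCHED_RR */
        return sched_setscheduler(pid, SCHED_FIFO, &sp); /* suser() only */
    }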
+
+asmlinkage int sys_sched_getscheduler(pid_t pid)
+{
+ struct task_struct *p;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ return p->policy;
+}
+
+asmlinkage int sys_sched_getparam(pid_t pid, struct sched_param *param)
+{
+ struct task_struct *p;
+ struct sched_param lp;
+
+ if (!param || pid < 0)
+ return -EINVAL;
+
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ lp.sched_priority = p->rt_priority;
+ return copy_to_user(param, &lp, sizeof(struct sched_param)) ? -EFAULT : 0;
+}
+
+asmlinkage int sys_sched_yield(void)
+{
+ cli();
+ move_last_runqueue(current);
+ sti();
+ return 0;
+}
+
+asmlinkage int sys_sched_get_priority_max(int policy)
+{
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ return 99;
+ case SCHED_OTHER:
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+asmlinkage int sys_sched_get_priority_min(int policy)
+{
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ return 1;
+ case SCHED_OTHER:
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+asmlinkage int sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
+{
+ struct timespec t;
+
+ t.tv_sec = 0;
+ t.tv_nsec = 0; /* <-- Linus, please fill correct value in here */
+ return -ENOSYS; /* and then delete this line. Thanks! */
+ return copy_to_user(interval, &t, sizeof(struct timespec)) ? -EFAULT : 0;
+}
+
+/*
+ * change timespec to jiffies, trying to avoid the
+ * most obvious overflows..
+ */
+static unsigned long timespectojiffies(struct timespec *value)
+{
+ unsigned long sec = (unsigned) value->tv_sec;
+ long nsec = value->tv_nsec;
+
+ if (sec > (LONG_MAX / HZ))
+ return LONG_MAX;
+ nsec += 1000000000L / HZ - 1;
+ nsec /= 1000000000L / HZ;
+ return HZ * sec + nsec;
+}
+
+static void jiffiestotimespec(unsigned long jiffies, struct timespec *value)
+{
+ value->tv_nsec = (jiffies % HZ) * (1000000000L / HZ);
+ value->tv_sec = jiffies / HZ;
+ return;
+}
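
Working one conversion through (HZ == 100, so one tick is 10000000 ns):

    /*
     *   { 1 s, 500000000 ns } -> 100 + (500000000 + 9999999)/10000000
     *                          = 100 + 50 = 150 jiffies
     *
     * The "+ 1000000000L/HZ - 1" rounds partial ticks up so a sleep is
     * never shorter than asked for; sys_nanosleep() below additionally
     * adds one jiffy for any nonzero request, for the same reason.
     */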
+
+asmlinkage int sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
+{
+ int error;
+ struct timespec t;
+ unsigned long expire;
+
+ error = copy_from_user(&t, rqtp, sizeof(struct timespec));
+ if (error)
+ return -EFAULT;
+
+ if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0)
+ return -EINVAL;
+
+ if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
+ current->policy != SCHED_OTHER) {
+ /*
+ * Short delay requests up to 2 ms will be handled with
+ * high precision by a busy wait for all real-time processes.
+ */
+ udelay((t.tv_nsec + 999) / 1000);
+ return 0;
+ }
+
+ expire = timespectojiffies(&t) + (t.tv_sec || t.tv_nsec) + jiffies;
+ current->timeout = expire;
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+
+ if (expire > jiffies) {
+ if (rmtp) {
+ jiffiestotimespec(expire - jiffies -
+ (expire > jiffies + 1), &t);
+ if (copy_to_user(rmtp, &t, sizeof(struct timespec)))
+ return -EFAULT;
+ }
+ return -EINTR;
+ }
+
+ return 0;
+}
+
static void show_task(int nr,struct task_struct * p)
{
unsigned long free;
- static char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
+ static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
printk("%-8s %3d ", p->comm, (p == current) ? -nr : nr);
if (((unsigned) p->state) < sizeof(stat_nam)/sizeof(char *))
printk(stat_nam[p->state]);
else
printk(" ");
-#ifdef __i386__
+#if ((~0UL) == 0xffffffff)
if (p == current)
printk(" current ");
else
- printk(" %08lX ", ((unsigned long *)p->tss.esp)[3]);
-#elif defined (__mips__)
+ printk(" %08lX ", thread_saved_pc(&p->tss));
+#else
if (p == current)
- printk(" current ");
+ printk(" current task ");
else
- printk(" ");
+ printk(" %016lx ", thread_saved_pc(&p->tss));
#endif
- for (free = 1; free < 1024 ; free++) {
+ for (free = 1; free < PAGE_SIZE/sizeof(long) ; free++) {
if (((unsigned long *)p->kernel_stack_page)[free])
break;
}
- printk("%5lu %5d %6d ", free << 2, p->pid, p->p_pptr->pid);
+ printk("%5lu %5d %6d ", free*sizeof(long), p->pid, p->p_pptr->pid);
if (p->p_cptr)
printk("%5d ", p->p_cptr->pid);
else
@@ -776,8 +1460,15 @@ void show_state(void)
{
int i;
- printk(" free sibling\n");
+#if ((~0UL) == 0xffffffff)
+ printk("\n"
+ " free sibling\n");
printk(" task PC stack pid father child younger older\n");
+#else
+ printk("\n"
+ " free sibling\n");
+ printk(" task PC stack pid father child younger older\n");
+#endif
for (i=0 ; i<NR_TASKS ; i++)
if (task[i])
show_task(i,task[i]);
@@ -785,12 +1476,19 @@ void show_state(void)
void sched_init(void)
{
- bh_base[TIMER_BH].routine = timer_bh;
- bh_base[TQUEUE_BH].routine = tqueue_bh;
- bh_base[IMMEDIATE_BH].routine = immediate_bh;
- if (request_irq(TIMER_IRQ, do_timer, 0, "timer") != 0)
- panic("Could not allocate timer IRQ!");
- enable_bh(TIMER_BH);
- enable_bh(TQUEUE_BH);
- enable_bh(IMMEDIATE_BH);
+ /*
+ * We have to do a little magic to get the first
+ * process right in SMP mode.
+ */
+ int cpu=smp_processor_id();
+#ifndef __SMP__
+ current_set[cpu]=&init_task;
+#else
+ init_task.processor=cpu;
+ for(cpu = 0; cpu < NR_CPUS; cpu++)
+ current_set[cpu] = &init_task;
+#endif
+ init_bh(TIMER_BH, timer_bh);
+ init_bh(TQUEUE_BH, tqueue_bh);
+ init_bh(IMMEDIATE_BH, immediate_bh);
}