Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  869
1 file changed, 513 insertions, 356 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 98502b3fc..bc256d029 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4,6 +4,9 @@
* Copyright (C) 1991, 1992 Linus Torvalds
*
* 1996-04-21 Modified by Ulrich Windl to make NTP work
+ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
+ * make semaphores SMP safe
+ * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
*/
/*
@@ -28,12 +31,14 @@
#include <linux/resource.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/smp_lock.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
+#include <asm/spinlock.h>
#include <linux/timex.h>
@@ -44,7 +49,7 @@
int securelevel = 0; /* system security level */
long tick = (1000000 + HZ/2) / HZ; /* timer interrupt period */
-volatile struct timeval xtime; /* The current time */
+volatile struct timeval xtime __attribute__ ((aligned (8))); /* The current time */
int tickadj = 500/HZ; /* microsecs */
DECLARE_TASK_QUEUE(tq_timer);
@@ -100,7 +105,12 @@ struct task_struct init_task = INIT_TASK;
unsigned long volatile jiffies=0;
-struct task_struct *current_set[NR_CPUS];
+/*
+ * Init task must be ok at boot for the ix86 as we will check its signals
+ * via the SMP irq return path.
+ */
+
+struct task_struct *current_set[NR_CPUS] = {&init_task, };
struct task_struct *last_task_used_math = NULL;
struct task_struct * task[NR_TASKS] = {&init_task, };
@@ -109,9 +119,6 @@ struct kernel_stat kstat = { 0 };
static inline void add_to_runqueue(struct task_struct * p)
{
-#ifdef __SMP__
- int cpu=smp_processor_id();
-#endif
#if 1 /* sanity tests */
if (p->next_run || p->prev_run) {
printk("task already on run-queue\n");
@@ -124,36 +131,6 @@ static inline void add_to_runqueue(struct task_struct * p)
(p->prev_run = init_task.prev_run)->next_run = p;
p->next_run = &init_task;
init_task.prev_run = p;
-#ifdef __SMP__
- /* this is safe only if called with cli()*/
- while(set_bit(31,&smp_process_available));
-#if 0
- {
- while(test_bit(31,&smp_process_available))
- {
- if(clear_bit(cpu,&smp_invalidate_needed))
- {
- local_flush_tlb();
- set_bit(cpu,&cpu_callin_map[0]);
- }
- }
- }
-#endif
- smp_process_available++;
- clear_bit(31,&smp_process_available);
- if ((0!=p->pid) && smp_threads_ready)
- {
- int i;
- for (i=0;i<smp_num_cpus;i++)
- {
- if (0==current_set[cpu_logical_map[i]]->pid)
- {
- smp_message_pass(cpu_logical_map[i], MSG_RESCHEDULE, 0L, 0);
- break;
- }
- }
- }
-#endif
}
static inline void del_from_runqueue(struct task_struct * p)
@@ -167,7 +144,7 @@ static inline void del_from_runqueue(struct task_struct * p)
return;
}
#endif
- if (p == &init_task) {
+ if (!p->pid) {
static int nr = 0;
if (nr < 5) {
nr++;
@@ -199,6 +176,21 @@ static inline void move_last_runqueue(struct task_struct * p)
}
/*
+ * The tasklist_lock protects the linked list of processes.
+ *
+ * The scheduler lock is protecting against multiple entry
+ * into the scheduling code, and doesn't need to worry
+ * about interrupts (because interrupts cannot call the
+ * scheduler).
+ *
+ * The run-queue lock locks the parts that actually access
+ * and change the run-queues, and have to be interrupt-safe.
+ */
+rwlock_t tasklist_lock = RW_LOCK_UNLOCKED;
+spinlock_t scheduler_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED;
+
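A minimal sketch (not part of the patch) of the lock nesting used when both levels are needed, matching the order that appears later in setscheduler() and sys_sched_yield():

	/* illustrative sketch only */
	spin_lock(&scheduler_lock);		/* keep other CPUs out of the scheduler proper */
	spin_lock_irq(&runqueue_lock);		/* interrupt-safe access to the run-queue itself */
	/* ... inspect or reorder the run-queue ... */
	spin_unlock_irq(&runqueue_lock);
	spin_unlock(&scheduler_lock);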
+/*
* Wake up a process. Put it on the run-queue if it's not
* already there. The "current" process is always on the
* run-queue (except when the actual re-schedule is in
@@ -210,12 +202,11 @@ inline void wake_up_process(struct task_struct * p)
{
unsigned long flags;
- save_flags(flags);
- cli();
+ spin_lock_irqsave(&runqueue_lock, flags);
p->state = TASK_RUNNING;
if (!p->next_run)
add_to_runqueue(p);
- restore_flags(flags);
+ spin_unlock_irqrestore(&runqueue_lock, flags);
}
static void process_timeout(unsigned long __data)
@@ -243,17 +234,6 @@ static inline int goodness(struct task_struct * p, struct task_struct * prev, in
{
int weight;
-#ifdef __SMP__
- /* We are not permitted to run a task someone else is running */
- if (p->processor != NO_PROC_ID)
- return -1000;
-#ifdef PAST_2_0
- /* This process is locked to a processor group */
- if (p->processor_mask && !(p->processor_mask & (1<<this_cpu))
- return -1000;
-#endif
-#endif
-
/*
* Realtime process, select the first one on the
* runqueue (taking priorities within processes
@@ -287,6 +267,18 @@ static inline int goodness(struct task_struct * p, struct task_struct * prev, in
return weight;
}
+#ifdef __SMP__
+
+#define idle_task (task[cpu_number_map[this_cpu]])
+#define can_schedule(p) ((p)->processor == NO_PROC_ID)
+
+#else
+
+#define idle_task (&init_task)
+#define can_schedule(p) (1)
+
+#endif
+
/*
* 'schedule()' is the scheduler function. It's a very simple and nice
* scheduler: it's not perfect, but certainly works for most things.
@@ -299,33 +291,39 @@ static inline int goodness(struct task_struct * p, struct task_struct * prev, in
*/
asmlinkage void schedule(void)
{
- int c;
- struct task_struct * p;
+ int lock_depth;
struct task_struct * prev, * next;
- unsigned long timeout = 0;
- int this_cpu=smp_processor_id();
+ unsigned long timeout;
+ int this_cpu;
-/* check alarm, wake up any interruptible tasks that have got a signal */
-
- if (intr_count)
- goto scheduling_in_interrupt;
-
- if (bh_active & bh_mask) {
- intr_count = 1;
- do_bottom_half();
- intr_count = 0;
+ need_resched = 0;
+ this_cpu = smp_processor_id();
+ if (local_irq_count[this_cpu]) {
+ printk("Scheduling in interrupt\n");
+ *(char *)0 = 0;
}
+ prev = current;
+ release_kernel_lock(prev, this_cpu, lock_depth);
+ if (bh_active & bh_mask)
+ do_bottom_half();
- run_task_queue(&tq_scheduler);
+ spin_lock(&scheduler_lock);
+ spin_lock_irq(&runqueue_lock);
- need_resched = 0;
- prev = current;
- cli();
/* move an exhausted RR process to be last.. */
if (!prev->counter && prev->policy == SCHED_RR) {
- prev->counter = prev->priority;
- move_last_runqueue(prev);
+ if (prev->pid) {
+ prev->counter = prev->priority;
+ move_last_runqueue(prev);
+ } else {
+ static int count = 5;
+ if (count) {
+ count--;
+ printk("Moving pid 0 last\n");
+ }
+ }
}
+ timeout = 0;
switch (prev->state) {
case TASK_INTERRUPTIBLE:
if (prev->signal & ~prev->blocked)
@@ -342,54 +340,55 @@ asmlinkage void schedule(void)
del_from_runqueue(prev);
case TASK_RUNNING:
}
- p = init_task.next_run;
- sti();
-
+ {
+ struct task_struct * p = init_task.next_run;
+ /*
+ * This is subtle.
+ * Note how we can enable interrupts here, even
+ * though interrupts can add processes to the run-
+ * queue. This is because any new processes will
+ * be added to the front of the queue, so "p" above
+ * is a safe starting point.
+ * run-queue deletion and re-ordering is protected by
+ * the scheduler lock
+ */
+ spin_unlock_irq(&runqueue_lock);
#ifdef __SMP__
- /*
- * This is safe as we do not permit re-entry of schedule()
- */
- prev->processor = NO_PROC_ID;
-#define idle_task (task[cpu_number_map[this_cpu]])
-#else
-#define idle_task (&init_task)
-#endif
-
+ prev->processor = NO_PROC_ID;
+#endif
+
/*
* Note! there may appear new tasks on the run-queue during this, as
* interrupts are enabled. However, they will be put on front of the
* list, so our list starting at "p" is essentially fixed.
*/
/* this is the scheduler proper: */
- c = -1000;
- next = idle_task;
- while (p != &init_task) {
- int weight = goodness(p, prev, this_cpu);
- if (weight > c)
- c = weight, next = p;
- p = p->next_run;
- }
+ {
+ int c = -1000;
+ next = idle_task;
+ while (p != &init_task) {
+ if (can_schedule(p)) {
+ int weight = goodness(p, prev, this_cpu);
+ if (weight > c)
+ c = weight, next = p;
+ }
+ p = p->next_run;
+ }
- /* if all runnable processes have "counter == 0", re-calculate counters */
- if (!c) {
- for_each_task(p)
- p->counter = (p->counter >> 1) + p->priority;
+ /* Do we need to re-calculate counters? */
+ if (!c) {
+ struct task_struct *p;
+ read_lock(&tasklist_lock);
+ for_each_task(p)
+ p->counter = (p->counter >> 1) + p->priority;
+ read_unlock(&tasklist_lock);
+ }
+ }
}
-#ifdef __SMP__
- /*
- * Allocate process to CPU
- */
-
- next->processor = this_cpu;
- next->last_processor = this_cpu;
-#endif
-#ifdef __SMP_PROF__
- /* mark processor running an idle thread */
- if (0==next->pid)
- set_bit(this_cpu,&smp_idle_map);
- else
- clear_bit(this_cpu,&smp_idle_map);
-#endif
+
+ next->processor = this_cpu;
+ next->last_processor = this_cpu;
+
if (prev != next) {
struct timer_list timer;
@@ -404,14 +403,13 @@ asmlinkage void schedule(void)
get_mmu_context(next);
switch_to(prev,next);
+
if (timeout)
del_timer(&timer);
}
- return;
+ spin_unlock(&scheduler_lock);
-scheduling_in_interrupt:
- printk("Aiee: scheduling in interrupt %p\n",
- return_address());
+ reacquire_kernel_lock(prev, smp_processor_id(), lock_depth);
}
#ifndef __alpha__
@@ -429,93 +427,92 @@ asmlinkage int sys_pause(void)
#endif
+spinlock_t waitqueue_lock;
+
/*
* wake_up doesn't wake up stopped processes - they have to be awakened
* with signals or similar.
- *
- * Note that this doesn't need cli-sti pairs: interrupts may not change
- * the wait-queue structures directly, but only call wake_up() to wake
- * a process. The process itself must remove the queue once it has woken.
*/
void wake_up(struct wait_queue **q)
{
+ unsigned long flags;
struct wait_queue *next;
struct wait_queue *head;
- if (!q || !(next = *q))
- return;
- head = WAIT_QUEUE_HEAD(q);
- while (next != head) {
- struct task_struct *p = next->task;
- next = next->next;
- if (p != NULL) {
- if ((p->state == TASK_UNINTERRUPTIBLE) ||
- (p->state == TASK_INTERRUPTIBLE))
- wake_up_process(p);
+ spin_lock_irqsave(&waitqueue_lock, flags);
+ if (q && (next = *q)) {
+ head = WAIT_QUEUE_HEAD(q);
+ while (next != head) {
+ struct task_struct *p = next->task;
+ next = next->next;
+ if (p != NULL) {
+ if ((p->state == TASK_UNINTERRUPTIBLE) ||
+ (p->state == TASK_INTERRUPTIBLE))
+ wake_up_process(p);
+ }
+ if (next)
+ continue;
+ printk("wait_queue is bad (eip = %p)\n",
+ __builtin_return_address(0));
+ printk(" q = %p\n",q);
+ printk(" *q = %p\n",*q);
+ break;
}
- if (!next)
- goto bad;
}
- return;
-bad:
- printk("wait_queue is bad (eip = %p)\n",
- __builtin_return_address(0));
- printk(" q = %p\n",q);
- printk(" *q = %p\n",*q);
+ spin_unlock_irqrestore(&waitqueue_lock, flags);
}
void wake_up_interruptible(struct wait_queue **q)
{
+ unsigned long flags;
struct wait_queue *next;
struct wait_queue *head;
- if (!q || !(next = *q))
- return;
- head = WAIT_QUEUE_HEAD(q);
- while (next != head) {
- struct task_struct *p = next->task;
- next = next->next;
- if (p != NULL) {
- if (p->state == TASK_INTERRUPTIBLE)
- wake_up_process(p);
+ spin_lock_irqsave(&waitqueue_lock, flags);
+ if (q && (next = *q)) {
+ head = WAIT_QUEUE_HEAD(q);
+ while (next != head) {
+ struct task_struct *p = next->task;
+ next = next->next;
+ if (p != NULL) {
+ if (p->state == TASK_INTERRUPTIBLE)
+ wake_up_process(p);
+ }
+ if (next)
+ continue;
+ printk("wait_queue is bad (eip = %p)\n",
+ __builtin_return_address(0));
+ printk(" q = %p\n",q);
+ printk(" *q = %p\n",*q);
+ break;
}
- if (!next)
- goto bad;
}
- return;
-bad:
- printk("wait_queue is bad (eip = %p)\n",
- return_address());
- printk(" q = %p\n",q);
- printk(" *q = %p\n",*q);
+ spin_unlock_irqrestore(&waitqueue_lock, flags);
}
/*
* Semaphores are implemented using a two-way counter:
* The "count" variable is decremented for each process
- * that tries to sleep, while the "waiting" variable is
- * incremented _while_ the process is sleeping on that
- * semaphore.
+ * that tries to sleep, while the "waking" variable is
+ * incremented when the "up()" code goes to wake up waiting
+ * processes.
*
* Notably, the inline "up()" and "down()" functions can
* efficiently test if they need to do any extra work (up
* needs to do something only if count was negative before
* the increment operation.
- */
-static inline void normalize_semaphore(struct semaphore *sem)
-{
- atomic_add(xchg(&sem->waiting,0), &sem->count);
-}
-
-/*
+ *
+ * waking_non_zero() (from asm/semaphore.h) must execute
+ * atomically.
+ *
* When __up() is called, the count was negative before
- * incrementing it, and we need to wake up somebody. In
- * most cases "waiting" will be positive, and the normalization
- * will allow things to continue. However, if somebody has
- * /just/ done a down(), it may be that count was negative
- * without waiting being positive (or in the generic case
- * "count is more negative than waiting is positive"), and
- * the waiter needs to check this itself (see __down).
+ * incrementing it, and we need to wake up somebody.
+ *
+ * This routine adds one to the count of processes that need to
+ * wake up and exit. ALL waiting processes actually wake up but
+ * only the one that gets to the "waking" field first will gate
+ * through and acquire the semaphore. The others will go back
+ * to sleep.
*
* Note that these functions are only called when there is
* contention on the lock, and as such all this is the
@@ -525,55 +522,83 @@ static inline void normalize_semaphore(struct semaphore *sem)
*/
void __up(struct semaphore *sem)
{
- normalize_semaphore(sem);
+ wake_one_more(sem);
wake_up(&sem->wait);
}
-void __down(struct semaphore * sem)
+/*
+ * Perform the "down" function. Return zero for semaphore acquired,
+ * return negative for signalled out of the function.
+ *
+ * If called from __down, the return is ignored and the wait loop is
+ * not interruptible. This means that a task waiting on a semaphore
+ * using "down()" cannot be killed until someone does an "up()" on
+ * the semaphore.
+ *
+ * If called from __down_interruptible, the return value gets checked
+ * upon return. If the return value is negative then the task continues
+ * with the negative value in the return register (it can be tested by
+ * the caller).
+ *
+ * Either form may be used in conjunction with "up()".
+ *
+ */
+static inline int __do_down(struct semaphore * sem, int task_state)
{
struct task_struct *tsk = current;
struct wait_queue wait = { tsk, NULL };
+ int ret = 0;
- /*
- * The order here is important. We add ourselves to the
- * wait queues and mark ourselves sleeping _first_. That
- * way, if a "up()" comes in here, we'll either get
- * woken up (up happens after the wait queues are set up)
- * OR we'll have "waiting > 0".
- */
- tsk->state = TASK_UNINTERRUPTIBLE;
+ tsk->state = task_state;
add_wait_queue(&sem->wait, &wait);
- atomic_inc(&sem->waiting);
/*
- * Ok, we're set up. The only race here is really that
- * an "up()" might have incremented count before we got
- * here, so we check "count+waiting". If that is larger
- * than zero, we shouldn't sleep, but re-try the lock.
+ * Ok, we're set up. sem->count is known to be less than zero
+ * so we must wait.
+ *
+ * We can let go the lock for purposes of waiting.
+ * We re-acquire it after awaking so as to protect
+ * all semaphore operations.
+ *
+ * If "up()" is called before we call waking_non_zero() then
+ * we will catch it right away. If it is called later then
+ * we will have to go through a wakeup cycle to catch it.
+ *
+ * Multiple waiters contend for the semaphore lock to see
+ * who gets to gate through and who has to wait some more.
*/
- if (sem->count+sem->waiting <= 0) {
- /*
- * If "count+waiting" <= 0, we have to wait
- * for a up(), which will normalize the count.
- * Remember, at this point we have decremented
- * count, and incremented up, so if count is
- * zero or positive we need to return to re-try
- * the lock. It _may_ be that both count and
- * waiting is zero and that it is still locked,
- * but we still want to re-try the lock in that
- * case to make count go negative again so that
- * the optimized "up()" wake_up sequence works.
- */
- do {
- schedule();
- tsk->state = TASK_UNINTERRUPTIBLE;
- } while (sem->count < 0);
+ for (;;) {
+ if (waking_non_zero(sem)) /* are we waking up? */
+ break; /* yes, exit loop */
+
+ if ( task_state == TASK_INTERRUPTIBLE
+ && (tsk->signal & ~tsk->blocked) /* signalled */
+ ) {
+ ret = -EINTR; /* interrupted */
+ atomic_inc(&sem->count); /* give up on down operation */
+ break;
+ }
+
+ schedule();
+ tsk->state = task_state;
}
+
tsk->state = TASK_RUNNING;
remove_wait_queue(&sem->wait, &wait);
- normalize_semaphore(sem);
+ return ret;
+}
+
+void __down(struct semaphore * sem)
+{
+ __do_down(sem,TASK_UNINTERRUPTIBLE);
}
+int __down_interruptible(struct semaphore * sem)
+{
+ return __do_down(sem,TASK_INTERRUPTIBLE);
+}
+
+
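A minimal usage sketch, assuming the usual inline wrappers down(), down_interruptible() and up() from asm/semaphore.h; my_sem and its MUTEX initializer are hypothetical:

	/* illustrative sketch only */
	static struct semaphore my_sem = MUTEX;		/* assumed initializer, semaphore counted at 1 */

	static int do_work(void)
	{
		if (down_interruptible(&my_sem))	/* waits in TASK_INTERRUPTIBLE via __do_down() */
			return -EINTR;			/* a signal arrived before the semaphore was acquired */
		/* ... critical section ... */
		up(&my_sem);				/* on contention, __up() bumps "waking" and wakes the queue */
		return 0;
	}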
static inline void __sleep_on(struct wait_queue **p, int state)
{
unsigned long flags;
@@ -584,14 +609,14 @@ static inline void __sleep_on(struct wait_queue **p, int state)
if (current == task[0])
panic("task[0] trying to sleep");
current->state = state;
- save_flags(flags);
- cli();
+ spin_lock_irqsave(&waitqueue_lock, flags);
__add_wait_queue(p, &wait);
+ spin_unlock(&waitqueue_lock);
sti();
schedule();
- cli();
+ spin_lock_irq(&waitqueue_lock);
__remove_wait_queue(p, &wait);
- restore_flags(flags);
+ spin_unlock_irqrestore(&waitqueue_lock, flags);
}
void interruptible_sleep_on(struct wait_queue **p)
@@ -604,74 +629,178 @@ void sleep_on(struct wait_queue **p)
__sleep_on(p,TASK_UNINTERRUPTIBLE);
}
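A minimal sketch of the usual producer/consumer pairing of these primitives (queue and flag names hypothetical); the condition is re-tested in a loop because a wake-up only makes the task runnable again:

	/* illustrative sketch only */
	static struct wait_queue *data_wait = NULL;
	static volatile int data_ready = 0;

	static void consumer(void)
	{
		while (!data_ready)
			interruptible_sleep_on(&data_wait);
	}

	static void producer(void)		/* e.g. called from an interrupt handler */
	{
		data_ready = 1;
		wake_up_interruptible(&data_wait);
	}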
-/*
- * The head for the timer-list has a "expires" field of MAX_UINT,
- * and the sorting routine counts on this..
- */
-static struct timer_list timer_head = { &timer_head, &timer_head, ~0, 0, NULL };
+
+#define TVN_BITS 6
+#define TVR_BITS 8
+#define TVN_SIZE (1 << TVN_BITS)
+#define TVR_SIZE (1 << TVR_BITS)
+#define TVN_MASK (TVN_SIZE - 1)
+#define TVR_MASK (TVR_SIZE - 1)
+
#define SLOW_BUT_DEBUGGING_TIMERS 0
-void add_timer(struct timer_list * timer)
+struct timer_vec {
+ int index;
+ struct timer_list *vec[TVN_SIZE];
+};
+
+struct timer_vec_root {
+ int index;
+ struct timer_list *vec[TVR_SIZE];
+};
+
+static struct timer_vec tv5 = { 0 };
+static struct timer_vec tv4 = { 0 };
+static struct timer_vec tv3 = { 0 };
+static struct timer_vec tv2 = { 0 };
+static struct timer_vec_root tv1 = { 0 };
+
+static struct timer_vec * const tvecs[] = {
+ (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
+};
+
+#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
+
+static unsigned long timer_jiffies = 0;
+
+static inline void insert_timer(struct timer_list *timer,
+ struct timer_list **vec, int idx)
+{
+ if ((timer->next = vec[idx]))
+ vec[idx]->prev = timer;
+ vec[idx] = timer;
+ timer->prev = (struct timer_list *)&vec[idx];
+}
+
+static inline void internal_add_timer(struct timer_list *timer)
+{
+ /*
+ * must be cli-ed when calling this
+ */
+ unsigned long expires = timer->expires;
+ unsigned long idx = expires - timer_jiffies;
+
+ if (idx < TVR_SIZE) {
+ int i = expires & TVR_MASK;
+ insert_timer(timer, tv1.vec, i);
+ } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
+ int i = (expires >> TVR_BITS) & TVN_MASK;
+ insert_timer(timer, tv2.vec, i);
+ } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
+ int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
+ insert_timer(timer, tv3.vec, i);
+ } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
+ int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
+ insert_timer(timer, tv4.vec, i);
+ } else if (expires < timer_jiffies) {
+ /* can happen if you add a timer with expires == jiffies,
+ * or you set a timer to go off in the past
+ */
+ insert_timer(timer, tv1.vec, tv1.index);
+ } else if (idx < 0xffffffffUL) {
+ int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
+ insert_timer(timer, tv5.vec, i);
+ } else {
+ /* Can only get here on architectures with 64-bit jiffies */
+ timer->next = timer->prev = timer;
+ }
+}
+
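For example, with the constants above a timer due less than 256 ticks away (idx < TVR_SIZE) is hashed straight into tv1 at slot expires & 255, while one due 300 ticks out lands in tv2 at slot (expires >> 8) & 63; each further level covers another factor of 64 in range, which is what keeps add_timer() constant-time no matter how many timers are pending.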
+static spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED;
+
+void add_timer(struct timer_list *timer)
{
unsigned long flags;
- struct timer_list *p;
+ spin_lock_irqsave(&timerlist_lock, flags);
#if SLOW_BUT_DEBUGGING_TIMERS
- if (timer->next || timer->prev) {
- printk("add_timer() called with non-zero list from %p\n",
- __builtin_return_address(0));
- return;
- }
+ if (timer->next || timer->prev) {
+ printk("add_timer() called with non-zero list from %p\n",
+ __builtin_return_address(0));
+ goto out;
+ }
#endif
- p = &timer_head;
- save_flags(flags);
- cli();
- do {
- p = p->next;
- } while (timer->expires > p->expires);
- timer->next = p;
- timer->prev = p->prev;
- p->prev = timer;
- timer->prev->next = timer;
- restore_flags(flags);
+ internal_add_timer(timer);
+#if SLOW_BUT_DEBUGGING_TIMERS
+out:
+#endif
+ spin_unlock_irqrestore(&timerlist_lock, flags);
}
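A minimal sketch of driving this interface (handler and values hypothetical; init_timer() from linux/timer.h is assumed to clear the list pointers):

	/* illustrative sketch only */
	static void my_timeout(unsigned long data)
	{
		printk("timeout, data=%lu\n", data);
	}

	static struct timer_list my_timer;

	static void start_my_timer(void)
	{
		init_timer(&my_timer);			/* assumed: sets next = prev = NULL */
		my_timer.expires = jiffies + HZ;	/* roughly one second from now */
		my_timer.data = 42;
		my_timer.function = my_timeout;
		add_timer(&my_timer);
	}

	/* later: del_timer(&my_timer) returns 0 if the timer already ran */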
-int del_timer(struct timer_list * timer)
+static inline int detach_timer(struct timer_list *timer)
{
int ret = 0;
- if (timer->next) {
- unsigned long flags;
- struct timer_list * next;
- save_flags(flags);
- cli();
- if ((next = timer->next) != NULL) {
- (next->prev = timer->prev)->next = next;
- timer->next = timer->prev = NULL;
- ret = 1;
- }
- restore_flags(flags);
+ struct timer_list *next, *prev;
+ next = timer->next;
+ prev = timer->prev;
+ if (next) {
+ next->prev = prev;
+ }
+ if (prev) {
+ ret = 1;
+ prev->next = next;
}
return ret;
}
-static inline void run_timer_list(void)
+
+int del_timer(struct timer_list * timer)
{
- struct timer_list * timer;
+ int ret;
+ unsigned long flags;
- cli();
- while ((timer = timer_head.next) != &timer_head && timer->expires <= jiffies) {
- void (*fn)(unsigned long) = timer->function;
- unsigned long data = timer->data;
- timer->next->prev = timer->prev;
- timer->prev->next = timer->next;
- timer->next = timer->prev = NULL;
- sti();
- fn(data);
- cli();
+ spin_lock_irqsave(&timerlist_lock, flags);
+ ret = detach_timer(timer);
+ timer->next = timer->prev = 0;
+ spin_unlock_irqrestore(&timerlist_lock, flags);
+ return ret;
+}
+
+static inline void cascade_timers(struct timer_vec *tv)
+{
+ /* cascade all the timers from tv up one level */
+ struct timer_list *timer;
+ timer = tv->vec[tv->index];
+ /*
+ * We are removing _all_ timers from the list, so we don't have to
+ * detach them individually, just clear the list afterwards.
+ */
+ while (timer) {
+ struct timer_list *tmp = timer;
+ timer = timer->next;
+ internal_add_timer(tmp);
+ }
+ tv->vec[tv->index] = NULL;
+ tv->index = (tv->index + 1) & TVN_MASK;
+}
+
+static inline void run_timer_list(void)
+{
+ spin_lock_irq(&timerlist_lock);
+ while ((long)(jiffies - timer_jiffies) >= 0) {
+ struct timer_list *timer;
+ if (!tv1.index) {
+ int n = 1;
+ do {
+ cascade_timers(tvecs[n]);
+ } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
+ }
+ while ((timer = tv1.vec[tv1.index])) {
+ void (*fn)(unsigned long) = timer->function;
+ unsigned long data = timer->data;
+ detach_timer(timer);
+ timer->next = timer->prev = NULL;
+ spin_unlock_irq(&timerlist_lock);
+ fn(data);
+ spin_lock_irq(&timerlist_lock);
+ }
+ ++timer_jiffies;
+ tv1.index = (tv1.index + 1) & TVR_MASK;
}
- sti();
+ spin_unlock_irq(&timerlist_lock);
}
+
static inline void run_old_timers(void)
{
struct timer_struct *tp;
@@ -690,6 +819,8 @@ static inline void run_old_timers(void)
}
}
+spinlock_t tqueue_lock;
+
void tqueue_bh(void)
{
run_task_queue(&tq_timer);
@@ -974,7 +1105,7 @@ static inline void do_it_prof(struct task_struct * p, unsigned long ticks)
}
}
-static __inline__ void update_one_process(struct task_struct *p,
+void update_one_process(struct task_struct *p,
unsigned long ticks, unsigned long user, unsigned long system)
{
do_process_times(p, user, system);
@@ -984,6 +1115,9 @@ static __inline__ void update_one_process(struct task_struct *p,
static void update_process_times(unsigned long ticks, unsigned long system)
{
+/*
+ * SMP does this on a per-CPU basis elsewhere
+ */
#ifndef __SMP__
struct task_struct * p = current;
unsigned long user = ticks - system;
@@ -1000,79 +1134,35 @@ static void update_process_times(unsigned long ticks, unsigned long system)
kstat.cpu_system += system;
}
update_one_process(p, ticks, user, system);
-#else
- int cpu,j;
- cpu = smp_processor_id();
- for (j=0;j<smp_num_cpus;j++)
- {
- int i = cpu_logical_map[j];
- struct task_struct *p;
-
-#ifdef __SMP_PROF__
- if (test_bit(i,&smp_idle_map))
- smp_idle_count[i]++;
-#endif
- p = current_set[i];
- /*
- * Do we have a real process?
- */
- if (p->pid) {
- /* assume user-mode process */
- unsigned long utime = ticks;
- unsigned long stime = 0;
- if (cpu == i) {
- utime = ticks-system;
- stime = system;
- } else if (smp_proc_in_lock[j]) {
- utime = 0;
- stime = ticks;
- }
- update_one_process(p, ticks, utime, stime);
-
- if (p->priority < DEF_PRIORITY)
- kstat.cpu_nice += utime;
- else
- kstat.cpu_user += utime;
- kstat.cpu_system += stime;
-
- p->counter -= ticks;
- if (p->counter >= 0)
- continue;
- p->counter = 0;
- } else {
- /*
- * Idle processor found, do we have anything
- * we could run?
- */
- if (!(0x7fffffff & smp_process_available))
- continue;
- }
- /* Ok, we should reschedule, do the magic */
- if (i==cpu)
- need_resched = 1;
- else
- smp_message_pass(i, MSG_RESCHEDULE, 0L, 0);
- }
#endif
}
-static unsigned long lost_ticks = 0;
+volatile unsigned long lost_ticks = 0;
static unsigned long lost_ticks_system = 0;
static inline void update_times(void)
{
unsigned long ticks;
+ unsigned long flags;
+
+ save_flags(flags);
+ cli();
- ticks = xchg(&lost_ticks, 0);
+ ticks = lost_ticks;
+ lost_ticks = 0;
if (ticks) {
unsigned long system;
-
system = xchg(&lost_ticks_system, 0);
+
calc_load(ticks);
update_wall_time(ticks);
+ restore_flags(flags);
+
update_process_times(ticks, system);
- }
+
+ } else
+ restore_flags(flags);
}
static void timer_bh(void)
@@ -1087,17 +1177,8 @@ void do_timer(struct pt_regs * regs)
(*(unsigned long *)&jiffies)++;
lost_ticks++;
mark_bh(TIMER_BH);
- if (!user_mode(regs)) {
+ if (!user_mode(regs))
lost_ticks_system++;
- if (prof_buffer && current->pid) {
- extern int _stext;
- unsigned long ip = instruction_pointer(regs);
- ip -= (unsigned long) &_stext;
- ip >>= prof_shift;
- if (ip < prof_len)
- prof_buffer[ip]++;
- }
- }
if (tq_timer)
mark_bh(TQUEUE_BH);
}
@@ -1129,34 +1210,81 @@ asmlinkage unsigned int sys_alarm(unsigned int seconds)
* The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
* should be moved into arch/i386 instead?
*/
+
asmlinkage int sys_getpid(void)
{
+ /* This is SMP safe - current->pid doesn't change */
return current->pid;
}
+/*
+ * This is not strictly SMP safe: p_opptr could change
+ * from under us. However, rather than getting any lock
+ * we can use an optimistic algorithm: get the parent
+ * pid, and go back and check that the parent is still
+ * the same. If it has changed (which is extremely unlikely
+ * indeed), we just try again..
+ *
+ * NOTE! This depends on the fact that even if we _do_
+ * get an old value of "parent", we can happily dereference
+ * the pointer: we just can't necessarily trust the result
+ * until we know that the parent pointer is valid.
+ *
+ * The "mb()" macro is a memory barrier - a synchronizing
+ * event. It also makes sure that gcc doesn't optimize
+ * away the necessary memory references.. The barrier doesn't
+ * have to have all that strong semantics: on x86 we don't
+ * really require a synchronizing instruction, for example.
+ * The barrier is more important for code generation than
+ * for any real memory ordering semantics (even if there is
+ * a small window for a race, using the old pointer is
+ * harmless for a while).
+ */
asmlinkage int sys_getppid(void)
{
- return current->p_opptr->pid;
+ int pid;
+ struct task_struct * me = current;
+ struct task_struct * parent;
+
+ parent = me->p_opptr;
+ for (;;) {
+ pid = parent->pid;
+#if __SMP__
+{
+ struct task_struct *old = parent;
+ mb();
+ parent = me->p_opptr;
+ if (old != parent)
+ continue;
+}
+#endif
+ break;
+ }
+ return pid;
}
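The same optimistic-read pattern in generic form (a sketch; shared_ptr, p, val and field are placeholders):

	/* illustrative sketch only */
	for (;;) {
		p = shared_ptr;			/* sample the pointer */
		val = p->field;			/* dereferencing a stale pointer is harmless here */
		mb();				/* make sure the re-read below really follows the dereference */
		if (p == shared_ptr)		/* unchanged, so val is consistent */
			break;
	}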
asmlinkage int sys_getuid(void)
{
+ /* Only we change this so SMP safe */
return current->uid;
}
asmlinkage int sys_geteuid(void)
{
+ /* Only we change this so SMP safe */
return current->euid;
}
asmlinkage int sys_getgid(void)
{
+ /* Only we change this so SMP safe */
return current->gid;
}
asmlinkage int sys_getegid(void)
{
- return current->egid;
+ /* Only we change this so SMP safe */
+ return current->egid;
}
/*
@@ -1164,11 +1292,18 @@ asmlinkage int sys_getegid(void)
* moved into the arch dependent tree for those ports that require
* it for backward compatibility?
*/
+
asmlinkage int sys_nice(int increment)
{
unsigned long newprio;
int increase = 0;
+ /*
+ * Setpriority might change our priority at the same moment.
+ * We don't have to worry. Conceptually one call occurs first
+ * and we have a single winner.
+ */
+
newprio = increment;
if (increment < 0) {
if (!suser())
@@ -1176,6 +1311,7 @@ asmlinkage int sys_nice(int increment)
newprio = -increment;
increase = 1;
}
+
if (newprio > 40)
newprio = 40;
/*
@@ -1189,6 +1325,14 @@ asmlinkage int sys_nice(int increment)
increment = newprio;
if (increase)
increment = -increment;
+ /*
+ * Current->priority can change between this point
+ * and the assignment. We are assigning not doing add/subs
+ * so that's ok. Conceptually a process might just instantaneously
+ * read the value we stomp over. I don't think that is an issue
+ * unless posix makes it one. If so we can loop on changes
+ * to current->priority.
+ */
newprio = current->priority - increment;
if ((signed) newprio < 1)
newprio = 1;
@@ -1206,13 +1350,15 @@ static struct task_struct *find_process_by_pid(pid_t pid)
p = current;
if (pid) {
+ read_lock(&tasklist_lock);
for_each_task(p) {
if (p->pid == pid)
goto found;
}
p = NULL;
- }
found:
+ read_unlock(&tasklist_lock);
+ }
return p;
}
@@ -1255,12 +1401,13 @@ static int setscheduler(pid_t pid, int policy,
p->policy = policy;
p->rt_priority = lp.sched_priority;
- cli();
+ spin_lock(&scheduler_lock);
+ spin_lock_irq(&runqueue_lock);
if (p->next_run)
move_last_runqueue(p);
- sti();
- schedule();
-
+ spin_unlock_irq(&runqueue_lock);
+ spin_unlock(&scheduler_lock);
+ need_resched = 1;
return 0;
}
@@ -1307,36 +1454,44 @@ asmlinkage int sys_sched_getparam(pid_t pid, struct sched_param *param)
asmlinkage int sys_sched_yield(void)
{
- cli();
+ spin_lock(&scheduler_lock);
+ spin_lock_irq(&runqueue_lock);
move_last_runqueue(current);
- sti();
+ spin_unlock_irq(&runqueue_lock);
+ spin_unlock(&scheduler_lock);
+ need_resched = 1;
return 0;
}
asmlinkage int sys_sched_get_priority_max(int policy)
{
+ int ret = -EINVAL;
+
switch (policy) {
- case SCHED_FIFO:
- case SCHED_RR:
- return 99;
- case SCHED_OTHER:
- return 0;
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = 99;
+ break;
+ case SCHED_OTHER:
+ ret = 0;
+ break;
}
-
- return -EINVAL;
+ return ret;
}
asmlinkage int sys_sched_get_priority_min(int policy)
{
+ int ret = -EINVAL;
+
switch (policy) {
- case SCHED_FIFO:
- case SCHED_RR:
- return 1;
- case SCHED_OTHER:
- return 0;
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = 1;
+ break;
+ case SCHED_OTHER:
+ ret = 0;
}
-
- return -EINVAL;
+ return ret;
}
asmlinkage int sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
@@ -1344,9 +1499,10 @@ asmlinkage int sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
struct timespec t;
t.tv_sec = 0;
- t.tv_nsec = 0; /* <-- Linus, please fill correct value in here */
- return -ENOSYS; /* and then delete this line. Thanks! */
- return copy_to_user(interval, &t, sizeof(struct timespec)) ? -EFAULT : 0;
+ t.tv_nsec = 150000;
+ if (copy_to_user(interval, &t, sizeof(struct timespec)))
+ return -EFAULT;
+ return 0;
}
/*
@@ -1369,33 +1525,35 @@ static void jiffiestotimespec(unsigned long jiffies, struct timespec *value)
{
value->tv_nsec = (jiffies % HZ) * (1000000000L / HZ);
value->tv_sec = jiffies / HZ;
- return;
}
asmlinkage int sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
{
- int error;
struct timespec t;
unsigned long expire;
- error = copy_from_user(&t, rqtp, sizeof(struct timespec));
- if (error)
- return -EFAULT;
+ if(copy_from_user(&t, rqtp, sizeof(struct timespec)))
+ return -EFAULT;
if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0)
return -EINVAL;
+
if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
- current->policy != SCHED_OTHER) {
+ current->policy != SCHED_OTHER)
+ {
/*
* Short delay requests up to 2 ms will be handled with
* high precision by a busy wait for all real-time processes.
+ *
+ * It's important on SMP not to do this holding locks.
*/
udelay((t.tv_nsec + 999) / 1000);
return 0;
}
expire = timespectojiffies(&t) + (t.tv_sec || t.tv_nsec) + jiffies;
+
current->timeout = expire;
current->state = TASK_INTERRUPTIBLE;
schedule();
@@ -1405,11 +1563,10 @@ asmlinkage int sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
jiffiestotimespec(expire - jiffies -
(expire > jiffies + 1), &t);
if (copy_to_user(rmtp, &t, sizeof(struct timespec)))
- return -EFAULT;
+ return -EFAULT;
}
return -EINTR;
}
-
return 0;
}
@@ -1478,7 +1635,7 @@ void sched_init(void)
* process right in SMP mode.
*/
int cpu=smp_processor_id();
-#ifndef __SMP__
+#ifndef __SMP__
current_set[cpu]=&init_task;
#else
init_task.processor=cpu;