author    Ralf Baechle <ralf@linux-mips.org>  1999-02-15 02:15:32 +0000
committer Ralf Baechle <ralf@linux-mips.org>  1999-02-15 02:15:32 +0000
commit    86464aed71025541805e7b1515541aee89879e33 (patch)
tree      e01a457a4912a8553bc65524aa3125d51f29f810 /kernel/sched.c
parent    88f99939ecc6a95a79614574cb7d95ffccfc3466 (diff)
Merge with Linux 2.2.1.
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  474
1 file changed, 330 insertions(+), 144 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index c8c297180..add76fbe0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7,6 +7,14 @@
* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
* make semaphores SMP safe
* 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
+ * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
+ * "A Kernel Model for Precision Timekeeping" by Dave Mills
+ * 1998-11-19 Implemented schedule_timeout() and related stuff
+ * by Andrea Arcangeli
+ * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
+ * serialize accesses to xtime/lost_ticks).
+ * Copyright (C) 1998 Andrea Arcangeli
+ * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar
*/
/*
@@ -59,8 +67,8 @@ long time_offset = 0; /* time adjustment (us) */
long time_constant = 2; /* pll time constant */
long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
long time_precision = 1; /* clock precision (us) */
-long time_maxerror = MAXPHASE; /* maximum error (us) */
-long time_esterror = MAXPHASE; /* estimated error (us) */
+long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
+long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
long time_phase = 0; /* phase offset (scaled us) */
long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC; /* frequency offset (scaled ppm) */
long time_adj = 0; /* tick adjust (scaled 1 / HZ) */
@@ -91,47 +99,122 @@ struct kernel_stat kstat = { 0 };
void scheduling_functions_start_here(void) { }
-static inline void reschedule_idle(struct task_struct * p)
+#ifdef __SMP__
+static void reschedule_idle_slow(struct task_struct * p)
{
+/*
+ * (see reschedule_idle() for an explanation first ...)
+ *
+ * Pass #2
+ *
+ * We try to find another (idle) CPU for this woken-up process.
+ *
+ * On SMP, we mostly try to see if the CPU the task used
+ * to run on is idle.. but we will use another idle CPU too,
+ * at this point we already know that this CPU is not
+ * willing to reschedule in the near future.
+ *
+ * An idle CPU is definitely wasted, especially if this CPU is
+ * running long-timeslice processes. The following algorithm is
+ * pretty good at finding the best idle CPU to send this process
+ * to.
+ *
+ * [We can try to preempt low-priority processes on other CPUs in
+ * 2.3. Also we can try to use the avg_slice value to predict
+ * 'likely reschedule' events even on other CPUs.]
+ */
+ int best_cpu = p->processor, this_cpu = smp_processor_id();
+ struct task_struct **idle = task, *tsk, *target_tsk;
+ int i = smp_num_cpus;
+
+ target_tsk = NULL;
+ do {
+ tsk = *idle;
+ idle++;
+ if (tsk->has_cpu) {
+ if (tsk->processor == this_cpu)
+ continue;
+ target_tsk = tsk;
+ if (tsk->processor == best_cpu) {
+ /*
+ * bingo, we couldnt get a better
+ * CPU, activate it.
+ */
+ goto send; /* this one helps GCC ... */
+ }
+ }
+ } while (--i > 0);
/*
- * For SMP, we try to see if the CPU the task used
- * to run on is idle..
+ * found any idle CPU?
*/
-#if 0
+ if (target_tsk) {
+send:
+ target_tsk->need_resched = 1;
+ smp_send_reschedule(target_tsk->processor);
+ return;
+ }
+}
+#endif /* __SMP__ */
+
+/*
+ * If there is a dependency between p1 and p2,
+ * don't be too eager to go into the slow schedule.
+ * In particular, if p1 and p2 both want the kernel
+ * lock, there is no point in trying to make them
+ * extremely parallel..
+ *
+ * (No lock - lock_depth < 0)
+ */
+#define related(p1,p2) ((p1)->lock_depth >= 0 && (p2)->lock_depth >= 0)
+
+static inline void reschedule_idle(struct task_struct * p)
+{
+
+ if (p->policy != SCHED_OTHER || p->counter > current->counter + 3) {
+ current->need_resched = 1;
+ return;
+ }
+
+#ifdef __SMP__
/*
- * Disable this for now. Ingo has some interesting
- * code that looks too complex, and I have some ideas,
- * but in the meantime.. One problem is that "wakeup()"
- * can be (and is) called before we've even initialized
- * SMP completely, so..
+ * ("wakeup()" should not be called before we've initialized
+ * SMP completely.
+ * Basically a not-yet initialized SMP subsystem can be
+ * considered as a not-yet working scheduler, simply dont use
+ * it before it's up and running ...)
+ *
+ * SMP rescheduling is done in 2 passes:
+ * - pass #1: faster: 'quick decisions'
+ * - pass #2: slower: 'lets try and find another CPU'
*/
-#ifdef __SMP__
- int want_cpu = p->processor;
/*
- * Don't even try to find another CPU for us if the task
- * ran on this one before..
+ * Pass #1
+ *
+ * There are two metrics here:
+ *
+ * first, a 'cutoff' interval, currently 0-200 usecs on
+ * x86 CPUs, depending on the size of the 'SMP-local cache'.
+ * If the current process has longer average timeslices than
+ * this, then we utilize the idle CPU.
+ *
+ * second, if the wakeup comes from a process context,
+ * then the two processes are 'related'. (they form a
+ * 'gang')
+ *
+ * An idle CPU is almost always a bad thing, thus we skip
+ * the idle-CPU utilization only if both these conditions
+ * are true. (ie. a 'process-gang' rescheduling with rather
+ * high frequency should stay on the same CPU).
+ *
+ * [We can switch to something more finegrained in 2.3.]
*/
- if (want_cpu != smp_processor_id()) {
- struct task_struct **idle = task;
- int i = smp_num_cpus;
-
- do {
- struct task_struct *tsk = *idle;
- idle++;
- /* Something like this.. */
- if (tsk->has_cpu && tsk->processor == want_cpu) {
- tsk->need_resched = 1;
- smp_send_reschedule(want_cpu);
- return;
- }
- } while (--i > 0);
- }
-#endif
-#endif
- if (p->policy != SCHED_OTHER || p->counter > current->counter + 3)
- current->need_resched = 1;
+ if ((current->avg_slice < cacheflush_time) && related(current, p))
+ return;
+
+ reschedule_idle_slow(p);
+#endif /* __SMP__ */
}
/*
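To make the pass-1 heuristic above easier to follow, here is a minimal stand-alone sketch of the decision it encodes. This is not kernel code: struct fake_task and its two fields are simplified stand-ins for the real task_struct members used in the hunk, and the numbers involved are illustrative only.

/* Sketch only: a user-space model of the pass-1 "skip the idle-CPU search"
 * test above.  `struct fake_task` stands in for task_struct. */
struct fake_task {
	int lock_depth;             /* < 0 means "not holding the kernel lock" */
	unsigned long avg_slice;    /* fading average of timeslice length, in cycles */
};

/* Both tasks want the big kernel lock, so spreading them over CPUs buys little. */
static int related(const struct fake_task *p1, const struct fake_task *p2)
{
	return p1->lock_depth >= 0 && p2->lock_depth >= 0;
}

/* Pass #1: only a frequently-rescheduling "gang" (short slices, shared lock)
 * stays put; everything else falls through to the slow idle-CPU scan. */
static int skip_idle_scan(const struct fake_task *curr,
			  const struct fake_task *woken,
			  unsigned long cacheflush_time)
{
	return curr->avg_slice < cacheflush_time && related(curr, woken);
}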
@@ -149,6 +232,7 @@ static inline void add_to_runqueue(struct task_struct * p)
init_task.next_run = p;
p->next_run = next;
next->prev_run = p;
+ nr_running++;
}
static inline void del_from_runqueue(struct task_struct * p)
@@ -227,7 +311,6 @@ void wake_up_process(struct task_struct * p)
if (!p->next_run) {
add_to_runqueue(p);
reschedule_idle(p);
- nr_running++;
}
spin_unlock_irqrestore(&runqueue_lock, flags);
}
@@ -437,23 +520,6 @@ signed long schedule_timeout(signed long timeout)
struct timer_list timer;
unsigned long expire;
- /*
- * PARANOID.
- */
- if (current->state == TASK_UNINTERRUPTIBLE)
- {
- printk(KERN_WARNING "schedule_timeout: task not interrutible "
- "from %p\n", __builtin_return_address(0));
- /*
- * We don' t want to interrupt a not interruptible task
- * risking to cause corruption. Better a a deadlock ;-).
- */
- timeout = MAX_SCHEDULE_TIMEOUT;
- }
-
- /*
- * Here we start for real.
- */
switch (timeout)
{
case MAX_SCHEDULE_TIMEOUT:
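For context, the usual calling pattern for schedule_timeout() looks like the sketch below. It is not part of this patch; the wake-up side and the surrounding wait-queue bookkeeping are assumed to exist elsewhere.

/* Sketch: sleep for up to one second, interruptibly.  schedule_timeout()
 * returns the number of jiffies left, so 0 means the full timeout expired. */
signed long remaining;

current->state = TASK_INTERRUPTIBLE;
remaining = schedule_timeout(HZ);          /* HZ jiffies == one second */
if (remaining == 0)
	printk(KERN_DEBUG "timed out waiting\n");
else if (signal_pending(current))
	printk(KERN_DEBUG "woken early by a signal\n");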
@@ -501,6 +567,63 @@ signed long schedule_timeout(signed long timeout)
}
/*
+ * This one aligns per-CPU data on cacheline boundaries.
+ */
+static union {
+ struct schedule_data {
+ struct task_struct * prev;
+ long prevstate;
+ cycles_t last_schedule;
+ } schedule_data;
+ char __pad [L1_CACHE_BYTES];
+} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
+
+
+static inline void __schedule_tail (void)
+{
+#ifdef __SMP__
+ struct schedule_data * sched_data;
+
+ /*
+ * We might have switched CPUs:
+ */
+ sched_data = & aligned_data[smp_processor_id()].schedule_data;
+
+ /*
+ * Subtle. In the rare event that we got a wakeup to 'prev' just
+ * during the reschedule (this is possible, the scheduler is pretty
+ * parallel), we should do another reschedule in the next task's
+ * context. schedule() will do the right thing next time around.
+ * this is equivalent to 'delaying' the wakeup until the reschedule
+ * has finished.
+ */
+ if (sched_data->prev->state != sched_data->prevstate)
+ current->need_resched = 1;
+
+ /*
+ * Release the previous process ...
+ *
+ * We have dropped all locks, and we must make sure that we
+ * only mark the previous process as no longer having a CPU
+ * after all other state has been seen by other CPU's. Thus
+ * the write memory barrier!
+ */
+ wmb();
+ sched_data->prev->has_cpu = 0;
+#endif /* __SMP__ */
+}
+
+/*
+ * schedule_tail() is getting called from the fork return path. This
+ * cleans up all remaining scheduler things, without impacting the
+ * common case.
+ */
+void schedule_tail (void)
+{
+ __schedule_tail();
+}
+
+/*
* 'schedule()' is the scheduler function. It's a very simple and nice
* scheduler: it's not perfect, but certainly works for most things.
*
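The union-plus-padding trick in the hunk above is a general way to keep per-CPU state on private cache lines. A minimal sketch of the same idiom outside the scheduler follows; the 32-byte line size, the counter field and the array size of 4 are made up for illustration, whereas the real code pads to L1_CACHE_BYTES and sizes the array with NR_CPUS.

/* Each CPU owns one slot; padding each slot out to a full cache line means
 * CPU 0 bumping its counter never invalidates the line CPU 1 is using
 * (no "false sharing"). */
#define FAKE_CACHE_LINE 32

static union {
	struct {
		unsigned long counter;
	} data;
	char __pad[FAKE_CACHE_LINE];
} per_cpu_counter[4];          /* 4 stands in for NR_CPUS, illustrative */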
@@ -512,11 +635,18 @@ signed long schedule_timeout(signed long timeout)
*/
asmlinkage void schedule(void)
{
+ struct schedule_data * sched_data;
struct task_struct * prev, * next;
int this_cpu;
prev = current;
this_cpu = prev->processor;
+ /*
+ * 'sched_data' is protected by the fact that we can run
+ * only one process per CPU.
+ */
+ sched_data = & aligned_data[this_cpu].schedule_data;
+
if (in_interrupt())
goto scheduling_in_interrupt;
release_kernel_lock(prev, this_cpu);
@@ -531,6 +661,7 @@ asmlinkage void schedule(void)
/* move an exhausted RR process to be last.. */
prev->need_resched = 0;
+
if (!prev->counter && prev->policy == SCHED_RR) {
prev->counter = prev->priority;
move_last_runqueue(prev);
@@ -546,6 +677,9 @@ asmlinkage void schedule(void)
del_from_runqueue(prev);
case TASK_RUNNING:
}
+
+ sched_data->prevstate = prev->state;
+
{
struct task_struct * p = init_task.next_run;
/*
@@ -592,25 +726,49 @@ asmlinkage void schedule(void)
}
}
+ /*
+ * maintain the per-process 'average timeslice' value.
+ * (this has to be recalculated even if we reschedule to
+ * the same process) Currently this is only used on SMP:
+ */
#ifdef __SMP__
- next->has_cpu = 1;
- next->processor = this_cpu;
-#endif
+ {
+ cycles_t t, this_slice;
- if (prev != next) {
- kstat.context_swtch++;
- get_mmu_context(next);
- switch_to(prev,next);
- }
+ t = get_cycles();
+ this_slice = t - sched_data->last_schedule;
+ sched_data->last_schedule = t;
- spin_unlock(&scheduler_lock);
+ /*
+ * Simple, exponentially fading average calculation:
+ */
+ prev->avg_slice = this_slice + prev->avg_slice;
+ prev->avg_slice >>= 1;
+ }
/*
- * At this point "prev" is "current", as we just
- * switched into it (from an even more "previous"
- * prev)
+ * We drop the scheduler lock early (it's a global spinlock),
+ * thus we have to lock the previous process from getting
+ * rescheduled during switch_to().
*/
- reacquire_kernel_lock(prev);
+ prev->has_cpu = 1;
+
+ next->has_cpu = 1;
+ next->processor = this_cpu;
+ spin_unlock(&scheduler_lock);
+#endif /* __SMP__ */
+ if (prev != next) {
+#ifdef __SMP__
+ sched_data->prev = prev;
+#endif
+ kstat.context_swtch++;
+ get_mmu_context(next);
+ switch_to(prev,next);
+
+ __schedule_tail();
+ }
+
+ reacquire_kernel_lock(current);
return;
scheduling_in_interrupt:
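The "exponentially fading average" maintained above is simply avg = (avg + new_slice) / 2, so each old sample's weight halves on every reschedule. A quick worked example, with arbitrary cycle counts:

static unsigned long fading_avg(unsigned long avg, unsigned long sample)
{
	return (avg + sample) >> 1;    /* newest sample gets weight 1/2 */
}

/* Starting from 0 and feeding slices of 1000, 1000, 4000 cycles:
 *   fading_avg(0,    1000) ==  500
 *   fading_avg(500,  1000) ==  750
 *   fading_avg(750,  4000) == 2375   -- old history decays by half each step
 */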
@@ -618,7 +776,6 @@ scheduling_in_interrupt:
*(int *)0 = 0;
}
-
rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED;
/*
@@ -701,56 +858,64 @@ void __up(struct semaphore *sem)
* Either form may be used in conjunction with "up()".
*
*/
-static inline int __do_down(struct semaphore * sem, int task_state)
-{
- struct task_struct *tsk = current;
- struct wait_queue wait = { tsk, NULL };
- int ret = 0;
- tsk->state = task_state;
- add_wait_queue(&sem->wait, &wait);
+#define DOWN_VAR \
+ struct task_struct *tsk = current; \
+ struct wait_queue wait = { tsk, NULL };
- /*
- * Ok, we're set up. sem->count is known to be less than zero
- * so we must wait.
- *
- * We can let go the lock for purposes of waiting.
- * We re-acquire it after awaking so as to protect
- * all semaphore operations.
- *
- * If "up()" is called before we call waking_non_zero() then
- * we will catch it right away. If it is called later then
- * we will have to go through a wakeup cycle to catch it.
- *
- * Multiple waiters contend for the semaphore lock to see
- * who gets to gate through and who has to wait some more.
- */
- for (;;) {
- if (waking_non_zero(sem)) /* are we waking up? */
+#define DOWN_HEAD(task_state) \
+ \
+ \
+ tsk->state = (task_state); \
+ add_wait_queue(&sem->wait, &wait); \
+ \
+ /* \
+ * Ok, we're set up. sem->count is known to be less than zero \
+ * so we must wait. \
+ * \
+ * We can let go the lock for purposes of waiting. \
+ * We re-acquire it after awaking so as to protect \
+ * all semaphore operations. \
+ * \
+ * If "up()" is called before we call waking_non_zero() then \
+ * we will catch it right away. If it is called later then \
+ * we will have to go through a wakeup cycle to catch it. \
+ * \
+ * Multiple waiters contend for the semaphore lock to see \
+ * who gets to gate through and who has to wait some more. \
+ */ \
+ for (;;) { \
+ if (waking_non_zero(sem, tsk)) /* are we waking up? */ \
break; /* yes, exit loop */
- if (task_state == TASK_INTERRUPTIBLE && signal_pending(tsk)) {
- ret = -EINTR; /* interrupted */
- atomic_inc(&sem->count); /* give up on down operation */
- break;
- }
-
- schedule();
- tsk->state = task_state;
- }
- tsk->state = TASK_RUNNING;
+#define DOWN_TAIL(task_state) \
+ tsk->state = (task_state); \
+ } \
+ tsk->state = TASK_RUNNING; \
remove_wait_queue(&sem->wait, &wait);
- return ret;
-}
void __down(struct semaphore * sem)
{
- __do_down(sem,TASK_UNINTERRUPTIBLE);
+ DOWN_VAR
+ DOWN_HEAD(TASK_UNINTERRUPTIBLE)
+ schedule();
+ DOWN_TAIL(TASK_UNINTERRUPTIBLE)
}
int __down_interruptible(struct semaphore * sem)
{
- return __do_down(sem,TASK_INTERRUPTIBLE);
+ DOWN_VAR
+ int ret = 0;
+ DOWN_HEAD(TASK_INTERRUPTIBLE)
+ if (signal_pending(tsk))
+ {
+ ret = -EINTR; /* interrupted */
+ atomic_inc(&sem->count); /* give up on down operation */
+ break;
+ }
+ schedule();
+ DOWN_TAIL(TASK_INTERRUPTIBLE)
+ return ret;
}
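From the caller's side nothing changes with this macro rework; the usual idiom for the interruptible variant still looks like the hedged sketch below. The semaphore name and the -ERESTARTSYS return choice are illustrative, not from this patch.

/* Sketch: take a semaphore but back out cleanly if a signal arrives first
 * (`my_sem` is a hypothetical `struct semaphore` initialized elsewhere). */
if (down_interruptible(&my_sem))
	return -ERESTARTSYS;     /* woken by a signal, not by up() */
/* ... critical section ... */
up(&my_sem);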
#define SLEEP_ON_VAR \
@@ -803,6 +968,19 @@ void sleep_on(struct wait_queue **p)
SLEEP_ON_TAIL
}
+long sleep_on_timeout(struct wait_queue **p, long timeout)
+{
+ SLEEP_ON_VAR
+
+ current->state = TASK_UNINTERRUPTIBLE;
+
+ SLEEP_ON_HEAD
+ timeout = schedule_timeout(timeout);
+ SLEEP_ON_TAIL
+
+ return timeout;
+}
+
void scheduling_functions_end_here(void) { }
static inline void cascade_timers(struct timer_vec *tv)
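sleep_on_timeout() above is the old sleep_on() interface routed through schedule_timeout(); a hedged usage sketch follows, with the wait queue and the one-second timeout chosen only for illustration.

/* Sketch: wait for someone to wake_up(&wq), but for at most one second.
 * Like sleep_on(), this keeps the classic check-then-sleep race, so it is
 * only safe when a missed wakeup merely costs the timeout. */
struct wait_queue *wq = NULL;
long left;

left = sleep_on_timeout(&wq, HZ);
if (left == 0)
	printk(KERN_DEBUG "no wakeup arrived within one second\n");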
@@ -940,8 +1118,11 @@ static void second_overflow(void)
/* Bump the maxerror field */
time_maxerror += time_tolerance >> SHIFT_USEC;
- if ( time_maxerror > MAXPHASE )
- time_maxerror = MAXPHASE;
+ if ( time_maxerror > NTP_PHASE_LIMIT ) {
+ time_maxerror = NTP_PHASE_LIMIT;
+ time_state = TIME_ERROR; /* p. 17, sect. 4.3, (b) */
+ time_status |= STA_UNSYNC;
+ }
/*
* Leap second processing. If in leap-insert state at
@@ -965,7 +1146,7 @@ static void second_overflow(void)
if (xtime.tv_sec % 86400 == 0) {
xtime.tv_sec--;
time_state = TIME_OOP;
- printk("Clock: inserting leap second 23:59:60 UTC\n");
+ printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
}
break;
@@ -973,7 +1154,7 @@ static void second_overflow(void)
if ((xtime.tv_sec + 1) % 86400 == 0) {
xtime.tv_sec++;
time_state = TIME_WAIT;
- printk("Clock: deleting leap second 23:59:59 UTC\n");
+ printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
}
break;
@@ -1021,7 +1202,7 @@ static void second_overflow(void)
* the pll and the PPS signal.
*/
pps_valid++;
- if (pps_valid == PPS_VALID) {
+ if (pps_valid == PPS_VALID) { /* PPS signal lost */
pps_jitter = MAXTIME;
pps_stabil = MAXFREQ;
time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
@@ -1036,17 +1217,38 @@ static void second_overflow(void)
(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
#if HZ == 100
- /* compensate for (HZ==100) != 128. Add 25% to get 125; => only 3% error */
+ /* Compensate for (HZ==100) != (1 << SHIFT_HZ).
+ * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
+ */
if (time_adj < 0)
- time_adj -= -time_adj >> 2;
+ time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
else
- time_adj += time_adj >> 2;
+ time_adj += (time_adj >> 2) + (time_adj >> 5);
#endif
}
/* in the NTP reference this is called "hardclock()" */
static void update_wall_time_one_tick(void)
{
+ if ( (time_adjust_step = time_adjust) != 0 ) {
+ /* We are doing an adjtime thing.
+ *
+ * Prepare time_adjust_step to be within bounds.
+ * Note that a positive time_adjust means we want the clock
+ * to run faster.
+ *
+ * Limit the amount of the step to be in the range
+ * -tickadj .. +tickadj
+ */
+ if (time_adjust > tickadj)
+ time_adjust_step = tickadj;
+ else if (time_adjust < -tickadj)
+ time_adjust_step = -tickadj;
+
+ /* Reduce by this step the amount of time left */
+ time_adjust -= time_adjust_step;
+ }
+ xtime.tv_usec += tick + time_adjust_step;
/*
* Advance the phase, once it gets to one microsecond, then
* advance the tick more.
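The extra ">> 5" term added above is what shrinks the error: x/4 + x/32 adds 28.125 %, so the correction becomes 1.28125*x instead of the exact 128/100 = 1.28*x, roughly 0.1 % error versus the 3 % of the old single ">> 2" term. A quick numeric check, with the value chosen only to make the shifts easy to read:

long time_adj_example = 1000;               /* arbitrary scaled value */
long scaled = time_adj_example
	      + (time_adj_example >> 2)     /* + 250 */
	      + (time_adj_example >> 5);    /* +  31 */
/* scaled == 1281, versus the exact 1000 * 128 / 100 == 1280 */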
@@ -1055,37 +1257,13 @@ static void update_wall_time_one_tick(void)
if (time_phase <= -FINEUSEC) {
long ltemp = -time_phase >> SHIFT_SCALE;
time_phase += ltemp << SHIFT_SCALE;
- xtime.tv_usec += tick + time_adjust_step - ltemp;
+ xtime.tv_usec -= ltemp;
}
else if (time_phase >= FINEUSEC) {
long ltemp = time_phase >> SHIFT_SCALE;
time_phase -= ltemp << SHIFT_SCALE;
- xtime.tv_usec += tick + time_adjust_step + ltemp;
- } else
- xtime.tv_usec += tick + time_adjust_step;
-
- if (time_adjust) {
- /* We are doing an adjtime thing.
- *
- * Modify the value of the tick for next time.
- * Note that a positive delta means we want the clock
- * to run fast. This means that the tick should be bigger
- *
- * Limit the amount of the step for *next* tick to be
- * in the range -tickadj .. +tickadj
- */
- if (time_adjust > tickadj)
- time_adjust_step = tickadj;
- else if (time_adjust < -tickadj)
- time_adjust_step = -tickadj;
- else
- time_adjust_step = time_adjust;
-
- /* Reduce by this step the amount of time left */
- time_adjust -= time_adjust_step;
+ xtime.tv_usec += ltemp;
}
- else
- time_adjust_step = 0;
}
/*
@@ -1189,13 +1367,21 @@ static void update_process_times(unsigned long ticks, unsigned long system)
volatile unsigned long lost_ticks = 0;
static unsigned long lost_ticks_system = 0;
+/*
+ * This spinlock protects us from races in SMP while playing with xtime. -arca
+ */
+rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
+
static inline void update_times(void)
{
unsigned long ticks;
- unsigned long flags;
- save_flags(flags);
- cli();
+ /*
+ * update_times() is run from the raw timer_bh handler so we
+ * just know that the irqs are locally enabled and so we don't
+ * need to save/restore the flags of the local CPU here. -arca
+ */
+ write_lock_irq(&xtime_lock);
ticks = lost_ticks;
lost_ticks = 0;
@@ -1206,12 +1392,12 @@ static inline void update_times(void)
calc_load(ticks);
update_wall_time(ticks);
- restore_flags(flags);
+ write_unlock_irq(&xtime_lock);
update_process_times(ticks, system);
} else
- restore_flags(flags);
+ write_unlock_irq(&xtime_lock);
}
static void timer_bh(void)
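The write_lock_irq() above is the writer half of the new xtime_lock; every reader of xtime/lost_ticks now takes the lock for reading. Below is a minimal reader-side sketch in the spirit of do_gettimeofday(); it is simplified, and the real arch code also folds in a sub-tick offset.

/* Sketch: take a consistent snapshot of xtime under the new rwlock. */
static void sample_xtime(struct timeval *tv)
{
	unsigned long flags;

	read_lock_irqsave(&xtime_lock, flags);
	*tv = xtime;                       /* tv_sec/tv_usec read together */
	read_unlock_irqrestore(&xtime_lock, flags);
}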
@@ -1367,7 +1553,7 @@ asmlinkage int sys_nice(int increment)
* do a "normalization" of the priority (traditionally
* Unix nice values are -20 to 20; Linux doesn't really
* use that kind of thing, but uses the length of the
- * timeslice instead (default 150 ms). The rounding is
+ * timeslice instead (default 210 ms). The rounding is
* why we want to avoid negative values.
*/
newprio = (newprio * DEF_PRIORITY + 10) / 20;
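Assuming the 2.2-era definition DEF_PRIORITY == (20*HZ/100) (an assumption stated here, not something this hunk shows), the normalization maps the traditional +/-20 nice range onto ticks of the default timeslice independently of HZ; for example:

/*
 *   HZ == 100  (DEF_PRIORITY == 20) : increment 10 -> (10*20  + 10) / 20 =  10 ticks
 *   HZ == 1000 (DEF_PRIORITY == 200): increment 10 -> (10*200 + 10) / 20 = 100 ticks
 *
 * The same nice step always shifts the priority by the same fraction of the
 * default timeslice, and the "+ 10" just rounds the division by 20.
 */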