author | Ralf Baechle <ralf@linux-mips.org> | 1999-02-15 02:15:32 +0000
---|---|---
committer | Ralf Baechle <ralf@linux-mips.org> | 1999-02-15 02:15:32 +0000
commit | 86464aed71025541805e7b1515541aee89879e33 (patch) |
tree | e01a457a4912a8553bc65524aa3125d51f29f810 /kernel/sched.c |
parent | 88f99939ecc6a95a79614574cb7d95ffccfc3466 (diff) |
Merge with Linux 2.2.1.
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 474
1 file changed, 330 insertions, 144 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index c8c297180..add76fbe0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7,6 +7,14 @@
  * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
  *            make semaphores SMP safe
  * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
+ * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
+ *            "A Kernel Model for Precision Timekeeping" by Dave Mills
+ * 1998-11-19 Implemented schedule_timeout() and related stuff
+ *            by Andrea Arcangeli
+ * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
+ *            serialize accesses to xtime/lost_ticks).
+ *            Copyright (C) 1998 Andrea Arcangeli
+ * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar
  */
 
 /*
@@ -59,8 +67,8 @@ long time_offset = 0;		/* time adjustment (us) */
 long time_constant = 2;		/* pll time constant */
 long time_tolerance = MAXFREQ;	/* frequency tolerance (ppm) */
 long time_precision = 1;	/* clock precision (us) */
-long time_maxerror = MAXPHASE;	/* maximum error (us) */
-long time_esterror = MAXPHASE;	/* estimated error (us) */
+long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us) */
+long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us) */
 long time_phase = 0;		/* phase offset (scaled us) */
 long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC;	/* frequency offset (scaled ppm) */
 long time_adj = 0;		/* tick adjust (scaled 1 / HZ) */
@@ -91,47 +99,122 @@ struct kernel_stat kstat = { 0 };
 
 void scheduling_functions_start_here(void) { }
 
-static inline void reschedule_idle(struct task_struct * p)
+#ifdef __SMP__
+static void reschedule_idle_slow(struct task_struct * p)
 {
+/*
+ * (see reschedule_idle() for an explanation first ...)
+ *
+ * Pass #2
+ *
+ * We try to find another (idle) CPU for this woken-up process.
+ *
+ * On SMP, we mostly try to see if the CPU the task used
+ * to run on is idle.. but we will use another idle CPU too,
+ * at this point we already know that this CPU is not
+ * willing to reschedule in the near future.
+ *
+ * An idle CPU is definitely wasted, especially if this CPU is
+ * running long-timeslice processes. The following algorithm is
+ * pretty good at finding the best idle CPU to send this process
+ * to.
+ *
+ * [We can try to preempt low-priority processes on other CPUs in
+ * 2.3. Also we can try to use the avg_slice value to predict
+ * 'likely reschedule' events even on other CPUs.]
+ */
+	int best_cpu = p->processor, this_cpu = smp_processor_id();
+	struct task_struct **idle = task, *tsk, *target_tsk;
+	int i = smp_num_cpus;
+
+	target_tsk = NULL;
+	do {
+		tsk = *idle;
+		idle++;
+		if (tsk->has_cpu) {
+			if (tsk->processor == this_cpu)
+				continue;
+			target_tsk = tsk;
+			if (tsk->processor == best_cpu) {
+				/*
+				 * bingo, we couldnt get a better
+				 * CPU, activate it.
+				 */
+				goto send; /* this one helps GCC ... */
+			}
+		}
+	} while (--i > 0);
+
 	/*
-	 * For SMP, we try to see if the CPU the task used
-	 * to run on is idle..
+	 * found any idle CPU?
 	 */
-#if 0
+	if (target_tsk) {
+send:
+		target_tsk->need_resched = 1;
+		smp_send_reschedule(target_tsk->processor);
+		return;
+	}
+}
+#endif /* __SMP__ */
+
+/*
+ * If there is a dependency between p1 and p2,
+ * don't be too eager to go into the slow schedule.
+ * In particular, if p1 and p2 both want the kernel
+ * lock, there is no point in trying to make them
+ * extremely parallel..
+ *
+ * (No lock - lock_depth < 0)
+ */
+#define related(p1,p2) ((p1)->lock_depth >= 0 && (p2)->lock_depth >= 0)
+
+static inline void reschedule_idle(struct task_struct * p)
+{
+
+	if (p->policy != SCHED_OTHER || p->counter > current->counter + 3) {
+		current->need_resched = 1;
+		return;
+	}
+
+#ifdef __SMP__
 	/*
-	 * Disable this for now. Ingo has some interesting
-	 * code that looks too complex, and I have some ideas,
-	 * but in the meantime.. One problem is that "wakeup()"
-	 * can be (and is) called before we've even initialized
-	 * SMP completely, so..
+	 * ("wakeup()" should not be called before we've initialized
+	 * SMP completely.
+	 * Basically a not-yet initialized SMP subsystem can be
+	 * considered as a not-yet working scheduler, simply dont use
+	 * it before it's up and running ...)
+	 *
+	 * SMP rescheduling is done in 2 passes:
+	 *  - pass #1: faster: 'quick decisions'
+	 *  - pass #2: slower: 'lets try and find another CPU'
 	 */
-#ifdef __SMP__
-	int want_cpu = p->processor;
 
 	/*
-	 * Don't even try to find another CPU for us if the task
-	 * ran on this one before..
+	 * Pass #1
+	 *
+	 * There are two metrics here:
+	 *
+	 * first, a 'cutoff' interval, currently 0-200 usecs on
+	 * x86 CPUs, depending on the size of the 'SMP-local cache'.
+	 * If the current process has longer average timeslices than
+	 * this, then we utilize the idle CPU.
+	 *
+	 * second, if the wakeup comes from a process context,
+	 * then the two processes are 'related'. (they form a
+	 * 'gang')
+	 *
+	 * An idle CPU is almost always a bad thing, thus we skip
+	 * the idle-CPU utilization only if both these conditions
+	 * are true. (ie. a 'process-gang' rescheduling with rather
+	 * high frequency should stay on the same CPU).
+	 *
+	 * [We can switch to something more finegrained in 2.3.]
 	 */
-	if (want_cpu != smp_processor_id()) {
-		struct task_struct **idle = task;
-		int i = smp_num_cpus;
-
-		do {
-			struct task_struct *tsk = *idle;
-			idle++;
-			/* Something like this.. */
-			if (tsk->has_cpu && tsk->processor == want_cpu) {
-				tsk->need_resched = 1;
-				smp_send_reschedule(want_cpu);
-				return;
-			}
-		} while (--i > 0);
-	}
-#endif
-#endif
-	if (p->policy != SCHED_OTHER || p->counter > current->counter + 3)
-		current->need_resched = 1;
+	if ((current->avg_slice < cacheflush_time) && related(current, p))
+		return;
+
+	reschedule_idle_slow(p);
+#endif /* __SMP__ */
 }
 
 /*
@@ -149,6 +232,7 @@ static inline void add_to_runqueue(struct task_struct * p)
 	init_task.next_run = p;
 	p->next_run = next;
 	next->prev_run = p;
+	nr_running++;
 }
 
 static inline void del_from_runqueue(struct task_struct * p)
@@ -227,7 +311,6 @@ void wake_up_process(struct task_struct * p)
 	if (!p->next_run) {
 		add_to_runqueue(p);
 		reschedule_idle(p);
-		nr_running++;
 	}
 	spin_unlock_irqrestore(&runqueue_lock, flags);
 }
@@ -437,23 +520,6 @@ signed long schedule_timeout(signed long timeout)
 	struct timer_list timer;
 	unsigned long expire;
 
-	/*
-	 * PARANOID.
-	 */
-	if (current->state == TASK_UNINTERRUPTIBLE)
-	{
-		printk(KERN_WARNING "schedule_timeout: task not interrutible "
-		       "from %p\n", __builtin_return_address(0));
-		/*
-		 * We don' t want to interrupt a not interruptible task
-		 * risking to cause corruption. Better a a deadlock ;-).
-		 */
-		timeout = MAX_SCHEDULE_TIMEOUT;
-	}
-
-	/*
-	 * Here we start for real.
-	 */
 	switch (timeout)
 	{
 	case MAX_SCHEDULE_TIMEOUT:
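The heart of the scheduler change above is the split of reschedule_idle() into a cheap pass #1 (preempt the current CPU, or do nothing) and a slower pass #2 (reschedule_idle_slow(), which scans for an idle CPU). Below is a standalone sketch of that decision chain, not the kernel code: the task layout, the cacheflush_time cutoff value and the pass #2 scan are all mocked up so the control flow can be compiled and traced in isolation.

```c
/*
 * Standalone sketch of the two-pass wakeup heuristic in the patch
 * above. Everything here (struct task, cacheflush_time, the pass #2
 * stub) is invented for the demo; only the control flow mirrors the
 * kernel change.
 */
#include <stdio.h>

#define SCHED_OTHER 0

struct task {
	int policy;		/* scheduling class */
	long counter;		/* remaining timeslice ticks */
	long avg_slice;		/* fading average timeslice, in cycles */
	int lock_depth;		/* >= 0 while holding the kernel lock */
	int need_resched;
};

static long cacheflush_time = 200;	/* assumed cutoff, in 'cycles' */

/* both hold the kernel lock -> they form a 'gang' (as in the patch) */
#define related(p1, p2) ((p1)->lock_depth >= 0 && (p2)->lock_depth >= 0)

static void reschedule_idle(struct task *curr, struct task *p)
{
	/* pass #1a: RT wakeup or much larger timeslice: preempt here */
	if (p->policy != SCHED_OTHER || p->counter > curr->counter + 3) {
		curr->need_resched = 1;
		return;
	}
	/* pass #1b: short-slice, related tasks stay on this CPU */
	if (curr->avg_slice < cacheflush_time && related(curr, p))
		return;
	/* pass #2 would scan for an idle CPU (reschedule_idle_slow) */
	printf("would look for an idle CPU for the woken task\n");
}

int main(void)
{
	struct task curr = { SCHED_OTHER, 5, 120, 0, 0 };
	struct task woken = { SCHED_OTHER, 4, 0, 0, 0 };

	reschedule_idle(&curr, &woken);	/* gang + short slices: stays put */
	curr.avg_slice = 100000;	/* a long-slice process instead */
	reschedule_idle(&curr, &woken);	/* now worth using an idle CPU */
	return 0;
}
```

The ordering is the point: the common case, a wakeup between short-slice "gang" processes, returns before any cross-CPU work is even considered.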
@@ -501,6 +567,63 @@
 }
 
 /*
+ * This one aligns per-CPU data on cacheline boundaries.
+ */
+static union {
+	struct schedule_data {
+		struct task_struct * prev;
+		long prevstate;
+		cycles_t last_schedule;
+	} schedule_data;
+	char __pad [L1_CACHE_BYTES];
+} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
+
+
+static inline void __schedule_tail (void)
+{
+#ifdef __SMP__
+	struct schedule_data * sched_data;
+
+	/*
+	 * We might have switched CPUs:
+	 */
+	sched_data = & aligned_data[smp_processor_id()].schedule_data;
+
+	/*
+	 * Subtle. In the rare event that we got a wakeup to 'prev' just
+	 * during the reschedule (this is possible, the scheduler is pretty
+	 * parallel), we should do another reschedule in the next task's
+	 * context. schedule() will do the right thing next time around.
+	 * this is equivalent to 'delaying' the wakeup until the reschedule
+	 * has finished.
+	 */
+	if (sched_data->prev->state != sched_data->prevstate)
+		current->need_resched = 1;
+
+	/*
+	 * Release the previous process ...
+	 *
+	 * We have dropped all locks, and we must make sure that we
+	 * only mark the previous process as no longer having a CPU
+	 * after all other state has been seen by other CPU's. Thus
+	 * the write memory barrier!
+	 */
+	wmb();
+	sched_data->prev->has_cpu = 0;
+#endif /* __SMP__ */
+}
+
+/*
+ * schedule_tail() is getting called from the fork return path. This
+ * cleans up all remaining scheduler things, without impacting the
+ * common case.
+ */
+void schedule_tail (void)
+{
+	__schedule_tail();
+}
+
+/*
  * 'schedule()' is the scheduler function. It's a very simple and nice
  * scheduler: it's not perfect, but certainly works for most things.
  *
@@ -512,11 +635,18 @@
  */
 asmlinkage void schedule(void)
 {
+	struct schedule_data * sched_data;
 	struct task_struct * prev, * next;
 	int this_cpu;
 
 	prev = current;
 	this_cpu = prev->processor;
+
+	/*
+	 * 'sched_data' is protected by the fact that we can run
+	 * only one process per CPU.
+	 */
+	sched_data = & aligned_data[this_cpu].schedule_data;
+
 	if (in_interrupt())
 		goto scheduling_in_interrupt;
 	release_kernel_lock(prev, this_cpu);
@@ -531,6 +661,7 @@
 
 	/* move an exhausted RR process to be last.. */
 	prev->need_resched = 0;
+
 	if (!prev->counter && prev->policy == SCHED_RR) {
 		prev->counter = prev->priority;
 		move_last_runqueue(prev);
@@ -546,6 +677,9 @@
 			del_from_runqueue(prev);
 		case TASK_RUNNING:
 	}
+
+	sched_data->prevstate = prev->state;
+
 	{
 		struct task_struct * p = init_task.next_run;
 		/*
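aligned_data above pads each CPU's schedule_data out to a full cache line, so two CPUs updating their own slots never write to the same line and ping-pong it between caches (false sharing). A minimal userspace rendition of the same trick, with an assumed 32-byte line size and GCC's alignment attribute standing in for the kernel's __cacheline_aligned:

```c
/*
 * Sketch of the padding trick used by 'aligned_data': wrap the per-CPU
 * state in a union with a cacheline-sized char array so every CPU's
 * slot starts on its own cache line. Line size and CPU count are
 * assumptions for the demo.
 */
#include <stdio.h>

#define L1_CACHE_BYTES	32
#define NR_CPUS		4

struct schedule_data {
	void *prev;			/* task that last ran here */
	long prevstate;			/* its ->state at schedule() time */
	unsigned long long last_schedule;	/* cycle stamp */
};

static union {
	struct schedule_data schedule_data;
	char __pad[L1_CACHE_BYTES];	/* forces the slot size up */
} aligned_data[NR_CPUS] __attribute__((aligned(L1_CACHE_BYTES)));

int main(void)
{
	/* adjacent slots are exactly one cache line apart */
	printf("slot size: %zu bytes\n", sizeof(aligned_data[0]));
	printf("cpu0 at %p, cpu1 at %p\n",
	       (void *)&aligned_data[0], (void *)&aligned_data[1]);
	return 0;
}
```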
@@ -592,25 +726,49 @@
 		}
 	}
 
+	/*
+	 * maintain the per-process 'average timeslice' value.
+	 * (this has to be recalculated even if we reschedule to
+	 * the same process) Currently this is only used on SMP:
+	 */
 #ifdef __SMP__
-	next->has_cpu = 1;
-	next->processor = this_cpu;
-#endif
+	{
+		cycles_t t, this_slice;
 
-	if (prev != next) {
-		kstat.context_swtch++;
-		get_mmu_context(next);
-		switch_to(prev,next);
-	}
+		t = get_cycles();
+		this_slice = t - sched_data->last_schedule;
+		sched_data->last_schedule = t;
 
-	spin_unlock(&scheduler_lock);
+		/*
+		 * Simple, exponentially fading average calculation:
+		 */
+		prev->avg_slice = this_slice + prev->avg_slice;
+		prev->avg_slice >>= 1;
+	}
 
 	/*
-	 * At this point "prev" is "current", as we just
-	 * switched into it (from an even more "previous"
-	 * prev)
+	 * We drop the scheduler lock early (it's a global spinlock),
+	 * thus we have to lock the previous process from getting
+	 * rescheduled during switch_to().
 	 */
-	reacquire_kernel_lock(prev);
+	prev->has_cpu = 1;
+
+	next->has_cpu = 1;
+	next->processor = this_cpu;
+	spin_unlock(&scheduler_lock);
+#endif /* __SMP__ */
+
+	if (prev != next) {
+#ifdef __SMP__
+		sched_data->prev = prev;
+#endif
+		kstat.context_swtch++;
+		get_mmu_context(next);
+		switch_to(prev,next);
+
+		__schedule_tail();
+	}
+
+	reacquire_kernel_lock(current);
 	return;
 
 scheduling_in_interrupt:
@@ -618,7 +776,6 @@ scheduling_in_interrupt:
 	*(int *)0 = 0;
 }
 
-
 rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED;
 
 /*
@@ -701,56 +858,64 @@
  * Either form may be used in conjunction with "up()".
  *
 */
-static inline int __do_down(struct semaphore * sem, int task_state)
-{
-	struct task_struct *tsk = current;
-	struct wait_queue wait = { tsk, NULL };
-	int ret = 0;
-
-	tsk->state = task_state;
-	add_wait_queue(&sem->wait, &wait);
+#define DOWN_VAR				\
+	struct task_struct *tsk = current;	\
+	struct wait_queue wait = { tsk, NULL };
 
-	/*
-	 * Ok, we're set up.  sem->count is known to be less than zero
-	 * so we must wait.
-	 *
-	 * We can let go the lock for purposes of waiting.
-	 * We re-acquire it after awaking so as to protect
-	 * all semaphore operations.
-	 *
-	 * If "up()" is called before we call waking_non_zero() then
-	 * we will catch it right away.  If it is called later then
-	 * we will have to go through a wakeup cycle to catch it.
-	 *
-	 * Multiple waiters contend for the semaphore lock to see
-	 * who gets to gate through and who has to wait some more.
-	 */
-	for (;;) {
-		if (waking_non_zero(sem))	/* are we waking up?  */
+#define DOWN_HEAD(task_state)						\
+									\
+									\
+	tsk->state = (task_state);					\
+	add_wait_queue(&sem->wait, &wait);				\
+									\
+	/*								\
+	 * Ok, we're set up.  sem->count is known to be less than zero	\
+	 * so we must wait.						\
+	 *								\
+	 * We can let go the lock for purposes of waiting.		\
+	 * We re-acquire it after awaking so as to protect		\
+	 * all semaphore operations.					\
+	 *								\
+	 * If "up()" is called before we call waking_non_zero() then	\
+	 * we will catch it right away.  If it is called later then	\
+	 * we will have to go through a wakeup cycle to catch it.	\
+	 *								\
+	 * Multiple waiters contend for the semaphore lock to see	\
+	 * who gets to gate through and who has to wait some more.	\
+	 */								\
+	for (;;) {							\
+		if (waking_non_zero(sem, tsk))	/* are we waking up?  */ \
 			break;			/* yes, exit loop */
 
-		if (task_state == TASK_INTERRUPTIBLE && signal_pending(tsk)) {
-			ret = -EINTR;			/* interrupted */
-			atomic_inc(&sem->count);	/* give up on down operation */
-			break;
-		}
-
-		schedule();
-		tsk->state = task_state;
-	}
-	tsk->state = TASK_RUNNING;
+#define DOWN_TAIL(task_state)			\
+		tsk->state = (task_state);	\
+	}					\
+	tsk->state = TASK_RUNNING;		\
 	remove_wait_queue(&sem->wait, &wait);
-	return ret;
-}
 
 void __down(struct semaphore * sem)
 {
-	__do_down(sem,TASK_UNINTERRUPTIBLE);
+	DOWN_VAR
+	DOWN_HEAD(TASK_UNINTERRUPTIBLE)
+	schedule();
+	DOWN_TAIL(TASK_UNINTERRUPTIBLE)
 }
 
 int __down_interruptible(struct semaphore * sem)
 {
-	return __do_down(sem,TASK_INTERRUPTIBLE);
+	DOWN_VAR
+	int ret = 0;
+	DOWN_HEAD(TASK_INTERRUPTIBLE)
+	if (signal_pending(tsk))
+	{
+		ret = -EINTR;			/* interrupted */
+		atomic_inc(&sem->count);	/* give up on down operation */
+		break;
+	}
+	schedule();
+	DOWN_TAIL(TASK_INTERRUPTIBLE)
+	return ret;
 }
 
 #define SLEEP_ON_VAR				\
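The per-process average timeslice kept in prev->avg_slice above is an exponentially fading average, avg = (sample + avg) / 2: each older slice weighs half as much as the one after it, which is what lets pass #1 of reschedule_idle() judge "recent" behaviour cheaply. A toy run with invented slice lengths:

```c
/*
 * Demo of the 'exponentially fading average' from schedule() above:
 * avg = (sample + avg) / 2, done with an add and a shift. The slice
 * values are made up.
 */
#include <stdio.h>

int main(void)
{
	unsigned long avg_slice = 0;
	unsigned long slices[] = { 800, 800, 100, 100, 100 };
	size_t i;

	for (i = 0; i < sizeof(slices) / sizeof(slices[0]); i++) {
		avg_slice = slices[i] + avg_slice;
		avg_slice >>= 1;	/* halve: old history fades out */
		printf("slice=%lu avg=%lu\n", slices[i], avg_slice);
	}
	return 0;	/* the average tracks recent slices within a few ticks */
}
```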
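The semaphore rewrite above turns __do_down() into the DOWN_VAR / DOWN_HEAD / DOWN_TAIL macros so that __down() and __down_interruptible() share one sleep loop while splicing different bodies into it. Expanded by hand, the interruptible variant has roughly the shape below; the kernel primitives are stubbed so the sketch compiles on its own (the real waking_non_zero() inspects sem->waking under a lock, which is not modelled here).

```c
/*
 * Hand-expansion sketch of DOWN_VAR + DOWN_HEAD + body + DOWN_TAIL for
 * __down_interruptible(). All kernel primitives are mocked; only the
 * shape of the pasted-together sleep loop is the point.
 */
#include <stdio.h>

#define TASK_INTERRUPTIBLE	1
#define TASK_RUNNING		0
#define EINTR			4

struct semaphore { int count; };

/* mocked primitives, just enough to trace the control flow */
static int waking_non_zero(struct semaphore *sem) { return sem->count >= 0; }
static int signal_pending(void) { return 0; }
static void schedule(void) { printf("sleeping...\n"); }

static int state;	/* stands in for tsk->state */

static int down_interruptible(struct semaphore *sem)
{
	int ret = 0;

	state = TASK_INTERRUPTIBLE;		/* from DOWN_HEAD */
	for (;;) {
		if (waking_non_zero(sem))	/* woken by up()? */
			break;
		if (signal_pending()) {		/* the spliced-in body */
			ret = -EINTR;
			sem->count++;		/* give up on the down */
			break;
		}
		schedule();
		state = TASK_INTERRUPTIBLE;	/* from DOWN_TAIL */
	}
	state = TASK_RUNNING;
	return ret;
}

int main(void)
{
	struct semaphore sem = { 0 };
	printf("down_interruptible -> %d\n", down_interruptible(&sem));
	return 0;
}
```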
@@ -803,6 +968,19 @@
 	SLEEP_ON_TAIL
 }
 
+long sleep_on_timeout(struct wait_queue **p, long timeout)
+{
+	SLEEP_ON_VAR
+
+	current->state = TASK_UNINTERRUPTIBLE;
+
+	SLEEP_ON_HEAD
+	timeout = schedule_timeout(timeout);
+	SLEEP_ON_TAIL
+
+	return timeout;
+}
+
 void scheduling_functions_end_here(void) { }
 
 static inline void cascade_timers(struct timer_vec *tv)
@@ -940,8 +1118,11 @@ static void second_overflow(void)
 
 	/* Bump the maxerror field */
 	time_maxerror += time_tolerance >> SHIFT_USEC;
-	if ( time_maxerror > MAXPHASE )
-		time_maxerror = MAXPHASE;
+	if ( time_maxerror > NTP_PHASE_LIMIT ) {
+		time_maxerror = NTP_PHASE_LIMIT;
+		time_state = TIME_ERROR;	/* p. 17, sect. 4.3, (b) */
+		time_status |= STA_UNSYNC;
+	}
 
 	/*
 	 * Leap second processing. If in leap-insert state at
@@ -965,7 +1146,7 @@
 		if (xtime.tv_sec % 86400 == 0) {
 			xtime.tv_sec--;
 			time_state = TIME_OOP;
-			printk("Clock: inserting leap second 23:59:60 UTC\n");
+			printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
 		}
 		break;
 
@@ -973,7 +1154,7 @@
 		if ((xtime.tv_sec + 1) % 86400 == 0) {
 			xtime.tv_sec++;
 			time_state = TIME_WAIT;
-			printk("Clock: deleting leap second 23:59:59 UTC\n");
+			printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
 		}
 		break;
 
@@ -1021,7 +1202,7 @@
 	 * the pll and the PPS signal.
 	 */
 	pps_valid++;
-	if (pps_valid == PPS_VALID) {
+	if (pps_valid == PPS_VALID) {	/* PPS signal lost */
 		pps_jitter = MAXTIME;
 		pps_stabil = MAXFREQ;
 		time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
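The leap-second branches of second_overflow() above are small enough to model directly. This toy keeps only the tv_sec arithmetic: in TIME_INS state the clock steps back one second at midnight, replaying the last second of the day as the 23:59:60 announced by the printk; in TIME_DEL state it steps forward, skipping 23:59:59. The driver loop and constants are invented for the demo.

```c
/*
 * Toy model of the leap-second handling in second_overflow(). Only the
 * tv_sec arithmetic is kept; states and the epoch value are local
 * stand-ins.
 */
#include <stdio.h>

enum { TIME_OK, TIME_INS, TIME_DEL, TIME_OOP, TIME_WAIT };

static long tv_sec;	/* seconds since the epoch; only %86400 matters */
static int time_state = TIME_INS;

static void second_overflow(void)
{
	switch (time_state) {
	case TIME_INS:
		if (tv_sec % 86400 == 0) {
			tv_sec--;	/* midnight -> replay 23:59:59 as 23:59:60 */
			time_state = TIME_OOP;
			printf("inserting leap second 23:59:60 UTC\n");
		}
		break;
	case TIME_DEL:
		if ((tv_sec + 1) % 86400 == 0) {
			tv_sec++;	/* 23:59:58 -> 00:00:00, skipping 23:59:59 */
			time_state = TIME_WAIT;
			printf("deleting leap second 23:59:59 UTC\n");
		}
		break;
	}
}

int main(void)
{
	for (tv_sec = 86398; tv_sec < 86402; tv_sec++)
		second_overflow();	/* fires once, at the day boundary */
	return 0;
}
```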
@@ -1036,17 +1217,38 @@
 		    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
 
 #if HZ == 100
-    /* compensate for (HZ==100) != 128. Add 25% to get 125; => only 3% error */
+    /* Compensate for (HZ==100) != (1 << SHIFT_HZ).
+     * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
+     */
 	if (time_adj < 0)
-		time_adj -= -time_adj >> 2;
+		time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
 	else
-		time_adj += time_adj >> 2;
+		time_adj += (time_adj >> 2) + (time_adj >> 5);
 #endif
 }
 
 /* in the NTP reference this is called "hardclock()" */
 static void update_wall_time_one_tick(void)
 {
+	if ( (time_adjust_step = time_adjust) != 0 ) {
+	    /* We are doing an adjtime thing.
+	     *
+	     * Prepare time_adjust_step to be within bounds.
+	     * Note that a positive time_adjust means we want the clock
+	     * to run faster.
+	     *
+	     * Limit the amount of the step to be in the range
+	     * -tickadj .. +tickadj
+	     */
+	    if (time_adjust > tickadj)
+		time_adjust_step = tickadj;
+	    else if (time_adjust < -tickadj)
+		time_adjust_step = -tickadj;
+
+	    /* Reduce by this step the amount of time left  */
+	    time_adjust -= time_adjust_step;
+	}
+	xtime.tv_usec += tick + time_adjust_step;
 	/*
 	 * Advance the phase, once it gets to one microsecond, then
 	 * advance the tick more.
@@ -1055,37 +1257,13 @@
 	if (time_phase <= -FINEUSEC) {
 		long ltemp = -time_phase >> SHIFT_SCALE;
 		time_phase += ltemp << SHIFT_SCALE;
-		xtime.tv_usec += tick + time_adjust_step - ltemp;
+		xtime.tv_usec -= ltemp;
 	}
 	else if (time_phase >= FINEUSEC) {
 		long ltemp = time_phase >> SHIFT_SCALE;
 		time_phase -= ltemp << SHIFT_SCALE;
-		xtime.tv_usec += tick + time_adjust_step + ltemp;
-	} else
-		xtime.tv_usec += tick + time_adjust_step;
-
-	if (time_adjust) {
-		/* We are doing an adjtime thing.
-		 *
-		 * Modify the value of the tick for next time.
-		 * Note that a positive delta means we want the clock
-		 * to run fast. This means that the tick should be bigger
-		 *
-		 * Limit the amount of the step for *next* tick to be
-		 * in the range -tickadj .. +tickadj
-		 */
-		if (time_adjust > tickadj)
-			time_adjust_step = tickadj;
-		else if (time_adjust < -tickadj)
-			time_adjust_step = -tickadj;
-		else
-			time_adjust_step = time_adjust;
-
-		/* Reduce by this step the amount of time left */
-		time_adjust -= time_adjust_step;
+		xtime.tv_usec += ltemp;
 	}
-	else
-		time_adjust_step = 0;
 }
@@ -1189,13 +1367,21 @@ static void update_process_times(unsigned long ticks, unsigned long system)
 volatile unsigned long lost_ticks = 0;
 static unsigned long lost_ticks_system = 0;
 
+/*
+ * This spinlock protect us from races in SMP while playing with xtime. -arca
+ */
+rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
+
 static inline void update_times(void)
 {
 	unsigned long ticks;
-	unsigned long flags;
 
-	save_flags(flags);
-	cli();
+	/*
+	 * update_times() is run from the raw timer_bh handler so we
+	 * just know that the irqs are locally enabled and so we don't
+	 * need to save/restore the flags of the local CPU here. -arca
+	 */
+	write_lock_irq(&xtime_lock);
 
 	ticks = lost_ticks;
 	lost_ticks = 0;
@@ -1206,12 +1392,12 @@
 		calc_load(ticks);
 		update_wall_time(ticks);
-		restore_flags(flags);
+		write_unlock_irq(&xtime_lock);
 
 		update_process_times(ticks, system);
 
 	} else
-		restore_flags(flags);
+		write_unlock_irq(&xtime_lock);
 }
 
 static void timer_bh(void)
@@ -1367,7 +1553,7 @@ asmlinkage int sys_nice(int increment)
 	 * do a "normalization" of the priority (traditionally
 	 * Unix nice values are -20 to 20; Linux doesn't really
 	 * use that kind of thing, but uses the length of the
-	 * timeslice instead (default 150 ms). The rounding is
+	 * timeslice instead (default 210 ms). The rounding is
 	 * why we want to avoid negative values.
 	 */
 	newprio = (newprio * DEF_PRIORITY + 10) / 20;
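The adjtime handling moved to the top of update_wall_time_one_tick() above slews the clock rather than stepping it: each tick advances xtime by the nominal tick plus a correction clamped to the range -tickadj .. +tickadj, until the requested offset is exhausted. A userspace model with illustrative values:

```c
/*
 * Userspace model of the adjtime slewing in the patch. Tick length and
 * tickadj are illustrative; the clamp-and-consume logic mirrors the
 * kernel code.
 */
#include <stdio.h>

static long tick = 10000;	/* usecs per tick at HZ=100 */
static long tickadj = 5;	/* max slew per tick, usecs */
static long time_adjust;	/* remaining adjtime offset, usecs */
static long tv_usec;		/* wall-clock microseconds */

static void one_tick(void)
{
	long time_adjust_step;

	if ((time_adjust_step = time_adjust) != 0) {
		/* clamp the per-tick step into -tickadj .. +tickadj */
		if (time_adjust > tickadj)
			time_adjust_step = tickadj;
		else if (time_adjust < -tickadj)
			time_adjust_step = -tickadj;
		/* consume the clamped step from the pending offset */
		time_adjust -= time_adjust_step;
	}
	tv_usec += tick + time_adjust_step;
}

int main(void)
{
	int i;

	time_adjust = 12;	/* ask the clock to gain 12 usecs */
	for (i = 0; i < 4; i++) {
		one_tick();
		printf("tick %d: usec=%ld left=%ld\n", i, tv_usec, time_adjust);
	}
	return 0;
}
```

Asking for 12 us with tickadj = 5 yields steps of 5, 5, 2 and then 0: exactly the bounded slew the patch comment describes.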
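The new xtime_lock above is a reader/writer lock: the timer bottom half is the only writer, while gettimeofday()-style readers may hold the lock concurrently. A POSIX-rwlock analogy of that split; the kernel's rwlock_t, write_lock_irq() and the interrupt reasoning in the -arca comment have no userspace equivalent, so only the reader/writer roles are modelled here:

```c
/*
 * Userspace analogy of the xtime_lock pattern: one writer (the timer
 * bottom half) updates the clock pair, many readers snapshot it
 * consistently. Tick size and field names are stand-ins.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t xtime_lock = PTHREAD_RWLOCK_INITIALIZER;
static long xtime_sec, xtime_usec, lost_ticks;

static void update_times(void)		/* timer bottom half: writer */
{
	pthread_rwlock_wrlock(&xtime_lock);
	xtime_usec += 10000 * lost_ticks;	/* consume pending ticks */
	lost_ticks = 0;
	if (xtime_usec >= 1000000) {
		xtime_usec -= 1000000;
		xtime_sec++;
	}
	pthread_rwlock_unlock(&xtime_lock);
}

static void do_gettimeofday(long *sec, long *usec)	/* reader */
{
	pthread_rwlock_rdlock(&xtime_lock);
	*sec = xtime_sec;	/* sec/usec read as one consistent pair */
	*usec = xtime_usec;
	pthread_rwlock_unlock(&xtime_lock);
}

int main(void)
{
	long s, u;

	lost_ticks = 3;
	update_times();
	do_gettimeofday(&s, &u);
	printf("%ld.%06ld\n", s, u);
	return 0;
}
```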