Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/acct.c    |   4
-rw-r--r-- | kernel/exit.c    |   6
-rw-r--r-- | kernel/fork.c    |  89
-rw-r--r-- | kernel/ksyms.c   |   9
-rw-r--r-- | kernel/printk.c  |  12
-rw-r--r-- | kernel/sched.c   | 741
-rw-r--r-- | kernel/signal.c  |  23
-rw-r--r-- | kernel/softirq.c |   1
-rw-r--r-- | kernel/sysctl.c  |  19
-rw-r--r-- | kernel/time.c    |  24
10 files changed, 557 insertions, 371 deletions
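A small recurring change in the hunks below is that kernel/acct.c and kernel/exit.c now release kernel-opened files with filp_close() instead of a bare fput() (or close_fp()), and kernel/ksyms.c exports filp_close for modules. The following is an illustrative sketch only, not part of the patch: it shows the filp_open()/filp_close() pairing as module code might use it, relying only on the calls visible in the hunks; example_open_close() and its error handling are hypothetical.

/*
 * Illustrative only -- not from the patch.  filp_close() runs the
 * full close path (including any f_op->flush method) that a plain
 * fput() would skip, which is presumably why acct.c switches to it.
 */
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fcntl.h>

static int example_open_close(const char *name)
{
	struct file *file;

	file = filp_open(name, O_RDONLY, 0);
	if (IS_ERR(file))
		return PTR_ERR(file);

	/* ... read from the file via file->f_op ... */

	return filp_close(file, NULL);	/* NULL: no owning files_struct */
}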
diff --git a/kernel/acct.c b/kernel/acct.c index dc0baed32..a8a94f734 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -194,13 +194,13 @@ asmlinkage int sys_acct(const char *name) } if (old_acct) { do_acct_process(0,old_acct); - fput(old_acct); + filp_close(old_acct, NULL); } out: unlock_kernel(); return error; out_err: - fput(file); + filp_close(file, NULL); goto out; } diff --git a/kernel/exit.c b/kernel/exit.c index 58eb6df5d..b11ed8a11 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -32,9 +32,9 @@ void release(struct task_struct * p) */ for (;;) { int has_cpu; - spin_lock(&scheduler_lock); + spin_lock_irq(&runqueue_lock); has_cpu = p->has_cpu; - spin_unlock(&scheduler_lock); + spin_unlock_irq(&runqueue_lock); if (!has_cpu) break; do { @@ -169,7 +169,7 @@ static inline void close_files(struct files_struct * files) struct file * file = files->fd[i]; if (file) { files->fd[i] = NULL; - close_fp(file, files); + filp_close(file, files); } } i++; diff --git a/kernel/fork.c b/kernel/fork.c index 5c714fe73..a431ffe2b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -57,36 +57,54 @@ kmem_cache_t *uid_cachep; #define uidhashfn(uid) (((uid >> 8) ^ uid) & (UIDHASH_SZ - 1)) +/* + * These routines must be called with the uidhash spinlock held! + */ static inline void uid_hash_insert(struct user_struct *up, unsigned int hashent) { - spin_lock(&uidhash_lock); if((up->next = uidhash[hashent]) != NULL) uidhash[hashent]->pprev = &up->next; up->pprev = &uidhash[hashent]; uidhash[hashent] = up; - spin_unlock(&uidhash_lock); } static inline void uid_hash_remove(struct user_struct *up) { - spin_lock(&uidhash_lock); if(up->next) up->next->pprev = up->pprev; *up->pprev = up->next; - spin_unlock(&uidhash_lock); } -static inline struct user_struct *uid_find(unsigned short uid, unsigned int hashent) +static inline struct user_struct *uid_hash_find(unsigned short uid, unsigned int hashent) { - struct user_struct *up; - - spin_lock(&uidhash_lock); - for(up = uidhash[hashent]; (up && up->uid != uid); up = up->next) - ; - spin_unlock(&uidhash_lock); + struct user_struct *up, *next; + + next = uidhash[hashent]; + for (;;) { + up = next; + if (next) { + next = up->next; + if (up->uid != uid) + continue; + atomic_inc(&up->count); + } + break; + } return up; } +/* + * For SMP, we need to re-test the user struct counter + * after having aquired the spinlock. This allows us to do + * the common case (not freeing anything) without having + * any locking. 
+ */ +#ifdef __SMP__ + #define uid_hash_free(up) (!atomic_read(&(up)->count)) +#else + #define uid_hash_free(up) (1) +#endif + void free_uid(struct task_struct *p) { struct user_struct *up = p->user; @@ -94,8 +112,12 @@ void free_uid(struct task_struct *p) if (up) { p->user = NULL; if (atomic_dec_and_test(&up->count)) { - uid_hash_remove(up); - kmem_cache_free(uid_cachep, up); + spin_lock(&uidhash_lock); + if (uid_hash_free(up)) { + uid_hash_remove(up); + kmem_cache_free(uid_cachep, up); + } + spin_unlock(&uidhash_lock); } } } @@ -103,20 +125,37 @@ void free_uid(struct task_struct *p) int alloc_uid(struct task_struct *p) { unsigned int hashent = uidhashfn(p->uid); - struct user_struct *up = uid_find(p->uid, hashent); + struct user_struct *up; + + spin_lock(&uidhash_lock); + up = uid_hash_find(p->uid, hashent); + spin_unlock(&uidhash_lock); - p->user = up; if (!up) { - up = kmem_cache_alloc(uid_cachep, SLAB_KERNEL); - if (!up) + struct user_struct *new; + + new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL); + if (!new) return -EAGAIN; - p->user = up; - up->uid = p->uid; - atomic_set(&up->count, 0); - uid_hash_insert(up, hashent); - } + new->uid = p->uid; + atomic_set(&new->count, 1); - atomic_inc(&up->count); + /* + * Before adding this, check whether we raced + * on adding the same user already.. + */ + spin_lock(&uidhash_lock); + up = uid_hash_find(p->uid, hashent); + if (up) { + kmem_cache_free(uid_cachep, new); + } else { + uid_hash_insert(new, hashent); + up = new; + } + spin_unlock(&uidhash_lock); + + } + p->user = up; return 0; } @@ -172,8 +211,8 @@ inside: if(last_pid & 0xffff8000) last_pid = 300; next_safe = PID_MAX; - goto repeat; } + goto repeat; } if(p->pid > last_pid && next_safe > p->pid) next_safe = p->pid; @@ -510,6 +549,7 @@ int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs) down(¤t->mm->mmap_sem); lock_kernel(); + retval = -EAGAIN; if (p->user) { if (atomic_read(&p->user->count) >= p->rlim[RLIMIT_NPROC].rlim_cur) goto bad_fork_free; @@ -518,7 +558,6 @@ int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs) { struct task_struct **tslot; tslot = find_empty_process(); - retval = -EAGAIN; if (!tslot) goto bad_fork_free; p->tarray_ptr = tslot; diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 6cf723a4d..8f55f7dfb 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -37,6 +37,7 @@ #include <linux/file.h> #include <linux/console.h> #include <linux/poll.h> +#include <linux/mm.h> #if defined(CONFIG_PROC_FS) #include <linux/proc_fs.h> @@ -60,7 +61,7 @@ extern int request_dma(unsigned int dmanr, char * deviceID); extern void free_dma(unsigned int dmanr); extern spinlock_t dma_spin_lock; -#ifdef MODVERSIONS +#ifdef CONFIG_MODVERSIONS const struct module_symbol __export_Using_Versions __attribute__((section("__ksymtab"))) = { 1 /* Version version */, "Using_Versions" @@ -105,6 +106,8 @@ EXPORT_SYMBOL(max_mapnr); EXPORT_SYMBOL(high_memory); EXPORT_SYMBOL(update_vm_cache); EXPORT_SYMBOL(vmtruncate); +EXPORT_SYMBOL(find_vma); +EXPORT_SYMBOL(get_unmapped_area); /* filesystem internal functions */ EXPORT_SYMBOL(in_group_p); @@ -133,6 +136,7 @@ EXPORT_SYMBOL(__mark_inode_dirty); EXPORT_SYMBOL(get_empty_filp); EXPORT_SYMBOL(init_private_file); EXPORT_SYMBOL(filp_open); +EXPORT_SYMBOL(filp_close); EXPORT_SYMBOL(fput); EXPORT_SYMBOL(put_filp); EXPORT_SYMBOL(check_disk_change); @@ -319,6 +323,8 @@ EXPORT_SYMBOL(printk); EXPORT_SYMBOL(sprintf); EXPORT_SYMBOL(vsprintf); EXPORT_SYMBOL(kdevname); +EXPORT_SYMBOL(bdevname); +EXPORT_SYMBOL(cdevname); 
EXPORT_SYMBOL(simple_strtoul); EXPORT_SYMBOL(system_utsname); /* UTS data */ EXPORT_SYMBOL(uts_sem); /* UTS semaphore */ @@ -367,6 +373,7 @@ EXPORT_SYMBOL(is_bad_inode); EXPORT_SYMBOL(event); EXPORT_SYMBOL(__down); EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__down_trylock); EXPORT_SYMBOL(__up); EXPORT_SYMBOL(brw_page); diff --git a/kernel/printk.c b/kernel/printk.c index a333fe18e..36414fcf3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -137,15 +137,9 @@ int do_syslog(int type, char * buf, int len) error = verify_area(VERIFY_WRITE,buf,len); if (error) goto out; - cli(); - error = -ERESTARTSYS; - while (!log_size) { - if (signal_pending(current)) { - sti(); - goto out; - } - interruptible_sleep_on(&log_wait); - } + error = wait_event_interruptible(log_wait, log_size); + if (error) + goto out; i = 0; while (log_size && i < len) { c = *((char *) log_buf+log_start); diff --git a/kernel/sched.c b/kernel/sched.c index add76fbe0..098c90408 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3,7 +3,6 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * - * 1996-04-21 Modified by Ulrich Windl to make NTP work * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and * make semaphores SMP safe * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. @@ -15,6 +14,7 @@ * serialize accesses to xtime/lost_ticks). * Copyright (C) 1998 Andrea Arcangeli * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar + * 1999-03-10 Improved NTP compatibility by Ulrich Windl */ /* @@ -36,6 +36,7 @@ #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/mmu_context.h> +#include <asm/semaphore-helper.h> #include <linux/timex.h> @@ -61,7 +62,7 @@ DECLARE_TASK_QUEUE(tq_scheduler); * phase-lock loop variables */ /* TIME_ERROR prevents overwriting the CMOS clock */ -int time_state = TIME_ERROR; /* clock synchronization status */ +int time_state = TIME_OK; /* clock synchronization status */ int time_status = STA_UNSYNC; /* clock status bits */ long time_offset = 0; /* time adjustment (us) */ long time_constant = 2; /* pll time constant */ @@ -95,13 +96,156 @@ unsigned long volatile jiffies=0; struct task_struct * task[NR_TASKS] = {&init_task, }; +/* + * We align per-CPU scheduling data on cacheline boundaries, + * to prevent cacheline ping-pong. + */ +static union { + struct schedule_data { + struct task_struct * curr; + cycles_t last_schedule; + } schedule_data; + char __pad [SMP_CACHE_BYTES]; +} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}}; + +#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr + struct kernel_stat kstat = { 0 }; +#ifdef __SMP__ + +#define idle_task(cpu) (task[cpu_number_map[(cpu)]]) +#define can_schedule(p) (!(p)->has_cpu) + +#else + +#define idle_task(cpu) (&init_task) +#define can_schedule(p) (1) + +#endif + void scheduling_functions_start_here(void) { } +/* + * This is the function that decides how desirable a process is.. + * You can weigh different processes against each other depending + * on what CPU they've run on lately etc to try to handle cache + * and TLB miss penalties. + * + * Return values: + * -1000: never select this + * 0: out of time, recalculate counters (but it might still be + * selected) + * +ve: "goodness" value (the larger, the better) + * +1000: realtime process, select this. 
+ */ + +static inline int goodness (struct task_struct * prev, + struct task_struct * p, int this_cpu) +{ + int weight; + + /* + * Realtime process, select the first one on the + * runqueue (taking priorities within processes + * into account). + */ + if (p->policy != SCHED_OTHER) { + weight = 1000 + p->rt_priority; + goto out; + } + + /* + * Give the process a first-approximation goodness value + * according to the number of clock-ticks it has left. + * + * Don't do any other calculations if the time slice is + * over.. + */ + weight = p->counter; + if (!weight) + goto out; + #ifdef __SMP__ -static void reschedule_idle_slow(struct task_struct * p) + /* Give a largish advantage to the same processor... */ + /* (this is equivalent to penalizing other processors) */ + if (p->processor == this_cpu) + weight += PROC_CHANGE_PENALTY; +#endif + + /* .. and a slight advantage to the current MM */ + if (p->mm == prev->mm) + weight += 1; + weight += p->priority; + +out: + return weight; +} + +/* + * subtle. We want to discard a yielded process only if it's being + * considered for a reschedule. Wakeup-time 'queries' of the scheduling + * state do not count. Another optimization we do: sched_yield()-ed + * processes are runnable (and thus will be considered for scheduling) + * right when they are calling schedule(). So the only place we need + * to care about SCHED_YIELD is when we calculate the previous process' + * goodness ... + */ +static inline int prev_goodness (struct task_struct * prev, + struct task_struct * p, int this_cpu) { + if (p->policy & SCHED_YIELD) { + p->policy &= ~SCHED_YIELD; + return 0; + } + return goodness(prev, p, this_cpu); +} + +/* + * the 'goodness value' of replacing a process on a given CPU. + * positive value means 'replace', zero or negative means 'dont'. + */ +static inline int preemption_goodness (struct task_struct * prev, + struct task_struct * p, int cpu) +{ + return goodness(prev, p, cpu) - goodness(prev, prev, cpu); +} + +/* + * If there is a dependency between p1 and p2, + * don't be too eager to go into the slow schedule. + * In particular, if p1 and p2 both want the kernel + * lock, there is no point in trying to make them + * extremely parallel.. + * + * (No lock - lock_depth < 0) + * + * There are two additional metrics here: + * + * first, a 'cutoff' interval, currently 0-200 usecs on + * x86 CPUs, depending on the size of the 'SMP-local cache'. + * If the current process has longer average timeslices than + * this, then we utilize the idle CPU. + * + * second, if the wakeup comes from a process context, + * then the two processes are 'related'. (they form a + * 'gang') + * + * An idle CPU is almost always a bad thing, thus we skip + * the idle-CPU utilization only if both these conditions + * are true. (ie. a 'process-gang' rescheduling with rather + * high frequency should stay on the same CPU). + * + * [We can switch to something more finegrained in 2.3.] + * + * do not 'guess' if the to-be-scheduled task is RT. + */ +#define related(p1,p2) (((p1)->lock_depth >= 0) && (p2)->lock_depth >= 0) && \ + (((p2)->policy == SCHED_OTHER) && ((p1)->avg_slice < cacheflush_time)) + +static inline void reschedule_idle_slow(struct task_struct * p) +{ +#ifdef __SMP__ /* * (see reschedule_idle() for an explanation first ...) * @@ -123,60 +267,71 @@ static void reschedule_idle_slow(struct task_struct * p) * 2.3. Also we can try to use the avg_slice value to predict * 'likely reschedule' events even on other CPUs.] 
*/ - int best_cpu = p->processor, this_cpu = smp_processor_id(); - struct task_struct **idle = task, *tsk, *target_tsk; - int i = smp_num_cpus; + int this_cpu = smp_processor_id(), target_cpu; + struct task_struct *tsk, *target_tsk; + int cpu, best_cpu, weight, best_weight, i; + unsigned long flags; + + best_weight = 0; /* prevents negative weight */ + + spin_lock_irqsave(&runqueue_lock, flags); + + /* + * shortcut if the woken up task's last CPU is + * idle now. + */ + best_cpu = p->processor; + target_tsk = idle_task(best_cpu); + if (cpu_curr(best_cpu) == target_tsk) + goto send_now; target_tsk = NULL; - do { - tsk = *idle; - idle++; - if (tsk->has_cpu) { - if (tsk->processor == this_cpu) - continue; + for (i = 0; i < smp_num_cpus; i++) { + cpu = cpu_logical_map(i); + tsk = cpu_curr(cpu); + if (related(tsk, p)) + goto out_no_target; + weight = preemption_goodness(tsk, p, cpu); + if (weight > best_weight) { + best_weight = weight; target_tsk = tsk; - if (tsk->processor == best_cpu) { - /* - * bingo, we couldnt get a better - * CPU, activate it. - */ - goto send; /* this one helps GCC ... */ - } } - } while (--i > 0); + } /* - * found any idle CPU? + * found any suitable CPU? */ - if (target_tsk) { -send: - target_tsk->need_resched = 1; - smp_send_reschedule(target_tsk->processor); - return; - } -} -#endif /* __SMP__ */ + if (!target_tsk) + goto out_no_target; + +send_now: + target_cpu = target_tsk->processor; + target_tsk->need_resched = 1; + spin_unlock_irqrestore(&runqueue_lock, flags); + /* + * the APIC stuff can go outside of the lock because + * it uses no task information, only CPU#. + */ + if (target_cpu != this_cpu) + smp_send_reschedule(target_cpu); + return; +out_no_target: + spin_unlock_irqrestore(&runqueue_lock, flags); + return; +#else /* UP */ + int this_cpu = smp_processor_id(); + struct task_struct *tsk; -/* - * If there is a dependency between p1 and p2, - * don't be too eager to go into the slow schedule. - * In particular, if p1 and p2 both want the kernel - * lock, there is no point in trying to make them - * extremely parallel.. - * - * (No lock - lock_depth < 0) - */ -#define related(p1,p2) ((p1)->lock_depth >= 0 && (p2)->lock_depth >= 0) + tsk = cpu_curr(this_cpu); + if (preemption_goodness(tsk, p, this_cpu) > 0) + tsk->need_resched = 1; +#endif +} -static inline void reschedule_idle(struct task_struct * p) +static void reschedule_idle(struct task_struct * p) { - - if (p->policy != SCHED_OTHER || p->counter > current->counter + 3) { - current->need_resched = 1; - return; - } - #ifdef __SMP__ + int cpu = smp_processor_id(); /* * ("wakeup()" should not be called before we've initialized * SMP completely. @@ -186,35 +341,20 @@ static inline void reschedule_idle(struct task_struct * p) * * SMP rescheduling is done in 2 passes: * - pass #1: faster: 'quick decisions' - * - pass #2: slower: 'lets try and find another CPU' + * - pass #2: slower: 'lets try and find a suitable CPU' */ /* - * Pass #1 - * - * There are two metrics here: - * - * first, a 'cutoff' interval, currently 0-200 usecs on - * x86 CPUs, depending on the size of the 'SMP-local cache'. - * If the current process has longer average timeslices than - * this, then we utilize the idle CPU. - * - * second, if the wakeup comes from a process context, - * then the two processes are 'related'. (they form a - * 'gang') - * - * An idle CPU is almost always a bad thing, thus we skip - * the idle-CPU utilization only if both these conditions - * are true. (ie. 
a 'process-gang' rescheduling with rather - * high frequency should stay on the same CPU). - * - * [We can switch to something more finegrained in 2.3.] + * Pass #1. (subtle. We might be in the middle of __switch_to, so + * to preserve scheduling atomicity we have to use cpu_curr) */ - if ((current->avg_slice < cacheflush_time) && related(current, p)) + if ((p->processor == cpu) && related(cpu_curr(cpu), p)) return; - - reschedule_idle_slow(p); #endif /* __SMP__ */ + /* + * Pass #2 + */ + reschedule_idle_slow(p); } /* @@ -290,7 +430,6 @@ static inline void move_first_runqueue(struct task_struct * p) * The run-queue lock locks the parts that actually access * and change the run-queues, and have to be interrupt-safe. */ -spinlock_t scheduler_lock = SPIN_LOCK_UNLOCKED; /* should be acquired first */ spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED; /* second */ rwlock_t tasklist_lock = RW_LOCK_UNLOCKED; /* third */ @@ -306,12 +445,19 @@ void wake_up_process(struct task_struct * p) { unsigned long flags; + /* + * We want the common case fall through straight, thus the goto. + */ spin_lock_irqsave(&runqueue_lock, flags); p->state = TASK_RUNNING; - if (!p->next_run) { - add_to_runqueue(p); - reschedule_idle(p); - } + if (p->next_run) + goto out; + add_to_runqueue(p); + spin_unlock_irqrestore(&runqueue_lock, flags); + + reschedule_idle(p); + return; +out: spin_unlock_irqrestore(&runqueue_lock, flags); } @@ -323,63 +469,6 @@ static void process_timeout(unsigned long __data) } /* - * This is the function that decides how desirable a process is.. - * You can weigh different processes against each other depending - * on what CPU they've run on lately etc to try to handle cache - * and TLB miss penalties. - * - * Return values: - * -1000: never select this - * 0: out of time, recalculate counters (but it might still be - * selected) - * +ve: "goodness" value (the larger, the better) - * +1000: realtime process, select this. - */ -static inline int goodness(struct task_struct * p, struct task_struct * prev, int this_cpu) -{ - int policy = p->policy; - int weight; - - if (policy & SCHED_YIELD) { - p->policy = policy & ~SCHED_YIELD; - return 0; - } - - /* - * Realtime process, select the first one on the - * runqueue (taking priorities within processes - * into account). - */ - if (policy != SCHED_OTHER) - return 1000 + p->rt_priority; - - /* - * Give the process a first-approximation goodness value - * according to the number of clock-ticks it has left. - * - * Don't do any other calculations if the time slice is - * over.. - */ - weight = p->counter; - if (weight) { - -#ifdef __SMP__ - /* Give a largish advantage to the same processor... */ - /* (this is equivalent to penalizing other processors) */ - if (p->processor == this_cpu) - weight += PROC_CHANGE_PENALTY; -#endif - - /* .. 
and a slight advantage to the current thread */ - if (p->mm == prev->mm) - weight += 1; - weight += p->priority; - } - - return weight; -} - -/* * Event timer code */ #define TVN_BITS 6 @@ -463,8 +552,17 @@ void add_timer(struct timer_list *timer) unsigned long flags; spin_lock_irqsave(&timerlist_lock, flags); + if (timer->prev) + goto bug; internal_add_timer(timer); +out: spin_unlock_irqrestore(&timerlist_lock, flags); + return; + +bug: + printk("bug: kernel timer added twice at %p.\n", + __builtin_return_address(0)); + goto out; } static inline int detach_timer(struct timer_list *timer) @@ -503,18 +601,6 @@ int del_timer(struct timer_list * timer) return ret; } -#ifdef __SMP__ - -#define idle_task (task[cpu_number_map[this_cpu]]) -#define can_schedule(p) (!(p)->has_cpu) - -#else - -#define idle_task (&init_task) -#define can_schedule(p) (1) - -#endif - signed long schedule_timeout(signed long timeout) { struct timer_list timer; @@ -567,60 +653,24 @@ signed long schedule_timeout(signed long timeout) } /* - * This one aligns per-CPU data on cacheline boundaries. + * schedule_tail() is getting called from the fork return path. This + * cleans up all remaining scheduler things, without impacting the + * common case. */ -static union { - struct schedule_data { - struct task_struct * prev; - long prevstate; - cycles_t last_schedule; - } schedule_data; - char __pad [L1_CACHE_BYTES]; -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}}; - - -static inline void __schedule_tail (void) +static inline void __schedule_tail (struct task_struct *prev) { #ifdef __SMP__ - struct schedule_data * sched_data; - - /* - * We might have switched CPUs: - */ - sched_data = & aligned_data[smp_processor_id()].schedule_data; - - /* - * Subtle. In the rare event that we got a wakeup to 'prev' just - * during the reschedule (this is possible, the scheduler is pretty - * parallel), we should do another reschedule in the next task's - * context. schedule() will do the right thing next time around. - * this is equivalent to 'delaying' the wakeup until the reschedule - * has finished. - */ - if (sched_data->prev->state != sched_data->prevstate) - current->need_resched = 1; - - /* - * Release the previous process ... - * - * We have dropped all locks, and we must make sure that we - * only mark the previous process as no longer having a CPU - * after all other state has been seen by other CPU's. Thus - * the write memory barrier! - */ + if ((prev->state == TASK_RUNNING) && + (prev != idle_task(smp_processor_id()))) + reschedule_idle(prev); wmb(); - sched_data->prev->has_cpu = 0; + prev->has_cpu = 0; #endif /* __SMP__ */ } -/* - * schedule_tail() is getting called from the fork return path. This - * cleans up all remaining scheduler things, without impacting the - * common case. - */ -void schedule_tail (void) +void schedule_tail (struct task_struct *prev) { - __schedule_tail(); + __schedule_tail(prev); } /* @@ -636,36 +686,38 @@ void schedule_tail (void) asmlinkage void schedule(void) { struct schedule_data * sched_data; - struct task_struct * prev, * next; - int this_cpu; + struct task_struct *prev, *next, *p; + int this_cpu, c; + + if (tq_scheduler) + goto handle_tq_scheduler; +tq_scheduler_back: prev = current; this_cpu = prev->processor; - /* - * 'sched_data' is protected by the fact that we can run - * only one process per CPU. 
- */ - sched_data = & aligned_data[this_cpu].schedule_data; if (in_interrupt()) goto scheduling_in_interrupt; + release_kernel_lock(prev, this_cpu); /* Do "administrative" work here while we don't hold any locks */ - if (bh_active & bh_mask) - do_bottom_half(); - run_task_queue(&tq_scheduler); + if (bh_mask & bh_active) + goto handle_bh; +handle_bh_back: + + /* + * 'sched_data' is protected by the fact that we can run + * only one process per CPU. + */ + sched_data = & aligned_data[this_cpu].schedule_data; - spin_lock(&scheduler_lock); spin_lock_irq(&runqueue_lock); /* move an exhausted RR process to be last.. */ - prev->need_resched = 0; - - if (!prev->counter && prev->policy == SCHED_RR) { - prev->counter = prev->priority; - move_last_runqueue(prev); - } + if (prev->policy == SCHED_RR) + goto move_rr_last; +move_rr_back: switch (prev->state) { case TASK_INTERRUPTIBLE: @@ -677,61 +729,72 @@ asmlinkage void schedule(void) del_from_runqueue(prev); case TASK_RUNNING: } + prev->need_resched = 0; - sched_data->prevstate = prev->state; +repeat_schedule: - { - struct task_struct * p = init_task.next_run; - /* - * This is subtle. - * Note how we can enable interrupts here, even - * though interrupts can add processes to the run- - * queue. This is because any new processes will - * be added to the front of the queue, so "p" above - * is a safe starting point. - * run-queue deletion and re-ordering is protected by - * the scheduler lock - */ - spin_unlock_irq(&runqueue_lock); -#ifdef __SMP__ - prev->has_cpu = 0; -#endif - + /* + * this is the scheduler proper: + */ + + p = init_task.next_run; + /* Default process to select.. */ + next = idle_task(this_cpu); + c = -1000; + if (prev->state == TASK_RUNNING) + goto still_running; +still_running_back: + + /* + * This is subtle. + * Note how we can enable interrupts here, even + * though interrupts can add processes to the run- + * queue. This is because any new processes will + * be added to the front of the queue, so "p" above + * is a safe starting point. + * run-queue deletion and re-ordering is protected by + * the scheduler lock + */ /* * Note! there may appear new tasks on the run-queue during this, as * interrupts are enabled. However, they will be put on front of the * list, so our list starting at "p" is essentially fixed. */ -/* this is the scheduler proper: */ - { - int c = -1000; - next = idle_task; - while (p != &init_task) { - if (can_schedule(p)) { - int weight = goodness(p, prev, this_cpu); - if (weight > c) - c = weight, next = p; - } - p = p->next_run; - } - - /* Do we need to re-calculate counters? */ - if (!c) { - struct task_struct *p; - read_lock(&tasklist_lock); - for_each_task(p) - p->counter = (p->counter >> 1) + p->priority; - read_unlock(&tasklist_lock); - } + while (p != &init_task) { + if (can_schedule(p)) { + int weight = goodness(prev, p, this_cpu); + if (weight > c) + c = weight, next = p; } + p = p->next_run; } + /* Do we need to re-calculate counters? */ + if (!c) + goto recalculate; + /* + * from this point on nothing can prevent us from + * switching to the next task, save this fact in + * sched_data. + */ + sched_data->curr = next; +#ifdef __SMP__ + next->has_cpu = 1; + next->processor = this_cpu; +#endif + spin_unlock_irq(&runqueue_lock); + + if (prev == next) + goto same_process; + +#ifdef __SMP__ /* * maintain the per-process 'average timeslice' value. 
* (this has to be recalculated even if we reschedule to - * the same process) Currently this is only used on SMP: + * the same process) Currently this is only used on SMP, + * and it's approximate, so we do not have to maintain + * it while holding the runqueue spinlock. */ -#ifdef __SMP__ { cycles_t t, this_slice; @@ -740,10 +803,11 @@ asmlinkage void schedule(void) sched_data->last_schedule = t; /* - * Simple, exponentially fading average calculation: + * Exponentially fading average calculation, with + * some weight so it doesnt get fooled easily by + * smaller irregularities. */ - prev->avg_slice = this_slice + prev->avg_slice; - prev->avg_slice >>= 1; + prev->avg_slice = (this_slice*1 + prev->avg_slice*1)/2; } /* @@ -751,29 +815,55 @@ asmlinkage void schedule(void) * thus we have to lock the previous process from getting * rescheduled during switch_to(). */ - prev->has_cpu = 1; - next->has_cpu = 1; - next->processor = this_cpu; - spin_unlock(&scheduler_lock); #endif /* __SMP__ */ - if (prev != next) { -#ifdef __SMP__ - sched_data->prev = prev; -#endif - kstat.context_swtch++; - get_mmu_context(next); - switch_to(prev,next); - __schedule_tail(); - } + kstat.context_swtch++; + get_mmu_context(next); + switch_to(prev, next, prev); + __schedule_tail(prev); + +same_process: reacquire_kernel_lock(current); return; +recalculate: + { + struct task_struct *p; + spin_unlock_irq(&runqueue_lock); + read_lock(&tasklist_lock); + for_each_task(p) + p->counter = (p->counter >> 1) + p->priority; + read_unlock(&tasklist_lock); + spin_lock_irq(&runqueue_lock); + goto repeat_schedule; + } + +still_running: + c = prev_goodness(prev, prev, this_cpu); + next = prev; + goto still_running_back; + +handle_bh: + do_bottom_half(); + goto handle_bh_back; + +handle_tq_scheduler: + run_task_queue(&tq_scheduler); + goto tq_scheduler_back; + +move_rr_last: + if (!prev->counter) { + prev->counter = prev->priority; + move_last_runqueue(prev); + } + goto move_rr_back; + scheduling_in_interrupt: printk("Scheduling in interrupt\n"); *(int *)0 = 0; + return; } rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED; @@ -788,21 +878,42 @@ rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED; */ void __wake_up(struct wait_queue **q, unsigned int mode) { - struct wait_queue *next; + struct task_struct *p; + struct wait_queue *head, *next; + + if (!q) + goto out; + /* + * this is safe to be done before the check because it + * means no deference, just pointer operations. + */ + head = WAIT_QUEUE_HEAD(q); read_lock(&waitqueue_lock); - if (q && (next = *q)) { - struct wait_queue *head; - - head = WAIT_QUEUE_HEAD(q); - while (next != head) { - struct task_struct *p = next->task; - next = next->next; - if (p->state & mode) + next = *q; + if (!next) + goto out_unlock; + + while (next != head) { + p = next->task; + next = next->next; + if (p->state & mode) { + /* + * We can drop the read-lock early if this + * is the only/last process. + */ + if (next == head) { + read_unlock(&waitqueue_lock); wake_up_process(p); + goto out; + } + wake_up_process(p); } } +out_unlock: read_unlock(&waitqueue_lock); +out: + return; } /* @@ -863,30 +974,28 @@ void __up(struct semaphore *sem) struct task_struct *tsk = current; \ struct wait_queue wait = { tsk, NULL }; -#define DOWN_HEAD(task_state) \ - \ - \ - tsk->state = (task_state); \ - add_wait_queue(&sem->wait, &wait); \ - \ - /* \ - * Ok, we're set up. sem->count is known to be less than zero \ - * so we must wait. \ - * \ - * We can let go the lock for purposes of waiting. 
\ - * We re-acquire it after awaking so as to protect \ - * all semaphore operations. \ - * \ - * If "up()" is called before we call waking_non_zero() then \ - * we will catch it right away. If it is called later then \ - * we will have to go through a wakeup cycle to catch it. \ - * \ - * Multiple waiters contend for the semaphore lock to see \ - * who gets to gate through and who has to wait some more. \ - */ \ - for (;;) { \ - if (waking_non_zero(sem, tsk)) /* are we waking up? */ \ - break; /* yes, exit loop */ +#define DOWN_HEAD(task_state) \ + \ + \ + tsk->state = (task_state); \ + add_wait_queue(&sem->wait, &wait); \ + \ + /* \ + * Ok, we're set up. sem->count is known to be less than zero \ + * so we must wait. \ + * \ + * We can let go the lock for purposes of waiting. \ + * We re-acquire it after awaking so as to protect \ + * all semaphore operations. \ + * \ + * If "up()" is called before we call waking_non_zero() then \ + * we will catch it right away. If it is called later then \ + * we will have to go through a wakeup cycle to catch it. \ + * \ + * Multiple waiters contend for the semaphore lock to see \ + * who gets to gate through and who has to wait some more. \ + */ \ + for (;;) { #define DOWN_TAIL(task_state) \ tsk->state = (task_state); \ @@ -898,6 +1007,8 @@ void __down(struct semaphore * sem) { DOWN_VAR DOWN_HEAD(TASK_UNINTERRUPTIBLE) + if (waking_non_zero(sem)) + break; schedule(); DOWN_TAIL(TASK_UNINTERRUPTIBLE) } @@ -907,10 +1018,13 @@ int __down_interruptible(struct semaphore * sem) DOWN_VAR int ret = 0; DOWN_HEAD(TASK_INTERRUPTIBLE) - if (signal_pending(tsk)) + + ret = waking_non_zero_interruptible(sem, tsk); + if (ret) { - ret = -EINTR; /* interrupted */ - atomic_inc(&sem->count); /* give up on down operation */ + if (ret == 1) + /* ret != 0 only if we get interrupted -arca */ + ret = 0; break; } schedule(); @@ -918,20 +1032,25 @@ int __down_interruptible(struct semaphore * sem) return ret; } +int __down_trylock(struct semaphore * sem) +{ + return waking_non_zero_trylock(sem); +} + #define SLEEP_ON_VAR \ unsigned long flags; \ struct wait_queue wait; #define SLEEP_ON_HEAD \ wait.task = current; \ - write_lock_irqsave(&waitqueue_lock, flags); \ + write_lock_irqsave(&waitqueue_lock,flags); \ __add_wait_queue(p, &wait); \ write_unlock(&waitqueue_lock); #define SLEEP_ON_TAIL \ write_lock_irq(&waitqueue_lock); \ __remove_wait_queue(p, &wait); \ - write_unlock_irqrestore(&waitqueue_lock, flags); + write_unlock_irqrestore(&waitqueue_lock,flags); void interruptible_sleep_on(struct wait_queue **p) { @@ -1120,7 +1239,6 @@ static void second_overflow(void) time_maxerror += time_tolerance >> SHIFT_USEC; if ( time_maxerror > NTP_PHASE_LIMIT ) { time_maxerror = NTP_PHASE_LIMIT; - time_state = TIME_ERROR; /* p. 17, sect. 4.3, (b) */ time_status |= STA_UNSYNC; } @@ -1606,7 +1724,6 @@ static int setscheduler(pid_t pid, int policy, /* * We play safe to avoid deadlocks. 
*/ - spin_lock(&scheduler_lock); spin_lock_irq(&runqueue_lock); read_lock(&tasklist_lock); @@ -1654,7 +1771,6 @@ static int setscheduler(pid_t pid, int policy, out_unlock: read_unlock(&tasklist_lock); spin_unlock_irq(&runqueue_lock); - spin_unlock(&scheduler_lock); out_nounlock: return retval; @@ -1729,14 +1845,12 @@ out_unlock: asmlinkage int sys_sched_yield(void) { - spin_lock(&scheduler_lock); spin_lock_irq(&runqueue_lock); if (current->policy == SCHED_OTHER) current->policy |= SCHED_YIELD; current->need_resched = 1; move_last_runqueue(current); spin_unlock_irq(&runqueue_lock); - spin_unlock(&scheduler_lock); return 0; } @@ -1913,11 +2027,22 @@ void show_state(void) read_unlock(&tasklist_lock); } +void __init init_idle(void) +{ + cycles_t t; + struct schedule_data * sched_data; + sched_data = &aligned_data[smp_processor_id()].schedule_data; + + t = get_cycles(); + sched_data->curr = current; + sched_data->last_schedule = t; +} + void __init sched_init(void) { /* - * We have to do a little magic to get the first - * process right in SMP mode. + * We have to do a little magic to get the first + * process right in SMP mode. */ int cpu=hard_smp_processor_id(); int nr = NR_TASKS; diff --git a/kernel/signal.c b/kernel/signal.c index c9ea86038..56cb317d9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -265,7 +265,7 @@ printk("SIG queue (%s:%d): %d ", t->comm, t->pid, sig); && ((sig != SIGCONT) || (current->session != t->session)) && (current->euid ^ t->suid) && (current->euid ^ t->uid) && (current->uid ^ t->suid) && (current->uid ^ t->uid) - && !capable(CAP_SYS_ADMIN)) + && !capable(CAP_KILL)) goto out_nolock; /* The null signal is a permissions and process existance probe. @@ -363,8 +363,27 @@ printk("SIG queue (%s:%d): %d ", t->comm, t->pid, sig); } sigaddset(&t->signal, sig); - if (!sigismember(&t->blocked, sig)) + if (!sigismember(&t->blocked, sig)) { t->sigpending = 1; +#ifdef __SMP__ + /* + * If the task is running on a different CPU + * force a reschedule on the other CPU - note that + * the code below is a tad loose and might occasionally + * kick the wrong CPU if we catch the process in the + * process of changing - but no harm is done by that + * other than doing an extra (lightweight) IPI interrupt. + * + * note that we rely on the previous spin_lock to + * lock interrupts for us! No need to set need_resched + * since signal event passing goes through ->blocked. + */ + spin_lock(&runqueue_lock); + if (t->has_cpu && t->processor != smp_processor_id()) + smp_send_reschedule(t->processor); + spin_unlock(&runqueue_lock); +#endif /* __SMP__ */ + } out: spin_unlock_irqrestore(&t->sigmask_lock, flags); diff --git a/kernel/softirq.c b/kernel/softirq.c index 1b364a6a1..d184c944e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -62,6 +62,7 @@ asmlinkage void do_bottom_half(void) if (hardirq_trylock(cpu)) { __sti(); run_bottom_halves(); + __cli(); hardirq_endlock(cpu); } softirq_endlock(cpu); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 44a62a9e9..ed9824136 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -557,14 +557,19 @@ static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root continue; } unregister_proc_table(table->child, de); + + /* Don't unregister directories which still have entries.. */ + if (de->subdir) + continue; } - /* Don't unregister proc directories which still have - entries... 
*/ - if (!((de->mode & S_IFDIR) && de->subdir)) { - proc_unregister(root, de->low_ino); - table->de = NULL; - kfree(de); - } + + /* Don't unregoster proc entries that are still being used.. */ + if (de->count) + continue; + + proc_unregister(root, de->low_ino); + table->de = NULL; + kfree(de); } } diff --git a/kernel/time.c b/kernel/time.c index b1347c32f..9fe12559c 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -96,7 +96,6 @@ asmlinkage int sys_stime(int * tptr) xtime.tv_usec = 0; time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; - time_state = TIME_ERROR; /* p. 24, (a) */ time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; sti(); @@ -221,7 +220,7 @@ void (*hardpps_ptr)(struct timeval *) = (void (*)(struct timeval *))0; int do_adjtimex(struct timex *txc) { long ltemp, mtemp, save_adjust; - int error = 0; + int result = time_state; /* mostly `TIME_OK' */ /* In order to modify anything, you gotta be super-user! */ if (txc->modes && !capable(CAP_SYS_TIME)) @@ -250,16 +249,13 @@ int do_adjtimex(struct timex *txc) /* If there are input parameters, then process them */ if (txc->modes) { - if (time_state == TIME_ERROR) - time_state = TIME_OK; /* reset error -- why? */ - if (txc->modes & ADJ_STATUS) /* only set allowed bits */ time_status = (txc->status & ~STA_RONLY) | (time_status & STA_RONLY); if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { - error = -EINVAL; + result = -EINVAL; goto leave; } time_freq = txc->freq - pps_freq; @@ -267,7 +263,7 @@ int do_adjtimex(struct timex *txc) if (txc->modes & ADJ_MAXERROR) { if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { - error = -EINVAL; + result = -EINVAL; goto leave; } time_maxerror = txc->maxerror; @@ -275,7 +271,7 @@ int do_adjtimex(struct timex *txc) if (txc->modes & ADJ_ESTERROR) { if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { - error = -EINVAL; + result = -EINVAL; goto leave; } time_esterror = txc->esterror; @@ -283,7 +279,7 @@ int do_adjtimex(struct timex *txc) if (txc->modes & ADJ_TIMECONST) { /* p. 24 */ if (txc->constant < 0) { /* NTP v4 uses values > 6 */ - error = -EINVAL; + result = -EINVAL; goto leave; } time_constant = txc->constant; @@ -329,7 +325,7 @@ int do_adjtimex(struct timex *txc) else time_freq += ltemp >> SHIFT_KH; } else /* calibration interval too short (p. 12) */ - time_state = TIME_ERROR; + result = TIME_ERROR; } else { /* PLL mode */ if (mtemp < MAXSEC) { ltemp *= mtemp; @@ -342,7 +338,7 @@ int do_adjtimex(struct timex *txc) time_constant + SHIFT_KF - SHIFT_USEC); } else /* calibration interval too long (p. 12) */ - time_state = TIME_ERROR; + result = TIME_ERROR; } if (time_freq > time_tolerance) time_freq = time_tolerance; @@ -354,7 +350,7 @@ int do_adjtimex(struct timex *txc) /* if the quartz is off by more than 10% something is VERY wrong ! */ if (txc->tick < 900000/HZ || txc->tick > 1100000/HZ) { - error = -EINVAL; + result = -EINVAL; goto leave; } tick = txc->tick; @@ -370,7 +366,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 || ((time_status & STA_PPSFREQ) != 0 && (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0)) /* p. 24, (d) */ - time_state = TIME_ERROR; + result = TIME_ERROR; if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) txc->offset = save_adjust; @@ -399,7 +395,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 txc->stbcnt = pps_stbcnt; sti(); - return(error < 0 ? 
error : time_state); + return(result); } asmlinkage int sys_adjtimex(struct timex *txc_p) |