Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c      4
-rw-r--r--  kernel/exit.c      6
-rw-r--r--  kernel/fork.c     89
-rw-r--r--  kernel/ksyms.c     9
-rw-r--r--  kernel/printk.c   12
-rw-r--r--  kernel/sched.c   741
-rw-r--r--  kernel/signal.c   23
-rw-r--r--  kernel/softirq.c   1
-rw-r--r--  kernel/sysctl.c   19
-rw-r--r--  kernel/time.c     24
10 files changed, 557 insertions(+), 371 deletions(-)
diff --git a/kernel/acct.c b/kernel/acct.c
index dc0baed32..a8a94f734 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -194,13 +194,13 @@ asmlinkage int sys_acct(const char *name)
}
if (old_acct) {
do_acct_process(0,old_acct);
- fput(old_acct);
+ filp_close(old_acct, NULL);
}
out:
unlock_kernel();
return error;
out_err:
- fput(file);
+ filp_close(file, NULL);
goto out;
}
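The two hunks above replace fput() with filp_close() on a file that acct.c opened itself, so the per-open release work runs before the last reference is dropped instead of being skipped. A hedged usage sketch of that pairing (hypothetical helper and path, assuming the filp_open()/filp_close() interface and the ERR_PTR/IS_ERR convention for filp_open() failures; not code from this patch):

#include <linux/fs.h>
#include <linux/fcntl.h>
#include <linux/file.h>

/* Hypothetical helper, for illustration only. */
static int acct_file_example(void)
{
        struct file *f = filp_open("/var/log/pacct", O_WRONLY | O_APPEND, 0600);

        if (IS_ERR(f))
                return PTR_ERR(f);
        /* ... write accounting records through f ... */
        return filp_close(f, NULL);     /* NULL: no owning files_struct */
}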
diff --git a/kernel/exit.c b/kernel/exit.c
index 58eb6df5d..b11ed8a11 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -32,9 +32,9 @@ void release(struct task_struct * p)
*/
for (;;) {
int has_cpu;
- spin_lock(&scheduler_lock);
+ spin_lock_irq(&runqueue_lock);
has_cpu = p->has_cpu;
- spin_unlock(&scheduler_lock);
+ spin_unlock_irq(&runqueue_lock);
if (!has_cpu)
break;
do {
@@ -169,7 +169,7 @@ static inline void close_files(struct files_struct * files)
struct file * file = files->fd[i];
if (file) {
files->fd[i] = NULL;
- close_fp(file, files);
+ filp_close(file, files);
}
}
i++;
diff --git a/kernel/fork.c b/kernel/fork.c
index 5c714fe73..a431ffe2b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -57,36 +57,54 @@ kmem_cache_t *uid_cachep;
#define uidhashfn(uid) (((uid >> 8) ^ uid) & (UIDHASH_SZ - 1))
+/*
+ * These routines must be called with the uidhash spinlock held!
+ */
static inline void uid_hash_insert(struct user_struct *up, unsigned int hashent)
{
- spin_lock(&uidhash_lock);
if((up->next = uidhash[hashent]) != NULL)
uidhash[hashent]->pprev = &up->next;
up->pprev = &uidhash[hashent];
uidhash[hashent] = up;
- spin_unlock(&uidhash_lock);
}
static inline void uid_hash_remove(struct user_struct *up)
{
- spin_lock(&uidhash_lock);
if(up->next)
up->next->pprev = up->pprev;
*up->pprev = up->next;
- spin_unlock(&uidhash_lock);
}
-static inline struct user_struct *uid_find(unsigned short uid, unsigned int hashent)
+static inline struct user_struct *uid_hash_find(unsigned short uid, unsigned int hashent)
{
- struct user_struct *up;
-
- spin_lock(&uidhash_lock);
- for(up = uidhash[hashent]; (up && up->uid != uid); up = up->next)
- ;
- spin_unlock(&uidhash_lock);
+ struct user_struct *up, *next;
+
+ next = uidhash[hashent];
+ for (;;) {
+ up = next;
+ if (next) {
+ next = up->next;
+ if (up->uid != uid)
+ continue;
+ atomic_inc(&up->count);
+ }
+ break;
+ }
return up;
}
+/*
+ * For SMP, we need to re-test the user struct counter
+ * after having acquired the spinlock. This allows us to do
+ * the common case (not freeing anything) without having
+ * any locking.
+ */
+#ifdef __SMP__
+ #define uid_hash_free(up) (!atomic_read(&(up)->count))
+#else
+ #define uid_hash_free(up) (1)
+#endif
+
void free_uid(struct task_struct *p)
{
struct user_struct *up = p->user;
@@ -94,8 +112,12 @@ void free_uid(struct task_struct *p)
if (up) {
p->user = NULL;
if (atomic_dec_and_test(&up->count)) {
- uid_hash_remove(up);
- kmem_cache_free(uid_cachep, up);
+ spin_lock(&uidhash_lock);
+ if (uid_hash_free(up)) {
+ uid_hash_remove(up);
+ kmem_cache_free(uid_cachep, up);
+ }
+ spin_unlock(&uidhash_lock);
}
}
}
@@ -103,20 +125,37 @@ void free_uid(struct task_struct *p)
int alloc_uid(struct task_struct *p)
{
unsigned int hashent = uidhashfn(p->uid);
- struct user_struct *up = uid_find(p->uid, hashent);
+ struct user_struct *up;
+
+ spin_lock(&uidhash_lock);
+ up = uid_hash_find(p->uid, hashent);
+ spin_unlock(&uidhash_lock);
- p->user = up;
if (!up) {
- up = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
- if (!up)
+ struct user_struct *new;
+
+ new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
+ if (!new)
return -EAGAIN;
- p->user = up;
- up->uid = p->uid;
- atomic_set(&up->count, 0);
- uid_hash_insert(up, hashent);
- }
+ new->uid = p->uid;
+ atomic_set(&new->count, 1);
- atomic_inc(&up->count);
+ /*
+ * Before adding this, check whether we raced
+ * on adding the same user already..
+ */
+ spin_lock(&uidhash_lock);
+ up = uid_hash_find(p->uid, hashent);
+ if (up) {
+ kmem_cache_free(uid_cachep, new);
+ } else {
+ uid_hash_insert(new, hashent);
+ up = new;
+ }
+ spin_unlock(&uidhash_lock);
+
+ }
+ p->user = up;
return 0;
}
@@ -172,8 +211,8 @@ inside:
if(last_pid & 0xffff8000)
last_pid = 300;
next_safe = PID_MAX;
- goto repeat;
}
+ goto repeat;
}
if(p->pid > last_pid && next_safe > p->pid)
next_safe = p->pid;
@@ -510,6 +549,7 @@ int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs)
down(&current->mm->mmap_sem);
lock_kernel();
+ retval = -EAGAIN;
if (p->user) {
if (atomic_read(&p->user->count) >= p->rlim[RLIMIT_NPROC].rlim_cur)
goto bad_fork_free;
@@ -518,7 +558,6 @@ int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs)
{
struct task_struct **tslot;
tslot = find_empty_process();
- retval = -EAGAIN;
if (!tslot)
goto bad_fork_free;
p->tarray_ptr = tslot;
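The alloc_uid()/free_uid() hunks move the uidhash_lock out of the hash helpers and into the callers: alloc_uid() now allocates the new user_struct with no lock held, re-checks the hash under the lock, and throws its copy away if another CPU inserted the same uid in the meantime, while free_uid() re-tests the reference count under the lock on SMP. A self-contained userland sketch of that allocate-then-recheck pattern (plain C with pthreads, hypothetical names, not kernel code):

#include <pthread.h>
#include <stdlib.h>

#define HASH_SZ         64
#define hashfn(uid)     ((((uid) >> 8) ^ (uid)) & (HASH_SZ - 1))

struct user { unsigned int uid; int count; struct user *next; };

static struct user *uidhash[HASH_SZ];
static pthread_mutex_t uidhash_lock = PTHREAD_MUTEX_INITIALIZER;

/* must be called with uidhash_lock held */
static struct user *uid_hash_find(unsigned int uid)
{
        struct user *up;

        for (up = uidhash[hashfn(uid)]; up; up = up->next) {
                if (up->uid == uid) {
                        up->count++;            /* take a reference */
                        return up;
                }
        }
        return NULL;
}

struct user *get_user_entry(unsigned int uid)
{
        struct user *up, *new;

        pthread_mutex_lock(&uidhash_lock);
        up = uid_hash_find(uid);
        pthread_mutex_unlock(&uidhash_lock);
        if (up)
                return up;

        new = malloc(sizeof(*new));             /* no lock held: may block */
        if (!new)
                return NULL;
        new->uid = uid;
        new->count = 1;

        pthread_mutex_lock(&uidhash_lock);
        up = uid_hash_find(uid);                /* did someone race us? */
        if (up) {
                free(new);                      /* yes: keep the winner's entry */
        } else {
                new->next = uidhash[hashfn(uid)];
                uidhash[hashfn(uid)] = new;
                up = new;
        }
        pthread_mutex_unlock(&uidhash_lock);
        return up;
}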
diff --git a/kernel/ksyms.c b/kernel/ksyms.c
index 6cf723a4d..8f55f7dfb 100644
--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -37,6 +37,7 @@
#include <linux/file.h>
#include <linux/console.h>
#include <linux/poll.h>
+#include <linux/mm.h>
#if defined(CONFIG_PROC_FS)
#include <linux/proc_fs.h>
@@ -60,7 +61,7 @@ extern int request_dma(unsigned int dmanr, char * deviceID);
extern void free_dma(unsigned int dmanr);
extern spinlock_t dma_spin_lock;
-#ifdef MODVERSIONS
+#ifdef CONFIG_MODVERSIONS
const struct module_symbol __export_Using_Versions
__attribute__((section("__ksymtab"))) = {
1 /* Version version */, "Using_Versions"
@@ -105,6 +106,8 @@ EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(high_memory);
EXPORT_SYMBOL(update_vm_cache);
EXPORT_SYMBOL(vmtruncate);
+EXPORT_SYMBOL(find_vma);
+EXPORT_SYMBOL(get_unmapped_area);
/* filesystem internal functions */
EXPORT_SYMBOL(in_group_p);
@@ -133,6 +136,7 @@ EXPORT_SYMBOL(__mark_inode_dirty);
EXPORT_SYMBOL(get_empty_filp);
EXPORT_SYMBOL(init_private_file);
EXPORT_SYMBOL(filp_open);
+EXPORT_SYMBOL(filp_close);
EXPORT_SYMBOL(fput);
EXPORT_SYMBOL(put_filp);
EXPORT_SYMBOL(check_disk_change);
@@ -319,6 +323,8 @@ EXPORT_SYMBOL(printk);
EXPORT_SYMBOL(sprintf);
EXPORT_SYMBOL(vsprintf);
EXPORT_SYMBOL(kdevname);
+EXPORT_SYMBOL(bdevname);
+EXPORT_SYMBOL(cdevname);
EXPORT_SYMBOL(simple_strtoul);
EXPORT_SYMBOL(system_utsname); /* UTS data */
EXPORT_SYMBOL(uts_sem); /* UTS semaphore */
@@ -367,6 +373,7 @@ EXPORT_SYMBOL(is_bad_inode);
EXPORT_SYMBOL(event);
EXPORT_SYMBOL(__down);
EXPORT_SYMBOL(__down_interruptible);
+EXPORT_SYMBOL(__down_trylock);
EXPORT_SYMBOL(__up);
EXPORT_SYMBOL(brw_page);
diff --git a/kernel/printk.c b/kernel/printk.c
index a333fe18e..36414fcf3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -137,15 +137,9 @@ int do_syslog(int type, char * buf, int len)
error = verify_area(VERIFY_WRITE,buf,len);
if (error)
goto out;
- cli();
- error = -ERESTARTSYS;
- while (!log_size) {
- if (signal_pending(current)) {
- sti();
- goto out;
- }
- interruptible_sleep_on(&log_wait);
- }
+ error = wait_event_interruptible(log_wait, log_size);
+ if (error)
+ goto out;
i = 0;
while (log_size && i < len) {
c = *((char *) log_buf+log_start);
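The do_syslog() hunk swaps an open-coded cli()/interruptible_sleep_on() loop for wait_event_interruptible(log_wait, log_size): the condition is re-tested after every wakeup and a pending signal becomes -ERESTARTSYS, with no fiddling with the interrupt flag. Roughly what the caller gets, as an open-coded sketch assuming the 2.2-era wait-queue API (not the literal macro expansion):

struct wait_queue wait = { current, NULL };
int error = 0;

add_wait_queue(&log_wait, &wait);
for (;;) {
        current->state = TASK_INTERRUPTIBLE;
        if (log_size)                           /* condition: data arrived */
                break;
        if (signal_pending(current)) {
                error = -ERESTARTSYS;           /* restartable, signal pending */
                break;
        }
        schedule();
}
current->state = TASK_RUNNING;
remove_wait_queue(&log_wait, &wait);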
diff --git a/kernel/sched.c b/kernel/sched.c
index add76fbe0..098c90408 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3,7 +3,6 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
- * 1996-04-21 Modified by Ulrich Windl to make NTP work
* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
* make semaphores SMP safe
* 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
@@ -15,6 +14,7 @@
* serialize accesses to xtime/lost_ticks).
* Copyright (C) 1998 Andrea Arcangeli
* 1998-12-28 Implemented better SMP scheduling by Ingo Molnar
+ * 1999-03-10 Improved NTP compatibility by Ulrich Windl
*/
/*
@@ -36,6 +36,7 @@
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
+#include <asm/semaphore-helper.h>
#include <linux/timex.h>
@@ -61,7 +62,7 @@ DECLARE_TASK_QUEUE(tq_scheduler);
* phase-lock loop variables
*/
/* TIME_ERROR prevents overwriting the CMOS clock */
-int time_state = TIME_ERROR; /* clock synchronization status */
+int time_state = TIME_OK; /* clock synchronization status */
int time_status = STA_UNSYNC; /* clock status bits */
long time_offset = 0; /* time adjustment (us) */
long time_constant = 2; /* pll time constant */
@@ -95,13 +96,156 @@ unsigned long volatile jiffies=0;
struct task_struct * task[NR_TASKS] = {&init_task, };
+/*
+ * We align per-CPU scheduling data on cacheline boundaries,
+ * to prevent cacheline ping-pong.
+ */
+static union {
+ struct schedule_data {
+ struct task_struct * curr;
+ cycles_t last_schedule;
+ } schedule_data;
+ char __pad [SMP_CACHE_BYTES];
+} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
+
+#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
+
struct kernel_stat kstat = { 0 };
+#ifdef __SMP__
+
+#define idle_task(cpu) (task[cpu_number_map[(cpu)]])
+#define can_schedule(p) (!(p)->has_cpu)
+
+#else
+
+#define idle_task(cpu) (&init_task)
+#define can_schedule(p) (1)
+
+#endif
+
void scheduling_functions_start_here(void) { }
+/*
+ * This is the function that decides how desirable a process is..
+ * You can weigh different processes against each other depending
+ * on what CPU they've run on lately etc to try to handle cache
+ * and TLB miss penalties.
+ *
+ * Return values:
+ * -1000: never select this
+ * 0: out of time, recalculate counters (but it might still be
+ * selected)
+ * +ve: "goodness" value (the larger, the better)
+ * +1000: realtime process, select this.
+ */
+
+static inline int goodness (struct task_struct * prev,
+ struct task_struct * p, int this_cpu)
+{
+ int weight;
+
+ /*
+ * Realtime process, select the first one on the
+ * runqueue (taking priorities within processes
+ * into account).
+ */
+ if (p->policy != SCHED_OTHER) {
+ weight = 1000 + p->rt_priority;
+ goto out;
+ }
+
+ /*
+ * Give the process a first-approximation goodness value
+ * according to the number of clock-ticks it has left.
+ *
+ * Don't do any other calculations if the time slice is
+ * over..
+ */
+ weight = p->counter;
+ if (!weight)
+ goto out;
+
#ifdef __SMP__
-static void reschedule_idle_slow(struct task_struct * p)
+ /* Give a largish advantage to the same processor... */
+ /* (this is equivalent to penalizing other processors) */
+ if (p->processor == this_cpu)
+ weight += PROC_CHANGE_PENALTY;
+#endif
+
+ /* .. and a slight advantage to the current MM */
+ if (p->mm == prev->mm)
+ weight += 1;
+ weight += p->priority;
+
+out:
+ return weight;
+}
+
+/*
+ * subtle. We want to discard a yielded process only if it's being
+ * considered for a reschedule. Wakeup-time 'queries' of the scheduling
+ * state do not count. Another optimization we do: sched_yield()-ed
+ * processes are runnable (and thus will be considered for scheduling)
+ * right when they are calling schedule(). So the only place we need
+ * to care about SCHED_YIELD is when we calculate the previous process'
+ * goodness ...
+ */
+static inline int prev_goodness (struct task_struct * prev,
+ struct task_struct * p, int this_cpu)
{
+ if (p->policy & SCHED_YIELD) {
+ p->policy &= ~SCHED_YIELD;
+ return 0;
+ }
+ return goodness(prev, p, this_cpu);
+}
+
+/*
+ * the 'goodness value' of replacing a process on a given CPU.
+ * positive value means 'replace', zero or negative means 'don't'.
+ */
+static inline int preemption_goodness (struct task_struct * prev,
+ struct task_struct * p, int cpu)
+{
+ return goodness(prev, p, cpu) - goodness(prev, prev, cpu);
+}
+
+/*
+ * If there is a dependency between p1 and p2,
+ * don't be too eager to go into the slow schedule.
+ * In particular, if p1 and p2 both want the kernel
+ * lock, there is no point in trying to make them
+ * extremely parallel..
+ *
+ * (No lock - lock_depth < 0)
+ *
+ * There are two additional metrics here:
+ *
+ * first, a 'cutoff' interval, currently 0-200 usecs on
+ * x86 CPUs, depending on the size of the 'SMP-local cache'.
+ * If the current process has longer average timeslices than
+ * this, then we utilize the idle CPU.
+ *
+ * second, if the wakeup comes from a process context,
+ * then the two processes are 'related'. (they form a
+ * 'gang')
+ *
+ * An idle CPU is almost always a bad thing, thus we skip
+ * the idle-CPU utilization only if both these conditions
+ * are true. (ie. a 'process-gang' rescheduling with rather
+ * high frequency should stay on the same CPU).
+ *
+ * [We can switch to something more finegrained in 2.3.]
+ *
+ * do not 'guess' if the to-be-scheduled task is RT.
+ */
+#define related(p1,p2) (((p1)->lock_depth >= 0) && (p2)->lock_depth >= 0) && \
+ (((p2)->policy == SCHED_OTHER) && ((p1)->avg_slice < cacheflush_time))
+
+static inline void reschedule_idle_slow(struct task_struct * p)
+{
+#ifdef __SMP__
/*
* (see reschedule_idle() for an explanation first ...)
*
@@ -123,60 +267,71 @@ static void reschedule_idle_slow(struct task_struct * p)
* 2.3. Also we can try to use the avg_slice value to predict
* 'likely reschedule' events even on other CPUs.]
*/
- int best_cpu = p->processor, this_cpu = smp_processor_id();
- struct task_struct **idle = task, *tsk, *target_tsk;
- int i = smp_num_cpus;
+ int this_cpu = smp_processor_id(), target_cpu;
+ struct task_struct *tsk, *target_tsk;
+ int cpu, best_cpu, weight, best_weight, i;
+ unsigned long flags;
+
+ best_weight = 0; /* prevents negative weight */
+
+ spin_lock_irqsave(&runqueue_lock, flags);
+
+ /*
+ * shortcut if the woken up task's last CPU is
+ * idle now.
+ */
+ best_cpu = p->processor;
+ target_tsk = idle_task(best_cpu);
+ if (cpu_curr(best_cpu) == target_tsk)
+ goto send_now;
target_tsk = NULL;
- do {
- tsk = *idle;
- idle++;
- if (tsk->has_cpu) {
- if (tsk->processor == this_cpu)
- continue;
+ for (i = 0; i < smp_num_cpus; i++) {
+ cpu = cpu_logical_map(i);
+ tsk = cpu_curr(cpu);
+ if (related(tsk, p))
+ goto out_no_target;
+ weight = preemption_goodness(tsk, p, cpu);
+ if (weight > best_weight) {
+ best_weight = weight;
target_tsk = tsk;
- if (tsk->processor == best_cpu) {
- /*
- * bingo, we couldnt get a better
- * CPU, activate it.
- */
- goto send; /* this one helps GCC ... */
- }
}
- } while (--i > 0);
+ }
/*
- * found any idle CPU?
+ * found any suitable CPU?
*/
- if (target_tsk) {
-send:
- target_tsk->need_resched = 1;
- smp_send_reschedule(target_tsk->processor);
- return;
- }
-}
-#endif /* __SMP__ */
+ if (!target_tsk)
+ goto out_no_target;
+
+send_now:
+ target_cpu = target_tsk->processor;
+ target_tsk->need_resched = 1;
+ spin_unlock_irqrestore(&runqueue_lock, flags);
+ /*
+ * the APIC stuff can go outside of the lock because
+ * it uses no task information, only CPU#.
+ */
+ if (target_cpu != this_cpu)
+ smp_send_reschedule(target_cpu);
+ return;
+out_no_target:
+ spin_unlock_irqrestore(&runqueue_lock, flags);
+ return;
+#else /* UP */
+ int this_cpu = smp_processor_id();
+ struct task_struct *tsk;
-/*
- * If there is a dependency between p1 and p2,
- * don't be too eager to go into the slow schedule.
- * In particular, if p1 and p2 both want the kernel
- * lock, there is no point in trying to make them
- * extremely parallel..
- *
- * (No lock - lock_depth < 0)
- */
-#define related(p1,p2) ((p1)->lock_depth >= 0 && (p2)->lock_depth >= 0)
+ tsk = cpu_curr(this_cpu);
+ if (preemption_goodness(tsk, p, this_cpu) > 0)
+ tsk->need_resched = 1;
+#endif
+}
-static inline void reschedule_idle(struct task_struct * p)
+static void reschedule_idle(struct task_struct * p)
{
-
- if (p->policy != SCHED_OTHER || p->counter > current->counter + 3) {
- current->need_resched = 1;
- return;
- }
-
#ifdef __SMP__
+ int cpu = smp_processor_id();
/*
* ("wakeup()" should not be called before we've initialized
* SMP completely.
@@ -186,35 +341,20 @@ static inline void reschedule_idle(struct task_struct * p)
*
* SMP rescheduling is done in 2 passes:
* - pass #1: faster: 'quick decisions'
- * - pass #2: slower: 'lets try and find another CPU'
+ * - pass #2: slower: 'lets try and find a suitable CPU'
*/
/*
- * Pass #1
- *
- * There are two metrics here:
- *
- * first, a 'cutoff' interval, currently 0-200 usecs on
- * x86 CPUs, depending on the size of the 'SMP-local cache'.
- * If the current process has longer average timeslices than
- * this, then we utilize the idle CPU.
- *
- * second, if the wakeup comes from a process context,
- * then the two processes are 'related'. (they form a
- * 'gang')
- *
- * An idle CPU is almost always a bad thing, thus we skip
- * the idle-CPU utilization only if both these conditions
- * are true. (ie. a 'process-gang' rescheduling with rather
- * high frequency should stay on the same CPU).
- *
- * [We can switch to something more finegrained in 2.3.]
+ * Pass #1. (subtle. We might be in the middle of __switch_to, so
+ * to preserve scheduling atomicity we have to use cpu_curr)
*/
- if ((current->avg_slice < cacheflush_time) && related(current, p))
+ if ((p->processor == cpu) && related(cpu_curr(cpu), p))
return;
-
- reschedule_idle_slow(p);
#endif /* __SMP__ */
+ /*
+ * Pass #2
+ */
+ reschedule_idle_slow(p);
}
/*
@@ -290,7 +430,6 @@ static inline void move_first_runqueue(struct task_struct * p)
* The run-queue lock locks the parts that actually access
* and change the run-queues, and have to be interrupt-safe.
*/
-spinlock_t scheduler_lock = SPIN_LOCK_UNLOCKED; /* should be acquired first */
spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED; /* second */
rwlock_t tasklist_lock = RW_LOCK_UNLOCKED; /* third */
@@ -306,12 +445,19 @@ void wake_up_process(struct task_struct * p)
{
unsigned long flags;
+ /*
+ * We want the common case to fall straight through, thus the goto.
+ */
spin_lock_irqsave(&runqueue_lock, flags);
p->state = TASK_RUNNING;
- if (!p->next_run) {
- add_to_runqueue(p);
- reschedule_idle(p);
- }
+ if (p->next_run)
+ goto out;
+ add_to_runqueue(p);
+ spin_unlock_irqrestore(&runqueue_lock, flags);
+
+ reschedule_idle(p);
+ return;
+out:
spin_unlock_irqrestore(&runqueue_lock, flags);
}
@@ -323,63 +469,6 @@ static void process_timeout(unsigned long __data)
}
/*
- * This is the function that decides how desirable a process is..
- * You can weigh different processes against each other depending
- * on what CPU they've run on lately etc to try to handle cache
- * and TLB miss penalties.
- *
- * Return values:
- * -1000: never select this
- * 0: out of time, recalculate counters (but it might still be
- * selected)
- * +ve: "goodness" value (the larger, the better)
- * +1000: realtime process, select this.
- */
-static inline int goodness(struct task_struct * p, struct task_struct * prev, int this_cpu)
-{
- int policy = p->policy;
- int weight;
-
- if (policy & SCHED_YIELD) {
- p->policy = policy & ~SCHED_YIELD;
- return 0;
- }
-
- /*
- * Realtime process, select the first one on the
- * runqueue (taking priorities within processes
- * into account).
- */
- if (policy != SCHED_OTHER)
- return 1000 + p->rt_priority;
-
- /*
- * Give the process a first-approximation goodness value
- * according to the number of clock-ticks it has left.
- *
- * Don't do any other calculations if the time slice is
- * over..
- */
- weight = p->counter;
- if (weight) {
-
-#ifdef __SMP__
- /* Give a largish advantage to the same processor... */
- /* (this is equivalent to penalizing other processors) */
- if (p->processor == this_cpu)
- weight += PROC_CHANGE_PENALTY;
-#endif
-
- /* .. and a slight advantage to the current thread */
- if (p->mm == prev->mm)
- weight += 1;
- weight += p->priority;
- }
-
- return weight;
-}
-
-/*
* Event timer code
*/
#define TVN_BITS 6
@@ -463,8 +552,17 @@ void add_timer(struct timer_list *timer)
unsigned long flags;
spin_lock_irqsave(&timerlist_lock, flags);
+ if (timer->prev)
+ goto bug;
internal_add_timer(timer);
+out:
spin_unlock_irqrestore(&timerlist_lock, flags);
+ return;
+
+bug:
+ printk("bug: kernel timer added twice at %p.\n",
+ __builtin_return_address(0));
+ goto out;
}
static inline int detach_timer(struct timer_list *timer)
@@ -503,18 +601,6 @@ int del_timer(struct timer_list * timer)
return ret;
}
-#ifdef __SMP__
-
-#define idle_task (task[cpu_number_map[this_cpu]])
-#define can_schedule(p) (!(p)->has_cpu)
-
-#else
-
-#define idle_task (&init_task)
-#define can_schedule(p) (1)
-
-#endif
-
signed long schedule_timeout(signed long timeout)
{
struct timer_list timer;
@@ -567,60 +653,24 @@ signed long schedule_timeout(signed long timeout)
}
/*
- * This one aligns per-CPU data on cacheline boundaries.
+ * schedule_tail() is getting called from the fork return path. This
+ * cleans up all remaining scheduler things, without impacting the
+ * common case.
*/
-static union {
- struct schedule_data {
- struct task_struct * prev;
- long prevstate;
- cycles_t last_schedule;
- } schedule_data;
- char __pad [L1_CACHE_BYTES];
-} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
-
-
-static inline void __schedule_tail (void)
+static inline void __schedule_tail (struct task_struct *prev)
{
#ifdef __SMP__
- struct schedule_data * sched_data;
-
- /*
- * We might have switched CPUs:
- */
- sched_data = & aligned_data[smp_processor_id()].schedule_data;
-
- /*
- * Subtle. In the rare event that we got a wakeup to 'prev' just
- * during the reschedule (this is possible, the scheduler is pretty
- * parallel), we should do another reschedule in the next task's
- * context. schedule() will do the right thing next time around.
- * this is equivalent to 'delaying' the wakeup until the reschedule
- * has finished.
- */
- if (sched_data->prev->state != sched_data->prevstate)
- current->need_resched = 1;
-
- /*
- * Release the previous process ...
- *
- * We have dropped all locks, and we must make sure that we
- * only mark the previous process as no longer having a CPU
- * after all other state has been seen by other CPU's. Thus
- * the write memory barrier!
- */
+ if ((prev->state == TASK_RUNNING) &&
+ (prev != idle_task(smp_processor_id())))
+ reschedule_idle(prev);
wmb();
- sched_data->prev->has_cpu = 0;
+ prev->has_cpu = 0;
#endif /* __SMP__ */
}
-/*
- * schedule_tail() is getting called from the fork return path. This
- * cleans up all remaining scheduler things, without impacting the
- * common case.
- */
-void schedule_tail (void)
+void schedule_tail (struct task_struct *prev)
{
- __schedule_tail();
+ __schedule_tail(prev);
}
/*
@@ -636,36 +686,38 @@ void schedule_tail (void)
asmlinkage void schedule(void)
{
struct schedule_data * sched_data;
- struct task_struct * prev, * next;
- int this_cpu;
+ struct task_struct *prev, *next, *p;
+ int this_cpu, c;
+
+ if (tq_scheduler)
+ goto handle_tq_scheduler;
+tq_scheduler_back:
prev = current;
this_cpu = prev->processor;
- /*
- * 'sched_data' is protected by the fact that we can run
- * only one process per CPU.
- */
- sched_data = & aligned_data[this_cpu].schedule_data;
if (in_interrupt())
goto scheduling_in_interrupt;
+
release_kernel_lock(prev, this_cpu);
/* Do "administrative" work here while we don't hold any locks */
- if (bh_active & bh_mask)
- do_bottom_half();
- run_task_queue(&tq_scheduler);
+ if (bh_mask & bh_active)
+ goto handle_bh;
+handle_bh_back:
+
+ /*
+ * 'sched_data' is protected by the fact that we can run
+ * only one process per CPU.
+ */
+ sched_data = & aligned_data[this_cpu].schedule_data;
- spin_lock(&scheduler_lock);
spin_lock_irq(&runqueue_lock);
/* move an exhausted RR process to be last.. */
- prev->need_resched = 0;
-
- if (!prev->counter && prev->policy == SCHED_RR) {
- prev->counter = prev->priority;
- move_last_runqueue(prev);
- }
+ if (prev->policy == SCHED_RR)
+ goto move_rr_last;
+move_rr_back:
switch (prev->state) {
case TASK_INTERRUPTIBLE:
@@ -677,61 +729,72 @@ asmlinkage void schedule(void)
del_from_runqueue(prev);
case TASK_RUNNING:
}
+ prev->need_resched = 0;
- sched_data->prevstate = prev->state;
+repeat_schedule:
- {
- struct task_struct * p = init_task.next_run;
- /*
- * This is subtle.
- * Note how we can enable interrupts here, even
- * though interrupts can add processes to the run-
- * queue. This is because any new processes will
- * be added to the front of the queue, so "p" above
- * is a safe starting point.
- * run-queue deletion and re-ordering is protected by
- * the scheduler lock
- */
- spin_unlock_irq(&runqueue_lock);
-#ifdef __SMP__
- prev->has_cpu = 0;
-#endif
-
+ /*
+ * this is the scheduler proper:
+ */
+
+ p = init_task.next_run;
+ /* Default process to select.. */
+ next = idle_task(this_cpu);
+ c = -1000;
+ if (prev->state == TASK_RUNNING)
+ goto still_running;
+still_running_back:
+
+ /*
+ * This is subtle.
+ * Note how we can enable interrupts here, even
+ * though interrupts can add processes to the run-
+ * queue. This is because any new processes will
+ * be added to the front of the queue, so "p" above
+ * is a safe starting point.
+ * run-queue deletion and re-ordering is protected by
+ * the scheduler lock
+ */
/*
* Note! there may appear new tasks on the run-queue during this, as
* interrupts are enabled. However, they will be put on front of the
* list, so our list starting at "p" is essentially fixed.
*/
-/* this is the scheduler proper: */
- {
- int c = -1000;
- next = idle_task;
- while (p != &init_task) {
- if (can_schedule(p)) {
- int weight = goodness(p, prev, this_cpu);
- if (weight > c)
- c = weight, next = p;
- }
- p = p->next_run;
- }
-
- /* Do we need to re-calculate counters? */
- if (!c) {
- struct task_struct *p;
- read_lock(&tasklist_lock);
- for_each_task(p)
- p->counter = (p->counter >> 1) + p->priority;
- read_unlock(&tasklist_lock);
- }
+ while (p != &init_task) {
+ if (can_schedule(p)) {
+ int weight = goodness(prev, p, this_cpu);
+ if (weight > c)
+ c = weight, next = p;
}
+ p = p->next_run;
}
+ /* Do we need to re-calculate counters? */
+ if (!c)
+ goto recalculate;
+ /*
+ * from this point on nothing can prevent us from
+ * switching to the next task, save this fact in
+ * sched_data.
+ */
+ sched_data->curr = next;
+#ifdef __SMP__
+ next->has_cpu = 1;
+ next->processor = this_cpu;
+#endif
+ spin_unlock_irq(&runqueue_lock);
+
+ if (prev == next)
+ goto same_process;
+
+#ifdef __SMP__
/*
* maintain the per-process 'average timeslice' value.
* (this has to be recalculated even if we reschedule to
- * the same process) Currently this is only used on SMP:
+ * the same process) Currently this is only used on SMP,
+ * and it's approximate, so we do not have to maintain
+ * it while holding the runqueue spinlock.
*/
-#ifdef __SMP__
{
cycles_t t, this_slice;
@@ -740,10 +803,11 @@ asmlinkage void schedule(void)
sched_data->last_schedule = t;
/*
- * Simple, exponentially fading average calculation:
+ * Exponentially fading average calculation, with
+ * some weight so it doesn't get fooled easily by
+ * smaller irregularities.
*/
- prev->avg_slice = this_slice + prev->avg_slice;
- prev->avg_slice >>= 1;
+ prev->avg_slice = (this_slice*1 + prev->avg_slice*1)/2;
}
/*
@@ -751,29 +815,55 @@ asmlinkage void schedule(void)
* thus we have to lock the previous process from getting
* rescheduled during switch_to().
*/
- prev->has_cpu = 1;
- next->has_cpu = 1;
- next->processor = this_cpu;
- spin_unlock(&scheduler_lock);
#endif /* __SMP__ */
- if (prev != next) {
-#ifdef __SMP__
- sched_data->prev = prev;
-#endif
- kstat.context_swtch++;
- get_mmu_context(next);
- switch_to(prev,next);
- __schedule_tail();
- }
+ kstat.context_swtch++;
+ get_mmu_context(next);
+ switch_to(prev, next, prev);
+ __schedule_tail(prev);
+
+same_process:
reacquire_kernel_lock(current);
return;
+recalculate:
+ {
+ struct task_struct *p;
+ spin_unlock_irq(&runqueue_lock);
+ read_lock(&tasklist_lock);
+ for_each_task(p)
+ p->counter = (p->counter >> 1) + p->priority;
+ read_unlock(&tasklist_lock);
+ spin_lock_irq(&runqueue_lock);
+ goto repeat_schedule;
+ }
+
+still_running:
+ c = prev_goodness(prev, prev, this_cpu);
+ next = prev;
+ goto still_running_back;
+
+handle_bh:
+ do_bottom_half();
+ goto handle_bh_back;
+
+handle_tq_scheduler:
+ run_task_queue(&tq_scheduler);
+ goto tq_scheduler_back;
+
+move_rr_last:
+ if (!prev->counter) {
+ prev->counter = prev->priority;
+ move_last_runqueue(prev);
+ }
+ goto move_rr_back;
+
scheduling_in_interrupt:
printk("Scheduling in interrupt\n");
*(int *)0 = 0;
+ return;
}
rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED;
@@ -788,21 +878,42 @@ rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED;
*/
void __wake_up(struct wait_queue **q, unsigned int mode)
{
- struct wait_queue *next;
+ struct task_struct *p;
+ struct wait_queue *head, *next;
+
+ if (!q)
+ goto out;
+ /*
+ * this is safe to do before the check because it
+ * means no dereference, just pointer operations.
+ */
+ head = WAIT_QUEUE_HEAD(q);
read_lock(&waitqueue_lock);
- if (q && (next = *q)) {
- struct wait_queue *head;
-
- head = WAIT_QUEUE_HEAD(q);
- while (next != head) {
- struct task_struct *p = next->task;
- next = next->next;
- if (p->state & mode)
+ next = *q;
+ if (!next)
+ goto out_unlock;
+
+ while (next != head) {
+ p = next->task;
+ next = next->next;
+ if (p->state & mode) {
+ /*
+ * We can drop the read-lock early if this
+ * is the only/last process.
+ */
+ if (next == head) {
+ read_unlock(&waitqueue_lock);
wake_up_process(p);
+ goto out;
+ }
+ wake_up_process(p);
}
}
+out_unlock:
read_unlock(&waitqueue_lock);
+out:
+ return;
}
/*
@@ -863,30 +974,28 @@ void __up(struct semaphore *sem)
struct task_struct *tsk = current; \
struct wait_queue wait = { tsk, NULL };
-#define DOWN_HEAD(task_state) \
- \
- \
- tsk->state = (task_state); \
- add_wait_queue(&sem->wait, &wait); \
- \
- /* \
- * Ok, we're set up. sem->count is known to be less than zero \
- * so we must wait. \
- * \
- * We can let go the lock for purposes of waiting. \
- * We re-acquire it after awaking so as to protect \
- * all semaphore operations. \
- * \
- * If "up()" is called before we call waking_non_zero() then \
- * we will catch it right away. If it is called later then \
- * we will have to go through a wakeup cycle to catch it. \
- * \
- * Multiple waiters contend for the semaphore lock to see \
- * who gets to gate through and who has to wait some more. \
- */ \
- for (;;) { \
- if (waking_non_zero(sem, tsk)) /* are we waking up? */ \
- break; /* yes, exit loop */
+#define DOWN_HEAD(task_state) \
+ \
+ \
+ tsk->state = (task_state); \
+ add_wait_queue(&sem->wait, &wait); \
+ \
+ /* \
+ * Ok, we're set up. sem->count is known to be less than zero \
+ * so we must wait. \
+ * \
+ * We can let go the lock for purposes of waiting. \
+ * We re-acquire it after awaking so as to protect \
+ * all semaphore operations. \
+ * \
+ * If "up()" is called before we call waking_non_zero() then \
+ * we will catch it right away. If it is called later then \
+ * we will have to go through a wakeup cycle to catch it. \
+ * \
+ * Multiple waiters contend for the semaphore lock to see \
+ * who gets to gate through and who has to wait some more. \
+ */ \
+ for (;;) {
#define DOWN_TAIL(task_state) \
tsk->state = (task_state); \
@@ -898,6 +1007,8 @@ void __down(struct semaphore * sem)
{
DOWN_VAR
DOWN_HEAD(TASK_UNINTERRUPTIBLE)
+ if (waking_non_zero(sem))
+ break;
schedule();
DOWN_TAIL(TASK_UNINTERRUPTIBLE)
}
@@ -907,10 +1018,13 @@ int __down_interruptible(struct semaphore * sem)
DOWN_VAR
int ret = 0;
DOWN_HEAD(TASK_INTERRUPTIBLE)
- if (signal_pending(tsk))
+
+ ret = waking_non_zero_interruptible(sem, tsk);
+ if (ret)
{
- ret = -EINTR; /* interrupted */
- atomic_inc(&sem->count); /* give up on down operation */
+ if (ret == 1)
+ /* ret != 0 only if we get interrupted -arca */
+ ret = 0;
break;
}
schedule();
@@ -918,20 +1032,25 @@ int __down_interruptible(struct semaphore * sem)
return ret;
}
+int __down_trylock(struct semaphore * sem)
+{
+ return waking_non_zero_trylock(sem);
+}
+
#define SLEEP_ON_VAR \
unsigned long flags; \
struct wait_queue wait;
#define SLEEP_ON_HEAD \
wait.task = current; \
- write_lock_irqsave(&waitqueue_lock, flags); \
+ write_lock_irqsave(&waitqueue_lock,flags); \
__add_wait_queue(p, &wait); \
write_unlock(&waitqueue_lock);
#define SLEEP_ON_TAIL \
write_lock_irq(&waitqueue_lock); \
__remove_wait_queue(p, &wait); \
- write_unlock_irqrestore(&waitqueue_lock, flags);
+ write_unlock_irqrestore(&waitqueue_lock,flags);
void interruptible_sleep_on(struct wait_queue **p)
{
@@ -1120,7 +1239,6 @@ static void second_overflow(void)
time_maxerror += time_tolerance >> SHIFT_USEC;
if ( time_maxerror > NTP_PHASE_LIMIT ) {
time_maxerror = NTP_PHASE_LIMIT;
- time_state = TIME_ERROR; /* p. 17, sect. 4.3, (b) */
time_status |= STA_UNSYNC;
}
@@ -1606,7 +1724,6 @@ static int setscheduler(pid_t pid, int policy,
/*
* We play safe to avoid deadlocks.
*/
- spin_lock(&scheduler_lock);
spin_lock_irq(&runqueue_lock);
read_lock(&tasklist_lock);
@@ -1654,7 +1771,6 @@ static int setscheduler(pid_t pid, int policy,
out_unlock:
read_unlock(&tasklist_lock);
spin_unlock_irq(&runqueue_lock);
- spin_unlock(&scheduler_lock);
out_nounlock:
return retval;
@@ -1729,14 +1845,12 @@ out_unlock:
asmlinkage int sys_sched_yield(void)
{
- spin_lock(&scheduler_lock);
spin_lock_irq(&runqueue_lock);
if (current->policy == SCHED_OTHER)
current->policy |= SCHED_YIELD;
current->need_resched = 1;
move_last_runqueue(current);
spin_unlock_irq(&runqueue_lock);
- spin_unlock(&scheduler_lock);
return 0;
}
@@ -1913,11 +2027,22 @@ void show_state(void)
read_unlock(&tasklist_lock);
}
+void __init init_idle(void)
+{
+ cycles_t t;
+ struct schedule_data * sched_data;
+ sched_data = &aligned_data[smp_processor_id()].schedule_data;
+
+ t = get_cycles();
+ sched_data->curr = current;
+ sched_data->last_schedule = t;
+}
+
void __init sched_init(void)
{
/*
- * We have to do a little magic to get the first
- * process right in SMP mode.
+ * We have to do a little magic to get the first
+ * process right in SMP mode.
*/
int cpu=hard_smp_processor_id();
int nr = NR_TASKS;
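The core of the sched.c rework is the relocated and extended goodness() calculation plus the preemption_goodness() test that reschedule_idle_slow() now uses to decide whether a freshly woken task should kick a running one off some CPU. A simplified, self-contained illustration of those two rules and of the selection loop in schedule() (userland C; field names shortened, and the PROC_CHANGE_PENALTY value is assumed here, it is architecture-specific in the real kernel):

#include <stddef.h>

#define PROC_CHANGE_PENALTY     15      /* assumed; arch-dependent in reality */

struct task {
        int realtime;           /* stands in for policy != SCHED_OTHER */
        int rt_priority;
        int counter;            /* clock ticks left in this timeslice */
        int priority;           /* static, nice-derived priority */
        int processor;          /* CPU it last ran on */
        void *mm;               /* address space, for the lazy-MM bonus */
};

/*
 * 0: timeslice expired, >0: the larger the better,
 * >=1000: real-time, always beats SCHED_OTHER tasks.
 */
static int goodness(const struct task *prev, const struct task *p, int this_cpu)
{
        int weight;

        if (p->realtime)
                return 1000 + p->rt_priority;

        weight = p->counter;
        if (!weight)
                return 0;
        if (p->processor == this_cpu)
                weight += PROC_CHANGE_PENALTY;  /* cache/TLB affinity bonus */
        if (p->mm == prev->mm)
                weight += 1;                    /* cheap mm switch */
        return weight + p->priority;
}

/* positive: replacing `prev` on that CPU with `p` is worth an IPI */
int preemption_goodness(const struct task *prev, const struct task *p, int cpu)
{
        return goodness(prev, p, cpu) - goodness(prev, prev, cpu);
}

/* pick the runnable task with the highest goodness; NULL means every
 * timeslice is used up, the "recalculate counters" case in schedule() */
struct task *pick_next(const struct task *prev, struct task *rq, size_t n,
                       int this_cpu)
{
        struct task *next = NULL;
        size_t i;
        int c = 0;

        for (i = 0; i < n; i++) {
                int w = goodness(prev, &rq[i], this_cpu);
                if (w > c) {
                        c = w;
                        next = &rq[i];
                }
        }
        return next;
}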
diff --git a/kernel/signal.c b/kernel/signal.c
index c9ea86038..56cb317d9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -265,7 +265,7 @@ printk("SIG queue (%s:%d): %d ", t->comm, t->pid, sig);
&& ((sig != SIGCONT) || (current->session != t->session))
&& (current->euid ^ t->suid) && (current->euid ^ t->uid)
&& (current->uid ^ t->suid) && (current->uid ^ t->uid)
- && !capable(CAP_SYS_ADMIN))
+ && !capable(CAP_KILL))
goto out_nolock;
/* The null signal is a permissions and process existance probe.
@@ -363,8 +363,27 @@ printk("SIG queue (%s:%d): %d ", t->comm, t->pid, sig);
}
sigaddset(&t->signal, sig);
- if (!sigismember(&t->blocked, sig))
+ if (!sigismember(&t->blocked, sig)) {
t->sigpending = 1;
+#ifdef __SMP__
+ /*
+ * If the task is running on a different CPU
+ * force a reschedule on the other CPU - note that
+ * the code below is a tad loose and might occasionally
+ * kick the wrong CPU if we catch the process in the
+ * process of changing - but no harm is done by that
+ * other than doing an extra (lightweight) IPI interrupt.
+ *
+ * note that we rely on the previous spin_lock to
+ * lock interrupts for us! No need to set need_resched
+ * since signal event passing goes through ->blocked.
+ */
+ spin_lock(&runqueue_lock);
+ if (t->has_cpu && t->processor != smp_processor_id())
+ smp_send_reschedule(t->processor);
+ spin_unlock(&runqueue_lock);
+#endif /* __SMP__ */
+ }
out:
spin_unlock_irqrestore(&t->sigmask_lock, flags);
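The permission hunk above only swaps the overriding capability (CAP_KILL for CAP_SYS_ADMIN); the surrounding identity test keeps its XOR idiom, where (a ^ b) is non-zero exactly when a != b, so the chained terms read "the sender's uid and euid match neither the target's uid nor its suid". A tiny standalone check of that equivalence, illustration only:

#include <assert.h>

int main(void)
{
        unsigned int a = 1000, b = 1000, c = 0;

        assert(!(a ^ b) == (a == b));   /* equal ids: XOR is zero */
        assert(!!(a ^ c) == (a != c));  /* different ids: XOR is non-zero */
        return 0;
}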
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 1b364a6a1..d184c944e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -62,6 +62,7 @@ asmlinkage void do_bottom_half(void)
if (hardirq_trylock(cpu)) {
__sti();
run_bottom_halves();
+ __cli();
hardirq_endlock(cpu);
}
softirq_endlock(cpu);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 44a62a9e9..ed9824136 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -557,14 +557,19 @@ static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root
continue;
}
unregister_proc_table(table->child, de);
+
+ /* Don't unregister directories which still have entries.. */
+ if (de->subdir)
+ continue;
}
- /* Don't unregister proc directories which still have
- entries... */
- if (!((de->mode & S_IFDIR) && de->subdir)) {
- proc_unregister(root, de->low_ino);
- table->de = NULL;
- kfree(de);
- }
+
+ /* Don't unregister proc entries that are still being used.. */
+ if (de->count)
+ continue;
+
+ proc_unregister(root, de->low_ino);
+ table->de = NULL;
+ kfree(de);
}
}
diff --git a/kernel/time.c b/kernel/time.c
index b1347c32f..9fe12559c 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -96,7 +96,6 @@ asmlinkage int sys_stime(int * tptr)
xtime.tv_usec = 0;
time_adjust = 0; /* stop active adjtime() */
time_status |= STA_UNSYNC;
- time_state = TIME_ERROR; /* p. 24, (a) */
time_maxerror = NTP_PHASE_LIMIT;
time_esterror = NTP_PHASE_LIMIT;
sti();
@@ -221,7 +220,7 @@ void (*hardpps_ptr)(struct timeval *) = (void (*)(struct timeval *))0;
int do_adjtimex(struct timex *txc)
{
long ltemp, mtemp, save_adjust;
- int error = 0;
+ int result = time_state; /* mostly `TIME_OK' */
/* In order to modify anything, you gotta be super-user! */
if (txc->modes && !capable(CAP_SYS_TIME))
@@ -250,16 +249,13 @@ int do_adjtimex(struct timex *txc)
/* If there are input parameters, then process them */
if (txc->modes)
{
- if (time_state == TIME_ERROR)
- time_state = TIME_OK; /* reset error -- why? */
-
if (txc->modes & ADJ_STATUS) /* only set allowed bits */
time_status = (txc->status & ~STA_RONLY) |
(time_status & STA_RONLY);
if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */
if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
- error = -EINVAL;
+ result = -EINVAL;
goto leave;
}
time_freq = txc->freq - pps_freq;
@@ -267,7 +263,7 @@ int do_adjtimex(struct timex *txc)
if (txc->modes & ADJ_MAXERROR) {
if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
- error = -EINVAL;
+ result = -EINVAL;
goto leave;
}
time_maxerror = txc->maxerror;
@@ -275,7 +271,7 @@ int do_adjtimex(struct timex *txc)
if (txc->modes & ADJ_ESTERROR) {
if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
- error = -EINVAL;
+ result = -EINVAL;
goto leave;
}
time_esterror = txc->esterror;
@@ -283,7 +279,7 @@ int do_adjtimex(struct timex *txc)
if (txc->modes & ADJ_TIMECONST) { /* p. 24 */
if (txc->constant < 0) { /* NTP v4 uses values > 6 */
- error = -EINVAL;
+ result = -EINVAL;
goto leave;
}
time_constant = txc->constant;
@@ -329,7 +325,7 @@ int do_adjtimex(struct timex *txc)
else
time_freq += ltemp >> SHIFT_KH;
} else /* calibration interval too short (p. 12) */
- time_state = TIME_ERROR;
+ result = TIME_ERROR;
} else { /* PLL mode */
if (mtemp < MAXSEC) {
ltemp *= mtemp;
@@ -342,7 +338,7 @@ int do_adjtimex(struct timex *txc)
time_constant +
SHIFT_KF - SHIFT_USEC);
} else /* calibration interval too long (p. 12) */
- time_state = TIME_ERROR;
+ result = TIME_ERROR;
}
if (time_freq > time_tolerance)
time_freq = time_tolerance;
@@ -354,7 +350,7 @@ int do_adjtimex(struct timex *txc)
/* if the quartz is off by more than 10% something is
VERY wrong ! */
if (txc->tick < 900000/HZ || txc->tick > 1100000/HZ) {
- error = -EINVAL;
+ result = -EINVAL;
goto leave;
}
tick = txc->tick;
@@ -370,7 +366,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
|| ((time_status & STA_PPSFREQ) != 0
&& (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0))
/* p. 24, (d) */
- time_state = TIME_ERROR;
+ result = TIME_ERROR;
if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
txc->offset = save_adjust;
@@ -399,7 +395,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
txc->stbcnt = pps_stbcnt;
sti();
- return(error < 0 ? error : time_state);
+ return(result);
}
asmlinkage int sys_adjtimex(struct timex *txc_p)
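The time.c hunks make do_adjtimex() report its outcome through a local result instead of an error variable plus writes to the global time_state: bad parameters still return a negative errno, while soft conditions such as a too-short or too-long calibration interval come back as TIME_ERROR without being latched into the clock state. A minimal sketch of that return contract, with hypothetical helpers (validate_timex(), calibration_interval_bad()) standing in for the real checks:

#include <linux/timex.h>
#include <linux/errno.h>

extern int time_state;                          /* TIME_OK unless unsynced */

/* hypothetical helpers, for illustration only */
extern int validate_timex(struct timex *txc);
extern int calibration_interval_bad(struct timex *txc);

int adjtimex_sketch(struct timex *txc)
{
        int result = time_state;                /* mostly TIME_OK */

        if (!validate_timex(txc))
                return -EINVAL;                 /* hard error: negative errno */
        if (calibration_interval_bad(txc))
                result = TIME_ERROR;            /* reported, not stored back
                                                   into time_state */
        return result;                          /* <0 errno, else a TIME_* code */
}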