summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/acct.c8
-rw-r--r--kernel/capability.c2
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/ksyms.c31
-rw-r--r--kernel/module.c42
-rw-r--r--kernel/resource.c41
-rw-r--r--kernel/sched.c897
-rw-r--r--kernel/sysctl.c19
-rw-r--r--kernel/timer.c791
10 files changed, 979 insertions, 855 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 05357e348..00d0dfa69 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -13,7 +13,7 @@
O_TARGET := kernel.o
O_OBJS = sched.o dma.o fork.o exec_domain.o panic.o printk.o sys.o \
module.o exit.o itimer.o info.o time.o softirq.o resource.o \
- sysctl.o acct.o capability.o ptrace.o
+ sysctl.o acct.o capability.o ptrace.o timer.o
OX_OBJS += signal.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 7e64105a8..fdadf7a9f 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -150,12 +150,12 @@ asmlinkage long sys_acct(const char *name)
{
struct file *file = NULL, *old_acct = NULL;
char *tmp;
- int error = -EPERM;
+ int error;
- lock_kernel();
if (!capable(CAP_SYS_PACCT))
- goto out;
+ return -EPERM;
+ lock_kernel();
if (name) {
tmp = getname(name);
error = PTR_ERR(tmp);
@@ -257,8 +257,6 @@ static comp_t encode_comp_t(unsigned long value)
* into the accounting file. This function should only be called from
* do_exit().
*/
-#define KSTK_EIP(stack) (((unsigned long *)(stack))[1019])
-#define KSTK_ESP(stack) (((unsigned long *)(stack))[1022])
/*
* do_acct_process does all actual work.
diff --git a/kernel/capability.c b/kernel/capability.c
index 2dbfe83f7..7c5f6df21 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -9,7 +9,7 @@
#include <asm/uaccess.h>
/* Note: never hold tasklist_lock while spinning for this one */
-spinlock_t task_capability_lock;
+spinlock_t task_capability_lock = SPIN_LOCK_UNLOCKED;
/*
* For sys_getproccap() and sys_setproccap(), any of the three
diff --git a/kernel/fork.c b/kernel/fork.c
index a90a6bc47..de0b59bac 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -19,6 +19,7 @@
#include <linux/vmalloc.h>
#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
diff --git a/kernel/ksyms.c b/kernel/ksyms.c
index 555e735d0..600395f49 100644
--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -39,6 +39,7 @@
#include <linux/iobuf.h>
#include <linux/console.h>
#include <linux/poll.h>
+#include <linux/mmzone.h>
#include <linux/mm.h>
#include <linux/capability.h>
@@ -83,7 +84,7 @@ EXPORT_SYMBOL(get_option);
EXPORT_SYMBOL(get_options);
/* process memory management */
-EXPORT_SYMBOL(do_mmap);
+EXPORT_SYMBOL(do_mmap_pgoff);
EXPORT_SYMBOL(do_munmap);
EXPORT_SYMBOL(do_brk);
EXPORT_SYMBOL(exit_mm);
@@ -92,9 +93,12 @@ EXPORT_SYMBOL(exit_fs);
EXPORT_SYMBOL(exit_sighand);
/* internal kernel memory management */
-EXPORT_SYMBOL(__get_free_pages);
-EXPORT_SYMBOL(free_pages);
-EXPORT_SYMBOL(__free_page);
+EXPORT_SYMBOL(__alloc_pages);
+EXPORT_SYMBOL(alloc_pages_node);
+EXPORT_SYMBOL(__free_pages_ok);
+#ifndef CONFIG_DISCONTIGMEM
+EXPORT_SYMBOL(contig_page_data);
+#endif
EXPORT_SYMBOL(kmem_find_general_cachep);
EXPORT_SYMBOL(kmem_cache_create);
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -114,6 +118,11 @@ EXPORT_SYMBOL(vmtruncate);
EXPORT_SYMBOL(find_vma);
EXPORT_SYMBOL(get_unmapped_area);
EXPORT_SYMBOL(init_mm);
+#ifdef CONFIG_HIGHMEM
+EXPORT_SYMBOL(kmap_high);
+EXPORT_SYMBOL(kunmap_high);
+EXPORT_SYMBOL(highmem_start_page);
+#endif
/* filesystem internal functions */
EXPORT_SYMBOL(in_group_p);
@@ -173,12 +182,13 @@ EXPORT_SYMBOL(__brelse);
EXPORT_SYMBOL(__bforget);
EXPORT_SYMBOL(ll_rw_block);
EXPORT_SYMBOL(__wait_on_buffer);
+EXPORT_SYMBOL(___wait_on_page);
EXPORT_SYMBOL(add_blkdev_randomness);
EXPORT_SYMBOL(block_read_full_page);
EXPORT_SYMBOL(block_write_full_page);
EXPORT_SYMBOL(block_write_partial_page);
EXPORT_SYMBOL(block_write_cont_page);
-EXPORT_SYMBOL(block_flushpage);
+EXPORT_SYMBOL(block_write_zero_range);
EXPORT_SYMBOL(generic_file_read);
EXPORT_SYMBOL(do_generic_file_read);
EXPORT_SYMBOL(generic_file_write);
@@ -193,7 +203,6 @@ EXPORT_SYMBOL(posix_block_lock);
EXPORT_SYMBOL(posix_unblock_lock);
EXPORT_SYMBOL(locks_mandatory_area);
EXPORT_SYMBOL(dput);
-EXPORT_SYMBOL(put_cached_page);
EXPORT_SYMBOL(is_root_busy);
EXPORT_SYMBOL(prune_dcache);
EXPORT_SYMBOL(shrink_dcache_sb);
@@ -206,10 +215,16 @@ EXPORT_SYMBOL(vfs_unlink);
EXPORT_SYMBOL(vfs_rename);
EXPORT_SYMBOL(__pollwait);
EXPORT_SYMBOL(ROOT_DEV);
-EXPORT_SYMBOL(add_to_page_cache_unique);
EXPORT_SYMBOL(__find_get_page);
EXPORT_SYMBOL(__find_lock_page);
-
+EXPORT_SYMBOL(grab_cache_page);
+EXPORT_SYMBOL(read_cache_page);
+EXPORT_SYMBOL(vfs_readlink);
+EXPORT_SYMBOL(vfs_follow_link);
+EXPORT_SYMBOL(page_readlink);
+EXPORT_SYMBOL(page_follow_link);
+EXPORT_SYMBOL(block_symlink);
+
#if !defined(CONFIG_NFSD) && defined(CONFIG_NFSD_MODULE)
EXPORT_SYMBOL(do_nfsservctl);
#endif
diff --git a/kernel/module.c b/kernel/module.c
index 6f4ad977d..fb9d4ef8d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4,7 +4,7 @@
#include <asm/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/smp_lock.h>
-#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
#include <linux/init.h>
/*
@@ -13,6 +13,7 @@
* 0.99.14 version by Jon Tombs <jon@gtex02.us.es>,
* Heavily modified by Bjorn Ekwall <bj0rn@blox.se> May 1994 (C)
* Rewritten by Richard Henderson <rth@tamu.edu> Dec 1996
+ * Add MOD_INITIALIZING Keith Owens <kaos@ocs.com.au> Nov 1999
*
* This source is covered by the GNU GPL, the same as all kernel sources.
*/
@@ -111,11 +112,9 @@ sys_create_module(const char *name_user, size_t size)
long namelen, error;
struct module *mod;
+ if (!capable(CAP_SYS_MODULE))
+ return -EPERM;
lock_kernel();
- if (!capable(CAP_SYS_MODULE)) {
- error = -EPERM;
- goto err0;
- }
if ((namelen = get_mod_name(name_user, &name)) < 0) {
error = namelen;
goto err0;
@@ -162,13 +161,13 @@ sys_init_module(const char *name_user, struct module *mod_user)
{
struct module mod_tmp, *mod;
char *name, *n_name;
- long namelen, n_namelen, i, error = -EPERM;
+ long namelen, n_namelen, i, error;
unsigned long mod_user_size;
struct module_ref *dep;
- lock_kernel();
if (!capable(CAP_SYS_MODULE))
- goto err0;
+ return -EPERM;
+ lock_kernel();
if ((namelen = get_mod_name(name_user, &name)) < 0) {
error = namelen;
goto err0;
@@ -325,16 +324,18 @@ sys_init_module(const char *name_user, struct module *mod_user)
put_mod_name(name);
/* Initialize the module. */
+ mod->flags |= MOD_INITIALIZING;
atomic_set(&mod->uc.usecount,1);
if (mod->init && mod->init() != 0) {
atomic_set(&mod->uc.usecount,0);
+ mod->flags &= ~MOD_INITIALIZING;
error = -EBUSY;
goto err0;
}
atomic_dec(&mod->uc.usecount);
/* And set it running. */
- mod->flags |= MOD_RUNNING;
+ mod->flags = (mod->flags | MOD_RUNNING) & ~MOD_INITIALIZING;
error = 0;
goto err0;
@@ -354,13 +355,13 @@ sys_delete_module(const char *name_user)
{
struct module *mod, *next;
char *name;
- long error = -EPERM;
+ long error;
int something_changed;
- lock_kernel();
if (!capable(CAP_SYS_MODULE))
- goto out;
+ return -EPERM;
+ lock_kernel();
if (name_user) {
if ((error = get_mod_name(name_user, &name)) < 0)
goto out;
@@ -458,7 +459,7 @@ qm_deps(struct module *mod, char *buf, size_t bufsize, size_t *ret)
if (mod == &kernel_module)
return -EINVAL;
- if ((mod->flags & (MOD_RUNNING | MOD_DELETED)) != MOD_RUNNING)
+ if (!MOD_CAN_QUERY(mod))
if (put_user(0, ret))
return -EFAULT;
else
@@ -502,7 +503,7 @@ qm_refs(struct module *mod, char *buf, size_t bufsize, size_t *ret)
if (mod == &kernel_module)
return -EINVAL;
- if ((mod->flags & (MOD_RUNNING | MOD_DELETED)) != MOD_RUNNING)
+ if (!MOD_CAN_QUERY(mod))
if (put_user(0, ret))
return -EFAULT;
else
@@ -546,7 +547,7 @@ qm_symbols(struct module *mod, char *buf, size_t bufsize, size_t *ret)
char *strings;
unsigned long *vals;
- if ((mod->flags & (MOD_RUNNING | MOD_DELETED)) != MOD_RUNNING)
+ if (!MOD_CAN_QUERY(mod))
if (put_user(0, ret))
return -EFAULT;
else
@@ -712,7 +713,7 @@ sys_get_kernel_syms(struct kernel_sym *table)
struct module_symbol *msym;
unsigned int j;
- if ((mod->flags & (MOD_RUNNING|MOD_DELETED)) != MOD_RUNNING)
+ if (!MOD_CAN_QUERY(mod))
continue;
/* magic: write module info as a pseudo symbol */
@@ -861,7 +862,10 @@ int get_module_list(char *p)
safe_copy_cstr(" (autoclean)");
if (!(mod->flags & MOD_USED_ONCE))
safe_copy_cstr(" (unused)");
- } else
+ }
+ else if (mod->flags & MOD_INITIALIZING)
+ safe_copy_cstr(" (initializing)");
+ else
safe_copy_cstr(" (uninitialized)");
if ((ref = mod->refs) != NULL) {
@@ -905,7 +909,7 @@ get_ksyms_list(char *buf, char **start, off_t offset, int length)
unsigned i;
struct module_symbol *sym;
- if (!(mod->flags & MOD_RUNNING) || (mod->flags & MOD_DELETED))
+ if (!MOD_CAN_QUERY(mod))
continue;
for (i = mod->nsyms, sym = mod->syms; i > 0; --i, ++sym) {
@@ -953,7 +957,7 @@ get_module_symbol(char *modname, char *symname)
for (mp = module_list; mp; mp = mp->next) {
if (((modname == NULL) || (strcmp(mp->name, modname) == 0)) &&
- (mp->flags & (MOD_RUNNING | MOD_DELETED)) == MOD_RUNNING &&
+ MOD_CAN_QUERY(mp) &&
(mp->nsyms > 0)) {
for (i = mp->nsyms, sym = mp->syms;
i > 0; --i, ++sym) {
diff --git a/kernel/resource.c b/kernel/resource.c
index e3dd3a16d..bdf9fef31 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -13,9 +13,10 @@
#include <linux/init.h>
#include <linux/malloc.h>
#include <linux/spinlock.h>
+#include <asm/io.h>
-struct resource ioport_resource = { "PCI IO", 0x0000, 0xFFFF, IORESOURCE_IO };
-struct resource iomem_resource = { "PCI mem", 0x00000000, 0xFFFFFFFF, IORESOURCE_MEM };
+struct resource ioport_resource = { "PCI IO", 0x0000, IO_SPACE_LIMIT, IORESOURCE_IO };
+struct resource iomem_resource = { "PCI mem", 0x00000000, 0xffffffff, IORESOURCE_MEM };
static rwlock_t resource_lock = RW_LOCK_UNLOCKED;
@@ -126,30 +127,32 @@ int release_resource(struct resource *old)
static int find_resource(struct resource *root, struct resource *new,
unsigned long size,
unsigned long min, unsigned long max,
- unsigned long align)
+ unsigned long align,
+ void (*alignf)(void *, struct resource *, unsigned long),
+ void *alignf_data)
{
struct resource *this = root->child;
- unsigned long start, end;
- start = root->start;
+ new->start = root->start;
for(;;) {
if (this)
- end = this->start;
+ new->end = this->start;
else
- end = root->end;
- if (start < min)
- start = min;
- if (end > max)
- end = max;
- start = (start + align - 1) & ~(align - 1);
- if (start < end && end - start + 1 >= size) {
- new->start = start;
- new->end = start + size - 1;
+ new->end = root->end;
+ if (new->start < min)
+ new->start = min;
+ if (new->end > max)
+ new->end = max;
+ new->start = (new->start + align - 1) & ~(align - 1);
+ if (alignf)
+ alignf(alignf_data, new, size);
+ if (new->start < new->end && new->end - new->start + 1 >= size) {
+ new->end = new->start + size - 1;
return 0;
}
if (!this)
break;
- start = this->end + 1;
+ new->start = this->end + 1;
this = this->sibling;
}
return -EBUSY;
@@ -161,12 +164,14 @@ static int find_resource(struct resource *root, struct resource *new,
int allocate_resource(struct resource *root, struct resource *new,
unsigned long size,
unsigned long min, unsigned long max,
- unsigned long align)
+ unsigned long align,
+ void (*alignf)(void *, struct resource *, unsigned long),
+ void *alignf_data)
{
int err;
write_lock(&resource_lock);
- err = find_resource(root, new, size, min, max, align);
+ err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
if (err >= 0 && __request_resource(root, new))
err = -EBUSY;
write_unlock(&resource_lock);
diff --git a/kernel/sched.c b/kernel/sched.c
index ac68af2e3..e653fd22c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1,20 +1,15 @@
/*
* linux/kernel/sched.c
*
+ * Kernel scheduler and related syscalls
+ *
* Copyright (C) 1991, 1992 Linus Torvalds
*
* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
* make semaphores SMP safe
- * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
- * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
- * "A Kernel Model for Precision Timekeeping" by Dave Mills
* 1998-11-19 Implemented schedule_timeout() and related stuff
* by Andrea Arcangeli
- * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
- * serialize accesses to xtime/lost_ticks).
- * Copyright (C) 1998 Andrea Arcangeli
* 1998-12-28 Implemented better SMP scheduling by Ingo Molnar
- * 1999-03-10 Improved NTP compatibility by Ulrich Windl
*/
/*
@@ -25,69 +20,27 @@
*/
#include <linux/mm.h>
-#include <linux/kernel_stat.h>
-#include <linux/fdreg.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/smp_lock.h>
#include <linux/init.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
-#include <asm/io.h>
#include <asm/uaccess.h>
-#include <asm/pgtable.h>
#include <asm/mmu_context.h>
-#include <linux/timex.h>
+
+extern void timer_bh(void);
+extern void tqueue_bh(void);
+extern void immediate_bh(void);
/*
- * kernel variables
+ * scheduler variables
*/
unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
-long tick = (1000000 + HZ/2) / HZ; /* timer interrupt period */
-
-/* The current time */
-volatile struct timeval xtime __attribute__ ((aligned (16)));
-
-/* Don't completely fail for HZ > 500. */
-int tickadj = 500/HZ ? : 1; /* microsecs */
-
-DECLARE_TASK_QUEUE(tq_timer);
-DECLARE_TASK_QUEUE(tq_immediate);
-DECLARE_TASK_QUEUE(tq_scheduler);
-
-/*
- * phase-lock loop variables
- */
-/* TIME_ERROR prevents overwriting the CMOS clock */
-int time_state = TIME_OK; /* clock synchronization status */
-int time_status = STA_UNSYNC; /* clock status bits */
-long time_offset = 0; /* time adjustment (us) */
-long time_constant = 2; /* pll time constant */
-long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
-long time_precision = 1; /* clock precision (us) */
-long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
-long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
-long time_phase = 0; /* phase offset (scaled us) */
-long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC; /* frequency offset (scaled ppm) */
-long time_adj = 0; /* tick adjust (scaled 1 / HZ) */
-long time_reftime = 0; /* time at last adjustment (s) */
-
-long time_adjust = 0;
-long time_adjust_step = 0;
-
-unsigned long event = 0;
-
-extern int do_setitimer(int, struct itimerval *, struct itimerval *);
-unsigned int * prof_buffer = NULL;
-unsigned long prof_len = 0;
-unsigned long prof_shift = 0;
-
extern void mem_use(void);
-unsigned long volatile jiffies=0;
-
/*
* Init task must be ok at boot for the ix86 as we will check its signals
* via the SMP irq return path.
@@ -223,49 +176,88 @@ static inline int preemption_goodness(struct task_struct * prev, struct task_str
return goodness(p, cpu, prev->mm) - goodness(prev, cpu, prev->mm);
}
-static void reschedule_idle(struct task_struct * p)
+/*
+ * This is ugly, but reschedule_idle() is very timing-critical.
+ * We enter with the runqueue spinlock held, but we might end
+ * up unlocking it early, so the caller must not unlock the
+ * runqueue, it's always done by reschedule_idle().
+ */
+static inline void reschedule_idle(struct task_struct * p, unsigned long flags)
{
#ifdef __SMP__
int this_cpu = smp_processor_id(), target_cpu;
- struct task_struct *tsk, *target_tsk;
+ struct task_struct *tsk;
int cpu, best_cpu, i;
- unsigned long flags;
-
- spin_lock_irqsave(&runqueue_lock, flags);
/*
* shortcut if the woken up task's last CPU is
* idle now.
*/
best_cpu = p->processor;
- target_tsk = idle_task(best_cpu);
- if (cpu_curr(best_cpu) == target_tsk)
+ tsk = idle_task(best_cpu);
+ if (cpu_curr(best_cpu) == tsk)
goto send_now;
- target_tsk = NULL;
+ /*
+ * The only heuristics - we use the tsk->avg_slice value
+ * to detect 'frequent reschedulers'.
+ *
+ * If both the woken-up process and the preferred CPU is
+ * is a frequent rescheduler, then skip the asynchronous
+ * wakeup, the frequent rescheduler will likely chose this
+ * task during it's next schedule():
+ */
+ if (p->policy == SCHED_OTHER) {
+ tsk = cpu_curr(best_cpu);
+ if (p->avg_slice + tsk->avg_slice < cacheflush_time)
+ goto out_no_target;
+ }
+
+ /*
+ * We know that the preferred CPU has a cache-affine current
+ * process, lets try to find a new idle CPU for the woken-up
+ * process:
+ */
for (i = 0; i < smp_num_cpus; i++) {
cpu = cpu_logical_map(i);
tsk = cpu_curr(cpu);
+ /*
+ * We use the first available idle CPU. This creates
+ * a priority list between idle CPUs, but this is not
+ * a problem.
+ */
if (tsk == idle_task(cpu))
- target_tsk = tsk;
+ goto send_now;
}
- if (target_tsk && p->avg_slice > cacheflush_time)
- goto send_now;
-
+ /*
+ * No CPU is idle, but maybe this process has enough priority
+ * to preempt it's preferred CPU. (this is a shortcut):
+ */
tsk = cpu_curr(best_cpu);
if (preemption_goodness(tsk, p, best_cpu) > 0)
- target_tsk = tsk;
+ goto send_now;
/*
- * found any suitable CPU?
+ * We should get here rarely - or in the high CPU contention
+ * case. No CPU is idle and this process is either lowprio or
+ * the preferred CPU is highprio. Maybe some other CPU can/must
+ * be preempted:
*/
- if (!target_tsk)
- goto out_no_target;
+ for (i = 0; i < smp_num_cpus; i++) {
+ cpu = cpu_logical_map(i);
+ tsk = cpu_curr(cpu);
+ if (preemption_goodness(tsk, p, cpu) > 0)
+ goto send_now;
+ }
+
+out_no_target:
+ spin_unlock_irqrestore(&runqueue_lock, flags);
+ return;
send_now:
- target_cpu = target_tsk->processor;
- target_tsk->need_resched = 1;
+ target_cpu = tsk->processor;
+ tsk->need_resched = 1;
spin_unlock_irqrestore(&runqueue_lock, flags);
/*
* the APIC stuff can go outside of the lock because
@@ -274,9 +266,6 @@ send_now:
if (target_cpu != this_cpu)
smp_send_reschedule(target_cpu);
return;
-out_no_target:
- spin_unlock_irqrestore(&runqueue_lock, flags);
- return;
#else /* UP */
int this_cpu = smp_processor_id();
struct task_struct *tsk;
@@ -320,7 +309,7 @@ static inline void move_first_runqueue(struct task_struct * p)
* "current->state = TASK_RUNNING" to mark yourself runnable
* without the overhead of this.
*/
-void wake_up_process(struct task_struct * p)
+inline void wake_up_process(struct task_struct * p)
{
unsigned long flags;
@@ -332,152 +321,34 @@ void wake_up_process(struct task_struct * p)
if (task_on_runqueue(p))
goto out;
add_to_runqueue(p);
- spin_unlock_irqrestore(&runqueue_lock, flags);
+ reschedule_idle(p, flags); // spin_unlocks runqueue
- reschedule_idle(p);
return;
out:
spin_unlock_irqrestore(&runqueue_lock, flags);
}
-static void process_timeout(unsigned long __data)
-{
- struct task_struct * p = (struct task_struct *) __data;
-
- wake_up_process(p);
-}
-
-/*
- * Event timer code
- */
-#define TVN_BITS 6
-#define TVR_BITS 8
-#define TVN_SIZE (1 << TVN_BITS)
-#define TVR_SIZE (1 << TVR_BITS)
-#define TVN_MASK (TVN_SIZE - 1)
-#define TVR_MASK (TVR_SIZE - 1)
-
-struct timer_vec {
- int index;
- struct timer_list *vec[TVN_SIZE];
-};
-
-struct timer_vec_root {
- int index;
- struct timer_list *vec[TVR_SIZE];
-};
-
-static struct timer_vec tv5 = { 0 };
-static struct timer_vec tv4 = { 0 };
-static struct timer_vec tv3 = { 0 };
-static struct timer_vec tv2 = { 0 };
-static struct timer_vec_root tv1 = { 0 };
-
-static struct timer_vec * const tvecs[] = {
- (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
-};
-
-#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
-
-static unsigned long timer_jiffies = 0;
-
-static inline void insert_timer(struct timer_list *timer,
- struct timer_list **vec, int idx)
+static inline void wake_up_process_synchronous(struct task_struct * p)
{
- if ((timer->next = vec[idx]))
- vec[idx]->prev = timer;
- vec[idx] = timer;
- timer->prev = (struct timer_list *)&vec[idx];
-}
+ unsigned long flags;
-static inline void internal_add_timer(struct timer_list *timer)
-{
/*
- * must be cli-ed when calling this
+ * We want the common case fall through straight, thus the goto.
*/
- unsigned long expires = timer->expires;
- unsigned long idx = expires - timer_jiffies;
-
- if (idx < TVR_SIZE) {
- int i = expires & TVR_MASK;
- insert_timer(timer, tv1.vec, i);
- } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
- int i = (expires >> TVR_BITS) & TVN_MASK;
- insert_timer(timer, tv2.vec, i);
- } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
- int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
- insert_timer(timer, tv3.vec, i);
- } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
- int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
- insert_timer(timer, tv4.vec, i);
- } else if ((signed long) idx < 0) {
- /* can happen if you add a timer with expires == jiffies,
- * or you set a timer to go off in the past
- */
- insert_timer(timer, tv1.vec, tv1.index);
- } else if (idx <= 0xffffffffUL) {
- int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
- insert_timer(timer, tv5.vec, i);
- } else {
- /* Can only get here on architectures with 64-bit jiffies */
- timer->next = timer->prev = timer;
- }
-}
-
-spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED;
-
-void add_timer(struct timer_list *timer)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&timerlist_lock, flags);
- if (timer->prev)
- goto bug;
- internal_add_timer(timer);
+ spin_lock_irqsave(&runqueue_lock, flags);
+ p->state = TASK_RUNNING;
+ if (task_on_runqueue(p))
+ goto out;
+ add_to_runqueue(p);
out:
- spin_unlock_irqrestore(&timerlist_lock, flags);
- return;
-
-bug:
- printk("bug: kernel timer added twice at %p.\n",
- __builtin_return_address(0));
- goto out;
-}
-
-static inline int detach_timer(struct timer_list *timer)
-{
- struct timer_list *prev = timer->prev;
- if (prev) {
- struct timer_list *next = timer->next;
- prev->next = next;
- if (next)
- next->prev = prev;
- return 1;
- }
- return 0;
-}
-
-void mod_timer(struct timer_list *timer, unsigned long expires)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&timerlist_lock, flags);
- timer->expires = expires;
- detach_timer(timer);
- internal_add_timer(timer);
- spin_unlock_irqrestore(&timerlist_lock, flags);
+ spin_unlock_irqrestore(&runqueue_lock, flags);
}
-int del_timer(struct timer_list * timer)
+static void process_timeout(unsigned long __data)
{
- int ret;
- unsigned long flags;
+ struct task_struct * p = (struct task_struct *) __data;
- spin_lock_irqsave(&timerlist_lock, flags);
- ret = detach_timer(timer);
- timer->next = timer->prev = 0;
- spin_unlock_irqrestore(&timerlist_lock, flags);
- return ret;
+ wake_up_process(p);
}
signed long schedule_timeout(signed long timeout)
@@ -541,8 +412,12 @@ static inline void __schedule_tail(struct task_struct *prev)
{
#ifdef __SMP__
if ((prev->state == TASK_RUNNING) &&
- (prev != idle_task(smp_processor_id())))
- reschedule_idle(prev);
+ (prev != idle_task(smp_processor_id()))) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&runqueue_lock, flags);
+ reschedule_idle(prev, flags); // spin_unlocks runqueue
+ }
wmb();
prev->has_cpu = 0;
#endif /* __SMP__ */
@@ -765,7 +640,7 @@ scheduling_in_interrupt:
return;
}
-void __wake_up(wait_queue_head_t *q, unsigned int mode)
+static inline void __wake_up_common(wait_queue_head_t *q, unsigned int mode, const int sync)
{
struct list_head *tmp, *head;
struct task_struct *p;
@@ -801,7 +676,10 @@ void __wake_up(wait_queue_head_t *q, unsigned int mode)
#if WAITQUEUE_DEBUG
curr->__waker = (long)__builtin_return_address(0);
#endif
- wake_up_process(p);
+ if (sync)
+ wake_up_process_synchronous(p);
+ else
+ wake_up_process(p);
if (state & TASK_EXCLUSIVE)
break;
}
@@ -811,6 +689,16 @@ out:
return;
}
+void __wake_up(wait_queue_head_t *q, unsigned int mode)
+{
+ __wake_up_common(q, mode, 0);
+}
+
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode)
+{
+ __wake_up_common(q, mode, 1);
+}
+
#define SLEEP_ON_VAR \
unsigned long flags; \
wait_queue_t wait; \
@@ -876,549 +764,9 @@ long sleep_on_timeout(wait_queue_head_t *q, long timeout)
void scheduling_functions_end_here(void) { }
-static inline void cascade_timers(struct timer_vec *tv)
-{
- /* cascade all the timers from tv up one level */
- struct timer_list *timer;
- timer = tv->vec[tv->index];
- /*
- * We are removing _all_ timers from the list, so we don't have to
- * detach them individually, just clear the list afterwards.
- */
- while (timer) {
- struct timer_list *tmp = timer;
- timer = timer->next;
- internal_add_timer(tmp);
- }
- tv->vec[tv->index] = NULL;
- tv->index = (tv->index + 1) & TVN_MASK;
-}
-
-static inline void run_timer_list(void)
-{
- spin_lock_irq(&timerlist_lock);
- while ((long)(jiffies - timer_jiffies) >= 0) {
- struct timer_list *timer;
- if (!tv1.index) {
- int n = 1;
- do {
- cascade_timers(tvecs[n]);
- } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
- }
- while ((timer = tv1.vec[tv1.index])) {
- void (*fn)(unsigned long) = timer->function;
- unsigned long data = timer->data;
- detach_timer(timer);
- timer->next = timer->prev = NULL;
- spin_unlock_irq(&timerlist_lock);
- fn(data);
- spin_lock_irq(&timerlist_lock);
- }
- ++timer_jiffies;
- tv1.index = (tv1.index + 1) & TVR_MASK;
- }
- spin_unlock_irq(&timerlist_lock);
-}
-
-
-static inline void run_old_timers(void)
-{
- struct timer_struct *tp;
- unsigned long mask;
-
- for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) {
- if (mask > timer_active)
- break;
- if (!(mask & timer_active))
- continue;
- if (time_after(tp->expires, jiffies))
- continue;
- timer_active &= ~mask;
- tp->fn();
- sti();
- }
-}
-
-spinlock_t tqueue_lock;
-
-void tqueue_bh(void)
-{
- run_task_queue(&tq_timer);
-}
-
-void immediate_bh(void)
-{
- run_task_queue(&tq_immediate);
-}
-
-unsigned long timer_active = 0;
-struct timer_struct timer_table[32];
-
-/*
- * Hmm.. Changed this, as the GNU make sources (load.c) seems to
- * imply that avenrun[] is the standard name for this kind of thing.
- * Nothing else seems to be standardized: the fractional size etc
- * all seem to differ on different machines.
- */
-unsigned long avenrun[3] = { 0,0,0 };
-
-/*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
- struct task_struct *p;
- unsigned long nr = 0;
-
- read_lock(&tasklist_lock);
- for_each_task(p) {
- if ((p->state == TASK_RUNNING ||
- (p->state & TASK_UNINTERRUPTIBLE) ||
- (p->state & TASK_SWAPPING)))
- nr += FIXED_1;
- }
- read_unlock(&tasklist_lock);
- return nr;
-}
-
-static inline void calc_load(unsigned long ticks)
-{
- unsigned long active_tasks; /* fixed-point */
- static int count = LOAD_FREQ;
-
- count -= ticks;
- if (count < 0) {
- count += LOAD_FREQ;
- active_tasks = count_active_tasks();
- CALC_LOAD(avenrun[0], EXP_1, active_tasks);
- CALC_LOAD(avenrun[1], EXP_5, active_tasks);
- CALC_LOAD(avenrun[2], EXP_15, active_tasks);
- }
-}
-
-/*
- * this routine handles the overflow of the microsecond field
- *
- * The tricky bits of code to handle the accurate clock support
- * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
- * They were originally developed for SUN and DEC kernels.
- * All the kudos should go to Dave for this stuff.
- *
- */
-static void second_overflow(void)
-{
- long ltemp;
-
- /* Bump the maxerror field */
- time_maxerror += time_tolerance >> SHIFT_USEC;
- if ( time_maxerror > NTP_PHASE_LIMIT ) {
- time_maxerror = NTP_PHASE_LIMIT;
- time_status |= STA_UNSYNC;
- }
-
- /*
- * Leap second processing. If in leap-insert state at
- * the end of the day, the system clock is set back one
- * second; if in leap-delete state, the system clock is
- * set ahead one second. The microtime() routine or
- * external clock driver will insure that reported time
- * is always monotonic. The ugly divides should be
- * replaced.
- */
- switch (time_state) {
-
- case TIME_OK:
- if (time_status & STA_INS)
- time_state = TIME_INS;
- else if (time_status & STA_DEL)
- time_state = TIME_DEL;
- break;
-
- case TIME_INS:
- if (xtime.tv_sec % 86400 == 0) {
- xtime.tv_sec--;
- time_state = TIME_OOP;
- printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
- }
- break;
-
- case TIME_DEL:
- if ((xtime.tv_sec + 1) % 86400 == 0) {
- xtime.tv_sec++;
- time_state = TIME_WAIT;
- printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
- }
- break;
-
- case TIME_OOP:
- time_state = TIME_WAIT;
- break;
-
- case TIME_WAIT:
- if (!(time_status & (STA_INS | STA_DEL)))
- time_state = TIME_OK;
- }
-
- /*
- * Compute the phase adjustment for the next second. In
- * PLL mode, the offset is reduced by a fixed factor
- * times the time constant. In FLL mode the offset is
- * used directly. In either mode, the maximum phase
- * adjustment for each second is clamped so as to spread
- * the adjustment over not more than the number of
- * seconds between updates.
- */
- if (time_offset < 0) {
- ltemp = -time_offset;
- if (!(time_status & STA_FLL))
- ltemp >>= SHIFT_KG + time_constant;
- if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
- ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
- time_offset += ltemp;
- time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
- } else {
- ltemp = time_offset;
- if (!(time_status & STA_FLL))
- ltemp >>= SHIFT_KG + time_constant;
- if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
- ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
- time_offset -= ltemp;
- time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
- }
-
- /*
- * Compute the frequency estimate and additional phase
- * adjustment due to frequency error for the next
- * second. When the PPS signal is engaged, gnaw on the
- * watchdog counter and update the frequency computed by
- * the pll and the PPS signal.
- */
- pps_valid++;
- if (pps_valid == PPS_VALID) { /* PPS signal lost */
- pps_jitter = MAXTIME;
- pps_stabil = MAXFREQ;
- time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
- STA_PPSWANDER | STA_PPSERROR);
- }
- ltemp = time_freq + pps_freq;
- if (ltemp < 0)
- time_adj -= -ltemp >>
- (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
- else
- time_adj += ltemp >>
- (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
-
-#if HZ == 100
- /* Compensate for (HZ==100) != (1 << SHIFT_HZ).
- * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
- */
- if (time_adj < 0)
- time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
- else
- time_adj += (time_adj >> 2) + (time_adj >> 5);
-#endif
-}
-
-/* in the NTP reference this is called "hardclock()" */
-static void update_wall_time_one_tick(void)
-{
- if ( (time_adjust_step = time_adjust) != 0 ) {
- /* We are doing an adjtime thing.
- *
- * Prepare time_adjust_step to be within bounds.
- * Note that a positive time_adjust means we want the clock
- * to run faster.
- *
- * Limit the amount of the step to be in the range
- * -tickadj .. +tickadj
- */
- if (time_adjust > tickadj)
- time_adjust_step = tickadj;
- else if (time_adjust < -tickadj)
- time_adjust_step = -tickadj;
-
- /* Reduce by this step the amount of time left */
- time_adjust -= time_adjust_step;
- }
- xtime.tv_usec += tick + time_adjust_step;
- /*
- * Advance the phase, once it gets to one microsecond, then
- * advance the tick more.
- */
- time_phase += time_adj;
- if (time_phase <= -FINEUSEC) {
- long ltemp = -time_phase >> SHIFT_SCALE;
- time_phase += ltemp << SHIFT_SCALE;
- xtime.tv_usec -= ltemp;
- }
- else if (time_phase >= FINEUSEC) {
- long ltemp = time_phase >> SHIFT_SCALE;
- time_phase -= ltemp << SHIFT_SCALE;
- xtime.tv_usec += ltemp;
- }
-}
-
-/*
- * Using a loop looks inefficient, but "ticks" is
- * usually just one (we shouldn't be losing ticks,
- * we're doing this this way mainly for interrupt
- * latency reasons, not because we think we'll
- * have lots of lost timer ticks
- */
-static void update_wall_time(unsigned long ticks)
-{
- do {
- ticks--;
- update_wall_time_one_tick();
- } while (ticks);
-
- if (xtime.tv_usec >= 1000000) {
- xtime.tv_usec -= 1000000;
- xtime.tv_sec++;
- second_overflow();
- }
-}
-
-static inline void do_process_times(struct task_struct *p,
- unsigned long user, unsigned long system)
-{
- unsigned long psecs;
-
- psecs = (p->times.tms_utime += user);
- psecs += (p->times.tms_stime += system);
- if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) {
- /* Send SIGXCPU every second.. */
- if (!(psecs % HZ))
- send_sig(SIGXCPU, p, 1);
- /* and SIGKILL when we go over max.. */
- if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max)
- send_sig(SIGKILL, p, 1);
- }
-}
-
-static inline void do_it_virt(struct task_struct * p, unsigned long ticks)
-{
- unsigned long it_virt = p->it_virt_value;
-
- if (it_virt) {
- if (it_virt <= ticks) {
- it_virt = ticks + p->it_virt_incr;
- send_sig(SIGVTALRM, p, 1);
- }
- p->it_virt_value = it_virt - ticks;
- }
-}
-
-static inline void do_it_prof(struct task_struct * p, unsigned long ticks)
-{
- unsigned long it_prof = p->it_prof_value;
-
- if (it_prof) {
- if (it_prof <= ticks) {
- it_prof = ticks + p->it_prof_incr;
- send_sig(SIGPROF, p, 1);
- }
- p->it_prof_value = it_prof - ticks;
- }
-}
-
-void update_one_process(struct task_struct *p,
- unsigned long ticks, unsigned long user, unsigned long system, int cpu)
-{
- p->per_cpu_utime[cpu] += user;
- p->per_cpu_stime[cpu] += system;
- do_process_times(p, user, system);
- do_it_virt(p, user);
- do_it_prof(p, ticks);
-}
-
-static void update_process_times(unsigned long ticks, unsigned long system)
-{
-/*
- * SMP does this on a per-CPU basis elsewhere
- */
-#ifndef __SMP__
- struct task_struct * p = current;
- unsigned long user = ticks - system;
- if (p->pid) {
- p->counter -= ticks;
- if (p->counter <= 0) {
- p->counter = 0;
- p->need_resched = 1;
- }
- if (p->priority < DEF_PRIORITY)
- kstat.cpu_nice += user;
- else
- kstat.cpu_user += user;
- kstat.cpu_system += system;
- }
- update_one_process(p, ticks, user, system, 0);
-#endif
-}
-
-volatile unsigned long lost_ticks = 0;
-static unsigned long lost_ticks_system = 0;
-
-/*
- * This spinlock protect us from races in SMP while playing with xtime. -arca
- */
-rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
-
-static inline void update_times(void)
-{
- unsigned long ticks;
-
- /*
- * update_times() is run from the raw timer_bh handler so we
- * just know that the irqs are locally enabled and so we don't
- * need to save/restore the flags of the local CPU here. -arca
- */
- write_lock_irq(&xtime_lock);
-
- ticks = lost_ticks;
- lost_ticks = 0;
-
- if (ticks) {
- unsigned long system;
- system = xchg(&lost_ticks_system, 0);
-
- calc_load(ticks);
- update_wall_time(ticks);
- write_unlock_irq(&xtime_lock);
-
- update_process_times(ticks, system);
-
- } else
- write_unlock_irq(&xtime_lock);
-}
-
-static void timer_bh(void)
-{
- update_times();
- run_old_timers();
- run_timer_list();
-}
-
-void do_timer(struct pt_regs * regs)
-{
- (*(unsigned long *)&jiffies)++;
- lost_ticks++;
- mark_bh(TIMER_BH);
- if (!user_mode(regs))
- lost_ticks_system++;
- if (tq_timer)
- mark_bh(TQUEUE_BH);
-}
-
-#if !defined(__alpha__) && !defined(__ia64__)
-
-/*
- * For backwards compatibility? This can be done in libc so Alpha
- * and all newer ports shouldn't need it.
- */
-asmlinkage unsigned long sys_alarm(unsigned int seconds)
-{
- struct itimerval it_new, it_old;
- unsigned int oldalarm;
-
- it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
- it_new.it_value.tv_sec = seconds;
- it_new.it_value.tv_usec = 0;
- do_setitimer(ITIMER_REAL, &it_new, &it_old);
- oldalarm = it_old.it_value.tv_sec;
- /* ehhh.. We can't return 0 if we have an alarm pending.. */
- /* And we'd better return too much than too little anyway */
- if (it_old.it_value.tv_usec)
- oldalarm++;
- return oldalarm;
-}
-
-#endif
-
#ifndef __alpha__
/*
- * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
- * should be moved into arch/i386 instead?
- */
-
-asmlinkage long sys_getpid(void)
-{
- /* This is SMP safe - current->pid doesn't change */
- return current->pid;
-}
-
-/*
- * This is not strictly SMP safe: p_opptr could change
- * from under us. However, rather than getting any lock
- * we can use an optimistic algorithm: get the parent
- * pid, and go back and check that the parent is still
- * the same. If it has changed (which is extremely unlikely
- * indeed), we just try again..
- *
- * NOTE! This depends on the fact that even if we _do_
- * get an old value of "parent", we can happily dereference
- * the pointer: we just can't necessarily trust the result
- * until we know that the parent pointer is valid.
- *
- * The "mb()" macro is a memory barrier - a synchronizing
- * event. It also makes sure that gcc doesn't optimize
- * away the necessary memory references.. The barrier doesn't
- * have to have all that strong semantics: on x86 we don't
- * really require a synchronizing instruction, for example.
- * The barrier is more important for code generation than
- * for any real memory ordering semantics (even if there is
- * a small window for a race, using the old pointer is
- * harmless for a while).
- */
-asmlinkage long sys_getppid(void)
-{
- int pid;
- struct task_struct * me = current;
- struct task_struct * parent;
-
- parent = me->p_opptr;
- for (;;) {
- pid = parent->pid;
-#if __SMP__
-{
- struct task_struct *old = parent;
- mb();
- parent = me->p_opptr;
- if (old != parent)
- continue;
-}
-#endif
- break;
- }
- return pid;
-}
-
-asmlinkage long sys_getuid(void)
-{
- /* Only we change this so SMP safe */
- return current->uid;
-}
-
-asmlinkage long sys_geteuid(void)
-{
- /* Only we change this so SMP safe */
- return current->euid;
-}
-
-asmlinkage long sys_getgid(void)
-{
- /* Only we change this so SMP safe */
- return current->gid;
-}
-
-asmlinkage long sys_getegid(void)
-{
- /* Only we change this so SMP safe */
- return current->egid;
-}
-
-/*
* This has been replaced by sys_setpriority. Maybe it should be
* moved into the arch dependent tree for those ports that require
* it for backward compatibility?
@@ -1674,47 +1022,6 @@ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
return 0;
}
-asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
-{
- struct timespec t;
- unsigned long expire;
-
- if(copy_from_user(&t, rqtp, sizeof(struct timespec)))
- return -EFAULT;
-
- if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0)
- return -EINVAL;
-
-
- if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
- current->policy != SCHED_OTHER)
- {
- /*
- * Short delay requests up to 2 ms will be handled with
- * high precision by a busy wait for all real-time processes.
- *
- * Its important on SMP not to do this holding locks.
- */
- udelay((t.tv_nsec + 999) / 1000);
- return 0;
- }
-
- expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
-
- current->state = TASK_INTERRUPTIBLE;
- expire = schedule_timeout(expire);
-
- if (expire) {
- if (rmtp) {
- jiffies_to_timespec(expire, &t);
- if (copy_to_user(rmtp, &t, sizeof(struct timespec)))
- return -EFAULT;
- }
- return -EINTR;
- }
- return 0;
-}
-
static void show_task(struct task_struct * p)
{
unsigned long free = 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f18ad13c1..fa69edbac 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -49,7 +49,9 @@ extern char modprobe_path[];
extern int sg_big_buff;
#endif
#ifdef CONFIG_SYSVIPC
-extern size_t shm_prm[];
+extern size_t shm_ctlmax;
+extern int shm_ctlall;
+extern int shm_ctlmni;
extern int msg_ctlmax;
extern int msg_ctlmnb;
extern int msg_ctlmni;
@@ -133,10 +135,8 @@ struct inode_operations proc_sys_inode_operations =
NULL, /* get_block */
NULL, /* readpage */
NULL, /* writepage */
- NULL, /* flushpage */
NULL, /* truncate */
proc_sys_permission, /* permission */
- NULL, /* smap */
NULL /* revalidate */
};
@@ -217,8 +217,12 @@ static ctl_table kern_table[] = {
{KERN_RTSIGMAX, "rtsig-max", &max_queued_signals, sizeof(int),
0644, NULL, &proc_dointvec},
#ifdef CONFIG_SYSVIPC
- {KERN_SHMMAX, "shmmax", &shm_prm, 3*sizeof (size_t),
+ {KERN_SHMMAX, "shmmax", &shm_ctlmax, sizeof (size_t),
0644, NULL, &proc_doulongvec_minmax},
+ {KERN_SHMALL, "shmall", &shm_ctlall, sizeof (int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_SHMMNI, "shmmni", &shm_ctlmni, sizeof (int),
+ 0644, NULL, &proc_dointvec},
{KERN_MSGMAX, "msgmax", &msg_ctlmax, sizeof (int),
0644, NULL, &proc_dointvec},
{KERN_MSGMNI, "msgmni", &msg_ctlmni, sizeof (int),
@@ -351,7 +355,7 @@ extern asmlinkage long sys_sysctl(struct __sysctl_args *args)
}
/* Like in_group_p, but testing against egid, not fsgid */
-static int in_egroup_p(gid_t grp)
+int in_egroup_p(gid_t grp)
{
if (grp != current->egid) {
int i = current->ngroups;
@@ -584,13 +588,12 @@ static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root
continue;
}
- /* Don't unregoster proc entries that are still being used.. */
+ /* Don't unregister proc entries that are still being used.. */
if (de->count)
continue;
- proc_unregister(root, de->low_ino);
table->de = NULL;
- kfree(de);
+ remove_proc_entry(table->procname, root);
}
}
diff --git a/kernel/timer.c b/kernel/timer.c
new file mode 100644
index 000000000..fccf7faa7
--- /dev/null
+++ b/kernel/timer.c
@@ -0,0 +1,791 @@
+/*
+ * linux/kernel/timer.c
+ *
+ * Kernel internal timers, kernel timekeeping, basic process system calls
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
+ *
+ * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
+ * "A Kernel Model for Precision Timekeeping" by Dave Mills
+ * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
+ * serialize accesses to xtime/lost_ticks).
+ * Copyright (C) 1998 Andrea Arcangeli
+ * 1999-03-10 Improved NTP compatibility by Ulrich Windl
+ */
+
+#include <linux/mm.h>
+#include <linux/timex.h>
+#include <linux/delay.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * Timekeeping variables
+ */
+
+long tick = (1000000 + HZ/2) / HZ; /* timer interrupt period */
+
+/* The current time */
+volatile struct timeval xtime __attribute__ ((aligned (16)));
+
+/* Don't completely fail for HZ > 500. */
+int tickadj = 500/HZ ? : 1; /* microsecs */
+
+DECLARE_TASK_QUEUE(tq_timer);
+DECLARE_TASK_QUEUE(tq_immediate);
+DECLARE_TASK_QUEUE(tq_scheduler);
+
+/*
+ * phase-lock loop variables
+ */
+/* TIME_ERROR prevents overwriting the CMOS clock */
+int time_state = TIME_OK; /* clock synchronization status */
+int time_status = STA_UNSYNC; /* clock status bits */
+long time_offset = 0; /* time adjustment (us) */
+long time_constant = 2; /* pll time constant */
+long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
+long time_precision = 1; /* clock precision (us) */
+long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
+long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
+long time_phase = 0; /* phase offset (scaled us) */
+long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC;
+ /* frequency offset (scaled ppm)*/
+long time_adj = 0; /* tick adjust (scaled 1 / HZ) */
+long time_reftime = 0; /* time at last adjustment (s) */
+
+long time_adjust = 0;
+long time_adjust_step = 0;
+
+unsigned long event = 0;
+
+extern int do_setitimer(int, struct itimerval *, struct itimerval *);
+
+unsigned long volatile jiffies = 0;
+
+unsigned int * prof_buffer = NULL;
+unsigned long prof_len = 0;
+unsigned long prof_shift = 0;
+
+/*
+ * Event timer code
+ */
+#define TVN_BITS 6
+#define TVR_BITS 8
+#define TVN_SIZE (1 << TVN_BITS)
+#define TVR_SIZE (1 << TVR_BITS)
+#define TVN_MASK (TVN_SIZE - 1)
+#define TVR_MASK (TVR_SIZE - 1)
+
+struct timer_vec {
+ int index;
+ struct timer_list *vec[TVN_SIZE];
+};
+
+struct timer_vec_root {
+ int index;
+ struct timer_list *vec[TVR_SIZE];
+};
+
+static struct timer_vec tv5 = { 0 };
+static struct timer_vec tv4 = { 0 };
+static struct timer_vec tv3 = { 0 };
+static struct timer_vec tv2 = { 0 };
+static struct timer_vec_root tv1 = { 0 };
+
+static struct timer_vec * const tvecs[] = {
+ (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
+};
+
+#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
+
+static unsigned long timer_jiffies = 0;
+
+static inline void insert_timer(struct timer_list *timer,
+ struct timer_list **vec, int idx)
+{
+ if ((timer->next = vec[idx]))
+ vec[idx]->prev = timer;
+ vec[idx] = timer;
+ timer->prev = (struct timer_list *)&vec[idx];
+}
+
+static inline void internal_add_timer(struct timer_list *timer)
+{
+ /*
+ * must be cli-ed when calling this
+ */
+ unsigned long expires = timer->expires;
+ unsigned long idx = expires - timer_jiffies;
+
+ if (idx < TVR_SIZE) {
+ int i = expires & TVR_MASK;
+ insert_timer(timer, tv1.vec, i);
+ } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
+ int i = (expires >> TVR_BITS) & TVN_MASK;
+ insert_timer(timer, tv2.vec, i);
+ } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
+ int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
+ insert_timer(timer, tv3.vec, i);
+ } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
+ int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
+ insert_timer(timer, tv4.vec, i);
+ } else if ((signed long) idx < 0) {
+ /* can happen if you add a timer with expires == jiffies,
+ * or you set a timer to go off in the past
+ */
+ insert_timer(timer, tv1.vec, tv1.index);
+ } else if (idx <= 0xffffffffUL) {
+ int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
+ insert_timer(timer, tv5.vec, i);
+ } else {
+ /* Can only get here on architectures with 64-bit jiffies */
+ timer->next = timer->prev = timer;
+ }
+}
+
+spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED;
+
+void add_timer(struct timer_list *timer)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&timerlist_lock, flags);
+ if (timer->prev)
+ goto bug;
+ internal_add_timer(timer);
+out:
+ spin_unlock_irqrestore(&timerlist_lock, flags);
+ return;
+
+bug:
+ printk("bug: kernel timer added twice at %p.\n",
+ __builtin_return_address(0));
+ goto out;
+}
+
+static inline int detach_timer(struct timer_list *timer)
+{
+ struct timer_list *prev = timer->prev;
+ if (prev) {
+ struct timer_list *next = timer->next;
+ prev->next = next;
+ if (next)
+ next->prev = prev;
+ return 1;
+ }
+ return 0;
+}
+
+void mod_timer(struct timer_list *timer, unsigned long expires)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&timerlist_lock, flags);
+ timer->expires = expires;
+ detach_timer(timer);
+ internal_add_timer(timer);
+ spin_unlock_irqrestore(&timerlist_lock, flags);
+}
+
+int del_timer(struct timer_list * timer)
+{
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&timerlist_lock, flags);
+ ret = detach_timer(timer);
+ timer->next = timer->prev = 0;
+ spin_unlock_irqrestore(&timerlist_lock, flags);
+ return ret;
+}
+
+static inline void cascade_timers(struct timer_vec *tv)
+{
+ /* cascade all the timers from tv up one level */
+ struct timer_list *timer;
+ timer = tv->vec[tv->index];
+ /*
+ * We are removing _all_ timers from the list, so we don't have to
+ * detach them individually, just clear the list afterwards.
+ */
+ while (timer) {
+ struct timer_list *tmp = timer;
+ timer = timer->next;
+ internal_add_timer(tmp);
+ }
+ tv->vec[tv->index] = NULL;
+ tv->index = (tv->index + 1) & TVN_MASK;
+}
+
+static inline void run_timer_list(void)
+{
+ spin_lock_irq(&timerlist_lock);
+ while ((long)(jiffies - timer_jiffies) >= 0) {
+ struct timer_list *timer;
+ if (!tv1.index) {
+ int n = 1;
+ do {
+ cascade_timers(tvecs[n]);
+ } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
+ }
+ while ((timer = tv1.vec[tv1.index])) {
+ void (*fn)(unsigned long) = timer->function;
+ unsigned long data = timer->data;
+ detach_timer(timer);
+ timer->next = timer->prev = NULL;
+ spin_unlock_irq(&timerlist_lock);
+ fn(data);
+ spin_lock_irq(&timerlist_lock);
+ }
+ ++timer_jiffies;
+ tv1.index = (tv1.index + 1) & TVR_MASK;
+ }
+ spin_unlock_irq(&timerlist_lock);
+}
+
+
+static inline void run_old_timers(void)
+{
+ struct timer_struct *tp;
+ unsigned long mask;
+
+ for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) {
+ if (mask > timer_active)
+ break;
+ if (!(mask & timer_active))
+ continue;
+ if (time_after(tp->expires, jiffies))
+ continue;
+ timer_active &= ~mask;
+ tp->fn();
+ sti();
+ }
+}
+
+spinlock_t tqueue_lock = SPIN_LOCK_UNLOCKED;
+
+void tqueue_bh(void)
+{
+ run_task_queue(&tq_timer);
+}
+
+void immediate_bh(void)
+{
+ run_task_queue(&tq_immediate);
+}
+
+unsigned long timer_active = 0;
+struct timer_struct timer_table[32];
+
+/*
+ * this routine handles the overflow of the microsecond field
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ *
+ */
+static void second_overflow(void)
+{
+ long ltemp;
+
+ /* Bump the maxerror field */
+ time_maxerror += time_tolerance >> SHIFT_USEC;
+ if ( time_maxerror > NTP_PHASE_LIMIT ) {
+ time_maxerror = NTP_PHASE_LIMIT;
+ time_status |= STA_UNSYNC;
+ }
+
+ /*
+ * Leap second processing. If in leap-insert state at
+ * the end of the day, the system clock is set back one
+ * second; if in leap-delete state, the system clock is
+ * set ahead one second. The microtime() routine or
+ * external clock driver will insure that reported time
+ * is always monotonic. The ugly divides should be
+ * replaced.
+ */
+ switch (time_state) {
+
+ case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
+ break;
+
+ case TIME_INS:
+ if (xtime.tv_sec % 86400 == 0) {
+ xtime.tv_sec--;
+ time_state = TIME_OOP;
+ printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
+ }
+ break;
+
+ case TIME_DEL:
+ if ((xtime.tv_sec + 1) % 86400 == 0) {
+ xtime.tv_sec++;
+ time_state = TIME_WAIT;
+ printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
+ }
+ break;
+
+ case TIME_OOP:
+ time_state = TIME_WAIT;
+ break;
+
+ case TIME_WAIT:
+ if (!(time_status & (STA_INS | STA_DEL)))
+ time_state = TIME_OK;
+ }
+
+ /*
+ * Compute the phase adjustment for the next second. In
+ * PLL mode, the offset is reduced by a fixed factor
+ * times the time constant. In FLL mode the offset is
+ * used directly. In either mode, the maximum phase
+ * adjustment for each second is clamped so as to spread
+ * the adjustment over not more than the number of
+ * seconds between updates.
+ */
+ if (time_offset < 0) {
+ ltemp = -time_offset;
+ if (!(time_status & STA_FLL))
+ ltemp >>= SHIFT_KG + time_constant;
+ if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+ ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
+ time_offset += ltemp;
+ time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+ } else {
+ ltemp = time_offset;
+ if (!(time_status & STA_FLL))
+ ltemp >>= SHIFT_KG + time_constant;
+ if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+ ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
+ time_offset -= ltemp;
+ time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+ }
+
+ /*
+ * Compute the frequency estimate and additional phase
+ * adjustment due to frequency error for the next
+ * second. When the PPS signal is engaged, gnaw on the
+ * watchdog counter and update the frequency computed by
+ * the pll and the PPS signal.
+ */
+ pps_valid++;
+ if (pps_valid == PPS_VALID) { /* PPS signal lost */
+ pps_jitter = MAXTIME;
+ pps_stabil = MAXFREQ;
+ time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+ STA_PPSWANDER | STA_PPSERROR);
+ }
+ ltemp = time_freq + pps_freq;
+ if (ltemp < 0)
+ time_adj -= -ltemp >>
+ (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+ else
+ time_adj += ltemp >>
+ (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+
+#if HZ == 100
+ /* Compensate for (HZ==100) != (1 << SHIFT_HZ).
+ * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
+ */
+ if (time_adj < 0)
+ time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
+ else
+ time_adj += (time_adj >> 2) + (time_adj >> 5);
+#endif
+}
+
+/* in the NTP reference this is called "hardclock()" */
+static void update_wall_time_one_tick(void)
+{
+ if ( (time_adjust_step = time_adjust) != 0 ) {
+ /* We are doing an adjtime thing.
+ *
+ * Prepare time_adjust_step to be within bounds.
+ * Note that a positive time_adjust means we want the clock
+ * to run faster.
+ *
+ * Limit the amount of the step to be in the range
+ * -tickadj .. +tickadj
+ */
+ if (time_adjust > tickadj)
+ time_adjust_step = tickadj;
+ else if (time_adjust < -tickadj)
+ time_adjust_step = -tickadj;
+
+ /* Reduce by this step the amount of time left */
+ time_adjust -= time_adjust_step;
+ }
+ xtime.tv_usec += tick + time_adjust_step;
+ /*
+ * Advance the phase, once it gets to one microsecond, then
+ * advance the tick more.
+ */
+ time_phase += time_adj;
+ if (time_phase <= -FINEUSEC) {
+ long ltemp = -time_phase >> SHIFT_SCALE;
+ time_phase += ltemp << SHIFT_SCALE;
+ xtime.tv_usec -= ltemp;
+ }
+ else if (time_phase >= FINEUSEC) {
+ long ltemp = time_phase >> SHIFT_SCALE;
+ time_phase -= ltemp << SHIFT_SCALE;
+ xtime.tv_usec += ltemp;
+ }
+}
+
+/*
+ * Using a loop looks inefficient, but "ticks" is
+ * usually just one (we shouldn't be losing ticks,
+ * we're doing this this way mainly for interrupt
+ * latency reasons, not because we think we'll
+ * have lots of lost timer ticks
+ */
+static void update_wall_time(unsigned long ticks)
+{
+ do {
+ ticks--;
+ update_wall_time_one_tick();
+ } while (ticks);
+
+ if (xtime.tv_usec >= 1000000) {
+ xtime.tv_usec -= 1000000;
+ xtime.tv_sec++;
+ second_overflow();
+ }
+}
+
+static inline void do_process_times(struct task_struct *p,
+ unsigned long user, unsigned long system)
+{
+ unsigned long psecs;
+
+ psecs = (p->times.tms_utime += user);
+ psecs += (p->times.tms_stime += system);
+ if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) {
+ /* Send SIGXCPU every second.. */
+ if (!(psecs % HZ))
+ send_sig(SIGXCPU, p, 1);
+ /* and SIGKILL when we go over max.. */
+ if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max)
+ send_sig(SIGKILL, p, 1);
+ }
+}
+
+static inline void do_it_virt(struct task_struct * p, unsigned long ticks)
+{
+ unsigned long it_virt = p->it_virt_value;
+
+ if (it_virt) {
+ if (it_virt <= ticks) {
+ it_virt = ticks + p->it_virt_incr;
+ send_sig(SIGVTALRM, p, 1);
+ }
+ p->it_virt_value = it_virt - ticks;
+ }
+}
+
+static inline void do_it_prof(struct task_struct * p, unsigned long ticks)
+{
+ unsigned long it_prof = p->it_prof_value;
+
+ if (it_prof) {
+ if (it_prof <= ticks) {
+ it_prof = ticks + p->it_prof_incr;
+ send_sig(SIGPROF, p, 1);
+ }
+ p->it_prof_value = it_prof - ticks;
+ }
+}
+
+void update_one_process(struct task_struct *p,
+ unsigned long ticks, unsigned long user, unsigned long system, int cpu)
+{
+ p->per_cpu_utime[cpu] += user;
+ p->per_cpu_stime[cpu] += system;
+ do_process_times(p, user, system);
+ do_it_virt(p, user);
+ do_it_prof(p, ticks);
+}
+
+static void update_process_times(unsigned long ticks, unsigned long system)
+{
+/*
+ * SMP does this on a per-CPU basis elsewhere
+ */
+#ifndef __SMP__
+ struct task_struct * p = current;
+ unsigned long user = ticks - system;
+ if (p->pid) {
+ p->counter -= ticks;
+ if (p->counter <= 0) {
+ p->counter = 0;
+ p->need_resched = 1;
+ }
+ if (p->priority < DEF_PRIORITY)
+ kstat.cpu_nice += user;
+ else
+ kstat.cpu_user += user;
+ kstat.cpu_system += system;
+ }
+ update_one_process(p, ticks, user, system, 0);
+#endif
+}
+
+/*
+ * Nr of active tasks - counted in fixed-point numbers
+ */
+static unsigned long count_active_tasks(void)
+{
+ struct task_struct *p;
+ unsigned long nr = 0;
+
+ read_lock(&tasklist_lock);
+ for_each_task(p) {
+ if ((p->state == TASK_RUNNING ||
+ (p->state & TASK_UNINTERRUPTIBLE) ||
+ (p->state & TASK_SWAPPING)))
+ nr += FIXED_1;
+ }
+ read_unlock(&tasklist_lock);
+ return nr;
+}
+
+/*
+ * Hmm.. Changed this, as the GNU make sources (load.c) seems to
+ * imply that avenrun[] is the standard name for this kind of thing.
+ * Nothing else seems to be standardized: the fractional size etc
+ * all seem to differ on different machines.
+ */
+unsigned long avenrun[3] = { 0,0,0 };
+
+static inline void calc_load(unsigned long ticks)
+{
+ unsigned long active_tasks; /* fixed-point */
+ static int count = LOAD_FREQ;
+
+ count -= ticks;
+ if (count < 0) {
+ count += LOAD_FREQ;
+ active_tasks = count_active_tasks();
+ CALC_LOAD(avenrun[0], EXP_1, active_tasks);
+ CALC_LOAD(avenrun[1], EXP_5, active_tasks);
+ CALC_LOAD(avenrun[2], EXP_15, active_tasks);
+ }
+}
+
+volatile unsigned long lost_ticks = 0;
+static unsigned long lost_ticks_system = 0;
+
+/*
+ * This spinlock protect us from races in SMP while playing with xtime. -arca
+ */
+rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
+
+static inline void update_times(void)
+{
+ unsigned long ticks;
+
+ /*
+ * update_times() is run from the raw timer_bh handler so we
+ * just know that the irqs are locally enabled and so we don't
+ * need to save/restore the flags of the local CPU here. -arca
+ */
+ write_lock_irq(&xtime_lock);
+
+ ticks = lost_ticks;
+ lost_ticks = 0;
+
+ if (ticks) {
+ unsigned long system;
+ system = xchg(&lost_ticks_system, 0);
+
+ calc_load(ticks);
+ update_wall_time(ticks);
+ write_unlock_irq(&xtime_lock);
+
+ update_process_times(ticks, system);
+
+ } else
+ write_unlock_irq(&xtime_lock);
+}
+
+void timer_bh(void)
+{
+ update_times();
+ run_old_timers();
+ run_timer_list();
+}
+
+void do_timer(struct pt_regs * regs)
+{
+ (*(unsigned long *)&jiffies)++;
+ lost_ticks++;
+ mark_bh(TIMER_BH);
+ if (!user_mode(regs))
+ lost_ticks_system++;
+ if (tq_timer)
+ mark_bh(TQUEUE_BH);
+}
+
+#if !defined(__alpha__) && !defined(__ia64__)
+
+/*
+ * For backwards compatibility? This can be done in libc so Alpha
+ * and all newer ports shouldn't need it.
+ */
+asmlinkage unsigned long sys_alarm(unsigned int seconds)
+{
+ struct itimerval it_new, it_old;
+ unsigned int oldalarm;
+
+ it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
+ it_new.it_value.tv_sec = seconds;
+ it_new.it_value.tv_usec = 0;
+ do_setitimer(ITIMER_REAL, &it_new, &it_old);
+ oldalarm = it_old.it_value.tv_sec;
+ /* ehhh.. We can't return 0 if we have an alarm pending.. */
+ /* And we'd better return too much than too little anyway */
+ if (it_old.it_value.tv_usec)
+ oldalarm++;
+ return oldalarm;
+}
+
+#endif
+
+#ifndef __alpha__
+
+/*
+ * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
+ * should be moved into arch/i386 instead?
+ */
+
+asmlinkage long sys_getpid(void)
+{
+ /* This is SMP safe - current->pid doesn't change */
+ return current->pid;
+}
+
+/*
+ * This is not strictly SMP safe: p_opptr could change
+ * from under us. However, rather than getting any lock
+ * we can use an optimistic algorithm: get the parent
+ * pid, and go back and check that the parent is still
+ * the same. If it has changed (which is extremely unlikely
+ * indeed), we just try again..
+ *
+ * NOTE! This depends on the fact that even if we _do_
+ * get an old value of "parent", we can happily dereference
+ * the pointer: we just can't necessarily trust the result
+ * until we know that the parent pointer is valid.
+ *
+ * The "mb()" macro is a memory barrier - a synchronizing
+ * event. It also makes sure that gcc doesn't optimize
+ * away the necessary memory references.. The barrier doesn't
+ * have to have all that strong semantics: on x86 we don't
+ * really require a synchronizing instruction, for example.
+ * The barrier is more important for code generation than
+ * for any real memory ordering semantics (even if there is
+ * a small window for a race, using the old pointer is
+ * harmless for a while).
+ */
+asmlinkage long sys_getppid(void)
+{
+ int pid;
+ struct task_struct * me = current;
+ struct task_struct * parent;
+
+ parent = me->p_opptr;
+ for (;;) {
+ pid = parent->pid;
+#if __SMP__
+{
+ struct task_struct *old = parent;
+ mb();
+ parent = me->p_opptr;
+ if (old != parent)
+ continue;
+}
+#endif
+ break;
+ }
+ return pid;
+}
+
+asmlinkage long sys_getuid(void)
+{
+ /* Only we change this so SMP safe */
+ return current->uid;
+}
+
+asmlinkage long sys_geteuid(void)
+{
+ /* Only we change this so SMP safe */
+ return current->euid;
+}
+
+asmlinkage long sys_getgid(void)
+{
+ /* Only we change this so SMP safe */
+ return current->gid;
+}
+
+asmlinkage long sys_getegid(void)
+{
+ /* Only we change this so SMP safe */
+ return current->egid;
+}
+
+#endif
+
+asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
+{
+ struct timespec t;
+ unsigned long expire;
+
+ if(copy_from_user(&t, rqtp, sizeof(struct timespec)))
+ return -EFAULT;
+
+ if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0)
+ return -EINVAL;
+
+
+ if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
+ current->policy != SCHED_OTHER)
+ {
+ /*
+ * Short delay requests up to 2 ms will be handled with
+ * high precision by a busy wait for all real-time processes.
+ *
+ * Its important on SMP not to do this holding locks.
+ */
+ udelay((t.tv_nsec + 999) / 1000);
+ return 0;
+ }
+
+ expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
+
+ current->state = TASK_INTERRUPTIBLE;
+ expire = schedule_timeout(expire);
+
+ if (expire) {
+ if (rmtp) {
+ jiffies_to_timespec(expire, &t);
+ if (copy_to_user(rmtp, &t, sizeof(struct timespec)))
+ return -EFAULT;
+ }
+ return -EINTR;
+ }
+ return 0;
+}
+