Diffstat (limited to 'kernel')
-rw-r--r--   kernel/Makefile     |   2
-rw-r--r--   kernel/acct.c       |   8
-rw-r--r--   kernel/capability.c |   2
-rw-r--r--   kernel/fork.c       |   1
-rw-r--r--   kernel/ksyms.c      |  31
-rw-r--r--   kernel/module.c     |  42
-rw-r--r--   kernel/resource.c   |  41
-rw-r--r--   kernel/sched.c      | 897
-rw-r--r--   kernel/sysctl.c     |  19
-rw-r--r--   kernel/timer.c      | 791
10 files changed, 979 insertions, 855 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 05357e348..00d0dfa69 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -13,7 +13,7 @@ O_TARGET := kernel.o
 
 O_OBJS = sched.o dma.o fork.o exec_domain.o panic.o printk.o sys.o \
 	module.o exit.o itimer.o info.o time.o softirq.o resource.o \
-	sysctl.o acct.o capability.o ptrace.o
+	sysctl.o acct.o capability.o ptrace.o timer.o
 
 OX_OBJS += signal.o
 
diff --git a/kernel/acct.c b/kernel/acct.c
index 7e64105a8..fdadf7a9f 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -150,12 +150,12 @@ asmlinkage long sys_acct(const char *name)
 {
 	struct file *file = NULL, *old_acct = NULL;
 	char *tmp;
-	int error = -EPERM;
+	int error;
 
-	lock_kernel();
 	if (!capable(CAP_SYS_PACCT))
-		goto out;
+		return -EPERM;
+	lock_kernel();
 
 	if (name) {
 		tmp = getname(name);
 		error = PTR_ERR(tmp);
@@ -257,8 +257,6 @@ static comp_t encode_comp_t(unsigned long value)
  * into the accounting file. This function should only be called from
  * do_exit().
  */
-#define KSTK_EIP(stack)	(((unsigned long *)(stack))[1019])
-#define KSTK_ESP(stack)	(((unsigned long *)(stack))[1022])
 
 /*
  * do_acct_process does all actual work.
diff --git a/kernel/capability.c b/kernel/capability.c
index 2dbfe83f7..7c5f6df21 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -9,7 +9,7 @@
 #include <asm/uaccess.h>
 
 /* Note: never hold tasklist_lock while spinning for this one */
-spinlock_t task_capability_lock;
+spinlock_t task_capability_lock = SPIN_LOCK_UNLOCKED;
 
 /*
  * For sys_getproccap() and sys_setproccap(), any of the three
diff --git a/kernel/fork.c b/kernel/fork.c
index a90a6bc47..de0b59bac 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -19,6 +19,7 @@
 #include <linux/vmalloc.h>
 
 #include <asm/pgtable.h>
+#include <asm/pgalloc.h>
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 
diff --git a/kernel/ksyms.c b/kernel/ksyms.c
index 555e735d0..600395f49 100644
--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -39,6 +39,7 @@
 #include <linux/iobuf.h>
 #include <linux/console.h>
 #include <linux/poll.h>
+#include <linux/mmzone.h>
 #include <linux/mm.h>
 #include <linux/capability.h>
 
@@ -83,7 +84,7 @@ EXPORT_SYMBOL(get_option);
 EXPORT_SYMBOL(get_options);
 
 /* process memory management */
-EXPORT_SYMBOL(do_mmap);
+EXPORT_SYMBOL(do_mmap_pgoff);
 EXPORT_SYMBOL(do_munmap);
 EXPORT_SYMBOL(do_brk);
 EXPORT_SYMBOL(exit_mm);
@@ -92,9 +93,12 @@ EXPORT_SYMBOL(exit_fs);
 EXPORT_SYMBOL(exit_sighand);
 
 /* internal kernel memory management */
-EXPORT_SYMBOL(__get_free_pages);
-EXPORT_SYMBOL(free_pages);
-EXPORT_SYMBOL(__free_page);
+EXPORT_SYMBOL(__alloc_pages);
+EXPORT_SYMBOL(alloc_pages_node);
+EXPORT_SYMBOL(__free_pages_ok);
+#ifndef CONFIG_DISCONTIGMEM
+EXPORT_SYMBOL(contig_page_data);
+#endif
 EXPORT_SYMBOL(kmem_find_general_cachep);
 EXPORT_SYMBOL(kmem_cache_create);
 EXPORT_SYMBOL(kmem_cache_destroy);
@@ -114,6 +118,11 @@ EXPORT_SYMBOL(vmtruncate);
 EXPORT_SYMBOL(find_vma);
 EXPORT_SYMBOL(get_unmapped_area);
 EXPORT_SYMBOL(init_mm);
+#ifdef CONFIG_HIGHMEM
+EXPORT_SYMBOL(kmap_high);
+EXPORT_SYMBOL(kunmap_high);
+EXPORT_SYMBOL(highmem_start_page);
+#endif
 
 /* filesystem internal functions */
 EXPORT_SYMBOL(in_group_p);
@@ -173,12 +182,13 @@ EXPORT_SYMBOL(__brelse);
 EXPORT_SYMBOL(__bforget);
 EXPORT_SYMBOL(ll_rw_block);
 EXPORT_SYMBOL(__wait_on_buffer);
+EXPORT_SYMBOL(___wait_on_page);
 EXPORT_SYMBOL(add_blkdev_randomness);
 EXPORT_SYMBOL(block_read_full_page);
 EXPORT_SYMBOL(block_write_full_page);
 EXPORT_SYMBOL(block_write_partial_page);
 EXPORT_SYMBOL(block_write_cont_page);
-EXPORT_SYMBOL(block_flushpage);
+EXPORT_SYMBOL(block_write_zero_range); EXPORT_SYMBOL(generic_file_read); EXPORT_SYMBOL(do_generic_file_read); EXPORT_SYMBOL(generic_file_write); @@ -193,7 +203,6 @@ EXPORT_SYMBOL(posix_block_lock); EXPORT_SYMBOL(posix_unblock_lock); EXPORT_SYMBOL(locks_mandatory_area); EXPORT_SYMBOL(dput); -EXPORT_SYMBOL(put_cached_page); EXPORT_SYMBOL(is_root_busy); EXPORT_SYMBOL(prune_dcache); EXPORT_SYMBOL(shrink_dcache_sb); @@ -206,10 +215,16 @@ EXPORT_SYMBOL(vfs_unlink); EXPORT_SYMBOL(vfs_rename); EXPORT_SYMBOL(__pollwait); EXPORT_SYMBOL(ROOT_DEV); -EXPORT_SYMBOL(add_to_page_cache_unique); EXPORT_SYMBOL(__find_get_page); EXPORT_SYMBOL(__find_lock_page); - +EXPORT_SYMBOL(grab_cache_page); +EXPORT_SYMBOL(read_cache_page); +EXPORT_SYMBOL(vfs_readlink); +EXPORT_SYMBOL(vfs_follow_link); +EXPORT_SYMBOL(page_readlink); +EXPORT_SYMBOL(page_follow_link); +EXPORT_SYMBOL(block_symlink); + #if !defined(CONFIG_NFSD) && defined(CONFIG_NFSD_MODULE) EXPORT_SYMBOL(do_nfsservctl); #endif diff --git a/kernel/module.c b/kernel/module.c index 6f4ad977d..fb9d4ef8d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4,7 +4,7 @@ #include <asm/uaccess.h> #include <linux/vmalloc.h> #include <linux/smp_lock.h> -#include <asm/pgtable.h> +#include <asm/pgalloc.h> #include <linux/init.h> /* @@ -13,6 +13,7 @@ * 0.99.14 version by Jon Tombs <jon@gtex02.us.es>, * Heavily modified by Bjorn Ekwall <bj0rn@blox.se> May 1994 (C) * Rewritten by Richard Henderson <rth@tamu.edu> Dec 1996 + * Add MOD_INITIALIZING Keith Owens <kaos@ocs.com.au> Nov 1999 * * This source is covered by the GNU GPL, the same as all kernel sources. */ @@ -111,11 +112,9 @@ sys_create_module(const char *name_user, size_t size) long namelen, error; struct module *mod; + if (!capable(CAP_SYS_MODULE)) + return -EPERM; lock_kernel(); - if (!capable(CAP_SYS_MODULE)) { - error = -EPERM; - goto err0; - } if ((namelen = get_mod_name(name_user, &name)) < 0) { error = namelen; goto err0; @@ -162,13 +161,13 @@ sys_init_module(const char *name_user, struct module *mod_user) { struct module mod_tmp, *mod; char *name, *n_name; - long namelen, n_namelen, i, error = -EPERM; + long namelen, n_namelen, i, error; unsigned long mod_user_size; struct module_ref *dep; - lock_kernel(); if (!capable(CAP_SYS_MODULE)) - goto err0; + return -EPERM; + lock_kernel(); if ((namelen = get_mod_name(name_user, &name)) < 0) { error = namelen; goto err0; @@ -325,16 +324,18 @@ sys_init_module(const char *name_user, struct module *mod_user) put_mod_name(name); /* Initialize the module. */ + mod->flags |= MOD_INITIALIZING; atomic_set(&mod->uc.usecount,1); if (mod->init && mod->init() != 0) { atomic_set(&mod->uc.usecount,0); + mod->flags &= ~MOD_INITIALIZING; error = -EBUSY; goto err0; } atomic_dec(&mod->uc.usecount); /* And set it running. 
*/ - mod->flags |= MOD_RUNNING; + mod->flags = (mod->flags | MOD_RUNNING) & ~MOD_INITIALIZING; error = 0; goto err0; @@ -354,13 +355,13 @@ sys_delete_module(const char *name_user) { struct module *mod, *next; char *name; - long error = -EPERM; + long error; int something_changed; - lock_kernel(); if (!capable(CAP_SYS_MODULE)) - goto out; + return -EPERM; + lock_kernel(); if (name_user) { if ((error = get_mod_name(name_user, &name)) < 0) goto out; @@ -458,7 +459,7 @@ qm_deps(struct module *mod, char *buf, size_t bufsize, size_t *ret) if (mod == &kernel_module) return -EINVAL; - if ((mod->flags & (MOD_RUNNING | MOD_DELETED)) != MOD_RUNNING) + if (!MOD_CAN_QUERY(mod)) if (put_user(0, ret)) return -EFAULT; else @@ -502,7 +503,7 @@ qm_refs(struct module *mod, char *buf, size_t bufsize, size_t *ret) if (mod == &kernel_module) return -EINVAL; - if ((mod->flags & (MOD_RUNNING | MOD_DELETED)) != MOD_RUNNING) + if (!MOD_CAN_QUERY(mod)) if (put_user(0, ret)) return -EFAULT; else @@ -546,7 +547,7 @@ qm_symbols(struct module *mod, char *buf, size_t bufsize, size_t *ret) char *strings; unsigned long *vals; - if ((mod->flags & (MOD_RUNNING | MOD_DELETED)) != MOD_RUNNING) + if (!MOD_CAN_QUERY(mod)) if (put_user(0, ret)) return -EFAULT; else @@ -712,7 +713,7 @@ sys_get_kernel_syms(struct kernel_sym *table) struct module_symbol *msym; unsigned int j; - if ((mod->flags & (MOD_RUNNING|MOD_DELETED)) != MOD_RUNNING) + if (!MOD_CAN_QUERY(mod)) continue; /* magic: write module info as a pseudo symbol */ @@ -861,7 +862,10 @@ int get_module_list(char *p) safe_copy_cstr(" (autoclean)"); if (!(mod->flags & MOD_USED_ONCE)) safe_copy_cstr(" (unused)"); - } else + } + else if (mod->flags & MOD_INITIALIZING) + safe_copy_cstr(" (initializing)"); + else safe_copy_cstr(" (uninitialized)"); if ((ref = mod->refs) != NULL) { @@ -905,7 +909,7 @@ get_ksyms_list(char *buf, char **start, off_t offset, int length) unsigned i; struct module_symbol *sym; - if (!(mod->flags & MOD_RUNNING) || (mod->flags & MOD_DELETED)) + if (!MOD_CAN_QUERY(mod)) continue; for (i = mod->nsyms, sym = mod->syms; i > 0; --i, ++sym) { @@ -953,7 +957,7 @@ get_module_symbol(char *modname, char *symname) for (mp = module_list; mp; mp = mp->next) { if (((modname == NULL) || (strcmp(mp->name, modname) == 0)) && - (mp->flags & (MOD_RUNNING | MOD_DELETED)) == MOD_RUNNING && + MOD_CAN_QUERY(mp) && (mp->nsyms > 0)) { for (i = mp->nsyms, sym = mp->syms; i > 0; --i, ++sym) { diff --git a/kernel/resource.c b/kernel/resource.c index e3dd3a16d..bdf9fef31 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -13,9 +13,10 @@ #include <linux/init.h> #include <linux/malloc.h> #include <linux/spinlock.h> +#include <asm/io.h> -struct resource ioport_resource = { "PCI IO", 0x0000, 0xFFFF, IORESOURCE_IO }; -struct resource iomem_resource = { "PCI mem", 0x00000000, 0xFFFFFFFF, IORESOURCE_MEM }; +struct resource ioport_resource = { "PCI IO", 0x0000, IO_SPACE_LIMIT, IORESOURCE_IO }; +struct resource iomem_resource = { "PCI mem", 0x00000000, 0xffffffff, IORESOURCE_MEM }; static rwlock_t resource_lock = RW_LOCK_UNLOCKED; @@ -126,30 +127,32 @@ int release_resource(struct resource *old) static int find_resource(struct resource *root, struct resource *new, unsigned long size, unsigned long min, unsigned long max, - unsigned long align) + unsigned long align, + void (*alignf)(void *, struct resource *, unsigned long), + void *alignf_data) { struct resource *this = root->child; - unsigned long start, end; - start = root->start; + new->start = root->start; for(;;) { if (this) - end = 
this->start; + new->end = this->start; else - end = root->end; - if (start < min) - start = min; - if (end > max) - end = max; - start = (start + align - 1) & ~(align - 1); - if (start < end && end - start + 1 >= size) { - new->start = start; - new->end = start + size - 1; + new->end = root->end; + if (new->start < min) + new->start = min; + if (new->end > max) + new->end = max; + new->start = (new->start + align - 1) & ~(align - 1); + if (alignf) + alignf(alignf_data, new, size); + if (new->start < new->end && new->end - new->start + 1 >= size) { + new->end = new->start + size - 1; return 0; } if (!this) break; - start = this->end + 1; + new->start = this->end + 1; this = this->sibling; } return -EBUSY; @@ -161,12 +164,14 @@ static int find_resource(struct resource *root, struct resource *new, int allocate_resource(struct resource *root, struct resource *new, unsigned long size, unsigned long min, unsigned long max, - unsigned long align) + unsigned long align, + void (*alignf)(void *, struct resource *, unsigned long), + void *alignf_data) { int err; write_lock(&resource_lock); - err = find_resource(root, new, size, min, max, align); + err = find_resource(root, new, size, min, max, align, alignf, alignf_data); if (err >= 0 && __request_resource(root, new)) err = -EBUSY; write_unlock(&resource_lock); diff --git a/kernel/sched.c b/kernel/sched.c index ac68af2e3..e653fd22c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1,20 +1,15 @@ /* * linux/kernel/sched.c * + * Kernel scheduler and related syscalls + * * Copyright (C) 1991, 1992 Linus Torvalds * * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and * make semaphores SMP safe - * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. - * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills * 1998-11-19 Implemented schedule_timeout() and related stuff * by Andrea Arcangeli - * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to - * serialize accesses to xtime/lost_ticks). - * Copyright (C) 1998 Andrea Arcangeli * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar - * 1999-03-10 Improved NTP compatibility by Ulrich Windl */ /* @@ -25,69 +20,27 @@ */ #include <linux/mm.h> -#include <linux/kernel_stat.h> -#include <linux/fdreg.h> -#include <linux/delay.h> -#include <linux/interrupt.h> -#include <linux/smp_lock.h> #include <linux/init.h> +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> -#include <asm/io.h> #include <asm/uaccess.h> -#include <asm/pgtable.h> #include <asm/mmu_context.h> -#include <linux/timex.h> + +extern void timer_bh(void); +extern void tqueue_bh(void); +extern void immediate_bh(void); /* - * kernel variables + * scheduler variables */ unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ -long tick = (1000000 + HZ/2) / HZ; /* timer interrupt period */ - -/* The current time */ -volatile struct timeval xtime __attribute__ ((aligned (16))); - -/* Don't completely fail for HZ > 500. */ -int tickadj = 500/HZ ? 
: 1; /* microsecs */ - -DECLARE_TASK_QUEUE(tq_timer); -DECLARE_TASK_QUEUE(tq_immediate); -DECLARE_TASK_QUEUE(tq_scheduler); - -/* - * phase-lock loop variables - */ -/* TIME_ERROR prevents overwriting the CMOS clock */ -int time_state = TIME_OK; /* clock synchronization status */ -int time_status = STA_UNSYNC; /* clock status bits */ -long time_offset = 0; /* time adjustment (us) */ -long time_constant = 2; /* pll time constant */ -long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ -long time_precision = 1; /* clock precision (us) */ -long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ -long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -long time_phase = 0; /* phase offset (scaled us) */ -long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC; /* frequency offset (scaled ppm) */ -long time_adj = 0; /* tick adjust (scaled 1 / HZ) */ -long time_reftime = 0; /* time at last adjustment (s) */ - -long time_adjust = 0; -long time_adjust_step = 0; - -unsigned long event = 0; - -extern int do_setitimer(int, struct itimerval *, struct itimerval *); -unsigned int * prof_buffer = NULL; -unsigned long prof_len = 0; -unsigned long prof_shift = 0; - extern void mem_use(void); -unsigned long volatile jiffies=0; - /* * Init task must be ok at boot for the ix86 as we will check its signals * via the SMP irq return path. @@ -223,49 +176,88 @@ static inline int preemption_goodness(struct task_struct * prev, struct task_str return goodness(p, cpu, prev->mm) - goodness(prev, cpu, prev->mm); } -static void reschedule_idle(struct task_struct * p) +/* + * This is ugly, but reschedule_idle() is very timing-critical. + * We enter with the runqueue spinlock held, but we might end + * up unlocking it early, so the caller must not unlock the + * runqueue, it's always done by reschedule_idle(). + */ +static inline void reschedule_idle(struct task_struct * p, unsigned long flags) { #ifdef __SMP__ int this_cpu = smp_processor_id(), target_cpu; - struct task_struct *tsk, *target_tsk; + struct task_struct *tsk; int cpu, best_cpu, i; - unsigned long flags; - - spin_lock_irqsave(&runqueue_lock, flags); /* * shortcut if the woken up task's last CPU is * idle now. */ best_cpu = p->processor; - target_tsk = idle_task(best_cpu); - if (cpu_curr(best_cpu) == target_tsk) + tsk = idle_task(best_cpu); + if (cpu_curr(best_cpu) == tsk) goto send_now; - target_tsk = NULL; + /* + * The only heuristics - we use the tsk->avg_slice value + * to detect 'frequent reschedulers'. + * + * If both the woken-up process and the preferred CPU is + * is a frequent rescheduler, then skip the asynchronous + * wakeup, the frequent rescheduler will likely chose this + * task during it's next schedule(): + */ + if (p->policy == SCHED_OTHER) { + tsk = cpu_curr(best_cpu); + if (p->avg_slice + tsk->avg_slice < cacheflush_time) + goto out_no_target; + } + + /* + * We know that the preferred CPU has a cache-affine current + * process, lets try to find a new idle CPU for the woken-up + * process: + */ for (i = 0; i < smp_num_cpus; i++) { cpu = cpu_logical_map(i); tsk = cpu_curr(cpu); + /* + * We use the first available idle CPU. This creates + * a priority list between idle CPUs, but this is not + * a problem. + */ if (tsk == idle_task(cpu)) - target_tsk = tsk; + goto send_now; } - if (target_tsk && p->avg_slice > cacheflush_time) - goto send_now; - + /* + * No CPU is idle, but maybe this process has enough priority + * to preempt it's preferred CPU. 
(this is a shortcut): + */ tsk = cpu_curr(best_cpu); if (preemption_goodness(tsk, p, best_cpu) > 0) - target_tsk = tsk; + goto send_now; /* - * found any suitable CPU? + * We should get here rarely - or in the high CPU contention + * case. No CPU is idle and this process is either lowprio or + * the preferred CPU is highprio. Maybe some other CPU can/must + * be preempted: */ - if (!target_tsk) - goto out_no_target; + for (i = 0; i < smp_num_cpus; i++) { + cpu = cpu_logical_map(i); + tsk = cpu_curr(cpu); + if (preemption_goodness(tsk, p, cpu) > 0) + goto send_now; + } + +out_no_target: + spin_unlock_irqrestore(&runqueue_lock, flags); + return; send_now: - target_cpu = target_tsk->processor; - target_tsk->need_resched = 1; + target_cpu = tsk->processor; + tsk->need_resched = 1; spin_unlock_irqrestore(&runqueue_lock, flags); /* * the APIC stuff can go outside of the lock because @@ -274,9 +266,6 @@ send_now: if (target_cpu != this_cpu) smp_send_reschedule(target_cpu); return; -out_no_target: - spin_unlock_irqrestore(&runqueue_lock, flags); - return; #else /* UP */ int this_cpu = smp_processor_id(); struct task_struct *tsk; @@ -320,7 +309,7 @@ static inline void move_first_runqueue(struct task_struct * p) * "current->state = TASK_RUNNING" to mark yourself runnable * without the overhead of this. */ -void wake_up_process(struct task_struct * p) +inline void wake_up_process(struct task_struct * p) { unsigned long flags; @@ -332,152 +321,34 @@ void wake_up_process(struct task_struct * p) if (task_on_runqueue(p)) goto out; add_to_runqueue(p); - spin_unlock_irqrestore(&runqueue_lock, flags); + reschedule_idle(p, flags); // spin_unlocks runqueue - reschedule_idle(p); return; out: spin_unlock_irqrestore(&runqueue_lock, flags); } -static void process_timeout(unsigned long __data) -{ - struct task_struct * p = (struct task_struct *) __data; - - wake_up_process(p); -} - -/* - * Event timer code - */ -#define TVN_BITS 6 -#define TVR_BITS 8 -#define TVN_SIZE (1 << TVN_BITS) -#define TVR_SIZE (1 << TVR_BITS) -#define TVN_MASK (TVN_SIZE - 1) -#define TVR_MASK (TVR_SIZE - 1) - -struct timer_vec { - int index; - struct timer_list *vec[TVN_SIZE]; -}; - -struct timer_vec_root { - int index; - struct timer_list *vec[TVR_SIZE]; -}; - -static struct timer_vec tv5 = { 0 }; -static struct timer_vec tv4 = { 0 }; -static struct timer_vec tv3 = { 0 }; -static struct timer_vec tv2 = { 0 }; -static struct timer_vec_root tv1 = { 0 }; - -static struct timer_vec * const tvecs[] = { - (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5 -}; - -#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0])) - -static unsigned long timer_jiffies = 0; - -static inline void insert_timer(struct timer_list *timer, - struct timer_list **vec, int idx) +static inline void wake_up_process_synchronous(struct task_struct * p) { - if ((timer->next = vec[idx])) - vec[idx]->prev = timer; - vec[idx] = timer; - timer->prev = (struct timer_list *)&vec[idx]; -} + unsigned long flags; -static inline void internal_add_timer(struct timer_list *timer) -{ /* - * must be cli-ed when calling this + * We want the common case fall through straight, thus the goto. 
*/ - unsigned long expires = timer->expires; - unsigned long idx = expires - timer_jiffies; - - if (idx < TVR_SIZE) { - int i = expires & TVR_MASK; - insert_timer(timer, tv1.vec, i); - } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { - int i = (expires >> TVR_BITS) & TVN_MASK; - insert_timer(timer, tv2.vec, i); - } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; - insert_timer(timer, tv3.vec, i); - } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; - insert_timer(timer, tv4.vec, i); - } else if ((signed long) idx < 0) { - /* can happen if you add a timer with expires == jiffies, - * or you set a timer to go off in the past - */ - insert_timer(timer, tv1.vec, tv1.index); - } else if (idx <= 0xffffffffUL) { - int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; - insert_timer(timer, tv5.vec, i); - } else { - /* Can only get here on architectures with 64-bit jiffies */ - timer->next = timer->prev = timer; - } -} - -spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED; - -void add_timer(struct timer_list *timer) -{ - unsigned long flags; - - spin_lock_irqsave(&timerlist_lock, flags); - if (timer->prev) - goto bug; - internal_add_timer(timer); + spin_lock_irqsave(&runqueue_lock, flags); + p->state = TASK_RUNNING; + if (task_on_runqueue(p)) + goto out; + add_to_runqueue(p); out: - spin_unlock_irqrestore(&timerlist_lock, flags); - return; - -bug: - printk("bug: kernel timer added twice at %p.\n", - __builtin_return_address(0)); - goto out; -} - -static inline int detach_timer(struct timer_list *timer) -{ - struct timer_list *prev = timer->prev; - if (prev) { - struct timer_list *next = timer->next; - prev->next = next; - if (next) - next->prev = prev; - return 1; - } - return 0; -} - -void mod_timer(struct timer_list *timer, unsigned long expires) -{ - unsigned long flags; - - spin_lock_irqsave(&timerlist_lock, flags); - timer->expires = expires; - detach_timer(timer); - internal_add_timer(timer); - spin_unlock_irqrestore(&timerlist_lock, flags); + spin_unlock_irqrestore(&runqueue_lock, flags); } -int del_timer(struct timer_list * timer) +static void process_timeout(unsigned long __data) { - int ret; - unsigned long flags; + struct task_struct * p = (struct task_struct *) __data; - spin_lock_irqsave(&timerlist_lock, flags); - ret = detach_timer(timer); - timer->next = timer->prev = 0; - spin_unlock_irqrestore(&timerlist_lock, flags); - return ret; + wake_up_process(p); } signed long schedule_timeout(signed long timeout) @@ -541,8 +412,12 @@ static inline void __schedule_tail(struct task_struct *prev) { #ifdef __SMP__ if ((prev->state == TASK_RUNNING) && - (prev != idle_task(smp_processor_id()))) - reschedule_idle(prev); + (prev != idle_task(smp_processor_id()))) { + unsigned long flags; + + spin_lock_irqsave(&runqueue_lock, flags); + reschedule_idle(prev, flags); // spin_unlocks runqueue + } wmb(); prev->has_cpu = 0; #endif /* __SMP__ */ @@ -765,7 +640,7 @@ scheduling_in_interrupt: return; } -void __wake_up(wait_queue_head_t *q, unsigned int mode) +static inline void __wake_up_common(wait_queue_head_t *q, unsigned int mode, const int sync) { struct list_head *tmp, *head; struct task_struct *p; @@ -801,7 +676,10 @@ void __wake_up(wait_queue_head_t *q, unsigned int mode) #if WAITQUEUE_DEBUG curr->__waker = (long)__builtin_return_address(0); #endif - wake_up_process(p); + if (sync) + wake_up_process_synchronous(p); + else + wake_up_process(p); if (state & TASK_EXCLUSIVE) break; } @@ 
-811,6 +689,16 @@ out: return; } +void __wake_up(wait_queue_head_t *q, unsigned int mode) +{ + __wake_up_common(q, mode, 0); +} + +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode) +{ + __wake_up_common(q, mode, 1); +} + #define SLEEP_ON_VAR \ unsigned long flags; \ wait_queue_t wait; \ @@ -876,549 +764,9 @@ long sleep_on_timeout(wait_queue_head_t *q, long timeout) void scheduling_functions_end_here(void) { } -static inline void cascade_timers(struct timer_vec *tv) -{ - /* cascade all the timers from tv up one level */ - struct timer_list *timer; - timer = tv->vec[tv->index]; - /* - * We are removing _all_ timers from the list, so we don't have to - * detach them individually, just clear the list afterwards. - */ - while (timer) { - struct timer_list *tmp = timer; - timer = timer->next; - internal_add_timer(tmp); - } - tv->vec[tv->index] = NULL; - tv->index = (tv->index + 1) & TVN_MASK; -} - -static inline void run_timer_list(void) -{ - spin_lock_irq(&timerlist_lock); - while ((long)(jiffies - timer_jiffies) >= 0) { - struct timer_list *timer; - if (!tv1.index) { - int n = 1; - do { - cascade_timers(tvecs[n]); - } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS); - } - while ((timer = tv1.vec[tv1.index])) { - void (*fn)(unsigned long) = timer->function; - unsigned long data = timer->data; - detach_timer(timer); - timer->next = timer->prev = NULL; - spin_unlock_irq(&timerlist_lock); - fn(data); - spin_lock_irq(&timerlist_lock); - } - ++timer_jiffies; - tv1.index = (tv1.index + 1) & TVR_MASK; - } - spin_unlock_irq(&timerlist_lock); -} - - -static inline void run_old_timers(void) -{ - struct timer_struct *tp; - unsigned long mask; - - for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) { - if (mask > timer_active) - break; - if (!(mask & timer_active)) - continue; - if (time_after(tp->expires, jiffies)) - continue; - timer_active &= ~mask; - tp->fn(); - sti(); - } -} - -spinlock_t tqueue_lock; - -void tqueue_bh(void) -{ - run_task_queue(&tq_timer); -} - -void immediate_bh(void) -{ - run_task_queue(&tq_immediate); -} - -unsigned long timer_active = 0; -struct timer_struct timer_table[32]; - -/* - * Hmm.. Changed this, as the GNU make sources (load.c) seems to - * imply that avenrun[] is the standard name for this kind of thing. - * Nothing else seems to be standardized: the fractional size etc - * all seem to differ on different machines. - */ -unsigned long avenrun[3] = { 0,0,0 }; - -/* - * Nr of active tasks - counted in fixed-point numbers - */ -static unsigned long count_active_tasks(void) -{ - struct task_struct *p; - unsigned long nr = 0; - - read_lock(&tasklist_lock); - for_each_task(p) { - if ((p->state == TASK_RUNNING || - (p->state & TASK_UNINTERRUPTIBLE) || - (p->state & TASK_SWAPPING))) - nr += FIXED_1; - } - read_unlock(&tasklist_lock); - return nr; -} - -static inline void calc_load(unsigned long ticks) -{ - unsigned long active_tasks; /* fixed-point */ - static int count = LOAD_FREQ; - - count -= ticks; - if (count < 0) { - count += LOAD_FREQ; - active_tasks = count_active_tasks(); - CALC_LOAD(avenrun[0], EXP_1, active_tasks); - CALC_LOAD(avenrun[1], EXP_5, active_tasks); - CALC_LOAD(avenrun[2], EXP_15, active_tasks); - } -} - -/* - * this routine handles the overflow of the microsecond field - * - * The tricky bits of code to handle the accurate clock support - * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. - * They were originally developed for SUN and DEC kernels. - * All the kudos should go to Dave for this stuff. 
- * - */ -static void second_overflow(void) -{ - long ltemp; - - /* Bump the maxerror field */ - time_maxerror += time_tolerance >> SHIFT_USEC; - if ( time_maxerror > NTP_PHASE_LIMIT ) { - time_maxerror = NTP_PHASE_LIMIT; - time_status |= STA_UNSYNC; - } - - /* - * Leap second processing. If in leap-insert state at - * the end of the day, the system clock is set back one - * second; if in leap-delete state, the system clock is - * set ahead one second. The microtime() routine or - * external clock driver will insure that reported time - * is always monotonic. The ugly divides should be - * replaced. - */ - switch (time_state) { - - case TIME_OK: - if (time_status & STA_INS) - time_state = TIME_INS; - else if (time_status & STA_DEL) - time_state = TIME_DEL; - break; - - case TIME_INS: - if (xtime.tv_sec % 86400 == 0) { - xtime.tv_sec--; - time_state = TIME_OOP; - printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); - } - break; - - case TIME_DEL: - if ((xtime.tv_sec + 1) % 86400 == 0) { - xtime.tv_sec++; - time_state = TIME_WAIT; - printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); - } - break; - - case TIME_OOP: - time_state = TIME_WAIT; - break; - - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - } - - /* - * Compute the phase adjustment for the next second. In - * PLL mode, the offset is reduced by a fixed factor - * times the time constant. In FLL mode the offset is - * used directly. In either mode, the maximum phase - * adjustment for each second is clamped so as to spread - * the adjustment over not more than the number of - * seconds between updates. - */ - if (time_offset < 0) { - ltemp = -time_offset; - if (!(time_status & STA_FLL)) - ltemp >>= SHIFT_KG + time_constant; - if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) - ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; - time_offset += ltemp; - time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - } else { - ltemp = time_offset; - if (!(time_status & STA_FLL)) - ltemp >>= SHIFT_KG + time_constant; - if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) - ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; - time_offset -= ltemp; - time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - } - - /* - * Compute the frequency estimate and additional phase - * adjustment due to frequency error for the next - * second. When the PPS signal is engaged, gnaw on the - * watchdog counter and update the frequency computed by - * the pll and the PPS signal. - */ - pps_valid++; - if (pps_valid == PPS_VALID) { /* PPS signal lost */ - pps_jitter = MAXTIME; - pps_stabil = MAXFREQ; - time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | - STA_PPSWANDER | STA_PPSERROR); - } - ltemp = time_freq + pps_freq; - if (ltemp < 0) - time_adj -= -ltemp >> - (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); - else - time_adj += ltemp >> - (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); - -#if HZ == 100 - /* Compensate for (HZ==100) != (1 << SHIFT_HZ). - * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) - */ - if (time_adj < 0) - time_adj -= (-time_adj >> 2) + (-time_adj >> 5); - else - time_adj += (time_adj >> 2) + (time_adj >> 5); -#endif -} - -/* in the NTP reference this is called "hardclock()" */ -static void update_wall_time_one_tick(void) -{ - if ( (time_adjust_step = time_adjust) != 0 ) { - /* We are doing an adjtime thing. - * - * Prepare time_adjust_step to be within bounds. - * Note that a positive time_adjust means we want the clock - * to run faster. 
- * - * Limit the amount of the step to be in the range - * -tickadj .. +tickadj - */ - if (time_adjust > tickadj) - time_adjust_step = tickadj; - else if (time_adjust < -tickadj) - time_adjust_step = -tickadj; - - /* Reduce by this step the amount of time left */ - time_adjust -= time_adjust_step; - } - xtime.tv_usec += tick + time_adjust_step; - /* - * Advance the phase, once it gets to one microsecond, then - * advance the tick more. - */ - time_phase += time_adj; - if (time_phase <= -FINEUSEC) { - long ltemp = -time_phase >> SHIFT_SCALE; - time_phase += ltemp << SHIFT_SCALE; - xtime.tv_usec -= ltemp; - } - else if (time_phase >= FINEUSEC) { - long ltemp = time_phase >> SHIFT_SCALE; - time_phase -= ltemp << SHIFT_SCALE; - xtime.tv_usec += ltemp; - } -} - -/* - * Using a loop looks inefficient, but "ticks" is - * usually just one (we shouldn't be losing ticks, - * we're doing this this way mainly for interrupt - * latency reasons, not because we think we'll - * have lots of lost timer ticks - */ -static void update_wall_time(unsigned long ticks) -{ - do { - ticks--; - update_wall_time_one_tick(); - } while (ticks); - - if (xtime.tv_usec >= 1000000) { - xtime.tv_usec -= 1000000; - xtime.tv_sec++; - second_overflow(); - } -} - -static inline void do_process_times(struct task_struct *p, - unsigned long user, unsigned long system) -{ - unsigned long psecs; - - psecs = (p->times.tms_utime += user); - psecs += (p->times.tms_stime += system); - if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) { - /* Send SIGXCPU every second.. */ - if (!(psecs % HZ)) - send_sig(SIGXCPU, p, 1); - /* and SIGKILL when we go over max.. */ - if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max) - send_sig(SIGKILL, p, 1); - } -} - -static inline void do_it_virt(struct task_struct * p, unsigned long ticks) -{ - unsigned long it_virt = p->it_virt_value; - - if (it_virt) { - if (it_virt <= ticks) { - it_virt = ticks + p->it_virt_incr; - send_sig(SIGVTALRM, p, 1); - } - p->it_virt_value = it_virt - ticks; - } -} - -static inline void do_it_prof(struct task_struct * p, unsigned long ticks) -{ - unsigned long it_prof = p->it_prof_value; - - if (it_prof) { - if (it_prof <= ticks) { - it_prof = ticks + p->it_prof_incr; - send_sig(SIGPROF, p, 1); - } - p->it_prof_value = it_prof - ticks; - } -} - -void update_one_process(struct task_struct *p, - unsigned long ticks, unsigned long user, unsigned long system, int cpu) -{ - p->per_cpu_utime[cpu] += user; - p->per_cpu_stime[cpu] += system; - do_process_times(p, user, system); - do_it_virt(p, user); - do_it_prof(p, ticks); -} - -static void update_process_times(unsigned long ticks, unsigned long system) -{ -/* - * SMP does this on a per-CPU basis elsewhere - */ -#ifndef __SMP__ - struct task_struct * p = current; - unsigned long user = ticks - system; - if (p->pid) { - p->counter -= ticks; - if (p->counter <= 0) { - p->counter = 0; - p->need_resched = 1; - } - if (p->priority < DEF_PRIORITY) - kstat.cpu_nice += user; - else - kstat.cpu_user += user; - kstat.cpu_system += system; - } - update_one_process(p, ticks, user, system, 0); -#endif -} - -volatile unsigned long lost_ticks = 0; -static unsigned long lost_ticks_system = 0; - -/* - * This spinlock protect us from races in SMP while playing with xtime. 
-arca - */ -rwlock_t xtime_lock = RW_LOCK_UNLOCKED; - -static inline void update_times(void) -{ - unsigned long ticks; - - /* - * update_times() is run from the raw timer_bh handler so we - * just know that the irqs are locally enabled and so we don't - * need to save/restore the flags of the local CPU here. -arca - */ - write_lock_irq(&xtime_lock); - - ticks = lost_ticks; - lost_ticks = 0; - - if (ticks) { - unsigned long system; - system = xchg(&lost_ticks_system, 0); - - calc_load(ticks); - update_wall_time(ticks); - write_unlock_irq(&xtime_lock); - - update_process_times(ticks, system); - - } else - write_unlock_irq(&xtime_lock); -} - -static void timer_bh(void) -{ - update_times(); - run_old_timers(); - run_timer_list(); -} - -void do_timer(struct pt_regs * regs) -{ - (*(unsigned long *)&jiffies)++; - lost_ticks++; - mark_bh(TIMER_BH); - if (!user_mode(regs)) - lost_ticks_system++; - if (tq_timer) - mark_bh(TQUEUE_BH); -} - -#if !defined(__alpha__) && !defined(__ia64__) - -/* - * For backwards compatibility? This can be done in libc so Alpha - * and all newer ports shouldn't need it. - */ -asmlinkage unsigned long sys_alarm(unsigned int seconds) -{ - struct itimerval it_new, it_old; - unsigned int oldalarm; - - it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; - it_new.it_value.tv_sec = seconds; - it_new.it_value.tv_usec = 0; - do_setitimer(ITIMER_REAL, &it_new, &it_old); - oldalarm = it_old.it_value.tv_sec; - /* ehhh.. We can't return 0 if we have an alarm pending.. */ - /* And we'd better return too much than too little anyway */ - if (it_old.it_value.tv_usec) - oldalarm++; - return oldalarm; -} - -#endif - #ifndef __alpha__ /* - * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this - * should be moved into arch/i386 instead? - */ - -asmlinkage long sys_getpid(void) -{ - /* This is SMP safe - current->pid doesn't change */ - return current->pid; -} - -/* - * This is not strictly SMP safe: p_opptr could change - * from under us. However, rather than getting any lock - * we can use an optimistic algorithm: get the parent - * pid, and go back and check that the parent is still - * the same. If it has changed (which is extremely unlikely - * indeed), we just try again.. - * - * NOTE! This depends on the fact that even if we _do_ - * get an old value of "parent", we can happily dereference - * the pointer: we just can't necessarily trust the result - * until we know that the parent pointer is valid. - * - * The "mb()" macro is a memory barrier - a synchronizing - * event. It also makes sure that gcc doesn't optimize - * away the necessary memory references.. The barrier doesn't - * have to have all that strong semantics: on x86 we don't - * really require a synchronizing instruction, for example. - * The barrier is more important for code generation than - * for any real memory ordering semantics (even if there is - * a small window for a race, using the old pointer is - * harmless for a while). 
- */ -asmlinkage long sys_getppid(void) -{ - int pid; - struct task_struct * me = current; - struct task_struct * parent; - - parent = me->p_opptr; - for (;;) { - pid = parent->pid; -#if __SMP__ -{ - struct task_struct *old = parent; - mb(); - parent = me->p_opptr; - if (old != parent) - continue; -} -#endif - break; - } - return pid; -} - -asmlinkage long sys_getuid(void) -{ - /* Only we change this so SMP safe */ - return current->uid; -} - -asmlinkage long sys_geteuid(void) -{ - /* Only we change this so SMP safe */ - return current->euid; -} - -asmlinkage long sys_getgid(void) -{ - /* Only we change this so SMP safe */ - return current->gid; -} - -asmlinkage long sys_getegid(void) -{ - /* Only we change this so SMP safe */ - return current->egid; -} - -/* * This has been replaced by sys_setpriority. Maybe it should be * moved into the arch dependent tree for those ports that require * it for backward compatibility? @@ -1674,47 +1022,6 @@ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) return 0; } -asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp) -{ - struct timespec t; - unsigned long expire; - - if(copy_from_user(&t, rqtp, sizeof(struct timespec))) - return -EFAULT; - - if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0) - return -EINVAL; - - - if (t.tv_sec == 0 && t.tv_nsec <= 2000000L && - current->policy != SCHED_OTHER) - { - /* - * Short delay requests up to 2 ms will be handled with - * high precision by a busy wait for all real-time processes. - * - * Its important on SMP not to do this holding locks. - */ - udelay((t.tv_nsec + 999) / 1000); - return 0; - } - - expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); - - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire); - - if (expire) { - if (rmtp) { - jiffies_to_timespec(expire, &t); - if (copy_to_user(rmtp, &t, sizeof(struct timespec))) - return -EFAULT; - } - return -EINTR; - } - return 0; -} - static void show_task(struct task_struct * p) { unsigned long free = 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f18ad13c1..fa69edbac 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,7 +49,9 @@ extern char modprobe_path[]; extern int sg_big_buff; #endif #ifdef CONFIG_SYSVIPC -extern size_t shm_prm[]; +extern size_t shm_ctlmax; +extern int shm_ctlall; +extern int shm_ctlmni; extern int msg_ctlmax; extern int msg_ctlmnb; extern int msg_ctlmni; @@ -133,10 +135,8 @@ struct inode_operations proc_sys_inode_operations = NULL, /* get_block */ NULL, /* readpage */ NULL, /* writepage */ - NULL, /* flushpage */ NULL, /* truncate */ proc_sys_permission, /* permission */ - NULL, /* smap */ NULL /* revalidate */ }; @@ -217,8 +217,12 @@ static ctl_table kern_table[] = { {KERN_RTSIGMAX, "rtsig-max", &max_queued_signals, sizeof(int), 0644, NULL, &proc_dointvec}, #ifdef CONFIG_SYSVIPC - {KERN_SHMMAX, "shmmax", &shm_prm, 3*sizeof (size_t), + {KERN_SHMMAX, "shmmax", &shm_ctlmax, sizeof (size_t), 0644, NULL, &proc_doulongvec_minmax}, + {KERN_SHMALL, "shmall", &shm_ctlall, sizeof (int), + 0644, NULL, &proc_dointvec}, + {KERN_SHMMNI, "shmmni", &shm_ctlmni, sizeof (int), + 0644, NULL, &proc_dointvec}, {KERN_MSGMAX, "msgmax", &msg_ctlmax, sizeof (int), 0644, NULL, &proc_dointvec}, {KERN_MSGMNI, "msgmni", &msg_ctlmni, sizeof (int), @@ -351,7 +355,7 @@ extern asmlinkage long sys_sysctl(struct __sysctl_args *args) } /* Like in_group_p, but testing against egid, not fsgid */ -static int in_egroup_p(gid_t grp) +int in_egroup_p(gid_t grp) { if (grp 
!= current->egid) { int i = current->ngroups; @@ -584,13 +588,12 @@ static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root continue; } - /* Don't unregoster proc entries that are still being used.. */ + /* Don't unregister proc entries that are still being used.. */ if (de->count) continue; - proc_unregister(root, de->low_ino); table->de = NULL; - kfree(de); + remove_proc_entry(table->procname, root); } } diff --git a/kernel/timer.c b/kernel/timer.c new file mode 100644 index 000000000..fccf7faa7 --- /dev/null +++ b/kernel/timer.c @@ -0,0 +1,791 @@ +/* + * linux/kernel/timer.c + * + * Kernel internal timers, kernel timekeeping, basic process system calls + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. + * + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to + * serialize accesses to xtime/lost_ticks). + * Copyright (C) 1998 Andrea Arcangeli + * 1999-03-10 Improved NTP compatibility by Ulrich Windl + */ + +#include <linux/mm.h> +#include <linux/timex.h> +#include <linux/delay.h> +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> + +#include <asm/uaccess.h> + +/* + * Timekeeping variables + */ + +long tick = (1000000 + HZ/2) / HZ; /* timer interrupt period */ + +/* The current time */ +volatile struct timeval xtime __attribute__ ((aligned (16))); + +/* Don't completely fail for HZ > 500. */ +int tickadj = 500/HZ ? : 1; /* microsecs */ + +DECLARE_TASK_QUEUE(tq_timer); +DECLARE_TASK_QUEUE(tq_immediate); +DECLARE_TASK_QUEUE(tq_scheduler); + +/* + * phase-lock loop variables + */ +/* TIME_ERROR prevents overwriting the CMOS clock */ +int time_state = TIME_OK; /* clock synchronization status */ +int time_status = STA_UNSYNC; /* clock status bits */ +long time_offset = 0; /* time adjustment (us) */ +long time_constant = 2; /* pll time constant */ +long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ +long time_precision = 1; /* clock precision (us) */ +long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ +long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ +long time_phase = 0; /* phase offset (scaled us) */ +long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC; + /* frequency offset (scaled ppm)*/ +long time_adj = 0; /* tick adjust (scaled 1 / HZ) */ +long time_reftime = 0; /* time at last adjustment (s) */ + +long time_adjust = 0; +long time_adjust_step = 0; + +unsigned long event = 0; + +extern int do_setitimer(int, struct itimerval *, struct itimerval *); + +unsigned long volatile jiffies = 0; + +unsigned int * prof_buffer = NULL; +unsigned long prof_len = 0; +unsigned long prof_shift = 0; + +/* + * Event timer code + */ +#define TVN_BITS 6 +#define TVR_BITS 8 +#define TVN_SIZE (1 << TVN_BITS) +#define TVR_SIZE (1 << TVR_BITS) +#define TVN_MASK (TVN_SIZE - 1) +#define TVR_MASK (TVR_SIZE - 1) + +struct timer_vec { + int index; + struct timer_list *vec[TVN_SIZE]; +}; + +struct timer_vec_root { + int index; + struct timer_list *vec[TVR_SIZE]; +}; + +static struct timer_vec tv5 = { 0 }; +static struct timer_vec tv4 = { 0 }; +static struct timer_vec tv3 = { 0 }; +static struct timer_vec tv2 = { 0 }; +static struct timer_vec_root tv1 = { 0 }; + +static struct timer_vec * const tvecs[] = { + (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5 +}; + +#define 
NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0])) + +static unsigned long timer_jiffies = 0; + +static inline void insert_timer(struct timer_list *timer, + struct timer_list **vec, int idx) +{ + if ((timer->next = vec[idx])) + vec[idx]->prev = timer; + vec[idx] = timer; + timer->prev = (struct timer_list *)&vec[idx]; +} + +static inline void internal_add_timer(struct timer_list *timer) +{ + /* + * must be cli-ed when calling this + */ + unsigned long expires = timer->expires; + unsigned long idx = expires - timer_jiffies; + + if (idx < TVR_SIZE) { + int i = expires & TVR_MASK; + insert_timer(timer, tv1.vec, i); + } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { + int i = (expires >> TVR_BITS) & TVN_MASK; + insert_timer(timer, tv2.vec, i); + } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; + insert_timer(timer, tv3.vec, i); + } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; + insert_timer(timer, tv4.vec, i); + } else if ((signed long) idx < 0) { + /* can happen if you add a timer with expires == jiffies, + * or you set a timer to go off in the past + */ + insert_timer(timer, tv1.vec, tv1.index); + } else if (idx <= 0xffffffffUL) { + int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; + insert_timer(timer, tv5.vec, i); + } else { + /* Can only get here on architectures with 64-bit jiffies */ + timer->next = timer->prev = timer; + } +} + +spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED; + +void add_timer(struct timer_list *timer) +{ + unsigned long flags; + + spin_lock_irqsave(&timerlist_lock, flags); + if (timer->prev) + goto bug; + internal_add_timer(timer); +out: + spin_unlock_irqrestore(&timerlist_lock, flags); + return; + +bug: + printk("bug: kernel timer added twice at %p.\n", + __builtin_return_address(0)); + goto out; +} + +static inline int detach_timer(struct timer_list *timer) +{ + struct timer_list *prev = timer->prev; + if (prev) { + struct timer_list *next = timer->next; + prev->next = next; + if (next) + next->prev = prev; + return 1; + } + return 0; +} + +void mod_timer(struct timer_list *timer, unsigned long expires) +{ + unsigned long flags; + + spin_lock_irqsave(&timerlist_lock, flags); + timer->expires = expires; + detach_timer(timer); + internal_add_timer(timer); + spin_unlock_irqrestore(&timerlist_lock, flags); +} + +int del_timer(struct timer_list * timer) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&timerlist_lock, flags); + ret = detach_timer(timer); + timer->next = timer->prev = 0; + spin_unlock_irqrestore(&timerlist_lock, flags); + return ret; +} + +static inline void cascade_timers(struct timer_vec *tv) +{ + /* cascade all the timers from tv up one level */ + struct timer_list *timer; + timer = tv->vec[tv->index]; + /* + * We are removing _all_ timers from the list, so we don't have to + * detach them individually, just clear the list afterwards. 
+ */ + while (timer) { + struct timer_list *tmp = timer; + timer = timer->next; + internal_add_timer(tmp); + } + tv->vec[tv->index] = NULL; + tv->index = (tv->index + 1) & TVN_MASK; +} + +static inline void run_timer_list(void) +{ + spin_lock_irq(&timerlist_lock); + while ((long)(jiffies - timer_jiffies) >= 0) { + struct timer_list *timer; + if (!tv1.index) { + int n = 1; + do { + cascade_timers(tvecs[n]); + } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS); + } + while ((timer = tv1.vec[tv1.index])) { + void (*fn)(unsigned long) = timer->function; + unsigned long data = timer->data; + detach_timer(timer); + timer->next = timer->prev = NULL; + spin_unlock_irq(&timerlist_lock); + fn(data); + spin_lock_irq(&timerlist_lock); + } + ++timer_jiffies; + tv1.index = (tv1.index + 1) & TVR_MASK; + } + spin_unlock_irq(&timerlist_lock); +} + + +static inline void run_old_timers(void) +{ + struct timer_struct *tp; + unsigned long mask; + + for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) { + if (mask > timer_active) + break; + if (!(mask & timer_active)) + continue; + if (time_after(tp->expires, jiffies)) + continue; + timer_active &= ~mask; + tp->fn(); + sti(); + } +} + +spinlock_t tqueue_lock = SPIN_LOCK_UNLOCKED; + +void tqueue_bh(void) +{ + run_task_queue(&tq_timer); +} + +void immediate_bh(void) +{ + run_task_queue(&tq_immediate); +} + +unsigned long timer_active = 0; +struct timer_struct timer_table[32]; + +/* + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + * + */ +static void second_overflow(void) +{ + long ltemp; + + /* Bump the maxerror field */ + time_maxerror += time_tolerance >> SHIFT_USEC; + if ( time_maxerror > NTP_PHASE_LIMIT ) { + time_maxerror = NTP_PHASE_LIMIT; + time_status |= STA_UNSYNC; + } + + /* + * Leap second processing. If in leap-insert state at + * the end of the day, the system clock is set back one + * second; if in leap-delete state, the system clock is + * set ahead one second. The microtime() routine or + * external clock driver will insure that reported time + * is always monotonic. The ugly divides should be + * replaced. + */ + switch (time_state) { + + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + + case TIME_INS: + if (xtime.tv_sec % 86400 == 0) { + xtime.tv_sec--; + time_state = TIME_OOP; + printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); + } + break; + + case TIME_DEL: + if ((xtime.tv_sec + 1) % 86400 == 0) { + xtime.tv_sec++; + time_state = TIME_WAIT; + printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); + } + break; + + case TIME_OOP: + time_state = TIME_WAIT; + break; + + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + } + + /* + * Compute the phase adjustment for the next second. In + * PLL mode, the offset is reduced by a fixed factor + * times the time constant. In FLL mode the offset is + * used directly. In either mode, the maximum phase + * adjustment for each second is clamped so as to spread + * the adjustment over not more than the number of + * seconds between updates. 
+ */ + if (time_offset < 0) { + ltemp = -time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + time_offset += ltemp; + time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); + } else { + ltemp = time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + time_offset -= ltemp; + time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); + } + + /* + * Compute the frequency estimate and additional phase + * adjustment due to frequency error for the next + * second. When the PPS signal is engaged, gnaw on the + * watchdog counter and update the frequency computed by + * the pll and the PPS signal. + */ + pps_valid++; + if (pps_valid == PPS_VALID) { /* PPS signal lost */ + pps_jitter = MAXTIME; + pps_stabil = MAXFREQ; + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + } + ltemp = time_freq + pps_freq; + if (ltemp < 0) + time_adj -= -ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + else + time_adj += ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + +#if HZ == 100 + /* Compensate for (HZ==100) != (1 << SHIFT_HZ). + * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) + */ + if (time_adj < 0) + time_adj -= (-time_adj >> 2) + (-time_adj >> 5); + else + time_adj += (time_adj >> 2) + (time_adj >> 5); +#endif +} + +/* in the NTP reference this is called "hardclock()" */ +static void update_wall_time_one_tick(void) +{ + if ( (time_adjust_step = time_adjust) != 0 ) { + /* We are doing an adjtime thing. + * + * Prepare time_adjust_step to be within bounds. + * Note that a positive time_adjust means we want the clock + * to run faster. + * + * Limit the amount of the step to be in the range + * -tickadj .. +tickadj + */ + if (time_adjust > tickadj) + time_adjust_step = tickadj; + else if (time_adjust < -tickadj) + time_adjust_step = -tickadj; + + /* Reduce by this step the amount of time left */ + time_adjust -= time_adjust_step; + } + xtime.tv_usec += tick + time_adjust_step; + /* + * Advance the phase, once it gets to one microsecond, then + * advance the tick more. + */ + time_phase += time_adj; + if (time_phase <= -FINEUSEC) { + long ltemp = -time_phase >> SHIFT_SCALE; + time_phase += ltemp << SHIFT_SCALE; + xtime.tv_usec -= ltemp; + } + else if (time_phase >= FINEUSEC) { + long ltemp = time_phase >> SHIFT_SCALE; + time_phase -= ltemp << SHIFT_SCALE; + xtime.tv_usec += ltemp; + } +} + +/* + * Using a loop looks inefficient, but "ticks" is + * usually just one (we shouldn't be losing ticks, + * we're doing this this way mainly for interrupt + * latency reasons, not because we think we'll + * have lots of lost timer ticks + */ +static void update_wall_time(unsigned long ticks) +{ + do { + ticks--; + update_wall_time_one_tick(); + } while (ticks); + + if (xtime.tv_usec >= 1000000) { + xtime.tv_usec -= 1000000; + xtime.tv_sec++; + second_overflow(); + } +} + +static inline void do_process_times(struct task_struct *p, + unsigned long user, unsigned long system) +{ + unsigned long psecs; + + psecs = (p->times.tms_utime += user); + psecs += (p->times.tms_stime += system); + if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) { + /* Send SIGXCPU every second.. */ + if (!(psecs % HZ)) + send_sig(SIGXCPU, p, 1); + /* and SIGKILL when we go over max.. 
*/ + if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max) + send_sig(SIGKILL, p, 1); + } +} + +static inline void do_it_virt(struct task_struct * p, unsigned long ticks) +{ + unsigned long it_virt = p->it_virt_value; + + if (it_virt) { + if (it_virt <= ticks) { + it_virt = ticks + p->it_virt_incr; + send_sig(SIGVTALRM, p, 1); + } + p->it_virt_value = it_virt - ticks; + } +} + +static inline void do_it_prof(struct task_struct * p, unsigned long ticks) +{ + unsigned long it_prof = p->it_prof_value; + + if (it_prof) { + if (it_prof <= ticks) { + it_prof = ticks + p->it_prof_incr; + send_sig(SIGPROF, p, 1); + } + p->it_prof_value = it_prof - ticks; + } +} + +void update_one_process(struct task_struct *p, + unsigned long ticks, unsigned long user, unsigned long system, int cpu) +{ + p->per_cpu_utime[cpu] += user; + p->per_cpu_stime[cpu] += system; + do_process_times(p, user, system); + do_it_virt(p, user); + do_it_prof(p, ticks); +} + +static void update_process_times(unsigned long ticks, unsigned long system) +{ +/* + * SMP does this on a per-CPU basis elsewhere + */ +#ifndef __SMP__ + struct task_struct * p = current; + unsigned long user = ticks - system; + if (p->pid) { + p->counter -= ticks; + if (p->counter <= 0) { + p->counter = 0; + p->need_resched = 1; + } + if (p->priority < DEF_PRIORITY) + kstat.cpu_nice += user; + else + kstat.cpu_user += user; + kstat.cpu_system += system; + } + update_one_process(p, ticks, user, system, 0); +#endif +} + +/* + * Nr of active tasks - counted in fixed-point numbers + */ +static unsigned long count_active_tasks(void) +{ + struct task_struct *p; + unsigned long nr = 0; + + read_lock(&tasklist_lock); + for_each_task(p) { + if ((p->state == TASK_RUNNING || + (p->state & TASK_UNINTERRUPTIBLE) || + (p->state & TASK_SWAPPING))) + nr += FIXED_1; + } + read_unlock(&tasklist_lock); + return nr; +} + +/* + * Hmm.. Changed this, as the GNU make sources (load.c) seems to + * imply that avenrun[] is the standard name for this kind of thing. + * Nothing else seems to be standardized: the fractional size etc + * all seem to differ on different machines. + */ +unsigned long avenrun[3] = { 0,0,0 }; + +static inline void calc_load(unsigned long ticks) +{ + unsigned long active_tasks; /* fixed-point */ + static int count = LOAD_FREQ; + + count -= ticks; + if (count < 0) { + count += LOAD_FREQ; + active_tasks = count_active_tasks(); + CALC_LOAD(avenrun[0], EXP_1, active_tasks); + CALC_LOAD(avenrun[1], EXP_5, active_tasks); + CALC_LOAD(avenrun[2], EXP_15, active_tasks); + } +} + +volatile unsigned long lost_ticks = 0; +static unsigned long lost_ticks_system = 0; + +/* + * This spinlock protect us from races in SMP while playing with xtime. -arca + */ +rwlock_t xtime_lock = RW_LOCK_UNLOCKED; + +static inline void update_times(void) +{ + unsigned long ticks; + + /* + * update_times() is run from the raw timer_bh handler so we + * just know that the irqs are locally enabled and so we don't + * need to save/restore the flags of the local CPU here. 
-arca + */ + write_lock_irq(&xtime_lock); + + ticks = lost_ticks; + lost_ticks = 0; + + if (ticks) { + unsigned long system; + system = xchg(&lost_ticks_system, 0); + + calc_load(ticks); + update_wall_time(ticks); + write_unlock_irq(&xtime_lock); + + update_process_times(ticks, system); + + } else + write_unlock_irq(&xtime_lock); +} + +void timer_bh(void) +{ + update_times(); + run_old_timers(); + run_timer_list(); +} + +void do_timer(struct pt_regs * regs) +{ + (*(unsigned long *)&jiffies)++; + lost_ticks++; + mark_bh(TIMER_BH); + if (!user_mode(regs)) + lost_ticks_system++; + if (tq_timer) + mark_bh(TQUEUE_BH); +} + +#if !defined(__alpha__) && !defined(__ia64__) + +/* + * For backwards compatibility? This can be done in libc so Alpha + * and all newer ports shouldn't need it. + */ +asmlinkage unsigned long sys_alarm(unsigned int seconds) +{ + struct itimerval it_new, it_old; + unsigned int oldalarm; + + it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; + it_new.it_value.tv_sec = seconds; + it_new.it_value.tv_usec = 0; + do_setitimer(ITIMER_REAL, &it_new, &it_old); + oldalarm = it_old.it_value.tv_sec; + /* ehhh.. We can't return 0 if we have an alarm pending.. */ + /* And we'd better return too much than too little anyway */ + if (it_old.it_value.tv_usec) + oldalarm++; + return oldalarm; +} + +#endif + +#ifndef __alpha__ + +/* + * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this + * should be moved into arch/i386 instead? + */ + +asmlinkage long sys_getpid(void) +{ + /* This is SMP safe - current->pid doesn't change */ + return current->pid; +} + +/* + * This is not strictly SMP safe: p_opptr could change + * from under us. However, rather than getting any lock + * we can use an optimistic algorithm: get the parent + * pid, and go back and check that the parent is still + * the same. If it has changed (which is extremely unlikely + * indeed), we just try again.. + * + * NOTE! This depends on the fact that even if we _do_ + * get an old value of "parent", we can happily dereference + * the pointer: we just can't necessarily trust the result + * until we know that the parent pointer is valid. + * + * The "mb()" macro is a memory barrier - a synchronizing + * event. It also makes sure that gcc doesn't optimize + * away the necessary memory references.. The barrier doesn't + * have to have all that strong semantics: on x86 we don't + * really require a synchronizing instruction, for example. + * The barrier is more important for code generation than + * for any real memory ordering semantics (even if there is + * a small window for a race, using the old pointer is + * harmless for a while). 
+ */
+asmlinkage long sys_getppid(void)
+{
+	int pid;
+	struct task_struct * me = current;
+	struct task_struct * parent;
+
+	parent = me->p_opptr;
+	for (;;) {
+		pid = parent->pid;
+#if __SMP__
+{
+		struct task_struct *old = parent;
+		mb();
+		parent = me->p_opptr;
+		if (old != parent)
+			continue;
+}
+#endif
+		break;
+	}
+	return pid;
+}
+
+asmlinkage long sys_getuid(void)
+{
+	/* Only we change this so SMP safe */
+	return current->uid;
+}
+
+asmlinkage long sys_geteuid(void)
+{
+	/* Only we change this so SMP safe */
+	return current->euid;
+}
+
+asmlinkage long sys_getgid(void)
+{
+	/* Only we change this so SMP safe */
+	return current->gid;
+}
+
+asmlinkage long sys_getegid(void)
+{
+	/* Only we change this so SMP safe */
+	return current->egid;
+}
+
+#endif
+
+asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
+{
+	struct timespec t;
+	unsigned long expire;
+
+	if(copy_from_user(&t, rqtp, sizeof(struct timespec)))
+		return -EFAULT;
+
+	if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0)
+		return -EINVAL;
+
+
+	if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
+	    current->policy != SCHED_OTHER)
+	{
+		/*
+		 * Short delay requests up to 2 ms will be handled with
+		 * high precision by a busy wait for all real-time processes.
+		 *
+		 * Its important on SMP not to do this holding locks.
+		 */
+		udelay((t.tv_nsec + 999) / 1000);
+		return 0;
+	}
+
+	expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
+
+	current->state = TASK_INTERRUPTIBLE;
+	expire = schedule_timeout(expire);
+
+	if (expire) {
+		if (rmtp) {
+			jiffies_to_timespec(expire, &t);
+			if (copy_to_user(rmtp, &t, sizeof(struct timespec)))
+				return -EFAULT;
+		}
+		return -EINTR;
+	}
+	return 0;
+}
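
The timer wheel moved into kernel/timer.c above is driven through add_timer(), mod_timer() and del_timer(). The following is a minimal sketch of a periodic kernel timer against this 2.3-era interface; my_timer, my_timeout and my_start are hypothetical names, and init_timer() is assumed from the <linux/timer.h> of the same vintage. It is an illustration, not part of the patch.

	/* Illustrative sketch only; not from this commit. */
	#include <linux/timer.h>
	#include <linux/sched.h>	/* jiffies, HZ */

	static struct timer_list my_timer;	/* hypothetical */

	static void my_timeout(unsigned long data)
	{
		/* Runs from timer_bh via run_timer_list(); the timer is
		 * already detached here, so re-arming it is safe. */
		mod_timer(&my_timer, jiffies + HZ);	/* again in 1s */
	}

	static void my_start(unsigned long cookie)
	{
		init_timer(&my_timer);			/* clear the links */
		my_timer.expires = jiffies + HZ;	/* first shot in 1s */
		my_timer.data = cookie;			/* handler argument */
		my_timer.function = my_timeout;
		add_timer(&my_timer);
	}

	static void my_stop(void)
	{
		del_timer(&my_timer);	/* 1 if it was still pending */
	}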
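schedule_timeout(), which stays behind in kernel/sched.c, is the building block sys_nanosleep() uses above: set the task state first, call it, and treat a non-zero return as the jiffies left after an early wakeup. A sketch of that calling pattern, assuming only what the patch itself shows:

	/* Illustrative sketch only; not from this commit. */
	#include <linux/kernel.h>
	#include <linux/sched.h>

	static void wait_two_seconds(void)
	{
		signed long left;

		current->state = TASK_INTERRUPTIBLE;	/* must precede the call */
		left = schedule_timeout(2 * HZ);	/* sleep up to 2 seconds */
		if (left)
			printk(KERN_INFO "woken %ld jiffies early\n", left);
	}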
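Finally, the kernel/resource.c hunk above gives allocate_resource() an alignf/alignf_data callback pair, letting the caller shift a candidate range before the size check is made. A hypothetical user that keeps I/O port allocations out of the legacy ISA range might look like the sketch below; my_align, my_claim_ports and the 0x400 floor are illustrative assumptions, only the allocate_resource() signature comes from the patch.

	/* Illustrative sketch only; not from this commit. */
	#include <linux/ioport.h>

	static void my_align(void *data, struct resource *res,
			     unsigned long size)
	{
		if (res->start < 0x400)	/* skip legacy ISA ports */
			res->start = 0x400;
	}

	static int my_claim_ports(struct resource *res)
	{
		res->name = "my card";		/* hypothetical device */
		res->flags = IORESOURCE_IO;
		return allocate_resource(&ioport_resource, res, 0x20,
					 0x0000, 0xffff, 0x20,
					 my_align, NULL);
	}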