From beb116954b9b7f3bb56412b2494b562f02b864b1 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 7 Jan 1997 02:33:00 +0000 Subject: Import of Linux/MIPS 2.1.14 --- kernel/Makefile | 37 +- kernel/dma.c | 8 +- kernel/exec_domain.c | 21 + kernel/exit.c | 293 +++++++--- kernel/fork.c | 289 ++++++---- kernel/info.c | 16 +- kernel/itimer.c | 88 ++- kernel/ksyms.c | 281 +++++----- kernel/ksyms.ver | 194 +++++++ kernel/module.c | 365 ++++++------ kernel/panic.c | 32 ++ kernel/printk.c | 38 +- kernel/resource.c | 9 - kernel/sched.c | 1492 ++++++++++++++++++++++++++++++++++++-------------- kernel/signal.c | 97 ++-- kernel/softirq.c | 15 +- kernel/sys.c | 606 ++++++++++++-------- kernel/sysctl.c | 922 +++++++++++++++++++++++++++++++ kernel/time.c | 493 ++++++----------- 19 files changed, 3704 insertions(+), 1592 deletions(-) create mode 100644 kernel/sysctl.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index fd73ad5f0..9586d067f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,40 +9,17 @@ .S.s: $(CPP) -traditional $< -o $*.s -.c.s: - $(CC) $(CFLAGS) -S $< -.s.o: - $(AS) -o $*.o $< -.c.o: - $(CC) $(CFLAGS) -c $< -OBJS = sched.o dma.o fork.o exec_domain.o panic.o printk.o sys.o \ +O_TARGET := kernel.o +O_OBJS = sched.o dma.o fork.o exec_domain.o panic.o printk.o sys.o \ module.o exit.o signal.o itimer.o info.o time.o softirq.o \ - resource.o + resource.o sysctl.o -SYMTAB_OBJS = ksyms.o - -all: kernel.o - -include ../versions.mk +ifeq ($(CONFIG_MODULES),y) +OX_OBJS = ksyms.o +endif -kernel.o: $(SYMTAB_OBJS) $(OBJS) - $(LD) -r -o kernel.o $(SYMTAB_OBJS) $(OBJS) - sync +include $(TOPDIR)/Rules.make sched.o: sched.c $(CC) $(CFLAGS) $(PROFILING) -fno-omit-frame-pointer -c $< - -dep: - $(CPP) -M *.c > .depend - -dummy: -modules: - -# -# include a dependency file if one exists -# -ifeq (.depend,$(wildcard .depend)) -include .depend -endif - diff --git a/kernel/dma.c b/kernel/dma.c index 94b121653..0f13e6627 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -40,7 +40,7 @@ struct dma_chan { int lock; - char *device_id; + const char *device_id; }; static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = { @@ -69,12 +69,12 @@ int get_dma_list(char *buf) } /* get_dma_list */ -int request_dma(unsigned int dmanr, char * device_id) +int request_dma(unsigned int dmanr, const char * device_id) { if (dmanr >= MAX_DMA_CHANNELS) return -EINVAL; - if (xchg_u32(&dma_chan_busy[dmanr].lock, 1) != 0) + if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0) return -EBUSY; dma_chan_busy[dmanr].device_id = device_id; @@ -91,7 +91,7 @@ void free_dma(unsigned int dmanr) return; } - if (xchg_u32(&dma_chan_busy[dmanr].lock, 0) == 0) { + if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) { printk("Trying to free free DMA%d\n", dmanr); return; } diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 7f0114a46..9a202359a 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -28,6 +28,27 @@ static struct exec_domain *exec_domains = &default_exec_domain; static asmlinkage void no_lcall7(struct pt_regs * regs) { + + /* + * This may have been a static linked SVr4 binary, so we would have the + * personality set incorrectly. Check to see whether SVr4 is available, + * and use it, otherwise give the user a SEGV. + */ + if (current->exec_domain && current->exec_domain->use_count) + (*current->exec_domain->use_count)--; + + current->personality = PER_SVR4; + current->exec_domain = lookup_exec_domain(current->personality); + + if (current->exec_domain && current->exec_domain->use_count) + (*current->exec_domain->use_count)++; + + if (current->exec_domain && current->exec_domain->handler + && current->exec_domain->handler != no_lcall7) { + current->exec_domain->handler(regs); + return; + } + send_sig(SIGSEGV, current, 1); } diff --git a/kernel/exit.c b/kernel/exit.c index 59c0b075b..d2fdbdc4a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -4,7 +4,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ -#define DEBUG_PROC_TREE +#undef DEBUG_PROC_TREE #include #include @@ -15,56 +15,85 @@ #include #include #include +#include + +#include +#include -#include extern void sem_exit (void); +extern void acct_process (long exitcode); +extern void kerneld_exit(void); int getrusage(struct task_struct *, int, struct rusage *); -static int generate(unsigned long sig, struct task_struct * p) +static inline void generate(unsigned long sig, struct task_struct * p) { unsigned long mask = 1 << (sig-1); - struct sigaction * sa = sig + p->sigaction - 1; + struct sigaction * sa = sig + p->sig->action - 1; - /* always generate signals for traced processes ??? */ - if (p->flags & PF_PTRACED) { - p->signal |= mask; - return 1; + /* + * Optimize away the signal, if it's a signal that can + * be handled immediately (ie non-blocked and untraced) + * and that is ignored (either explicitly or by default) + */ + if (!(mask & p->blocked) && !(p->flags & PF_PTRACED)) { + /* don't bother with ignored signals (but SIGCHLD is special) */ + if (sa->sa_handler == SIG_IGN && sig != SIGCHLD) + return; + /* some signals are ignored by default.. (but SIGCONT already did its deed) */ + if ((sa->sa_handler == SIG_DFL) && + (sig == SIGCONT || sig == SIGCHLD || sig == SIGWINCH || sig == SIGURG)) + return; } - /* don't bother with ignored signals (but SIGCHLD is special) */ - if (sa->sa_handler == SIG_IGN && sig != SIGCHLD) - return 0; - /* some signals are ignored by default.. (but SIGCONT already did its deed) */ - if ((sa->sa_handler == SIG_DFL) && - (sig == SIGCONT || sig == SIGCHLD || sig == SIGWINCH)) - return 0; p->signal |= mask; - return 1; + if (p->state == TASK_INTERRUPTIBLE && (p->signal & ~p->blocked)) + wake_up_process(p); } +/* + * Force a signal that the process can't ignore: if necessary + * we unblock the signal and change any SIG_IGN to SIG_DFL. + */ +void force_sig(unsigned long sig, struct task_struct * p) +{ + sig--; + if (p->sig) { + unsigned long mask = 1UL << sig; + struct sigaction *sa = p->sig->action + sig; + p->signal |= mask; + p->blocked &= ~mask; + if (sa->sa_handler == SIG_IGN) + sa->sa_handler = SIG_DFL; + if (p->state == TASK_INTERRUPTIBLE) + wake_up_process(p); + } +} + + int send_sig(unsigned long sig,struct task_struct * p,int priv) { if (!p || sig > 32) return -EINVAL; if (!priv && ((sig != SIGCONT) || (current->session != p->session)) && - (current->euid != p->euid) && (current->euid != p->uid) && !suser()) + (current->euid ^ p->suid) && (current->euid ^ p->uid) && + (current->uid ^ p->suid) && (current->uid ^ p->uid) && + !suser()) return -EPERM; if (!sig) return 0; /* * Forget it if the process is already zombie'd. */ - if (p->state == TASK_ZOMBIE) + if (!p->sig) return 0; if ((sig == SIGKILL) || (sig == SIGCONT)) { if (p->state == TASK_STOPPED) - p->state = TASK_RUNNING; + wake_up_process(p); p->exit_code = 0; p->signal &= ~( (1<<(SIGSTOP-1)) | (1<<(SIGTSTP-1)) | (1<<(SIGTTIN-1)) | (1<<(SIGTTOU-1)) ); } - /* Depends on order SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU */ - if ((sig >= SIGSTOP) && (sig <= SIGTTOU)) + if (sig == SIGSTOP || sig == SIGTSTP || sig == SIGTTIN || sig == SIGTTOU) p->signal &= ~(1<<(SIGCONT-1)); /* Actually generate the signal */ generate(sig,p); @@ -73,7 +102,7 @@ int send_sig(unsigned long sig,struct task_struct * p,int priv) void notify_parent(struct task_struct * tsk) { - if (tsk->p_pptr == task[1]) + if (tsk->p_pptr == task[smp_num_cpus]) /* Init */ tsk->exit_signal = SIGCHLD; send_sig(tsk->exit_signal, tsk->p_pptr, 1); wake_up_interruptible(&tsk->p_pptr->wait_chldexit); @@ -94,10 +123,14 @@ void release(struct task_struct * p) nr_tasks--; task[i] = NULL; REMOVE_LINKS(p); + release_thread(p); if (STACK_MAGIC != *(unsigned long *)p->kernel_stack_page) printk(KERN_ALERT "release: %s kernel stack corruption. Aiee\n", p->comm); - free_page(p->kernel_stack_page); - free_page((long) p); + free_kernel_stack(p->kernel_stack_page); + current->cmin_flt += p->min_flt + p->cmin_flt; + current->cmaj_flt += p->maj_flt + p->cmaj_flt; + current->cnswap += p->nswap + p->cnswap; + kfree(p); return; } panic("trying to release non-existent task"); @@ -313,12 +346,12 @@ asmlinkage int sys_kill(int pid,int sig) * * "I ask you, have you ever known what it is to be an orphan?" */ -int is_orphaned_pgrp(int pgrp) +static int will_become_orphaned_pgrp(int pgrp, struct task_struct * ignored_task) { struct task_struct *p; for_each_task(p) { - if ((p->pgrp != pgrp) || + if ((p == ignored_task) || (p->pgrp != pgrp) || (p->state == TASK_ZOMBIE) || (p->p_pptr->pid == 1)) continue; @@ -329,7 +362,12 @@ int is_orphaned_pgrp(int pgrp) return(1); /* (sighing) "Often!" */ } -static int has_stopped_jobs(int pgrp) +int is_orphaned_pgrp(int pgrp) +{ + return will_become_orphaned_pgrp(pgrp, 0); +} + +static inline int has_stopped_jobs(int pgrp) { struct task_struct * p; @@ -342,65 +380,140 @@ static int has_stopped_jobs(int pgrp) return(0); } -static void forget_original_parent(struct task_struct * father) +static inline void forget_original_parent(struct task_struct * father) { struct task_struct * p; for_each_task(p) { if (p->p_opptr == father) - if (task[1]) - p->p_opptr = task[1]; + if (task[smp_num_cpus]) /* init */ + p->p_opptr = task[smp_num_cpus]; else p->p_opptr = task[0]; } } -static void exit_files(void) +static inline void close_files(struct files_struct * files) { - int i; + int i, j; + + j = 0; + for (;;) { + unsigned long set = files->open_fds.fds_bits[j]; + i = j * __NFDBITS; + j++; + if (i >= NR_OPEN) + break; + while (set) { + if (set & 1) + close_fp(files->fd[i]); + i++; + set >>= 1; + } + } +} - for (i=0 ; ifiles->fd[i]) - sys_close(i); +static inline void __exit_files(struct task_struct *tsk) +{ + struct files_struct * files = tsk->files; + + if (files) { + tsk->files = NULL; + if (!--files->count) { + close_files(files); + kfree(files); + } + } } -static void exit_fs(void) +void exit_files(struct task_struct *tsk) { - iput(current->fs->pwd); - current->fs->pwd = NULL; - iput(current->fs->root); - current->fs->root = NULL; + __exit_files(tsk); } -NORET_TYPE void do_exit(long code) +static inline void __exit_fs(struct task_struct *tsk) { - struct task_struct *p; + struct fs_struct * fs = tsk->fs; + + if (fs) { + tsk->fs = NULL; + if (!--fs->count) { + iput(fs->root); + iput(fs->pwd); + kfree(fs); + } + } +} - if (intr_count) { - printk("Aiee, killing interrupt handler\n"); - intr_count = 0; +void exit_fs(struct task_struct *tsk) +{ + __exit_fs(tsk); +} + +static inline void __exit_sighand(struct task_struct *tsk) +{ + struct signal_struct * sig = tsk->sig; + + if (sig) { + tsk->sig = NULL; + if (!--sig->count) { + kfree(sig); + } } -fake_volatile: - current->flags |= PF_EXITING; - sem_exit(); - exit_mmap(current); - free_page_tables(current); - exit_files(); - exit_fs(); - exit_thread(); +} + +void exit_sighand(struct task_struct *tsk) +{ + __exit_sighand(tsk); +} + +static inline void __exit_mm(struct task_struct * tsk) +{ + struct mm_struct * mm = tsk->mm; + + /* Set us up to use the kernel mm state */ + if (mm != &init_mm) { + flush_cache_mm(mm); + flush_tlb_mm(mm); + tsk->mm = &init_mm; + tsk->swappable = 0; + SET_PAGE_DIR(tsk, swapper_pg_dir); + + /* free the old state - not used any more */ + if (!--mm->count) { + exit_mmap(mm); + free_page_tables(mm); + kfree(mm); + } + } +} + +void exit_mm(struct task_struct *tsk) +{ + __exit_mm(tsk); +} + +/* + * Send signals to all our closest relatives so that they know + * to properly mourn us.. + */ +static void exit_notify(void) +{ + struct task_struct * p; + forget_original_parent(current); /* * Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGUP and then a SIGCONT. (POSIX 3.2.2.2) + * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) * * Case i: Our father is in a different pgrp than we are * and we were the only connection outside, so our pgrp * is about to become orphaned. - */ + */ if ((current->p_pptr->pgrp != current->pgrp) && (current->p_pptr->session == current->session) && - is_orphaned_pgrp(current->pgrp) && + will_become_orphaned_pgrp(current->pgrp, current) && has_stopped_jobs(current->pgrp)) { kill_pg(current->pgrp,SIGHUP,1); kill_pg(current->pgrp,SIGCONT,1); @@ -420,8 +533,8 @@ fake_volatile: current->p_cptr = p->p_osptr; p->p_ysptr = NULL; p->flags &= ~(PF_PTRACED|PF_TRACESYS); - if (task[1] && task[1] != current) - p->p_pptr = task[1]; + if (task[smp_num_cpus] && task[smp_num_cpus] != current) /* init */ + p->p_pptr = task[smp_num_cpus]; else p->p_pptr = task[0]; p->p_osptr = p->p_pptr->p_cptr; @@ -445,11 +558,28 @@ fake_volatile: } if (current->leader) disassociate_ctty(1); - if (last_task_used_math == current) - last_task_used_math = NULL; +} + +NORET_TYPE void do_exit(long code) +{ + if (intr_count) { + printk("Aiee, killing interrupt handler\n"); + intr_count = 0; + } +fake_volatile: + acct_process(code); + current->flags |= PF_EXITING; + del_timer(¤t->real_timer); + sem_exit(); + kerneld_exit(); + __exit_mm(current); + __exit_files(current); + __exit_fs(current); + __exit_sighand(current); + exit_thread(); current->state = TASK_ZOMBIE; current->exit_code = code; - current->mm->rss = 0; + exit_notify(); #ifdef DEBUG_PROC_TREE audit_ptree(); #endif @@ -486,10 +616,18 @@ asmlinkage int sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct struct task_struct *p; if (stat_addr) { - flag = verify_area(VERIFY_WRITE, stat_addr, 4); + flag = verify_area(VERIFY_WRITE, stat_addr, sizeof(*stat_addr)); + if (flag) + return flag; + } + if (ru) { + flag = verify_area(VERIFY_WRITE, ru, sizeof(*ru)); if (flag) return flag; } + if (options & ~(WNOHANG|WUNTRACED|__WCLONE)) + return -EINVAL; + add_wait_queue(¤t->wait_chldexit,&wait); repeat: flag=0; @@ -514,24 +652,22 @@ repeat: continue; if (!(options & WUNTRACED) && !(p->flags & PF_PTRACED)) continue; + if (ru != NULL) + getrusage(p, RUSAGE_BOTH, ru); if (stat_addr) - put_fs_long((p->exit_code << 8) | 0x7f, + put_user((p->exit_code << 8) | 0x7f, stat_addr); p->exit_code = 0; - if (ru != NULL) - getrusage(p, RUSAGE_BOTH, ru); retval = p->pid; goto end_wait4; case TASK_ZOMBIE: current->cutime += p->utime + p->cutime; current->cstime += p->stime + p->cstime; - current->mm->cmin_flt += p->mm->min_flt + p->mm->cmin_flt; - current->mm->cmaj_flt += p->mm->maj_flt + p->mm->cmaj_flt; if (ru != NULL) getrusage(p, RUSAGE_BOTH, ru); - flag = p->pid; if (stat_addr) - put_fs_long(p->exit_code, stat_addr); + put_user(p->exit_code, stat_addr); + retval = p->pid; if (p->p_opptr != p->p_pptr) { REMOVE_LINKS(p); p->p_pptr = p->p_opptr; @@ -542,7 +678,6 @@ repeat: #ifdef DEBUG_PROC_TREE audit_ptree(); #endif - retval = flag; goto end_wait4; default: continue; @@ -552,12 +687,11 @@ repeat: retval = 0; if (options & WNOHANG) goto end_wait4; - current->state=TASK_INTERRUPTIBLE; - schedule(); - current->signal &= ~(1<<(SIGCHLD-1)); retval = -ERESTARTSYS; if (current->signal & ~current->blocked) goto end_wait4; + current->state=TASK_INTERRUPTIBLE; + schedule(); goto repeat; } retval = -ECHILD; @@ -566,6 +700,8 @@ end_wait4: return retval; } +#ifndef __alpha__ + /* * sys_waitpid() remains for compatibility. waitpid() should be * implemented by calling sys_wait4() from libc.a. @@ -574,3 +710,14 @@ asmlinkage int sys_waitpid(pid_t pid,unsigned int * stat_addr, int options) { return sys_wait4(pid, stat_addr, options, NULL); } + +#endif + +/* + * sys_wait() has been added for compatibility. wait() should be + * implemented by calling sys_wait4() from libc.a. + */ +asmlinkage int sys_wait(unsigned int * stat_addr) +{ + return sys_wait4(-1, stat_addr, 0, NULL); +} diff --git a/kernel/fork.c b/kernel/fork.c index 104ffea96..864bc52e8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -15,87 +15,81 @@ #include #include #include -#include #include #include #include -#include +#include -#include #include +#include +#include int nr_tasks=1; int nr_running=1; -long last_pid=0; +unsigned long int total_forks=0; /* Handle normal Linux uptimes. */ +int last_pid=0; -static int find_empty_process(void) +static inline int find_empty_process(void) { - int free_task; - int i, tasks_free; - int this_user_tasks; + int i; -repeat: - if ((++last_pid) & 0xffff8000) - last_pid=1; - this_user_tasks = 0; - tasks_free = 0; - free_task = -EAGAIN; - i = NR_TASKS; - while (--i > 0) { - if (!task[i]) { - free_task = i; - tasks_free++; - continue; - } - if (task[i]->uid == current->uid) - this_user_tasks++; - if (task[i]->pid == last_pid || task[i]->pgrp == last_pid || - task[i]->session == last_pid) - goto repeat; - } - if (tasks_free <= MIN_TASKS_LEFT_FOR_ROOT || - this_user_tasks > current->rlim[RLIMIT_NPROC].rlim_cur) + if (nr_tasks >= NR_TASKS - MIN_TASKS_LEFT_FOR_ROOT) { if (current->uid) return -EAGAIN; - return free_task; -} + } + if (current->uid) { + long max_tasks = current->rlim[RLIMIT_NPROC].rlim_cur; -static struct file * copy_fd(struct file * old_file) -{ - struct file * new_file = get_empty_filp(); - int error; - - if (new_file) { - memcpy(new_file,old_file,sizeof(struct file)); - new_file->f_count = 1; - if (new_file->f_inode) - new_file->f_inode->i_count++; - if (new_file->f_op && new_file->f_op->open) { - error = new_file->f_op->open(new_file->f_inode,new_file); - if (error) { - iput(new_file->f_inode); - new_file->f_count = 0; - new_file = NULL; + max_tasks--; /* count the new process.. */ + if (max_tasks < nr_tasks) { + struct task_struct *p; + for_each_task (p) { + if (p->uid == current->uid) + if (--max_tasks < 0) + return -EAGAIN; } } } - return new_file; + for (i = 0 ; i < NR_TASKS ; i++) { + if (!task[i]) + return i; + } + return -EAGAIN; +} + +static int get_pid(unsigned long flags) +{ + struct task_struct *p; + + if (flags & CLONE_PID) + return current->pid; +repeat: + if ((++last_pid) & 0xffff8000) + last_pid=1; + for_each_task (p) { + if (p->pid == last_pid || + p->pgrp == last_pid || + p->session == last_pid) + goto repeat; + } + return last_pid; } -static int dup_mmap(struct task_struct * tsk) +static inline int dup_mmap(struct mm_struct * mm) { struct vm_area_struct * mpnt, **p, *tmp; - tsk->mm->mmap = NULL; - p = &tsk->mm->mmap; + mm->mmap = NULL; + p = &mm->mmap; for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { tmp = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); if (!tmp) { - exit_mmap(tsk); + exit_mmap(mm); return -ENOMEM; } *tmp = *mpnt; - tmp->vm_task = tsk; + tmp->vm_flags &= ~VM_LOCKED; + tmp->vm_mm = mm; tmp->vm_next = NULL; if (tmp->vm_inode) { tmp->vm_inode->i_count++; @@ -104,59 +98,109 @@ static int dup_mmap(struct task_struct * tsk) mpnt->vm_next_share = tmp; tmp->vm_prev_share = mpnt; } + if (copy_page_range(mm, current->mm, tmp)) { + exit_mmap(mm); + return -ENOMEM; + } if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); *p = tmp; p = &tmp->vm_next; } - build_mmap_avl(tsk); + build_mmap_avl(mm); return 0; } -/* - * SHAREFD not yet implemented.. - */ -static void copy_files(unsigned long clone_flags, struct task_struct * p) +static inline int copy_mm(unsigned long clone_flags, struct task_struct * tsk) { - int i; - struct file * f; - - if (clone_flags & COPYFD) { - for (i=0; ifiles->fd[i]) != NULL) - p->files->fd[i] = copy_fd(f); - } else { - for (i=0; ifiles->fd[i]) != NULL) - f->f_count++; + if (!(clone_flags & CLONE_VM)) { + struct mm_struct * mm = kmalloc(sizeof(*tsk->mm), GFP_KERNEL); + if (!mm) + return -1; + *mm = *current->mm; + mm->count = 1; + mm->def_flags = 0; + tsk->mm = mm; + tsk->min_flt = tsk->maj_flt = 0; + tsk->cmin_flt = tsk->cmaj_flt = 0; + tsk->nswap = tsk->cnswap = 0; + if (new_page_tables(tsk)) + return -1; + if (dup_mmap(mm)) { + free_page_tables(mm); + return -1; + } + return 0; } + SET_PAGE_DIR(tsk, current->mm->pgd); + current->mm->count++; + return 0; } -/* - * CLONEVM not yet correctly implemented: needs to clone the mmap - * instead of duplicating it.. - */ -static int copy_mm(unsigned long clone_flags, struct task_struct * p) +static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) { - if (clone_flags & COPYVM) { - p->mm->min_flt = p->mm->maj_flt = 0; - p->mm->cmin_flt = p->mm->cmaj_flt = 0; - if (copy_page_tables(p)) - return 1; - return dup_mmap(p); - } else { - if (clone_page_tables(p)) - return 1; - return dup_mmap(p); /* wrong.. */ + if (clone_flags & CLONE_FS) { + current->fs->count++; + return 0; + } + tsk->fs = kmalloc(sizeof(*tsk->fs), GFP_KERNEL); + if (!tsk->fs) + return -1; + tsk->fs->count = 1; + tsk->fs->umask = current->fs->umask; + if ((tsk->fs->root = current->fs->root)) + tsk->fs->root->i_count++; + if ((tsk->fs->pwd = current->fs->pwd)) + tsk->fs->pwd->i_count++; + return 0; +} + +static inline int copy_files(unsigned long clone_flags, struct task_struct * tsk) +{ + int i; + struct files_struct *oldf, *newf; + struct file **old_fds, **new_fds; + + oldf = current->files; + if (clone_flags & CLONE_FILES) { + oldf->count++; + return 0; + } + + newf = kmalloc(sizeof(*newf), GFP_KERNEL); + tsk->files = newf; + if (!newf) + return -1; + + newf->count = 1; + newf->close_on_exec = oldf->close_on_exec; + newf->open_fds = oldf->open_fds; + + old_fds = oldf->fd; + new_fds = newf->fd; + for (i = NR_OPEN; i != 0; i--) { + struct file * f = *old_fds; + old_fds++; + *new_fds = f; + new_fds++; + if (f) + f->f_count++; } + return 0; } -static void copy_fs(unsigned long clone_flags, struct task_struct * p) +static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) { - if (current->fs->pwd) - current->fs->pwd->i_count++; - if (current->fs->root) - current->fs->root->i_count++; + if (clone_flags & CLONE_SIGHAND) { + current->sig->count++; + return 0; + } + tsk->sig = kmalloc(sizeof(*tsk->sig), GFP_KERNEL); + if (!tsk->sig) + return -1; + tsk->sig->count = 1; + memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action)); + return 0; } /* @@ -167,17 +211,20 @@ static void copy_fs(unsigned long clone_flags, struct task_struct * p) int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs) { int nr; + int error = -ENOMEM; unsigned long new_stack; struct task_struct *p; - if(!(p = (struct task_struct*)__get_free_page(GFP_KERNEL))) + p = (struct task_struct *) kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) goto bad_fork; - new_stack = get_free_page(GFP_KERNEL); + new_stack = alloc_kernel_stack(); if (!new_stack) - goto bad_fork_free; + goto bad_fork_free_p; + error = -EAGAIN; nr = find_empty_process(); if (nr < 0) - goto bad_fork_free; + goto bad_fork_free_stack; *p = *current; @@ -187,47 +234,75 @@ int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs) (*p->binfmt->use_count)++; p->did_exec = 0; + p->swappable = 0; p->kernel_stack_page = new_stack; *(unsigned long *) p->kernel_stack_page = STACK_MAGIC; p->state = TASK_UNINTERRUPTIBLE; - p->flags &= ~(PF_PTRACED|PF_TRACESYS); - p->pid = last_pid; + p->flags &= ~(PF_PTRACED|PF_TRACESYS|PF_SUPERPRIV); + p->flags |= PF_FORKNOEXEC; + p->pid = get_pid(clone_flags); + p->next_run = NULL; + p->prev_run = NULL; p->p_pptr = p->p_opptr = current; p->p_cptr = NULL; + init_waitqueue(&p->wait_chldexit); p->signal = 0; p->it_real_value = p->it_virt_value = p->it_prof_value = 0; p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0; - p->leader = 0; /* process leadership doesn't inherit */ + init_timer(&p->real_timer); + p->real_timer.data = (unsigned long) p; + p->leader = 0; /* session leadership doesn't inherit */ p->tty_old_pgrp = 0; p->utime = p->stime = 0; p->cutime = p->cstime = 0; +#ifdef __SMP__ + p->processor = NO_PROC_ID; + p->lock_depth = 1; +#endif p->start_time = jiffies; - p->mm->swappable = 0; /* don't try to swap it out before it's set up */ task[nr] = p; SET_LINKS(p); nr_tasks++; + error = -ENOMEM; /* copy all the process information */ - copy_thread(nr, clone_flags, usp, p, regs); - if (copy_mm(clone_flags, p)) + if (copy_files(clone_flags, p)) goto bad_fork_cleanup; + if (copy_fs(clone_flags, p)) + goto bad_fork_cleanup_files; + if (copy_sighand(clone_flags, p)) + goto bad_fork_cleanup_fs; + if (copy_mm(clone_flags, p)) + goto bad_fork_cleanup_sighand; + copy_thread(nr, clone_flags, usp, p, regs); p->semundo = NULL; - copy_files(clone_flags, p); - copy_fs(clone_flags, p); /* ok, now we should be set up.. */ - p->mm->swappable = 1; + p->swappable = 1; p->exit_signal = clone_flags & CSIGNAL; p->counter = current->counter >> 1; - p->state = TASK_RUNNING; /* do this last, just in case */ + wake_up_process(p); /* do this last, just in case */ + ++total_forks; return p->pid; + +bad_fork_cleanup_sighand: + exit_sighand(p); +bad_fork_cleanup_fs: + exit_fs(p); +bad_fork_cleanup_files: + exit_files(p); bad_fork_cleanup: + if (p->exec_domain && p->exec_domain->use_count) + (*p->exec_domain->use_count)--; + if (p->binfmt && p->binfmt->use_count) + (*p->binfmt->use_count)--; task[nr] = NULL; REMOVE_LINKS(p); nr_tasks--; -bad_fork_free: - free_page(new_stack); - free_page((long) p); +bad_fork_free_stack: + free_kernel_stack(new_stack); +bad_fork_free_p: + kfree(p); bad_fork: - return -EAGAIN; + return error; } diff --git a/kernel/info.c b/kernel/info.c index c7b2b9a8c..20b6ad6ae 100644 --- a/kernel/info.c +++ b/kernel/info.c @@ -6,23 +6,19 @@ /* This implements the sysinfo() system call */ -#include - #include #include #include #include #include +#include + +#include asmlinkage int sys_sysinfo(struct sysinfo *info) { - int error; struct sysinfo val; - struct task_struct **p; - error = verify_area(VERIFY_WRITE, info, sizeof(struct sysinfo)); - if (error) - return error; memset((char *)&val, 0, sizeof(struct sysinfo)); val.uptime = jiffies / HZ; @@ -31,12 +27,12 @@ asmlinkage int sys_sysinfo(struct sysinfo *info) val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); - for (p = &LAST_TASK; p > &FIRST_TASK; p--) - if (*p) val.procs++; + val.procs = nr_tasks-1; si_meminfo(&val); si_swapinfo(&val); - memcpy_tofs(info, &val, sizeof(struct sysinfo)); + if (copy_to_user(info, &val, sizeof(struct sysinfo))) + return -EFAULT; return 0; } diff --git a/kernel/itimer.c b/kernel/itimer.c index 02f7b7ce8..efcc8351b 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -13,13 +13,28 @@ #include #include -#include +#include +/* + * change timeval to jiffies, trying to avoid the + * most obvious overflows.. + * + * The tv_*sec values are signed, but nothing seems to + * indicate whether we really should use them as signed values + * when doing itimers. POSIX doesn't mention this (but if + * alarm() uses itimers without checking, we have to use unsigned + * arithmetic). + */ static unsigned long tvtojiffies(struct timeval *value) { - return((unsigned long )value->tv_sec * HZ + - (unsigned long )(value->tv_usec + (1000000 / HZ - 1)) / - (1000000 / HZ)); + unsigned long sec = (unsigned) value->tv_sec; + unsigned long usec = (unsigned) value->tv_usec; + + if (sec > (ULONG_MAX / HZ)) + return ULONG_MAX; + usec += 1000000 / HZ - 1; + usec /= 1000000 / HZ; + return HZ*sec+usec; } static void jiffiestotv(unsigned long jiffies, struct timeval *value) @@ -29,14 +44,23 @@ static void jiffiestotv(unsigned long jiffies, struct timeval *value) return; } -int _getitimer(int which, struct itimerval *value) +static int _getitimer(int which, struct itimerval *value) { register unsigned long val, interval; switch (which) { case ITIMER_REAL: - val = current->it_real_value; interval = current->it_real_incr; + val = 0; + if (del_timer(¤t->real_timer)) { + unsigned long now = jiffies; + val = current->real_timer.expires; + add_timer(¤t->real_timer); + /* look out for negative/zero itimer.. */ + if (val <= now) + val = now+1; + val -= now; + } break; case ITIMER_VIRTUAL: val = current->it_virt_value; @@ -51,7 +75,7 @@ int _getitimer(int which, struct itimerval *value) } jiffiestotv(val, &value->it_value); jiffiestotv(interval, &value->it_interval); - return(0); + return 0; } asmlinkage int sys_getitimer(int which, struct itimerval *value) @@ -64,11 +88,24 @@ asmlinkage int sys_getitimer(int which, struct itimerval *value) error = _getitimer(which, &get_buffer); if (error) return error; - error = verify_area(VERIFY_WRITE, value, sizeof(struct itimerval)); - if (error) - return error; - memcpy_tofs(value, &get_buffer, sizeof(get_buffer)); - return 0; + return copy_to_user(value, &get_buffer, sizeof(get_buffer)) ? -EFAULT : 0; +} + +void it_real_fn(unsigned long __data) +{ + struct task_struct * p = (struct task_struct *) __data; + unsigned long interval; + + send_sig(SIGALRM, p, 1); + interval = p->it_real_incr; + if (interval) { + unsigned long timeout = jiffies + interval; + /* check for overflow */ + if (timeout < interval) + timeout = ULONG_MAX; + p->real_timer.expires = timeout; + add_timer(&p->real_timer); + } } int _setitimer(int which, struct itimerval *value, struct itimerval *ovalue) @@ -82,13 +119,17 @@ int _setitimer(int which, struct itimerval *value, struct itimerval *ovalue) return k; switch (which) { case ITIMER_REAL: - if (j) { - j += 1+itimer_ticks; - if (j < itimer_next) - itimer_next = j; - } + del_timer(¤t->real_timer); current->it_real_value = j; current->it_real_incr = i; + if (!j) + break; + i = j + jiffies; + /* check for overflow.. */ + if (i < j) + i = ULONG_MAX; + current->real_timer.expires = i; + add_timer(¤t->real_timer); break; case ITIMER_VIRTUAL: if (j) @@ -117,20 +158,17 @@ asmlinkage int sys_setitimer(int which, struct itimerval *value, struct itimerva error = verify_area(VERIFY_READ, value, sizeof(*value)); if (error) return error; - memcpy_fromfs(&set_buffer, value, sizeof(set_buffer)); + error = copy_from_user(&set_buffer, value, sizeof(set_buffer)); + if (error) + return -EFAULT; } else memset((char *) &set_buffer, 0, sizeof(set_buffer)); - if (ovalue) { - error = verify_area(VERIFY_WRITE, ovalue, sizeof(struct itimerval)); - if (error) - return error; - } - error = _setitimer(which, &set_buffer, ovalue ? &get_buffer : 0); if (error || !ovalue) return error; - memcpy_tofs(ovalue, &get_buffer, sizeof(get_buffer)); + if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer))) + error = -EFAULT; return error; } diff --git a/kernel/ksyms.c b/kernel/ksyms.c index ccb2f2b4c..a612201c0 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -9,14 +9,19 @@ * by Bjorn Ekwall */ -#include #include +#include #include +#include #include #include +#include +#include #include +#include #include #include +#include #include #include #include @@ -32,75 +37,63 @@ #include #include #include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -#ifdef CONFIG_NET -#include -#include -#ifdef CONFIG_INET -#include -#include -#include -#include -#if defined(CONFIG_PPP) || defined(CONFIG_SLIP) -#include "../drivers/net/slhc.h" -#endif -#endif +extern unsigned char aux_device_present, kbd_read_mask; +#ifdef __i386__ + extern struct drive_info_struct drive_info; #endif + #ifdef CONFIG_PCI #include #include #endif -#if defined(CONFIG_MSDOS_FS) && !defined(CONFIG_UMSDOS_FS) -#include +#if defined(CONFIG_PROC_FS) +#include +#endif +#ifdef CONFIG_KERNELD +#include #endif - #include +#ifdef __SMP__ +#include +#endif -extern char *floppy_track_buffer; - +extern char *get_options(char *str, int *ints); extern void set_device_ro(int dev,int flag); extern struct file_operations * get_blkfops(unsigned int); - -extern void *sys_call_table; - -#ifdef CONFIG_FTAPE -extern char * ftape_big_buffer; -#endif +extern void blkdev_release(struct inode * inode); -#ifdef CONFIG_SCSI -#include "../drivers/scsi/scsi.h" -#include "../drivers/scsi/hosts.h" -#include "../drivers/scsi/constants.h" -#endif +extern void *sys_call_table; extern int sys_tz; extern int request_dma(unsigned int dmanr, char * deviceID); extern void free_dma(unsigned int dmanr); -extern int close_fp(struct file *filp); -extern void (* iABI_hook)(struct pt_regs * regs); - struct symbol_table symbol_table = { #include -#ifdef CONFIG_MODVERSIONS - { (void *)1 /* Version version :-) */, "_Using_Versions" }, +#ifdef MODVERSIONS + { (void *)1 /* Version version :-) */, + SYMBOL_NAME_STR (Using_Versions) }, #endif - /* stackable module support */ - X(rename_module_symbol), - X(register_symtab), - /* system info variables */ - /* These check that they aren't defines (0/1) */ -#ifndef EISA_bus__is_a_macro - X(EISA_bus), -#endif -#ifndef MCA_bus__is_a_macro - X(MCA_bus), -#endif -#ifndef wp_works_ok__is_a_macro - X(wp_works_ok), + /* stackable module support */ + X(register_symtab_from), +#ifdef CONFIG_KERNELD + X(kerneld_send), #endif + X(get_options), #ifdef CONFIG_PCI /* PCI BIOS support */ @@ -117,21 +110,22 @@ struct symbol_table symbol_table = { #endif /* process memory management */ - X(verify_area), X(do_mmap), X(do_munmap), - X(zeromap_page_range), - X(unmap_page_range), - X(insert_vm_struct), - X(merge_segments), + X(exit_mm), /* internal kernel memory management */ X(__get_free_pages), X(free_pages), X(kmalloc), - X(kfree_s), + X(kfree), X(vmalloc), X(vfree), + X(mem_map), + X(remap_page_range), + X(max_mapnr), + X(high_memory), + X(update_vm_cache), /* filesystem internal functions */ X(getname), @@ -141,23 +135,32 @@ struct symbol_table symbol_table = { X(namei), X(lnamei), X(open_namei), + X(sys_close), X(close_fp), X(check_disk_change), X(invalidate_buffers), + X(invalidate_inodes), + X(invalidate_inode_pages), X(fsync_dev), X(permission), X(inode_setattr), X(inode_change_ok), - X(generic_mmap), X(set_blocksize), X(getblk), X(bread), X(breada), - X(brelse), + X(__brelse), + X(__bforget), X(ll_rw_block), X(__wait_on_buffer), + X(mark_buffer_uptodate), + X(unlock_buffer), X(dcache_lookup), X(dcache_add), + X(add_blkdev_randomness), + X(generic_file_read), + X(generic_file_mmap), + X(generic_readpage), /* device registration */ X(register_chrdev), @@ -168,6 +171,14 @@ struct symbol_table symbol_table = { X(tty_unregister_driver), X(tty_std_termios), +#if defined(CONFIG_BLK_DEV_IDECD) || \ + defined(CONFIG_BLK_DEV_SR) || \ + defined(CONFIG_CM206) + X(register_cdrom), + X(unregister_cdrom), + X(cdrom_fops), +#endif + /* block device driver support */ X(block_read), X(block_write), @@ -182,16 +193,27 @@ struct symbol_table symbol_table = { X(bmap), X(sync_dev), X(get_blkfops), - + X(blkdev_open), + X(blkdev_release), + X(gendisk_head), + X(resetup_one_dev), + X(unplug_device), +#ifdef __i386__ + X(drive_info), +#endif + +#ifdef CONFIG_SERIAL /* Module creation of serial units */ X(register_serial), X(unregister_serial), - +#endif /* tty routines */ X(tty_hangup), X(tty_wait_until_sent), X(tty_check_change), X(tty_hung_up_p), + X(do_SAK), + X(console_print), /* filesystem registration */ X(register_filesystem), @@ -200,27 +222,47 @@ struct symbol_table symbol_table = { /* executable format registration */ X(register_binfmt), X(unregister_binfmt), + X(search_binary_handler), + X(prepare_binprm), + X(remove_arg_zero), /* execution environment registration */ X(lookup_exec_domain), X(register_exec_domain), X(unregister_exec_domain), + /* sysctl table registration */ + X(register_sysctl_table), + X(unregister_sysctl_table), + X(sysctl_string), + X(sysctl_intvec), + X(proc_dostring), + X(proc_dointvec), + X(proc_dointvec_minmax), + /* interrupt handling */ X(request_irq), X(free_irq), X(enable_irq), X(disable_irq), + X(probe_irq_on), + X(probe_irq_off), X(bh_active), X(bh_mask), + X(bh_mask_count), + X(bh_base), X(add_timer), X(del_timer), X(tq_timer), X(tq_immediate), X(tq_scheduler), - X(tq_last), X(timer_active), X(timer_table), + X(intr_count), + + /* autoirq from drivers/net/auto_irq.c */ + X(autoirq_setup), + X(autoirq_report), /* dma handling */ X(request_dma), @@ -241,11 +283,13 @@ struct symbol_table symbol_table = { X(sleep_on), X(interruptible_sleep_on), X(schedule), - X(current), + X(current_set), X(jiffies), X(xtime), + X(do_gettimeofday), X(loops_per_sec), X(need_resched), + X(kstat), X(kill_proc), X(kill_pg), X(kill_sl), @@ -255,18 +299,20 @@ struct symbol_table symbol_table = { X(printk), X(sprintf), X(vsprintf), + X(kdevname), X(simple_strtoul), X(system_utsname), X(sys_call_table), + X(hard_reset_now), + X(_ctype), + X(secure_tcp_sequence_number), /* Signal interfaces */ - X(do_signal), X(send_sig), /* Program loader interfaces */ X(setup_arg_pages), X(copy_strings), - X(create_tables), X(do_execve), X(flush_old_exec), X(open_inode), @@ -274,77 +320,7 @@ struct symbol_table symbol_table = { /* Miscellaneous access points */ X(si_meminfo), -#ifdef CONFIG_NET - /* socket layer registration */ - X(sock_register), - X(sock_unregister), - /* Internet layer registration */ -#ifdef CONFIG_INET - X(inet_add_protocol), - X(inet_del_protocol), -#if defined(CONFIG_PPP) || defined(CONFIG_SLIP) - /* VJ header compression */ - X(slhc_init), - X(slhc_free), - X(slhc_remember), - X(slhc_compress), - X(slhc_uncompress), -#endif -#endif - /* Device callback registration */ - X(register_netdevice_notifier), - X(unregister_netdevice_notifier), -#endif -#ifdef CONFIG_FTAPE - /* The next labels are needed for ftape driver. */ - X(ftape_big_buffer), -#endif - X(floppy_track_buffer), -#ifdef CONFIG_INET - /* support for loadable net drivers */ - X(register_netdev), - X(unregister_netdev), - X(ether_setup), - X(alloc_skb), - X(kfree_skb), - X(dev_kfree_skb), - X(netif_rx), - X(dev_rint), - X(dev_tint), - X(irq2dev_map), - X(dev_add_pack), - X(dev_remove_pack), - X(dev_get), - X(dev_ioctl), - X(dev_queue_xmit), - X(dev_base), - X(dev_close), - X(arp_find), - X(n_tty_ioctl), - X(tty_register_ldisc), - X(kill_fasync), -#endif -#ifdef CONFIG_SCSI - /* Supports loadable scsi drivers */ - /* - * in_scan_scsis is a hack, and should go away once the new - * memory allocation code is in the NCR driver - */ - X(in_scan_scsis), - X(scsi_register_module), - X(scsi_unregister_module), - X(scsi_free), - X(scsi_malloc), - X(scsi_register), - X(scsi_unregister), - X(scsicam_bios_param), - X(scsi_init_malloc), - X(scsi_init_free), - X(print_command), - X(print_msg), - X(print_status), -#endif /* Added to make file system as module */ X(set_writetime), X(sys_tz), @@ -352,9 +328,12 @@ struct symbol_table symbol_table = { X(file_fsync), X(clear_inode), X(refile_buffer), + X(nr_async_pages), X(___strtok), X(init_fifo), X(super_blocks), + X(reuse_list), + X(fifo_inode_operations), X(chrdev_inode_operations), X(blkdev_inode_operations), X(read_ahead), @@ -363,29 +342,23 @@ struct symbol_table symbol_table = { X(insert_inode_hash), X(event), X(__down), -#if defined(CONFIG_MSDOS_FS) && !defined(CONFIG_UMSDOS_FS) - /* support for umsdos fs */ - X(msdos_bmap), - X(msdos_create), - X(msdos_file_read), - X(msdos_file_write), - X(msdos_lookup), - X(msdos_mkdir), - X(msdos_mmap), - X(msdos_put_inode), - X(msdos_put_super), - X(msdos_read_inode), - X(msdos_read_super), - X(msdos_readdir), - X(msdos_rename), - X(msdos_rmdir), - X(msdos_smap), - X(msdos_statfs), - X(msdos_truncate), - X(msdos_unlink), - X(msdos_unlink_umsdos), - X(msdos_write_inode), + X(__up), + X(securelevel), +/* all busmice */ + X(add_mouse_randomness), + X(fasync_helper), +/* psaux mouse */ + X(aux_device_present), + X(kbd_read_mask), + +#ifdef CONFIG_BLK_DEV_MD + X(disk_name), /* for md.c */ #endif + + /* binfmt_aout */ + X(get_write_access), + X(put_write_access), + /******************************************************** * Do not add anything below this line, * as the stacked modules depend on this! diff --git a/kernel/ksyms.ver b/kernel/ksyms.ver index 76537cd4c..69c4c3f9e 100644 --- a/kernel/ksyms.ver +++ b/kernel/ksyms.ver @@ -192,3 +192,197 @@ #define __down _set_ver(__down, 75aa9e96) #endif /* _KSYMS_VER_ */ #endif /* CONFIG_MODVERSIONS !__GENKSYMS__ */ +/**** This file is generated by genksyms DO NOT EDIT! ****/ +#if defined(CONFIG_MODVERSIONS) && !defined(__GENKSYMS__) +#ifndef _KSYMS_VER_ +#define _KSYMS_VER_ +#define rename_module_symbol _set_ver(rename_module_symbol, b81c73c1) +#define register_symtab _set_ver(register_symtab, e910ea66) +#define EISA_bus _set_ver(EISA_bus, 7e37737c) +#define wp_works_ok _set_ver(wp_works_ok, f37f99e9) +#define verify_area _set_ver(verify_area, 4cfda560) +#define do_mmap _set_ver(do_mmap, 677e7ee1) +#define do_munmap _set_ver(do_munmap, 6221f117) +#define zeromap_page_range _set_ver(zeromap_page_range, 7c395a26) +#define unmap_page_range _set_ver(unmap_page_range, 0110085f) +#define insert_vm_struct _set_ver(insert_vm_struct, 1f4e4882) +#define merge_segments _set_ver(merge_segments, 6854be5a) +#define __get_free_pages _set_ver(__get_free_pages, 5243d78b) +#define free_pages _set_ver(free_pages, 96448859) +#define kmalloc _set_ver(kmalloc, d31fb2cb) +#define kfree_s _set_ver(kfree_s, 1e72eb79) +#define vmalloc _set_ver(vmalloc, 667f3e25) +#define vfree _set_ver(vfree, 6df52add) +#define getname _set_ver(getname, 81487159) +#define putname _set_ver(putname, b19e8126) +#define __iget _set_ver(__iget, ee2b6320) +#define iput _set_ver(iput, 59241ced) +#define namei _set_ver(namei, 00478bcd) +#define lnamei _set_ver(lnamei, fcfddbb1) +#define open_namei _set_ver(open_namei, 414b2b0f) +#define close_fp _set_ver(close_fp, 1d4c15d8) +#define check_disk_change _set_ver(check_disk_change, b66ed457) +#define invalidate_buffers _set_ver(invalidate_buffers, c65255f1) +#define fsync_dev _set_ver(fsync_dev, a221190d) +#define permission _set_ver(permission, 0ebf7474) +#define inode_setattr _set_ver(inode_setattr, 0c80a3c1) +#define inode_change_ok _set_ver(inode_change_ok, 5d1cb326) +#define generic_mmap _set_ver(generic_mmap, d4ff59f3) +#define set_blocksize _set_ver(set_blocksize, f45fda38) +#define getblk _set_ver(getblk, d40228ac) +#define bread _set_ver(bread, c73bf0f0) +#define breada _set_ver(breada, eb8e858c) +#define brelse _set_ver(brelse, 4c27ac3d) +#define ll_rw_block _set_ver(ll_rw_block, f3aa4dd3) +#define __wait_on_buffer _set_ver(__wait_on_buffer, e8fcc968) +#define dcache_lookup _set_ver(dcache_lookup, 83336566) +#define dcache_add _set_ver(dcache_add, fe71f11e) +#define register_chrdev _set_ver(register_chrdev, da99513f) +#define unregister_chrdev _set_ver(unregister_chrdev, 61ea5ee8) +#define register_blkdev _set_ver(register_blkdev, 4699a621) +#define unregister_blkdev _set_ver(unregister_blkdev, d39bbca9) +#define tty_register_driver _set_ver(tty_register_driver, fcc8591c) +#define tty_unregister_driver _set_ver(tty_unregister_driver, c78132a8) +#define tty_std_termios _set_ver(tty_std_termios, cf350678) +#define block_read _set_ver(block_read, a7fe4f51) +#define block_write _set_ver(block_write, 902674c9) +#define block_fsync _set_ver(block_fsync, 182888d8) +#define wait_for_request _set_ver(wait_for_request, 9ca2932e) +#define blksize_size _set_ver(blksize_size, dea1eb55) +#define hardsect_size _set_ver(hardsect_size, ed1ee14f) +#define blk_size _set_ver(blk_size, f60b5398) +#define blk_dev _set_ver(blk_dev, dbf5fdd4) +#define is_read_only _set_ver(is_read_only, b0c5f83e) +#define set_device_ro _set_ver(set_device_ro, 8fb69e13) +#define bmap _set_ver(bmap, 73bb8bdd) +#define sync_dev _set_ver(sync_dev, 9bca536d) +#define get_blkfops _set_ver(get_blkfops, 83827791) +#define register_serial _set_ver(register_serial, 3425f38c) +#define unregister_serial _set_ver(unregister_serial, c013d717) +#define tty_hangup _set_ver(tty_hangup, e3487df0) +#define tty_wait_until_sent _set_ver(tty_wait_until_sent, da85d428) +#define tty_check_change _set_ver(tty_check_change, 705eaab0) +#define tty_hung_up_p _set_ver(tty_hung_up_p, f99ac1e4) +#define register_filesystem _set_ver(register_filesystem, 1c7110ef) +#define unregister_filesystem _set_ver(unregister_filesystem, 5e353af7) +#define register_binfmt _set_ver(register_binfmt, 66ece706) +#define unregister_binfmt _set_ver(unregister_binfmt, 41822618) +#define lookup_exec_domain _set_ver(lookup_exec_domain, 32f10d48) +#define register_exec_domain _set_ver(register_exec_domain, eda4711f) +#define unregister_exec_domain _set_ver(unregister_exec_domain, 78ea447c) +#define request_irq _set_ver(request_irq, 9e81629c) +#define free_irq _set_ver(free_irq, f487dc0c) +#define enable_irq _set_ver(enable_irq, 54e09f5f) +#define disable_irq _set_ver(disable_irq, b4449c1f) +#define bh_active _set_ver(bh_active, 98fb5ca1) +#define bh_mask _set_ver(bh_mask, 1abf3d3f) +#define add_timer _set_ver(add_timer, f13cb728) +#define del_timer _set_ver(del_timer, c7aff713) +#define tq_timer _set_ver(tq_timer, 46cf583e) +#define tq_immediate _set_ver(tq_immediate, 46cf583e) +#define tq_scheduler _set_ver(tq_scheduler, 46cf583e) +#define tq_last _set_ver(tq_last, 457cf547) +#define timer_active _set_ver(timer_active, 5a6747ee) +#define timer_table _set_ver(timer_table, 9e03b650) +#define request_dma _set_ver(request_dma, 2a687646) +#define free_dma _set_ver(free_dma, 5d4b914c) +#define disable_hlt _set_ver(disable_hlt, 794487ee) +#define enable_hlt _set_ver(enable_hlt, 9c7077bd) +#define check_region _set_ver(check_region, b91154fb) +#define request_region _set_ver(request_region, 138b0a1e) +#define release_region _set_ver(release_region, f41d6d31) +#define wake_up _set_ver(wake_up, e8d71419) +#define wake_up_interruptible _set_ver(wake_up_interruptible, 64c8cb92) +#define sleep_on _set_ver(sleep_on, 67a00cee) +#define interruptible_sleep_on _set_ver(interruptible_sleep_on, 6a5fc80d) +#define schedule _set_ver(schedule, 01000e51) +#define current _set_ver(current, fc1cb29b) +#define jiffies _set_ver(jiffies, 2f7c7437) +#define xtime _set_ver(xtime, e70c0be0) +#define loops_per_sec _set_ver(loops_per_sec, 40a14192) +#define need_resched _set_ver(need_resched, dfc016ea) +#define kill_proc _set_ver(kill_proc, 911f760a) +#define kill_pg _set_ver(kill_pg, 0a758a45) +#define kill_sl _set_ver(kill_sl, 49625e94) +#define panic _set_ver(panic, 400c0de3) +#define printk _set_ver(printk, ad1148ba) +#define sprintf _set_ver(sprintf, f9003107) +#define vsprintf _set_ver(vsprintf, e605cb6b) +#define simple_strtoul _set_ver(simple_strtoul, bdb8c1e3) +#define system_utsname _set_ver(system_utsname, 066845bc) +#define sys_call_table _set_ver(sys_call_table, 79fa4011) +#define do_signal _set_ver(do_signal, 86f9bc59) +#define send_sig _set_ver(send_sig, 5cddd8d9) +#define setup_arg_pages _set_ver(setup_arg_pages, fe68d94a) +#define copy_strings _set_ver(copy_strings, 232aee96) +#define create_tables _set_ver(create_tables, ba788fa2) +#define do_execve _set_ver(do_execve, 8c99dc0a) +#define flush_old_exec _set_ver(flush_old_exec, c737e178) +#define open_inode _set_ver(open_inode, 27302cb6) +#define read_exec _set_ver(read_exec, a80a2dd0) +#define si_meminfo _set_ver(si_meminfo, bb05fc9a) +#define sock_register _set_ver(sock_register, d68e1649) +#define sock_unregister _set_ver(sock_unregister, 72c332bd) +#define inet_add_protocol _set_ver(inet_add_protocol, 55292121) +#define inet_del_protocol _set_ver(inet_del_protocol, 73908a1b) +#define slhc_init _set_ver(slhc_init, e490a4b8) +#define slhc_free _set_ver(slhc_free, 39ab902b) +#define slhc_remember _set_ver(slhc_remember, db333be6) +#define slhc_compress _set_ver(slhc_compress, e753e2d2) +#define slhc_uncompress _set_ver(slhc_uncompress, 81cc1144) +#define register_netdevice_notifier _set_ver(register_netdevice_notifier, e7aace7c) +#define unregister_netdevice_notifier _set_ver(unregister_netdevice_notifier, be114416) +#define floppy_track_buffer _set_ver(floppy_track_buffer, c6e3f7c2) +#define register_netdev _set_ver(register_netdev, 0d8d1bb4) +#define unregister_netdev _set_ver(unregister_netdev, 25a99579) +#define ether_setup _set_ver(ether_setup, 4eafef91) +#define alloc_skb _set_ver(alloc_skb, b6b523ba) +#define kfree_skb _set_ver(kfree_skb, 0b938572) +#define dev_kfree_skb _set_ver(dev_kfree_skb, aa1fe7f4) +#define netif_rx _set_ver(netif_rx, d8051cb2) +#define dev_rint _set_ver(dev_rint, 040d3f4b) +#define dev_tint _set_ver(dev_tint, 860b350b) +#define irq2dev_map _set_ver(irq2dev_map, 10bdcd8a) +#define dev_add_pack _set_ver(dev_add_pack, 6d7d9be4) +#define dev_remove_pack _set_ver(dev_remove_pack, 784fa59f) +#define dev_get _set_ver(dev_get, 72ed90fd) +#define dev_ioctl _set_ver(dev_ioctl, 08760203) +#define dev_queue_xmit _set_ver(dev_queue_xmit, 4a478225) +#define dev_base _set_ver(dev_base, 0a8809f0) +#define dev_close _set_ver(dev_close, 9bdad56d) +#define arp_find _set_ver(arp_find, a141bd11) +#define n_tty_ioctl _set_ver(n_tty_ioctl, 538e5fa6) +#define tty_register_ldisc _set_ver(tty_register_ldisc, 8fdde939) +#define kill_fasync _set_ver(kill_fasync, 890501b6) +#define in_scan_scsis _set_ver(in_scan_scsis, 21874a88) +#define scsi_register_module _set_ver(scsi_register_module, 8eff1010) +#define scsi_unregister_module _set_ver(scsi_unregister_module, d913b8f0) +#define scsi_free _set_ver(scsi_free, 475dddfa) +#define scsi_malloc _set_ver(scsi_malloc, 1cce3f92) +#define scsi_register _set_ver(scsi_register, d6e77069) +#define scsi_unregister _set_ver(scsi_unregister, 3b0b616b) +#define scsicam_bios_param _set_ver(scsicam_bios_param, 3d965248) +#define scsi_init_malloc _set_ver(scsi_init_malloc, e5167cbc) +#define scsi_init_free _set_ver(scsi_init_free, 8b2721f8) +#define print_command _set_ver(print_command, 6f14cd75) +#define print_msg _set_ver(print_msg, 0465f877) +#define print_status _set_ver(print_status, 32f84646) +#define set_writetime _set_ver(set_writetime, 52131916) +#define sys_tz _set_ver(sys_tz, aa3c9782) +#define __wait_on_super _set_ver(__wait_on_super, 61a5c00a) +#define file_fsync _set_ver(file_fsync, d30a190f) +#define clear_inode _set_ver(clear_inode, da2b0e9f) +#define refile_buffer _set_ver(refile_buffer, 8c69e123) +#define ___strtok _set_ver(___strtok, 8b55d69c) +#define init_fifo _set_ver(init_fifo, 082629c7) +#define super_blocks _set_ver(super_blocks, e1f1ee99) +#define chrdev_inode_operations _set_ver(chrdev_inode_operations, 6ba1faa3) +#define blkdev_inode_operations _set_ver(blkdev_inode_operations, ed443696) +#define read_ahead _set_ver(read_ahead, bbcd3768) +#define get_hash_table _set_ver(get_hash_table, 3b5f3c55) +#define get_empty_inode _set_ver(get_empty_inode, 554bdc75) +#define insert_inode_hash _set_ver(insert_inode_hash, 59b8c371) +#define event _set_ver(event, a6aac9c1) +#define __down _set_ver(__down, 75aa9e96) +#endif /* _KSYMS_VER_ */ +#endif /* CONFIG_MODVERSIONS !__GENKSYMS__ */ diff --git a/kernel/module.c b/kernel/module.c index e29a48ba5..09cee93b7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1,11 +1,14 @@ #include #include -#include #include /* defines GFP_KERNEL */ #include #include #include #include +#include +#include + +#include /* * Originally by Anonymous (as far as I know...) * Linux version by Bas Laarhoven @@ -34,30 +37,31 @@ * and finally: reducing the number of entries in ksyms.c * since every subsystem should now be able to decide and * control exactly what symbols it wants to export, locally! + * + * On 1-Aug-95: altered code to use same style as + * do /proc/net/XXX "files". Namely allow more than 4kB + * (or what the block size is) output. + * + * - Use dummy syscall functions for users who disable all + * module support. Similar to kernel/sys.c (Paul Gortmaker) */ -#ifdef DEBUG_MODULE -#define PRINTK(a) printk a -#else -#define PRINTK(a) /* */ -#endif +#ifdef CONFIG_MODULES /* a *big* #ifdef block... */ static struct module kernel_module; -static struct module *module_list = &kernel_module; +struct module *module_list = &kernel_module; static int freeing_modules; /* true if some modules are marked for deletion */ static struct module *find_module( const char *name); -static int get_mod_name( char *user_name, char *buf); static int free_modules( void); -static int module_init_flag = 0; /* Hmm... */ +extern struct symbol_table symbol_table; /* in kernel/ksyms.c */ /* * Called at boot time */ void init_modules(void) { - extern struct symbol_table symbol_table; /* in kernel/ksyms.c */ struct internal_symbol *sym; int i; @@ -70,62 +74,21 @@ void init_modules(void) { kernel_module.name = ""; } -int -rename_module_symbol(char *old_name, char *new_name) -{ - struct internal_symbol *sym; - int i = 0; /* keep gcc silent */ - - if (module_list->symtab) { - sym = module_list->symtab->symbol; - for (i = module_list->symtab->n_symbols; i > 0; ++sym, --i) { - if (strcmp(sym->name, old_name) == 0) { /* found it! */ - sym->name = new_name; /* done! */ - PRINTK(("renamed %s to %s\n", old_name, new_name)); - return 1; /* it worked! */ - } - } - } - printk("rename %s to %s failed!\n", old_name, new_name); - return 0; /* not there... */ - /* - * This one will change the name of the first matching symbol! - * - * With this function, you can replace the name of a symbol defined - * in the current module with a new name, e.g. when you want to insert - * your own function instead of a previously defined function - * with the same name. - * - * "Normal" usage: - * - * bogus_function(int params) - * { - * do something "smart"; - * return real_function(params); - * } - * - * ... - * - * init_module() - * { - * if (rename_module_symbol("_bogus_function", "_real_function")) - * printk("yep!\n"); - * else - * printk("no way!\n"); - * ... - * } - * - * When loading this module, real_function will be resolved - * to the real function address. - * All later loaded modules that refer to "real_function()" will - * then really call "bogus_function()" instead!!! - * - * This feature will give you ample opportunities to get to know - * the taste of your foot when you stuff it into your mouth!!! - */ +/* + * Copy the name of a module from user space. + */ +inline int +get_mod_name(char *user_name, char *buf) +{ + /* Should return -EBIG instead of -EFAULT when the name + is too long, but that we couldn't detect real faults then. + Maybe strncpy_from_user() should return -EBIG, when + the source string is too long. */ + return strncpy_from_user(buf, user_name, MOD_MAX_NAME); } + /* * Allocate space for a module. */ @@ -143,7 +106,7 @@ sys_create_module(char *module_name, unsigned long size) return -EPERM; if (module_name == NULL || size == 0) return -EINVAL; - if ((error = get_mod_name(module_name, name)) != 0) + if ((error = get_mod_name(module_name, name)) < 0) return error; if (find_module(name) != NULL) { return -EEXIST; @@ -154,8 +117,8 @@ sys_create_module(char *module_name, unsigned long size) } strcpy((char *)(mp + 1), name); /* why not? */ - npages = (size + sizeof (int) + 4095) / 4096; - if ((addr = vmalloc(npages * 4096)) == 0) { + npages = (size + sizeof (long) + PAGE_SIZE - 1) / PAGE_SIZE; + if ((addr = vmalloc(npages * PAGE_SIZE)) == 0) { kfree_s(mp, sspace); return -ENOMEM; } @@ -168,15 +131,18 @@ sys_create_module(char *module_name, unsigned long size) mp->addr = addr; mp->state = MOD_UNINITIALIZED; mp->cleanup = NULL; + mp->exceptinfo.start = NULL; + mp->exceptinfo.stop = NULL; - * (int *) addr = 0; /* set use count to zero */ + * (long *) addr = 0; /* set use count to zero */ module_list = mp; /* link it in */ - PRINTK(("module `%s' (%lu pages @ 0x%08lx) created\n", - mp->name, (unsigned long) mp->size, (unsigned long) mp->addr)); + pr_debug("module `%s' (%lu pages @ 0x%08lx) created\n", + mp->name, (unsigned long) mp->size, (unsigned long) mp->addr); return (unsigned long) addr; } + /* * Initialize a module. */ @@ -194,34 +160,44 @@ sys_init_module(char *module_name, char *code, unsigned codesize, if (!suser()) return -EPERM; +#ifdef __i386__ /* A little bit of protection... we "know" where the user stack is... */ + if (symtab && ((unsigned long)symtab > 0xb0000000)) { - printk("warning: you are using an old insmod, no symbols will be inserted!\n"); + printk(KERN_WARNING "warning: you are using an old insmod, no symbols will be inserted!\n"); symtab = NULL; } - - /* - * First reclaim any memory from dead modules that where not - * freed when deleted. Should I think be done by timers when - * the module was deleted - Jon. - */ - free_modules(); - - if ((error = get_mod_name(module_name, name)) != 0) +#endif + if ((error = get_mod_name(module_name, name)) < 0) return error; - PRINTK(("initializing module `%s', %d (0x%x) bytes\n", - name, codesize, codesize)); - memcpy_fromfs(&rt, routines, sizeof rt); + pr_debug("initializing module `%s', %d (0x%x) bytes\n", + name, codesize, codesize); + if (copy_from_user(&rt, routines, sizeof rt)) + return -EFAULT; if ((mp = find_module(name)) == NULL) return -ENOENT; - if ((codesize + sizeof (int) + 4095) / 4096 > mp->size) + if (codesize & MOD_AUTOCLEAN) { + /* + * set autoclean marker from codesize... + * set usage count to "zero" + */ + codesize &= ~MOD_AUTOCLEAN; + GET_USE_COUNT(mp) = MOD_AUTOCLEAN; + } + if ((codesize + sizeof (long) + PAGE_SIZE - 1) / PAGE_SIZE > mp->size) + return -EINVAL; + if (copy_from_user((char *)mp->addr + sizeof (long), code, codesize)) + return -EFAULT; + memset((char *)mp->addr + sizeof (long) + codesize, 0, + mp->size * PAGE_SIZE - (codesize + sizeof (long))); + pr_debug("module init entry = 0x%08lx, cleanup entry = 0x%08lx\n", + (unsigned long) rt.init, (unsigned long) rt.cleanup); + if (rt.signature != MODULE_2_1_7_SIG){ + printk ("Older insmod used with kernel 2.1.7 +\n"); return -EINVAL; - memcpy_fromfs((char *)mp->addr + sizeof (int), code, codesize); - memset((char *)mp->addr + sizeof (int) + codesize, 0, - mp->size * 4096 - (codesize + sizeof (int))); - PRINTK(( "module init entry = 0x%08lx, cleanup entry = 0x%08lx\n", - (unsigned long) rt.init, (unsigned long) rt.cleanup)); + } mp->cleanup = rt.cleanup; + mp->exceptinfo = rt.exceptinfo; /* update kernel symbol table */ if (symtab) { /* symtab == NULL means no new entries to handle */ @@ -231,37 +207,34 @@ sys_init_module(char *module_name, char *code, unsigned codesize, int i; int legal_start; - if ((error = verify_area(VERIFY_READ, symtab, sizeof(int)))) - return error; - memcpy_fromfs((char *)(&(size)), symtab, sizeof(int)); - + error = get_user(size, &symtab->size); + if (error) + return error; if ((newtab = (struct symbol_table*) kmalloc(size, GFP_KERNEL)) == NULL) { return -ENOMEM; } - if ((error = verify_area(VERIFY_READ, symtab, size))) { - kfree_s(newtab, size); - return error; + if (copy_from_user((char *)(newtab), symtab, size)) { + kfree_s(newtab, size); + return -EFAULT; } - memcpy_fromfs((char *)(newtab), symtab, size); /* sanity check */ legal_start = sizeof(struct symbol_table) + newtab->n_symbols * sizeof(struct internal_symbol) + newtab->n_refs * sizeof(struct module_ref); - if ((newtab->n_symbols < 0) || (newtab->n_refs < 0) || - (legal_start > size)) { - printk("Illegal symbol table! Rejected!\n"); + if ((newtab->n_symbols < 0) || (newtab->n_refs < 0) || (legal_start > size)) { + printk(KERN_WARNING "Rejecting illegal symbol table (n_symbols=%d,n_refs=%d)\n", + newtab->n_symbols, newtab->n_refs); kfree_s(newtab, size); return -EINVAL; } /* relocate name pointers, index referred from start of table */ - for (sym = &(newtab->symbol[0]), i = 0; - i < newtab->n_symbols; ++sym, ++i) { + for (sym = &(newtab->symbol[0]), i = 0; i < newtab->n_symbols; ++sym, ++i) { if ((unsigned long)sym->name < legal_start || size <= (unsigned long)sym->name) { - printk("Illegal symbol table! Rejected!\n"); + printk(KERN_WARNING "Rejecting illegal symbol table\n"); kfree_s(newtab, size); return -EINVAL; } @@ -287,7 +260,7 @@ sys_init_module(char *module_name, char *code, unsigned codesize, link = link->next; if (link == (struct module *)0) { - printk("Non-module reference! Rejected!\n"); + printk(KERN_WARNING "Non-module reference! Rejected!\n"); return -EINVAL; } @@ -297,12 +270,12 @@ sys_init_module(char *module_name, char *code, unsigned codesize, } } - module_init_flag = 1; /* Hmm... */ + GET_USE_COUNT(mp) += 1; if ((*rt.init)() != 0) { - module_init_flag = 0; /* Hmm... */ + GET_USE_COUNT(mp) = 0; return -EBUSY; } - module_init_flag = 0; /* Hmm... */ + GET_USE_COUNT(mp) -= 1; mp->state = MOD_RUNNING; return 0; @@ -319,17 +292,39 @@ sys_delete_module(char *module_name) return -EPERM; /* else */ if (module_name != NULL) { - if ((error = get_mod_name(module_name, name)) != 0) + if ((error = get_mod_name(module_name, name)) < 0) return error; if ((mp = find_module(name)) == NULL) return -ENOENT; - if ((mp->ref != NULL) || (GET_USE_COUNT(mp) != 0)) + if ((mp->ref != NULL) || + ((GET_USE_COUNT(mp) & ~(MOD_AUTOCLEAN | MOD_VISITED)) != 0)) return -EBUSY; + GET_USE_COUNT(mp) &= ~(MOD_AUTOCLEAN | MOD_VISITED); if (mp->state == MOD_RUNNING) (*mp->cleanup)(); mp->state = MOD_DELETED; + free_modules(); + } + /* for automatic reaping */ + else { + struct module *mp_next; + for (mp = module_list; mp != &kernel_module; mp = mp_next) { + mp_next = mp->next; + if ((mp->ref == NULL) && (mp->state == MOD_RUNNING) && + ((GET_USE_COUNT(mp) & ~MOD_VISITED) == MOD_AUTOCLEAN)) { + if ((GET_USE_COUNT(mp) & MOD_VISITED)) { + /* Don't reap until one "cycle" after last _use_ */ + GET_USE_COUNT(mp) &= ~MOD_VISITED; + } + else { + GET_USE_COUNT(mp) &= ~(MOD_AUTOCLEAN | MOD_VISITED); + (*mp->cleanup)(); + mp->state = MOD_DELETED; + free_modules(); + } + } + } } - free_modules(); return 0; } @@ -364,6 +359,7 @@ sys_get_kernel_syms(struct kernel_sym *table) struct module *mp = module_list; int i; int nmodsyms = 0; + int err; for (mp = module_list; mp; mp = mp->next) { if (mp->symtab && mp->symtab->n_symbols) { @@ -378,16 +374,15 @@ sys_get_kernel_syms(struct kernel_sym *table) if (table != NULL) { to = table; - if ((i = verify_area(VERIFY_WRITE, to, nmodsyms * sizeof(*table)))) - return i; - /* copy all module symbols first (always LIFO order) */ for (mp = module_list; mp; mp = mp->next) { if (mp->state == MOD_RUNNING) { /* magic: write module info as a pseudo symbol */ isym.value = (unsigned long)mp; sprintf(isym.name, "#%s", mp->name); - memcpy_tofs(to, &isym, sizeof isym); + err = copy_to_user(to, &isym, sizeof isym); + if (err) + return -EFAULT; ++to; if (mp->symtab != NULL) { @@ -397,7 +392,9 @@ sys_get_kernel_syms(struct kernel_sym *table) isym.value = (unsigned long)from->addr; strncpy(isym.name, from->name, sizeof isym.name); - memcpy_tofs(to, &isym, sizeof isym); + err = copy_to_user(to, &isym, sizeof isym); + if (err) + return -EFAULT; } } } @@ -407,24 +404,6 @@ sys_get_kernel_syms(struct kernel_sym *table) return nmodsyms; } - -/* - * Copy the name of a module from user space. - */ -int -get_mod_name(char *user_name, char *buf) -{ - int i; - - i = 0; - for (i = 0 ; (buf[i] = get_fs_byte(user_name + i)) != '\0' ; ) { - if (++i >= MOD_MAX_NAME) - return -E2BIG; - } - return 0; -} - - /* * Look for a module by name, ignoring modules marked for deletion. */ @@ -482,7 +461,7 @@ free_modules( void) if (mp->state != MOD_DELETED) { mpp = &mp->next; } else { - if (GET_USE_COUNT(mp) != 0) { + if ((GET_USE_COUNT(mp) != 0) || (mp->ref != NULL)) { freeing_modules = 1; mpp = &mp->next; } else { /* delete it */ @@ -509,7 +488,7 @@ free_modules( void) int get_module_list(char *buf) { char *p; - char *q; + const char *q; int i; struct module *mp; struct module_ref *ref; @@ -521,6 +500,8 @@ int get_module_list(char *buf) if (p - buf > 4096 - 100) break; /* avoid overflowing buffer */ q = mp->name; + if (*q == '\0' && mp->size == 0 && mp->ref == NULL) + continue; /* don't list modules for kernel syms */ i = 20; while (*q) { *p++ = *q++; @@ -546,8 +527,8 @@ int get_module_list(char *buf) while (*q) *p++ = *q++; + *p++ = '\t'; if ((ref = mp->ref) != NULL) { - *p++ = '\t'; *p++ = '['; for (; ref; ref = ref->next) { q = ref->module->name; @@ -558,6 +539,15 @@ int get_module_list(char *buf) } *p++ = ']'; } + if (mp->state == MOD_RUNNING) { + sprintf(size,"\t%ld%s", + GET_USE_COUNT(mp) & ~(MOD_AUTOCLEAN | MOD_VISITED), + ((GET_USE_COUNT(mp) & MOD_AUTOCLEAN)? + " (autoclean)":"")); + q = size; + while (*q) + *p++ = *q++; + } *p++ = '\n'; } return p - buf; @@ -567,40 +557,51 @@ int get_module_list(char *buf) /* * Called by the /proc file system to return a current list of ksyms. */ -int get_ksyms_list(char *buf) +int get_ksyms_list(char *buf, char **start, off_t offset, int length) { struct module *mp; struct internal_symbol *sym; int i; char *p = buf; + int len = 0; /* code from net/ipv4/proc.c */ + off_t pos = 0; + off_t begin = 0; for (mp = module_list; mp; mp = mp->next) { if ((mp->state == MOD_RUNNING) && - (mp->symtab != NULL) && (mp->symtab->n_symbols > 0)) { + (mp->symtab != NULL) && + (mp->symtab->n_symbols > 0)) { for (i = mp->symtab->n_symbols, sym = mp->symtab->symbol; i > 0; --i, ++sym) { - if (p - buf > 4096 - 100) { - strcat(p, "...\n"); - p += strlen(p); - return p - buf; /* avoid overflowing buffer */ - } - + p = buf + len; if (mp->name[0]) { - sprintf(p, "%08lx %s\t[%s]\n", - (long)sym->addr, sym->name, mp->name); + len += sprintf(p, "%08lx %s\t[%s]\n", + (long)sym->addr, + sym->name, mp->name); + } else { + len += sprintf(p, "%08lx %s\n", + (long)sym->addr, + sym->name); } - else { - sprintf(p, "%08lx %s\n", - (long)sym->addr, sym->name); + pos = begin + len; + if (pos < offset) { + len = 0; + begin = pos; } - p += strlen(p); + pos = begin + len; + if (pos > offset+length) + goto leave_the_loop; } } } - - return p - buf; + leave_the_loop: + *start = buf + (offset - begin); + len -= (offset - begin); + if (len > length) + len = length; + return len; } /* @@ -613,7 +614,7 @@ int get_ksyms_list(char *buf) * - For a loadable module, the function should only be called in the * context of init_module * - * Those are the only restrictions! (apart from not being reenterable...) + * Those are the only restrictions! (apart from not being reentrant...) * * If you want to remove a symbol table for a loadable module, * the call looks like: "register_symtab(0)". @@ -628,7 +629,7 @@ int get_ksyms_list(char *buf) static struct symbol_table nulltab; int -register_symtab(struct symbol_table *intab) +register_symtab_from(struct symbol_table *intab, long *from) { struct module *mp; struct module *link; @@ -645,11 +646,16 @@ register_symtab(struct symbol_table *intab) intab->n_symbols +=1; } -#if 1 - if (module_init_flag == 0) { /* Hmm... */ -#else - if (module_list == &kernel_module) { -#endif + for (mp = module_list; mp != &kernel_module; mp = mp->next) { + /* + * "from" points to "mod_use_count_" (== start of module) + * or is == 0 if called from a non-module + */ + if ((unsigned long)(mp->addr) == (unsigned long)from) + break; + } + + if (mp == &kernel_module) { /* Aha! Called from an "internal" module */ if (!intab) return 0; /* or -ESILLY_PROGRAMMER :-) */ @@ -657,7 +663,7 @@ register_symtab(struct symbol_table *intab) /* create a pseudo module! */ if (!(mp = (struct module*) kmalloc(MODSIZ, GFP_KERNEL))) { /* panic time! */ - printk("Out of memory for new symbol table!\n"); + printk(KERN_ERR "Out of memory for new symbol table!\n"); return -ENOMEM; } /* else OK */ @@ -680,7 +686,6 @@ register_symtab(struct symbol_table *intab) * call to init_module i.e. when loading the module!! * Or else... */ - mp = module_list; /* true when doing init_module! */ /* Any table there before? */ if ((oldtab = mp->symtab) == (struct symbol_table*)0) { @@ -690,12 +695,6 @@ register_symtab(struct symbol_table *intab) } /* else ****** we have to replace the module symbol table ******/ -#if 0 - if (oldtab->n_symbols > 0) { - /* Oh dear, I have to drop the old ones... */ - printk("Warning, dropping old symbols\n"); - } -#endif if (oldtab->n_refs == 0) { /* no problems! */ mp->symtab = intab; @@ -720,7 +719,7 @@ register_symtab(struct symbol_table *intab) oldtab->n_refs * REFSIZ, GFP_KERNEL))) { /* panic time! */ - printk("Out of memory for new symbol table!\n"); + printk(KERN_ERR "Out of memory for new symbol table!\n"); return -ENOMEM; } @@ -759,3 +758,35 @@ register_symtab(struct symbol_table *intab) return 0; } + +#else /* CONFIG_MODULES */ + +/* Dummy syscalls for people who don't want modules */ + +asmlinkage unsigned long sys_create_module(void) +{ + return -ENOSYS; +} + +asmlinkage int sys_init_module(void) +{ + return -ENOSYS; +} + +asmlinkage int sys_delete_module(void) +{ + return -ENOSYS; +} + +asmlinkage int sys_get_kernel_syms(void) +{ + return -ENOSYS; +} + +int register_symtab_from(struct symbol_table *intab, long *from) +{ + return 0; +} + +#endif /* CONFIG_MODULES */ + diff --git a/kernel/panic.c b/kernel/panic.c index 300fcbbf3..7e04fdc31 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -7,18 +7,33 @@ /* * This function is used through-out the kernel (including mm and fs) * to indicate a major problem. + * Support for machines without PC-style console hardware - , July 96 */ #include #include #include +#include +#include +#include asmlinkage void sys_sync(void); /* it's really int */ +extern void do_unblank_screen(void); +extern int C_A_D; + +int panic_timeout = 0; + +void panic_setup(char *str, int *ints) +{ + if (ints[0] == 1) + panic_timeout = ints[1]; +} NORET_TYPE void panic(const char * fmt, ...) { static char buf[1024]; va_list args; + int i; va_start(args, fmt); vsprintf(buf, fmt, args); @@ -28,5 +43,22 @@ NORET_TYPE void panic(const char * fmt, ...) printk(KERN_EMERG "In swapper task - not syncing\n"); else sys_sync(); + +#ifndef CONFIG_SERIAL_ONLY_CONSOLE + do_unblank_screen(); +#endif + + if (panic_timeout > 0) + { + /* + * Delay timeout seconds before rebooting the machine. + * We can't use the "normal" timers since we just panicked.. + */ + printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout); + for(i = 0; i < (panic_timeout*1000); i++) + udelay(1000); + hard_reset_now(); + } for(;;); } + diff --git a/kernel/printk.c b/kernel/printk.c index 8b518f6cb..ed39d4fab 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -13,22 +13,29 @@ #include -#include #include #include #include #include #include +#include +#include -#define LOG_BUF_LEN 4096 +#include + +#define LOG_BUF_LEN 8192 static char buf[1024]; extern void console_print(const char *); -#define DEFAULT_MESSAGE_LOGLEVEL 7 /* KERN_DEBUG */ -#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything more serious than KERN_DEBUG */ +/* printk's without a loglevel use this.. */ +#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ + +/* We show everything that is MORE important than this.. */ +#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ +#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ unsigned long log_size = 0; struct wait_queue * log_wait = NULL; @@ -89,7 +96,7 @@ asmlinkage int sys_syslog(int type, char * buf, int len) log_size--; log_start &= LOG_BUF_LEN-1; sti(); - put_fs_byte(c,buf); + put_user(c,buf); buf++; i++; cli(); @@ -115,7 +122,7 @@ asmlinkage int sys_syslog(int type, char * buf, int len) j = log_start + log_size - count; for (i = 0; i < count; i++) { c = *((char *) log_buf+(j++ & (LOG_BUF_LEN-1))); - put_fs_byte(c, buf++); + put_user(c, buf++); } if (do_clear) logged_chars = 0; @@ -124,7 +131,7 @@ asmlinkage int sys_syslog(int type, char * buf, int len) logged_chars = 0; return 0; case 6: /* Disable logging to console */ - console_loglevel = 1; /* only panic messages shown */ + console_loglevel = MINIMUM_CONSOLE_LOGLEVEL; return 0; case 7: /* Enable logging to console */ console_loglevel = DEFAULT_CONSOLE_LOGLEVEL; @@ -132,6 +139,8 @@ asmlinkage int sys_syslog(int type, char * buf, int len) case 8: if (len < 1 || len > 8) return -EINVAL; + if (len < MINIMUM_CONSOLE_LOGLEVEL) + len = MINIMUM_CONSOLE_LOGLEVEL; console_loglevel = len; return 0; } @@ -164,7 +173,7 @@ asmlinkage int printk(const char *fmt, ...) ) { p -= 3; p[0] = '<'; - p[1] = DEFAULT_MESSAGE_LOGLEVEL - 1 + '0'; + p[1] = DEFAULT_MESSAGE_LOGLEVEL + '0'; p[2] = '>'; } else msg += 3; @@ -230,3 +239,16 @@ void register_console(void (*proc)(const char *)) j = 0; } } + +/* + * Write a message to a certain tty, not just the console. This is used for + * messages that need to be redirected to a specific tty. + * We don't put it into the syslog queue right now maybe in the future if + * really needed. + */ +void tty_write_message(struct tty_struct *tty, char *msg) +{ + if (tty && tty->driver.write) + tty->driver.write(tty, 0, msg, strlen(msg)); + return; +} diff --git a/kernel/resource.c b/kernel/resource.c index 5a7999d73..48184bfcf 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -92,15 +92,6 @@ void request_region(unsigned int from, unsigned int num, const char *name) } } -/* - * This is for compatibility with older drivers. - * It can be removed when all drivers call the new function. - */ -void snarf_region(unsigned int from, unsigned int num) -{ - request_region(from,num,"No name given."); -} - /* * Call this when the device driver is unloaded */ diff --git a/kernel/sched.c b/kernel/sched.c index 93003dfc1..8f88f88a3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2,16 +2,17 @@ * linux/kernel/sched.c * * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1996-04-21 Modified by Ulrich Windl to make NTP work */ /* * 'sched.c' is the main kernel file. It contains scheduling primitives * (sleep_on, wakeup, schedule etc) as well as a number of simple system - * call functions (type getpid(), which just extracts a field from + * call functions (type getpid()), which just extract a field from * current-task */ -#include #include #include #include @@ -26,20 +27,23 @@ #include #include #include +#include #include #include -#include +#include #include - -#define TIMER_IRQ 0 +#include #include /* * kernel variables */ -long tick = 1000000 / HZ; /* timer interrupt period */ + +int securelevel = 0; /* system security level */ + +long tick = (1000000 + HZ/2) / HZ; /* timer interrupt period */ volatile struct timeval xtime; /* The current time */ int tickadj = 500/HZ; /* microsecs */ @@ -50,17 +54,19 @@ DECLARE_TASK_QUEUE(tq_scheduler); /* * phase-lock loop variables */ -int time_status = TIME_BAD; /* clock synchronization status */ -long time_offset = 0; /* time adjustment (us) */ -long time_constant = 0; /* pll time constant */ -long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ -long time_precision = 1; /* clock precision (us) */ -long time_maxerror = 0x70000000;/* maximum error */ -long time_esterror = 0x70000000;/* estimated error */ -long time_phase = 0; /* phase offset (scaled us) */ -long time_freq = 0; /* frequency offset (scaled ppm) */ -long time_adj = 0; /* tick adjust (scaled 1 / HZ) */ -long time_reftime = 0; /* time at last adjustment (s) */ +/* TIME_ERROR prevents overwriting the CMOS clock */ +int time_state = TIME_ERROR; /* clock synchronization status */ +int time_status = STA_UNSYNC; /* clock status bits */ +long time_offset = 0; /* time adjustment (us) */ +long time_constant = 2; /* pll time constant */ +long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ +long time_precision = 1; /* clock precision (us) */ +long time_maxerror = MAXPHASE; /* maximum error (us) */ +long time_esterror = MAXPHASE; /* estimated error (us) */ +long time_phase = 0; /* phase offset (scaled us) */ +long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC; /* frequency offset (scaled ppm) */ +long time_adj = 0; /* tick adjust (scaled 1 / HZ) */ +long time_reftime = 0; /* time at last adjustment (s) */ long time_adjust = 0; long time_adjust_step = 0; @@ -69,132 +75,354 @@ int need_resched = 0; unsigned long event = 0; extern int _setitimer(int, struct itimerval *, struct itimerval *); -unsigned long * prof_buffer = NULL; +unsigned int * prof_buffer = NULL; unsigned long prof_len = 0; +unsigned long prof_shift = 0; #define _S(nr) (1<<((nr)-1)) extern void mem_use(void); -extern int timer_interrupt(void); - +#ifdef __mips__ +unsigned long init_kernel_stack[2048] = { STACK_MAGIC, }; +unsigned long init_user_stack[2048] = { STACK_MAGIC, }; +#else unsigned long init_kernel_stack[1024] = { STACK_MAGIC, }; unsigned long init_user_stack[1024] = { STACK_MAGIC, }; +#endif static struct vm_area_struct init_mmap = INIT_MMAP; +static struct fs_struct init_fs = INIT_FS; +static struct files_struct init_files = INIT_FILES; +static struct signal_struct init_signals = INIT_SIGNALS; + +struct mm_struct init_mm = INIT_MM; struct task_struct init_task = INIT_TASK; unsigned long volatile jiffies=0; -struct task_struct *current = &init_task; +struct task_struct *current_set[NR_CPUS]; struct task_struct *last_task_used_math = NULL; struct task_struct * task[NR_TASKS] = {&init_task, }; struct kernel_stat kstat = { 0 }; -unsigned long itimer_ticks = 0; -unsigned long itimer_next = ~0; +static inline void add_to_runqueue(struct task_struct * p) +{ +#ifdef __SMP__ + int cpu=smp_processor_id(); +#endif +#if 1 /* sanity tests */ + if (p->next_run || p->prev_run) { + printk("task already on run-queue\n"); + return; + } +#endif + if (p->counter > current->counter + 3) + need_resched = 1; + nr_running++; + (p->prev_run = init_task.prev_run)->next_run = p; + p->next_run = &init_task; + init_task.prev_run = p; +#ifdef __SMP__ + /* this is safe only if called with cli()*/ + while(set_bit(31,&smp_process_available)) + { + while(test_bit(31,&smp_process_available)) + { + if(clear_bit(cpu,&smp_invalidate_needed)) + { + local_flush_tlb(); + set_bit(cpu,&cpu_callin_map[0]); + } + } + } + smp_process_available++; + clear_bit(31,&smp_process_available); + if ((0!=p->pid) && smp_threads_ready) + { + int i; + for (i=0;ipid) + { + smp_message_pass(cpu_logical_map[i], MSG_RESCHEDULE, 0L, 0); + break; + } + } + } +#endif +} + +static inline void del_from_runqueue(struct task_struct * p) +{ + struct task_struct *next = p->next_run; + struct task_struct *prev = p->prev_run; + +#if 1 /* sanity tests */ + if (!next || !prev) { + printk("task not on run-queue\n"); + return; + } +#endif + if (p == &init_task) { + static int nr = 0; + if (nr < 5) { + nr++; + printk("idle task may not sleep\n"); + } + return; + } + nr_running--; + next->prev_run = prev; + prev->next_run = next; + p->next_run = NULL; + p->prev_run = NULL; +} + +static inline void move_last_runqueue(struct task_struct * p) +{ + struct task_struct *next = p->next_run; + struct task_struct *prev = p->prev_run; + + /* remove from list */ + next->prev_run = prev; + prev->next_run = next; + /* add back to list */ + p->next_run = &init_task; + prev = init_task.prev_run; + init_task.prev_run = p; + p->prev_run = prev; + prev->next_run = p; +} + +/* + * Wake up a process. Put it on the run-queue if it's not + * already there. The "current" process is always on the + * run-queue (except when the actual re-schedule is in + * progress), and as such you're allowed to do the simpler + * "current->state = TASK_RUNNING" to mark yourself runnable + * without the overhead of this. + */ +inline void wake_up_process(struct task_struct * p) +{ + unsigned long flags; + + save_flags(flags); + cli(); + p->state = TASK_RUNNING; + if (!p->next_run) + add_to_runqueue(p); + restore_flags(flags); +} + +static void process_timeout(unsigned long __data) +{ + struct task_struct * p = (struct task_struct *) __data; + + p->timeout = 0; + wake_up_process(p); +} + +/* + * This is the function that decides how desirable a process is.. + * You can weigh different processes against each other depending + * on what CPU they've run on lately etc to try to handle cache + * and TLB miss penalties. + * + * Return values: + * -1000: never select this + * 0: out of time, recalculate counters (but it might still be + * selected) + * +ve: "goodness" value (the larger, the better) + * +1000: realtime process, select this. + */ +static inline int goodness(struct task_struct * p, struct task_struct * prev, int this_cpu) +{ + int weight; + +#ifdef __SMP__ + /* We are not permitted to run a task someone else is running */ + if (p->processor != NO_PROC_ID) + return -1000; +#ifdef PAST_2_0 + /* This process is locked to a processor group */ + if (p->processor_mask && !(p->processor_mask & (1<policy != SCHED_OTHER) + return 1000 + p->rt_priority; + + /* + * Give the process a first-approximation goodness value + * according to the number of clock-ticks it has left. + * + * Don't do any other calculations if the time slice is + * over.. + */ + weight = p->counter; + if (weight) { + +#ifdef __SMP__ + /* Give a largish advantage to the same processor... */ + /* (this is equivalent to penalizing other processors) */ + if (p->last_processor == this_cpu) + weight += PROC_CHANGE_PENALTY; +#endif + + /* .. and a slight advantage to the current process */ + if (p == prev) + weight += 1; + } + + return weight; +} /* * 'schedule()' is the scheduler function. It's a very simple and nice * scheduler: it's not perfect, but certainly works for most things. - * The one thing you might take a look at is the signal-handler code here. + * + * The goto is "interesting". * * NOTE!! Task 0 is the 'idle' task, which gets called when no other * tasks can run. It can not be killed, and it cannot sleep. The 'state' * information in task[0] is never used. - * - * The "confuse_gcc" goto is used only to get better assembly code.. - * Dijkstra probably hates me. */ asmlinkage void schedule(void) { int c; struct task_struct * p; - struct task_struct * next; - unsigned long ticks; + struct task_struct * prev, * next; + unsigned long timeout = 0; + int this_cpu=smp_processor_id(); /* check alarm, wake up any interruptible tasks that have got a signal */ - if (intr_count) { - printk("Aiee: scheduling in interrupt\n"); + if (intr_count) + goto scheduling_in_interrupt; + + if (bh_active & bh_mask) { + intr_count = 1; + do_bottom_half(); intr_count = 0; } + run_task_queue(&tq_scheduler); - cli(); - ticks = itimer_ticks; - itimer_ticks = 0; - itimer_next = ~0; - sti(); + need_resched = 0; - nr_running = 0; - p = &init_task; - for (;;) { - if ((p = p->next_task) == &init_task) - goto confuse_gcc1; - if (ticks && p->it_real_value) { - if (p->it_real_value <= ticks) { - send_sig(SIGALRM, p, 1); - if (!p->it_real_incr) { - p->it_real_value = 0; - goto end_itimer; - } - do { - p->it_real_value += p->it_real_incr; - } while (p->it_real_value <= ticks); + prev = current; + cli(); + /* move an exhausted RR process to be last.. */ + if (!prev->counter && prev->policy == SCHED_RR) { + prev->counter = prev->priority; + move_last_runqueue(prev); + } + switch (prev->state) { + case TASK_INTERRUPTIBLE: + if (prev->signal & ~prev->blocked) + goto makerunnable; + timeout = prev->timeout; + if (timeout && (timeout <= jiffies)) { + prev->timeout = 0; + timeout = 0; + makerunnable: + prev->state = TASK_RUNNING; + break; } - p->it_real_value -= ticks; - if (p->it_real_value < itimer_next) - itimer_next = p->it_real_value; - } -end_itimer: - if (p->state != TASK_INTERRUPTIBLE) - continue; - if (p->signal & ~p->blocked) { - p->state = TASK_RUNNING; - continue; - } - if (p->timeout && p->timeout <= jiffies) { - p->timeout = 0; - p->state = TASK_RUNNING; - } + default: + del_from_runqueue(prev); + case TASK_RUNNING: } -confuse_gcc1: + p = init_task.next_run; + sti(); + +#ifdef __SMP__ + /* + * This is safe as we do not permit re-entry of schedule() + */ + prev->processor = NO_PROC_ID; +#define idle_task (task[cpu_number_map[this_cpu]]) +#else +#define idle_task (&init_task) +#endif +/* + * Note! there may appear new tasks on the run-queue during this, as + * interrupts are enabled. However, they will be put on front of the + * list, so our list starting at "p" is essentially fixed. + */ /* this is the scheduler proper: */ -#if 0 - /* give processes that go to sleep a bit higher priority.. */ - /* This depends on the values for TASK_XXX */ - /* This gives smoother scheduling for some things, but */ - /* can be very unfair under some circumstances, so.. */ - if (TASK_UNINTERRUPTIBLE >= (unsigned) current->state && - current->counter < current->priority*2) { - ++current->counter; - } -#endif c = -1000; - next = p = &init_task; - for (;;) { - if ((p = p->next_task) == &init_task) - goto confuse_gcc2; - if (p->state == TASK_RUNNING) { - nr_running++; - if (p->counter > c) - c = p->counter, next = p; - } + next = idle_task; + while (p != &init_task) { + int weight = goodness(p, prev, this_cpu); + if (weight > c) + c = weight, next = p; + p = p->next_run; } -confuse_gcc2: + + /* if all runnable processes have "counter == 0", re-calculate counters */ if (!c) { for_each_task(p) p->counter = (p->counter >> 1) + p->priority; } - if (current == next) - return; - kstat.context_swtch++; +#ifdef __SMP__ + /* + * Allocate process to CPU + */ + + next->processor = this_cpu; + next->last_processor = this_cpu; +#endif +#ifdef __SMP_PROF__ + /* mark processor running an idle thread */ + if (0==next->pid) + set_bit(this_cpu,&smp_idle_map); + else + clear_bit(this_cpu,&smp_idle_map); +#endif + if (prev != next) { + struct timer_list timer; + + kstat.context_swtch++; + if (timeout) { + init_timer(&timer); + timer.expires = timeout; + timer.data = (unsigned long) prev; + timer.function = process_timeout; + add_timer(&timer); + } + + get_mmu_context(next); + switch_to(prev,next); + if (timeout) + del_timer(&timer); + } + return; - switch_to(next); +scheduling_in_interrupt: + printk("Aiee: scheduling in interrupt %p\n", + return_address()); +/* + * System is probably fucked up anyway beyond a save landing; prevent + * messages on the screen from scrolling away. + */ +while(1); } +#ifndef __alpha__ + +/* + * For backwards compatibility? This can be done in libc so Alpha + * and all newer ports shouldn't need it. + */ asmlinkage int sys_pause(void) { current->state = TASK_INTERRUPTIBLE; @@ -202,6 +430,8 @@ asmlinkage int sys_pause(void) return -ERESTARTNOHAND; } +#endif + /* * wake_up doesn't wake up stopped processes - they have to be awakened * with signals or similar. @@ -212,70 +442,139 @@ asmlinkage int sys_pause(void) */ void wake_up(struct wait_queue **q) { - struct wait_queue *tmp; - struct task_struct * p; + struct wait_queue *next; + struct wait_queue *head; - if (!q || !(tmp = *q)) + if (!q || !(next = *q)) return; - do { - if ((p = tmp->task) != NULL) { + head = WAIT_QUEUE_HEAD(q); + while (next != head) { + struct task_struct *p = next->task; + next = next->next; + if (p != NULL) { if ((p->state == TASK_UNINTERRUPTIBLE) || - (p->state == TASK_INTERRUPTIBLE)) { - p->state = TASK_RUNNING; - if (p->counter > current->counter + 3) - need_resched = 1; - } + (p->state == TASK_INTERRUPTIBLE)) + wake_up_process(p); } - if (!tmp->next) { - printk("wait_queue is bad (eip = %p)\n", - __builtin_return_address(0)); - printk(" q = %p\n",q); - printk(" *q = %p\n",*q); - printk(" tmp = %p\n",tmp); - break; - } - tmp = tmp->next; - } while (tmp != *q); + if (!next) + goto bad; + } + return; +bad: + printk("wait_queue is bad (eip = %p)\n", + __builtin_return_address(0)); + printk(" q = %p\n",q); + printk(" *q = %p\n",*q); } void wake_up_interruptible(struct wait_queue **q) { - struct wait_queue *tmp; - struct task_struct * p; + struct wait_queue *next; + struct wait_queue *head; - if (!q || !(tmp = *q)) + if (!q || !(next = *q)) return; - do { - if ((p = tmp->task) != NULL) { - if (p->state == TASK_INTERRUPTIBLE) { - p->state = TASK_RUNNING; - if (p->counter > current->counter + 3) - need_resched = 1; - } + head = WAIT_QUEUE_HEAD(q); + while (next != head) { + struct task_struct *p = next->task; + next = next->next; + if (p != NULL) { + if (p->state == TASK_INTERRUPTIBLE) + wake_up_process(p); } - if (!tmp->next) { - printk("wait_queue is bad (eip = %p)\n", - __builtin_return_address(0)); - printk(" q = %p\n",q); - printk(" *q = %p\n",*q); - printk(" tmp = %p\n",tmp); - break; - } - tmp = tmp->next; - } while (tmp != *q); + if (!next) + goto bad; + } + return; +bad: + printk("wait_queue is bad (eip = %p)\n", + __builtin_return_address(0)); + printk(" q = %p\n",q); + printk(" *q = %p\n",*q); +} + +/* + * Semaphores are implemented using a two-way counter: + * The "count" variable is decremented for each process + * that tries to sleep, while the "waiting" variable is + * incremented _while_ the process is sleeping on that + * semaphore. + * + * Notably, the inline "up()" and "down()" functions can + * efficiently test if they need to do any extra work (up + * needs to do something only if count was negative before + * the increment operation. + */ +static inline void normalize_semaphore(struct semaphore *sem) +{ + atomic_add(xchg(&sem->waiting,0), &sem->count); +} + +/* + * When __up() is called, the count was negative before + * incrementing it, and we need to wake up somebody. In + * most cases "waiting" will be positive, and the normalization + * will allow things to continue. However, if somebody has + * /just/ done a down(), it may be that count was negative + * without waiting being positive (or in the generic case + * "count is more negative than waiting is positive"), and + * the waiter needs to check this itself (see __down). + * + * Note that these functions are only called when there is + * contention on the lock, and as such all this is the + * "non-critical" part of the whole semaphore business. The + * critical part is the inline stuff in + * where we want to avoid any extra jumps and calls. + */ +void __up(struct semaphore *sem) +{ + normalize_semaphore(sem); + wake_up(&sem->wait); } void __down(struct semaphore * sem) { - struct wait_queue wait = { current, NULL }; + struct task_struct *tsk = current; + struct wait_queue wait = { tsk, NULL }; + + /* + * The order here is important. We add ourselves to the + * wait queues and mark ourselves sleeping _first_. That + * way, if a "up()" comes in here, we'll either get + * woken up (up happens after the wait queues are set up) + * OR we'll have "waiting > 0". + */ + tsk->state = TASK_UNINTERRUPTIBLE; add_wait_queue(&sem->wait, &wait); - current->state = TASK_UNINTERRUPTIBLE; - while (sem->count <= 0) { - schedule(); - current->state = TASK_UNINTERRUPTIBLE; + atomic_inc(&sem->waiting); + + /* + * Ok, we're set up. The only race here is really that + * an "up()" might have incremented count before we got + * here, so we check "count+waiting". If that is larger + * than zero, we shouldn't sleep, but re-try the lock. + */ + if (sem->count+sem->waiting <= 0) { + /* + * If "count+waiting" <= 0, we have to wait + * for a up(), which will normalize the count. + * Remember, at this point we have decremented + * count, and incremented up, so if count is + * zero or positive we need to return to re-try + * the lock. It _may_ be that both count and + * waiting is zero and that it is still locked, + * but we still want to re-try the lock in that + * case to make count go negative again so that + * the optimized "up()" wake_up sequence works. + */ + do { + schedule(); + tsk->state = TASK_UNINTERRUPTIBLE; + } while (sem->count < 0); } - current->state = TASK_RUNNING; + tsk->state = TASK_RUNNING; remove_wait_queue(&sem->wait, &wait); + normalize_semaphore(sem); } static inline void __sleep_on(struct wait_queue **p, int state) @@ -288,11 +587,13 @@ static inline void __sleep_on(struct wait_queue **p, int state) if (current == task[0]) panic("task[0] trying to sleep"); current->state = state; - add_wait_queue(p, &wait); save_flags(flags); + cli(); + __add_wait_queue(p, &wait); sti(); schedule(); - remove_wait_queue(p, &wait); + cli(); + __remove_wait_queue(p, &wait); restore_flags(flags); } @@ -311,7 +612,7 @@ void sleep_on(struct wait_queue **p) * and the sorting routine counts on this.. */ static struct timer_list timer_head = { &timer_head, &timer_head, ~0, 0, NULL }; -#define SLOW_BUT_DEBUGGING_TIMERS 1 +#define SLOW_BUT_DEBUGGING_TIMERS 0 void add_timer(struct timer_list * timer) { @@ -326,7 +627,6 @@ void add_timer(struct timer_list * timer) } #endif p = &timer_head; - timer->expires += jiffies; save_flags(flags); cli(); do { @@ -341,42 +641,66 @@ void add_timer(struct timer_list * timer) int del_timer(struct timer_list * timer) { - unsigned long flags; -#if SLOW_BUT_DEBUGGING_TIMERS - struct timer_list * p; - - p = &timer_head; - save_flags(flags); - cli(); - while ((p = p->next) != &timer_head) { - if (p == timer) { - timer->next->prev = timer->prev; - timer->prev->next = timer->next; + int ret = 0; + if (timer->next) { + unsigned long flags; + struct timer_list * next; + save_flags(flags); + cli(); + if ((next = timer->next) != NULL) { + (next->prev = timer->prev)->next = next; timer->next = timer->prev = NULL; - restore_flags(flags); - timer->expires -= jiffies; - return 1; + ret = 1; } + restore_flags(flags); } - if (timer->next || timer->prev) - printk("del_timer() called from %p with timer not initialized\n", - __builtin_return_address(0)); - restore_flags(flags); - return 0; -#else - save_flags(flags); + return ret; +} + +static inline void run_timer_list(void) +{ + struct timer_list * timer; + cli(); - if (timer->next) { + while ((timer = timer_head.next) != &timer_head && timer->expires <= jiffies) { + void (*fn)(unsigned long) = timer->function; + unsigned long data = timer->data; timer->next->prev = timer->prev; timer->prev->next = timer->next; timer->next = timer->prev = NULL; - restore_flags(flags); - timer->expires -= jiffies; - return 1; + sti(); + fn(data); + cli(); } - restore_flags(flags); - return 0; -#endif + sti(); +} + +static inline void run_old_timers(void) +{ + struct timer_struct *tp; + unsigned long mask; + + for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) { + if (mask > timer_active) + break; + if (!(mask & timer_active)) + continue; + if (tp->expires > jiffies) + continue; + timer_active &= ~mask; + tp->fn(); + sti(); + } +} + +void tqueue_bh(void) +{ + run_task_queue(&tq_timer); +} + +void immediate_bh(void) +{ + run_task_queue(&tq_immediate); } unsigned long timer_active = 0; @@ -403,21 +727,25 @@ static unsigned long count_active_tasks(void) (*p)->state == TASK_UNINTERRUPTIBLE || (*p)->state == TASK_SWAPPING)) nr += FIXED_1; +#ifdef __SMP__ + nr-=(smp_num_cpus-1)*FIXED_1; +#endif return nr; } -static inline void calc_load(void) +static inline void calc_load(unsigned long ticks) { unsigned long active_tasks; /* fixed-point */ static int count = LOAD_FREQ; - if (count-- > 0) - return; - count = LOAD_FREQ; - active_tasks = count_active_tasks(); - CALC_LOAD(avenrun[0], EXP_1, active_tasks); - CALC_LOAD(avenrun[1], EXP_5, active_tasks); - CALC_LOAD(avenrun[2], EXP_15, active_tasks); + count -= ticks; + if (count < 0) { + count += LOAD_FREQ; + active_tasks = count_active_tasks(); + CALC_LOAD(avenrun[0], EXP_1, active_tasks); + CALC_LOAD(avenrun[1], EXP_5, active_tasks); + CALC_LOAD(avenrun[2], EXP_15, active_tasks); + } } /* @@ -428,138 +756,138 @@ static inline void calc_load(void) * They were originally developed for SUN and DEC kernels. * All the kudos should go to Dave for this stuff. * - * These were ported to Linux by Philip Gladstone. */ static void second_overflow(void) { - long ltemp; - - /* Bump the maxerror field */ - time_maxerror = (0x70000000-time_maxerror < time_tolerance) ? - 0x70000000 : (time_maxerror + time_tolerance); - - /* Run the PLL */ - if (time_offset < 0) { - ltemp = (-(time_offset+1) >> (SHIFT_KG + time_constant)) + 1; - time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - time_offset += (time_adj * HZ) >> (SHIFT_SCALE - SHIFT_UPDATE); - time_adj = - time_adj; - } else if (time_offset > 0) { - ltemp = ((time_offset-1) >> (SHIFT_KG + time_constant)) + 1; - time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - time_offset -= (time_adj * HZ) >> (SHIFT_SCALE - SHIFT_UPDATE); - } else { - time_adj = 0; - } - - time_adj += (time_freq >> (SHIFT_KF + SHIFT_HZ - SHIFT_SCALE)) - + FINETUNE; - - /* Handle the leap second stuff */ - switch (time_status) { - case TIME_INS: - /* ugly divide should be replaced */ - if (xtime.tv_sec % 86400 == 0) { - xtime.tv_sec--; /* !! */ - time_status = TIME_OOP; - printk("Clock: inserting leap second 23:59:60 UTC\n"); - } - break; - - case TIME_DEL: - /* ugly divide should be replaced */ - if (xtime.tv_sec % 86400 == 86399) { - xtime.tv_sec++; - time_status = TIME_OK; - printk("Clock: deleting leap second 23:59:59 UTC\n"); - } - break; - - case TIME_OOP: - time_status = TIME_OK; - break; + long ltemp; + + /* Bump the maxerror field */ + time_maxerror += time_tolerance >> SHIFT_USEC; + if ( time_maxerror > MAXPHASE ) + time_maxerror = MAXPHASE; + + /* + * Leap second processing. If in leap-insert state at + * the end of the day, the system clock is set back one + * second; if in leap-delete state, the system clock is + * set ahead one second. The microtime() routine or + * external clock driver will insure that reported time + * is always monotonic. The ugly divides should be + * replaced. + */ + switch (time_state) { + + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + + case TIME_INS: + if (xtime.tv_sec % 86400 == 0) { + xtime.tv_sec--; + time_state = TIME_OOP; + printk("Clock: inserting leap second 23:59:60 UTC\n"); } -} - -/* - * disregard lost ticks for now.. We don't care enough. - */ -static void timer_bh(void * unused) -{ - unsigned long mask; - struct timer_struct *tp; - struct timer_list * timer; + break; - cli(); - while ((timer = timer_head.next) != &timer_head && timer->expires < jiffies) { - void (*fn)(unsigned long) = timer->function; - unsigned long data = timer->data; - timer->next->prev = timer->prev; - timer->prev->next = timer->next; - timer->next = timer->prev = NULL; - sti(); - fn(data); - cli(); - } - sti(); - - for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) { - if (mask > timer_active) - break; - if (!(mask & timer_active)) - continue; - if (tp->expires > jiffies) - continue; - timer_active &= ~mask; - tp->fn(); - sti(); + case TIME_DEL: + if ((xtime.tv_sec + 1) % 86400 == 0) { + xtime.tv_sec++; + time_state = TIME_WAIT; + printk("Clock: deleting leap second 23:59:59 UTC\n"); } + break; + + case TIME_OOP: + time_state = TIME_WAIT; + break; + + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + } + + /* + * Compute the phase adjustment for the next second. In + * PLL mode, the offset is reduced by a fixed factor + * times the time constant. In FLL mode the offset is + * used directly. In either mode, the maximum phase + * adjustment for each second is clamped so as to spread + * the adjustment over not more than the number of + * seconds between updates. + */ + if (time_offset < 0) { + ltemp = -time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + time_offset += ltemp; + time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); + } else { + ltemp = time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + time_offset -= ltemp; + time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); + } + + /* + * Compute the frequency estimate and additional phase + * adjustment due to frequency error for the next + * second. When the PPS signal is engaged, gnaw on the + * watchdog counter and update the frequency computed by + * the pll and the PPS signal. + */ + pps_valid++; + if (pps_valid == PPS_VALID) { + pps_jitter = MAXTIME; + pps_stabil = MAXFREQ; + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + } + ltemp = time_freq + pps_freq; + if (ltemp < 0) + time_adj -= -ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + else + time_adj += ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + +#if HZ == 100 + /* compensate for (HZ==100) != 128. Add 25% to get 125; => only 3% error */ + if (time_adj < 0) + time_adj -= -time_adj >> 2; + else + time_adj += time_adj >> 2; +#endif } -void tqueue_bh(void * unused) -{ - run_task_queue(&tq_timer); -} - -void immediate_bh(void * unused) -{ - run_task_queue(&tq_immediate); -} - -/* - * The int argument is really a (struct pt_regs *), in case the - * interrupt wants to know from where it was called. The timer - * irq uses this to decide if it should update the user or system - * times. - */ -static void do_timer(int irq, struct pt_regs * regs) +/* in the NTP reference this is called "hardclock()" */ +static void update_wall_time_one_tick(void) { - unsigned long mask; - struct timer_struct *tp; - /* last time the cmos clock got updated */ - static long last_rtc_update=0; - extern int set_rtc_mmss(unsigned long); - - long ltemp, psecs; - - /* Advance the phase, once it gets to one microsecond, then + /* + * Advance the phase, once it gets to one microsecond, then * advance the tick more. */ time_phase += time_adj; - if (time_phase < -FINEUSEC) { - ltemp = -time_phase >> SHIFT_SCALE; + if (time_phase <= -FINEUSEC) { + long ltemp = -time_phase >> SHIFT_SCALE; time_phase += ltemp << SHIFT_SCALE; xtime.tv_usec += tick + time_adjust_step - ltemp; } - else if (time_phase > FINEUSEC) { - ltemp = time_phase >> SHIFT_SCALE; + else if (time_phase >= FINEUSEC) { + long ltemp = time_phase >> SHIFT_SCALE; time_phase -= ltemp << SHIFT_SCALE; xtime.tv_usec += tick + time_adjust_step + ltemp; } else xtime.tv_usec += tick + time_adjust_step; - if (time_adjust) - { + if (time_adjust) { /* We are doing an adjtime thing. * * Modify the value of the tick for next time. @@ -570,123 +898,240 @@ static void do_timer(int irq, struct pt_regs * regs) * in the range -tickadj .. +tickadj */ if (time_adjust > tickadj) - time_adjust_step = tickadj; + time_adjust_step = tickadj; else if (time_adjust < -tickadj) - time_adjust_step = -tickadj; + time_adjust_step = -tickadj; else - time_adjust_step = time_adjust; + time_adjust_step = time_adjust; /* Reduce by this step the amount of time left */ time_adjust -= time_adjust_step; } else time_adjust_step = 0; +} + +/* + * Using a loop looks inefficient, but "ticks" is + * usually just one (we shouldn't be losing ticks, + * we're doing this this way mainly for interrupt + * latency reasons, not because we think we'll + * have lots of lost timer ticks + */ +static void update_wall_time(unsigned long ticks) +{ + do { + ticks--; + update_wall_time_one_tick(); + } while (ticks); if (xtime.tv_usec >= 1000000) { xtime.tv_usec -= 1000000; xtime.tv_sec++; second_overflow(); } +} - /* If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - */ - if (time_status != TIME_BAD && xtime.tv_sec > last_rtc_update + 660 && - xtime.tv_usec > 500000 - (tick >> 1) && - xtime.tv_usec < 500000 + (tick >> 1)) - if (set_rtc_mmss(xtime.tv_sec) == 0) - last_rtc_update = xtime.tv_sec; - else - last_rtc_update = xtime.tv_sec - 600; /* do it again in 60 s */ - - jiffies++; - calc_load(); - if (user_mode(regs)) { - current->utime++; - if (current != task[0]) { - if (current->priority < 15) - kstat.cpu_nice++; - else - kstat.cpu_user++; +static inline void do_process_times(struct task_struct *p, + unsigned long user, unsigned long system) +{ + long psecs; + + p->utime += user; + p->stime += system; + + psecs = (p->stime + p->utime) / HZ; + if (psecs > p->rlim[RLIMIT_CPU].rlim_cur) { + /* Send SIGXCPU every second.. */ + if (psecs * HZ == p->stime + p->utime) + send_sig(SIGXCPU, p, 1); + /* and SIGKILL when we go over max.. */ + if (psecs > p->rlim[RLIMIT_CPU].rlim_max) + send_sig(SIGKILL, p, 1); + } +} + +static inline void do_it_virt(struct task_struct * p, unsigned long ticks) +{ + unsigned long it_virt = p->it_virt_value; + + if (it_virt) { + if (it_virt <= ticks) { + it_virt = ticks + p->it_virt_incr; + send_sig(SIGVTALRM, p, 1); } - /* Update ITIMER_VIRT for current task if not in a system call */ - if (current->it_virt_value && !(--current->it_virt_value)) { - current->it_virt_value = current->it_virt_incr; - send_sig(SIGVTALRM,current,1); + p->it_virt_value = it_virt - ticks; + } +} + +static inline void do_it_prof(struct task_struct * p, unsigned long ticks) +{ + unsigned long it_prof = p->it_prof_value; + + if (it_prof) { + if (it_prof <= ticks) { + it_prof = ticks + p->it_prof_incr; + send_sig(SIGPROF, p, 1); } - } else { - current->stime++; - if(current != task[0]) - kstat.cpu_system++; -#ifdef CONFIG_PROFILE - if (prof_buffer && current != task[0]) { - extern int _stext; - unsigned long eip = regs->eip - (unsigned long) &_stext; - eip >>= CONFIG_PROFILE_SHIFT; - if (eip < prof_len) - prof_buffer[eip]++; + p->it_prof_value = it_prof - ticks; + } +} + +static __inline__ void update_one_process(struct task_struct *p, + unsigned long ticks, unsigned long user, unsigned long system) +{ + do_process_times(p, user, system); + do_it_virt(p, user); + do_it_prof(p, ticks); +} + +static void update_process_times(unsigned long ticks, unsigned long system) +{ +#ifndef __SMP__ + struct task_struct * p = current; + unsigned long user = ticks - system; + if (p->pid) { + p->counter -= ticks; + if (p->counter < 0) { + p->counter = 0; + need_resched = 1; } -#endif + if (p->priority < DEF_PRIORITY) + kstat.cpu_nice += user; + else + kstat.cpu_user += user; + kstat.cpu_system += system; } - /* - * check the cpu time limit on the process. - */ - if ((current->rlim[RLIMIT_CPU].rlim_max != RLIM_INFINITY) && - (((current->stime + current->utime) / HZ) >= current->rlim[RLIMIT_CPU].rlim_max)) - send_sig(SIGKILL, current, 1); - if ((current->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) && - (((current->stime + current->utime) % HZ) == 0)) { - psecs = (current->stime + current->utime) / HZ; - /* send when equal */ - if (psecs == current->rlim[RLIMIT_CPU].rlim_cur) - send_sig(SIGXCPU, current, 1); - /* and every five seconds thereafter. */ - else if ((psecs > current->rlim[RLIMIT_CPU].rlim_cur) && - ((psecs - current->rlim[RLIMIT_CPU].rlim_cur) % 5) == 0) - send_sig(SIGXCPU, current, 1); - } - - if (current != task[0] && 0 > --current->counter) { - current->counter = 0; - need_resched = 1; + update_one_process(p, ticks, user, system); +#else + int cpu,j; + cpu = smp_processor_id(); + for (j=0;jpid) { + /* assume user-mode process */ + unsigned long utime = ticks; + unsigned long stime = 0; + if (cpu == i) { + utime = ticks-system; + stime = system; + } else if (smp_proc_in_lock[j]) { + utime = 0; + stime = ticks; + } + update_one_process(p, ticks, utime, stime); + + if (p->priority < DEF_PRIORITY) + kstat.cpu_nice += utime; + else + kstat.cpu_user += utime; + kstat.cpu_system += stime; + + p->counter -= ticks; + if (p->counter >= 0) + continue; + p->counter = 0; + } else { + /* + * Idle processor found, do we have anything + * we could run? + */ + if (!(0x7fffffff & smp_process_available)) + continue; + } + /* Ok, we should reschedule, do the magic */ + if (i==cpu) + need_resched = 1; + else + smp_message_pass(i, MSG_RESCHEDULE, 0L, 0); } - /* Update ITIMER_PROF for the current task */ - if (current->it_prof_value && !(--current->it_prof_value)) { - current->it_prof_value = current->it_prof_incr; - send_sig(SIGPROF,current,1); +#endif +} + +static unsigned long lost_ticks = 0; +static unsigned long lost_ticks_system = 0; + +static inline void update_times(void) +{ + unsigned long ticks; + + ticks = xchg(&lost_ticks, 0); + + if (ticks) { + unsigned long system; + + system = xchg(&lost_ticks_system, 0); + calc_load(ticks); + update_wall_time(ticks); + update_process_times(ticks, system); } - for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) { - if (mask > timer_active) - break; - if (!(mask & timer_active)) - continue; - if (tp->expires > jiffies) - continue; - mark_bh(TIMER_BH); +} + +static void timer_bh(void) +{ + update_times(); + run_old_timers(); + run_timer_list(); +} + +void do_timer(struct pt_regs * regs) +{ + (*(unsigned long *)&jiffies)++; + lost_ticks++; + mark_bh(TIMER_BH); + if (!user_mode(regs)) { + lost_ticks_system++; + if (prof_buffer && current->pid) { + extern int _stext; + unsigned long ip = instruction_pointer(regs); + ip -= (unsigned long) &_stext; + ip >>= prof_shift; + if (ip < prof_len) + prof_buffer[ip]++; + } } - cli(); - itimer_ticks++; - if (itimer_ticks > itimer_next) - need_resched = 1; - if (timer_head.next->expires < jiffies) - mark_bh(TIMER_BH); - if (tq_timer != &tq_last) + if (tq_timer) mark_bh(TQUEUE_BH); - sti(); } -asmlinkage int sys_alarm(long seconds) +#ifndef __alpha__ + +/* + * For backwards compatibility? This can be done in libc so Alpha + * and all newer ports shouldn't need it. + */ +asmlinkage unsigned int sys_alarm(unsigned int seconds) { struct itimerval it_new, it_old; + unsigned int oldalarm; it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; it_new.it_value.tv_sec = seconds; it_new.it_value.tv_usec = 0; _setitimer(ITIMER_REAL, &it_new, &it_old); - return(it_old.it_value.tv_sec + (it_old.it_value.tv_usec / 1000000)); + oldalarm = it_old.it_value.tv_sec; + /* ehhh.. We can't return 0 if we have an alarm pending.. */ + /* And we'd better return too much than too little anyway */ + if (it_old.it_value.tv_usec) + oldalarm++; + return oldalarm; } +/* + * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this + * should be moved into arch/i386 instead? + */ asmlinkage int sys_getpid(void) { return current->pid; @@ -717,47 +1162,286 @@ asmlinkage int sys_getegid(void) return current->egid; } -asmlinkage int sys_nice(long increment) +/* + * This has been replaced by sys_setpriority. Maybe it should be + * moved into the arch dependent tree for those ports that require + * it for backward compatibility? + */ +asmlinkage int sys_nice(int increment) { - int newprio; - - if (increment < 0 && !suser()) - return -EPERM; + unsigned long newprio; + int increase = 0; + + newprio = increment; + if (increment < 0) { + if (!suser()) + return -EPERM; + newprio = -increment; + increase = 1; + } + if (newprio > 40) + newprio = 40; + /* + * do a "normalization" of the priority (traditionally + * unix nice values are -20..20, linux doesn't really + * use that kind of thing, but uses the length of the + * timeslice instead (default 150 msec). The rounding is + * why we want to avoid negative values. + */ + newprio = (newprio * DEF_PRIORITY + 10) / 20; + increment = newprio; + if (increase) + increment = -increment; newprio = current->priority - increment; - if (newprio < 1) + if ((signed) newprio < 1) newprio = 1; - if (newprio > 35) - newprio = 35; + if (newprio > DEF_PRIORITY*2) + newprio = DEF_PRIORITY*2; current->priority = newprio; return 0; } +#endif + +static struct task_struct *find_process_by_pid(pid_t pid) +{ + struct task_struct *p; + + p = current; + if (pid) { + for_each_task(p) { + if (p->pid == pid) + goto found; + } + p = NULL; + } +found: + return p; +} + +static int setscheduler(pid_t pid, int policy, + struct sched_param *param) +{ + struct sched_param lp; + struct task_struct *p; + + if (!param || pid < 0) + return -EINVAL; + + if (copy_from_user(&lp, param, sizeof(struct sched_param))) + return -EFAULT; + + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + if (policy < 0) + policy = p->policy; + else if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_OTHER) + return -EINVAL; + + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid + * priority for SCHED_OTHER is 0. + */ + if (lp.sched_priority < 0 || lp.sched_priority > 99) + return -EINVAL; + if ((policy == SCHED_OTHER) != (lp.sched_priority == 0)) + return -EINVAL; + + if ((policy == SCHED_FIFO || policy == SCHED_RR) && !suser()) + return -EPERM; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !suser()) + return -EPERM; + + p->policy = policy; + p->rt_priority = lp.sched_priority; + cli(); + if (p->next_run) + move_last_runqueue(p); + sti(); + schedule(); + + return 0; +} + +asmlinkage int sys_sched_setscheduler(pid_t pid, int policy, + struct sched_param *param) +{ + return setscheduler(pid, policy, param); +} + +asmlinkage int sys_sched_setparam(pid_t pid, struct sched_param *param) +{ + return setscheduler(pid, -1, param); +} + +asmlinkage int sys_sched_getscheduler(pid_t pid) +{ + struct task_struct *p; + + if (pid < 0) + return -EINVAL; + + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + return p->policy; +} + +asmlinkage int sys_sched_getparam(pid_t pid, struct sched_param *param) +{ + struct task_struct *p; + struct sched_param lp; + + if (!param || pid < 0) + return -EINVAL; + + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + lp.sched_priority = p->rt_priority; + return copy_to_user(param, &lp, sizeof(struct sched_param)) ? -EFAULT : 0; +} + +asmlinkage int sys_sched_yield(void) +{ + cli(); + move_last_runqueue(current); + sti(); + return 0; +} + +asmlinkage int sys_sched_get_priority_max(int policy) +{ + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + return 99; + case SCHED_OTHER: + return 0; + } + + return -EINVAL; +} + +asmlinkage int sys_sched_get_priority_min(int policy) +{ + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + return 1; + case SCHED_OTHER: + return 0; + } + + return -EINVAL; +} + +asmlinkage int sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) +{ + struct timespec t; + + t.tv_sec = 0; + t.tv_nsec = 0; /* <-- Linus, please fill correct value in here */ + return -ENOSYS; /* and then delete this line. Thanks! */ + return copy_to_user(interval, &t, sizeof(struct timespec)) ? -EFAULT : 0; +} + +/* + * change timeval to jiffies, trying to avoid the + * most obvious overflows.. + */ +static unsigned long timespectojiffies(struct timespec *value) +{ + unsigned long sec = (unsigned) value->tv_sec; + long nsec = value->tv_nsec; + + if (sec > (LONG_MAX / HZ)) + return LONG_MAX; + nsec += 1000000000L / HZ - 1; + nsec /= 1000000000L / HZ; + return HZ * sec + nsec; +} + +static void jiffiestotimespec(unsigned long jiffies, struct timespec *value) +{ + value->tv_nsec = (jiffies % HZ) * (1000000000L / HZ); + value->tv_sec = jiffies / HZ; + return; +} + +asmlinkage int sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp) +{ + int error; + struct timespec t; + unsigned long expire; + + error = copy_from_user(&t, rqtp, sizeof(struct timespec)); + if (error) + return -EFAULT; + + if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0) + return -EINVAL; + + if (t.tv_sec == 0 && t.tv_nsec <= 2000000L && + current->policy != SCHED_OTHER) { + /* + * Short delay requests up to 2 ms will be handled with + * high precision by a busy wait for all real-time processes. + */ + udelay((t.tv_nsec + 999) / 1000); + return 0; + } + + expire = timespectojiffies(&t) + (t.tv_sec || t.tv_nsec) + jiffies; + current->timeout = expire; + current->state = TASK_INTERRUPTIBLE; + schedule(); + + if (expire > jiffies) { + if (rmtp) { + jiffiestotimespec(expire - jiffies - + (expire > jiffies + 1), &t); + if (copy_to_user(rmtp, &t, sizeof(struct timespec))) + return -EFAULT; + } + return -EINTR; + } + + return 0; +} + static void show_task(int nr,struct task_struct * p) { unsigned long free; - static char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" }; + static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" }; printk("%-8s %3d ", p->comm, (p == current) ? -nr : nr); if (((unsigned) p->state) < sizeof(stat_nam)/sizeof(char *)) printk(stat_nam[p->state]); else printk(" "); -#ifdef __i386__ +#if ((~0UL) == 0xffffffff) if (p == current) printk(" current "); else - printk(" %08lX ", ((unsigned long *)p->tss.esp)[3]); -#elif defined (__mips__) + printk(" %08lX ", thread_saved_pc(&p->tss)); +#else if (p == current) - printk(" current "); + printk(" current task "); else - printk(" "); + printk(" %016lx ", thread_saved_pc(&p->tss)); #endif - for (free = 1; free < 1024 ; free++) { + for (free = 1; free < PAGE_SIZE/sizeof(long) ; free++) { if (((unsigned long *)p->kernel_stack_page)[free]) break; } - printk("%5lu %5d %6d ", free << 2, p->pid, p->p_pptr->pid); + printk("%5lu %5d %6d ", free*sizeof(long), p->pid, p->p_pptr->pid); if (p->p_cptr) printk("%5d ", p->p_cptr->pid); else @@ -776,8 +1460,15 @@ void show_state(void) { int i; - printk(" free sibling\n"); +#if ((~0UL) == 0xffffffff) + printk("\n" + " free sibling\n"); printk(" task PC stack pid father child younger older\n"); +#else + printk("\n" + " free sibling\n"); + printk(" task PC stack pid father child younger older\n"); +#endif for (i=0 ; i #include -#include +#include #define _S(nr) (1<<((nr)-1)) #define _BLOCKABLE (~(_S(SIGKILL) | _S(SIGSTOP))) +#if !defined(__alpha__) && !defined(__mips__) + +/* + * This call isn't used by all ports, in particular, the Alpha + * uses osf_sigprocmask instead. Maybe it should be moved into + * arch-dependent dir? + */ asmlinkage int sys_sigprocmask(int how, sigset_t *set, sigset_t *oset) { sigset_t new_set, old_set = current->blocked; int error; if (set) { - error = verify_area(VERIFY_READ, set, sizeof(sigset_t)); + error = get_user(new_set, set); if (error) - return error; - new_set = get_fs_long((unsigned long *) set) & _BLOCKABLE; + return error; + new_set &= _BLOCKABLE; switch (how) { case SIG_BLOCK: current->blocked |= new_set; @@ -44,14 +51,19 @@ asmlinkage int sys_sigprocmask(int how, sigset_t *set, sigset_t *oset) } } if (oset) { - error = verify_area(VERIFY_WRITE, oset, sizeof(sigset_t)); + error = put_user(old_set, oset); if (error) - return error; - put_fs_long(old_set, (unsigned long *) oset); + return error; } return 0; } +#endif +#ifndef __alpha__ + +/* + * For backwards compatibility? Functionality superseded by sigprocmask. + */ asmlinkage int sys_sgetmask(void) { return current->blocked; @@ -65,21 +77,19 @@ asmlinkage int sys_ssetmask(int newmask) return old; } +#endif + asmlinkage int sys_sigpending(sigset_t *set) { - int error; - /* fill in "set" with signals pending but blocked. */ - error = verify_area(VERIFY_WRITE, set, 4); - if (!error) - put_fs_long(current->blocked & current->signal, (unsigned long *)set); - return error; + return put_user(current->blocked & current->signal, + /* Hack */(unsigned long *)set); } /* * POSIX 3.3.1.3: * "Setting a signal action to SIG_IGN for a signal that is pending * shall cause the pending signal to be discarded, whether or not - * it is blocked" (but SIGCHLD is unspecified: linux leaves it alone). + * it is blocked." * * "Setting a signal action to SIG_DFL for a signal that is pending * and whose default action is to ignore the signal (for example, @@ -90,31 +100,39 @@ asmlinkage int sys_sigpending(sigset_t *set) * isn't actually ignored, but does automatic child reaping, while * SIG_DFL is explicitly said by POSIX to force the signal to be ignored.. */ -static void check_pending(int signum) +static inline void check_pending(int signum) { struct sigaction *p; - p = signum - 1 + current->sigaction; + p = signum - 1 + current->sig->action; if (p->sa_handler == SIG_IGN) { - if (signum == SIGCHLD) - return; - current->signal &= ~_S(signum); + k_sigdelset(¤t->signal, signum); return; } if (p->sa_handler == SIG_DFL) { if (signum != SIGCONT && signum != SIGCHLD && signum != SIGWINCH) return; - current->signal &= ~_S(signum); + k_sigdelset(¤t->signal, signum); return; } } -asmlinkage unsigned long sys_signal(int signum, void (*handler)(int)) +#if !defined(__alpha__) && !defined(__mips__) +/* + * For backwards compatibility? Functionality superseded by sigaction. + */ +asmlinkage unsigned long sys_signal(int signum, __sighandler_t handler) { int err; struct sigaction tmp; - if (signum<1 || signum>32) + /* + * HACK: We still cannot handle signals > 32 due to the limited + * size of ksigset_t (which will go away). + */ + if (signum > 32) + return -EINVAL; + if (signum<1 || signum>_NSIG) return -EINVAL; if (signum==SIGKILL || signum==SIGSTOP) return -EINVAL; @@ -123,37 +141,38 @@ asmlinkage unsigned long sys_signal(int signum, void (*handler)(int)) if (err) return err; } + memset(&tmp, 0, sizeof(tmp)); tmp.sa_handler = handler; - tmp.sa_mask = 0; tmp.sa_flags = SA_ONESHOT | SA_NOMASK; - tmp.sa_restorer = NULL; - handler = current->sigaction[signum-1].sa_handler; - current->sigaction[signum-1] = tmp; + handler = current->sig->action[signum-1].sa_handler; + current->sig->action[signum-1] = tmp; check_pending(signum); return (unsigned long) handler; } +#endif /* !defined(__alpha__) && !defined(__mips__) */ asmlinkage int sys_sigaction(int signum, const struct sigaction * action, struct sigaction * oldaction) { struct sigaction new_sa, *p; - if (signum<1 || signum>32) + /* + * HACK: We still cannot handle signals > 32 due to the limited + * size of ksigset_t (which will go away). + */ + if (signum > 32) return -EINVAL; - if (signum==SIGKILL || signum==SIGSTOP) + if (signum<1 || signum>_NSIG) return -EINVAL; - p = signum - 1 + current->sigaction; + p = signum - 1 + current->sig->action; if (action) { int err = verify_area(VERIFY_READ, action, sizeof(*action)); if (err) return err; - memcpy_fromfs(&new_sa, action, sizeof(struct sigaction)); - if (new_sa.sa_flags & SA_NOMASK) - new_sa.sa_mask = 0; - else { - new_sa.sa_mask |= _S(signum); - new_sa.sa_mask &= _BLOCKABLE; - } + if (signum==SIGKILL || signum==SIGSTOP) + return -EINVAL; + if (copy_from_user(&new_sa, action, sizeof(struct sigaction))) + return -EFAULT; if (new_sa.sa_handler != SIG_DFL && new_sa.sa_handler != SIG_IGN) { err = verify_area(VERIFY_READ, new_sa.sa_handler, 1); if (err) @@ -161,10 +180,8 @@ asmlinkage int sys_sigaction(int signum, const struct sigaction * action, } } if (oldaction) { - int err = verify_area(VERIFY_WRITE, oldaction, sizeof(*oldaction)); - if (err) - return err; - memcpy_tofs(oldaction, p, sizeof(struct sigaction)); + if (copy_to_user(oldaction, p, sizeof(struct sigaction))) + return -EFAULT; } if (action) { *p = new_sa; diff --git a/kernel/softirq.c b/kernel/softirq.c index 7d919272b..022b55355 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -21,32 +21,31 @@ #include #include -#define INCLUDE_INLINE_FUNCS -#include - unsigned long intr_count = 0; +int bh_mask_count[32]; unsigned long bh_active = 0; unsigned long bh_mask = 0; -struct bh_struct bh_base[32]; +void (*bh_base[32])(void); asmlinkage void do_bottom_half(void) { unsigned long active; unsigned long mask, left; - struct bh_struct *bh; + void (**bh)(void); + sti(); bh = bh_base; active = bh_active & bh_mask; for (mask = 1, left = ~0 ; left & active ; bh++,mask += mask,left += left) { if (mask & active) { - void (*fn)(void *); + void (*fn)(void); bh_active &= ~mask; - fn = bh->routine; + fn = *bh; if (!fn) goto bad_bh; - fn(bh->data); + fn(); } } return; diff --git a/kernel/sys.c b/kernel/sys.c index 171d2411c..b2cc8f154 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -4,6 +4,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ +#include #include #include #include @@ -17,19 +18,24 @@ #include #include #include +#include +#include +#include +#if defined(CONFIG_APM) && defined(CONFIG_APM_POWER_OFF) +#include +#endif -#include +#include #include +#include /* * this indicates whether you can reboot with ctrl-alt-del: the default is yes */ -static int C_A_D = 1; +int C_A_D = 1; extern void adjust_clock(void); -#define PZERO 15 - asmlinkage int sys_ni_syscall(void) { return -ENOSYS; @@ -37,70 +43,101 @@ asmlinkage int sys_ni_syscall(void) static int proc_sel(struct task_struct *p, int which, int who) { - switch (which) { - case PRIO_PROCESS: - if (!who && p == current) - return 1; - return(p->pid == who); - case PRIO_PGRP: - if (!who) - who = current->pgrp; - return(p->pgrp == who); - case PRIO_USER: - if (!who) - who = current->uid; - return(p->uid == who); + if(p->pid) + { + switch (which) { + case PRIO_PROCESS: + if (!who && p == current) + return 1; + return(p->pid == who); + case PRIO_PGRP: + if (!who) + who = current->pgrp; + return(p->pgrp == who); + case PRIO_USER: + if (!who) + who = current->uid; + return(p->uid == who); + } } return 0; } asmlinkage int sys_setpriority(int which, int who, int niceval) { - struct task_struct **p; + struct task_struct *p; int error = ESRCH; - int priority; + unsigned int priority; if (which > 2 || which < 0) return -EINVAL; - if ((priority = PZERO - niceval) <= 0) - priority = 1; + /* normalize: avoid signed division (rounding problems) */ + priority = niceval; + if (niceval < 0) + priority = -niceval; + if (priority > 20) + priority = 20; + priority = (priority * DEF_PRIORITY + 10) / 20 + DEF_PRIORITY; + + if (niceval >= 0) { + priority = 2*DEF_PRIORITY - priority; + if (!priority) + priority = 1; + } - for(p = &LAST_TASK; p > &FIRST_TASK; --p) { - if (!*p || !proc_sel(*p, which, who)) + for_each_task(p) { + if (!proc_sel(p, which, who)) continue; - if ((*p)->uid != current->euid && - (*p)->uid != current->uid && !suser()) { + if (p->uid != current->euid && + p->uid != current->uid && !suser()) { error = EPERM; continue; } if (error == ESRCH) error = 0; - if (priority > (*p)->priority && !suser()) + if (priority > p->priority && !suser()) error = EACCES; else - (*p)->priority = priority; + p->priority = priority; } return -error; } +/* + * Ugh. To avoid negative return values, "getpriority()" will + * not return the normal nice-value, but a value that has been + * offset by 20 (ie it returns 0..40 instead of -20..20) + */ asmlinkage int sys_getpriority(int which, int who) { - struct task_struct **p; - int max_prio = 0; + struct task_struct *p; + long max_prio = -ESRCH; if (which > 2 || which < 0) return -EINVAL; - for(p = &LAST_TASK; p > &FIRST_TASK; --p) { - if (!*p || !proc_sel(*p, which, who)) + for_each_task (p) { + if (!proc_sel(p, which, who)) continue; - if ((*p)->priority > max_prio) - max_prio = (*p)->priority; + if (p->priority > max_prio) + max_prio = p->priority; } - return(max_prio ? max_prio : -ESRCH); + + /* scale the priority from timeslice to 0..40 */ + if (max_prio > 0) + max_prio = (max_prio * 20 + DEF_PRIORITY/2) / DEF_PRIORITY; + return max_prio; } +#ifndef __alpha__ + +/* + * Why do these exist? Binary compatibility with some other standard? + * If so, maybe they should be moved into the appropriate arch + * directory. + */ + asmlinkage int sys_profil(void) { return -ENOSYS; @@ -131,7 +168,8 @@ asmlinkage int sys_prof(void) return -ENOSYS; } -extern void hard_reset_now(void); +#endif + extern asmlinkage sys_kill(int, int); /* @@ -157,6 +195,9 @@ asmlinkage int sys_reboot(int magic, int magic_too, int flag) else if (flag == 0xCDEF0123) { printk(KERN_EMERG "System halted\n"); sys_kill(-1, SIGKILL); +#if defined(CONFIG_APM) && defined(CONFIG_APM_POWER_OFF) + apm_set_power_state(APM_STATE_OFF); +#endif do_exit(0); } else return -EINVAL; @@ -173,7 +214,7 @@ void ctrl_alt_del(void) if (C_A_D) hard_reset_now(); else - send_sig(SIGINT,task[1],1); + kill_proc(1, SIGINT, 1); } @@ -195,6 +236,7 @@ void ctrl_alt_del(void) asmlinkage int sys_setregid(gid_t rgid, gid_t egid) { int old_rgid = current->gid; + int old_egid = current->egid; if (rgid != (gid_t) -1) { if ((old_rgid == rgid) || @@ -209,7 +251,7 @@ asmlinkage int sys_setregid(gid_t rgid, gid_t egid) (current->egid == egid) || (current->sgid == egid) || suser()) - current->egid = egid; + current->fsgid = current->egid = egid; else { current->gid = old_rgid; return(-EPERM); @@ -219,6 +261,8 @@ asmlinkage int sys_setregid(gid_t rgid, gid_t egid) (egid != (gid_t) -1 && egid != old_rgid)) current->sgid = current->egid; current->fsgid = current->egid; + if (current->egid != old_egid) + current->dumpable = 0; return 0; } @@ -227,19 +271,134 @@ asmlinkage int sys_setregid(gid_t rgid, gid_t egid) */ asmlinkage int sys_setgid(gid_t gid) { + int old_egid = current->egid; + if (suser()) current->gid = current->egid = current->sgid = current->fsgid = gid; else if ((gid == current->gid) || (gid == current->sgid)) current->egid = current->fsgid = gid; else return -EPERM; + if (current->egid != old_egid) + current->dumpable = 0; return 0; } + +static char acct_active = 0; +static struct file acct_file; + +int acct_process(long exitcode) +{ + struct acct ac; + unsigned short fs; + + if (acct_active) { + strncpy(ac.ac_comm, current->comm, ACCT_COMM); + ac.ac_comm[ACCT_COMM-1] = '\0'; + ac.ac_utime = current->utime; + ac.ac_stime = current->stime; + ac.ac_btime = CT_TO_SECS(current->start_time) + (xtime.tv_sec - (jiffies / HZ)); + ac.ac_etime = CURRENT_TIME - ac.ac_btime; + ac.ac_uid = current->uid; + ac.ac_gid = current->gid; + ac.ac_tty = (current)->tty == NULL ? -1 : + kdev_t_to_nr(current->tty->device); + ac.ac_flag = 0; + if (current->flags & PF_FORKNOEXEC) + ac.ac_flag |= AFORK; + if (current->flags & PF_SUPERPRIV) + ac.ac_flag |= ASU; + if (current->flags & PF_DUMPCORE) + ac.ac_flag |= ACORE; + if (current->flags & PF_SIGNALED) + ac.ac_flag |= AXSIG; + ac.ac_minflt = current->min_flt; + ac.ac_majflt = current->maj_flt; + ac.ac_exitcode = exitcode; + + /* Kernel segment override */ + fs = get_fs(); + set_fs(KERNEL_DS); + + acct_file.f_op->write(acct_file.f_inode, &acct_file, + (char *)&ac, sizeof(struct acct)); + + set_fs(fs); + } + return 0; +} + +asmlinkage int sys_acct(const char *name) +{ + struct inode *inode = (struct inode *)0; + char *tmp; + int error; + + if (!suser()) + return -EPERM; + + if (name == (char *)0) { + if (acct_active) { + if (acct_file.f_op->release) + acct_file.f_op->release(acct_file.f_inode, &acct_file); + + if (acct_file.f_inode != (struct inode *) 0) + iput(acct_file.f_inode); + + acct_active = 0; + } + return 0; + } else { + if (!acct_active) { + + if ((error = getname(name, &tmp)) != 0) + return (error); + + error = open_namei(tmp, O_RDWR, 0600, &inode, 0); + putname(tmp); + + if (error) + return (error); + + if (!S_ISREG(inode->i_mode)) { + iput(inode); + return -EACCES; + } + + if (!inode->i_op || !inode->i_op->default_file_ops || + !inode->i_op->default_file_ops->write) { + iput(inode); + return -EIO; + } + + acct_file.f_mode = 3; + acct_file.f_flags = 0; + acct_file.f_count = 1; + acct_file.f_inode = inode; + acct_file.f_pos = inode->i_size; + acct_file.f_reada = 0; + acct_file.f_op = inode->i_op->default_file_ops; + + if (acct_file.f_op->open) + if (acct_file.f_op->open(acct_file.f_inode, &acct_file)) { + iput(inode); + return -EIO; + } + + acct_active = 1; + return 0; + } else + return -EBUSY; + } +} + +#ifndef __alpha__ -asmlinkage int sys_acct(void) -{ - return -ENOSYS; -} +/* + * Why do these exist? Binary compatibility with some other standard? + * If so, maybe they should be moved into the appropriate arch + * directory. + */ asmlinkage int sys_phys(void) { @@ -266,6 +425,8 @@ asmlinkage int sys_old_syscall(void) return -ENOSYS; } +#endif + /* * Unprivileged users may change the real uid to the effective uid * or vice versa. (BSD-style) @@ -284,6 +445,7 @@ asmlinkage int sys_old_syscall(void) asmlinkage int sys_setreuid(uid_t ruid, uid_t euid) { int old_ruid = current->uid; + int old_euid = current->euid; if (ruid != (uid_t) -1) { if ((old_ruid == ruid) || @@ -298,7 +460,7 @@ asmlinkage int sys_setreuid(uid_t ruid, uid_t euid) (current->euid == euid) || (current->suid == euid) || suser()) - current->euid = euid; + current->fsuid = current->euid = euid; else { current->uid = old_ruid; return(-EPERM); @@ -308,6 +470,8 @@ asmlinkage int sys_setreuid(uid_t ruid, uid_t euid) (euid != (uid_t) -1 && euid != old_ruid)) current->suid = current->euid; current->fsuid = current->euid; + if (current->euid != old_euid) + current->dumpable = 0; return 0; } @@ -324,15 +488,61 @@ asmlinkage int sys_setreuid(uid_t ruid, uid_t euid) */ asmlinkage int sys_setuid(uid_t uid) { + int old_euid = current->euid; + if (suser()) current->uid = current->euid = current->suid = current->fsuid = uid; else if ((uid == current->uid) || (uid == current->suid)) current->fsuid = current->euid = uid; else return -EPERM; + if (current->euid != old_euid) + current->dumpable = 0; return(0); } + +/* + * This function implementes a generic ability to update ruid, euid, + * and suid. This allows you to implement the 4.4 compatible seteuid(). + */ +asmlinkage int sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) +{ + uid_t old_ruid, old_euid, old_suid; + + old_ruid = current->uid; + old_euid = current->euid; + old_suid = current->suid; + + if ((ruid != (uid_t) -1) && (ruid != current->uid) && + (ruid != current->euid) && (ruid != current->suid)) + return -EPERM; + if ((euid != (uid_t) -1) && (euid != current->uid) && + (euid != current->euid) && (euid != current->suid)) + return -EPERM; + if ((suid != (uid_t) -1) && (suid != current->uid) && + (suid != current->euid) && (suid != current->suid)) + return -EPERM; + if (ruid != (uid_t) -1) + current->uid = ruid; + if (euid != (uid_t) -1) + current->euid = euid; + if (suid != (uid_t) -1) + current->suid = suid; + return 0; +} + +asmlinkage int sys_getresuid(uid_t *ruid, uid_t *euid, uid_t *suid) +{ + int retval; + + if (!(retval = put_user(current->uid, ruid)) && + !(retval = put_user(current->euid, euid))) + retval = put_user(current->suid, suid); + return retval; +} + + /* * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This * is used for "access()" and for the NFS daemon (letting nfsd stay at @@ -346,6 +556,8 @@ asmlinkage int sys_setfsuid(uid_t uid) if (uid == current->uid || uid == current->euid || uid == current->suid || uid == current->fsuid || suser()) current->fsuid = uid; + if (current->fsuid != old_fsuid) + current->dumpable = 0; return old_fsuid; } @@ -359,95 +571,31 @@ asmlinkage int sys_setfsgid(gid_t gid) if (gid == current->gid || gid == current->egid || gid == current->sgid || gid == current->fsgid || suser()) current->fsgid = gid; + if (current->fsgid != old_fsgid) + current->dumpable = 0; return old_fsgid; } -asmlinkage int sys_times(struct tms * tbuf) +asmlinkage long sys_times(struct tms * tbuf) { + int error; if (tbuf) { - int error = verify_area(VERIFY_WRITE,tbuf,sizeof *tbuf); + error = put_user(current->utime,&tbuf->tms_utime); + if (!error) + error = put_user(current->stime,&tbuf->tms_stime); + if (!error) + error = put_user(current->cutime,&tbuf->tms_cutime); + if (!error) + error = put_user(current->cstime,&tbuf->tms_cstime); if (error) - return error; - put_fs_long(current->utime,(unsigned long *)&tbuf->tms_utime); - put_fs_long(current->stime,(unsigned long *)&tbuf->tms_stime); - put_fs_long(current->cutime,(unsigned long *)&tbuf->tms_cutime); - put_fs_long(current->cstime,(unsigned long *)&tbuf->tms_cstime); + return error; } return jiffies; } -asmlinkage unsigned long sys_brk(unsigned long brk) -{ - int freepages; - unsigned long rlim; - unsigned long newbrk, oldbrk; - - if (brk < current->mm->end_code) - return current->mm->brk; - newbrk = PAGE_ALIGN(brk); - oldbrk = PAGE_ALIGN(current->mm->brk); - if (oldbrk == newbrk) - return current->mm->brk = brk; - - /* - * Always allow shrinking brk - */ - if (brk <= current->mm->brk) { - current->mm->brk = brk; - do_munmap(newbrk, oldbrk-newbrk); - return brk; - } - /* - * Check against rlimit and stack.. - */ - rlim = current->rlim[RLIMIT_DATA].rlim_cur; - if (rlim >= RLIM_INFINITY) - rlim = ~0; - if (brk - current->mm->end_code > rlim) - return current->mm->brk; - /* - * Check against existing mmap mappings. - */ - if (find_vma_intersection(current, oldbrk, newbrk+PAGE_SIZE)) - return current->mm->brk; - /* - * stupid algorithm to decide if we have enough memory: while - * simple, it hopefully works in most obvious cases.. Easy to - * fool it, but this should catch most mistakes. - */ - freepages = buffermem >> 12; - freepages += nr_free_pages; - freepages += nr_swap_pages; -#if 0 - /* - * This assumes a PCish memory architecture... - */ - freepages -= (high_memory - 0x100000) >> 16; -#else - freepages -= (high_memory - KSEG0) >> 16; -#endif - freepages -= (newbrk-oldbrk) >> 12; - if (freepages < 0) - return current->mm->brk; -#if 0 - freepages += current->mm->rss; - freepages -= oldbrk >> 12; - if (freepages < 0) - return current->mm->brk; -#endif - /* - * Ok, we have probably got enough memory - let it rip. - */ - current->mm->brk = brk; - do_mmap(NULL, oldbrk, newbrk-oldbrk, - PROT_READ|PROT_WRITE|PROT_EXEC, - MAP_FIXED|MAP_PRIVATE, 0); - return brk; -} - /* - * This needs some heave checking ... - * I just haven't get the stomach for it. I also don't fully + * This needs some heavy checking ... + * I just haven't the stomach for it. I also don't fully * understand sessions/pgrp etc. Let somebody who does explain it. * * OK, I think I have the protection semantics right.... this is really @@ -516,10 +664,28 @@ asmlinkage int sys_getpgrp(void) return current->pgrp; } +asmlinkage int sys_getsid(pid_t pid) +{ + struct task_struct * p; + + if (!pid) + return current->session; + for_each_task(p) { + if (p->pid == pid) + return p->session; + } + return -ESRCH; +} + asmlinkage int sys_setsid(void) { - if (current->leader) - return -EPERM; + struct task_struct * p; + + for_each_task(p) { + if (p->pgrp == current->pid) + return -EPERM; + } + current->leader = 1; current->session = current->pgrp = current->pid; current->tty = NULL; @@ -533,88 +699,76 @@ asmlinkage int sys_setsid(void) asmlinkage int sys_getgroups(int gidsetsize, gid_t *grouplist) { int i; - int * groups; + if (gidsetsize < 0) + return -EINVAL; + i = current->ngroups; if (gidsetsize) { - i = verify_area(VERIFY_WRITE, grouplist, sizeof(gid_t) * gidsetsize); - if (i) - return i; + if (i > gidsetsize) + return -EINVAL; + if (copy_to_user(grouplist, current->groups, sizeof(gid_t)*i)) + return -EFAULT; } - groups = current->groups; - for (i = 0 ; (i < NGROUPS) && (*groups != NOGROUP) ; i++, groups++) { - if (!gidsetsize) - continue; - if (i >= gidsetsize) - break; - put_user(*groups, grouplist); - grouplist++; - } - return(i); + return i; } asmlinkage int sys_setgroups(int gidsetsize, gid_t *grouplist) { - int i; + int err; if (!suser()) return -EPERM; - if (gidsetsize > NGROUPS) + if ((unsigned) gidsetsize > NGROUPS) return -EINVAL; - for (i = 0; i < gidsetsize; i++, grouplist++) { - current->groups[i] = get_fs_word((unsigned short *) grouplist); - } - if (i < NGROUPS) - current->groups[i] = NOGROUP; - return 0; + err = copy_from_user(current->groups, grouplist, gidsetsize * sizeof(gid_t)); + if (err) { + gidsetsize = 0; + err = -EFAULT; + } + current->ngroups = gidsetsize; + return err; } int in_group_p(gid_t grp) { - int i; - - if (grp == current->fsgid) - return 1; - - for (i = 0; i < NGROUPS; i++) { - if (current->groups[i] == NOGROUP) - break; - if (current->groups[i] == grp) - return 1; + if (grp != current->fsgid) { + int i = current->ngroups; + if (i) { + gid_t *groups = current->groups; + do { + if (*groups == grp) + goto out; + groups++; + i--; + } while (i); + } + return 0; } - return 0; +out: + return 1; } asmlinkage int sys_newuname(struct new_utsname * name) { - int error; - if (!name) return -EFAULT; - error = verify_area(VERIFY_WRITE, name, sizeof *name); - if (!error) - memcpy_tofs(name,&system_utsname,sizeof *name); - return error; + if (copy_to_user(name,&system_utsname,sizeof *name)) + return -EFAULT; + return 0; } +#ifndef __alpha__ + +/* + * Move these to arch dependent dir since they are for + * backward compatibility only? + */ asmlinkage int sys_uname(struct old_utsname * name) { - int error; - if (!name) - return -EFAULT; - error = verify_area(VERIFY_WRITE, name,sizeof *name); - if (error) - return error; - memcpy_tofs(&name->sysname,&system_utsname.sysname, - sizeof (system_utsname.sysname)); - memcpy_tofs(&name->nodename,&system_utsname.nodename, - sizeof (system_utsname.nodename)); - memcpy_tofs(&name->release,&system_utsname.release, - sizeof (system_utsname.release)); - memcpy_tofs(&name->version,&system_utsname.version, - sizeof (system_utsname.version)); - memcpy_tofs(&name->machine,&system_utsname.machine, - sizeof (system_utsname.machine)); - return 0; + int error = -EFAULT;; + if (name && !copy_to_user(name, &system_utsname, sizeof (*name))) + error = 0; + return error; } asmlinkage int sys_olduname(struct oldold_utsname * name) @@ -622,22 +776,30 @@ asmlinkage int sys_olduname(struct oldold_utsname * name) int error; if (!name) return -EFAULT; - error = verify_area(VERIFY_WRITE, name,sizeof *name); - if (error) - return error; - memcpy_tofs(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); - put_fs_byte(0,name->sysname+__OLD_UTS_LEN); - memcpy_tofs(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); - put_fs_byte(0,name->nodename+__OLD_UTS_LEN); - memcpy_tofs(&name->release,&system_utsname.release,__OLD_UTS_LEN); - put_fs_byte(0,name->release+__OLD_UTS_LEN); - memcpy_tofs(&name->version,&system_utsname.version,__OLD_UTS_LEN); - put_fs_byte(0,name->version+__OLD_UTS_LEN); - memcpy_tofs(&name->machine,&system_utsname.machine,__OLD_UTS_LEN); - put_fs_byte(0,name->machine+__OLD_UTS_LEN); - return 0; + error = copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); + if (!error) + error = put_user(0,name->sysname+__OLD_UTS_LEN); + if (!error) + error = copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); + if (!error) + error = put_user(0,name->nodename+__OLD_UTS_LEN); + if (!error) + error = copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); + if (!error) + error = put_user(0,name->release+__OLD_UTS_LEN); + if (!error) + error = copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); + if (!error) + error = put_user(0,name->version+__OLD_UTS_LEN); + if (!error) + error = copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN); + if (!error) + error = put_user(0,name->machine+__OLD_UTS_LEN); + return error ? -EFAULT : 0; } +#endif + asmlinkage int sys_sethostname(char *name, int len) { int error; @@ -646,10 +808,9 @@ asmlinkage int sys_sethostname(char *name, int len) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; - error = verify_area(VERIFY_READ, name, len); + error = copy_from_user(system_utsname.nodename, name, len); if (error) - return error; - memcpy_fromfs(system_utsname.nodename, name, len); + return -EFAULT; system_utsname.nodename[len] = 0; return 0; } @@ -660,14 +821,10 @@ asmlinkage int sys_gethostname(char *name, int len) if (len < 0) return -EINVAL; - i = verify_area(VERIFY_WRITE, name, len); - if (i) - return i; i = 1+strlen(system_utsname.nodename); if (i > len) i = len; - memcpy_tofs(name, system_utsname.nodename, i); - return 0; + return copy_to_user(name, system_utsname.nodename, i) ? -EFAULT : 0; } /* @@ -676,31 +833,25 @@ asmlinkage int sys_gethostname(char *name, int len) */ asmlinkage int sys_setdomainname(char *name, int len) { - int i; + int error; if (!suser()) return -EPERM; - if (len > __NEW_UTS_LEN) + if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; - for (i=0; i < len; i++) { - if ((system_utsname.domainname[i] = get_fs_byte(name+i)) == 0) - return 0; - } - system_utsname.domainname[i] = 0; + error = copy_from_user(system_utsname.domainname, name, len); + if (error) + return -EFAULT; + system_utsname.domainname[len] = 0; return 0; } asmlinkage int sys_getrlimit(unsigned int resource, struct rlimit *rlim) { - int error; - if (resource >= RLIM_NLIMITS) return -EINVAL; - error = verify_area(VERIFY_WRITE,rlim,sizeof *rlim); - if (error) - return error; - memcpy_tofs(rlim, current->rlim + resource, sizeof(*rlim)); - return 0; + return copy_to_user(rlim, current->rlim + resource, sizeof(*rlim)) + ? -EFAULT : 0 ; } asmlinkage int sys_setrlimit(unsigned int resource, struct rlimit *rlim) @@ -710,10 +861,9 @@ asmlinkage int sys_setrlimit(unsigned int resource, struct rlimit *rlim) if (resource >= RLIM_NLIMITS) return -EINVAL; - err = verify_area(VERIFY_READ, rlim, sizeof(*rlim)); + err = copy_from_user(&new_rlim, rlim, sizeof(*rlim)); if (err) - return err; - memcpy_fromfs(&new_rlim, rlim, sizeof(*rlim)); + return -EFAULT; old_rlim = current->rlim + resource; if (((new_rlim.rlim_cur > old_rlim->rlim_max) || (new_rlim.rlim_max > old_rlim->rlim_max)) && @@ -737,12 +887,8 @@ asmlinkage int sys_setrlimit(unsigned int resource, struct rlimit *rlim) */ int getrusage(struct task_struct *p, int who, struct rusage *ru) { - int error; struct rusage r; - error = verify_area(VERIFY_WRITE, ru, sizeof *ru); - if (error) - return error; memset((char *) &r, 0, sizeof(r)); switch (who) { case RUSAGE_SELF: @@ -750,28 +896,30 @@ int getrusage(struct task_struct *p, int who, struct rusage *ru) r.ru_utime.tv_usec = CT_TO_USECS(p->utime); r.ru_stime.tv_sec = CT_TO_SECS(p->stime); r.ru_stime.tv_usec = CT_TO_USECS(p->stime); - r.ru_minflt = p->mm->min_flt; - r.ru_majflt = p->mm->maj_flt; + r.ru_minflt = p->min_flt; + r.ru_majflt = p->maj_flt; + r.ru_nswap = p->nswap; break; case RUSAGE_CHILDREN: r.ru_utime.tv_sec = CT_TO_SECS(p->cutime); r.ru_utime.tv_usec = CT_TO_USECS(p->cutime); r.ru_stime.tv_sec = CT_TO_SECS(p->cstime); r.ru_stime.tv_usec = CT_TO_USECS(p->cstime); - r.ru_minflt = p->mm->cmin_flt; - r.ru_majflt = p->mm->cmaj_flt; + r.ru_minflt = p->cmin_flt; + r.ru_majflt = p->cmaj_flt; + r.ru_nswap = p->cnswap; break; default: r.ru_utime.tv_sec = CT_TO_SECS(p->utime + p->cutime); r.ru_utime.tv_usec = CT_TO_USECS(p->utime + p->cutime); r.ru_stime.tv_sec = CT_TO_SECS(p->stime + p->cstime); r.ru_stime.tv_usec = CT_TO_USECS(p->stime + p->cstime); - r.ru_minflt = p->mm->min_flt + p->mm->cmin_flt; - r.ru_majflt = p->mm->maj_flt + p->mm->cmaj_flt; + r.ru_minflt = p->min_flt + p->cmin_flt; + r.ru_majflt = p->maj_flt + p->cmaj_flt; + r.ru_nswap = p->nswap + p->cnswap; break; } - memcpy_tofs(ru, &r, sizeof(r)); - return 0; + return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } asmlinkage int sys_getrusage(int who, struct rusage *ru) diff --git a/kernel/sysctl.c b/kernel/sysctl.c new file mode 100644 index 000000000..3d0fbf49b --- /dev/null +++ b/kernel/sysctl.c @@ -0,0 +1,922 @@ +/* + * sysctl.c: General linux system control interface + * + * Begun 24 March 1995, Stephen Tweedie + * Added /proc support, Dec 1995 + * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas. + * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver. + * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver. + * Dynamic registration fixes, Stephen Tweedie. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* External variables not in a header file. */ +extern int panic_timeout; + + +#ifdef CONFIG_ROOT_NFS +#include +#endif + +static ctl_table root_table[]; +static struct ctl_table_header root_table_header = + {root_table, DNODE_SINGLE(&root_table_header)}; + +static int parse_table(int *, int, void *, size_t *, void *, size_t, + ctl_table *, void **); + +static ctl_table kern_table[]; +static ctl_table vm_table[]; +extern ctl_table net_table[]; + +/* /proc declarations: */ + +#ifdef CONFIG_PROC_FS + +static long proc_readsys(struct inode * inode, struct file * file, + char * buf, unsigned long count); +static long proc_writesys(struct inode * inode, struct file * file, + const char * buf, unsigned long count); +static int proc_sys_permission(struct inode *, int); + +struct file_operations proc_sys_file_operations = +{ + NULL, /* lseek */ + proc_readsys, /* read */ + proc_writesys, /* write */ + NULL, /* readdir */ + NULL, /* select */ + NULL, /* ioctl */ + NULL, /* mmap */ + NULL, /* no special open code */ + NULL, /* no special release code */ + NULL /* can't fsync */ +}; + +struct inode_operations proc_sys_inode_operations = +{ + &proc_sys_file_operations, + NULL, /* create */ + NULL, /* lookup */ + NULL, /* link */ + NULL, /* unlink */ + NULL, /* symlink */ + NULL, /* mkdir */ + NULL, /* rmdir */ + NULL, /* mknod */ + NULL, /* rename */ + NULL, /* readlink */ + NULL, /* follow_link */ + NULL, /* readpage */ + NULL, /* writepage */ + NULL, /* bmap */ + NULL, /* truncate */ + proc_sys_permission +}; + +extern struct proc_dir_entry proc_sys_root; + +static void register_proc_table(ctl_table *, struct proc_dir_entry *); +static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); +#endif + +extern int bdf_prm[], bdflush_min[], bdflush_max[]; + +static int do_securelevel_strategy (ctl_table *, int *, int, void *, size_t *, + void *, size_t, void **); + +extern char binfmt_java_interpreter[], binfmt_java_appletviewer[]; + +/* The default sysctl tables: */ + +static ctl_table root_table[] = { + {CTL_KERN, "kernel", NULL, 0, 0555, kern_table}, + {CTL_VM, "vm", NULL, 0, 0555, vm_table}, + {CTL_NET, "net", NULL, 0, 0555, net_table}, + {0} +}; + +static ctl_table kern_table[] = { + {KERN_OSTYPE, "ostype", system_utsname.sysname, 64, + 0444, NULL, &proc_dostring, &sysctl_string}, + {KERN_OSRELEASE, "osrelease", system_utsname.release, 64, + 0444, NULL, &proc_dostring, &sysctl_string}, + {KERN_VERSION, "version", system_utsname.version, 64, + 0444, NULL, &proc_dostring, &sysctl_string}, + {KERN_NODENAME, "hostname", system_utsname.nodename, 64, + 0644, NULL, &proc_dostring, &sysctl_string}, + {KERN_DOMAINNAME, "domainname", system_utsname.domainname, 64, + 0644, NULL, &proc_dostring, &sysctl_string}, + {KERN_NRINODE, "inode-nr", &nr_inodes, 2*sizeof(int), + 0444, NULL, &proc_dointvec}, + {KERN_MAXINODE, "inode-max", &max_inodes, sizeof(int), + 0644, NULL, &proc_dointvec}, + {KERN_NRFILE, "file-nr", &nr_files, sizeof(int), + 0444, NULL, &proc_dointvec}, + {KERN_MAXFILE, "file-max", &max_files, sizeof(int), + 0644, NULL, &proc_dointvec}, + {KERN_SECURELVL, "securelevel", &securelevel, sizeof(int), + 0444, NULL, &proc_dointvec, (ctl_handler *)&do_securelevel_strategy}, + {KERN_PANIC, "panic", &panic_timeout, sizeof(int), + 0644, NULL, &proc_dointvec}, +#ifdef CONFIG_BLK_DEV_INITRD + {KERN_REALROOTDEV, "real-root-dev", &real_root_dev, sizeof(int), + 0644, NULL, &proc_dointvec}, +#endif +#ifdef CONFIG_ROOT_NFS + {KERN_NFSRNAME, "nfs-root-name", nfs_root_name, NFS_ROOT_NAME_LEN, + 0644, NULL, &proc_dostring, &sysctl_string }, + {KERN_NFSRNAME, "nfs-root-addrs", nfs_root_addrs, NFS_ROOT_ADDRS_LEN, + 0644, NULL, &proc_dostring, &sysctl_string }, +#endif +#ifdef CONFIG_BINFMT_JAVA + {KERN_JAVA_INTERPRETER, "java-interpreter", binfmt_java_interpreter, + 64, 0644, NULL, &proc_dostring, &sysctl_string }, + {KERN_JAVA_APPLETVIEWER, "java-appletviewer", binfmt_java_appletviewer, + 64, 0644, NULL, &proc_dostring, &sysctl_string }, +#endif + {0} +}; + +static ctl_table vm_table[] = { + {VM_SWAPCTL, "swapctl", + &swap_control, sizeof(swap_control_t), 0600, NULL, &proc_dointvec}, + {VM_KSWAPD, "kswapd", + &kswapd_ctl, sizeof(kswapd_ctl), 0600, NULL, &proc_dointvec}, + {VM_FREEPG, "freepages", + &min_free_pages, 3*sizeof(int), 0600, NULL, &proc_dointvec}, + {VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0600, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, + &bdflush_min, &bdflush_max}, + {0} +}; + +void sysctl_init(void) +{ +#ifdef CONFIG_PROC_FS + register_proc_table(root_table, &proc_sys_root); +#endif +} + + +int do_sysctl (int *name, int nlen, + void *oldval, size_t *oldlenp, + void *newval, size_t newlen) +{ + int error; + struct ctl_table_header *tmp; + void *context; + + if (nlen == 0 || nlen >= CTL_MAXNAME) + return -ENOTDIR; + + error = verify_area(VERIFY_READ,name,nlen*sizeof(int)); + if (error) return error; + if (oldval) { + int old_len; + if (!oldlenp) + return -EFAULT; + error = verify_area(VERIFY_WRITE,oldlenp,sizeof(size_t)); + if (error) return error; + get_user(old_len, oldlenp); + error = verify_area(VERIFY_WRITE,oldval,old_len); + if (error) return error; + } + if (newval) { + error = verify_area(VERIFY_READ,newval,newlen); + if (error) return error; + } + tmp = &root_table_header; + do { + context = 0; + error = parse_table(name, nlen, oldval, oldlenp, + newval, newlen, tmp->ctl_table, &context); + if (context) + kfree(context); + if (error != -ENOTDIR) + return error; + tmp = tmp->DLIST_NEXT(ctl_entry); + } while (tmp != &root_table_header); + return -ENOTDIR; +} + +extern asmlinkage int sys_sysctl(struct __sysctl_args *args) +{ + struct __sysctl_args tmp; + int error; + error = verify_area(VERIFY_READ, args, sizeof(*args)); + if (error) + return error; + copy_from_user(&tmp, args, sizeof(tmp)); + return do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp, + tmp.newval, tmp.newlen); +} + +/* Like in_group_p, but testing against egid, not fsgid */ +static int in_egroup_p(gid_t grp) +{ + if (grp != current->egid) { + int i = current->ngroups; + if (i) { + gid_t *groups = current->groups; + do { + if (*groups == grp) + goto out; + groups++; + i--; + } while (i); + } + return 0; + } +out: + return 1; +} + +/* ctl_perm does NOT grant the superuser all rights automatically, because + some sysctl variables are readonly even to root. */ +static int test_perm(int mode, int op) +{ + if (!current->euid) + mode >>= 6; + else if (in_egroup_p(0)) + mode >>= 3; + if ((mode & op & 0007) == op) + return 0; + return -EACCES; +} +static inline int ctl_perm(ctl_table *table, int op) +{ + return test_perm(table->mode, op); +} + +static int parse_table(int *name, int nlen, + void *oldval, size_t *oldlenp, + void *newval, size_t newlen, + ctl_table *table, void **context) +{ + int error; +repeat: + if (!nlen) + return -ENOTDIR; + + for ( ; table->ctl_name; table++) { + int n; + get_user(n,name); + if (n == table->ctl_name || + table->ctl_name == CTL_ANY) { + if (table->child) { + if (ctl_perm(table, 001)) + return -EPERM; + if (table->strategy) { + error = table->strategy( + table, name, nlen, + oldval, oldlenp, + newval, newlen, context); + if (error) + return error; + } + name++; + nlen--; + table = table->child; + goto repeat; + } + error = do_sysctl_strategy(table, name, nlen, + oldval, oldlenp, + newval, newlen, context); + return error; + } + }; + return -ENOTDIR; +} + +/* Perform the actual read/write of a sysctl table entry. */ +int do_sysctl_strategy (ctl_table *table, + int *name, int nlen, + void *oldval, size_t *oldlenp, + void *newval, size_t newlen, void **context) +{ + int op = 0, rc, len; + + if (oldval) + op |= 004; + if (newval) + op |= 002; + if (ctl_perm(table, op)) + return -EPERM; + + if (table->strategy) { + rc = table->strategy(table, name, nlen, oldval, oldlenp, + newval, newlen, context); + if (rc < 0) + return rc; + if (rc > 0) + return 0; + } + + /* If there is no strategy routine, or if the strategy returns + * zero, proceed with automatic r/w */ + if (table->data && table->maxlen) { + if (oldval && oldlenp) { + get_user(len, oldlenp); + if (len) { + if (len > table->maxlen) + len = table->maxlen; + copy_to_user(oldval, table->data, len); + put_user(len, oldlenp); + } + } + if (newval && newlen) { + len = newlen; + if (len > table->maxlen) + len = table->maxlen; + copy_from_user(table->data, newval, len); + } + } + return 0; +} + +/* + * This function only checks permission for changing the security level + * If the tests are successful, the actual change is done by + * do_sysctl_strategy + */ +static int do_securelevel_strategy (ctl_table *table, + int *name, int nlen, + void *oldval, size_t *oldlenp, + void *newval, size_t newlen, void **context) +{ + int level; + + if (newval && newlen) { + if (newlen != sizeof (int)) + return -EINVAL; + copy_from_user (&level, newval, newlen); + if (level < securelevel && current->pid != 1) + return -EPERM; + } + return 0; +} + +struct ctl_table_header *register_sysctl_table(ctl_table * table, + int insert_at_head) +{ + struct ctl_table_header *tmp; + tmp = kmalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return 0; + *tmp = ((struct ctl_table_header) {table, DNODE_NULL}); + if (insert_at_head) + DLIST_INSERT_AFTER(&root_table_header, tmp, ctl_entry); + else + DLIST_INSERT_BEFORE(&root_table_header, tmp, ctl_entry); +#ifdef CONFIG_PROC_FS + register_proc_table(table, &proc_sys_root); +#endif + return tmp; +} + +void unregister_sysctl_table(struct ctl_table_header * table) +{ + DLIST_DELETE(table, ctl_entry); +#ifdef CONFIG_PROC_FS + unregister_proc_table(table->ctl_table, &proc_sys_root); +#endif +} + +/* + * /proc/sys support + */ + +#ifdef CONFIG_PROC_FS + +/* Scan the sysctl entries in table and add them all into /proc */ +static void register_proc_table(ctl_table * table, struct proc_dir_entry *root) +{ + struct proc_dir_entry *de, *tmp; + int exists; + + for (; table->ctl_name; table++) { + exists = 0; + /* Can't do anything without a proc name. */ + if (!table->procname) + continue; + /* Maybe we can't do anything with it... */ + if (!table->proc_handler && + !table->child) + continue; + + de = kmalloc(sizeof(*de), GFP_KERNEL); + if (!de) continue; + de->namelen = strlen(table->procname); + de->name = table->procname; + de->mode = table->mode; + de->nlink = 1; + de->uid = 0; + de->gid = 0; + de->size = 0; + de->get_info = 0; /* For internal use if we want it */ + de->fill_inode = 0; /* To override struct inode fields */ + de->next = de->subdir = 0; + de->data = (void *) table; + /* Is it a file? */ + if (table->proc_handler) { + de->ops = &proc_sys_inode_operations; + de->mode |= S_IFREG; + } + /* Otherwise it's a subdir */ + else { + /* First check to see if it already exists */ + for (tmp = root->subdir; tmp; tmp = tmp->next) { + if (tmp->namelen == de->namelen && + !memcmp(tmp->name,de->name,de->namelen)) { + exists = 1; + kfree (de); + de = tmp; + } + } + if (!exists) { + de->ops = &proc_dir_inode_operations; + de->nlink++; + de->mode |= S_IFDIR; + } + } + table->de = de; + if (!exists) + proc_register_dynamic(root, de); + if (de->mode & S_IFDIR ) + register_proc_table(table->child, de); + } +} + +static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root) +{ + struct proc_dir_entry *de; + for (; table->ctl_name; table++) { + if (!(de = table->de)) + continue; + if (de->mode & S_IFDIR) { + if (!table->child) { + printk (KERN_ALERT "Help - malformed sysctl tree on free\n"); + continue; + } + unregister_proc_table(table->child, de); + } + /* Don't unregister proc directories which still have + entries... */ + if (!((de->mode & S_IFDIR) && de->subdir)) { + proc_unregister(root, de->low_ino); + kfree(de); + } + } +} + + +static long do_rw_proc(int write, struct inode * inode, struct file * file, + char * buf, unsigned long count) +{ + int op; + struct proc_dir_entry *de; + struct ctl_table *table; + size_t res; + long error; + + error = verify_area(write ? VERIFY_READ : VERIFY_WRITE, buf, count); + if (error) + return error; + + de = (struct proc_dir_entry*) inode->u.generic_ip; + if (!de || !de->data) + return -ENOTDIR; + table = (struct ctl_table *) de->data; + if (!table || !table->proc_handler) + return -ENOTDIR; + op = (write ? 002 : 004); + if (ctl_perm(table, op)) + return -EPERM; + + res = count; + error = (*table->proc_handler) (table, write, file, buf, &res); + if (error) + return error; + return res; +} + +static long proc_readsys(struct inode * inode, struct file * file, + char * buf, unsigned long count) +{ + return do_rw_proc(0, inode, file, buf, count); +} + +static long proc_writesys(struct inode * inode, struct file * file, + const char * buf, unsigned long count) +{ + return do_rw_proc(1, inode, file, (char *) buf, count); +} + +static int proc_sys_permission(struct inode *inode, int op) +{ + return test_perm(inode->i_mode, op); +} + +int proc_dostring(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + int len; + char *p, c; + + if (!table->data || !table->maxlen || !*lenp || + (filp->f_pos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + len = 0; + p = buffer; + while (len < *lenp) { + get_user(c, p++); + if (c == 0 || c == '\n') + break; + len++; + } + if (len >= table->maxlen) + len = table->maxlen-1; + copy_from_user(table->data, buffer, len); + ((char *) table->data)[len] = 0; + filp->f_pos += *lenp; + } else { + len = strlen(table->data); + if (len > table->maxlen) + len = table->maxlen; + if (len > *lenp) + len = *lenp; + if (len) + copy_to_user(buffer, table->data, len); + if (len < *lenp) { + put_user('\n', ((char *) buffer) + len); + len++; + } + *lenp = len; + filp->f_pos += len; + } + return 0; +} + +int proc_dointvec(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + int *i, vleft, first=1, len, left, neg, val; + #define TMPBUFLEN 20 + char buf[TMPBUFLEN], *p; + + if (!table->data || !table->maxlen || !*lenp || + (filp->f_pos && !write)) { + *lenp = 0; + return 0; + } + + i = (int *) table->data; + vleft = table->maxlen / sizeof(int); + left = *lenp; + + for (; left && vleft--; i++, first=0) { + if (write) { + while (left) { + char c; + get_user(c,(char *) buffer); + if (!isspace(c)) + break; + left--; + ((char *) buffer)++; + } + if (!left) + break; + neg = 0; + len = left; + if (len > TMPBUFLEN-1) + len = TMPBUFLEN-1; + copy_from_user(buf, buffer, len); + buf[len] = 0; + p = buf; + if (*p == '-' && left > 1) { + neg = 1; + left--, p++; + } + if (*p < '0' || *p > '9') + break; + val = simple_strtoul(p, &p, 0); + len = p-buf; + if ((len < left) && *p && !isspace(*p)) + break; + if (neg) + val = -val; + buffer += len; + left -= len; + *i = val; + } else { + p = buf; + if (!first) + *p++ = '\t'; + sprintf(p, "%d", *i); + len = strlen(buf); + if (len > left) + len = left; + copy_to_user(buffer, buf, len); + left -= len; + buffer += len; + } + } + + if (!write && !first && left) { + put_user('\n', (char *) buffer); + left--, buffer++; + } + if (write) { + p = (char *) buffer; + while (left) { + char c; + get_user(c, p++); + if (!isspace(c)) + break; + left--; + } + } + if (write && first) + return -EINVAL; + *lenp -= left; + filp->f_pos += *lenp; + return 0; +} + +int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + int *i, *min, *max, vleft, first=1, len, left, neg, val; + #define TMPBUFLEN 20 + char buf[TMPBUFLEN], *p; + + if (!table->data || !table->maxlen || !*lenp || + (filp->f_pos && !write)) { + *lenp = 0; + return 0; + } + + i = (int *) table->data; + min = (int *) table->extra1; + max = (int *) table->extra2; + vleft = table->maxlen / sizeof(int); + left = *lenp; + + for (; left && vleft--; i++, first=0) { + if (write) { + while (left) { + char c; + get_user(c, (char *) buffer); + if (!isspace(c)) + break; + left--; + ((char *) buffer)++; + } + if (!left) + break; + neg = 0; + len = left; + if (len > TMPBUFLEN-1) + len = TMPBUFLEN-1; + copy_from_user(buf, buffer, len); + buf[len] = 0; + p = buf; + if (*p == '-' && left > 1) { + neg = 1; + left--, p++; + } + if (*p < '0' || *p > '9') + break; + val = simple_strtoul(p, &p, 0); + len = p-buf; + if ((len < left) && *p && !isspace(*p)) + break; + if (neg) + val = -val; + buffer += len; + left -= len; + + if (min && val < *min++) + continue; + if (max && val > *max++) + continue; + *i = val; + } else { + p = buf; + if (!first) + *p++ = '\t'; + sprintf(p, "%d", *i); + len = strlen(buf); + if (len > left) + len = left; + copy_to_user(buffer, buf, len); + left -= len; + buffer += len; + } + } + + if (!write && !first && left) { + put_user('\n', (char *) buffer); + left--, buffer++; + } + if (write) { + p = (char *) buffer; + while (left) { + char c; + get_user(c, p++); + if (!isspace(c)) + break; + left--; + } + } + if (write && first) + return -EINVAL; + *lenp -= left; + filp->f_pos += *lenp; + return 0; +} + +#else /* CONFIG_PROC_FS */ + +int proc_dostring(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +int proc_dointvec(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +#endif /* CONFIG_PROC_FS */ + + +/* + * General sysctl support routines + */ + +/* The generic string strategy routine: */ +int sysctl_string(ctl_table *table, int *name, int nlen, + void *oldval, size_t *oldlenp, + void *newval, size_t newlen, void **context) +{ + int l, len; + + if (!table->data || !table->maxlen) + return -ENOTDIR; + + if (oldval && oldlenp) { + get_user(len, oldlenp); + if (len) { + l = strlen(table->data); + if (len > l) len = l; + if (len >= table->maxlen) + len = table->maxlen; + copy_to_user(oldval, table->data, len); + put_user(0, ((char *) oldval) + len); + put_user(len, oldlenp); + } + } + if (newval && newlen) { + len = newlen; + if (len > table->maxlen) + len = table->maxlen; + copy_from_user(table->data, newval, len); + if (len == table->maxlen) + len--; + ((char *) table->data)[len] = 0; + } + return 0; +} + +/* + * This function makes sure that all of the integers in the vector + * are between the minimum and maximum values given in the arrays + * table->extra1 and table->extra2, respectively. + */ +int sysctl_intvec(ctl_table *table, int *name, int nlen, + void *oldval, size_t *oldlenp, + void *newval, size_t newlen, void **context) +{ + int i, length, *vec, *min, *max; + + if (newval && newlen) { + if (newlen % sizeof(int) != 0) + return -EINVAL; + + if (!table->extra1 && !table->extra2) + return 0; + + if (newlen > table->maxlen) + newlen = table->maxlen; + length = newlen / sizeof(int); + + vec = (int *) newval; + min = (int *) table->extra1; + max = (int *) table->extra2; + + for (i = 0; i < length; i++) { + int value; + get_user(value, vec + i); + if (min && value < min[i]) + return -EINVAL; + if (max && value > max[i]) + return -EINVAL; + } + } + return 0; +} + +int do_string ( + void *oldval, size_t *oldlenp, void *newval, size_t newlen, + int rdwr, char *data, size_t max) +{ + int l = strlen(data) + 1; + if (newval && !rdwr) + return -EPERM; + if (newval && newlen >= max) + return -EINVAL; + if (oldval) { + int old_l; + get_user(old_l, oldlenp); + if (l > old_l) + return -ENOMEM; + put_user(l, oldlenp); + copy_to_user(oldval, data, l); + } + if (newval) { + copy_from_user(data, newval, newlen); + data[newlen] = 0; + } + return 0; +} + +int do_int ( + void *oldval, size_t *oldlenp, void *newval, size_t newlen, + int rdwr, int *data) +{ + if (newval && !rdwr) + return -EPERM; + if (newval && newlen != sizeof(int)) + return -EINVAL; + if (oldval) { + int old_l; + get_user(old_l, oldlenp); + if (old_l < sizeof(int)) + return -ENOMEM; + put_user(sizeof(int), oldlenp); + copy_to_user(oldval, data, sizeof(int)); + } + if (newval) + copy_from_user(data, newval, sizeof(int)); + return 0; +} + +int do_struct ( + void *oldval, size_t *oldlenp, void *newval, size_t newlen, + int rdwr, void *data, size_t len) +{ + if (newval && !rdwr) + return -EPERM; + if (newval && newlen != len) + return -EINVAL; + if (oldval) { + int old_l; + get_user(old_l, oldlenp); + if (old_l < len) + return -ENOMEM; + put_user(len, oldlenp); + copy_to_user(oldval, data, len); + } + if (newval) + copy_from_user(data, newval, len); + return 0; +} + diff --git a/kernel/time.c b/kernel/time.c index 0424b2eaa..c2090a583 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -14,15 +14,8 @@ * Created file with time related functions from sched.c and adjtimex() * 1993-10-08 Torsten Duwe * adjtime interface update and CMOS clock write code - * 1994-07-02 Alan Modra - * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime - * 1995-03-26 Markus Kuhn - * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887 - * precision CMOS clock update - * - * to do: adjtimex() has to be updated to recent (1994-12-13) revision - * of David Mill's kernel clock model. For more information, check - * . + * 1995-08-13 Torsten Duwe + * kernel PLL updated to 1994-12-13 specs (rfc-1489) */ #include @@ -31,223 +24,91 @@ #include #include #include - -#include -#include - -#include #include -/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. - * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 - * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. - * - * [For the Julian calendar (which was used in Russia before 1917, - * Britain & colonies before 1752, anywhere else before 1582, - * and is still in use by some communities) leave out the - * -year/100+year/400 terms, and add 10.] - * - * This algorithm was first published by Gauss (I think). - * - * WARNING: this function will overflow on 2106-02-07 06:28:16 on - * machines were long is 32-bit! (However, as time_t is signed, we - * will already get problems at other places on 2038-01-19 03:14:08) +#include + +/* + * The timezone where the local system is located. Used as a default by some + * programs who obtain this value by using gettimeofday. */ -static inline unsigned long mktime(unsigned int year, unsigned int mon, - unsigned int day, unsigned int hour, - unsigned int min, unsigned int sec) +struct timezone sys_tz = { 0, 0}; + +static void do_normal_gettime(struct timeval * tm) { - if (0 >= (int) (mon -= 2)) { /* 1..12 -> 11,12,1..10 */ - mon += 12; /* Puts Feb last since it has leap day */ - year -= 1; - } - return ((( - (unsigned long)(year/4 - year/100 + year/400 + 367*mon/12 + day) + - year*365 - 719499 - )*24 + hour /* now have hours */ - )*60 + min /* now have minutes */ - )*60 + sec; /* finally seconds */ + *tm=xtime; } -void time_init(void) -{ - unsigned int year, mon, day, hour, min, sec; - int i; +void (*do_get_fast_time)(struct timeval *) = do_normal_gettime; - /* The Linux interpretation of the CMOS clock register contents: - * When the Update-In-Progress (UIP) flag goes from 1 to 0, the - * RTC registers show the second which has precisely just started. - * Let's hope other operating systems interpret the RTC the same way. - */ - /* read RTC exactly on falling edge of update flag */ - for (i = 0 ; i < 1000000 ; i++) /* may take up to 1 second... */ - if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) - break; - for (i = 0 ; i < 1000000 ; i++) /* must try at least 2.228 ms */ - if (!(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)) - break; - do { /* Isn't this overkill ? UIP above should guarantee consistency */ - sec = CMOS_READ(RTC_SECONDS); - min = CMOS_READ(RTC_MINUTES); - hour = CMOS_READ(RTC_HOURS); - day = CMOS_READ(RTC_DAY_OF_MONTH); - mon = CMOS_READ(RTC_MONTH); - year = CMOS_READ(RTC_YEAR); - } while (sec != CMOS_READ(RTC_SECONDS)); - if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - { - BCD_TO_BIN(sec); - BCD_TO_BIN(min); - BCD_TO_BIN(hour); - BCD_TO_BIN(day); - BCD_TO_BIN(mon); - BCD_TO_BIN(year); - } - if ((year += 1900) < 1970) - year += 100; - xtime.tv_sec = mktime(year, mon, day, hour, min, sec); - xtime.tv_usec = 0; -printk("Year : %d\n", year); -printk("Mon : %d\n", mon); -printk("Day : %d\n", day); -printk("Hour : %d\n", hour); -printk("Min : %d\n", min); -printk("Sec : %d\n", sec); +/* + * Generic way to access 'xtime' (the current time of day). + * This can be changed if the platform provides a more accurate (and fast!) + * version. + */ + +void get_fast_time(struct timeval * t) +{ + do_get_fast_time(t); } -/* - * The timezone where the local system is located. Used as a default by some - * programs who obtain this value by using gettimeofday. - */ -struct timezone sys_tz = { 0, 0}; +#ifndef __alpha__ -asmlinkage int sys_time(long * tloc) +/* + * sys_time() can be implemented in user-level using + * sys_gettimeofday(). Is this for backwards compatibility? If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). + */ +asmlinkage int sys_time(int * tloc) { - int i, error; + int i; i = CURRENT_TIME; if (tloc) { - error = verify_area(VERIFY_WRITE, tloc, 4); - if (error) - return error; - put_fs_long(i,(unsigned long *)tloc); + if (put_user(i,tloc)) + i = -EFAULT; } return i; } -asmlinkage int sys_stime(unsigned long * tptr) +/* + * sys_stime() can be implemented in user-level using + * sys_settimeofday(). Is this for backwards compatibility? If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). + */ +asmlinkage int sys_stime(int * tptr) { - int error; - unsigned long value; + int value; if (!suser()) return -EPERM; - error = verify_area(VERIFY_READ, tptr, sizeof(*tptr)); - if (error) - return error; - value = get_fs_long(tptr); + if (get_user(value, tptr)) + return -EFAULT; cli(); xtime.tv_sec = value; xtime.tv_usec = 0; - time_status = TIME_BAD; - time_maxerror = 0x70000000; - time_esterror = 0x70000000; + time_state = TIME_ERROR; + time_maxerror = MAXPHASE; + time_esterror = MAXPHASE; sti(); return 0; } -/* This function must be called with interrupts disabled - * It was inspired by Steve McCanne's microtime-i386 for BSD. -- jrs - * - * However, the pc-audio speaker driver changes the divisor so that - * it gets interrupted rather more often - it loads 64 into the - * counter rather than 11932! This has an adverse impact on - * do_gettimeoffset() -- it stops working! What is also not - * good is that the interval that our timer function gets called - * is no longer 10.0002 ms, but 9.9767 ms. To get around this - * would require using a different timing source. Maybe someone - * could use the RTC - I know that this can interrupt at frequencies - * ranging from 8192Hz to 2Hz. If I had the energy, I'd somehow fix - * it so that at startup, the timer code in sched.c would select - * using either the RTC or the 8253 timer. The decision would be - * based on whether there was any other device around that needed - * to trample on the 8253. I'd set up the RTC to interrupt at 1024 Hz, - * and then do some jiggery to have a version of do_timer that - * advanced the clock by 1/1024 s. Every time that reached over 1/100 - * of a second, then do all the old code. If the time was kept correct - * then do_gettimeoffset could just return 0 - there is no low order - * divider that can be accessed. - * - * Ideally, you would be able to use the RTC for the speaker driver, - * but it appears that the speaker driver really needs interrupt more - * often than every 120 us or so. - * - * Anyway, this needs more thought.... pjsg (1993-08-28) - * - * If you are really that interested, you should be reading - * comp.protocols.time.ntp! - */ - -#define TICK_SIZE tick - -static inline unsigned long do_gettimeoffset(void) -{ - int count; - unsigned long offset = 0; - - /* timer count may underflow right here */ - outb_p(0x00, 0x43); /* latch the count ASAP */ - count = inb_p(0x40); /* read the latched count */ - count |= inb(0x40) << 8; - /* we know probability of underflow is always MUCH less than 1% */ - if (count > (LATCH - LATCH/100)) { - /* check for pending timer interrupt */ - outb_p(0x0a, 0x20); - if (inb(0x20) & 1) - offset = TICK_SIZE; - } - count = ((LATCH-1) - count) * TICK_SIZE; - count = (count + LATCH/2) / LATCH; - return offset + count; -} - -/* - * This version of gettimeofday has near microsecond resolution. - */ -void do_gettimeofday(struct timeval *tv) -{ - unsigned long flags; - - save_flags(flags); - cli(); - *tv = xtime; -#if defined (__i386__) || defined (__mips__) - tv->tv_usec += do_gettimeoffset(); - if (tv->tv_usec >= 1000000) { - tv->tv_usec -= 1000000; - tv->tv_sec++; - } -#endif /* !defined (__i386__) && !defined (__mips__) */ - restore_flags(flags); -} +#endif asmlinkage int sys_gettimeofday(struct timeval *tv, struct timezone *tz) { - int error; - if (tv) { struct timeval ktv; - error = verify_area(VERIFY_WRITE, tv, sizeof *tv); - if (error) - return error; do_gettimeofday(&ktv); - memcpy_tofs(tv, &ktv, sizeof(ktv)); + if (copy_to_user(tv, &ktv, sizeof(ktv))) + return -EFAULT; } if (tz) { - error = verify_area(VERIFY_WRITE, tz, sizeof *tz); - if (error) - return error; - memcpy_tofs(tz, &sys_tz, sizeof(sys_tz)); + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; } return 0; } @@ -294,18 +155,12 @@ asmlinkage int sys_settimeofday(struct timeval *tv, struct timezone *tz) if (!suser()) return -EPERM; if (tv) { - int error = verify_area(VERIFY_READ, tv, sizeof(*tv)); - if (error) - return error; - memcpy_fromfs(&new_tv, tv, sizeof(*tv)); - } - if (tz) { - int error = verify_area(VERIFY_READ, tz, sizeof(*tz)); - if (error) - return error; - memcpy_fromfs(&new_tz, tz, sizeof(*tz)); + if (copy_from_user(&new_tv, tv, sizeof(*tv))) + return -EFAULT; } if (tz) { + if (copy_from_user(&new_tz, tz, sizeof(*tz))) + return -EFAULT; sys_tz = new_tz; if (firsttime) { firsttime = 0; @@ -313,30 +168,29 @@ asmlinkage int sys_settimeofday(struct timeval *tv, struct timezone *tz) warp_clock(); } } - if (tv) { - cli(); - /* This is revolting. We need to set the xtime.tv_usec - * correctly. However, the value in this location is - * is value at the last tick. - * Discover what correction gettimeofday - * would have done, and then undo it! - */ - new_tv.tv_usec -= do_gettimeoffset(); - - if (new_tv.tv_usec < 0) { - new_tv.tv_usec += 1000000; - new_tv.tv_sec--; - } - - xtime = new_tv; - time_status = TIME_BAD; - time_maxerror = 0x70000000; - time_esterror = 0x70000000; - sti(); - } + if (tv) + do_settimeofday(&new_tv); return 0; } +long pps_offset = 0; /* pps time offset (us) */ +long pps_jitter = MAXTIME; /* time dispersion (jitter) (us) */ + +long pps_freq = 0; /* frequency offset (scaled ppm) */ +long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */ + +long pps_valid = PPS_VALID; /* pps signal watchdog counter */ + +int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */ + +long pps_jitcnt = 0; /* jitter limit exceeded */ +long pps_calcnt = 0; /* calibration intervals */ +long pps_errcnt = 0; /* calibration errors */ +long pps_stbcnt = 0; /* stability limit exceeded */ + +/* hook for a loadable hardpps kernel module */ +void (*hardpps_ptr)(struct timeval *) = (void (*)(struct timeval *))0; + /* adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ @@ -348,36 +202,28 @@ asmlinkage int sys_adjtimex(struct timex *txc_p) /* Local copy of parameter */ struct timex txc; - error = verify_area(VERIFY_WRITE, txc_p, sizeof(struct timex)); - if (error) - return error; - /* Copy the user data space into the kernel copy * structure. But bear in mind that the structures * may change */ - memcpy_fromfs(&txc, txc_p, sizeof(struct timex)); + error = copy_from_user(&txc, txc_p, sizeof(struct timex)); + if (error) + return -EFAULT; /* In order to modify anything, you gotta be super-user! */ - if (txc.mode && !suser()) + if (txc.modes && !suser()) return -EPERM; /* Now we validate the data before disabling interrupts */ - if (txc.mode != ADJ_OFFSET_SINGLESHOT && (txc.mode & ADJ_OFFSET)) - /* Microsec field limited to -131000 .. 131000 usecs */ - if (txc.offset <= -(1 << (31 - SHIFT_UPDATE)) - || txc.offset >= (1 << (31 - SHIFT_UPDATE))) - return -EINVAL; - - /* time_status must be in a fairly small range */ - if (txc.mode & ADJ_STATUS) - if (txc.status < TIME_OK || txc.status > TIME_BAD) + if (txc.modes != ADJ_OFFSET_SINGLESHOT && (txc.modes & ADJ_OFFSET)) + /* adjustment Offset limited to +- .512 seconds */ + if (txc.offset <= - MAXPHASE || txc.offset >= MAXPHASE ) return -EINVAL; /* if the quartz is off by more than 10% something is VERY wrong ! */ - if (txc.mode & ADJ_TICK) + if (txc.modes & ADJ_TICK) if (txc.tick < 900000/HZ || txc.tick > 1100000/HZ) return -EINVAL; @@ -387,130 +233,115 @@ asmlinkage int sys_adjtimex(struct timex *txc_p) save_adjust = time_adjust; /* If there are input parameters, then process them */ - if (txc.mode) + if (txc.modes) { - if (time_status == TIME_BAD) - time_status = TIME_OK; + if (time_state == TIME_BAD) + time_state = TIME_OK; - if (txc.mode & ADJ_STATUS) + if (txc.modes & ADJ_STATUS) time_status = txc.status; - if (txc.mode & ADJ_FREQUENCY) - time_freq = txc.frequency << (SHIFT_KF - 16); + if (txc.modes & ADJ_FREQUENCY) + time_freq = txc.freq; - if (txc.mode & ADJ_MAXERROR) + if (txc.modes & ADJ_MAXERROR) time_maxerror = txc.maxerror; - if (txc.mode & ADJ_ESTERROR) + if (txc.modes & ADJ_ESTERROR) time_esterror = txc.esterror; - if (txc.mode & ADJ_TIMECONST) - time_constant = txc.time_constant; + if (txc.modes & ADJ_TIMECONST) + time_constant = txc.constant; - if (txc.mode & ADJ_OFFSET) - if (txc.mode == ADJ_OFFSET_SINGLESHOT) + if (txc.modes & ADJ_OFFSET) + if ((txc.modes == ADJ_OFFSET_SINGLESHOT) + || !(time_status & STA_PLL)) { time_adjust = txc.offset; } - else /* XXX should give an error if other bits set */ + else if ((time_status & STA_PLL)||(time_status & STA_PPSTIME)) { - time_offset = txc.offset << SHIFT_UPDATE; - mtemp = xtime.tv_sec - time_reftime; - time_reftime = xtime.tv_sec; - if (mtemp > (MAXSEC+2) || mtemp < 0) - mtemp = 0; - - if (txc.offset < 0) - time_freq -= (-txc.offset * mtemp) >> - (time_constant + time_constant); + ltemp = (time_status & STA_PPSTIME && + time_status & STA_PPSSIGNAL) ? + pps_offset : txc.offset; + + /* + * Scale the phase adjustment and + * clamp to the operating range. + */ + if (ltemp > MAXPHASE) + time_offset = MAXPHASE << SHIFT_UPDATE; + else if (ltemp < -MAXPHASE) + time_offset = -(MAXPHASE << SHIFT_UPDATE); else - time_freq += (txc.offset * mtemp) >> - (time_constant + time_constant); + time_offset = ltemp << SHIFT_UPDATE; - ltemp = time_tolerance << SHIFT_KF; + /* + * Select whether the frequency is to be controlled and in which + * mode (PLL or FLL). Clamp to the operating range. Ugly + * multiply/divide should be replaced someday. + */ - if (time_freq > ltemp) - time_freq = ltemp; - else if (time_freq < -ltemp) - time_freq = -ltemp; - } - if (txc.mode & ADJ_TICK) + if (time_status & STA_FREQHOLD || time_reftime == 0) + time_reftime = xtime.tv_sec; + mtemp = xtime.tv_sec - time_reftime; + time_reftime = xtime.tv_sec; + if (time_status & STA_FLL) + { + if (mtemp >= MINSEC) + { + ltemp = ((time_offset / mtemp) << (SHIFT_USEC - + SHIFT_UPDATE)); + if (ltemp < 0) + time_freq -= -ltemp >> SHIFT_KH; + else + time_freq += ltemp >> SHIFT_KH; + } + } + else + { + if (mtemp < MAXSEC) + { + ltemp *= mtemp; + if (ltemp < 0) + time_freq -= -ltemp >> (time_constant + + time_constant + SHIFT_KF - + SHIFT_USEC); + else + time_freq += ltemp >> (time_constant + + time_constant + SHIFT_KF - + SHIFT_USEC); + } + } + if (time_freq > time_tolerance) + time_freq = time_tolerance; + else if (time_freq < -time_tolerance) + time_freq = -time_tolerance; + } /* STA_PLL || STA_PPSTIME */ + if (txc.modes & ADJ_TICK) tick = txc.tick; } txc.offset = save_adjust; - txc.frequency = ((time_freq+1) >> (SHIFT_KF - 16)); + txc.freq = time_freq; txc.maxerror = time_maxerror; txc.esterror = time_esterror; txc.status = time_status; - txc.time_constant = time_constant; + txc.constant = time_constant; txc.precision = time_precision; txc.tolerance = time_tolerance; txc.time = xtime; txc.tick = tick; + txc.ppsfreq = pps_freq; + txc.jitter = pps_jitter; + txc.shift = pps_shift; + txc.stabil = pps_stabil; + txc.jitcnt = pps_jitcnt; + txc.calcnt = pps_calcnt; + txc.errcnt = pps_errcnt; + txc.stbcnt = pps_stbcnt; sti(); - memcpy_tofs(txc_p, &txc, sizeof(struct timex)); - return time_status; -} - -/* - * In order to set the CMOS clock precisely, set_rtc_mmss has to be - * called 500 ms after the second nowtime has started, because when - * nowtime is written into the registers of the CMOS clock, it will - * jump to the next second precisely 500 ms later. Check the Motorola - * MC146818A or Dallas DS12887 data sheet for details. - */ -int set_rtc_mmss(unsigned long nowtime) -{ - int retval = 0; - int real_seconds, real_minutes, cmos_minutes; - unsigned char save_control, save_freq_select; - - save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */ - CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); - - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */ - CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); - - cmos_minutes = CMOS_READ(RTC_MINUTES); - if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - BCD_TO_BIN(cmos_minutes); - - /* since we're only adjusting minutes and seconds, - * don't interfere with hour overflow. This avoids - * messing with unknown time zones but requires your - * RTC not to be off by more than 15 minutes - */ - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1) - real_minutes += 30; /* correct for half hour time zone */ - real_minutes %= 60; - - if (abs(real_minutes - cmos_minutes) < 30) - { - if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); - } - CMOS_WRITE(real_seconds,RTC_SECONDS); - CMOS_WRITE(real_minutes,RTC_MINUTES); - } - else - retval = -1; - - /* The following flags have to be released exactly in this order, - * otherwise the DS12887 (popular MC146818A clone with integrated - * battery and quartz) will not reset the oscillator and will not - * update precisely 500 ms later. You won't find this mentioned in - * the Dallas Semiconductor data sheets, but who believes data - * sheets anyway ... -- Markus Kuhn - */ - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - - return retval; + return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : time_state; } -- cgit v1.2.3