/* * linux/kernel/fork.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * 'fork.c' contains the help-routines for the 'fork' system call * (see also system_call.s). * Fork is rather simple, once you get the hang of it, but the memory * management can be a bitch. See 'mm/mm.c': 'copy_page_tables()' */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int nr_tasks=1; int nr_running=1; unsigned long int total_forks=0; /* Handle normal Linux uptimes. */ int last_pid=0; /* SLAB cache for mm_struct's. */ kmem_cache_t *mm_cachep; struct task_struct *pidhash[PIDHASH_SZ]; spinlock_t pidhash_lock = SPIN_LOCK_UNLOCKED; struct task_struct **tarray_freelist = NULL; spinlock_t taskslot_lock = SPIN_LOCK_UNLOCKED; /* UID task count cache, to prevent walking entire process list every * single fork() operation. */ #define UIDHASH_SZ (PIDHASH_SZ >> 2) static struct uid_taskcount { struct uid_taskcount *next, **pprev; unsigned short uid; int task_count; } *uidhash[UIDHASH_SZ]; static spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED; kmem_cache_t *uid_cachep; #define uidhashfn(uid) (((uid >> 8) ^ uid) & (UIDHASH_SZ - 1)) static inline void uid_hash_insert(struct uid_taskcount *up, unsigned int hashent) { spin_lock(&uidhash_lock); if((up->next = uidhash[hashent]) != NULL) uidhash[hashent]->pprev = &up->next; up->pprev = &uidhash[hashent]; uidhash[hashent] = up; spin_unlock(&uidhash_lock); } static inline void uid_hash_remove(struct uid_taskcount *up) { spin_lock(&uidhash_lock); if(up->next) up->next->pprev = up->pprev; *up->pprev = up->next; spin_unlock(&uidhash_lock); } static inline struct uid_taskcount *uid_find(unsigned short uid, unsigned int hashent) { struct uid_taskcount *up; spin_lock(&uidhash_lock); for(up = uidhash[hashent]; (up && up->uid != uid); up = up->next) ; spin_unlock(&uidhash_lock); return up; } int charge_uid(struct task_struct *p, int count) { unsigned int hashent = uidhashfn(p->uid); struct uid_taskcount *up = uid_find(p->uid, hashent); if(up) { int limit = p->rlim[RLIMIT_NPROC].rlim_cur; int newcnt = up->task_count + count; if(newcnt > limit) return -EAGAIN; else if(newcnt == 0) { uid_hash_remove(up); kmem_cache_free(uid_cachep, up); return 0; } } else { up = kmem_cache_alloc(uid_cachep, SLAB_KERNEL); if(!up) return -EAGAIN; up->uid = p->uid; up->task_count = 0; uid_hash_insert(up, hashent); } up->task_count += count; return 0; } void uidcache_init(void) { int i; uid_cachep = kmem_cache_create("uid_cache", sizeof(struct uid_taskcount), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); if(!uid_cachep) panic("Cannot create uid taskcount SLAB cache\n"); for(i = 0; i < UIDHASH_SZ; i++) uidhash[i] = 0; } static inline int find_empty_process(void) { struct task_struct **tslot; if(current->uid) { int error; if(nr_tasks >= NR_TASKS - MIN_TASKS_LEFT_FOR_ROOT) return -EAGAIN; if((error = charge_uid(current, 1)) < 0) return error; } tslot = get_free_taskslot(); if(tslot) return tslot - &task[0]; return -EAGAIN; } /* Protects next_safe and last_pid. */ static spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED; static int get_pid(unsigned long flags) { static int next_safe = PID_MAX; struct task_struct *p; if (flags & CLONE_PID) return current->pid; spin_lock(&lastpid_lock); if((++last_pid) & 0xffff8000) { last_pid = 300; /* Skip daemons etc. */ goto inside; } if(last_pid >= next_safe) { inside: next_safe = PID_MAX; read_lock(&tasklist_lock); repeat: for_each_task(p) { if(p->pid == last_pid || p->pgrp == last_pid || p->session == last_pid) { if(++last_pid >= next_safe) { if(last_pid & 0xffff8000) last_pid = 300; next_safe = PID_MAX; goto repeat; } } if(p->pid > last_pid && next_safe > p->pid) next_safe = p->pid; if(p->pgrp > last_pid && next_safe > p->pgrp) next_safe = p->pgrp; if(p->session > last_pid && next_safe > p->session) next_safe = p->session; } read_unlock(&tasklist_lock); } spin_unlock(&lastpid_lock); return last_pid; } static inline int dup_mmap(struct mm_struct * mm) { struct vm_area_struct * mpnt, *tmp, **pprev; mm->mmap = mm->mmap_cache = NULL; flush_cache_mm(current->mm); pprev = &mm->mmap; for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { struct inode *inode; tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!tmp) { exit_mmap(mm); flush_tlb_mm(current->mm); return -ENOMEM; } *tmp = *mpnt; tmp->vm_flags &= ~VM_LOCKED; tmp->vm_mm = mm; tmp->vm_next = NULL; inode = tmp->vm_inode; if (inode) { inode->i_count++; if (tmp->vm_flags & VM_DENYWRITE) inode->i_writecount--; /* insert tmp into the share list, just after mpnt */ if((tmp->vm_next_share = mpnt->vm_next_share) != NULL) mpnt->vm_next_share->vm_pprev_share = &tmp->vm_next_share; mpnt->vm_next_share = tmp; tmp->vm_pprev_share = &mpnt->vm_next_share; } if (copy_page_range(mm, current->mm, tmp)) { exit_mmap(mm); flush_tlb_mm(current->mm); return -ENOMEM; } if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); /* Ok, finally safe to link it in. */ if((tmp->vm_next = *pprev) != NULL) (*pprev)->vm_pprev = &tmp->vm_next; *pprev = tmp; tmp->vm_pprev = pprev; pprev = &tmp->vm_next; } flush_tlb_mm(current->mm); return 0; } static inline int copy_mm(unsigned long clone_flags, struct task_struct * tsk) { if (!(clone_flags & CLONE_VM)) { struct mm_struct * mm = kmem_cache_alloc(mm_cachep, SLAB_KERNEL); if (!mm) return -1; *mm = *current->mm; init_new_context(mm); mm->count = 1; mm->def_flags = 0; /* It has not run yet, so cannot be present in anyone's * cache or tlb. */ mm->cpu_vm_mask = 0; tsk->mm = mm; tsk->min_flt = tsk->maj_flt = 0; tsk->cmin_flt = tsk->cmaj_flt = 0; tsk->nswap = tsk->cnswap = 0; if (new_page_tables(tsk)) goto free_mm; if (dup_mmap(mm)) { free_page_tables(mm); free_mm: kmem_cache_free(mm_cachep, mm); return -1; } return 0; } current->mm->count++; SET_PAGE_DIR(tsk, current->mm->pgd); return 0; } static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) { if (clone_flags & CLONE_FS) { current->fs->count++; return 0; } tsk->fs = kmalloc(sizeof(*tsk->fs), GFP_KERNEL); if (!tsk->fs) return -1; tsk->fs->count = 1; tsk->fs->umask = current->fs->umask; if ((tsk->fs->root = current->fs->root)) tsk->fs->root->i_count++; if ((tsk->fs->pwd = current->fs->pwd)) tsk->fs->pwd->i_count++; return 0; } static inline int copy_files(unsigned long clone_flags, struct task_struct * tsk) { int i; struct files_struct *oldf, *newf; struct file **old_fds, **new_fds; oldf = current->files; if (clone_flags & CLONE_FILES) { oldf->count++; return 0; } newf = kmalloc(sizeof(*newf), GFP_KERNEL); tsk->files = newf; if (!newf) return -1; newf->count = 1; newf->close_on_exec = oldf->close_on_exec; newf->open_fds = oldf->open_fds; old_fds = oldf->fd; new_fds = newf->fd; for (i = NR_OPEN; i != 0; i--) { struct file * f = *old_fds; old_fds++; *new_fds = f; new_fds++; if (f) f->f_count++; } return 0; } static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) { if (clone_flags & CLONE_SIGHAND) { atomic_inc(¤t->sig->count); return 0; } tsk->sig = kmalloc(sizeof(*tsk->sig), GFP_KERNEL); if (!tsk->sig) return -1; spin_lock_init(&tsk->sig->siglock); atomic_set(&tsk->sig->count, 1); memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action)); return 0; } /* * Ok, this is the main fork-routine. It copies the system process * information (task[nr]) and sets up the necessary registers. It * also copies the data segment in its entirety. */ int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs) { int nr; int error = -ENOMEM; struct task_struct *p; lock_kernel(); p = alloc_task_struct(); if (!p) goto bad_fork; error = -EAGAIN; nr = find_empty_process(); if (nr < 0) goto bad_fork_free; *p = *current; if (p->exec_domain && p->exec_domain->module) __MOD_INC_USE_COUNT(p->exec_domain->module); if (p->binfmt && p->binfmt->module) __MOD_INC_USE_COUNT(p->binfmt->module); p->did_exec = 0; p->swappable = 0; p->state = TASK_UNINTERRUPTIBLE; p->flags &= ~(PF_PTRACED|PF_TRACESYS|PF_SUPERPRIV); p->flags |= PF_FORKNOEXEC; p->pid = get_pid(clone_flags); p->next_run = NULL; p->prev_run = NULL; p->p_pptr = p->p_opptr = current; p->p_cptr = NULL; init_waitqueue(&p->wait_chldexit); p->signal = 0; p->it_real_value = p->it_virt_value = p->it_prof_value = 0; p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0; init_timer(&p->real_timer); p->real_timer.data = (unsigned long) p; p->leader = 0; /* session leadership doesn't inherit */ p->tty_old_pgrp = 0; p->times.tms_utime = p->times.tms_stime = 0; p->times.tms_cutime = p->times.tms_cstime = 0; #ifdef __SMP__ p->has_cpu = 0; p->processor = NO_PROC_ID; #endif p->lock_depth = 0; p->start_time = jiffies; p->tarray_ptr = &task[nr]; *p->tarray_ptr = p; SET_LINKS(p); hash_pid(p); nr_tasks++; error = -ENOMEM; /* copy all the process information */ if (copy_files(clone_flags, p)) goto bad_fork_cleanup; if (copy_fs(clone_flags, p)) goto bad_fork_cleanup_files; if (copy_sighand(clone_flags, p)) goto bad_fork_cleanup_fs; if (copy_mm(clone_flags, p)) goto bad_fork_cleanup_sighand; error = copy_thread(nr, clone_flags, usp, p, regs); if (error) goto bad_fork_cleanup_sighand; p->semundo = NULL; /* ok, now we should be set up.. */ p->swappable = 1; p->exit_signal = clone_flags & CSIGNAL; /* * "share" dynamic priority between parent and child, thus the * total amount of dynamic priorities in the system doesnt change, * more scheduling fairness. This is only important in the first * timeslice, on the long run the scheduling behaviour is unchanged. */ current->counter >>= 1; p->counter = current->counter; if(p->pid) { wake_up_process(p); /* do this last, just in case */ } else { p->state = TASK_RUNNING; p->next_run = p->prev_run = p; } ++total_forks; error = p->pid; goto fork_out; bad_fork_cleanup_sighand: exit_sighand(p); bad_fork_cleanup_fs: exit_fs(p); bad_fork_cleanup_files: exit_files(p); bad_fork_cleanup: charge_uid(current, -1); if (p->exec_domain && p->exec_domain->module) __MOD_DEC_USE_COUNT(p->exec_domain->module); if (p->binfmt && p->binfmt->module) __MOD_DEC_USE_COUNT(p->binfmt->module); add_free_taskslot(p->tarray_ptr); unhash_pid(p); REMOVE_LINKS(p); nr_tasks--; bad_fork_free: free_task_struct(p); bad_fork: fork_out: unlock_kernel(); return error; }