Diffstat (limited to 'arch/sparc64/kernel/smp.c')
-rw-r--r-- | arch/sparc64/kernel/smp.c | 260 |
1 file changed, 195 insertions, 65 deletions
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 4bdfca1b7..27344f4b6 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -5,6 +5,8 @@

 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
 #include <linux/tasks.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
@@ -34,24 +36,23 @@ extern int linux_num_cpus;
 extern void calibrate_delay(void);
 extern unsigned prom_cpu_nodes[];

-volatile int smp_processors_ready = 0;
-unsigned long cpu_present_map = 0;
-int smp_num_cpus = 1;
-int smp_threads_ready = 0;
+struct cpuinfo_sparc cpu_data[NR_CPUS] __attribute__ ((aligned (64)));

-struct cpuinfo_sparc cpu_data[NR_CPUS] __attribute__ ((aligned (64)));
+volatile int cpu_number_map[NR_CPUS] __attribute__ ((aligned (64)));
+volatile int __cpu_logical_map[NR_CPUS] __attribute__ ((aligned (64)));

-/* Please don't make this initdata!!! --DaveM */
+/* Please don't make this stuff initdata!!! --DaveM */
 static unsigned char boot_cpu_id = 0;
-
 static int smp_activated = 0;

-volatile int cpu_number_map[NR_CPUS];
-volatile int __cpu_logical_map[NR_CPUS];
-
 /* Kernel spinlock */
 spinlock_t kernel_flag = SPIN_LOCK_UNLOCKED;

+volatile int smp_processors_ready = 0;
+unsigned long cpu_present_map = 0;
+int smp_num_cpus = 1;
+int smp_threads_ready = 0;
+
 __initfunc(void smp_setup(char *str, int *ints))
 {
         /* XXX implement me XXX */
@@ -84,6 +85,8 @@ int smp_bogo(char *buf)

 __initfunc(void smp_store_cpu_info(int id))
 {
+        int i;
+
         cpu_data[id].irq_count = 0;
         cpu_data[id].bh_count = 0;
         /* multiplier and counter set by
@@ -94,16 +97,18 @@ __initfunc(void smp_store_cpu_info(int id))
         cpu_data[id].pte_cache = NULL;
         cpu_data[id].pgdcache_size = 0;
         cpu_data[id].pgd_cache = NULL;
-}
+        cpu_data[id].idle_volume = 1;

-extern void distribute_irqs(void);
+        for(i = 0; i < 16; i++)
+                cpu_data[id].irq_worklists[i] = 0;
+}

 __initfunc(void smp_commence(void))
 {
-        distribute_irqs();
 }

 static void smp_setup_percpu_timer(void);
+static void smp_tune_scheduling(void);

 static volatile unsigned long callin_flag = 0;

@@ -173,10 +178,16 @@ void cpu_panic(void)
         panic("SMP bolixed\n");
 }

-extern struct prom_cpuinfo linux_cpus[NR_CPUS];
+extern struct prom_cpuinfo linux_cpus[64];

 extern unsigned long smp_trampoline;

+/* The OBP cpu startup callback truncates the 3rd arg cookie to
+ * 32-bits (I think) so to be safe we have it read the pointer
+ * contained here so we work on >4GB machines. -DaveM
+ */
+static struct task_struct *cpu_new_task = NULL;
+
 __initfunc(void smp_boot_cpus(void))
 {
         int cpucount = 0, i;
@@ -184,6 +195,8 @@ __initfunc(void smp_boot_cpus(void))
         printk("Entering UltraSMPenguin Mode...\n");
         __sti();
         smp_store_cpu_info(boot_cpu_id);
+        smp_tune_scheduling();
+        init_idle();

         if(linux_num_cpus == 1)
                 return;
@@ -194,21 +207,25 @@ __initfunc(void smp_boot_cpus(void))

                 if(cpu_present_map & (1UL << i)) {
                         unsigned long entry = (unsigned long)(&smp_trampoline);
+                        unsigned long cookie = (unsigned long)(&cpu_new_task);
                         struct task_struct *p;
                         int timeout;
                         int no;

                         extern unsigned long phys_base;
                         entry += phys_base - KERNBASE;
+                        cookie += phys_base - KERNBASE;
                         kernel_thread(start_secondary, NULL, CLONE_PID);
                         p = task[++cpucount];
                         p->processor = i;
+                        p->has_cpu = 1; /* we schedule the first task manually */
                         callin_flag = 0;
                         for (no = 0; no < linux_num_cpus; no++)
                                 if (linux_cpus[no].mid == i)
                                         break;
+                        cpu_new_task = p;
                         prom_startcpu(linux_cpus[no].prom_node,
-                                      entry, ((unsigned long)p));
+                                      entry, cookie);
                         for(timeout = 0; timeout < 5000000; timeout++) {
                                 if(callin_flag)
                                         break;
@@ -216,8 +233,8 @@ __initfunc(void smp_boot_cpus(void))
                         }
                         if(callin_flag) {
                                 cpu_number_map[i] = cpucount;
-                                prom_cpu_nodes[i] = linux_cpus[no].prom_node;
                                 __cpu_logical_map[cpucount] = i;
+                                prom_cpu_nodes[i] = linux_cpus[no].prom_node;
                         } else {
                                 cpucount--;
                                 printk("Processor %d is stuck.\n", i);
@@ -228,6 +245,7 @@ __initfunc(void smp_boot_cpus(void))
                         cpu_number_map[i] = -1;
                 }
         }
+        cpu_new_task = NULL;
         if(cpucount == 0) {
                 printk("Error: only one processor found.\n");
                 cpu_present_map = (1UL << smp_processor_id());
@@ -249,17 +267,6 @@ __initfunc(void smp_boot_cpus(void))
         membar("#StoreStore | #StoreLoad");
 }

-/* We don't even need to do anything, the only generic message pass done
- * anymore is to stop all cpus during a panic(). When the user drops to
- * the PROM prompt, the firmware will send the other cpu's it's MONDO
- * vector anyways, so doing anything special here is pointless.
- *
- * This whole thing should go away anyways...
- */
-void smp_message_pass(int target, int msg, unsigned long data, int wait)
-{
-}
-
 /* #define XCALL_DEBUG */

 static inline void xcall_deliver(u64 data0, u64 data1, u64 data2, u64 pstate, unsigned long cpu)
@@ -342,6 +349,17 @@ extern unsigned long xcall_flush_tlb_all;
 extern unsigned long xcall_tlbcachesync;
 extern unsigned long xcall_flush_cache_all;
 extern unsigned long xcall_report_regs;
+extern unsigned long xcall_receive_signal;
+
+void smp_receive_signal(int cpu)
+{
+        if(smp_processors_ready &&
+           (cpu_present_map & (1UL<<cpu)) != 0) {
+                u64 pstate, data0 = (((u64)&xcall_receive_signal) & 0xffffffff);
+                __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
+                xcall_deliver(data0, 0, 0, pstate, cpu);
+        }
+}

 void smp_report_regs(void)
 {
@@ -364,37 +382,51 @@ void smp_flush_tlb_all(void)
  * to the stack before we get here because all callers of us
  * are flush_tlb_*() routines, and these run after flush_cache_*()
  * which performs the flushw.
+ *
+ * The SMP TLB coherency scheme we use works as follows:
+ *
+ * 1) mm->cpu_vm_mask is a bit mask of which cpus an address
+ *    space has (potentially) executed on, this is the heuristic
+ *    we use to avoid doing cross calls.
+ *
+ * 2) TLB context numbers are shared globally across all processors
+ *    in the system, this allows us to play several games to avoid
+ *    cross calls.
+ *
+ * One invariant is that when a cpu switches to a process, and
+ * that processes tsk->mm->cpu_vm_mask does not have the current
+ * cpu's bit set, that tlb context is flushed locally.
+ *
+ * If the address space is non-shared (ie. mm->count == 1) we avoid
+ * cross calls when we want to flush the currently running process's
+ * tlb state. This is done by clearing all cpu bits except the current
+ * processor's in current->mm->cpu_vm_mask and performing the flush
+ * locally only. This will force any subsequent cpus which run this
+ * task to flush the context from the local tlb if the process migrates
+ * to another cpu (again).
+ *
+ * 3) For shared address spaces (threads) and swapping we bite the
+ *    bullet for most cases and perform the cross call.
+ *
+ * The performance gain from "optimizing" away the cross call for threads is
+ * questionable (in theory the big win for threads is the massive sharing of
+ * address space state across processors).
+ *
+ * For the swapping case the locking is difficult to get right, we'd have to
+ * enforce strict ordered access to mm->cpu_vm_mask via a spinlock for example.
+ * Then again one could argue that when you are swapping, the cost of a cross
+ * call won't even show up on the performance radar. But in any case we do get
+ * rid of the cross-call when the task has a dead context or the task has only
+ * ever run on the local cpu.
  */
-static void smp_cross_call_avoidance(struct mm_struct *mm)
-{
-        u32 ctx;
-
-        spin_lock(&scheduler_lock);
-        get_new_mmu_context(mm);
-        mm->cpu_vm_mask = (1UL << smp_processor_id());
-        current->tss.ctx = ctx = mm->context & 0x3ff;
-        spitfire_set_secondary_context(ctx);
-        __asm__ __volatile__("flush %g6");
-        spitfire_flush_dtlb_secondary_context();
-        spitfire_flush_itlb_secondary_context();
-        __asm__ __volatile__("flush %g6");
-        if(!segment_eq(current->tss.current_ds,USER_DS)) {
-                /* Rarely happens. */
-                current->tss.ctx = 0;
-                spitfire_set_secondary_context(0);
-                __asm__ __volatile__("flush %g6");
-        }
-        spin_unlock(&scheduler_lock);
-}
-
 void smp_flush_tlb_mm(struct mm_struct *mm)
 {
         u32 ctx = mm->context & 0x3ff;

         if(mm == current->mm && atomic_read(&mm->count) == 1) {
-                if(mm->cpu_vm_mask == (1UL << smp_processor_id()))
-                        goto local_flush_and_out;
-                return smp_cross_call_avoidance(mm);
+                if(mm->cpu_vm_mask != (1UL << smp_processor_id()))
+                        mm->cpu_vm_mask = (1UL << smp_processor_id());
+                goto local_flush_and_out;
         }
         smp_cross_call(&xcall_flush_tlb_mm, ctx, 0, 0);

@@ -410,9 +442,9 @@ void smp_flush_tlb_range(struct mm_struct *mm, unsigned long start,
         start &= PAGE_MASK;
         end &= PAGE_MASK;
         if(mm == current->mm && atomic_read(&mm->count) == 1) {
-                if(mm->cpu_vm_mask == (1UL << smp_processor_id()))
-                        goto local_flush_and_out;
-                return smp_cross_call_avoidance(mm);
+                if(mm->cpu_vm_mask != (1UL << smp_processor_id()))
+                        mm->cpu_vm_mask = (1UL << smp_processor_id());
+                goto local_flush_and_out;
         }
         smp_cross_call(&xcall_flush_tlb_range, ctx, start, end);

@@ -426,22 +458,26 @@ void smp_flush_tlb_page(struct mm_struct *mm, unsigned long page)

         page &= PAGE_MASK;
         if(mm == current->mm && atomic_read(&mm->count) == 1) {
-                if(mm->cpu_vm_mask == (1UL << smp_processor_id()))
-                        goto local_flush_and_out;
-                return smp_cross_call_avoidance(mm);
-        }
-#if 0 /* XXX Disabled until further notice... */
-        else if(atomic_read(&mm->count) == 1) {
+                if(mm->cpu_vm_mask != (1UL << smp_processor_id()))
+                        mm->cpu_vm_mask = (1UL << smp_processor_id());
+                goto local_flush_and_out;
+        } else {
                 /* Try to handle two special cases to avoid cross calls
                  * in common scenerios where we are swapping process
                  * pages out.
                  */
-                if((mm->context ^ tlb_context_cache) & CTX_VERSION_MASK)
+                if(((mm->context ^ tlb_context_cache) & CTX_VERSION_MASK) ||
+                   (mm->cpu_vm_mask == 0)) {
+                        /* A dead context cannot ever become "alive" until
+                         * a task switch is done to it.
+                         */
                         return; /* It's dead, nothing to do. */
-                if(mm->cpu_vm_mask == (1UL << smp_processor_id()))
-                        goto local_flush_and_out;
+                }
+                if(mm->cpu_vm_mask == (1UL << smp_processor_id())) {
+                        __flush_tlb_page(ctx, page, SECONDARY_CONTEXT);
+                        return; /* Only local flush is necessary. */
+                }
         }
-#endif
         smp_cross_call(&xcall_flush_tlb_page, ctx, page, 0);

 local_flush_and_out:
@@ -644,6 +680,100 @@ __initfunc(void smp_tick_init(void))
         prof_counter(boot_cpu_id) = prof_multiplier(boot_cpu_id) = 1;
 }

+static inline unsigned long find_flush_base(unsigned long size)
+{
+        struct page *p = mem_map;
+        unsigned long found, base;
+
+        size = PAGE_ALIGN(size);
+        found = size;
+        base = page_address(p);
+        while(found != 0) {
+                /* Failure. */
+                if(p >= (mem_map + max_mapnr))
+                        return 0UL;
+                if(PageSkip(p)) {
+                        p = p->next_hash;
+                        base = page_address(p);
+                        found = size;
+                } else {
+                        found -= PAGE_SIZE;
+                        p++;
+                }
+        }
+        return base;
+}
+
+cycles_t cacheflush_time;
+
+__initfunc(static void smp_tune_scheduling (void))
+{
+        unsigned long flush_base, flags, *p;
+        unsigned int ecache_size;
+        cycles_t tick1, tick2, raw;
+
+        /* Approximate heuristic for SMP scheduling. It is an
+         * estimation of the time it takes to flush the L2 cache
+         * on the local processor.
+         *
+         * The ia32 chooses to use the L1 cache flush time instead,
+         * and I consider this complete nonsense. The Ultra can service
+         * a miss to the L1 with a hit to the L2 in 7 or 8 cycles, and
+         * L2 misses are what create extra bus traffic (ie. the "cost"
+         * of moving a process from one cpu to another).
+         */
+        printk("SMP: Calibrating ecache flush... ");
+        ecache_size = prom_getintdefault(linux_cpus[0].prom_node,
+                                         "ecache-size", (512 *1024));
+        flush_base = find_flush_base(ecache_size << 1);
+
+        if(flush_base != 0UL) {
+                __save_and_cli(flags);
+
+                /* Scan twice the size once just to get the TLB entries
+                 * loaded and make sure the second scan measures pure misses.
+                 */
+                for(p = (unsigned long *)flush_base;
+                    ((unsigned long)p) < (flush_base + (ecache_size<<1));
+                    p += (64 / sizeof(unsigned long)))
+                        *((volatile unsigned long *)p);
+
+                /* Now the real measurement. */
+                __asm__ __volatile__("
+                b,pt    %%xcc, 1f
+                 rd     %%tick, %0
+
+                .align  64
+1:              ldx     [%2 + 0x000], %%g1
+                ldx     [%2 + 0x040], %%g2
+                ldx     [%2 + 0x080], %%g3
+                ldx     [%2 + 0x0c0], %%g5
+                add     %2, 0x100, %2
+                cmp     %2, %4
+                bne,pt  %%xcc, 1b
+                 nop
+
+                rd      %%tick, %1"
+                : "=&r" (tick1), "=&r" (tick2), "=&r" (flush_base)
+                : "2" (flush_base), "r" (flush_base + ecache_size)
+                : "g1", "g2", "g3", "g5");
+
+                __restore_flags(flags);
+
+                raw = (tick2 - tick1);
+
+                /* Dampen it a little, considering two processes
+                 * sharing the cache and fitting.
+                 */
+                cacheflush_time = (raw - (raw >> 2));
+        } else
+                cacheflush_time = ((ecache_size << 2) +
+                                   (ecache_size << 1));
+
+        printk("Using heuristic of %d cycles.\n",
+               (int) cacheflush_time);
+}
+
 int __init setup_profiling_timer(unsigned int multiplier)
 {
         unsigned long flags;
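A note on the E-cache calibration added in the final hunk: the cost estimate is computed entirely with shifts, which is easy to misread. The short user-space sketch below is not kernel code; the raw tick delta and the E-cache size are made-up example inputs standing in for the %tick measurement and the OBP "ecache-size" property. It only spells out the two estimates the patch produces: the measured flush cost dampened to 3/4 of its raw value, and the fallback of six times the E-cache size used when find_flush_base() cannot locate a contiguous flush window.

#include <stdio.h>

/* Illustration of the cacheflush_time arithmetic in smp_tune_scheduling()
 * above.  The inputs are invented example values, not numbers read from
 * real UltraSPARC hardware.
 */
int main(void)
{
        unsigned long long raw = 40000ULL;           /* stands in for tick2 - tick1 */
        unsigned long long ecache_size = 512 * 1024; /* stands in for "ecache-size" */

        /* Measured path: dampen the raw flush cost to 3/4 of its value,
         * the same as raw - (raw >> 2) in the patch.
         */
        unsigned long long measured = raw - (raw >> 2);

        /* Fallback path when no flush window is found:
         * (size << 2) + (size << 1) == 6 * size.
         */
        unsigned long long fallback = (ecache_size << 2) + (ecache_size << 1);

        printf("dampened estimate: %llu cycles\n", measured);
        printf("fallback estimate: %llu cycles\n", fallback);
        return 0;
}

Both paths end up in cacheflush_time, which the comment in the patch describes as the estimated cost of moving a process from one cpu to another; the scheduler heuristic works from that single number.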