Diffstat (limited to 'arch/sparc64/kernel/smp.c')
-rw-r--r--  arch/sparc64/kernel/smp.c | 260
1 file changed, 195 insertions, 65 deletions
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 4bdfca1b7..27344f4b6 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -5,6 +5,8 @@
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
#include <linux/tasks.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
@@ -34,24 +36,23 @@ extern int linux_num_cpus;
extern void calibrate_delay(void);
extern unsigned prom_cpu_nodes[];
-volatile int smp_processors_ready = 0;
-unsigned long cpu_present_map = 0;
-int smp_num_cpus = 1;
-int smp_threads_ready = 0;
+struct cpuinfo_sparc cpu_data[NR_CPUS] __attribute__ ((aligned (64)));
-struct cpuinfo_sparc cpu_data[NR_CPUS] __attribute__ ((aligned (64)));
+volatile int cpu_number_map[NR_CPUS] __attribute__ ((aligned (64)));
+volatile int __cpu_logical_map[NR_CPUS] __attribute__ ((aligned (64)));
-/* Please don't make this initdata!!! --DaveM */
+/* Please don't make this stuff initdata!!! --DaveM */
static unsigned char boot_cpu_id = 0;
-
static int smp_activated = 0;
-volatile int cpu_number_map[NR_CPUS];
-volatile int __cpu_logical_map[NR_CPUS];
-
/* Kernel spinlock */
spinlock_t kernel_flag = SPIN_LOCK_UNLOCKED;
+volatile int smp_processors_ready = 0;
+unsigned long cpu_present_map = 0;
+int smp_num_cpus = 1;
+int smp_threads_ready = 0;
+
__initfunc(void smp_setup(char *str, int *ints))
{
/* XXX implement me XXX */
@@ -84,6 +85,8 @@ int smp_bogo(char *buf)
__initfunc(void smp_store_cpu_info(int id))
{
+ int i;
+
cpu_data[id].irq_count = 0;
cpu_data[id].bh_count = 0;
/* multiplier and counter set by
@@ -94,16 +97,18 @@ __initfunc(void smp_store_cpu_info(int id))
cpu_data[id].pte_cache = NULL;
cpu_data[id].pgdcache_size = 0;
cpu_data[id].pgd_cache = NULL;
-}
+ cpu_data[id].idle_volume = 1;
-extern void distribute_irqs(void);
+ for(i = 0; i < 16; i++)
+ cpu_data[id].irq_worklists[i] = 0;
+}
__initfunc(void smp_commence(void))
{
- distribute_irqs();
}
static void smp_setup_percpu_timer(void);
+static void smp_tune_scheduling(void);
static volatile unsigned long callin_flag = 0;
@@ -173,10 +178,16 @@ void cpu_panic(void)
panic("SMP bolixed\n");
}
-extern struct prom_cpuinfo linux_cpus[NR_CPUS];
+extern struct prom_cpuinfo linux_cpus[64];
extern unsigned long smp_trampoline;
+/* The OBP cpu startup callback truncates the 3rd arg cookie to
+ * 32-bits (I think) so to be safe we have it read the pointer
+ * contained here so we work on >4GB machines. -DaveM
+ */
+static struct task_struct *cpu_new_task = NULL;
+
__initfunc(void smp_boot_cpus(void))
{
int cpucount = 0, i;
@@ -184,6 +195,8 @@ __initfunc(void smp_boot_cpus(void))
printk("Entering UltraSMPenguin Mode...\n");
__sti();
smp_store_cpu_info(boot_cpu_id);
+ smp_tune_scheduling();
+ init_idle();
if(linux_num_cpus == 1)
return;
@@ -194,21 +207,25 @@ __initfunc(void smp_boot_cpus(void))
if(cpu_present_map & (1UL << i)) {
unsigned long entry = (unsigned long)(&smp_trampoline);
+ unsigned long cookie = (unsigned long)(&cpu_new_task);
struct task_struct *p;
int timeout;
int no;
extern unsigned long phys_base;
entry += phys_base - KERNBASE;
+ cookie += phys_base - KERNBASE;
kernel_thread(start_secondary, NULL, CLONE_PID);
p = task[++cpucount];
p->processor = i;
+ p->has_cpu = 1; /* we schedule the first task manually */
callin_flag = 0;
for (no = 0; no < linux_num_cpus; no++)
if (linux_cpus[no].mid == i)
break;
+ cpu_new_task = p;
prom_startcpu(linux_cpus[no].prom_node,
- entry, ((unsigned long)p));
+ entry, cookie);
for(timeout = 0; timeout < 5000000; timeout++) {
if(callin_flag)
break;
@@ -216,8 +233,8 @@ __initfunc(void smp_boot_cpus(void))
}
if(callin_flag) {
cpu_number_map[i] = cpucount;
- prom_cpu_nodes[i] = linux_cpus[no].prom_node;
__cpu_logical_map[cpucount] = i;
+ prom_cpu_nodes[i] = linux_cpus[no].prom_node;
} else {
cpucount--;
printk("Processor %d is stuck.\n", i);
@@ -228,6 +245,7 @@ __initfunc(void smp_boot_cpus(void))
cpu_number_map[i] = -1;
}
}
+ cpu_new_task = NULL;
if(cpucount == 0) {
printk("Error: only one processor found.\n");
cpu_present_map = (1UL << smp_processor_id());
@@ -249,17 +267,6 @@ __initfunc(void smp_boot_cpus(void))
membar("#StoreStore | #StoreLoad");
}
-/* We don't even need to do anything, the only generic message pass done
- * anymore is to stop all cpus during a panic(). When the user drops to
- * the PROM prompt, the firmware will send the other cpu's it's MONDO
- * vector anyways, so doing anything special here is pointless.
- *
- * This whole thing should go away anyways...
- */
-void smp_message_pass(int target, int msg, unsigned long data, int wait)
-{
-}
-
/* #define XCALL_DEBUG */
static inline void xcall_deliver(u64 data0, u64 data1, u64 data2, u64 pstate, unsigned long cpu)
@@ -342,6 +349,17 @@ extern unsigned long xcall_flush_tlb_all;
extern unsigned long xcall_tlbcachesync;
extern unsigned long xcall_flush_cache_all;
extern unsigned long xcall_report_regs;
+extern unsigned long xcall_receive_signal;
+
+void smp_receive_signal(int cpu)
+{
+ if(smp_processors_ready &&
+ (cpu_present_map & (1UL<<cpu)) != 0) {
+ u64 pstate, data0 = (((u64)&xcall_receive_signal) & 0xffffffff);
+ __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
+ xcall_deliver(data0, 0, 0, pstate, cpu);
+ }
+}
void smp_report_regs(void)
{
@@ -364,37 +382,51 @@ void smp_flush_tlb_all(void)
* to the stack before we get here because all callers of us
* are flush_tlb_*() routines, and these run after flush_cache_*()
* which performs the flushw.
+ *
+ * The SMP TLB coherency scheme we use works as follows:
+ *
+ * 1) mm->cpu_vm_mask is a bit mask of which cpus an address
+ * space has (potentially) executed on, this is the heuristic
+ * we use to avoid doing cross calls.
+ *
+ * 2) TLB context numbers are shared globally across all processors
+ * in the system, this allows us to play several games to avoid
+ * cross calls.
+ *
+ * One invariant is that when a cpu switches to a process, and
+ * that processes tsk->mm->cpu_vm_mask does not have the current
+ * cpu's bit set, that tlb context is flushed locally.
+ *
+ * If the address space is non-shared (ie. mm->count == 1) we avoid
+ * cross calls when we want to flush the currently running process's
+ * tlb state. This is done by clearing all cpu bits except the current
+ * processor's in current->mm->cpu_vm_mask and performing the flush
+ * locally only. This will force any subsequent cpus which run this
+ * task to flush the context from the local tlb if the process migrates
+ * to another cpu (again).
+ *
+ * 3) For shared address spaces (threads) and swapping we bite the
+ * bullet for most cases and perform the cross call.
+ *
+ * The performance gain from "optimizing" away the cross call for threads is
+ * questionable (in theory the big win for threads is the massive sharing of
+ * address space state across processors).
+ *
+ * For the swapping case the locking is difficult to get right, we'd have to
+ * enforce strict ordered access to mm->cpu_vm_mask via a spinlock for example.
+ * Then again one could argue that when you are swapping, the cost of a cross
+ * call won't even show up on the performance radar. But in any case we do get
+ * rid of the cross-call when the task has a dead context or the task has only
+ * ever run on the local cpu.
*/
-static void smp_cross_call_avoidance(struct mm_struct *mm)
-{
- u32 ctx;
-
- spin_lock(&scheduler_lock);
- get_new_mmu_context(mm);
- mm->cpu_vm_mask = (1UL << smp_processor_id());
- current->tss.ctx = ctx = mm->context & 0x3ff;
- spitfire_set_secondary_context(ctx);
- __asm__ __volatile__("flush %g6");
- spitfire_flush_dtlb_secondary_context();
- spitfire_flush_itlb_secondary_context();
- __asm__ __volatile__("flush %g6");
- if(!segment_eq(current->tss.current_ds,USER_DS)) {
- /* Rarely happens. */
- current->tss.ctx = 0;
- spitfire_set_secondary_context(0);
- __asm__ __volatile__("flush %g6");
- }
- spin_unlock(&scheduler_lock);
-}
-
void smp_flush_tlb_mm(struct mm_struct *mm)
{
u32 ctx = mm->context & 0x3ff;
if(mm == current->mm && atomic_read(&mm->count) == 1) {
- if(mm->cpu_vm_mask == (1UL << smp_processor_id()))
- goto local_flush_and_out;
- return smp_cross_call_avoidance(mm);
+ if(mm->cpu_vm_mask != (1UL << smp_processor_id()))
+ mm->cpu_vm_mask = (1UL << smp_processor_id());
+ goto local_flush_and_out;
}
smp_cross_call(&xcall_flush_tlb_mm, ctx, 0, 0);
@@ -410,9 +442,9 @@ void smp_flush_tlb_range(struct mm_struct *mm, unsigned long start,
start &= PAGE_MASK;
end &= PAGE_MASK;
if(mm == current->mm && atomic_read(&mm->count) == 1) {
- if(mm->cpu_vm_mask == (1UL << smp_processor_id()))
- goto local_flush_and_out;
- return smp_cross_call_avoidance(mm);
+ if(mm->cpu_vm_mask != (1UL << smp_processor_id()))
+ mm->cpu_vm_mask = (1UL << smp_processor_id());
+ goto local_flush_and_out;
}
smp_cross_call(&xcall_flush_tlb_range, ctx, start, end);
@@ -426,22 +458,26 @@ void smp_flush_tlb_page(struct mm_struct *mm, unsigned long page)
page &= PAGE_MASK;
if(mm == current->mm && atomic_read(&mm->count) == 1) {
- if(mm->cpu_vm_mask == (1UL << smp_processor_id()))
- goto local_flush_and_out;
- return smp_cross_call_avoidance(mm);
- }
-#if 0 /* XXX Disabled until further notice... */
- else if(atomic_read(&mm->count) == 1) {
+ if(mm->cpu_vm_mask != (1UL << smp_processor_id()))
+ mm->cpu_vm_mask = (1UL << smp_processor_id());
+ goto local_flush_and_out;
+ } else {
/* Try to handle two special cases to avoid cross calls
 * in common scenarios where we are swapping process
* pages out.
*/
- if((mm->context ^ tlb_context_cache) & CTX_VERSION_MASK)
+ if(((mm->context ^ tlb_context_cache) & CTX_VERSION_MASK) ||
+ (mm->cpu_vm_mask == 0)) {
+ /* A dead context cannot ever become "alive" until
+ * a task switch is done to it.
+ */
return; /* It's dead, nothing to do. */
- if(mm->cpu_vm_mask == (1UL << smp_processor_id()))
- goto local_flush_and_out;
+ }
+ if(mm->cpu_vm_mask == (1UL << smp_processor_id())) {
+ __flush_tlb_page(ctx, page, SECONDARY_CONTEXT);
+ return; /* Only local flush is necessary. */
+ }
}
-#endif
smp_cross_call(&xcall_flush_tlb_page, ctx, page, 0);
local_flush_and_out:
@@ -644,6 +680,100 @@ __initfunc(void smp_tick_init(void))
prof_counter(boot_cpu_id) = prof_multiplier(boot_cpu_id) = 1;
}
+static inline unsigned long find_flush_base(unsigned long size)
+{
+ struct page *p = mem_map;
+ unsigned long found, base;
+
+ size = PAGE_ALIGN(size);
+ found = size;
+ base = page_address(p);
+ while(found != 0) {
+ /* Failure. */
+ if(p >= (mem_map + max_mapnr))
+ return 0UL;
+ if(PageSkip(p)) {
+ p = p->next_hash;
+ base = page_address(p);
+ found = size;
+ } else {
+ found -= PAGE_SIZE;
+ p++;
+ }
+ }
+ return base;
+}
+
+cycles_t cacheflush_time;
+
+__initfunc(static void smp_tune_scheduling (void))
+{
+ unsigned long flush_base, flags, *p;
+ unsigned int ecache_size;
+ cycles_t tick1, tick2, raw;
+
+ /* Approximate heuristic for SMP scheduling. It is an
+ * estimation of the time it takes to flush the L2 cache
+ * on the local processor.
+ *
+ * The ia32 chooses to use the L1 cache flush time instead,
+ * and I consider this complete nonsense. The Ultra can service
+ * a miss to the L1 with a hit to the L2 in 7 or 8 cycles, and
+ * L2 misses are what create extra bus traffic (ie. the "cost"
+ * of moving a process from one cpu to another).
+ */
+ printk("SMP: Calibrating ecache flush... ");
+ ecache_size = prom_getintdefault(linux_cpus[0].prom_node,
+ "ecache-size", (512 *1024));
+ flush_base = find_flush_base(ecache_size << 1);
+
+ if(flush_base != 0UL) {
+ __save_and_cli(flags);
+
+ /* Scan twice the size once just to get the TLB entries
+ * loaded and make sure the second scan measures pure misses.
+ */
+ for(p = (unsigned long *)flush_base;
+ ((unsigned long)p) < (flush_base + (ecache_size<<1));
+ p += (64 / sizeof(unsigned long)))
+ *((volatile unsigned long *)p);
+
+ /* Now the real measurement. */
+ __asm__ __volatile__("
+ b,pt %%xcc, 1f
+ rd %%tick, %0
+
+ .align 64
+1: ldx [%2 + 0x000], %%g1
+ ldx [%2 + 0x040], %%g2
+ ldx [%2 + 0x080], %%g3
+ ldx [%2 + 0x0c0], %%g5
+ add %2, 0x100, %2
+ cmp %2, %4
+ bne,pt %%xcc, 1b
+ nop
+
+ rd %%tick, %1"
+ : "=&r" (tick1), "=&r" (tick2), "=&r" (flush_base)
+ : "2" (flush_base), "r" (flush_base + ecache_size)
+ : "g1", "g2", "g3", "g5");
+
+ __restore_flags(flags);
+
+ raw = (tick2 - tick1);
+
+ /* Dampen it a little, considering two processes
+ * sharing the cache and fitting.
+ */
+ cacheflush_time = (raw - (raw >> 2));
+ } else
+ cacheflush_time = ((ecache_size << 2) +
+ (ecache_size << 1));
+
+ printk("Using heuristic of %d cycles.\n",
+ (int) cacheflush_time);
+}
+
int __init setup_profiling_timer(unsigned int multiplier)
{
unsigned long flags;