Diffstat (limited to 'arch/sparc64/kernel/smp.c')
-rw-r--r-- | arch/sparc64/kernel/smp.c | 260 |
1 file changed, 195 insertions, 65 deletions
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 4bdfca1b7..27344f4b6 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -5,6 +5,8 @@

 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
 #include <linux/tasks.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
@@ -34,24 +36,23 @@ extern int linux_num_cpus;
 extern void calibrate_delay(void);
 extern unsigned prom_cpu_nodes[];

-volatile int smp_processors_ready = 0;
-unsigned long cpu_present_map = 0;
-int smp_num_cpus = 1;
-int smp_threads_ready = 0;
+struct cpuinfo_sparc cpu_data[NR_CPUS] __attribute__ ((aligned (64)));

-struct cpuinfo_sparc cpu_data[NR_CPUS] __attribute__ ((aligned (64)));
+volatile int cpu_number_map[NR_CPUS] __attribute__ ((aligned (64)));
+volatile int __cpu_logical_map[NR_CPUS] __attribute__ ((aligned (64)));

-/* Please don't make this initdata!!! --DaveM */
+/* Please don't make this stuff initdata!!! --DaveM */
 static unsigned char boot_cpu_id = 0;
-
 static int smp_activated = 0;

-volatile int cpu_number_map[NR_CPUS];
-volatile int __cpu_logical_map[NR_CPUS];
-
 /* Kernel spinlock */
 spinlock_t kernel_flag = SPIN_LOCK_UNLOCKED;

+volatile int smp_processors_ready = 0;
+unsigned long cpu_present_map = 0;
+int smp_num_cpus = 1;
+int smp_threads_ready = 0;
+
 __initfunc(void smp_setup(char *str, int *ints))
 {
         /* XXX implement me XXX */
@@ -84,6 +85,8 @@ int smp_bogo(char *buf)

 __initfunc(void smp_store_cpu_info(int id))
 {
+        int i;
+
         cpu_data[id].irq_count = 0;
         cpu_data[id].bh_count = 0;
         /* multiplier and counter set by
@@ -94,16 +97,18 @@ __initfunc(void smp_store_cpu_info(int id))
         cpu_data[id].pte_cache = NULL;
         cpu_data[id].pgdcache_size = 0;
         cpu_data[id].pgd_cache = NULL;
-}
+        cpu_data[id].idle_volume = 1;

-extern void distribute_irqs(void);
+        for(i = 0; i < 16; i++)
+                cpu_data[id].irq_worklists[i] = 0;
+}

 __initfunc(void smp_commence(void))
 {
-        distribute_irqs();
 }

 static void smp_setup_percpu_timer(void);
+static void smp_tune_scheduling(void);

 static volatile unsigned long callin_flag = 0;

@@ -173,10 +178,16 @@ void cpu_panic(void)
         panic("SMP bolixed\n");
 }

-extern struct prom_cpuinfo linux_cpus[NR_CPUS];
+extern struct prom_cpuinfo linux_cpus[64];

 extern unsigned long smp_trampoline;

+/* The OBP cpu startup callback truncates the 3rd arg cookie to
+ * 32-bits (I think) so to be safe we have it read the pointer
+ * contained here so we work on >4GB machines. -DaveM
+ */
+static struct task_struct *cpu_new_task = NULL;
+
 __initfunc(void smp_boot_cpus(void))
 {
         int cpucount = 0, i;
@@ -184,6 +195,8 @@ __initfunc(void smp_boot_cpus(void))
         printk("Entering UltraSMPenguin Mode...\n");
         __sti();
         smp_store_cpu_info(boot_cpu_id);
+        smp_tune_scheduling();
+        init_idle();

         if(linux_num_cpus == 1)
                 return;
@@ -194,21 +207,25 @@ __initfunc(void smp_boot_cpus(void))

                 if(cpu_present_map & (1UL << i)) {
                         unsigned long entry = (unsigned long)(&smp_trampoline);
+                        unsigned long cookie = (unsigned long)(&cpu_new_task);
                         struct task_struct *p;
                         int timeout;
                         int no;

                         extern unsigned long phys_base;
                         entry += phys_base - KERNBASE;
+                        cookie += phys_base - KERNBASE;
                         kernel_thread(start_secondary, NULL, CLONE_PID);
                         p = task[++cpucount];
                         p->processor = i;
+                        p->has_cpu = 1; /* we schedule the first task manually */
                         callin_flag = 0;
                         for (no = 0; no < linux_num_cpus; no++)
                                 if (linux_cpus[no].mid == i)
                                         break;
+                        cpu_new_task = p;
                         prom_startcpu(linux_cpus[no].prom_node,
-                                      entry, ((unsigned long)p));
+                                      entry, cookie);
                         for(timeout = 0; timeout < 5000000; timeout++) {
                                 if(callin_flag)
                                         break;
@@ -216,8 +233,8 @@ __initfunc(void smp_boot_cpus(void))
                         }
                         if(callin_flag) {
                                 cpu_number_map[i] = cpucount;
-                                prom_cpu_nodes[i] = linux_cpus[no].prom_node;
                                 __cpu_logical_map[cpucount] = i;
+                                prom_cpu_nodes[i] = linux_cpus[no].prom_node;
                         } else {
                                 cpucount--;
                                 printk("Processor %d is stuck.\n", i);
@@ -228,6 +245,7 @@ __initfunc(void smp_boot_cpus(void))
                         cpu_number_map[i] = -1;
                 }
         }
+        cpu_new_task = NULL;
         if(cpucount == 0) {
                 printk("Error: only one processor found.\n");
                 cpu_present_map = (1UL << smp_processor_id());
@@ -249,17 +267,6 @@ __initfunc(void smp_boot_cpus(void))
         membar("#StoreStore | #StoreLoad");
 }

-/* We don't even need to do anything, the only generic message pass done
- * anymore is to stop all cpus during a panic(). When the user drops to
- * the PROM prompt, the firmware will send the other cpu's it's MONDO
- * vector anyways, so doing anything special here is pointless.
- *
- * This whole thing should go away anyways...
- */
-void smp_message_pass(int target, int msg, unsigned long data, int wait)
-{
-}
-
 /* #define XCALL_DEBUG */

 static inline void xcall_deliver(u64 data0, u64 data1, u64 data2, u64 pstate, unsigned long cpu)
@@ -342,6 +349,17 @@ extern unsigned long xcall_flush_tlb_all;
 extern unsigned long xcall_tlbcachesync;
 extern unsigned long xcall_flush_cache_all;
 extern unsigned long xcall_report_regs;
+extern unsigned long xcall_receive_signal;
+
+void smp_receive_signal(int cpu)
+{
+        if(smp_processors_ready &&
+           (cpu_present_map & (1UL<<cpu)) != 0) {
+                u64 pstate, data0 = (((u64)&xcall_receive_signal) & 0xffffffff);
+                __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
+                xcall_deliver(data0, 0, 0, pstate, cpu);
+        }
+}

 void smp_report_regs(void)
 {
@@ -364,37 +382,51 @@ void smp_flush_tlb_all(void)
  * to the stack before we get here because all callers of us
  * are flush_tlb_*() routines, and these run after flush_cache_*()
  * which performs the flushw.
+ *
+ * The SMP TLB coherency scheme we use works as follows:
+ *
+ * 1) mm->cpu_vm_mask is a bit mask of which cpus an address
+ *    space has (potentially) executed on, this is the heuristic
+ *    we use to avoid doing cross calls.
+ *
+ * 2) TLB context numbers are shared globally across all processors
+ *    in the system, this allows us to play several games to avoid
+ *    cross calls.
+ *
+ * One invariant is that when a cpu switches to a process, and
+ * that processes tsk->mm->cpu_vm_mask does not have the current
+ * cpu's bit set, that tlb context is flushed locally.
+ *
+ * If the address space is non-shared (ie. mm->count == 1) we avoid
+ * cross calls when we want to flush the currently running process's
+ * tlb state. This is done by clearing all cpu bits except the current
+ * processor's in current->mm->cpu_vm_mask and performing the flush
+ * locally only. This will force any subsequent cpus which run this
+ * task to flush the context from the local tlb if the process migrates
+ * to another cpu (again).
+ *
+ * 3) For shared address spaces (threads) and swapping we bite the
+ *    bullet for most cases and perform the cross call.
+ *
+ * The performance gain from "optimizing" away the cross call for threads is
+ * questionable (in theory the big win for threads is the massive sharing of
+ * address space state across processors).
+ *
+ * For the swapping case the locking is difficult to get right, we'd have to
+ * enforce strict ordered access to mm->cpu_vm_mask via a spinlock for example.
+ * Then again one could argue that when you are swapping, the cost of a cross
+ * call won't even show up on the performance radar. But in any case we do get
+ * rid of the cross-call when the task has a dead context or the task has only
+ * ever run on the local cpu.
  */
-static void smp_cross_call_avoidance(struct mm_struct *mm)
-{
-        u32 ctx;
-
-        spin_lock(&scheduler_lock);
-        get_new_mmu_context(mm);
-        mm->cpu_vm_mask = (1UL << smp_processor_id());
-        current->tss.ctx = ctx = mm->context & 0x3ff;
-        spitfire_set_secondary_context(ctx);
-        __asm__ __volatile__("flush %g6");
-        spitfire_flush_dtlb_secondary_context();
-        spitfire_flush_itlb_secondary_context();
-        __asm__ __volatile__("flush %g6");
-        if(!segment_eq(current->tss.current_ds,USER_DS)) {
-                /* Rarely happens. */
-                current->tss.ctx = 0;
-                spitfire_set_secondary_context(0);
-                __asm__ __volatile__("flush %g6");
-        }
-        spin_unlock(&scheduler_lock);
-}
-
 void smp_flush_tlb_mm(struct mm_struct *mm)
 {
         u32 ctx = mm->context & 0x3ff;

         if(mm == current->mm && atomic_read(&mm->count) == 1) {
-                if(mm->cpu_vm_mask == (1UL << smp_processor_id()))
-                        goto local_flush_and_out;
-                return smp_cross_call_avoidance(mm);
+                if(mm->cpu_vm_mask != (1UL << smp_processor_id()))
+                        mm->cpu_vm_mask = (1UL << smp_processor_id());
+                goto local_flush_and_out;
         }
         smp_cross_call(&xcall_flush_tlb_mm, ctx, 0, 0);

@@ -410,9 +442,9 @@ void smp_flush_tlb_range(struct mm_struct *mm, unsigned long start,
         start &= PAGE_MASK;
         end &= PAGE_MASK;
         if(mm == current->mm && atomic_read(&mm->count) == 1) {
-                if(mm->cpu_vm_mask == (1UL << smp_processor_id()))
-                        goto local_flush_and_out;
-                return smp_cross_call_avoidance(mm);
+                if(mm->cpu_vm_mask != (1UL << smp_processor_id()))
+                        mm->cpu_vm_mask = (1UL << smp_processor_id());
+                goto local_flush_and_out;
         }
         smp_cross_call(&xcall_flush_tlb_range, ctx, start, end);

@@ -426,22 +458,26 @@ void smp_flush_tlb_page(struct mm_struct *mm, unsigned long page)

         page &= PAGE_MASK;
         if(mm == current->mm && atomic_read(&mm->count) == 1) {
-                if(mm->cpu_vm_mask == (1UL << smp_processor_id()))
-                        goto local_flush_and_out;
-                return smp_cross_call_avoidance(mm);
-        }
-#if 0 /* XXX Disabled until further notice... */
-        else if(atomic_read(&mm->count) == 1) {
+                if(mm->cpu_vm_mask != (1UL << smp_processor_id()))
+                        mm->cpu_vm_mask = (1UL << smp_processor_id());
+                goto local_flush_and_out;
+        } else {
                 /* Try to handle two special cases to avoid cross calls
                  * in common scenerios where we are swapping process
                  * pages out.
                  */
-                if((mm->context ^ tlb_context_cache) & CTX_VERSION_MASK)
+                if(((mm->context ^ tlb_context_cache) & CTX_VERSION_MASK) ||
+                   (mm->cpu_vm_mask == 0)) {
+                        /* A dead context cannot ever become "alive" until
+                         * a task switch is done to it.
+                         */
                         return; /* It's dead, nothing to do. */
-                if(mm->cpu_vm_mask == (1UL << smp_processor_id()))
-                        goto local_flush_and_out;
+                }
+                if(mm->cpu_vm_mask == (1UL << smp_processor_id())) {
+                        __flush_tlb_page(ctx, page, SECONDARY_CONTEXT);
+                        return; /* Only local flush is necessary. */
+                }
         }
-#endif
         smp_cross_call(&xcall_flush_tlb_page, ctx, page, 0);

 local_flush_and_out:
@@ -644,6 +680,100 @@ __initfunc(void smp_tick_init(void))
         prof_counter(boot_cpu_id) = prof_multiplier(boot_cpu_id) = 1;
 }

+static inline unsigned long find_flush_base(unsigned long size)
+{
+        struct page *p = mem_map;
+        unsigned long found, base;
+
+        size = PAGE_ALIGN(size);
+        found = size;
+        base = page_address(p);
+        while(found != 0) {
+                /* Failure. */
+                if(p >= (mem_map + max_mapnr))
+                        return 0UL;
+                if(PageSkip(p)) {
+                        p = p->next_hash;
+                        base = page_address(p);
+                        found = size;
+                } else {
+                        found -= PAGE_SIZE;
+                        p++;
+                }
+        }
+        return base;
+}
+
+cycles_t cacheflush_time;
+
+__initfunc(static void smp_tune_scheduling (void))
+{
+        unsigned long flush_base, flags, *p;
+        unsigned int ecache_size;
+        cycles_t tick1, tick2, raw;
+
+        /* Approximate heuristic for SMP scheduling. It is an
+         * estimation of the time it takes to flush the L2 cache
+         * on the local processor.
+         *
+         * The ia32 chooses to use the L1 cache flush time instead,
+         * and I consider this complete nonsense. The Ultra can service
+         * a miss to the L1 with a hit to the L2 in 7 or 8 cycles, and
+         * L2 misses are what create extra bus traffic (ie. the "cost"
+         * of moving a process from one cpu to another).
+         */
+        printk("SMP: Calibrating ecache flush... ");
+        ecache_size = prom_getintdefault(linux_cpus[0].prom_node,
+                                         "ecache-size", (512 *1024));
+        flush_base = find_flush_base(ecache_size << 1);
+
+        if(flush_base != 0UL) {
+                __save_and_cli(flags);
+
+                /* Scan twice the size once just to get the TLB entries
+                 * loaded and make sure the second scan measures pure misses.
+                 */
+                for(p = (unsigned long *)flush_base;
+                    ((unsigned long)p) < (flush_base + (ecache_size<<1));
+                    p += (64 / sizeof(unsigned long)))
+                        *((volatile unsigned long *)p);
+
+                /* Now the real measurement. */
+                __asm__ __volatile__("
+                b,pt    %%xcc, 1f
+                 rd     %%tick, %0
+
+                .align  64
+1:              ldx     [%2 + 0x000], %%g1
+                ldx     [%2 + 0x040], %%g2
+                ldx     [%2 + 0x080], %%g3
+                ldx     [%2 + 0x0c0], %%g5
+                add     %2, 0x100, %2
+                cmp     %2, %4
+                bne,pt  %%xcc, 1b
+                 nop
+
+                rd      %%tick, %1"
+                : "=&r" (tick1), "=&r" (tick2), "=&r" (flush_base)
+                : "2" (flush_base), "r" (flush_base + ecache_size)
+                : "g1", "g2", "g3", "g5");
+
+                __restore_flags(flags);
+
+                raw = (tick2 - tick1);
+
+                /* Dampen it a little, considering two processes
+                 * sharing the cache and fitting.
+                 */
+                cacheflush_time = (raw - (raw >> 2));
+        } else
+                cacheflush_time = ((ecache_size << 2) +
+                                   (ecache_size << 1));
+
+        printk("Using heuristic of %d cycles.\n",
+               (int) cacheflush_time);
+}
+
 int __init setup_profiling_timer(unsigned int multiplier)
 {
         unsigned long flags;
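A note on the E-cache calibration added in the final hunk: the cost estimate is computed entirely with shifts, which is easy to misread. The short user-space sketch below is not kernel code; the raw tick delta and the E-cache size are made-up example inputs standing in for the %tick measurement and the OBP "ecache-size" property. It only spells out the two estimates the patch produces: the measured flush cost dampened to 3/4 of its raw value, and the fallback of six times the E-cache size used when find_flush_base() cannot locate a contiguous flush window.

#include <stdio.h>

/* Illustration of the cacheflush_time arithmetic in smp_tune_scheduling()
 * above.  The inputs are invented example values, not numbers read from
 * real UltraSPARC hardware.
 */
int main(void)
{
        unsigned long long raw = 40000ULL;           /* stands in for tick2 - tick1 */
        unsigned long long ecache_size = 512 * 1024; /* stands in for "ecache-size" */

        /* Measured path: dampen the raw flush cost to 3/4 of its value,
         * the same as raw - (raw >> 2) in the patch.
         */
        unsigned long long measured = raw - (raw >> 2);

        /* Fallback path when no flush window is found:
         * (size << 2) + (size << 1) == 6 * size.
         */
        unsigned long long fallback = (ecache_size << 2) + (ecache_size << 1);

        printf("dampened estimate: %llu cycles\n", measured);
        printf("fallback estimate: %llu cycles\n", fallback);
        return 0;
}

Both paths end up in cacheflush_time, which the comment in the patch describes as the estimated cost of moving a process from one cpu to another; the scheduler heuristic works from that single number.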