Diffstat (limited to 'arch/s390/kernel/smp.c')
-rw-r--r--  arch/s390/kernel/smp.c | 729
1 file changed, 729 insertions, 0 deletions
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
new file mode 100644
index 000000000..e8b975070
--- /dev/null
+++ b/arch/s390/kernel/smp.c
@@ -0,0 +1,729 @@
+/*
+ *  arch/s390/kernel/smp.c
+ *
+ *  S390 version
+ *    Copyright (C) 1999,2000 IBM Deutschland Entwicklung GmbH, IBM Corporation
+ *    Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com),
+ *               Martin Schwidefsky (schwidefsky@de.ibm.com)
+ *
+ *  based on other smp stuff by
+ *    (c) 1995 Alan Cox, CymruNET Ltd  <alan@cymru.net>
+ *    (c) 1998 Ingo Molnar
+ *
+ * We work with logical cpu numbering everywhere we can. The only
+ * functions using the real cpu address (got from STAP) are the sigp
+ * functions. For all other functions we use the identity mapping.
+ * That means that cpu_number_map[i] == i for every cpu. cpu_number_map is
+ * used e.g. to find the idle task belonging to a logical cpu. Every array
+ * in the kernel is sorted by the logical cpu number and not by the physical
+ * one which is causing all the confusion with __cpu_logical_map and
+ * cpu_number_map in other architectures.
+ */
+
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/kernel_stat.h>
+#include <linux/smp_lock.h>
+#include <linux/delay.h>
+
+#include <asm/sigp.h>
+#include <asm/pgalloc.h>
+#include <asm/irq.h>
+
+#include "cpcmd.h"
+
+/* prototypes */
+extern void update_one_process(struct task_struct *p,
+                               unsigned long ticks, unsigned long user,
+                               unsigned long system, int cpu);
+extern int cpu_idle(void *unused);
+
+extern __u16 boot_cpu_addr;
+
+/*
+ * An array with a pointer to the lowcore of every CPU.
+ */
+static int max_cpus = NR_CPUS;  /* Configured maximum number of CPUs to activate */
+int smp_num_cpus;
+struct _lowcore *lowcore_ptr[NR_CPUS];
+unsigned int prof_multiplier[NR_CPUS];
+unsigned int prof_old_multiplier[NR_CPUS];
+unsigned int prof_counter[NR_CPUS];
+volatile int __cpu_logical_map[NR_CPUS];  /* logical cpu to cpu address */
+cycles_t cacheflush_time = 0;
+int smp_threads_ready = 0;      /* Set when the idlers are all forked. */
+unsigned long ipi_count = 0;    /* Number of IPIs delivered. */
+static atomic_t smp_commenced = ATOMIC_INIT(0);
+
+spinlock_t kernel_flag = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Setup routines for controlling SMP activation
+ *
+ * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
+ * activation entirely.
+ *
+ * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
+ * greater than 0, limits the maximum number of CPUs activated in
+ * SMP mode to <NUM>.
+ */
+static int __init nosmp(char *str)
+{
+        max_cpus = 0;
+        return 1;
+}
+
+__setup("nosmp", nosmp);
+
+static int __init maxcpus(char *str)
+{
+        get_option(&str, &max_cpus);
+        return 1;
+}
+
+__setup("maxcpus=", maxcpus);
+
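The two __setup handlers above parse "nosmp" and "maxcpus=<NUM>" off the kernel command line; the integer scan itself is done by the kernel's get_option helper. As a rough idea of what that scan amounts to, here is a minimal user-space sketch; get_option_sketch and the hard-coded command line are illustrative stand-ins, not the kernel's actual code:

    #include <stdio.h>
    #include <stdlib.h>

    /* Rough user-space approximation of the kernel's get_option():
     * parse an integer at *str, advance the cursor past it, and
     * report whether a value was found. */
    static int get_option_sketch(char **str, int *val)
    {
            char *end;
            long v = strtol(*str, &end, 10);

            if (end == *str)
                    return 0;   /* no digits consumed */
            *val = (int) v;
            *str = end;
            return 1;
    }

    int main(void)
    {
            char cmdline[] = "4";  /* as in "maxcpus=4" after __setup strips the prefix */
            char *p = cmdline;
            int max_cpus = 64;     /* stand-in for NR_CPUS */

            if (get_option_sketch(&p, &max_cpus))
                    printf("max_cpus limited to %d\n", max_cpus);
            return 0;
    }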
+/*
+ * Reboot, halt and power_off routines for SMP.
+ */
+extern char vmhalt_cmd[];
+extern char vmpoff_cmd[];
+
+extern void reipl(unsigned long devno);
+
+void do_machine_restart(void)
+{
+        smp_send_stop();
+        reipl(S390_lowcore.ipl_device);
+}
+
+void machine_restart(char *__unused)
+{
+        if (smp_processor_id() != 0) {
+                smp_ext_call_async(0, ec_restart);
+                for (;;);
+        } else
+                do_machine_restart();
+}
+
+void do_machine_halt(void)
+{
+        smp_send_stop();
+        if (MACHINE_IS_VM && strlen(vmhalt_cmd) > 0)
+                cpcmd(vmhalt_cmd, NULL, 0);
+        disabled_wait(0);
+}
+
+void machine_halt(void)
+{
+        if (smp_processor_id() != 0) {
+                smp_ext_call_async(0, ec_halt);
+                for (;;);
+        } else
+                do_machine_halt();
+}
+
+void do_machine_power_off(void)
+{
+        smp_send_stop();
+        if (MACHINE_IS_VM && strlen(vmpoff_cmd) > 0)
+                cpcmd(vmpoff_cmd, NULL, 0);
+        disabled_wait(0);
+}
+
+void machine_power_off(void)
+{
+        if (smp_processor_id() != 0) {
+                smp_ext_call_async(0, ec_power_off);
+                for (;;);
+        } else
+                do_machine_power_off();
+}
+
+/*
+ * This is the main routine where commands issued by other
+ * cpus are handled.
+ */
+void do_ext_call_interrupt(__u16 source_cpu_addr)
+{
+        ec_ext_call *ec, *next;
+        int bits;
+
+        /*
+         * Handle the bit signal external calls.
+         *
+         * For the ec_schedule signal we have to do nothing. All the work
+         * is done automatically when we return from the interrupt.
+         * For ec_restart, ec_halt and ec_power_off we call the
+         * appropriate routine.
+         */
+        do {
+                bits = atomic_read(&S390_lowcore.ext_call_fast);
+        } while (atomic_compare_and_swap(bits, 0, &S390_lowcore.ext_call_fast));
+
+        if (test_bit(ec_restart, &bits))
+                do_machine_restart();
+        if (test_bit(ec_halt, &bits))
+                do_machine_halt();
+        if (test_bit(ec_power_off, &bits))
+                do_machine_power_off();
+
+        /*
+         * Handle external call commands with a parameter area.
+         */
+        do {
+                ec = (ec_ext_call *) atomic_read(&S390_lowcore.ext_call_queue);
+        } while (atomic_compare_and_swap((int) ec, 0,
+                                         &S390_lowcore.ext_call_queue));
+        if (ec == NULL)
+                return;   /* no command signals */
+
+        /* Make a fifo out of the lifo */
+        next = ec->next;
+        ec->next = NULL;
+        while (next != NULL) {
+                ec_ext_call *tmp = next->next;
+                next->next = ec;
+                ec = next;
+                next = tmp;
+        }
+
+        /* Execute every sigp command on the queue */
+        while (ec != NULL) {
+                switch (ec->cmd) {
+                case ec_get_ctl: {
+                        ec_creg_parms *pp;
+                        pp = (ec_creg_parms *) ec->parms;
+                        atomic_set(&ec->status, ec_executing);
+                        asm volatile (
+                                "   bras  1,0f\n"
+                                "   stctl 0,0,0(%0)\n"
+                                "0: ex    %1,0(1)\n"
+                                : : "a" (pp->cregs + pp->start_ctl),
+                                    "a" ((pp->start_ctl << 4) + pp->end_ctl)
+                                : "memory", "1");
+                        atomic_set(&ec->status, ec_done);
+                        return;
+                }
+                case ec_set_ctl: {
+                        ec_creg_parms *pp;
+                        pp = (ec_creg_parms *) ec->parms;
+                        atomic_set(&ec->status, ec_executing);
+                        asm volatile (
+                                "   bras  1,0f\n"
+                                "   lctl  0,0,0(%0)\n"
+                                "0: ex    %1,0(1)\n"
+                                : : "a" (pp->cregs + pp->start_ctl),
+                                    "a" ((pp->start_ctl << 4) + pp->end_ctl)
+                                : "memory", "1");
+                        atomic_set(&ec->status, ec_done);
+                        return;
+                }
+                case ec_set_ctl_masked: {
+                        ec_creg_mask_parms *pp;
+                        u32 cregs[16];
+                        int i;
+
+                        pp = (ec_creg_mask_parms *) ec->parms;
+                        atomic_set(&ec->status, ec_executing);
+                        asm volatile (
+                                "   bras  1,0f\n"
+                                "   stctl 0,0,0(%0)\n"
+                                "0: ex    %1,0(1)\n"
+                                : : "a" (cregs + pp->start_ctl),
+                                    "a" ((pp->start_ctl << 4) + pp->end_ctl)
+                                : "memory", "1");
+                        for (i = pp->start_ctl; i <= pp->end_ctl; i++)
+                                cregs[i] = (cregs[i] & pp->andvals[i])
+                                         | pp->orvals[i];
+                        asm volatile (
+                                "   bras  1,0f\n"
+                                "   lctl  0,0,0(%0)\n"
+                                "0: ex    %1,0(1)\n"
+                                : : "a" (cregs + pp->start_ctl),
+                                    "a" ((pp->start_ctl << 4) + pp->end_ctl)
+                                : "memory", "1");
+                        atomic_set(&ec->status, ec_done);
+                        return;
+                }
+                default:
+                        break;
+                }
+                ec = ec->next;
+        }
+}
+
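do_ext_call_interrupt detaches the per-cpu command queue and reverses it, because the compare-and-swap pushes (see smp_ext_call_sync below) build the list in last-in-first-out order. A minimal sketch of that reversal, with a toy node type standing in for ec_ext_call:

    #include <stdio.h>

    /* Toy stand-in for ec_ext_call: just enough to show the reversal. */
    struct node {
            int cmd;
            struct node *next;
    };

    /* Reverse a singly linked list: the detached LIFO head goes in,
     * the FIFO head (oldest element first) comes out. */
    static struct node *lifo_to_fifo(struct node *head)
    {
            struct node *fifo = NULL;

            while (head != NULL) {
                    struct node *tmp = head->next;
                    head->next = fifo;   /* prepend onto the reversed list */
                    fifo = head;
                    head = tmp;
            }
            return fifo;
    }

    int main(void)
    {
            /* node 3 was pushed first, so the LIFO head is node 1 */
            struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
            struct node *p;

            for (p = lifo_to_fifo(&a); p; p = p->next)
                    printf("%d\n", p->cmd);   /* prints 3 2 1: oldest first */
            return 0;
    }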
: "memory", "1" ); + atomic_set(&ec->status,ec_done); + return; + } + default: + } + ec = ec->next; + } +} + +/* + * Send an external call sigp to another cpu and wait for its completion. + */ +sigp_ccode smp_ext_call_sync(int cpu, ec_cmd_sig cmd, void *parms) +{ + struct _lowcore *lowcore = &get_cpu_lowcore(cpu); + sigp_ccode ccode; + ec_ext_call ec; + + ec.cmd = cmd; + atomic_set(&ec.status, ec_pending); + ec.parms = parms; + do { + ec.next = (ec_ext_call*) atomic_read(&lowcore->ext_call_queue); + } while (atomic_compare_and_swap((int) ec.next, (int)(&ec), + &lowcore->ext_call_queue)); + /* + * We try once to deliver the signal. There are four possible + * return codes: + * 0) Order code accepted - can't show up on an external call + * 1) Status stored - fine, wait for completion. + * 2) Busy - there is another signal pending. Thats fine too, because + * do_ext_call from the pending signal will execute all signals on + * the queue. We wait for completion. + * 3) Not operational - something very bad has happened to the cpu. + * do not wait for completion. + */ + ccode = signal_processor(cpu, sigp_external_call); + + if (ccode != sigp_not_operational) + /* wait for completion, FIXME: possible seed of a deadlock */ + while (atomic_read(&ec.status) != ec_done); + + return ccode; +} + +/* + * Send an external call sigp to another cpu and return without waiting + * for its completion. Currently we do not support parameters with + * asynchronous sigps. + */ +sigp_ccode smp_ext_call_async(int cpu, ec_bit_sig sig) +{ + struct _lowcore *lowcore = &get_cpu_lowcore(cpu); + sigp_ccode ccode; + + /* + * Set signaling bit in lowcore of target cpu and kick it + */ + atomic_set_mask(1<<sig, &lowcore->ext_call_fast); + ccode = signal_processor(cpu, sigp_external_call); + return ccode; +} + +/* + * Send an external call sigp to every other cpu in the system and + * wait for the completion of the sigps. + */ +void smp_ext_call_sync_others(ec_cmd_sig cmd, void *parms) +{ + struct _lowcore *lowcore; + ec_ext_call ec[NR_CPUS]; + sigp_ccode ccode; + int i; + + for (i = 0; i < smp_num_cpus; i++) { + if (smp_processor_id() == i) + continue; + lowcore = &get_cpu_lowcore(i); + ec[i].cmd = cmd; + atomic_set(&ec[i].status, ec_pending); + ec[i].parms = parms; + do { + ec[i].next = (ec_ext_call *) + atomic_read(&lowcore->ext_call_queue); + } while (atomic_compare_and_swap((int) ec[i].next, (int)(ec+i), + &lowcore->ext_call_queue)); + ccode = signal_processor(i, sigp_external_call); + } + + /* wait for completion, FIXME: possible seed of a deadlock */ + for (i = 0; i < smp_num_cpus; i++) { + if (smp_processor_id() == i) + continue; + while (atomic_read(&ec[i].status) != ec_done); + } +} + +/* + * Send an external call sigp to every other cpu in the system and + * return without waiting for the completion of the sigps. Currently + * we do not support parameters with asynchronous sigps. + */ +void smp_ext_call_async_others(ec_bit_sig sig) +{ + struct _lowcore *lowcore; + sigp_ccode ccode; + int i; + + for (i = 0; i < smp_num_cpus; i++) { + if (smp_processor_id() == i) + continue; + lowcore = &get_cpu_lowcore(i); + /* + * Set signaling bit in lowcore of target cpu and kick it + */ + atomic_set_mask(1<<sig, &lowcore->ext_call_fast); + ccode = signal_processor(i, sigp_external_call); + } +} + +/* + * cycles through all the cpus, + * returns early if info is not NULL & the processor has something + * of intrest to report in the info structure. + * it returns the next cpu to check if it returns early. + * i.e. 
+/*
+ * Cycles through all the cpus, returning early if info is not NULL
+ * and the processor has something of interest to report in the info
+ * structure. In that case it returns the next cpu to check.
+ * It should be used as follows if you wish to receive info:
+ *
+ *   next_cpu = 0;
+ *   do {
+ *           info->cpu = next_cpu;
+ *           next_cpu = smp_signal_others(order_code, parameter, 1, info);
+ *           ... check info here
+ *   } while (next_cpu < smp_num_cpus);
+ *
+ * If you are lazy just use it like
+ *   smp_signal_others(order_code, parameter, 1, NULL);
+ */
+int smp_signal_others(sigp_order_code order_code, u32 parameter,
+                      int spin, sigp_info *info)
+{
+        sigp_ccode ccode;
+        u32 dummy;
+        u16 i;
+
+        if (info)
+                info->intresting = 0;
+        for (i = (info ? info->cpu : 0); i < smp_num_cpus; i++) {
+                if (smp_processor_id() != i) {
+                        do {
+                                ccode = signal_processor_ps(
+                                        (info ? &info->status : &dummy),
+                                        parameter, i, order_code);
+                        } while (spin && ccode == sigp_busy);
+                        if (info && ccode != sigp_order_code_accepted) {
+                                info->intresting = 1;
+                                info->cpu = i;
+                                info->ccode = ccode;
+                                i++;
+                                break;
+                        }
+                }
+        }
+        return i;
+}
+
+/*
+ * This function sends a 'stop' sigp to all other CPUs in the system.
+ * It goes straight through.
+ */
+void smp_send_stop(void)
+{
+        smp_signal_others(sigp_stop, 0, 1, NULL);
+}
+
+/*
+ * This function sends a 'reschedule' IPI to another CPU.
+ * It goes straight through and wastes no time serializing
+ * anything. Worst case is that we lose a reschedule.
+ */
+void smp_send_reschedule(int cpu)
+{
+        smp_ext_call_async(cpu, ec_schedule);
+}
+
+/*
+ * Set a bit in a control register of all cpus.
+ */
+void smp_ctl_set_bit(int cr, int bit)
+{
+        ec_creg_mask_parms parms;
+
+        if (atomic_read(&smp_commenced) != 0) {
+                parms.start_ctl = cr;
+                parms.end_ctl = cr;
+                parms.orvals[cr] = 1 << bit;
+                parms.andvals[cr] = 0xFFFFFFFF;
+                smp_ext_call_sync_others(ec_set_ctl_masked, &parms);
+        }
+        __ctl_set_bit(cr, bit);
+}
+
+/*
+ * Clear a bit in a control register of all cpus.
+ */
+void smp_ctl_clear_bit(int cr, int bit)
+{
+        ec_creg_mask_parms parms;
+
+        if (atomic_read(&smp_commenced) != 0) {
+                parms.start_ctl = cr;
+                parms.end_ctl = cr;
+                parms.orvals[cr] = 0x00000000;
+                parms.andvals[cr] = ~(1 << bit);
+                smp_ext_call_sync_others(ec_set_ctl_masked, &parms);
+        }
+        __ctl_clear_bit(cr, bit);
+}
+
+/*
+ * Let's check how many CPUs we have.
+ */
+void smp_count_cpus(void)
+{
+        int curr_cpu;
+
+        __cpu_logical_map[0] = boot_cpu_addr;
+        current->processor = 0;
+        smp_num_cpus = 1;
+        for (curr_cpu = 0;
+             curr_cpu <= 65535 && smp_num_cpus < max_cpus; curr_cpu++) {
+                if ((__u16) curr_cpu == boot_cpu_addr)
+                        continue;
+                __cpu_logical_map[smp_num_cpus] = (__u16) curr_cpu;
+                if (signal_processor(smp_num_cpus, sigp_sense) ==
+                    sigp_not_operational)
+                        continue;
+                smp_num_cpus++;
+        }
+        printk("Detected %d CPUs\n", (int) smp_num_cpus);
+        printk("Boot cpu address %2X\n", boot_cpu_addr);
+}
+
+/*
+ * Activate a secondary processor.
+ */
+extern void init_100hz_timer(void);
+
+int __init start_secondary(void *cpuvoid)
+{
+        /* Setup the cpu */
+        cpu_init();
+        /* Print info about this processor */
+        print_cpu_info(&safe_get_cpu_lowcore(smp_processor_id()).cpu_data);
+        /* Wait for completion of smp startup */
+        while (!atomic_read(&smp_commenced))
+                /* nothing */ ;
+        /* Init the per-CPU 100 Hz timer */
+        init_100hz_timer();
+        /* cpu_idle will call schedule for us */
+        return cpu_idle(NULL);
+}
+
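smp_ctl_set_bit and smp_ctl_clear_bit both funnel into ec_set_ctl_masked, which applies an AND mask followed by an OR mask to each selected control register, so one remote handler covers both operations. A tiny self-contained illustration of that mask algebra:

    #include <stdint.h>
    #include <stdio.h>

    /* The and/or mask trick used by ec_set_ctl_masked: one code path
     * serves both "set a bit" (and = all ones, or = the bit) and
     * "clear a bit" (and = ~bit, or = 0). */
    static uint32_t apply_masks(uint32_t creg, uint32_t andval, uint32_t orval)
    {
            return (creg & andval) | orval;
    }

    int main(void)
    {
            uint32_t cr = 0x00000040;                   /* bit 6 set */

            cr = apply_masks(cr, 0xFFFFFFFF, 1u << 1);  /* set bit 1 */
            cr = apply_masks(cr, ~(1u << 6), 0);        /* clear bit 6 */
            printf("%08x\n", cr);                       /* prints 00000002 */
            return 0;
    }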
+/*
+ * The restart interrupt handler jumps to start_secondary directly
+ * without the detour over initialize_secondary. We define it here
+ * so that the linker doesn't complain.
+ */
+void __init initialize_secondary(void)
+{
+}
+
+static int __init fork_by_hand(void)
+{
+        struct pt_regs regs;
+        /* Don't care about the psw and regs settings since we'll never
+           reschedule the forked task. */
+        memset(&regs, 0, sizeof(regs));
+        return do_fork(CLONE_VM | CLONE_PID, 0, &regs);
+}
+
+static void __init do_boot_cpu(int cpu)
+{
+        struct task_struct *idle;
+        struct _lowcore *cpu_lowcore;
+
+        /* We can't use kernel_thread since we must _avoid_ rescheduling
+           the child. */
+        if (fork_by_hand() < 0)
+                panic("failed fork for CPU %d", cpu);
+
+        /*
+         * We remove it from the pidhash and the runqueue
+         * once we got the process:
+         */
+        idle = init_task.prev_task;
+        if (!idle)
+                panic("No idle process for CPU %d", cpu);
+        idle->processor = cpu;
+        idle->has_cpu = 1;   /* we schedule the first task manually */
+
+        del_from_runqueue(idle);
+        unhash_process(idle);
+        init_tasks[cpu] = idle;
+
+        cpu_lowcore = &get_cpu_lowcore(cpu);
+        cpu_lowcore->kernel_stack = idle->thread.ksp;
+        __asm__ __volatile__("stctl 0,15,%0\n\t"
+                             "stam  0,15,%1"
+                             : "=m" (cpu_lowcore->cregs_save_area[0]),
+                               "=m" (cpu_lowcore->access_regs_save_area[0])
+                             : : "memory");
+
+        eieio();
+        signal_processor(cpu, sigp_restart);
+}
+
+/*
+ * Architecture specific routine called by the kernel just before init is
+ * fired off. This allows the boot cpu to have everything in order [we
+ * hope]. At the end of this all the secondary cpus will hit the system
+ * scheduling and off we go. Each secondary jumps through the kernel init
+ * into idle(). At this point the scheduler will one day take over and
+ * give them jobs to do.
+ */
+void __init smp_commence(void)
+{
+        /*
+         * Lets the callins below out of their loop.
+         */
+        atomic_set(&smp_commenced, 1);
+}
+
+/*
+ * Cycle through the processors, booting each with a sigp restart order.
+ */
+void __init smp_boot_cpus(void)
+{
+        struct _lowcore *curr_lowcore;
+        sigp_ccode ccode;
+        int i;
+
+        smp_count_cpus();
+        memset(lowcore_ptr, 0, sizeof(lowcore_ptr));
+
+        /*
+         * Initialize the logical to physical CPU number mapping
+         * and the per-CPU profiling counter/multiplier.
+         */
+        for (i = 0; i < NR_CPUS; i++) {
+                prof_counter[i] = 1;
+                prof_old_multiplier[i] = 1;
+                prof_multiplier[i] = 1;
+        }
+
+        print_cpu_info(&safe_get_cpu_lowcore(0).cpu_data);
+
+        for (i = 0; i < smp_num_cpus; i++) {
+                curr_lowcore = (struct _lowcore *)
+                        __get_free_page(GFP_KERNEL | GFP_DMA);
+                if (curr_lowcore == NULL) {
+                        printk("smp_boot_cpus failed to allocate prefix memory\n");
+                        break;
+                }
+                lowcore_ptr[i] = curr_lowcore;
+                memcpy(curr_lowcore, &S390_lowcore, sizeof(struct _lowcore));
+                /*
+                 * Most of the parameters are set up when the cpu is
+                 * started up.
+                 */
+                if (smp_processor_id() == i)
+                        set_prefix((u32) curr_lowcore);
+                else {
+                        ccode = signal_processor_p((u32) curr_lowcore,
+                                                   i, sigp_set_prefix);
+                        if (ccode) {
+                                /* If this gets troublesome I'll have to do
+                                 * something about it. */
+                                printk("ccode %d returned for cpu %d when "
+                                       "setting the prefix in smp_boot_cpus - "
+                                       "not good.\n", (int) ccode, (int) i);
+                        } else
+                                do_boot_cpu(i);
+                }
+        }
+}
+
+/*
+ * The frequency of the profiling timer can be changed
+ * by writing a multiplier value into /proc/profile.
+ *
+ * Usually you want to run this on all CPUs ;)
+ */
+int setup_profiling_timer(unsigned int multiplier)
+{
+        return 0;
+}
+
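smp_boot_cpus gives every cpu a private copy of the boot lowcore and points the cpu's prefix register at it with sigp_set_prefix, so that absolute addresses 0-4095 (the lowcore) resolve to a per-cpu page. A toy model of 31-bit prefixing, for illustration only; the real address swap is done by the hardware on every real-address access:

    #include <stdint.h>

    /* Toy model of 31-bit s390 prefixing: absolute addresses 0-4095 and
     * the 4 KB frame at 'prefix' trade places; everything else maps 1:1.
     * Illustrative only. */
    static uint32_t prefixed(uint32_t real, uint32_t prefix)
    {
            if (real < 4096)
                    return prefix + real;   /* lowcore access lands in this
                                               cpu's private page */
            if (real >= prefix && real < prefix + 4096)
                    return real - prefix;   /* and vice versa */
            return real;
    }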
+/*
+ * Local timer interrupt handler. It does both profiling and
+ * process statistics/rescheduling.
+ *
+ * We do profiling in every local tick; statistics/rescheduling
+ * happen only every 'profiling multiplier' ticks. The default
+ * multiplier is 1 and it can be changed by writing the new multiplier
+ * value into /proc/profile.
+ */
+void smp_local_timer_interrupt(struct pt_regs *regs)
+{
+        int user = (user_mode(regs) != 0);
+        int cpu = smp_processor_id();
+
+        /*
+         * The profiling function is SMP safe. (nothing can mess
+         * around with "current", and the profiling counters are
+         * updated with atomic operations). This is especially
+         * useful with a profiling multiplier != 1.
+         */
+        if (!user_mode(regs))
+                s390_do_profile(regs->psw.addr);
+
+        if (!--prof_counter[cpu]) {
+                int system = 1 - user;
+                struct task_struct *p = current;
+
+                /*
+                 * The multiplier may have changed since the last time we got
+                 * to this point as a result of the user writing to
+                 * /proc/profile. In this case we need to adjust the timer
+                 * accordingly.
+                 *
+                 * Interrupts are already masked off at this point.
+                 */
+                prof_counter[cpu] = prof_multiplier[cpu];
+                if (prof_counter[cpu] != prof_old_multiplier[cpu]) {
+                        /* FIXME: setup_APIC_timer(calibration_result /
+                           prof_counter[cpu]); */
+                        prof_old_multiplier[cpu] = prof_counter[cpu];
+                }
+
+                /*
+                 * After doing the above, we need to make like
+                 * a normal interrupt - otherwise timer interrupts
+                 * ignore the global interrupt lock, which is the
+                 * WrongThing (tm) to do.
+                 */
+                irq_enter(cpu, 0);
+                update_one_process(p, 1, user, system, cpu);
+                if (p->pid) {
+                        p->counter -= 1;
+                        if (p->counter <= 0) {
+                                p->counter = 0;
+                                p->need_resched = 1;
+                        }
+                        if (p->priority < DEF_PRIORITY) {
+                                kstat.cpu_nice += user;
+                                kstat.per_cpu_nice[cpu] += user;
+                        } else {
+                                kstat.cpu_user += user;
+                                kstat.per_cpu_user[cpu] += user;
+                        }
+                        kstat.cpu_system += system;
+                        kstat.per_cpu_system[cpu] += system;
+                }
+                irq_exit(cpu, 0);
+        }
+}
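The prof_counter countdown above divides the tick rate: profiling runs on every tick, while the statistics/rescheduling block runs only when the counter hits zero and is reloaded from prof_multiplier. A short stand-alone simulation of that divider (the multiplier value is made up):

    #include <stdio.h>

    /* Countdown divider as used by prof_counter/prof_multiplier: reloading
     * the counter with the multiplier makes the statistics block run on
     * every Mth tick. */
    int main(void)
    {
            int multiplier = 4;          /* as if /proc/profile held 4 */
            int counter = multiplier;
            int tick;

            for (tick = 1; tick <= 12; tick++) {
                    /* profiling would happen on every tick here */
                    if (!--counter) {
                            counter = multiplier;
                            printf("statistics/rescheduling at tick %d\n", tick);
                    }
            }
            return 0;   /* prints ticks 4, 8, 12 */
    }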