author     Ralf Baechle <ralf@linux-mips.org>    1999-12-04 03:58:56 +0000
committer  Ralf Baechle <ralf@linux-mips.org>    1999-12-04 03:58:56 +0000
commit     1d67e90f19a7acfd9a05dc59678e7d0c5090bd0d (patch)
tree       357efc7b93f8f5102110d20d293f41360ec212fc /arch/i386/kernel
parent     aea27b2e18d69af87e673972246e66657b4fa274 (diff)
Merge with Linux 2.3.21.
Diffstat (limited to 'arch/i386/kernel')

 arch/i386/kernel/Makefile                                          |    9
 arch/i386/kernel/apm.c                                             |   35
 arch/i386/kernel/entry.S                                           |    9
 arch/i386/kernel/head.S                                            |    9
 arch/i386/kernel/i386_ksyms.c                                      |   12
 arch/i386/kernel/i8259.c                                           |  244
 arch/i386/kernel/io_apic.c                                         |  513
 arch/i386/kernel/irq.c                                             |   84
 arch/i386/kernel/mtrr.c                                            |   30
 arch/i386/kernel/pci-i386.c                                        |  312
 arch/i386/kernel/pci-i386.h                                        |   29
 arch/i386/kernel/pci-pc.c (renamed from arch/i386/kernel/bios32.c) |  665
 arch/i386/kernel/pci-visws.c                                       |  131
 arch/i386/kernel/smp.c                                             | 2091
 arch/i386/kernel/smpboot.c                                         | 1650
 arch/i386/kernel/time.c                                            |    2
 arch/i386/kernel/traps.c                                           |  126

 17 files changed, 3626 insertions(+), 2325 deletions(-)
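
One change below is easy to lose in the raw patch text: the i8259.c rewrite adds lightweight spurious-IRQ detection for the legacy PICs. An interrupt that arrives while its line is already masked is only treated as real if the controller's In-Service Register confirms it; otherwise it is reported once and counted in irq_err_count. A condensed sketch of that check, following the i8259A_irq_real() and mask_and_ack_8259A() hunks below (irq_is_spurious() is an illustrative distillation, not a function the patch adds; inb()/outb() are the usual kernel port-I/O helpers):

#include <asm/io.h>	/* inb()/outb() port I/O, kernel context assumed */

static unsigned int cached_irq_mask = 0xffff;	/* software mirror of the two 8259A mask registers */

/*
 * Ask the 8259A itself whether the IRQ is in service.  OCW3 0x0B makes
 * the next read return the In-Service Register, OCW3 0x0A switches the
 * read back to the Interrupt Request Register afterwards.
 */
static int i8259A_irq_real(unsigned int irq)
{
	int irqmask = 1 << irq;
	int value;

	if (irq < 8) {
		outb(0x0B, 0x20);		/* master PIC: select ISR */
		value = inb(0x20) & irqmask;
		outb(0x0A, 0x20);		/* back to the IRR */
		return value;
	}
	outb(0x0B, 0xA0);			/* slave PIC: select ISR */
	value = inb(0xA0) & (irqmask >> 8);
	outb(0x0A, 0xA0);			/* back to the IRR */
	return value;
}

/*
 * Illustrative wrapper: an IRQ that fires while already masked in the
 * cached mask is spurious unless the ISR says it is really in service.
 */
static int irq_is_spurious(unsigned int irq)
{
	if (!(cached_irq_mask & (1 << irq)))
		return 0;			/* unmasked: ordinary interrupt */
	return !i8259A_irq_real(irq);
}

In the patch itself this test sits inside mask_and_ack_8259A(), which still acknowledges the interrupt either way, because doing so is harmless and keeps the handler simple.
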
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 875f52d5a..29afabd7a 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -19,7 +19,12 @@ OX_OBJS := i386_ksyms.o MX_OBJS := ifdef CONFIG_PCI -O_OBJS += bios32.o +O_OBJS += pci-i386.o +ifdef CONFIG_VISWS +O_OBJS += pci-visws.o +else +O_OBJS += pci-pc.o +endif endif ifdef CONFIG_MCA @@ -43,7 +48,7 @@ else endif ifdef CONFIG_SMP -O_OBJS += smp.o trampoline.o +O_OBJS += smp.o smpboot.o trampoline.o endif ifdef CONFIG_X86_IO_APIC diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 3bafdfcfc..a54994667 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -643,33 +643,6 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) return APM_SUCCESS; } -static int apm_get_battery_status(u_short which, u_short *status, - u_short *bat, u_short *life, u_short *nbat) -{ - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; - - if (apm_bios_info.version < 0x0102) { - /* pretend we only have one battery. */ - if (which != 1) - return APM_BAD_DEVICE; - *nbat = 1; - return apm_get_power_status(status, bat, life); - } - - if (apm_bios_call(0x530a, (0x8000 | (which)), 0, &eax, - &ebx, &ecx, &edx, &esi)) - return (eax >> 8) & 0xff; - *status = ebx; - *bat = ecx; - *life = edx; - *nbat = esi; - return APM_SUCCESS; -} - static int __init apm_engage_power_management(u_short device) { u32 eax; @@ -1263,7 +1236,6 @@ int apm_get_info(char *buf, char **start, off_t fpos, int length, int dummy) unsigned short bx; unsigned short cx; unsigned short dx; - unsigned short nbat; unsigned short error; unsigned short ac_line_status = 0xff; unsigned short battery_status = 0xff; @@ -1473,7 +1445,7 @@ static int __init apm_init(void) if (apm_bios_info.version == 0) { printk(KERN_INFO "apm: BIOS not found.\n"); - return; + return -1; } printk(KERN_INFO "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n", @@ -1483,7 +1455,7 @@ static int __init apm_init(void) driver_version); if ((apm_bios_info.flags & APM_32_BIT_SUPPORT) == 0) { printk(KERN_INFO "apm: no 32 bit BIOS support\n"); - return; + return -1; } /* @@ -1512,7 +1484,7 @@ static int __init apm_init(void) if (apm_disabled) { printk(KERN_NOTICE "apm: disabled on user request.\n"); - return; + return -1; } #ifdef CONFIG_SMP @@ -1571,6 +1543,7 @@ static int __init apm_init(void) misc_register(&apm_device); kernel_thread(apm, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND | SIGCHLD); + return 0; } module_init(apm_init) diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 47f23b6b6..4b88dda89 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -323,9 +323,14 @@ ENTRY(debug) jmp error_code ENTRY(nmi) + pushl %eax + SAVE_ALL + movl %esp,%edx pushl $0 - pushl $ SYMBOL_NAME(do_nmi) - jmp error_code + pushl %edx + call SYMBOL_NAME(do_nmi) + addl $8,%esp + RESTORE_ALL ENTRY(int3) pushl $0 diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index ac854e721..f1aa50586 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -243,6 +243,15 @@ is386: pushl %ecx # restore original EFLAGS xorl %eax,%eax lldt %ax cld # gcc2 wants the direction flag cleared at all times +#ifdef __SMP__ + movb ready, %cl + cmpb $1,%cl + je 1f # the first CPU calls start_kernel + # all other CPUs call initialize_secondary + call SYMBOL_NAME(initialize_secondary) + jmp L6 +1: +#endif call SYMBOL_NAME(start_kernel) L6: jmp L6 # main should never return here, but diff --git 
a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c index 61422f372..043132b8e 100644 --- a/arch/i386/kernel/i386_ksyms.c +++ b/arch/i386/kernel/i386_ksyms.c @@ -8,6 +8,7 @@ #include <linux/in6.h> #include <linux/interrupt.h> #include <linux/smp_lock.h> +#include <linux/acpi.h> #include <asm/semaphore.h> #include <asm/processor.h> @@ -17,6 +18,7 @@ #include <asm/hardirq.h> #include <asm/delay.h> #include <asm/irq.h> +#include <asm/mmx.h> extern void dump_thread(struct pt_regs *, struct user *); extern int dump_fpu(elf_fpregset_t *); @@ -41,6 +43,7 @@ EXPORT_SYMBOL(enable_irq); EXPORT_SYMBOL(disable_irq); EXPORT_SYMBOL(disable_irq_nosync); EXPORT_SYMBOL(kernel_thread); +EXPORT_SYMBOL(acpi_idle); EXPORT_SYMBOL_NOVERS(__down_failed); EXPORT_SYMBOL_NOVERS(__down_failed_interruptible); @@ -71,7 +74,13 @@ EXPORT_SYMBOL(clear_user); EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(__generic_copy_from_user); EXPORT_SYMBOL(__generic_copy_to_user); -EXPORT_SYMBOL(strlen_user); +EXPORT_SYMBOL(strnlen_user); + +#ifdef CONFIG_X86_USE_3DNOW +EXPORT_SYMBOL(_mmx_memcpy); +EXPORT_SYMBOL(mmx_clear_page); +EXPORT_SYMBOL(mmx_copy_page); +#endif #ifdef __SMP__ EXPORT_SYMBOL(cpu_data); @@ -117,3 +126,4 @@ EXPORT_SYMBOL(mca_is_adapter_used); #ifdef CONFIG_VT EXPORT_SYMBOL(screen_info); #endif + diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c index ce4082848..3e9097f06 100644 --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -1,7 +1,6 @@ #include <linux/config.h> #include <linux/ptrace.h> #include <linux/errno.h> -#include <linux/kernel_stat.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/ioport.h> @@ -9,68 +8,23 @@ #include <linux/timex.h> #include <linux/malloc.h> #include <linux/random.h> -#include <linux/smp.h> #include <linux/smp_lock.h> #include <linux/init.h> +#include <linux/kernel_stat.h> #include <asm/system.h> #include <asm/io.h> #include <asm/irq.h> #include <asm/bitops.h> -#include <asm/smp.h> #include <asm/pgtable.h> #include <asm/delay.h> #include <asm/desc.h> #include <linux/irq.h> - -/* - * Intel specific no controller code - * odd that no-controller should be architecture dependent - * but see the ifdef __SMP__ - */ - -static void enable_none(unsigned int irq) { } -static unsigned int startup_none(unsigned int irq) { return 0; } -static void disable_none(unsigned int irq) { } -static void ack_none(unsigned int irq) -{ -#ifdef __SMP__ - /* - * [currently unexpected vectors happen only on SMP and APIC. - * if we want to have non-APIC and non-8259A controllers - * in the future with unexpected vectors, this ack should - * probably be made controller-specific.] - */ - ack_APIC_irq(); -#endif -} - -/* startup is the same as "enable", shutdown is same as "disable" */ -#define shutdown_none disable_none -#define end_none enable_none - -struct hw_interrupt_type no_irq_type = { - "none", - startup_none, - shutdown_none, - enable_none, - disable_none, - ack_none, - end_none -}; - - -/* - * This is the 'legacy' 8259A Programmable Interrupt Controller, - * present in the majority of PC/AT boxes. - * plus some generic x86 specific things if generic specifics makes - * any sense at all. 
- * this file should become arch/i386/kernel/irq.c when the old irq.c - * moves to arch independent land - */ /* + * Common place to define all x86 IRQ vectors + * * This builds up the IRQ handler stubs using some ugly macros in irq.h * * These macros create the low-level assembly IRQ routines that save @@ -79,7 +33,6 @@ struct hw_interrupt_type no_irq_type = { * interrupt-controller happy. */ - BUILD_COMMON_IRQ() #define BI(x,y) \ @@ -93,7 +46,7 @@ BUILD_COMMON_IRQ() /* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x20-0x30) + * (these are usually mapped to vectors 0x20-0x2f) */ BUILD_16_IRQS(0x0) @@ -126,9 +79,9 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) */ BUILD_SMP_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_SMP_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR) -BUILD_SMP_INTERRUPT(stop_cpu_interrupt,STOP_CPU_VECTOR) BUILD_SMP_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_SMP_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) +BUILD_SMP_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) /* * every pentium local APIC has two 'local interrupts', with a @@ -150,7 +103,7 @@ BUILD_SMP_TIMER_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) -static void (*interrupt[NR_IRQS])(void) = { +void (*interrupt[NR_IRQS])(void) = { IRQLIST_16(0x0), #ifdef CONFIG_X86_IO_APIC @@ -164,17 +117,23 @@ static void (*interrupt[NR_IRQS])(void) = { #undef IRQ #undef IRQLIST_16 +/* + * This is the 'legacy' 8259A Programmable Interrupt Controller, + * present in the majority of PC/AT boxes. + * plus some generic x86 specific things if generic specifics makes + * any sense at all. + * this file should become arch/i386/kernel/irq.c when the old irq.c + * moves to arch independent land + */ - - -static void enable_8259A_irq(unsigned int irq); +void enable_8259A_irq(unsigned int irq); void disable_8259A_irq(unsigned int irq); /* shutdown is same as "disable" */ #define end_8259A_irq enable_8259A_irq #define shutdown_8259A_irq disable_8259A_irq -static void mask_and_ack_8259A(unsigned int); +void mask_and_ack_8259A(unsigned int); static unsigned int startup_8259A_irq(unsigned int irq) { @@ -207,8 +166,8 @@ static unsigned int cached_irq_mask = 0xffff; /* * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) - * boards the timer interrupt is not connected to any IO-APIC pin, it's - * fed to the CPU IRQ line directly. + * boards the timer interrupt is not really connected to any IO-APIC pin, + * it's fed to the master 8259A's IR0 line only. * * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. * this 'mixed mode' IRQ handling costs nothing because it's only used @@ -224,22 +183,20 @@ void disable_8259A_irq(unsigned int irq) { unsigned int mask = 1 << irq; cached_irq_mask |= mask; - if (irq & 8) { + if (irq & 8) outb(cached_A1,0xA1); - } else { + else outb(cached_21,0x21); - } } -static void enable_8259A_irq(unsigned int irq) +void enable_8259A_irq(unsigned int irq) { unsigned int mask = ~(1 << irq); cached_irq_mask &= mask; - if (irq & 8) { + if (irq & 8) outb(cached_A1,0xA1); - } else { + else outb(cached_21,0x21); - } } int i8259A_irq_pending(unsigned int irq) @@ -260,24 +217,139 @@ void make_8259A_irq(unsigned int irq) } /* + * This function assumes to be called rarely. Switching between + * 8259A registers is slow. 
+ */ +static inline int i8259A_irq_real(unsigned int irq) +{ + int value; + int irqmask = 1<<irq; + + if (irq < 8) { + outb(0x0B,0x20); /* ISR register */ + value = inb(0x20) & irqmask; + outb(0x0A,0x20); /* back to the IRR register */ + return value; + } + outb(0x0B,0xA0); /* ISR register */ + value = inb(0xA0) & (irqmask >> 8); + outb(0x0A,0xA0); /* back to the IRR register */ + return value; +} + +/* * Careful! The 8259A is a fragile beast, it pretty * much _has_ to be done exactly like this (mask it * first, _then_ send the EOI, and the order of EOI * to the two 8259s is important! */ -static void mask_and_ack_8259A(unsigned int irq) +void mask_and_ack_8259A(unsigned int irq) { - cached_irq_mask |= 1 << irq; + unsigned int irqmask = 1 << irq; + + /* + * Lightweight spurious IRQ detection. We do not want + * to overdo spurious IRQ handling - it's usually a sign + * of hardware problems, so we only do the checks we can + * do without slowing down good hardware unnecesserily. + * + * Note that IRQ7 and IRQ15 (the two spurious IRQs + * usually resulting from the 8259A-1|2 PICs) occur + * even if the IRQ is masked in the 8259A. Thus we + * can check spurious 8259A IRQs without doing the + * quite slow i8259A_irq_real() call for every IRQ. + * This does not cover 100% of spurious interrupts, + * but should be enough to warn the user that there + * is something bad going on ... + */ + if (cached_irq_mask & irqmask) + goto spurious_8259A_irq; + cached_irq_mask |= irqmask; + +handle_real_irq: if (irq & 8) { - inb(0xA1); /* DUMMY */ + inb(0xA1); /* DUMMY - (do we need this?) */ outb(cached_A1,0xA1); - outb(0x62,0x20); /* Specific EOI to cascade */ - outb(0x20,0xA0); + outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */ + outb(0x20,0xA0); /* 'generic EOI' to slave */ } else { - inb(0x21); /* DUMMY */ + inb(0x21); /* DUMMY - (do we need this?) */ outb(cached_21,0x21); - outb(0x20,0x20); + outb(0x20,0x20); /* 'generic EOI' to master */ } + return; + +spurious_8259A_irq: + /* + * this is the slow path - should happen rarely. + */ + if (i8259A_irq_real(irq)) + /* + * oops, the IRQ _is_ in service according to the + * 8259A - not spurious, go handle it. + */ + goto handle_real_irq; + + { + static int spurious_irq_mask = 0; + /* + * At this point we can be sure the IRQ is spurious, + * lets ACK and report it. [once per IRQ] + */ + if (!(spurious_irq_mask & irqmask)) { + printk("spurious 8259A interrupt: IRQ%d.\n", irq); + spurious_irq_mask |= irqmask; + } + irq_err_count++; + /* + * Theoretically we do not have to handle this IRQ, + * but in Linux this does not cause problems and is + * simpler for us. + */ + goto handle_real_irq; + } +} + +void init_8259A(int auto_eoi) +{ + unsigned long flags; + + save_flags(flags); + cli(); + + outb(0xff, 0x21); /* mask all of 8259A-1 */ + outb(0xff, 0xA1); /* mask all of 8259A-2 */ + + /* + * outb_p - this has to work on a wide range of PC hardware. 
+ */ + outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */ + outb_p(0x20 + 0, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ + outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */ + if (auto_eoi) + outb_p(0x03, 0x21); /* master does Auto EOI */ + else + outb_p(0x01, 0x21); /* master expects normal EOI */ + + outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */ + outb_p(0x20 + 8, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ + outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */ + outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode + is to be investigated) */ + + if (auto_eoi) + /* + * in AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_irq_type.ack = disable_8259A_irq; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_21, 0x21); /* restore master IRQ mask */ + outb(cached_A1, 0xA1); /* restore slave IRQ mask */ + + restore_flags(flags); } #ifndef CONFIG_VISWS @@ -307,7 +379,7 @@ static struct irqaction irq13 = { math_error_irq, 0, 0, "fpu", NULL, NULL }; * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, 0, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, 0, 0, "cascade", NULL, NULL}; #endif @@ -315,6 +387,8 @@ void init_ISA_irqs (void) { int i; + init_8259A(0); + for (i = 0; i < NR_IRQS; i++) { irq_desc[i].status = IRQ_DISABLED; irq_desc[i].action = 0; @@ -357,9 +431,9 @@ void __init init_IRQ(void) #ifdef __SMP__ /* - IRQ0 must be given a fixed assignment and initialized - before init_IRQ_SMP. - */ + * IRQ0 must be given a fixed assignment and initialized, + * because it's used before the IO-APIC is set up. + */ set_intr_gate(IRQ0_TRAP_VECTOR, interrupt[0]); /* @@ -371,17 +445,15 @@ void __init init_IRQ(void) /* IPI for invalidation */ set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); - /* IPI for CPU halt */ - set_intr_gate(STOP_CPU_VECTOR, stop_cpu_interrupt); - /* self generated IPI for local APIC timer */ set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); /* IPI for generic function call */ set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - /* IPI vector for APIC spurious interrupts */ + /* IPI vectors for APIC spurious and error interrupts */ set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); #endif /* @@ -397,13 +469,3 @@ void __init init_IRQ(void) setup_irq(13, &irq13); #endif } - -#ifdef CONFIG_X86_IO_APIC -void __init init_IRQ_SMP(void) -{ - int i; - for (i = 0; i < NR_IRQS ; i++) - if (IO_APIC_VECTOR(i) > 0) - set_intr_gate(IO_APIC_VECTOR(i), interrupt[i]); -} -#endif diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 34e3ff86f..9fb8bcd3a 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -1,7 +1,7 @@ /* * Intel IO-APIC support for multi-Pentium hosts. * - * Copyright (C) 1997, 1998 Ingo Molnar, Hajnalka Szabo + * Copyright (C) 1997, 1998, 1999 Ingo Molnar, Hajnalka Szabo * * Many thanks to Stig Venaas for trying out countless experimental * patches and reporting/debugging problems patiently! 
@@ -18,15 +18,21 @@ #include <linux/init.h> #include <linux/delay.h> #include <asm/io.h> +#include <asm/desc.h> #include <linux/irq.h> +#undef __init +#define __init + /* * volatile is justified in this case, IO-APIC register contents * might change spontaneously, GCC should not cache it */ #define IO_APIC_BASE(idx) ((volatile int *)__fix_to_virt(FIX_IO_APIC_BASE_0 + idx)) +extern int nmi_watchdog; + /* * The structure of the IO-APIC: */ @@ -59,6 +65,11 @@ int nr_ioapic_registers[MAX_IO_APICS]; enum ioapic_irq_destination_types { dest_Fixed = 0, dest_LowestPrio = 1, + dest_SMI = 2, + dest__reserved_1 = 3, + dest_NMI = 4, + dest_INIT = 5, + dest__reserved_2 = 6, dest_ExtINT = 7 }; @@ -94,14 +105,7 @@ struct IO_APIC_route_entry { * MP-BIOS irq configuration table structures: */ -enum mp_irq_source_types { - mp_INT = 0, - mp_NMI = 1, - mp_SMI = 2, - mp_ExtINT = 3 -}; - -struct mpc_config_ioapic mp_apics[MAX_IO_APICS];/* I/O APIC entries */ +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];/* I/O APIC entries */ int mp_irq_entries = 0; /* # of MP IRQ source entries */ struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* MP IRQ source entries */ @@ -202,16 +206,10 @@ static void name##_IO_APIC_irq(unsigned int irq) \ FINAL; \ } -/* - * We disable IO-APIC IRQs by setting their 'destination CPU mask' to - * zero. Trick by Ramesh Nalluri. - */ -DO_ACTION( disable, 1, &= 0x00ffffff, io_apic_sync(entry->apic))/* destination = 0x00 */ -DO_ACTION( enable, 1, |= 0xff000000, ) /* destination = 0xff */ DO_ACTION( mask, 0, |= 0x00010000, io_apic_sync(entry->apic))/* mask = 1 */ DO_ACTION( unmask, 0, &= 0xfffeffff, ) /* mask = 0 */ -static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; @@ -289,7 +287,7 @@ static int __init find_irq_entry(int apic, int pin, int type) for (i = 0; i < mp_irq_entries; i++) if ( (mp_irqs[i].mpc_irqtype == type) && - (mp_irqs[i].mpc_dstapic == mp_apics[apic].mpc_apicid) && + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid) && (mp_irqs[i].mpc_dstirq == pin)) return i; @@ -330,7 +328,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pci_pin) int lbus = mp_irqs[i].mpc_srcbus; for (apic = 0; apic < nr_ioapics; apic++) - if (mp_apics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) break; if ((apic || IO_APIC_IRQ(mp_irqs[i].mpc_dstirq)) && @@ -589,24 +587,30 @@ static int __init assign_irq_vector(int irq) static int current_vector = IRQ0_TRAP_VECTOR, offset = 0; if (IO_APIC_VECTOR(irq) > 0) return IO_APIC_VECTOR(irq); + if (current_vector == 0xFF) + panic("ran out of interrupt sources!"); +next: current_vector += 8; - if (current_vector > 0xFE) { + if (current_vector == SYSCALL_VECTOR) + goto next; + + if (current_vector > 0xFF) { offset++; current_vector = IRQ0_TRAP_VECTOR + offset; - printk("WARNING: ASSIGN_IRQ_VECTOR wrapped back to %02X\n", - current_vector); } - if (current_vector == SYSCALL_VECTOR) - panic("ran out of interrupt sources!"); IO_APIC_VECTOR(irq) = current_vector; return current_vector; } +extern void (*interrupt[NR_IRQS])(void); +static struct hw_interrupt_type ioapic_level_irq_type; +static struct hw_interrupt_type ioapic_edge_irq_type; + void __init setup_IO_APIC_irqs(void) { struct IO_APIC_route_entry entry; - int apic, pin, idx, irq, first_notcon = 1; + int apic, pin, idx, irq, first_notcon = 1, vector; printk("init IO_APIC IRQs\n"); @@ -621,15 +625,15 @@ void __init 
setup_IO_APIC_irqs(void) entry.delivery_mode = dest_LowestPrio; entry.dest_mode = 1; /* logical delivery */ entry.mask = 0; /* enable IRQ */ - entry.dest.logical.logical_dest = 0; /* but no route */ + entry.dest.logical.logical_dest = APIC_ALL_CPUS; /* all CPUs */ idx = find_irq_entry(apic,pin,mp_INT); if (idx == -1) { if (first_notcon) { - printk(" IO-APIC (apicid-pin) %d-%d", mp_apics[apic].mpc_apicid, pin); + printk(" IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); first_notcon = 0; } else - printk(", %d-%d", mp_apics[apic].mpc_apicid, pin); + printk(", %d-%d", mp_ioapics[apic].mpc_apicid, pin); continue; } @@ -639,17 +643,29 @@ void __init setup_IO_APIC_irqs(void) if (irq_trigger(idx)) { entry.trigger = 1; entry.mask = 1; - entry.dest.logical.logical_dest = 0xff; + entry.dest.logical.logical_dest = APIC_ALL_CPUS; } - irq = pin_2_irq(idx,apic,pin); + irq = pin_2_irq(idx, apic, pin); add_pin_to_irq(irq, apic, pin); if (!apic && !IO_APIC_IRQ(irq)) continue; - entry.vector = assign_irq_vector(irq); + if (IO_APIC_IRQ(irq)) { + vector = assign_irq_vector(irq); + entry.vector = vector; + + if (IO_APIC_irq_trigger(irq)) + irq_desc[irq].handler = &ioapic_level_irq_type; + else + irq_desc[irq].handler = &ioapic_edge_irq_type; + set_intr_gate(vector, interrupt[irq]); + + if (!apic && (irq < 16)) + disable_8259A_irq(irq); + } io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); } @@ -660,34 +676,47 @@ void __init setup_IO_APIC_irqs(void) } /* - * Set up a certain pin as ExtINT delivered interrupt + * Set up the 8259A-master output pin as broadcast to all + * CPUs. */ -void __init setup_ExtINT_pin(unsigned int apic, unsigned int pin, int irq) +void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector) { struct IO_APIC_route_entry entry; - /* - * add it to the IO-APIC irq-routing table: - */ memset(&entry,0,sizeof(entry)); - entry.delivery_mode = dest_ExtINT; - entry.dest_mode = 0; /* physical delivery */ - entry.mask = 0; /* unmask IRQ now */ - /* - * We use physical delivery to get the timer IRQ - * to the boot CPU. 'boot_cpu_id' is the physical - * APIC ID of the boot CPU. - */ - entry.dest.physical.physical_dest = boot_cpu_id; + disable_8259A_irq(0); - entry.vector = assign_irq_vector(irq); + apic_readaround(APIC_LVT0); + apic_write(APIC_LVT0, 0x00010700); // mask LVT0 + init_8259A(1); + + /* + * We use logical delivery to get the timer IRQ + * to the first CPU. + */ + entry.dest_mode = 1; /* logical delivery */ + entry.mask = 0; /* unmask IRQ now */ + entry.dest.logical.logical_dest = APIC_ALL_CPUS; + entry.delivery_mode = dest_LowestPrio; entry.polarity = 0; entry.trigger = 0; + entry.vector = vector; - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); + /* + * The timer IRQ doesnt have to know that behind the + * scene we have a 8259A-master in AEOI mode ... 
+ */ + irq_desc[0].handler = &ioapic_edge_irq_type; + + /* + * Add it to the IO-APIC irq-routing table: + */ + io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); + + enable_8259A_irq(0); } void __init UNEXPECTED_IO_APIC(void) @@ -705,7 +734,7 @@ void __init print_IO_APIC(void) printk("number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) - printk("number of IO-APIC #%d registers: %d.\n", mp_apics[i].mpc_apicid, nr_ioapic_registers[i]); + printk("number of IO-APIC #%d registers: %d.\n", mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); /* * We are a bit conservative about what we expect. We have to @@ -717,8 +746,10 @@ void __init print_IO_APIC(void) *(int *)®_00 = io_apic_read(apic, 0); *(int *)®_01 = io_apic_read(apic, 1); - *(int *)®_02 = io_apic_read(apic, 2); - printk("\nIO APIC #%d......\n", mp_apics[apic].mpc_apicid); + if (reg_01.version >= 0x10) + *(int *)®_02 = io_apic_read(apic, 2); + + printk("\nIO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); printk(".... register #00: %08X\n", *(int *)®_00); printk("....... : physical APIC id: %02X\n", reg_00.ID); if (reg_00.__reserved_1 || reg_00.__reserved_2) @@ -730,12 +761,15 @@ void __init print_IO_APIC(void) (reg_01.entries != 0x17) && /* typical ISA+PCI boards */ (reg_01.entries != 0x1b) && /* Compaq Proliant boards */ (reg_01.entries != 0x1f) && /* dual Xeon boards */ - (reg_01.entries != 0x3F) /* bigger Xeon boards */ + (reg_01.entries != 0x22) && /* bigger Xeon boards */ + (reg_01.entries != 0x2E) && + (reg_01.entries != 0x3F) ) UNEXPECTED_IO_APIC(); printk("....... : IO APIC version: %04X\n", reg_01.version); - if ( (reg_01.version != 0x10) && /* oldest IO-APICs */ + if ( (reg_01.version != 0x01) && /* 82489DX IO-APICs */ + (reg_01.version != 0x10) && /* oldest IO-APICs */ (reg_01.version != 0x11) && /* Pentium/Pro IO-APICs */ (reg_01.version != 0x13) /* Xeon IO-APICs */ ) @@ -743,10 +777,12 @@ void __init print_IO_APIC(void) if (reg_01.__reserved_1 || reg_01.__reserved_2) UNEXPECTED_IO_APIC(); - printk(".... register #02: %08X\n", *(int *)®_02); - printk("....... : arbitration: %02X\n", reg_02.arbitration); - if (reg_02.__reserved_1 || reg_02.__reserved_2) - UNEXPECTED_IO_APIC(); + if (reg_01.version >= 0x10) { + printk(".... register #02: %08X\n", *(int *)®_02); + printk("....... : arbitration: %02X\n", reg_02.arbitration); + if (reg_02.__reserved_1 || reg_02.__reserved_2) + UNEXPECTED_IO_APIC(); + } printk(".... IRQ redirection table:\n"); @@ -797,8 +833,116 @@ void __init print_IO_APIC(void) return; } +static void print_APIC_bitfield (int base) +{ + unsigned int v; + int i, j; + + printk("0123456789abcdef0123456789abcdef\n"); + for (i = 0; i < 8; i++) { + v = apic_read(base + i*0x10); + for (j = 0; j < 32; j++) { + if (v & (1<<j)) + printk("1"); + else + printk("0"); + } + printk("\n"); + } +} + +void /*__init*/ print_local_APIC(void * dummy) +{ + unsigned int v, ver, maxlvt; + + printk("\nprinting local APIC contents on CPU#%d/%d:\n", + smp_processor_id(), hard_smp_processor_id()); + v = apic_read(APIC_ID); + printk("... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); + v = apic_read(APIC_LVR); + printk("... APIC VERSION: %08x\n", v); + ver = GET_APIC_VERSION(v); + maxlvt = get_maxlvt(); + + v = apic_read(APIC_TASKPRI); + printk("... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + v = apic_read(APIC_ARBPRI); + printk("... 
APIC ARBPRI: %08x (%02x)\n", v, + v & APIC_ARBPRI_MASK); + v = apic_read(APIC_PROCPRI); + printk("... APIC PROCPRI: %08x\n", v); + } + + v = apic_read(APIC_EOI); + printk("... APIC EOI: %08x\n", v); + v = apic_read(APIC_LDR); + printk("... APIC LDR: %08x\n", v); + v = apic_read(APIC_DFR); + printk("... APIC DFR: %08x\n", v); + v = apic_read(APIC_SPIV); + printk("... APIC SPIV: %08x\n", v); + + printk("... APIC ISR field:\n"); + print_APIC_bitfield(APIC_ISR); + printk("... APIC TMR field:\n"); + print_APIC_bitfield(APIC_TMR); + printk("... APIC IRR field:\n"); + print_APIC_bitfield(APIC_IRR); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + /* + * Due to the Pentium erratum 3AP. + */ + if (maxlvt > 3) { + apic_readaround(APIC_SPIV); // not strictly necessery + apic_write(APIC_ESR, 0); + } + v = apic_read(APIC_ESR); + printk("... APIC ESR: %08x\n", v); + } + + v = apic_read(APIC_ICR); + printk("... APIC ICR: %08x\n", v); + v = apic_read(APIC_ICR2); + printk("... APIC ICR2: %08x\n", v); + + v = apic_read(APIC_LVTT); + printk("... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + printk("... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + printk("... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + printk("... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + printk("... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + printk("... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + printk("... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + printk("... APIC TDCR: %08x\n", v); + printk("\n"); +} + +void print_all_local_APICs (void) +{ + smp_call_function(print_local_APIC, NULL, 1, 1); + print_local_APIC(NULL); +} + static void __init init_sym_mode(void) { + struct IO_APIC_reg_01 reg_01; int i; for (i = 0; i < PIN_MAP_SIZE; i++) { @@ -809,24 +953,21 @@ static void __init init_sym_mode(void) for (i = 0; i < MAX_PIRQS; i++) pirq_entries[i] =- 1; - printk("enabling symmetric IO mode... "); - - outb(0x70, 0x22); - outb(0x01, 0x23); - - printk("...done.\n"); + if (pic_mode) { + /* + * PIC mode, enable symmetric IO mode in the IMCR. + */ + printk("leaving PIC mode, enabling symmetric IO mode.\n"); + outb(0x70, 0x22); + outb(0x01, 0x23); + } /* * The number of IO-APIC IRQ registers (== #pins): */ - { - struct IO_APIC_reg_01 reg_01; - int i; - - for (i = 0; i < nr_ioapics; i++) { - *(int *)®_01 = io_apic_read(i, 1); - nr_ioapic_registers[i] = reg_01.entries+1; - } + for (i = 0; i < nr_ioapics; i++) { + *(int *)®_01 = io_apic_read(i, 1); + nr_ioapic_registers[i] = reg_01.entries+1; } /* @@ -835,24 +976,41 @@ static void __init init_sym_mode(void) clear_IO_APIC(); } +static void clear_lapic_ints (void * dummy) +{ + int maxlvt; + + maxlvt = get_maxlvt(); + apic_write_around(APIC_LVTT, 0x00010000); + apic_write_around(APIC_LVT0, 0x00010000); + apic_write_around(APIC_LVT1, 0x00010000); + if (maxlvt >= 3) + apic_write_around(APIC_LVTERR, 0x00010000); + if (maxlvt >= 4) + apic_write_around(APIC_LVTPC, 0x00010000); +} + /* * Not an __init, needed by the reboot code */ void init_pic_mode(void) { /* - * Clear the IO-APIC before rebooting: + * Clear the IO-APIC and local APICs before rebooting: */ clear_IO_APIC(); + smp_call_function(clear_lapic_ints, NULL, 1, 1); + clear_lapic_ints(NULL); /* * Put it back into PIC mode (has an effect only on - * certain boards) + * certain older boards) */ - printk("disabling symmetric IO mode... 
"); + if (pic_mode) { + printk("disabling symmetric IO mode, entering PIC mode.\n"); outb_p(0x70, 0x22); outb_p(0x00, 0x23); - printk("...done.\n"); + } } static void __init setup_ioapic_id(void) @@ -914,10 +1072,13 @@ static void __init construct_default_ISA_mptable(void) * MP specification 1.4 defines some extra rules for default * configurations, fix them up here: */ - switch (mpc_default_type) { case 2: + /* + * IRQ0 is not connected: + */ + mp_irqs[0].mpc_irqtype = mp_ExtINT; break; default: /* @@ -942,7 +1103,7 @@ static int __init timer_irq_works(void) unsigned int t1 = jiffies; sti(); - mdelay(100); + mdelay(40); if (jiffies-t1>1) return 1; @@ -950,6 +1111,27 @@ static int __init timer_irq_works(void) return 0; } +extern atomic_t nmi_counter[NR_CPUS]; + +static int __init nmi_irq_works(void) +{ + atomic_t tmp[NR_CPUS]; + int j, cpu; + + memcpy(tmp, nmi_counter, sizeof(tmp)); + sti(); + mdelay(50); + + for (j = 0; j < smp_num_cpus; j++) { + cpu = cpu_logical_map(j); + if (atomic_read(nmi_counter+cpu) - atomic_read(tmp+cpu) <= 3) { + printk("CPU#%d NMI appears to be stuck.\n", cpu); + return 0; + } + } + return 1; +} + /* * In the SMP+IOAPIC case it might happen that there are an unspecified * number of pending IRQ events unhandled. These cases are very rare, @@ -964,12 +1146,11 @@ static int __init timer_irq_works(void) */ static void enable_edge_ioapic_irq(unsigned int irq) { - enable_IO_APIC_irq(irq); + unmask_IO_APIC_irq(irq); } static void disable_edge_ioapic_irq(unsigned int irq) { - disable_IO_APIC_irq(irq); } /* @@ -995,8 +1176,17 @@ static unsigned int startup_edge_ioapic_irq(unsigned int irq) } #define shutdown_edge_ioapic_irq disable_edge_ioapic_irq -void static ack_edge_ioapic_irq(unsigned int i) + +/* + * Once we have recorded IRQ_PENDING already, we can mask the + * interrupt for real. This prevents IRQ storms from unhandled + * devices. + */ +void static ack_edge_ioapic_irq(unsigned int irq) { + if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) + == (IRQ_PENDING | IRQ_DISABLED)) + mask_IO_APIC_irq(irq); ack_APIC_irq(); } void static end_edge_ioapic_irq(unsigned int i){} @@ -1055,7 +1245,8 @@ static struct hw_interrupt_type ioapic_level_irq_type = { static inline void init_IO_APIC_traps(void) { - int i; + int irq; + /* * NOTE! The local APIC isn't very good at handling * multiple interrupts at the same interrupt level. @@ -1067,36 +1258,62 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for (i = 0; i < NR_IRQS ; i++) { - if (IO_APIC_VECTOR(i) > 0) { - if (IO_APIC_irq_trigger(i)) - irq_desc[i].handler = &ioapic_level_irq_type; - else - irq_desc[i].handler = &ioapic_edge_irq_type; - /* - * disable it in the 8259A: - */ - if (i < 16) - disable_8259A_irq(i); - } else { - if (!IO_APIC_IRQ(i)) - continue; - + for (irq = 0; irq < NR_IRQS ; irq++) { + if (IO_APIC_IRQ(irq) && !IO_APIC_VECTOR(irq)) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (i < 16) { - make_8259A_irq(i); - continue; - } - - /* Strange. Oh, well.. */ - irq_desc[i].handler = &no_irq_type; + if (irq < 16) + make_8259A_irq(irq); + else + /* Strange. Oh, well.. 
*/ + irq_desc[irq].handler = &no_irq_type; } } - init_IRQ_SMP(); +} + +void static ack_lapic_irq (unsigned int irq) +{ + ack_APIC_irq(); +} + +void static end_lapic_irq (unsigned int i) { /* nothing */ } + +static struct hw_interrupt_type lapic_irq_type = { + "local-APIC-edge", + NULL, /* startup_irq() not used for IRQ0 */ + NULL, /* shutdown_irq() not used for IRQ0 */ + NULL, /* enable_irq() not used for IRQ0 */ + NULL, /* disable_irq() not used for IRQ0 */ + ack_lapic_irq, + end_lapic_irq +}; + +static void enable_NMI_through_LVT0 (void * dummy) +{ + apic_readaround(APIC_LVT0); + apic_write(APIC_LVT0, 0x00000400); // unmask and set to NMI +} + +static void setup_nmi (void) +{ + /* + * Dirty trick to enable the NMI watchdog ... + * We put the 8259A master into AEOI mode and + * unmask on all local APICs LVT0 as NMI. + * + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') + * is from Maciej W. Rozycki - so we do not have to EOI from + * the NMI handler or the timer interrupt. + */ + printk("activating NMI Watchdog ..."); + + smp_call_function(enable_NMI_through_LVT0, NULL, 1, 1); + enable_NMI_through_LVT0(NULL); + + printk(" done.\n"); } /* @@ -1108,45 +1325,78 @@ static inline void init_IO_APIC_traps(void) static inline void check_timer(void) { int pin1, pin2; + int vector; + + /* + * get/set the timer IRQ vector: + */ + vector = assign_irq_vector(0); + set_intr_gate(vector, interrupt[0]); pin1 = find_timer_pin(mp_INT); pin2 = find_timer_pin(mp_ExtINT); - enable_IO_APIC_irq(0); - if (!timer_irq_works()) { - if (pin1 != -1) - printk("..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"); - printk("...trying to set up timer as ExtINT... "); + /* + * Ok, does IRQ0 through the IOAPIC work? + */ + if (timer_irq_works()) { + if (nmi_watchdog) { + disable_8259A_irq(0); + init_8259A(1); + setup_nmi(); + enable_8259A_irq(0); + if (nmi_irq_works()) + return; + } else + return; + } - if (pin2 != -1) { - printk(".. (found pin %d) ...", pin2); - /* - * legacy devices should be connected to IO APIC #0 - */ - setup_ExtINT_pin(0, pin2, 0); - make_8259A_irq(0); + if (pin1 != -1) { + printk("..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"); + clear_IO_APIC_pin(0, pin1); + } + + printk("...trying to set up timer (IRQ0) through the 8259A ... "); + if (pin2 != -1) { + printk("\n..... (found pin %d) ...", pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + setup_ExtINT_IRQ0_pin(pin2, vector); + if (timer_irq_works()) { + printk("works.\n"); + if (nmi_watchdog) { + setup_nmi(); + if (nmi_irq_works()) + return; + } else + return; } + /* + * Cleanup, just in case ... + */ + clear_IO_APIC_pin(0, pin2); + } + printk(" failed.\n"); - if (!timer_irq_works()) { - printk(" failed.\n"); - printk("...trying to set up timer as BP IRQ..."); - /* - * Just in case ... - */ - if (pin1 != -1) - clear_IO_APIC_pin(0, pin1); - if (pin2 != -1) - clear_IO_APIC_pin(0, pin2); + if (nmi_watchdog) + printk("timer doesnt work through the IO-APIC - cannot activate NMI Watchdog!\n"); - make_8259A_irq(0); + printk("...trying to set up timer as Virtual Wire IRQ..."); - if (!timer_irq_works()) { - printk(" failed.\n"); - panic("IO-APIC + timer doesn't work!"); - } - } + disable_8259A_irq(0); + irq_desc[0].handler = &lapic_irq_type; + init_8259A(1); // AEOI mode + apic_readaround(APIC_LVT0); + apic_write(APIC_LVT0, 0x00000000 | vector); // Fixed mode + enable_8259A_irq(0); + + if (timer_irq_works()) { printk(" works.\n"); + return; } + printk(" failed :(.\n"); + panic("IO-APIC + timer doesn't work! 
pester mingo@redhat.com"); } /* @@ -1189,6 +1439,5 @@ void __init setup_IO_APIC(void) setup_IO_APIC_irqs(); init_IO_APIC_traps(); check_timer(); - print_IO_APIC(); } diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 3106f1966..8ec329287 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -22,7 +22,6 @@ #include <linux/ptrace.h> #include <linux/errno.h> -#include <linux/kernel_stat.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/ioport.h> @@ -30,14 +29,13 @@ #include <linux/timex.h> #include <linux/malloc.h> #include <linux/random.h> -#include <linux/smp.h> #include <linux/smp_lock.h> #include <linux/init.h> +#include <linux/kernel_stat.h> #include <asm/system.h> #include <asm/io.h> #include <asm/bitops.h> -#include <asm/smp.h> #include <asm/pgtable.h> #include <asm/delay.h> #include <asm/desc.h> @@ -48,7 +46,7 @@ unsigned int local_bh_count[NR_CPUS]; unsigned int local_irq_count[NR_CPUS]; -atomic_t nmi_counter; +extern atomic_t nmi_counter[NR_CPUS]; /* * Linux has a controller-independent x86 interrupt architecture. @@ -75,7 +73,8 @@ spinlock_t irq_controller_lock = SPIN_LOCK_UNLOCKED; /* * Controller mappings for all interrupt sources: */ -irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... NR_IRQS-1] = { 0, &no_irq_type, }}; +irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = + { [0 ... NR_IRQS-1] = { 0, &no_irq_type, }}; /* * Special irq handlers. @@ -84,6 +83,52 @@ irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... NR_IRQS-1] = { 0, &n void no_action(int cpl, void *dev_id, struct pt_regs *regs) { } /* + * Generic no controller code + */ + +static void enable_none(unsigned int irq) { } +static unsigned int startup_none(unsigned int irq) { return 0; } +static void disable_none(unsigned int irq) { } +static void ack_none(unsigned int irq) +{ +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves, it doesnt deserve + * a generic callback i think. + */ +#if CONFIG_X86 + printk("unexpected IRQ trap at vector %02x\n", irq); +#ifdef __SMP__ + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. 
+ */ + ack_APIC_irq(); +#endif +#endif +} + +/* startup is the same as "enable", shutdown is same as "disable" */ +#define shutdown_none disable_none +#define end_none enable_none + +struct hw_interrupt_type no_irq_type = { + "none", + startup_none, + shutdown_none, + enable_none, + disable_none, + ack_none, + end_none +}; + +volatile unsigned long irq_err_count; + +/* * Generic, controller-independent functions: */ @@ -106,22 +151,30 @@ int get_irq_list(char *buf) #ifndef __SMP__ p += sprintf(p, "%10u ", kstat_irqs(i)); #else - for (j=0; j<smp_num_cpus; j++) + for (j = 0; j < smp_num_cpus; j++) p += sprintf(p, "%10u ", kstat.irqs[cpu_logical_map(j)][i]); #endif p += sprintf(p, " %14s", irq_desc[i].handler->typename); p += sprintf(p, " %s", action->name); - for (action=action->next; action; action = action->next) { + for (action=action->next; action; action = action->next) p += sprintf(p, ", %s", action->name); - } *p++ = '\n'; } - p += sprintf(p, "NMI: %10u\n", atomic_read(&nmi_counter)); -#ifdef __SMP__ - p += sprintf(p, "ERR: %10lu\n", ipi_count); -#endif + p += sprintf(p, "NMI: "); + for (j = 0; j < smp_num_cpus; j++) + p += sprintf(p, "%10u ", + atomic_read(nmi_counter+cpu_logical_map(j))); + p += sprintf(p, "\n"); +#if CONFIG_SMP + p += sprintf(p, "LOC: "); + for (j = 0; j < smp_num_cpus; j++) + p += sprintf(p, "%10u ", + apic_timer_irqs[cpu_logical_map(j)]); + p += sprintf(p, "\n"); +#endif + p += sprintf(p, "ERR: %10lu\n", irq_err_count); return p - buf; } @@ -520,7 +573,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs) kstat.irqs[cpu][irq]++; desc = irq_desc + irq; spin_lock(&irq_controller_lock); - irq_desc[irq].handler->ack(irq); + desc->handler->ack(irq); /* REPLAY is when Linux resends an IRQ that was dropped earlier WAITING is used by probe to mark irqs that are being tested @@ -570,9 +623,8 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs) spin_unlock(&irq_controller_lock); } desc->status &= ~IRQ_INPROGRESS; - if (!(desc->status & IRQ_DISABLED)){ - irq_desc[irq].handler->end(irq); - } + if (!(desc->status & IRQ_DISABLED)) + desc->handler->end(irq); spin_unlock(&irq_controller_lock); /* diff --git a/arch/i386/kernel/mtrr.c b/arch/i386/kernel/mtrr.c index f76c68f59..f55e86b61 100644 --- a/arch/i386/kernel/mtrr.c +++ b/arch/i386/kernel/mtrr.c @@ -223,6 +223,8 @@ 19990819 Alan Cox <alan@redhat.com> Tested Zoltan's changes on a pre production Athlon - 100% success. + 19991008 Manfred Spraul <manfreds@colorfullife.com> + replaced spin_lock_reschedule() with a normal semaphore. */ #include <linux/types.h> #include <linux/errno.h> @@ -303,8 +305,6 @@ typedef u8 mtrr_type; TRUE) #endif -#define spin_lock_reschedule(lock) while (!spin_trylock(lock)) schedule (); - #ifndef CONFIG_PROC_FS # define compute_ascii() while (0) #endif @@ -314,7 +314,7 @@ static char *ascii_buffer = NULL; static unsigned int ascii_buf_bytes = 0; #endif static unsigned int *usage_table = NULL; -static spinlock_t main_lock = SPIN_LOCK_UNLOCKED; +static DECLARE_MUTEX(main_lock); /* Private functions */ #ifdef CONFIG_PROC_FS @@ -1172,7 +1172,7 @@ int mtrr_add (unsigned long base, unsigned long size, unsigned int type, increment = increment ? 
1 : 0; max = get_num_var_ranges (); /* Search for existing MTRR */ - spin_lock_reschedule (&main_lock); + down(&main_lock); for (i = 0; i < max; ++i) { (*get_mtrr) (i, &lbase, &lsize, <ype); @@ -1181,7 +1181,7 @@ int mtrr_add (unsigned long base, unsigned long size, unsigned int type, /* At this point we know there is some kind of overlap/enclosure */ if ( (base < lbase) || (base + size > lbase + lsize) ) { - spin_unlock (&main_lock); + up(&main_lock); printk ("mtrr: 0x%lx,0x%lx overlaps existing 0x%lx,0x%lx\n", base, size, lbase, lsize); return -EINVAL; @@ -1190,14 +1190,14 @@ int mtrr_add (unsigned long base, unsigned long size, unsigned int type, if (ltype != type) { if (type == MTRR_TYPE_UNCACHABLE) continue; - spin_unlock (&main_lock); + up(&main_lock); printk ( "mtrr: type mismatch for %lx,%lx old: %s new: %s\n", base, size, attrib_to_str (ltype), attrib_to_str (type) ); return -EINVAL; } if (increment) ++usage_table[i]; compute_ascii (); - spin_unlock (&main_lock); + up(&main_lock); return i; } /* Search for an empty MTRR */ @@ -1211,7 +1211,7 @@ int mtrr_add (unsigned long base, unsigned long size, unsigned int type, set_mtrr (i, base, size, type); usage_table[i] = 1; compute_ascii (); - spin_unlock (&main_lock); + up(&main_lock); return i; } /* End Function mtrr_add */ @@ -1232,7 +1232,7 @@ int mtrr_del (int reg, unsigned long base, unsigned long size) if ( !(boot_cpu_data.x86_capability & X86_FEATURE_MTRR) ) return -ENODEV; max = get_num_var_ranges (); - spin_lock_reschedule (&main_lock); + down(&main_lock); if (reg < 0) { /* Search for existing MTRR */ @@ -1247,14 +1247,14 @@ int mtrr_del (int reg, unsigned long base, unsigned long size) } if (reg < 0) { - spin_unlock (&main_lock); + up(&main_lock); printk ("mtrr: no MTRR for %lx,%lx found\n", base, size); return -EINVAL; } } if (reg >= max) { - spin_unlock (&main_lock); + up(&main_lock); printk ("mtrr: register: %d too big\n", reg); return -EINVAL; } @@ -1262,7 +1262,7 @@ int mtrr_del (int reg, unsigned long base, unsigned long size) { if ((reg == 3) && arr3_protected) { - spin_unlock (&main_lock); + up(&main_lock); printk ("mtrr: ARR3 cannot be changed\n"); return -EINVAL; } @@ -1270,19 +1270,19 @@ int mtrr_del (int reg, unsigned long base, unsigned long size) (*get_mtrr) (reg, &lbase, &lsize, <ype); if (lsize < 1) { - spin_unlock (&main_lock); + up(&main_lock); printk ("mtrr: MTRR %d not used\n", reg); return -EINVAL; } if (usage_table[reg] < 1) { - spin_unlock (&main_lock); + up(&main_lock); printk ("mtrr: reg: %d has count=0\n", reg); return -EINVAL; } if (--usage_table[reg] < 1) set_mtrr (reg, 0, 0, 0); compute_ascii (); - spin_unlock (&main_lock); + up(&main_lock); return reg; } /* End Function mtrr_del */ diff --git a/arch/i386/kernel/pci-i386.c b/arch/i386/kernel/pci-i386.c new file mode 100644 index 000000000..af362611d --- /dev/null +++ b/arch/i386/kernel/pci-i386.c @@ -0,0 +1,312 @@ +/* + * Low-Level PCI Access for i386 machines + * + * Copyright 1993, 1994 Drew Eckhardt + * Visionary Computing + * (Unix and Linux consulting and custom programming) + * Drew@Colorado.EDU + * +1 (303) 786-7975 + * + * Drew's work was sponsored by: + * iX Multiuser Multitasking Magazine + * Hannover, Germany + * hm@ix.de + * + * Copyright 1997--1999 Martin Mares <mj@atrey.karlin.mff.cuni.cz> + * + * For more information, please consult the following manuals (look at + * http://www.pcisig.com/ for how to get them): + * + * PCI BIOS Specification + * PCI Local Bus Specification + * PCI to PCI Bridge Specification + * PCI System Design 
Guide + * + * + * CHANGELOG : + * Jun 17, 1994 : Modified to accommodate the broken pre-PCI BIOS SPECIFICATION + * Revision 2.0 present on <thys@dennis.ee.up.ac.za>'s ASUS mainboard. + * + * Jan 5, 1995 : Modified to probe PCI hardware at boot time by Frederic + * Potter, potter@cao-vlsi.ibp.fr + * + * Jan 10, 1995 : Modified to store the information about configured pci + * devices into a list, which can be accessed via /proc/pci by + * Curtis Varner, cvarner@cs.ucr.edu + * + * Jan 12, 1995 : CPU-PCI bridge optimization support by Frederic Potter. + * Alpha version. Intel & UMC chipset support only. + * + * Apr 16, 1995 : Source merge with the DEC Alpha PCI support. Most of the code + * moved to drivers/pci/pci.c. + * + * Dec 7, 1996 : Added support for direct configuration access of boards + * with Intel compatible access schemes (tsbogend@alpha.franken.de) + * + * Feb 3, 1997 : Set internal functions to static, save/restore flags + * avoid dead locks reading broken PCI BIOS, werner@suse.de + * + * Apr 26, 1997 : Fixed case when there is BIOS32, but not PCI BIOS + * (mj@atrey.karlin.mff.cuni.cz) + * + * May 7, 1997 : Added some missing cli()'s. [mj] + * + * Jun 20, 1997 : Corrected problems in "conf1" type accesses. + * (paubert@iram.es) + * + * Aug 2, 1997 : Split to PCI BIOS handling and direct PCI access parts + * and cleaned it up... Martin Mares <mj@atrey.karlin.mff.cuni.cz> + * + * Feb 6, 1998 : No longer using BIOS to find devices and device classes. [mj] + * + * May 1, 1998 : Support for peer host bridges. [mj] + * + * Jun 19, 1998 : Changed to use spinlocks, so that PCI configuration space + * can be accessed from interrupts even on SMP systems. [mj] + * + * August 1998 : Better support for peer host bridges and more paranoid + * checks for direct hardware access. Ugh, this file starts to look as + * a large gallery of common hardware bug workarounds (watch the comments) + * -- the PCI specs themselves are sane, but most implementors should be + * hit hard with \hammer scaled \magstep5. [mj] + * + * Jan 23, 1999 : More improvements to peer host bridge logic. i450NX fixup. [mj] + * + * Feb 8, 1999 : Added UM8886BF I/O address fixup. [mj] + * + * August 1999 : New resource management and configuration access stuff. [mj] + * + * Sep 19, 1999 : Use PCI IRQ routing tables for detection of peer host bridges. + * Based on ideas by Chris Frantz and David Hinds. [mj] + * + * Sep 28, 1999 : Handle unreported/unassigned IRQs. Thanks to Shuu Yamaguchi + * for a lot of patience during testing. [mj] + * + * Oct 8, 1999 : Split to pci-i386.c, pci-pc.c and pci-visws.c. [mj] + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/init.h> +#include <linux/ioport.h> +#include <linux/errno.h> + +#include "pci-i386.h" + +/* + * Assign new address to PCI resource. We hope our resource information + * is complete. On the PC, we don't re-assign resources unless we are + * forced to do so. + * + * Expects start=0, end=size-1, flags=resource type. 
+ */ + +static int __init pcibios_assign_resource(struct pci_dev *dev, int i) +{ + struct resource *r = &dev->resource[i]; + struct resource *pr = pci_find_parent_resource(dev, r); + unsigned long size = r->end + 1; + u32 new, check; + + if (!pr) { + printk(KERN_ERR "PCI: Cannot find parent resource for device %s\n", dev->slot_name); + return -EINVAL; + } + if (r->flags & IORESOURCE_IO) { + /* + * We need to avoid collisions with `mirrored' VGA ports and other strange + * ISA hardware, so we always want the addresses kilobyte aligned. + */ + if (size > 0x100) { + printk(KERN_ERR "PCI: I/O Region %s/%d too large (%ld bytes)\n", dev->slot_name, i, size); + return -EFBIG; + } + if (allocate_resource(pr, r, size, 0x1000, ~0, 1024)) { + printk(KERN_ERR "PCI: Allocation of I/O region %s/%d (%ld bytes) failed\n", dev->slot_name, i, size); + return -EBUSY; + } + } else { + if (allocate_resource(pr, r, size, 0x10000000, ~0, size)) { + printk(KERN_ERR "PCI: Allocation of memory region %s/%d (%ld bytes) failed\n", dev->slot_name, i, size); + return -EBUSY; + } + } + if (i < 6) { + int reg = PCI_BASE_ADDRESS_0 + 4*i; + new = r->start | (r->flags & PCI_REGION_FLAG_MASK); + pci_write_config_dword(dev, reg, new); + pci_read_config_dword(dev, reg, &check); + if (new != check) + printk(KERN_ERR "PCI: Error while updating region %s/%d (%08x != %08x)\n", dev->slot_name, i, new, check); + } else if (i == PCI_ROM_RESOURCE) { + r->flags |= PCI_ROM_ADDRESS_ENABLE; + pci_write_config_dword(dev, dev->rom_base_reg, r->start | (r->flags & PCI_REGION_FLAG_MASK)); + } + printk("PCI: Assigned addresses %08lx-%08lx to region %s/%d\n", r->start, r->end, dev->slot_name, i); + return 0; +} + +/* + * Handle resources of PCI devices. If the world were perfect, we could + * just allocate all the resource regions and do nothing more. It isn't. + * On the other hand, we cannot just re-allocate all devices, as it would + * require us to know lots of host bridge internals. So we attempt to + * keep as much of the original configuration as possible, but tweak it + * when it's found to be wrong. + * + * Known BIOS problems we have to work around: + * - I/O or memory regions not configured + * - regions configured, but not enabled in the command register + * - bogus I/O addresses above 64K used + * - expansion ROMs left enabled (this may sound harmless, but given + * the fact the PCI specs explicitly allow address decoders to be + * shared between expansion ROMs and other resource regions, it's + * at least dangerous) + * + * Our solution: + * (1) Allocate resources for all buses behind PCI-to-PCI bridges. + * This gives us fixed barriers on where we can allocate. + * (2) Allocate resources for all enabled devices. If there is + * a collision, just mark the resource as unallocated. Also + * disable expansion ROMs during this step. + * (3) Try to allocate resources for disabled devices. If the + * resources were assigned correctly, everything goes well, + * if they weren't, they won't disturb allocation of other + * resources. + * (4) Assign new addresses to resources which were either + * not configured at all or misconfigured. If explicitly + * requested by the user, configure expansion ROM address + * as well. Finally enable the I/O and Memory bits. 
+ */ + +static void __init pcibios_allocate_bus_resources(struct pci_bus *bus) +{ + struct pci_dev *dev; + int idx; + struct resource *r, *pr; + + /* Depth-First Search on bus tree */ + while (bus) { + if ((dev = bus->self)) { + for (idx = PCI_BRIDGE_RESOURCES; idx < PCI_NUM_RESOURCES; idx++) { + r = &dev->resource[idx]; + if (!r->start) + continue; + pr = pci_find_parent_resource(dev, r); + if (!pr || request_resource(pr, r) < 0) + printk(KERN_ERR "PCI: Cannot allocate resource region %d of bridge %s\n", idx, dev->slot_name); + } + } + if (bus->children) + pcibios_allocate_bus_resources(bus->children); + bus = bus->next; + } +} + +static void __init pcibios_allocate_resources(int pass) +{ + struct pci_dev *dev; + int idx, disabled; + u16 command; + struct resource *r, *pr; + + for(dev=pci_devices; dev; dev=dev->next) { + pci_read_config_word(dev, PCI_COMMAND, &command); + for(idx = 0; idx < 6; idx++) { + r = &dev->resource[idx]; + if (r->parent) /* Already allocated */ + continue; + if (!r->start) /* Address not assigned at all */ + continue; + if (r->flags & IORESOURCE_IO) + disabled = !(command & PCI_COMMAND_IO); + else + disabled = !(command & PCI_COMMAND_MEMORY); + if (pass == disabled) { + DBG("PCI: Resource %08lx-%08lx (f=%lx, d=%d, p=%d)\n", + r->start, r->end, r->flags, disabled, pass); + pr = pci_find_parent_resource(dev, r); + if (!pr || request_resource(pr, r) < 0) { + printk(KERN_ERR "PCI: Cannot allocate resource region %d of device %s\n", idx, dev->slot_name); + /* We'll assign a new address later */ + r->start -= r->end; + r->start = 0; + } + } + } + if (!pass) { + r = &dev->resource[PCI_ROM_RESOURCE]; + if (r->flags & PCI_ROM_ADDRESS_ENABLE) { + /* Turn the ROM off, leave the resource region, but keep it unregistered. */ + u32 reg; + DBG("PCI: Switching off ROM of %s\n", dev->slot_name); + r->flags &= ~PCI_ROM_ADDRESS_ENABLE; + pci_read_config_dword(dev, dev->rom_base_reg, ®); + pci_write_config_dword(dev, dev->rom_base_reg, reg & ~PCI_ROM_ADDRESS_ENABLE); + } + } + } +} + +static void __init pcibios_assign_resources(void) +{ + struct pci_dev *dev; + u16 cmd, old_cmd; + int idx; + int fault = 0; + struct resource *r; + + for(dev=pci_devices; dev; dev=dev->next) { + pci_read_config_word(dev, PCI_COMMAND, &cmd); + old_cmd = cmd; + for(idx=0; idx<6; idx++) { + r = &dev->resource[idx]; + if (((dev->class >> 8) == PCI_CLASS_STORAGE_IDE && idx < 4) || + ((dev->class >> 8) == PCI_CLASS_DISPLAY_VGA && (r->flags & IORESOURCE_IO))) + /* + * Don't touch IDE controllers and I/O ports of video cards! + * Neither enable anything in their command registers. + */ + continue; + if (!r->start && r->end) { + /* + * We shall assign a new address to this resource, either because + * the BIOS forgot to do so or because we have decided the old + * address was unusable for some reason. 
+ */ + if (pcibios_assign_resource(dev, idx) < 0) + fault = 1; + } + if (r->flags & IORESOURCE_IO) + cmd |= PCI_COMMAND_IO; + if (r->flags & IORESOURCE_MEM) + cmd |= PCI_COMMAND_MEMORY; + } + + if (cmd != old_cmd) { + if (fault) + printk("PCI: Not enabling device %s because of resource collisions\n", dev->slot_name); + else { + printk("PCI: Enabling device %s (%04x -> %04x)\n", dev->slot_name, old_cmd, cmd); + pci_write_config_word(dev, PCI_COMMAND, cmd); + } + } + + if (pci_probe & PCI_ASSIGN_ROMS) { + r = &dev->resource[PCI_ROM_RESOURCE]; + r->end -= r->start; + r->start = 0; + if (r->end) + pcibios_assign_resource(dev, PCI_ROM_RESOURCE); + } + } +} + +void __init pcibios_resource_survey(void) +{ + pcibios_allocate_bus_resources(pci_root); + pcibios_allocate_resources(0); + pcibios_allocate_resources(1); + pcibios_assign_resources(); +} diff --git a/arch/i386/kernel/pci-i386.h b/arch/i386/kernel/pci-i386.h new file mode 100644 index 000000000..41ac2b856 --- /dev/null +++ b/arch/i386/kernel/pci-i386.h @@ -0,0 +1,29 @@ +/* + * Low-Level PCI Access for i386 machines. + * + * (c) 1999 Martin Mares <mj@ucw.cz> + */ + +#undef DEBUG + +#ifdef DEBUG +#define DBG(x...) printk(x) +#else +#define DBG(x...) +#endif + +#define PCI_PROBE_BIOS 1 +#define PCI_PROBE_CONF1 2 +#define PCI_PROBE_CONF2 4 +#define PCI_NO_SORT 0x100 +#define PCI_BIOS_SORT 0x200 +#define PCI_NO_CHECKS 0x400 +#define PCI_NO_PEER_FIXUP 0x800 +#define PCI_ASSIGN_ROMS 0x1000 +#define PCI_NO_IRQ_SCAN 0x2000 + +extern unsigned int pci_probe; + +/* pci-i386.c */ + +void pcibios_resource_survey(void); diff --git a/arch/i386/kernel/bios32.c b/arch/i386/kernel/pci-pc.c index f0c63c938..be3076f30 100644 --- a/arch/i386/kernel/bios32.c +++ b/arch/i386/kernel/pci-pc.c @@ -1,119 +1,53 @@ /* - * bios32.c - Low-Level PCI Access + * Low-Level PCI Support for PC * - * $Id: bios32.c,v 1.48 1998/09/26 08:06:55 mj Exp $ - * - * Copyright 1993, 1994 Drew Eckhardt - * Visionary Computing - * (Unix and Linux consulting and custom programming) - * Drew@Colorado.EDU - * +1 (303) 786-7975 - * - * Drew's work was sponsored by: - * iX Multiuser Multitasking Magazine - * Hannover, Germany - * hm@ix.de - * - * Copyright 1997--1999 Martin Mares <mj@atrey.karlin.mff.cuni.cz> - * - * For more information, please consult the following manuals (look at - * http://www.pcisig.com/ for how to get them): - * - * PCI BIOS Specification - * PCI Local Bus Specification - * PCI to PCI Bridge Specification - * PCI System Design Guide - * - * - * CHANGELOG : - * Jun 17, 1994 : Modified to accommodate the broken pre-PCI BIOS SPECIFICATION - * Revision 2.0 present on <thys@dennis.ee.up.ac.za>'s ASUS mainboard. - * - * Jan 5, 1995 : Modified to probe PCI hardware at boot time by Frederic - * Potter, potter@cao-vlsi.ibp.fr - * - * Jan 10, 1995 : Modified to store the information about configured pci - * devices into a list, which can be accessed via /proc/pci by - * Curtis Varner, cvarner@cs.ucr.edu - * - * Jan 12, 1995 : CPU-PCI bridge optimization support by Frederic Potter. - * Alpha version. Intel & UMC chipset support only. - * - * Apr 16, 1995 : Source merge with the DEC Alpha PCI support. Most of the code - * moved to drivers/pci/pci.c. 
- * - * Dec 7, 1996 : Added support for direct configuration access of boards - * with Intel compatible access schemes (tsbogend@alpha.franken.de) - * - * Feb 3, 1997 : Set internal functions to static, save/restore flags - * avoid dead locks reading broken PCI BIOS, werner@suse.de - * - * Apr 26, 1997 : Fixed case when there is BIOS32, but not PCI BIOS - * (mj@atrey.karlin.mff.cuni.cz) - * - * May 7, 1997 : Added some missing cli()'s. [mj] - * - * Jun 20, 1997 : Corrected problems in "conf1" type accesses. - * (paubert@iram.es) - * - * Aug 2, 1997 : Split to PCI BIOS handling and direct PCI access parts - * and cleaned it up... Martin Mares <mj@atrey.karlin.mff.cuni.cz> - * - * Feb 6, 1998 : No longer using BIOS to find devices and device classes. [mj] - * - * May 1, 1998 : Support for peer host bridges. [mj] - * - * Jun 19, 1998 : Changed to use spinlocks, so that PCI configuration space - * can be accessed from interrupts even on SMP systems. [mj] - * - * August 1998 : Better support for peer host bridges and more paranoid - * checks for direct hardware access. Ugh, this file starts to look as - * a large gallery of common hardware bug workarounds (watch the comments) - * -- the PCI specs themselves are sane, but most implementors should be - * hit hard with \hammer scaled \magstep5. [mj] - * - * Jan 23, 1999 : More improvements to peer host bridge logic. i450NX fixup. [mj] - * - * Feb 8, 1999 : Added UM8886BF I/O address fixup. [mj] - * - * August 1999 : New resource management and configuration access stuff. [mj] + * (c) 1999 Martin Mares <mj@ucw.cz> */ #include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> +#include <linux/sched.h> #include <linux/pci.h> #include <linux/init.h> -#include <linux/ioport.h> #include <linux/malloc.h> -#include <linux/smp_lock.h> +#include <linux/interrupt.h> #include <linux/irq.h> -#include <linux/spinlock.h> -#include <asm/page.h> #include <asm/segment.h> -#include <asm/system.h> #include <asm/io.h> #include <asm/smp.h> -#undef DEBUG +#include "pci-i386.h" -#ifdef DEBUG -#define DBG(x...) printk(x) -#else -#define DBG(x...) -#endif +unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2; -#define PCI_PROBE_BIOS 1 -#define PCI_PROBE_CONF1 2 -#define PCI_PROBE_CONF2 4 -#define PCI_NO_SORT 0x100 -#define PCI_BIOS_SORT 0x200 -#define PCI_NO_CHECKS 0x400 -#define PCI_NO_PEER_FIXUP 0x800 -#define PCI_ASSIGN_ROMS 0x1000 +/* + * IRQ routing table provided by the BIOS + */ -static unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2; +struct irq_info { + u8 bus, devfn; /* Bus, device and function */ + struct { + u8 link; /* IRQ line ID, chipset dependent, 0=not routed */ + u16 bitmap; /* Available IRQs */ + } __attribute__((packed)) irq[4]; + u8 slot; /* Slot number, 0=onboard */ + u8 rfu; +} __attribute__((packed)); + +struct irq_routing_table { + u32 signature; /* PIRQ_SIGNATURE should be here */ + u16 version; /* PIRQ_VERSION */ + u16 size; /* Table size in bytes */ + u8 rtr_bus, rtr_devfn; /* Where the interrupt router lies */ + u16 exclusive_irqs; /* IRQs devoted exclusively to PCI usage */ + u16 rtr_vendor, rtr_device; /* Vendor and device ID of interrupt router */ + u32 miniport_data; /* Crap */ + u8 rfu[11]; + u8 checksum; /* Modulo 256 checksum must give zero */ + struct irq_info slots[0]; +} __attribute__((packed)); /* * Direct access to PCI hardware... 
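The conf1 accessors in the hunk that follows implement PCI configuration mechanism #1: a 32-bit address word (enable bit, bus number, devfn, register offset) is written to CONFIG_ADDRESS at port 0xCF8, and the data is then transferred through CONFIG_DATA at port 0xCFC, with the low offset bits selecting the byte or word lane. A minimal user-space sketch of the address encoding is shown below; the helper name and the example bus/device values are illustrative only and not part of this patch (the kernel code performs the real outl/inl against 0xCF8/0xCFC).

    /*
     * Standalone sketch (not part of this patch) of how configuration
     * mechanism #1 encodes a config-space address.  The kernel accessors
     * write this dword to port 0xCF8 (CONFIG_ADDRESS) and then move data
     * through port 0xCFC (CONFIG_DATA); here we only compute and print
     * the encoding, so the program runs without I/O privileges.
     */
    #include <stdio.h>

    /* Mirrors CONFIG_CMD() in the patch: enable bit, bus, devfn, register. */
    static unsigned int conf1_address(unsigned int bus, unsigned int devfn,
                                      unsigned int where)
    {
            return 0x80000000u              /* enable bit               */
                 | (bus   << 16)            /* bus number, 8 bits       */
                 | (devfn << 8)             /* device/function, 8 bits  */
                 | (where & ~3u);           /* dword-aligned register   */
    }

    int main(void)
    {
            /* Example: bus 0, device 2, function 0, PCI_COMMAND (offset 0x04). */
            unsigned int devfn = (2 << 3) | 0;   /* same layout as PCI_DEVFN(slot, func) */
            printf("CONFIG_ADDRESS = 0x%08x\n", conf1_address(0, devfn, 0x04));
            return 0;
    }

For bus 0, device 2, function 0, register 0x04 this prints CONFIG_ADDRESS = 0x80001004, which is the value CONFIG_CMD() in the accessors would generate before the byte, word, or dword access at 0xCFC + (where & 3).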
@@ -129,55 +63,55 @@ static unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CON static int pci_conf1_read_config_byte(struct pci_dev *dev, int where, u8 *value) { - outl(CONFIG_CMD(dev,where), 0xCF8); - *value = inb(0xCFC + (where&3)); - return PCIBIOS_SUCCESSFUL; + outl(CONFIG_CMD(dev,where), 0xCF8); + *value = inb(0xCFC + (where&3)); + return PCIBIOS_SUCCESSFUL; } static int pci_conf1_read_config_word(struct pci_dev *dev, int where, u16 *value) { - outl(CONFIG_CMD(dev,where), 0xCF8); - *value = inw(0xCFC + (where&2)); - return PCIBIOS_SUCCESSFUL; + outl(CONFIG_CMD(dev,where), 0xCF8); + *value = inw(0xCFC + (where&2)); + return PCIBIOS_SUCCESSFUL; } static int pci_conf1_read_config_dword(struct pci_dev *dev, int where, u32 *value) { - outl(CONFIG_CMD(dev,where), 0xCF8); - *value = inl(0xCFC); - return PCIBIOS_SUCCESSFUL; + outl(CONFIG_CMD(dev,where), 0xCF8); + *value = inl(0xCFC); + return PCIBIOS_SUCCESSFUL; } static int pci_conf1_write_config_byte(struct pci_dev *dev, int where, u8 value) { - outl(CONFIG_CMD(dev,where), 0xCF8); - outb(value, 0xCFC + (where&3)); - return PCIBIOS_SUCCESSFUL; + outl(CONFIG_CMD(dev,where), 0xCF8); + outb(value, 0xCFC + (where&3)); + return PCIBIOS_SUCCESSFUL; } static int pci_conf1_write_config_word(struct pci_dev *dev, int where, u16 value) { - outl(CONFIG_CMD(dev,where), 0xCF8); - outw(value, 0xCFC + (where&2)); - return PCIBIOS_SUCCESSFUL; + outl(CONFIG_CMD(dev,where), 0xCF8); + outw(value, 0xCFC + (where&2)); + return PCIBIOS_SUCCESSFUL; } static int pci_conf1_write_config_dword(struct pci_dev *dev, int where, u32 value) { - outl(CONFIG_CMD(dev,where), 0xCF8); - outl(value, 0xCFC); - return PCIBIOS_SUCCESSFUL; + outl(CONFIG_CMD(dev,where), 0xCF8); + outl(value, 0xCFC); + return PCIBIOS_SUCCESSFUL; } #undef CONFIG_CMD static struct pci_ops pci_direct_conf1 = { - pci_conf1_read_config_byte, - pci_conf1_read_config_word, - pci_conf1_read_config_dword, - pci_conf1_write_config_byte, - pci_conf1_write_config_word, - pci_conf1_write_config_dword + pci_conf1_read_config_byte, + pci_conf1_read_config_word, + pci_conf1_read_config_dword, + pci_conf1_write_config_byte, + pci_conf1_write_config_word, + pci_conf1_write_config_dword }; /* @@ -192,50 +126,50 @@ static struct pci_ops pci_direct_conf1 = { static int pci_conf2_read_config_byte(struct pci_dev *dev, int where, u8 *value) { - SET(dev); - *value = inb(IOADDR(dev->devfn,where)); - outb (0, 0xCF8); - return PCIBIOS_SUCCESSFUL; + SET(dev); + *value = inb(IOADDR(dev->devfn,where)); + outb (0, 0xCF8); + return PCIBIOS_SUCCESSFUL; } static int pci_conf2_read_config_word(struct pci_dev *dev, int where, u16 *value) { - SET(dev); - *value = inw(IOADDR(dev->devfn,where)); - outb (0, 0xCF8); - return PCIBIOS_SUCCESSFUL; + SET(dev); + *value = inw(IOADDR(dev->devfn,where)); + outb (0, 0xCF8); + return PCIBIOS_SUCCESSFUL; } static int pci_conf2_read_config_dword(struct pci_dev *dev, int where, u32 *value) { - SET(dev); - *value = inl (IOADDR(dev->devfn,where)); - outb (0, 0xCF8); - return PCIBIOS_SUCCESSFUL; + SET(dev); + *value = inl (IOADDR(dev->devfn,where)); + outb (0, 0xCF8); + return PCIBIOS_SUCCESSFUL; } static int pci_conf2_write_config_byte(struct pci_dev *dev, int where, u8 value) { - SET(dev); - outb (value, IOADDR(dev->devfn,where)); - outb (0, 0xCF8); - return PCIBIOS_SUCCESSFUL; + SET(dev); + outb (value, IOADDR(dev->devfn,where)); + outb (0, 0xCF8); + return PCIBIOS_SUCCESSFUL; } static int pci_conf2_write_config_word(struct pci_dev *dev, int where, u16 value) { - SET(dev); - outw 
(value, IOADDR(dev->devfn,where)); - outb (0, 0xCF8); - return PCIBIOS_SUCCESSFUL; + SET(dev); + outw (value, IOADDR(dev->devfn,where)); + outb (0, 0xCF8); + return PCIBIOS_SUCCESSFUL; } static int pci_conf2_write_config_dword(struct pci_dev *dev, int where, u32 value) { - SET(dev); - outl (value, IOADDR(dev->devfn,where)); - outb (0, 0xCF8); - return PCIBIOS_SUCCESSFUL; + SET(dev); + outl (value, IOADDR(dev->devfn,where)); + outb (0, 0xCF8); + return PCIBIOS_SUCCESSFUL; } #undef SET @@ -243,12 +177,12 @@ static int pci_conf2_write_config_dword(struct pci_dev *dev, int where, u32 valu #undef FUNC static struct pci_ops pci_direct_conf2 = { - pci_conf2_read_config_byte, - pci_conf2_read_config_word, - pci_conf2_read_config_dword, - pci_conf2_write_config_byte, - pci_conf2_write_config_word, - pci_conf2_write_config_dword + pci_conf2_read_config_byte, + pci_conf2_read_config_word, + pci_conf2_read_config_dword, + pci_conf2_write_config_byte, + pci_conf2_write_config_word, + pci_conf2_write_config_dword }; /* @@ -267,10 +201,6 @@ static int __init pci_sanity_check(struct pci_ops *o) struct pci_bus bus; /* Fake bus and device */ struct pci_dev dev; -#ifdef CONFIG_VISWS - return 1; /* Lithium PCI Bridges are non-standard */ -#endif - if (pci_probe & PCI_NO_CHECKS) return 1; bus.number = 0; @@ -347,6 +277,8 @@ static struct pci_ops * __init pci_check_direct(void) #define PCIBIOS_WRITE_CONFIG_BYTE 0xb10b #define PCIBIOS_WRITE_CONFIG_WORD 0xb10c #define PCIBIOS_WRITE_CONFIG_DWORD 0xb10d +#define PCIBIOS_GET_ROUTING_OPTIONS 0xb10e +#define PCIBIOS_SET_PCI_HW_INT 0xb10f /* BIOS32 signature: "_32_" */ #define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) @@ -427,7 +359,7 @@ static unsigned long bios32_service(unsigned long service) printk("bios32_service(0x%lx): not present\n", service); return 0; default: /* Shouldn't happen */ - printk("bios32_service(0x%lx): returned 0x%x, report to <mj@ucw.cz>.\n", + printk("bios32_service(0x%lx): returned 0x%x -- BIOS bug!\n", service, return_code); return 0; } @@ -489,31 +421,6 @@ static int __init check_pcibios(void) return 0; } -#if 0 /* Not used */ - -static int pci_bios_find_class (unsigned int class_code, unsigned short index, - unsigned char *bus, unsigned char *device_fn) -{ - unsigned long bx; - unsigned long ret; - - __asm__ ("lcall (%%edi)\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" - : "=b" (bx), - "=a" (ret) - : "1" (PCIBIOS_FIND_PCI_CLASS_CODE), - "c" (class_code), - "S" ((int) index), - "D" (&pci_indirect)); - *bus = (bx >> 8) & 0xff; - *device_fn = bx & 0xff; - return (int) (ret & 0xff00) >> 8; -} - -#endif - static int __init pci_bios_find_device (unsigned short vendor, unsigned short device_id, unsigned short index, unsigned char *bus, unsigned char *device_fn) { @@ -757,85 +664,73 @@ static void __init pcibios_sort(void) *last = NULL; } -#endif - /* - * Several BIOS'es forget to assign addresses to I/O ranges. Try to fix it. 
+ * Ask BIOS for IRQ Routing Table */ -static void __init pcibios_fixup_io_addr(struct pci_dev *dev, int idx) -{ - unsigned int reg = PCI_BASE_ADDRESS_0 + 4*idx; - struct resource *r = &dev->resource[idx]; - unsigned int size = r->end - r->start + 1; +struct irq_routing_options { + u16 size; + struct irq_info *table; + u16 segment; +} __attribute__((packed)); - if (((dev->class >> 8) == PCI_CLASS_STORAGE_IDE && idx < 4) || - (dev->class >> 8) == PCI_CLASS_DISPLAY_VGA) { - /* - * In case the BIOS didn't assign an address 0--3 to an IDE - * controller, we don't try to fix it as it means "use default - * addresses" at least with several broken chips and the IDE - * driver needs the original settings to recognize which devices - * correspond to the primary controller. - * - * We don't assign VGA I/O ranges as well. - */ - return; - } - /* - * We need to avoid collisions with `mirrored' VGA ports and other strange - * ISA hardware, so we always want the addresses kilobyte aligned. - */ - if (!size || size > 256) { - printk(KERN_ERR "PCI: Cannot assign I/O space to device %s, %d bytes are too much.\n", dev->name, size); - return; - } else { - u32 try; +static unsigned long pcibios_irq_page __initdata = 0; - r->start = 0; - r->end = size - 1; - if (pci_assign_resource(dev, idx)) { - printk(KERN_ERR "PCI: Unable to find free %d bytes of I/O space for device %s.\n", size, dev->name); - return; - } - printk("PCI: Assigned I/O space %04lx-%04lx to device %s\n", r->start, r->end, dev->name); - pci_read_config_dword(dev, reg, &try); - if ((try & PCI_BASE_ADDRESS_IO_MASK) != r->start) { - r->start = 0; - pci_write_config_dword(dev, reg, 0); - printk(KERN_ERR "PCI: I/O address setup failed, got %04x\n", try); - } - } +static inline void __init pcibios_free_irq_routing_table(void) +{ + if (pcibios_irq_page) + free_page(pcibios_irq_page); } -/* - * Assign address to expansion ROM. This is a highly experimental feature - * and you must enable it by "pci=rom". It's even not guaranteed to work - * with all cards since the PCI specs allow address decoders to be shared - * between the ROM space and one of the standard regions (sigh!). - */ -static void __init pcibios_fixup_rom_addr(struct pci_dev *dev) +static struct irq_routing_table * __init pcibios_get_irq_routing_table(void) { - int reg = (dev->hdr_type == 1) ? PCI_ROM_ADDRESS1 : PCI_ROM_ADDRESS; - struct resource *r = &dev->resource[PCI_ROM_RESOURCE]; - unsigned long rom_size = r->end - r->start + 1; - - r->start = 0; - r->end = rom_size - 1; - if (pci_assign_resource(dev, PCI_ROM_RESOURCE)) - printk(KERN_ERR "PCI: Unable to find free space for expansion ROM of device %s (0x%lx bytes)\n", - dev->name, rom_size); - else { - DBG("PCI: Assigned address %08lx to expansion ROM of %s (0x%lx bytes)\n", r->start, dev->name, rom_size); - pci_write_config_dword(dev, reg, r->start | PCI_ROM_ADDRESS_ENABLE); - r->flags |= PCI_ROM_ADDRESS_ENABLE; + struct irq_routing_options opt; + struct irq_routing_table *rt; + int ret, map; + + if (pci_probe & PCI_NO_IRQ_SCAN) + return NULL; + pcibios_irq_page = __get_free_page(GFP_KERNEL); + if (!pcibios_irq_page) + return 0; + rt = (void *) pcibios_irq_page; + opt.table = rt->slots; + opt.size = PAGE_SIZE - sizeof(struct irq_routing_table); + opt.segment = __KERNEL_DS; + + DBG("PCI: Fetching IRQ routing table... 
"); + __asm__("push %%es\n\t" + "push %%ds\n\t" + "pop %%es\n\t" + "lcall (%%esi)\n\t" + "pop %%es\n\t" + "jc 1f\n\t" + "xor %%ah, %%ah\n" + "1:" + : "=a" (ret), + "=b" (map) + : "0" (PCIBIOS_GET_ROUTING_OPTIONS), + "1" (0), + "D" ((long) &opt), + "S" (&pci_indirect)); + DBG("OK ret=%d, size=%d, map=%x\n", ret, opt.size, map); + if (ret & 0xff00) { + printk(KERN_ERR "PCI: Error %02x when fetching IRQ routing table.\n", (ret >> 8) & 0xff); + return 0; } + + memset(rt, 0, sizeof(struct irq_routing_table)); + rt->size = opt.size + sizeof(struct irq_routing_table); + printk("PCI: Using BIOS Interrupt Routing Table\n"); + return rt; } +#endif + /* * Several buggy motherboards address only 16 devices and mirror * them to next 16 IDs. We try to detect this `feature' on all - * primary busses (those containing host bridges as they are + * primary buses (those containing host bridges as they are * expected to be unique) and remove the ghost devices. */ @@ -868,7 +763,7 @@ static void __init pcibios_fixup_ghosts(struct pci_bus *b) } if (!seen_host_bridge) return; - printk("PCI: Ignoring ghost devices on bus %d\n", b->number); + printk("PCI: Ignoring ghost devices on bus %02x\n", b->number); for(e=b->devices; e->sibling != d; e=e->sibling); e->sibling = NULL; for(z=&pci_devices; (d=*z);) @@ -893,16 +788,11 @@ static void __init pcibios_fixup_peer_bridges(void) struct pci_dev *d; struct pci_ops *ops = pci_root->ops; -#ifdef CONFIG_VISWS - pci_scan_bus(1, ops, NULL); - return; -#endif - #ifdef CONFIG_PCI_DIRECT /* * Don't search for peer host bridges if we use config type 2 - * since it reads bogus values for non-existent busses and - * chipsets supporting multiple primary busses use conf1 anyway. + * since it reads bogus values for non-existent buses and + * chipsets supporting multiple primary buses use conf1 anyway. */ if (ops == &pci_direct_conf2) return; @@ -966,7 +856,7 @@ static void __init pci_fixup_i450nx(struct pci_dev *d) */ int pxb, reg; u8 busno, suba, subb; - printk("PCI: Searching for i450NX host bridges on %s\n", d->name); + printk("PCI: Searching for i450NX host bridges on %s\n", d->slot_name); reg = 0xd0; for(pxb=0; pxb<2; pxb++) { pci_read_config_byte(d, reg++, &busno); @@ -989,113 +879,214 @@ static void __init pci_fixup_umc_ide(struct pci_dev *d) */ int i; - printk("PCI: Fixing base address flags for device %s\n", d->name); + printk("PCI: Fixing base address flags for device %s\n", d->slot_name); for(i=0; i<4; i++) d->resource[i].flags |= PCI_BASE_ADDRESS_SPACE_IO; } +static void __init pci_fixup_ide_bases(struct pci_dev *d) +{ + int i; + + /* + * PCI IDE controllers use non-standard I/O port decoding, respect it. + */ + if ((d->class >> 8) != PCI_CLASS_STORAGE_IDE) + return; + DBG("PCI: IDE base address fixup for %s\n", d->slot_name); + for(i=0; i<4; i++) { + struct resource *r = &d->resource[i]; + if ((r->start & ~0x80) == 0x374) { + r->start |= 2; + r->end = r->start; + } + } +} + struct pci_fixup pcibios_fixups[] = { { PCI_FIXUP_HEADER, PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx }, { PCI_FIXUP_HEADER, PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide }, + { PCI_FIXUP_HEADER, PCI_ANY_ID, PCI_ANY_ID, pci_fixup_ide_bases }, { 0 } }; /* - * Allocate resources for all PCI devices. We need to do that before - * we try to fix up anything. + * Fix up IRQs of all PCI devices. 
*/ -static void __init pcibios_claim_resources(struct pci_bus *bus) -{ - struct pci_dev *dev; - int idx; +extern int skip_ioapic_setup; - while (bus) { - for (dev=bus->devices; dev; dev=dev->sibling) - for (idx = 0; idx < PCI_NUM_RESOURCES; idx++) { - struct resource *r = &dev->resource[idx]; - struct resource *pr; - if (!r->start) - continue; - pr = pci_find_parent_resource(dev, r); - if (!pr || request_resource(pr, r) < 0) { - printk(KERN_ERR "PCI: Address space collision on region %d of device %s\n", idx, dev->name); - /* We probably should disable the region, shouldn't we? */ - } +#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24)) +#define PIRQ_VERSION 0x0100 + +/* + * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table. + */ + +static struct irq_routing_table * __init pcibios_find_irq_routing_table(void) +{ + u8 *addr; + struct irq_routing_table *rt; + int i; + u8 sum; + + for(addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) { + rt = (struct irq_routing_table *) addr; + if (rt->signature != PIRQ_SIGNATURE || + rt->version != PIRQ_VERSION || + rt->size % 16 || + rt->size < sizeof(struct irq_routing_table)) + continue; + sum = 0; + for(i=0; i<rt->size; i++) + sum += addr[i]; + if (!sum) { + printk("PCI: Interrupt Routing Table found at 0x%p [router type %04x/%04x]\n", + rt, rt->rtr_vendor, rt->rtr_device); + return rt; } - if (bus->children) - pcibios_claim_resources(bus->children); - bus = bus->next; } + return NULL; } /* - * Fix base addresses, I/O and memory enables and IRQ's (mostly work-arounds - * for buggy PCI BIOS'es :-[). + * If we have a IRQ routing table, use it to search for peer host + * bridges. It's a gross hack, but since there are no other known + * ways how to get a list of buses, we have to go this way. */ -extern int skip_ioapic_setup; - -static void __init pcibios_fixup_devices(void) +static void __init pcibios_irq_peer_trick(struct irq_routing_table *rt) { - struct pci_dev *dev; - int i, has_io, has_mem; - unsigned short cmd; + u8 busmap[256]; + int i; + struct irq_info *e; - for(dev = pci_devices; dev; dev=dev->next) { + memset(busmap, 0, sizeof(busmap)); + for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) { + e = &rt->slots[i]; + DBG("b=%02x d=%02x s=%02x\n", e->bus, e->devfn, e->slot); + busmap[e->bus] = 1; + } + for(i=1; i<256; i++) /* - * There are buggy BIOSes that forget to enable I/O and memory - * access to PCI devices. We try to fix this, but we need to - * be sure that the BIOS didn't forget to assign an address - * to the device. [mj] + * It might be a secondary bus, but in this case its parent is already + * known (ascending bus order) and therefore pci_scan_bus returns immediately. */ - has_io = has_mem = 0; - for(i=0; i<6; i++) { - struct resource *r = &dev->resource[i]; - if (r->flags & PCI_BASE_ADDRESS_SPACE_IO) { - has_io = 1; - if (!r->start || r->start == PCI_BASE_ADDRESS_IO_MASK) - pcibios_fixup_io_addr(dev, i); - } else if (r->start) - has_mem = 1; - } + if (busmap[i] && pci_scan_bus(i, pci_root->ops, NULL)) + printk("PCI: Discovered primary peer bus %02x [IRQ]\n", i); + pci_probe |= PCI_NO_PEER_FIXUP; +} + +/* + * In case BIOS forgets to tell us about IRQ, we try to look it up in the routing + * table, but unfortunately we have to know the interrupt router chip. 
+ */ + +static char * __init pcibios_lookup_irq(struct pci_dev *dev, struct irq_routing_table *rt, int pin) +{ + struct irq_info *q; + struct pci_dev *router; + int i, pirq, newirq; + u32 rtrid, mask; + u8 x; + + pin--; + DBG("IRQ for %s(%d)", dev->slot_name, pin); + while (dev->bus->self) { + pin = (pin + PCI_SLOT(dev->devfn)) % 4; + dev = dev->bus->self; + DBG(" -> %s(%d)", dev->slot_name, pin); + } + for(q = rt->slots, i = rt->size - sizeof(struct irq_routing_table); + i && (q->bus != dev->bus->number || PCI_SLOT(q->devfn) != PCI_SLOT(dev->devfn)); + i -= sizeof(struct irq_info), q++) + ; + if (!i) { + DBG(" -> not found in routing table\n"); + return NULL; + } + pirq = q->irq[pin].link; + mask = q->irq[pin].bitmap; + if (!pirq) { + DBG(" -> not routed\n"); + return NULL; + } + DBG(" -> PIRQ %02x, mask %04x", pirq, mask); + if ((dev->class >> 8) == PCI_CLASS_DISPLAY_VGA) + newirq = 0; + else for(newirq = 15; newirq && !(mask & (1 << newirq)); newirq--) + ; + if (!(router = pci_find_slot(rt->rtr_bus, rt->rtr_devfn))) { + DBG(" -> router not found\n"); + return NULL; + } +#define ID(x,y) ((x << 16) | y) + rtrid = ID(rt->rtr_vendor, rt->rtr_device); + if (!rtrid) { /* - * Don't enable VGA-compatible cards since they have - * fixed I/O and memory space. - * - * Don't enabled disabled IDE interfaces either because - * some BIOSes may reallocate the same address when they - * find that no devices are attached. + * Several BIOSes forget to set the router type. In such cases, we + * use chip vendor/device. This doesn't guarantee us semantics of + * PIRQ values, but was found to work in practice and it's still + * better than not trying. */ - if (((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) && - ((dev->class >> 8) != PCI_CLASS_STORAGE_IDE)) { - pci_read_config_word(dev, PCI_COMMAND, &cmd); - if (has_io && !(cmd & PCI_COMMAND_IO)) { - printk("PCI: Enabling I/O for device %s\n", dev->name); - cmd |= PCI_COMMAND_IO; - pci_write_config_word(dev, PCI_COMMAND, cmd); - } - if (has_mem && !(cmd & PCI_COMMAND_MEMORY)) { - printk("PCI: Enabling memory for device %s\n", dev->name); - cmd |= PCI_COMMAND_MEMORY; - pci_write_config_word(dev, PCI_COMMAND, cmd); - } + DBG(" [%s]", router->slot_name); + rtrid = ID(router->vendor, router->device); + } + switch (rtrid) { + case ID(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371FB_0): + case ID(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371SB_0): + case ID(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_0): + /* Intel PIIX: PIRQ holds configuration register address */ + pci_read_config_byte(router, pirq, &x); + if (x < 16) { + DBG(" -> [PIIX] %02x\n", x); + dev->irq = x; + return "PIIX"; + } else if (newirq) { + DBG(" -> [PIIX] set to %02x\n", newirq); + pci_write_config_byte(router, pirq, newirq); + dev->irq = newirq; + return "PIIX-NEW"; } - /* - * Assign address to expansion ROM if requested. 
- */ - if ((pci_probe & PCI_ASSIGN_ROMS) && dev->resource[PCI_ROM_RESOURCE].end) - pcibios_fixup_rom_addr(dev); + DBG(" -> [PIIX] sink\n"); + return NULL; + default: + DBG(" -> unknown router %04x/%04x\n", rt->rtr_vendor, rt->rtr_device); + if (newirq && mask == (1 << newirq)) { + /* Only one IRQ available -> use it */ + dev->irq = newirq; + return "guess"; + } + return NULL; + } +#undef ID +} + +static void __init pcibios_fixup_irqs(void) +{ + struct irq_routing_table *rtable; + struct pci_dev *dev; + u8 pin; + + rtable = pcibios_find_irq_routing_table(); +#ifdef CONFIG_PCI_BIOS + if (!rtable && pci_bios_present) + rtable = pcibios_get_irq_routing_table(); +#endif + + if (rtable) + pcibios_irq_peer_trick(rtable); + + for(dev=pci_devices; dev; dev=dev->next) { + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); #if defined(CONFIG_X86_IO_APIC) /* - * Recalculate IRQ numbers if we use the I/O APIC + * Recalculate IRQ numbers if we use the I/O APIC. */ if(!skip_ioapic_setup) { int irq; - unsigned char pin; - pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); if (pin) { pin--; /* interrupt pins are numbered starting from 1 */ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); @@ -1115,14 +1106,24 @@ static void __init pcibios_fixup_devices(void) dev->irq = irq; } } + rtable = NULL; /* Avoid IRQ assignment below */ } #endif /* - * Fix out-of-range IRQ numbers + * Fix out-of-range IRQ numbers and missing IRQs. */ if (dev->irq >= NR_IRQS) dev->irq = 0; + if (pin && !dev->irq && rtable && rtable->version) { + char *msg = pcibios_lookup_irq(dev, rtable, pin); + if (msg) + printk("PCI: Assigned IRQ %d to device %s [%s]\n", dev->irq, dev->slot_name, msg); + } } + +#ifdef CONFIG_PCI_BIOS + pcibios_free_irq_routing_table(); +#endif } /* @@ -1133,6 +1134,7 @@ static void __init pcibios_fixup_devices(void) void __init pcibios_fixup_bus(struct pci_bus *b) { pcibios_fixup_ghosts(b); + pci_read_bridge_bases(b); } /* @@ -1170,10 +1172,10 @@ void __init pcibios_init(void) printk("PCI: Probing PCI hardware\n"); pci_scan_bus(0, ops, NULL); + pcibios_fixup_irqs(); if (!(pci_probe & PCI_NO_PEER_FIXUP)) pcibios_fixup_peer_bridges(); - pcibios_claim_resources(pci_root); - pcibios_fixup_devices(); + pcibios_resource_survey(); #ifdef CONFIG_PCI_BIOS if ((pci_probe & PCI_BIOS_SORT) && !(pci_probe & PCI_NO_SORT)) @@ -1197,6 +1199,9 @@ char * __init pcibios_setup(char *str) } else if (!strcmp(str, "nosort")) { pci_probe |= PCI_NO_SORT; return NULL; + } else if (!strcmp(str, "noirq")) { + pci_probe |= PCI_NO_IRQ_SCAN; + return NULL; } #endif #ifdef CONFIG_PCI_DIRECT diff --git a/arch/i386/kernel/pci-visws.c b/arch/i386/kernel/pci-visws.c new file mode 100644 index 000000000..31a767a22 --- /dev/null +++ b/arch/i386/kernel/pci-visws.c @@ -0,0 +1,131 @@ +/* + * Low-Level PCI Support for SGI Visual Workstation + * + * (c) 1999 Martin Mares <mj@ucw.cz> + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/pci.h> +#include <linux/init.h> +#include <linux/irq.h> + +#include <asm/smp.h> +#include <asm/lithium.h> + +#include "pci-i386.h" + +unsigned int pci_probe = 0; + +/* + * The VISWS uses configuration access type 1 only. 
+ */ + +#define CONFIG_CMD(dev, where) (0x80000000 | (dev->bus->number << 16) | (dev->devfn << 8) | (where & ~3)) + +static int pci_conf1_read_config_byte(struct pci_dev *dev, int where, u8 *value) +{ + outl(CONFIG_CMD(dev,where), 0xCF8); + *value = inb(0xCFC + (where&3)); + return PCIBIOS_SUCCESSFUL; +} + +static int pci_conf1_read_config_word(struct pci_dev *dev, int where, u16 *value) +{ + outl(CONFIG_CMD(dev,where), 0xCF8); + *value = inw(0xCFC + (where&2)); + return PCIBIOS_SUCCESSFUL; +} + +static int pci_conf1_read_config_dword(struct pci_dev *dev, int where, u32 *value) +{ + outl(CONFIG_CMD(dev,where), 0xCF8); + *value = inl(0xCFC); + return PCIBIOS_SUCCESSFUL; +} + +static int pci_conf1_write_config_byte(struct pci_dev *dev, int where, u8 value) +{ + outl(CONFIG_CMD(dev,where), 0xCF8); + outb(value, 0xCFC + (where&3)); + return PCIBIOS_SUCCESSFUL; +} + +static int pci_conf1_write_config_word(struct pci_dev *dev, int where, u16 value) +{ + outl(CONFIG_CMD(dev,where), 0xCF8); + outw(value, 0xCFC + (where&2)); + return PCIBIOS_SUCCESSFUL; +} + +static int pci_conf1_write_config_dword(struct pci_dev *dev, int where, u32 value) +{ + outl(CONFIG_CMD(dev,where), 0xCF8); + outl(value, 0xCFC); + return PCIBIOS_SUCCESSFUL; +} + +#undef CONFIG_CMD + +static struct pci_ops visws_pci_ops = { + pci_conf1_read_config_byte, + pci_conf1_read_config_word, + pci_conf1_read_config_dword, + pci_conf1_write_config_byte, + pci_conf1_write_config_word, + pci_conf1_write_config_dword +}; + +static void __init pcibios_fixup_irqs(void) +{ + struct pci_dev *dev, *p; + u8 pin; + int irq; + + for(dev=pci_devices; dev; dev=dev->next) { + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + dev->irq = 0; + if (!pin) + continue; + pin--; + if (dev->bus->parent) { + p = dev->bus->parent->self; + pin = (pin + PCI_SLOT(dev->devfn)) % 4; + } else + p = dev; + irq = visws_get_PCI_irq_vector(p->bus->number, PCI_SLOT(p->devfn), pin+1); + if (irq >= 0) + dev->irq = irq; + DBG("PCI IRQ: %s pin %d -> %d\n", dev->slot_name, pin, irq); + } +} + +void __init pcibios_fixup_bus(struct pci_bus *b) +{ + pci_read_bridge_bases(b); +} + +#if 0 +static struct resource visws_pci_bus_resources[2] = { + { "Host bus 1", 0xf4000000, 0xf7ffffff, 0 }, + { "Host bus 2", 0xf0000000, 0xf3ffffff, 0 } +}; +#endif + +void __init pcibios_init(void) +{ + unsigned int sec_bus = li_pcib_read16(LI_PCI_BUSNUM) & 0xff; + + printk("PCI: Probing PCI hardware on host buses 00 and %02x\n", sec_bus); + pci_scan_bus(0, &visws_pci_ops, NULL); + pci_scan_bus(sec_bus, &visws_pci_ops, NULL); + pcibios_fixup_irqs(); + pcibios_resource_survey(); +} + +char * __init pcibios_setup(char *str) +{ + return str; +} diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index f44234eb7..e2253ccca 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -1,63 +1,61 @@ /* - * Intel MP v1.1/v1.4 specification support routines for multi-pentium - * hosts. + * Intel SMP support routines. * * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> - * (c) 1998 Ingo Molnar - * - * Supported by Caldera http://www.caldera.com. - * Much of the core SMP work is based on previous work by Thomas Radke, to - * whom a great many thanks are extended. - * - * Thanks to Intel for making available several different Pentium, - * Pentium Pro and Pentium-II/Xeon MP machines. + * (c) 1998-99 Ingo Molnar <mingo@redhat.com> * * This code is released under the GNU public license version 2 or * later. 
- * - * Fixes - * Felix Koop : NR_CPUS used properly - * Jose Renau : Handle single CPU case. - * Alan Cox : By repeated request 8) - Total BogoMIP report. - * Greg Wright : Fix for kernel stacks panic. - * Erich Boleyn : MP v1.4 and additional changes. - * Matthias Sattler : Changes for 2.1 kernel map. - * Michel Lespinasse : Changes for 2.1 kernel map. - * Michael Chastain : Change trampoline.S to gnu as. - * Alan Cox : Dumb bug: 'B' step PPro's are fine - * Ingo Molnar : Added APIC timers, based on code - * from Jose Renau - * Alan Cox : Added EBDA scanning - * Ingo Molnar : various cleanups and rewrites - * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. */ #include <linux/config.h> +#include <linux/init.h> + #include <linux/mm.h> #include <linux/kernel_stat.h> -#include <linux/delay.h> -#include <linux/mc146818rtc.h> #include <linux/smp_lock.h> -#include <linux/init.h> -#include <asm/mtrr.h> -#include <asm/msr.h> - #include <linux/irq.h> -#define JIFFIE_TIMEOUT 100 +#include <linux/delay.h> +#include <linux/mc146818rtc.h> +#include <asm/mtrr.h> -extern void update_one_process( struct task_struct *p, - unsigned long ticks, unsigned long user, - unsigned long system, int cpu); /* * Some notes on processor bugs: * - * Pentium and Pentium Pro (and all CPUs) have bugs. The Linux issues - * for SMP are handled as follows. + * Pentium, Pentium Pro, II, III (and all CPUs) have bugs. + * The Linux implications for SMP are handled as follows: + * + * Pentium III / [Xeon] + * None of the E1AP-E3AP erratas are visible to the user. + * + * E1AP. see PII A1AP + * E2AP. see PII A2AP + * E3AP. see PII A3AP + * + * Pentium II / [Xeon] + * None of the A1AP-A3AP erratas are visible to the user. + * + * A1AP. see PPro 1AP + * A2AP. see PPro 2AP + * A3AP. see PPro 7AP * * Pentium Pro - * Occasional delivery of 'spurious interrupt' as trap #16. This - * is very rare. The kernel logs the event and recovers + * None of 1AP-9AP erratas are visible to the normal user, + * except occasional delivery of 'spurious interrupt' as trap #15. + * This is very rare and a non-problem. + * + * 1AP. Linux maps APIC as non-cacheable + * 2AP. worked around in hardware + * 3AP. fixed in C0 and above steppings microcode update. + * Linux does not use excessive STARTUP_IPIs. + * 4AP. worked around in hardware + * 5AP. symmetric IO mode (normal Linux operation) not affected. + * 'noapic' mode has vector 0xf filled out properly. + * 6AP. 'noapic' mode might be affected - fixed in later steppings + * 7AP. We do not assume writes to the LVT deassering IRQs + * 8AP. We do not enable low power mode (deep sleep) during MP bootup + * 9AP. We do not use mixed mode * * Pentium * There is a marginal case where REP MOVS on 100MHz SMP @@ -77,1351 +75,34 @@ extern void update_one_process( struct task_struct *p, * 4AP. Linux never generated 3 interrupts of the same priority * to cause a lost local interrupt. * 5AP. Remote read is never used - * 9AP. XXX NEED TO CHECK WE HANDLE THIS XXX - * 10AP. XXX NEED TO CHECK WE HANDLE THIS XXX + * 6AP. not affected - worked around in hardware + * 7AP. not affected - worked around in hardware + * 8AP. worked around in hardware - we get explicit CS errors if not + * 9AP. only 'noapic' mode affected. Might generate spurious + * interrupts, we log only the first one and count the + * rest silently. + * 10AP. not affected - worked around in hardware * 11AP. Linux reads the APIC between writes to avoid this, as per * the documentation. 
Make sure you preserve this as it affects * the C stepping chips too. + * 12AP. not affected - worked around in hardware + * 13AP. not affected - worked around in hardware + * 14AP. we always deassert INIT during bootup + * 15AP. not affected - worked around in hardware + * 16AP. not affected - worked around in hardware + * 17AP. not affected - worked around in hardware + * 18AP. not affected - worked around in hardware + * 19AP. not affected - worked around in BIOS * - * If this sounds worrying believe me these bugs are ___RARE___ and - * there's about nothing of note with C stepping upwards. + * If this sounds worrying believe me these bugs are either ___RARE___, + * or are signal timing bugs worked around in hardware and there's + * about nothing of note with C stepping upwards. */ - -/* Kernel spinlock */ +/* The 'big kernel lock' */ spinlock_t kernel_flag = SPIN_LOCK_UNLOCKED; -/* - * function prototypes: - */ -static void cache_APIC_registers (void); -static void stop_this_cpu (void); - -static int smp_b_stepping = 0; /* Set if we find a B stepping CPU */ - -static int max_cpus = -1; /* Setup configured maximum number of CPUs to activate */ -int smp_found_config=0; /* Have we found an SMP box */ - -unsigned long cpu_present_map = 0; /* Bitmask of physically existing CPUs */ -unsigned long cpu_online_map = 0; /* Bitmask of currently online CPUs */ -int smp_num_cpus = 0; /* Total count of live CPUs */ -int smp_threads_ready=0; /* Set when the idlers are all forked */ -volatile int cpu_number_map[NR_CPUS]; /* which CPU maps to which logical number */ -volatile int __cpu_logical_map[NR_CPUS]; /* which logical number maps to which CPU */ -static volatile unsigned long cpu_callin_map[NR_CPUS] = {0,}; /* We always use 0 the rest is ready for parallel delivery */ -static volatile unsigned long cpu_callout_map[NR_CPUS] = {0,}; /* We always use 0 the rest is ready for parallel delivery */ -volatile unsigned long smp_invalidate_needed; /* Used for the invalidate map that's also checked in the spinlock */ -volatile unsigned long kstack_ptr; /* Stack vector for booting CPUs */ -struct cpuinfo_x86 cpu_data[NR_CPUS]; /* Per CPU bogomips and other parameters */ -static unsigned int num_processors = 1; /* Internal processor count */ -unsigned long mp_ioapic_addr = 0xFEC00000; /* Address of the I/O apic (not yet used) */ -unsigned char boot_cpu_id = 0; /* Processor that is doing the boot up */ -static int smp_activated = 0; /* Tripped once we need to start cross invalidating */ -int apic_version[NR_CPUS]; /* APIC version number */ -unsigned long apic_retval; /* Just debugging the assembler.. 
*/ - -volatile unsigned long kernel_counter=0; /* Number of times the processor holds the lock */ -volatile unsigned long syscall_count=0; /* Number of times the processor holds the syscall lock */ - -volatile unsigned long ipi_count; /* Number of IPIs delivered */ - -const char lk_lockmsg[] = "lock from interrupt context at %p\n"; - -int mp_bus_id_to_type [MAX_MP_BUSSES] = { -1, }; -extern int nr_ioapics; -extern struct mpc_config_ioapic mp_apics [MAX_IO_APICS]; -extern int mp_irq_entries; -extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES]; -extern int mpc_default_type; -int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { -1, }; -int mp_current_pci_id = 0; -unsigned long mp_lapic_addr = 0; -int skip_ioapic_setup = 0; /* 1 if "noapic" boot option passed */ - -/* #define SMP_DEBUG */ - -#ifdef SMP_DEBUG -#define SMP_PRINTK(x) printk x -#else -#define SMP_PRINTK(x) -#endif - -/* - * IA s/w dev Vol 3, Section 7.4 - */ -#define APIC_DEFAULT_PHYS_BASE 0xfee00000 - -#define CLEAR_TSC wrmsr(0x10, 0x00001000, 0x00001000) - -/* - * Setup routine for controlling SMP activation - * - * Command-line option of "nosmp" or "maxcpus=0" will disable SMP - * activation entirely (the MPS table probe still happens, though). - * - * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer - * greater than 0, limits the maximum number of CPUs activated in - * SMP mode to <NUM>. - */ - -static int __init nosmp(char *str) -{ - max_cpus = 0; - return 1; -} - -__setup("nosmp", nosmp); - -static int __init maxcpus(char *str) -{ - get_option(&str, &max_cpus); - return 1; -} - -__setup("maxcpus=", maxcpus); - -void ack_APIC_irq(void) -{ - /* Clear the IPI */ - - /* Dummy read */ - apic_read(APIC_SPIV); - - /* Docs say use 0 for future compatibility */ - apic_write(APIC_EOI, 0); -} - -/* - * Intel MP BIOS table parsing routines: - */ - -#ifndef CONFIG_X86_VISWS_APIC -/* - * Checksum an MP configuration block. 
- */ - -static int mpf_checksum(unsigned char *mp, int len) -{ - int sum=0; - while(len--) - sum+=*mp++; - return sum&0xFF; -} - -/* - * Processor encoding in an MP configuration block - */ - -static char *mpc_family(int family,int model) -{ - static char n[32]; - static char *model_defs[]= - { - "80486DX","80486DX", - "80486SX","80486DX/2 or 80487", - "80486SL","Intel5X2(tm)", - "Unknown","Unknown", - "80486DX/4" - }; - if (family==0x6) - return("Pentium(tm) Pro"); - if (family==0x5) - return("Pentium(tm)"); - if (family==0x0F && model==0x0F) - return("Special controller"); - if (family==0x04 && model<9) - return model_defs[model]; - sprintf(n,"Unknown CPU [%d:%d]",family, model); - return n; -} - - -/* - * Read the MPC - */ - -static int __init smp_read_mpc(struct mp_config_table *mpc) -{ - char str[16]; - int count=sizeof(*mpc); - int ioapics = 0; - unsigned char *mpt=((unsigned char *)mpc)+count; - - if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) - { - panic("SMP mptable: bad signature [%c%c%c%c]!\n", - mpc->mpc_signature[0], - mpc->mpc_signature[1], - mpc->mpc_signature[2], - mpc->mpc_signature[3]); - return 1; - } - if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) - { - panic("SMP mptable: checksum error!\n"); - return 1; - } - if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) - { - printk("Bad Config Table version (%d)!!\n",mpc->mpc_spec); - return 1; - } - memcpy(str,mpc->mpc_oem,8); - str[8]=0; - printk("OEM ID: %s ",str); - - memcpy(str,mpc->mpc_productid,12); - str[12]=0; - printk("Product ID: %s ",str); - - printk("APIC at: 0x%lX\n",mpc->mpc_lapic); - - /* save the local APIC address, it might be non-default */ - mp_lapic_addr = mpc->mpc_lapic; - - /* - * Now process the configuration blocks. - */ - - while(count<mpc->mpc_length) - { - switch(*mpt) - { - case MP_PROCESSOR: - { - struct mpc_config_processor *m= - (struct mpc_config_processor *)mpt; - if (m->mpc_cpuflag&CPU_ENABLED) - { - printk("Processor #%d %s APIC version %d\n", - m->mpc_apicid, - mpc_family((m->mpc_cpufeature& - CPU_FAMILY_MASK)>>8, - (m->mpc_cpufeature& - CPU_MODEL_MASK)>>4), - m->mpc_apicver); -#ifdef SMP_DEBUG - if (m->mpc_featureflag&(1<<0)) - printk(" Floating point unit present.\n"); - if (m->mpc_featureflag&(1<<7)) - printk(" Machine Exception supported.\n"); - if (m->mpc_featureflag&(1<<8)) - printk(" 64 bit compare & exchange supported.\n"); - if (m->mpc_featureflag&(1<<9)) - printk(" Internal APIC present.\n"); -#endif - if (m->mpc_cpuflag&CPU_BOOTPROCESSOR) - { - SMP_PRINTK((" Bootup CPU\n")); - boot_cpu_id=m->mpc_apicid; - } - else /* Boot CPU already counted */ - num_processors++; - - if (m->mpc_apicid>NR_CPUS) - printk("Processor #%d unused. (Max %d processors).\n",m->mpc_apicid, NR_CPUS); - else - { - int ver = m->mpc_apicver; - - cpu_present_map|=(1<<m->mpc_apicid); - /* - * Validate version - */ - if (ver == 0x0) { - printk("BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. 
(tell your hw vendor)\n", m->mpc_apicid); - ver = 0x10; - } - apic_version[m->mpc_apicid] = ver; - } - } - mpt+=sizeof(*m); - count+=sizeof(*m); - break; - } - case MP_BUS: - { - struct mpc_config_bus *m= - (struct mpc_config_bus *)mpt; - memcpy(str,m->mpc_bustype,6); - str[6]=0; - SMP_PRINTK(("Bus #%d is %s\n", - m->mpc_busid, - str)); - if (strncmp(m->mpc_bustype,"ISA",3) == 0) - mp_bus_id_to_type[m->mpc_busid] = - MP_BUS_ISA; - else - if (strncmp(m->mpc_bustype,"EISA",4) == 0) - mp_bus_id_to_type[m->mpc_busid] = - MP_BUS_EISA; - if (strncmp(m->mpc_bustype,"PCI",3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = - MP_BUS_PCI; - mp_bus_id_to_pci_bus[m->mpc_busid] = - mp_current_pci_id; - mp_current_pci_id++; - } - mpt+=sizeof(*m); - count+=sizeof(*m); - break; - } - case MP_IOAPIC: - { - struct mpc_config_ioapic *m= - (struct mpc_config_ioapic *)mpt; - if (m->mpc_flags&MPC_APIC_USABLE) - { - ioapics++; - printk("I/O APIC #%d Version %d at 0x%lX.\n", - m->mpc_apicid,m->mpc_apicver, - m->mpc_apicaddr); - mp_apics [nr_ioapics] = *m; - if (++nr_ioapics > MAX_IO_APICS) - --nr_ioapics; - } - mpt+=sizeof(*m); - count+=sizeof(*m); - break; - } - case MP_INTSRC: - { - struct mpc_config_intsrc *m= - (struct mpc_config_intsrc *)mpt; - - mp_irqs [mp_irq_entries] = *m; - if (++mp_irq_entries == MAX_IRQ_SOURCES) { - printk("Max irq sources exceeded!!\n"); - printk("Skipping remaining sources.\n"); - --mp_irq_entries; - } - - mpt+=sizeof(*m); - count+=sizeof(*m); - break; - } - case MP_LINTSRC: - { - struct mpc_config_intlocal *m= - (struct mpc_config_intlocal *)mpt; - mpt+=sizeof(*m); - count+=sizeof(*m); - break; - } - } - } - if (ioapics > MAX_IO_APICS) - { - printk("Warning: Max I/O APICs exceeded (max %d, found %d).\n", MAX_IO_APICS, ioapics); - printk("Warning: switching to non APIC mode.\n"); - skip_ioapic_setup=1; - } - return num_processors; -} - -/* - * Scan the memory blocks for an SMP configuration block. - */ - -static int __init smp_scan_config(unsigned long base, unsigned long length) -{ - unsigned long *bp=phys_to_virt(base); - struct intel_mp_floating *mpf; - - SMP_PRINTK(("Scan SMP from %p for %ld bytes.\n", - bp,length)); - if (sizeof(*mpf)!=16) - printk("Error: MPF size\n"); - - while (length>0) - { - if (*bp==SMP_MAGIC_IDENT) - { - mpf=(struct intel_mp_floating *)bp; - if (mpf->mpf_length==1 && - !mpf_checksum((unsigned char *)bp,16) && - (mpf->mpf_specification == 1 - || mpf->mpf_specification == 4) ) - { - printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); - if (mpf->mpf_feature2&(1<<7)) - printk(" IMCR and PIC compatibility mode.\n"); - else - printk(" Virtual Wire compatibility mode.\n"); - smp_found_config=1; - /* - * Now see if we need to read further. - */ - if (mpf->mpf_feature1!=0) - { - unsigned long cfg; - - /* local APIC has default address */ - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - /* - * We need to know what the local - * APIC id of the boot CPU is! - */ - -/* - * - * HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK - * - * It's not just a crazy hack. ;-) - */ - /* - * Standard page mapping - * functions don't work yet. - * We know that page 0 is not - * used. Steal it for now! - */ - - cfg=pg0[0]; - pg0[0] = (mp_lapic_addr | _PAGE_RW | _PAGE_PRESENT); - local_flush_tlb(); - - boot_cpu_id = GET_APIC_ID(*((volatile unsigned long *) APIC_ID)); - - /* - * Give it back - */ - - pg0[0]= cfg; - local_flush_tlb(); - -/* - * - * END OF HACK END OF HACK END OF HACK END OF HACK END OF HACK - * - */ - /* - * 2 CPUs, numbered 0 & 1. 
- */ - cpu_present_map=3; - num_processors=2; - printk("I/O APIC at 0xFEC00000.\n"); - - /* - * Save the default type number, we - * need it later to set the IO-APIC - * up properly: - */ - mpc_default_type = mpf->mpf_feature1; - - printk("Bus #0 is "); - } - switch(mpf->mpf_feature1) - { - case 1: - case 5: - printk("ISA\n"); - break; - case 2: - printk("EISA with no IRQ8 chaining\n"); - break; - case 6: - case 3: - printk("EISA\n"); - break; - case 4: - case 7: - printk("MCA\n"); - break; - case 0: - break; - default: - printk("???\nUnknown standard configuration %d\n", - mpf->mpf_feature1); - return 1; - } - if (mpf->mpf_feature1>4) - { - printk("Bus #1 is PCI\n"); - - /* - * Set local APIC version to - * the integrated form. - * It's initialized to zero - * otherwise, representing - * a discrete 82489DX. - */ - apic_version[0] = 0x10; - apic_version[1] = 0x10; - } - /* - * Read the physical hardware table. - * Anything here will override the - * defaults. - */ - if (mpf->mpf_physptr) - smp_read_mpc((void *)mpf->mpf_physptr); - - __cpu_logical_map[0] = boot_cpu_id; - global_irq_holder = boot_cpu_id; - current->processor = boot_cpu_id; - - printk("Processors: %d\n", num_processors); - /* - * Only use the first configuration found. - */ - return 1; - } - } - bp+=4; - length-=16; - } - - return 0; -} - -void __init init_intel_smp (void) -{ - /* - * FIXME: Linux assumes you have 640K of base ram.. - * this continues the error... - * - * 1) Scan the bottom 1K for a signature - * 2) Scan the top 1K of base RAM - * 3) Scan the 64K of bios - */ - if (!smp_scan_config(0x0,0x400) && - !smp_scan_config(639*0x400,0x400) && - !smp_scan_config(0xF0000,0x10000)) { - /* - * If it is an SMP machine we should know now, unless the - * configuration is in an EISA/MCA bus machine with an - * extended bios data area. - * - * there is a real-mode segmented pointer pointing to the - * 4K EBDA area at 0x40E, calculate and scan it here. - * - * NOTE! There are Linux loaders that will corrupt the EBDA - * area, and as such this kind of SMP config may be less - * trustworthy, simply because the SMP table may have been - * stomped on during early boot. These loaders are buggy and - * should be fixed. - */ - unsigned int address; - - address = *(unsigned short *)phys_to_virt(0x40E); - address<<=4; - smp_scan_config(address, 0x1000); - if (smp_found_config) - printk(KERN_WARNING "WARNING: MP table in the EBDA can be UNSAFE, contact linux-smp@vger.rutgers.edu if you experience SMP problems!\n"); - } -} - -#else - -/* - * The Visual Workstation is Intel MP compliant in the hardware - * sense, but it doesnt have a BIOS(-configuration table). - * No problem for Linux. - */ -void __init init_visws_smp(void) -{ - smp_found_config = 1; - - cpu_present_map |= 2; /* or in id 1 */ - apic_version[1] |= 0x10; /* integrated APIC */ - apic_version[0] |= 0x10; - - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; -} - -#endif - -/* - * - Intel MP Configuration Table - * - or SGI Visual Workstation configuration - */ -void __init init_smp_config (void) -{ -#ifndef CONFIG_VISWS - init_intel_smp(); -#else - init_visws_smp(); -#endif -} - - - -/* - * Trampoline 80x86 program as an array. - */ - -extern unsigned char trampoline_data []; -extern unsigned char trampoline_end []; -static unsigned char *trampoline_base; - -/* - * Currently trivial. Write the real->protected mode - * bootstrap into the page concerned. The caller - * has made sure it's suitably aligned. 
- */ - -static unsigned long __init setup_trampoline(void) -{ - memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data); - return virt_to_phys(trampoline_base); -} - -/* - * We are called very early to get the low memory for the - * SMP bootup trampoline page. - */ -unsigned long __init smp_alloc_memory(unsigned long mem_base) -{ - if (virt_to_phys((void *)mem_base) >= 0x9F000) - panic("smp_alloc_memory: Insufficient low memory for kernel trampoline 0x%lx.", mem_base); - trampoline_base = (void *)mem_base; - return mem_base + PAGE_SIZE; -} - -/* - * The bootstrap kernel entry code has set these up. Save them for - * a given CPU - */ - -void __init smp_store_cpu_info(int id) -{ - struct cpuinfo_x86 *c=&cpu_data[id]; - - *c = boot_cpu_data; - c->pte_quick = 0; - c->pgd_quick = 0; - c->pgtable_cache_sz = 0; - identify_cpu(c); - /* - * Mask B, Pentium, but not Pentium MMX - */ - if (c->x86_vendor == X86_VENDOR_INTEL && - c->x86 == 5 && - c->x86_mask >= 1 && c->x86_mask <= 4 && - c->x86_model <= 3) - smp_b_stepping=1; /* Remember we have B step Pentia with bugs */ -} - -/* - * Architecture specific routine called by the kernel just before init is - * fired off. This allows the BP to have everything in order [we hope]. - * At the end of this all the APs will hit the system scheduling and off - * we go. Each AP will load the system gdt's and jump through the kernel - * init into idle(). At this point the scheduler will one day take over - * and give them jobs to do. smp_callin is a standard routine - * we use to track CPUs as they power up. - */ - -static atomic_t smp_commenced = ATOMIC_INIT(0); - -void __init smp_commence(void) -{ - /* - * Lets the callins below out of their loop. - */ - SMP_PRINTK(("Setting commenced=1, go go go\n")); - - wmb(); - atomic_set(&smp_commenced,1); -} - -void __init enable_local_APIC(void) -{ - unsigned long value; - - value = apic_read(APIC_SPIV); - value |= (1<<8); /* Enable APIC (bit==1) */ -#if 0 - value &= ~(1<<9); /* Enable focus processor (bit==0) */ -#else - value |= (1<<9); /* Disable focus processor (bit==1) */ -#endif - value |= 0xff; /* Set spurious IRQ vector to 0xff */ - apic_write(APIC_SPIV,value); - - /* - * Set Task Priority to 'accept all' - */ - value = apic_read(APIC_TASKPRI); - value &= ~APIC_TPRI_MASK; - apic_write(APIC_TASKPRI,value); - - /* - * Clear the logical destination ID, just to be safe. - * also, put the APIC into flat delivery mode. - */ - value = apic_read(APIC_LDR); - value &= ~APIC_LDR_MASK; - apic_write(APIC_LDR,value); - - value = apic_read(APIC_DFR); - value |= SET_APIC_DFR(0xf); - apic_write(APIC_DFR, value); - - udelay(100); /* B safe */ -} - -unsigned long __init init_smp_mappings(unsigned long memory_start) -{ - unsigned long apic_phys; - - memory_start = PAGE_ALIGN(memory_start); - if (smp_found_config) { - apic_phys = mp_lapic_addr; - } else { - /* - * set up a fake all zeroes page to simulate the - * local APIC and another one for the IO-APIC. We - * could use the real zero-page, but it's safer - * this way if some buggy code writes to this page ... 
- */ - apic_phys = __pa(memory_start); - memset((void *)memory_start, 0, PAGE_SIZE); - memory_start += PAGE_SIZE; - } - set_fixmap(FIX_APIC_BASE,apic_phys); - printk("mapped APIC to %08lx (%08lx)\n", APIC_BASE, apic_phys); - -#ifdef CONFIG_X86_IO_APIC - { - unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; - - for (i = 0; i < nr_ioapics; i++) { - if (smp_found_config) { - ioapic_phys = mp_apics[i].mpc_apicaddr; - } else { - ioapic_phys = __pa(memory_start); - memset((void *)memory_start, 0, PAGE_SIZE); - memory_start += PAGE_SIZE; - } - set_fixmap(idx,ioapic_phys); - printk("mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); - idx++; - } - } -#endif - - return memory_start; -} - -extern void calibrate_delay(void); - -void __init smp_callin(void) -{ - int cpuid; - unsigned long timeout; - - /* - * (This works even if the APIC is not enabled.) - */ - cpuid = GET_APIC_ID(apic_read(APIC_ID)); - - SMP_PRINTK(("CPU#%d waiting for CALLOUT\n", cpuid)); - - /* - * STARTUP IPIs are fragile beasts as they might sometimes - * trigger some glue motherboard logic. Complete APIC bus - * silence for 1 second, this overestimates the time the - * boot CPU is spending to send the up to 2 STARTUP IPIs - * by a factor of two. This should be enough. - */ - - /* - * Waiting 2s total for startup (udelay is not yet working) - */ - timeout = jiffies + 2*HZ; - while (time_before(jiffies,timeout)) - { - /* - * Has the boot CPU finished it's STARTUP sequence? - */ - if (test_bit(cpuid, (unsigned long *)&cpu_callout_map[0])) - break; - } - - while (!time_before(jiffies,timeout)) { - printk("BUG: CPU%d started up but did not get a callout!\n", - cpuid); - stop_this_cpu(); - } - - /* - * the boot CPU has finished the init stage and is spinning - * on callin_map until we finish. We are free to set up this - * CPU, first the APIC. (this is probably redundant on most - * boards) - */ - - SMP_PRINTK(("CALLIN, before enable_local_APIC().\n")); - enable_local_APIC(); - - /* - * Set up our APIC timer. - */ - setup_APIC_clock(); - - __sti(); - -#ifdef CONFIG_MTRR - /* Must be done before calibration delay is computed */ - mtrr_init_secondary_cpu (); -#endif - /* - * Get our bogomips. - */ - calibrate_delay(); - SMP_PRINTK(("Stack at about %p\n",&cpuid)); - - /* - * Save our processor parameters - */ - smp_store_cpu_info(cpuid); - - /* - * Allow the master to continue. - */ - set_bit(cpuid, (unsigned long *)&cpu_callin_map[0]); -} - -int cpucount = 0; - -extern int cpu_idle(void); - -/* - * Activate a secondary processor. - */ -int __init start_secondary(void *unused) -{ - /* - * Dont put anything before smp_callin(), SMP - * booting is too fragile that we want to limit the - * things done here to the most necessary things. - */ - cpu_init(); - smp_callin(); - while (!atomic_read(&smp_commenced)) - /* nothing */ ; - return cpu_idle(); -} - -/* - * Everything has been set up for the secondary - * CPUs - they just need to reload everything - * from the task structure - * This function must not return. - */ -void __init initialize_secondary(void) -{ - /* - * We don't actually need to load the full TSS, - * basically just the stack pointer and the eip. - */ - - asm volatile( - "movl %0,%%esp\n\t" - "jmp *%1" - : - :"r" (current->thread.esp),"r" (current->thread.eip)); -} - -extern struct { - void * esp; - unsigned short ss; -} stack_start; - -static int __init fork_by_hand(void) -{ - struct pt_regs regs; - /* don't care about the eip and regs settings since we'll never - reschedule the forked task. 
*/ - return do_fork(CLONE_VM|CLONE_PID, 0, ®s); -} - -static void __init do_boot_cpu(int i) -{ - unsigned long cfg; - pgd_t maincfg; - struct task_struct *idle; - unsigned long send_status, accept_status; - int timeout, num_starts, j; - unsigned long start_eip; - - cpucount++; - /* We can't use kernel_thread since we must _avoid_ to reschedule - the child. */ - if (fork_by_hand() < 0) - panic("failed fork for CPU %d", i); - - /* - * We remove it from the pidhash and the runqueue - * once we got the process: - */ - idle = init_task.prev_task; - if (!idle) - panic("No idle process for CPU %d", i); - - idle->processor = i; - __cpu_logical_map[cpucount] = i; - cpu_number_map[i] = cpucount; - idle->has_cpu = 1; /* we schedule the first task manually */ - idle->thread.eip = (unsigned long) start_secondary; - - del_from_runqueue(idle); - unhash_process(idle); - init_tasks[cpucount] = idle; - - /* start_eip had better be page-aligned! */ - start_eip = setup_trampoline(); - - printk("Booting processor %d eip %lx\n", i, start_eip); /* So we see what's up */ - stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle); - - /* - * This grunge runs the startup process for - * the targeted processor. - */ - - SMP_PRINTK(("Setting warm reset code and vector.\n")); - - CMOS_WRITE(0xa, 0xf); - local_flush_tlb(); - SMP_PRINTK(("1.\n")); - *((volatile unsigned short *) phys_to_virt(0x469)) = start_eip >> 4; - SMP_PRINTK(("2.\n")); - *((volatile unsigned short *) phys_to_virt(0x467)) = start_eip & 0xf; - SMP_PRINTK(("3.\n")); - - maincfg=swapper_pg_dir[0]; - ((unsigned long *)swapper_pg_dir)[0]=0x102007; - - /* - * Be paranoid about clearing APIC errors. - */ - - if ( apic_version[i] & 0xF0 ) - { - apic_write(APIC_ESR, 0); - accept_status = (apic_read(APIC_ESR) & 0xEF); - } - - /* - * Status is now clean - */ - - send_status = 0; - accept_status = 0; - - /* - * Starting actual IPI sequence... - */ - - SMP_PRINTK(("Asserting INIT.\n")); - - /* - * Turn INIT on - */ - - cfg=apic_read(APIC_ICR2); - cfg&=0x00FFFFFF; - apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(i)); /* Target chip */ - cfg=apic_read(APIC_ICR); - cfg&=~0xCDFFF; /* Clear bits */ - cfg |= (APIC_DEST_LEVELTRIG | APIC_DEST_ASSERT | APIC_DEST_DM_INIT); - apic_write(APIC_ICR, cfg); /* Send IPI */ - - udelay(200); - SMP_PRINTK(("Deasserting INIT.\n")); - - cfg=apic_read(APIC_ICR2); - cfg&=0x00FFFFFF; - apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(i)); /* Target chip */ - cfg=apic_read(APIC_ICR); - cfg&=~0xCDFFF; /* Clear bits */ - cfg |= (APIC_DEST_LEVELTRIG | APIC_DEST_DM_INIT); - apic_write(APIC_ICR, cfg); /* Send IPI */ - - /* - * Should we send STARTUP IPIs ? - * - * Determine this based on the APIC version. - * If we don't have an integrated APIC, don't - * send the STARTUP IPIs. - */ - - if ( apic_version[i] & 0xF0 ) - num_starts = 2; - else - num_starts = 0; - - /* - * Run STARTUP IPI loop. 
- */ - - for (j = 1; !(send_status || accept_status) - && (j <= num_starts) ; j++) - { - SMP_PRINTK(("Sending STARTUP #%d.\n",j)); - apic_write(APIC_ESR, 0); - SMP_PRINTK(("After apic_write.\n")); - - /* - * STARTUP IPI - */ - - cfg=apic_read(APIC_ICR2); - cfg&=0x00FFFFFF; - apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(i)); /* Target chip */ - cfg=apic_read(APIC_ICR); - cfg&=~0xCDFFF; /* Clear bits */ - cfg |= (APIC_DEST_DM_STARTUP | (start_eip >> 12)); /* Boot on the stack */ - SMP_PRINTK(("Before start apic_write.\n")); - apic_write(APIC_ICR, cfg); /* Kick the second */ - - SMP_PRINTK(("Startup point 1.\n")); - - timeout = 0; - SMP_PRINTK(("Waiting for send to finish...\n")); - do { - SMP_PRINTK(("+")); - udelay(100); - send_status = apic_read(APIC_ICR) & 0x1000; - } while (send_status && (timeout++ < 1000)); - - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(200); - accept_status = (apic_read(APIC_ESR) & 0xEF); - } - SMP_PRINTK(("After Startup.\n")); - - if (send_status) /* APIC never delivered?? */ - printk("APIC never delivered???\n"); - if (accept_status) /* Send accept error */ - printk("APIC delivery error (%lx).\n", accept_status); - - if ( !(send_status || accept_status) ) - { - /* - * allow APs to start initializing. - */ - SMP_PRINTK(("Before Callout %d.\n", i)); - set_bit(i, (unsigned long *)&cpu_callout_map[0]); - SMP_PRINTK(("After Callout %d.\n", i)); - - for(timeout=0;timeout<50000;timeout++) - { - if (cpu_callin_map[0]&(1<<i)) - break; /* It has booted */ - udelay(100); /* Wait 5s total for a response */ - } - if (cpu_callin_map[0]&(1<<i)) - { - /* number CPUs logically, starting from 1 (BSP is 0) */ -#if 0 - cpu_number_map[i] = cpucount; - __cpu_logical_map[cpucount] = i; -#endif - printk("OK.\n"); - printk("CPU%d: ", i); - print_cpu_info(&cpu_data[i]); - } - else - { - if (*((volatile unsigned char *)phys_to_virt(8192))==0xA5) - printk("Stuck ??\n"); - else - printk("Not responding.\n"); - } - SMP_PRINTK(("CPU has booted.\n")); - } - else - { - __cpu_logical_map[cpucount] = -1; - cpu_number_map[i] = -1; - cpucount--; - } - - swapper_pg_dir[0]=maincfg; - local_flush_tlb(); - - /* mark "stuck" area as not stuck */ - *((volatile unsigned long *)phys_to_virt(8192)) = 0; -} - -cycles_t cacheflush_time; -extern unsigned long cpu_hz; - -static void smp_tune_scheduling (void) -{ - unsigned long cachesize; - /* - * Rough estimation for SMP scheduling, this is the number of - * cycles it takes for a fully memory-limited process to flush - * the SMP-local cache. - * - * (For a P5 this pretty much means we will choose another idle - * CPU almost always at wakeup time (this is due to the small - * L1 cache), on PIIs it's around 50-100 usecs, depending on - * the cache size) - */ - - if (!cpu_hz) { - /* - * this basically disables processor-affinity - * scheduling on SMP without a TSC. - */ - cacheflush_time = 0; - return; - } else { - cachesize = boot_cpu_data.x86_cache_size; - if (cachesize == -1) - cachesize = 8; /* Pentiums */ - - cacheflush_time = cpu_hz/1024*cachesize/5000; - } - - printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n", - (long)cacheflush_time/(cpu_hz/1000000), - ((long)cacheflush_time*100/(cpu_hz/1000000)) % 100); -} - -unsigned int prof_multiplier[NR_CPUS]; -unsigned int prof_old_multiplier[NR_CPUS]; -unsigned int prof_counter[NR_CPUS]; - -/* - * Cycle through the processors sending APIC IPIs to boot each. 
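The cacheflush_time figure computed in smp_tune_scheduling() above is plain integer arithmetic: a rough count of CPU cycles needed to refill the per-CPU cache, later reported in microseconds. A stand-alone sketch with made-up numbers (a 400 MHz CPU with a 512 KB cache) reproduces the printout:

#include <stdio.h>

int main(void)
{
        long cpu_hz = 400000000L;       /* assumed 400 MHz CPU */
        long cachesize = 512;           /* assumed cache size in KB */
        long cacheflush_time = cpu_hz/1024*cachesize/5000;

        /* prints "per-CPU timeslice cutoff: 100.00 usecs." for these numbers */
        printf("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
               cacheflush_time/(cpu_hz/1000000),
               (cacheflush_time*100/(cpu_hz/1000000)) % 100);
        return 0;
}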
- */ - -void __init smp_boot_cpus(void) -{ - int i; - -#ifdef CONFIG_MTRR - /* Must be done before other processors booted */ - mtrr_init_boot_cpu (); -#endif - /* - * Initialize the logical to physical CPU number mapping - * and the per-CPU profiling counter/multiplier - */ - - for (i = 0; i < NR_CPUS; i++) { - cpu_number_map[i] = -1; - prof_counter[i] = 1; - prof_old_multiplier[i] = 1; - prof_multiplier[i] = 1; - } - - /* - * Setup boot CPU information - */ - - smp_store_cpu_info(boot_cpu_id); /* Final full version of the data */ - smp_tune_scheduling(); - printk("CPU%d: ", boot_cpu_id); - print_cpu_info(&cpu_data[boot_cpu_id]); - - /* - * not necessary because the MP table should list the boot - * CPU too, but we do it for the sake of robustness anyway. - * (and for the case when a non-SMP board boots an SMP kernel) - */ - cpu_present_map |= (1 << hard_smp_processor_id()); - - cpu_number_map[boot_cpu_id] = 0; - - init_idle(); - - /* - * If we couldnt find an SMP configuration at boot time, - * get out of here now! - */ - - if (!smp_found_config) - { - printk(KERN_NOTICE "SMP motherboard not detected. Using dummy APIC emulation.\n"); -#ifndef CONFIG_VISWS - io_apic_irqs = 0; -#endif - cpu_online_map = cpu_present_map; - smp_num_cpus = 1; - goto smp_done; - } - - /* - * If SMP should be disabled, then really disable it! - */ - - if (!max_cpus) - { - smp_found_config = 0; - printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); - } - -#ifdef SMP_DEBUG - { - int reg; - - /* - * This is to verify that we're looking at - * a real local APIC. Check these against - * your board if the CPUs aren't getting - * started for no apparent reason. - */ - - reg = apic_read(APIC_VERSION); - SMP_PRINTK(("Getting VERSION: %x\n", reg)); - - apic_write(APIC_VERSION, 0); - reg = apic_read(APIC_VERSION); - SMP_PRINTK(("Getting VERSION: %x\n", reg)); - - /* - * The two version reads above should print the same - * NON-ZERO!!! numbers. If the second one is zero, - * there is a problem with the APIC write/read - * definitions. - * - * The next two are just to see if we have sane values. - * They're only really relevant if we're in Virtual Wire - * compatibility mode, but most boxes are anymore. - */ - - - reg = apic_read(APIC_LVT0); - SMP_PRINTK(("Getting LVT0: %x\n", reg)); - - reg = apic_read(APIC_LVT1); - SMP_PRINTK(("Getting LVT1: %x\n", reg)); - } -#endif - - enable_local_APIC(); - - /* - * Set up our local APIC timer: - */ - setup_APIC_clock (); - - /* - * Now scan the CPU present map and fire up the other CPUs. - */ - - /* - * Add all detected CPUs. (later on we can down individual - * CPUs which will change cpu_online_map but not necessarily - * cpu_present_map. We are pretty much ready for hot-swap CPUs.) - */ - cpu_online_map = cpu_present_map; - mb(); - - SMP_PRINTK(("CPU map: %lx\n", cpu_present_map)); - - for(i=0;i<NR_CPUS;i++) - { - /* - * Don't even attempt to start the boot CPU! - */ - if (i == boot_cpu_id) - continue; - - if ((cpu_online_map & (1 << i)) - && (max_cpus < 0 || max_cpus > cpucount+1)) - { - do_boot_cpu(i); - } - - /* - * Make sure we unmap all failed CPUs - */ - - if (cpu_number_map[i] == -1 && (cpu_online_map & (1 << i))) { - printk("CPU #%d not responding. Removing from cpu_online_map.\n",i); - cpu_online_map &= ~(1 << i); - } - } - - /* - * Cleanup possible dangling ends... - */ - -#ifndef CONFIG_VISWS - { - unsigned long cfg; - - /* - * Install writable page 0 entry. 
- */ - cfg = pg0[0]; - pg0[0] = _PAGE_RW | _PAGE_PRESENT; /* writeable, present, addr 0 */ - local_flush_tlb(); - - /* - * Paranoid: Set warm reset code and vector here back - * to default values. - */ - - CMOS_WRITE(0, 0xf); - - *((volatile long *) phys_to_virt(0x467)) = 0; - - /* - * Restore old page 0 entry. - */ - - pg0[0] = cfg; - local_flush_tlb(); - } -#endif - - /* - * Allow the user to impress friends. - */ - - SMP_PRINTK(("Before bogomips.\n")); - if (!cpucount) { - printk(KERN_ERR "Error: only one processor found.\n"); - cpu_online_map = (1<<hard_smp_processor_id()); - } else { - unsigned long bogosum = 0; - for(i = 0; i < 32; i++) - if (cpu_online_map&(1<<i)) - bogosum+=cpu_data[i].loops_per_sec; - printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", - cpucount+1, - (bogosum+2500)/500000, - ((bogosum+2500)/5000)%100); - SMP_PRINTK(("Before bogocount - setting activated=1.\n")); - smp_activated = 1; - } - smp_num_cpus = cpucount + 1; - - if (smp_b_stepping) - printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n"); - SMP_PRINTK(("Boot done.\n")); - - cache_APIC_registers(); -#ifndef CONFIG_VISWS - /* - * Here we can be sure that there is an IO-APIC in the system. Let's - * go and set it up: - */ - if (!skip_ioapic_setup) - setup_IO_APIC(); -#endif - -smp_done: - /* - * now we know the other CPUs have fired off and we know our - * APIC ID, so we can go init the TSS and stuff: - */ - cpu_init(); -} - +volatile unsigned long smp_invalidate_needed; /* * the following functions deal with sending IPIs between CPUs. @@ -1429,17 +110,6 @@ smp_done: * We use 'broadcast', CPU->CPU IPIs and self-IPIs too. */ - -/* - * Silly serialization to work around CPU bug in P5s. - * We can safely turn it off on a 686. - */ -#ifdef CONFIG_X86_GOOD_APIC -# define FORCE_APIC_SERIALIZATION 0 -#else -# define FORCE_APIC_SERIALIZATION 1 -#endif - static unsigned int cached_APIC_ICR; static unsigned int cached_APIC_ICR2; @@ -1462,7 +132,7 @@ void cache_APIC_registers (void) static inline unsigned int __get_ICR (void) { -#if FORCE_APIC_SERIALIZATION +#if FORCE_READ_AROUND_WRITE /* * Wait for the APIC to become ready - this should never occur. It's * a debugging check really. @@ -1473,11 +143,11 @@ static inline unsigned int __get_ICR (void) while (count < 1000) { cfg = slow_ICR; - if (!(cfg&(1<<12))) { - if (count) - atomic_add(count, (atomic_t*)&ipi_count); + if (!(cfg&(1<<12))) return cfg; - } + printk("CPU #%d: ICR still busy [%08x]\n", + smp_processor_id(), cfg); + irq_err_count++; count++; udelay(10); } @@ -1491,19 +161,25 @@ static inline unsigned int __get_ICR (void) static inline unsigned int __get_ICR2 (void) { -#if FORCE_APIC_SERIALIZATION +#if FORCE_READ_AROUND_WRITE return slow_ICR2; #else return cached_APIC_ICR2; #endif } +#define LOGICAL_DELIVERY 1 + static inline int __prepare_ICR (unsigned int shortcut, int vector) { unsigned int cfg; cfg = __get_ICR(); - cfg |= APIC_DEST_DM_FIXED|shortcut|vector; + cfg |= APIC_DEST_DM_FIXED|shortcut|vector +#if LOGICAL_DELIVERY + |APIC_DEST_LOGICAL +#endif + ; return cfg; } @@ -1513,7 +189,11 @@ static inline int __prepare_ICR2 (unsigned int dest) unsigned int cfg; cfg = __get_ICR2(); +#if LOGICAL_DELIVERY + cfg |= SET_APIC_DEST_FIELD((1<<dest)); +#else cfg |= SET_APIC_DEST_FIELD(dest); +#endif return cfg; } @@ -1526,7 +206,7 @@ static inline void __send_IPI_shortcut(unsigned int shortcut, int vector) * have to lock out interrupts to be safe. 
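With LOGICAL_DELIVERY set above, __prepare_ICR2() no longer puts the raw APIC ID into the destination field but one bit per target, matching the one-bit-per-CPU logical IDs that setup_local_APIC() programs into APIC_LDR. A stand-alone sketch of the difference (the bit position of the destination field is assumed here for illustration):

#include <stdio.h>

/* assumed for this sketch: ICR2 bits 31:24 carry the destination */
#define SET_APIC_DEST_FIELD(x)  ((unsigned int)(x) << 24)

int main(void)
{
        int cpu = 3;

        /* physical mode: the field is the target's APIC ID */
        printf("physical dest %d   -> ICR2 %08x\n", cpu,
               SET_APIC_DEST_FIELD(cpu));

        /* logical/flat mode: the field is a mask of logical IDs,
         * so a single write can address several CPUs at once */
        printf("logical  dest %d   -> ICR2 %08x\n", cpu,
               SET_APIC_DEST_FIELD(1 << cpu));
        printf("logical  CPUs 1+3 -> ICR2 %08x\n",
               SET_APIC_DEST_FIELD((1 << 1) | (1 << cpu)));
        return 0;
}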
Otherwise it's just one * single atomic write to the APIC, no need for cli/sti. */ -#if FORCE_APIC_SERIALIZATION +#if FORCE_READ_AROUND_WRITE unsigned long flags; __save_flags(flags); @@ -1536,21 +216,26 @@ static inline void __send_IPI_shortcut(unsigned int shortcut, int vector) /* * No need to touch the target chip field */ - cfg = __prepare_ICR(shortcut, vector); /* * Send the IPI. The write to APIC_ICR fires this off. */ apic_write(APIC_ICR, cfg); -#if FORCE_APIC_SERIALIZATION +#if FORCE_READ_AROUND_WRITE __restore_flags(flags); #endif } static inline void send_IPI_allbutself(int vector) { - __send_IPI_shortcut(APIC_DEST_ALLBUT, vector); + /* + * if there are no other CPUs in the system then + * we get an APIC send error if we try to broadcast. + * thus we have to avoid sending IPIs in this case. + */ + if (smp_num_cpus > 1) + __send_IPI_shortcut(APIC_DEST_ALLBUT, vector); } static inline void send_IPI_all(int vector) @@ -1566,7 +251,7 @@ void send_IPI_self(int vector) static inline void send_IPI_single(int dest, int vector) { unsigned long cfg; -#if FORCE_APIC_SERIALIZATION +#if FORCE_READ_AROUND_WRITE unsigned long flags; __save_flags(flags); @@ -1589,7 +274,7 @@ static inline void send_IPI_single(int dest, int vector) * Send the IPI. The write to APIC_ICR fires this off. */ apic_write(APIC_ICR, cfg); -#if FORCE_APIC_SERIALIZATION +#if FORCE_READ_AROUND_WRITE __restore_flags(flags); #endif } @@ -1715,200 +400,97 @@ void smp_send_reschedule(int cpu) } /* - * this function sends a 'stop' IPI to all other CPUs in the system. - * it goes straight through. - */ - -void smp_send_stop(void) -{ - send_IPI_allbutself(STOP_CPU_VECTOR); -} - -/* Structure and data for smp_call_function(). This is designed to minimise + * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -struct smp_call_function_struct { +static volatile struct call_data_struct { void (*func) (void *info); void *info; - atomic_t unstarted_count; - atomic_t unfinished_count; + atomic_t started; + atomic_t finished; int wait; -}; -static volatile struct smp_call_function_struct *smp_call_function_data = NULL; +} *call_data = NULL; /* * this function sends a 'generic call function' IPI to all other CPUs * in the system. */ -int smp_call_function (void (*func) (void *info), void *info, int retry, - int wait) -/* [SUMMARY] Run a function on all other CPUs. - <func> The function to run. This must be fast and non-blocking. - <info> An arbitrary pointer to pass to the function. - <retry> If true, keep retrying until ready. - <wait> If true, wait until function has completed on other CPUs. - [RETURNS] 0 on success, else a negative status code. Does not return until - remote CPUs are nearly ready to execute <<func>> or are or have executed. -*/ +int smp_call_function (void (*func) (void *info), void *info, int nonatomic, + int wait) +/* + * [SUMMARY] Run a function on all other CPUs. + * <func> The function to run. This must be fast and non-blocking. + * <info> An arbitrary pointer to pass to the function. + * <nonatomic> If true, we might schedule away to lock the mutex + * <wait> If true, wait (atomically) until function has completed on other CPUs. + * [RETURNS] 0 on success, else a negative status code. Does not return until + * remote CPUs are nearly ready to execute <<func>> or are or have executed. 
+ */ { + struct call_data_struct data; + int ret, cpus = smp_num_cpus-1; + static DECLARE_MUTEX(lock); unsigned long timeout; - struct smp_call_function_struct data; - static spinlock_t lock = SPIN_LOCK_UNLOCKED; - - if (retry) { - while (1) { - if (smp_call_function_data) { - schedule (); /* Give a mate a go */ - continue; - } - spin_lock (&lock); - if (smp_call_function_data) { - spin_unlock (&lock); /* Bad luck */ - continue; - } - /* Mine, all mine! */ - break; - } - } - else { - if (smp_call_function_data) return -EBUSY; - spin_lock (&lock); - if (smp_call_function_data) { - spin_unlock (&lock); + + if (nonatomic) + down(&lock); + else + if (down_trylock(&lock)) return -EBUSY; - } - } - smp_call_function_data = &data; - spin_unlock (&lock); + + if (call_data) // temporary debugging check + BUG(); + + call_data = &data; data.func = func; data.info = info; - atomic_set (&data.unstarted_count, smp_num_cpus - 1); + atomic_set(&data.started, 0); data.wait = wait; - if (wait) atomic_set (&data.unfinished_count, smp_num_cpus - 1); - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_allbutself (CALL_FUNCTION_VECTOR); - /* Wait for response */ - timeout = jiffies + JIFFIE_TIMEOUT; - while ( (atomic_read (&data.unstarted_count) > 0) && - time_before (jiffies, timeout) ) - barrier (); - if (atomic_read (&data.unstarted_count) > 0) { - smp_call_function_data = NULL; - return -ETIMEDOUT; - } if (wait) - while (atomic_read (&data.unfinished_count) > 0) - barrier (); - smp_call_function_data = NULL; + atomic_set(&data.finished, 0); + mb(); + + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_allbutself(CALL_FUNCTION_VECTOR); + + /* Wait for response */ + timeout = jiffies + HZ; + while ((atomic_read(&data.started) != cpus) + && time_before(jiffies, timeout)) + barrier(); + ret = -ETIMEDOUT; + if (atomic_read(&data.started) != cpus) + goto out; + ret = 0; + if (wait) + while (atomic_read(&data.finished) != cpus) + barrier(); +out: + call_data = NULL; + up(&lock); return 0; } -static unsigned int calibration_result; - -void setup_APIC_timer(unsigned int clocks); - -/* - * Local timer interrupt handler. It does both profiling and - * process statistics/rescheduling. - * - * We do profiling in every local tick, statistics/rescheduling - * happen only every 'profiling multiplier' ticks. The default - * multiplier is 1 and it can be changed by writing the new multiplier - * value into /proc/profile. - */ - -void smp_local_timer_interrupt(struct pt_regs * regs) +static void stop_this_cpu (void * dummy) { - int user = (user_mode(regs) != 0); - int cpu = smp_processor_id(); - /* - * The profiling function is SMP safe. (nothing can mess - * around with "current", and the profiling counters are - * updated with atomic operations). This is especially - * useful with a profiling multiplier != 1 + * Remove this CPU: */ - if (!user) - x86_do_profile(regs->eip); - - if (!--prof_counter[cpu]) { - int system = 1 - user; - struct task_struct * p = current; - - /* - * The multiplier may have changed since the last time we got - * to this point as a result of the user writing to - * /proc/profile. In this case we need to adjust the APIC - * timer accordingly. - * - * Interrupts are already masked off at this point. 
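The new smp_call_function() above is a rendezvous: the caller publishes call_data, kicks the other CPUs, and spins until all of them have bumped 'started' (and, if wait is set, 'finished'); the handler side appears further down as smp_call_function_interrupt(). A user-space analogue of that handshake, with pthreads standing in for the other CPUs and an atomic flag standing in for the IPI (all names below are invented for the sketch):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define CPUS 3                          /* stand-ins for the other processors */

static void (*call_func)(void *);
static void *call_info;
static atomic_int call_posted = 0, started = 0, finished = 0;

static void say_hello(void *info)
{
        printf("running on behalf of %s\n", (const char *)info);
}

static void *other_cpu(void *arg)
{
        while (!atomic_load(&call_posted))      /* "wait for the IPI" */
                ;
        void (*func)(void *) = call_func;       /* grab the call data ... */
        void *info = call_info;
        atomic_fetch_add(&started, 1);          /* ... then let the caller go on */
        func(info);
        atomic_fetch_add(&finished, 1);         /* only matters when wait != 0 */
        return NULL;
}

int main(void)
{
        pthread_t tid[CPUS];
        int i;

        for (i = 0; i < CPUS; i++)
                pthread_create(&tid[i], NULL, other_cpu, NULL);

        call_func = say_hello;                  /* publish "call_data" ... */
        call_info = "the calling CPU";
        atomic_store(&call_posted, 1);          /* ... then "send the IPI" */

        while (atomic_load(&started) != CPUS)   /* same spin as the kernel side */
                ;
        while (atomic_load(&finished) != CPUS)  /* the wait==1 case */
                ;
        for (i = 0; i < CPUS; i++)
                pthread_join(tid[i], NULL);
        return 0;
}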
- */ - prof_counter[cpu] = prof_multiplier[cpu]; - if (prof_counter[cpu] != prof_old_multiplier[cpu]) { - setup_APIC_timer(calibration_result/prof_counter[cpu]); - prof_old_multiplier[cpu] = prof_counter[cpu]; - } - - /* - * After doing the above, we need to make like - * a normal interrupt - otherwise timer interrupts - * ignore the global interrupt lock, which is the - * WrongThing (tm) to do. - */ - - irq_enter(cpu, 0); - update_one_process(p, 1, user, system, cpu); - if (p->pid) { - p->counter -= 1; - if (p->counter <= 0) { - p->counter = 0; - p->need_resched = 1; - } - if (p->priority < DEF_PRIORITY) { - kstat.cpu_nice += user; - kstat.per_cpu_nice[cpu] += user; - } else { - kstat.cpu_user += user; - kstat.per_cpu_user[cpu] += user; - } - kstat.cpu_system += system; - kstat.per_cpu_system[cpu] += system; - - } - irq_exit(cpu, 0); - } + clear_bit(smp_processor_id(), &cpu_online_map); - /* - * We take the 'long' return path, and there every subsystem - * grabs the apropriate locks (kernel lock/ irq lock). - * - * we might want to decouple profiling from the 'long path', - * and do the profiling totally in assembly. - * - * Currently this isn't too much of an issue (performance wise), - * we can take more than 100K local irqs per second on a 100 MHz P5. - */ + if (cpu_data[smp_processor_id()].hlt_works_ok) + for(;;) __asm__("hlt"); + for (;;); } /* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. [in case the hw doesnt support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] + * this function calls the 'stop' function on all other CPUs in the system. */ -void smp_apic_timer_interrupt(struct pt_regs * regs) + +void smp_send_stop(void) { - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow, and we - * want to be able to accept NMI tlb invalidates - * during this time. 
- */ - ack_APIC_irq(); - smp_local_timer_interrupt(regs); + smp_call_function(stop_this_cpu, NULL, 1, 0); } /* @@ -1944,39 +526,24 @@ asmlinkage void smp_invalidate_interrupt(void) } -static void stop_this_cpu (void) +asmlinkage void smp_call_function_interrupt(void) { + void (*func) (void *info) = call_data->func; + void *info = call_data->info; + int wait = call_data->wait; + + ack_APIC_irq(); /* - * Remove this CPU: + * Notify initiating CPU that I've grabbed the data and am + * about to execute the function */ - clear_bit(smp_processor_id(), &cpu_online_map); - - if (cpu_data[smp_processor_id()].hlt_works_ok) - for(;;) __asm__("hlt"); - for (;;); -} - -/* - * CPU halt call-back - */ -asmlinkage void smp_stop_cpu_interrupt(void) -{ - stop_this_cpu(); -} - -asmlinkage void smp_call_function_interrupt(void) -{ - void (*func) (void *info) = smp_call_function_data->func; - void *info = smp_call_function_data->info; - int wait = smp_call_function_data->wait; - - ack_APIC_irq (); - /* Notify initiating CPU that I've grabbed the data and am about to - execute the function */ - atomic_dec (&smp_call_function_data->unstarted_count); - /* At this point the structure may be out of scope unless wait==1 */ - (*func) (info); - if (wait) atomic_dec (&smp_call_function_data->unfinished_count); + atomic_inc(&call_data->started); + /* + * At this point the structure may be out of scope unless wait==1 + */ + (*func)(info); + if (wait) + atomic_inc(&call_data->finished); } /* @@ -1991,6 +558,34 @@ asmlinkage void smp_spurious_interrupt(void) } /* + * This interrupt should never happen with our APIC/SMP architecture + */ + +static spinlock_t err_lock; + +asmlinkage void smp_error_interrupt(void) +{ + unsigned long v; + + spin_lock(&err_lock); + + v = apic_read(APIC_ESR); + printk("APIC error interrupt on CPU#%d, should never happen.\n", + smp_processor_id()); + printk("... APIC ESR0: %08lx\n", v); + + apic_write(APIC_ESR, 0); + v = apic_read(APIC_ESR); + printk("... APIC ESR1: %08lx\n", v); + + ack_APIC_irq(); + + irq_err_count++; + + spin_unlock(&err_lock); +} + +/* * This part sets up the APIC 32 bit clock in LVTT1, with HZ interrupts * per second. We assume that the caller has already set up the local * APIC. @@ -1999,6 +594,10 @@ asmlinkage void smp_spurious_interrupt(void) * closely follows bus clocks. */ +int prof_multiplier[NR_CPUS] = { 1, }; +int prof_old_multiplier[NR_CPUS] = { 1, }; +int prof_counter[NR_CPUS] = { 1, }; + /* * The timer chip is already set up at HZ interrupts per second here, * but we do not accept timer interrupts yet. We only allow the BP @@ -2015,66 +614,102 @@ static unsigned int __init get_8254_timer_count(void) return count; } +void __init wait_8254_wraparound(void) +{ + unsigned int curr_count, prev_count=~0; + int delta; + + curr_count = get_8254_timer_count(); + + do { + prev_count = curr_count; + curr_count = get_8254_timer_count(); + delta = curr_count-prev_count; + + /* + * This limit for delta seems arbitrary, but it isn't, it's + * slightly above the level of error a buggy Mercury/Neptune + * chipset timer can cause. + */ + + } while (delta<300); +} + /* * This function sets up the local APIC timer, with a timeout of * 'clocks' APIC bus clock. During calibration we actually call - * this function twice, once with a bogus timeout value, second - * time for real. The other (noncalibrating) CPUs call this - * function only once, with the real value. - * - * We are strictly in irqs off mode here, as we do not want to - * get an APIC interrupt go off accidentally. 
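wait_8254_wraparound() above detects a wrap of the PIT's down-counter by watching for a large positive jump between two reads; the 300-count margin absorbs the bogus readings a buggy Mercury/Neptune chipset can return. A stand-alone simulation of the same loop against a software down-counter (the LATCH value is assumed for HZ=100):

#include <stdio.h>

#define LATCH 11932                     /* assumed PIT reload value for HZ=100 */

static int counter = 5000;

static unsigned int get_8254_timer_count(void)
{
        counter -= 40;                  /* pretend some time passes per read */
        if (counter < 0)
                counter += LATCH;       /* the hardware down-counter wraps */
        return counter;
}

int main(void)
{
        unsigned int curr_count, prev_count;
        int delta, reads = 0;

        curr_count = get_8254_timer_count();
        do {
                prev_count = curr_count;
                curr_count = get_8254_timer_count();
                delta = (int)curr_count - (int)prev_count;
                reads++;
        } while (delta < 300);          /* a big positive delta means it wrapped */

        printf("wraparound seen after %d reads (delta=%d)\n", reads, delta);
        return 0;
}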
+ * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. * * We do reads before writes even if unnecessary, to get around the - * APIC double write bug. + * P5 APIC double write bug. */ #define APIC_DIVISOR 16 -void setup_APIC_timer(unsigned int clocks) +void __setup_APIC_LVTT(unsigned int clocks) { - unsigned long lvtt1_value; - unsigned int tmp_value; + unsigned int lvtt1_value, tmp_value; - /* - * Unfortunately the local APIC timer cannot be set up into NMI - * mode. With the IO APIC we can re-route the external timer - * interrupt and broadcast it as an NMI to all CPUs, so no pain. - */ tmp_value = apic_read(APIC_LVTT); - lvtt1_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; - apic_write(APIC_LVTT , lvtt1_value); + lvtt1_value = SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV) | + APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + apic_write(APIC_LVTT, lvtt1_value); /* * Divide PICLK by 16 */ tmp_value = apic_read(APIC_TDCR); - apic_write(APIC_TDCR , (tmp_value & ~APIC_TDR_DIV_1 ) - | APIC_TDR_DIV_16); + apic_write(APIC_TDCR, (tmp_value + & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) + | APIC_TDR_DIV_16); tmp_value = apic_read(APIC_TMICT); apic_write(APIC_TMICT, clocks/APIC_DIVISOR); } -void __init wait_8254_wraparound(void) +void setup_APIC_timer(void * data) { - unsigned int curr_count, prev_count=~0; + unsigned int clocks = (unsigned int) data, slice, t0, t1, nr; + unsigned long flags; int delta; - curr_count = get_8254_timer_count(); - - do { - prev_count = curr_count; - curr_count = get_8254_timer_count(); - delta = curr_count-prev_count; + __save_flags(flags); + __sti(); + /* + * ok, Intel has some smart code in their APIC that knows + * if a CPU was in 'hlt' lowpower mode, and this increases + * its APIC arbitration priority. To avoid the external timer + * IRQ APIC event being in synchron with the APIC clock we + * introduce an interrupt skew to spread out timer events. + * + * The number of slices within a 'big' timeslice is smp_num_cpus+1 + */ + slice = clocks / (smp_num_cpus+1); + nr = cpu_number_map[smp_processor_id()] + 1; + printk("cpu: %d, clocks: %d, slice: %d, nr: %d.\n", + smp_processor_id(), clocks, slice, nr); /* - * This limit for delta seems arbitrary, but it isn't, it's - * slightly above the level of error a buggy Mercury/Neptune - * chipset timer can cause. + * Wait for IRQ0's slice: */ + wait_8254_wraparound(); - } while (delta<300); + __setup_APIC_LVTT(clocks); + + t0 = apic_read(APIC_TMCCT)*APIC_DIVISOR; + do { + t1 = apic_read(APIC_TMCCT)*APIC_DIVISOR; + delta = (int)(t0 - t1 - slice*nr); + } while (delta < 0); + + __setup_APIC_LVTT(clocks); + + printk("CPU%d<C0:%d,C:%d,D:%d,S:%d,C:%d>\n", + smp_processor_id(), t0, t1, delta, slice, clocks); + + __restore_flags(flags); } /* @@ -2092,10 +727,11 @@ void __init wait_8254_wraparound(void) int __init calibrate_APIC_clock(void) { - unsigned long long t1,t2; - long tt1,tt2; - long calibration_result; + unsigned long long t1 = 0, t2 = 0; + long tt1, tt2; + long result; int i; + const int LOOPS = HZ/10; printk("calibrating APIC timer ... "); @@ -2104,7 +740,7 @@ int __init calibrate_APIC_clock(void) * value into the APIC clock, we just want to get the * counter running for calibration. */ - setup_APIC_timer(1000000000); + __setup_APIC_LVTT(1000000000); /* * The timer chip counts down to zero. 
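setup_APIC_timer() above staggers the per-CPU timers: one HZ period is cut into smp_num_cpus+1 slices, and CPU slot nr waits nr slices past the 8254 wraparound before arming its LVTT. A stand-alone sketch of the offsets this produces, with made-up numbers:

#include <stdio.h>

int main(void)
{
        unsigned int clocks = 1000000;  /* assumed APIC bus clocks per HZ period */
        int num_cpus = 4, nr;
        unsigned int slice = clocks / (num_cpus + 1);

        for (nr = 1; nr <= num_cpus; nr++)
                printf("CPU slot %d arms its timer %u bus clocks after IRQ0\n",
                       nr, slice * nr);
        return 0;
}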
Let's wait @@ -2112,23 +748,24 @@ int __init calibrate_APIC_clock(void) * (the current tick might have been already half done) */ - wait_8254_wraparound (); + wait_8254_wraparound(); /* * We wrapped around just now. Let's start: */ - rdtscll(t1); - tt1=apic_read(APIC_TMCCT); + if (cpu_has_tsc) + rdtscll(t1); + tt1 = apic_read(APIC_TMCCT); -#define LOOPS (HZ/10) /* * Let's wait LOOPS wraprounds: */ - for (i=0; i<LOOPS; i++) - wait_8254_wraparound (); + for (i = 0; i < LOOPS; i++) + wait_8254_wraparound(); - tt2=apic_read(APIC_TMCCT); - rdtscll(t2); + tt2 = apic_read(APIC_TMCCT); + if (cpu_has_tsc) + rdtscll(t2); /* * The APIC bus clock counter is 32 bits only, it @@ -2138,71 +775,37 @@ int __init calibrate_APIC_clock(void) * underflown to be exact, as the timer counts down ;) */ - calibration_result = (tt1-tt2)*APIC_DIVISOR/LOOPS; - - SMP_PRINTK(("\n..... %ld CPU clocks in 1 timer chip tick.", - (unsigned long)(t2-t1)/LOOPS)); - - SMP_PRINTK(("\n..... %ld APIC bus clocks in 1 timer chip tick.", - calibration_result)); + result = (tt1-tt2)*APIC_DIVISOR/LOOPS; + if (cpu_has_tsc) + printk("\n..... CPU clock speed is %ld.%04ld MHz.\n", + ((long)(t2-t1)/LOOPS)/(1000000/HZ), + ((long)(t2-t1)/LOOPS)%(1000000/HZ)); - printk("\n..... CPU clock speed is %ld.%04ld MHz.\n", - ((long)(t2-t1)/LOOPS)/(1000000/HZ), - ((long)(t2-t1)/LOOPS)%(1000000/HZ) ); + printk("..... host bus clock speed is %ld.%04ld MHz.\n", + result/(1000000/HZ), + result%(1000000/HZ)); - printk("..... system bus clock speed is %ld.%04ld MHz.\n", - calibration_result/(1000000/HZ), - calibration_result%(1000000/HZ) ); -#undef LOOPS - - return calibration_result; + return result; } -void __init setup_APIC_clock(void) +static unsigned int calibration_result; + +void __init setup_APIC_clocks(void) { unsigned long flags; - static volatile int calibration_lock; - __save_flags(flags); __cli(); - SMP_PRINTK(("setup_APIC_clock() called.\n")); - - /* - * [ setup_APIC_clock() is called from all CPUs, but we want - * to do this part of the setup only once ... and it fits - * here best ] - */ - if (!test_and_set_bit(0,&calibration_lock)) { - - calibration_result=calibrate_APIC_clock(); - /* - * Signal completion to the other CPU[s]: - */ - calibration_lock = 3; - - } else { - /* - * Other CPU is calibrating, wait for finish: - */ - SMP_PRINTK(("waiting for other CPU calibrating APIC ... ")); - while (calibration_lock == 1); - SMP_PRINTK(("done, continuing.\n")); - } - -/* - * Now set up the timer for real. - */ + calibration_result = calibrate_APIC_clock(); - setup_APIC_timer (calibration_result); + smp_call_function(setup_APIC_timer, (void *)calibration_result, 1, 1); /* - * We ACK the APIC, just in case there is something pending. + * Now set up the timer for real. */ - - ack_APIC_irq (); + setup_APIC_timer((void *)calibration_result); __restore_flags(flags); } @@ -2224,9 +827,9 @@ int setup_profiling_timer(unsigned int multiplier) return -EINVAL; /* - * Set the new multiplier for each CPU. CPUs don't start using the + * Set the new multiplier for each CPU. CPUs don't start using the * new values until the next timer interrupt in which they do process - * accounting. At that time they also adjust their APIC timers + * accounting. At that time they also adjust their APIC timers * accordingly. */ for (i = 0; i < NR_CPUS; ++i) @@ -2237,3 +840,111 @@ int setup_profiling_timer(unsigned int multiplier) #undef APIC_DIVISOR +/* + * Local timer interrupt handler. It does both profiling and + * process statistics/rescheduling. 
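calibrate_APIC_clock() above turns the drop in the APIC current-count register over HZ/10 timer ticks back into bus clocks per tick, which is the value __setup_APIC_LVTT() later loads. A stand-alone rerun of the arithmetic with invented counter readings (roughly a 66 MHz bus at HZ=100):

#include <stdio.h>

#define HZ              100
#define APIC_DIVISOR    16
#define LOOPS           (HZ/10)

int main(void)
{
        /* hypothetical APIC_TMCCT readings taken LOOPS timer ticks apart */
        long tt1 = 900000;
        long tt2 = 900000 - 416666;
        long result = (tt1 - tt2) * APIC_DIVISOR / LOOPS;

        /* result is bus clocks per 1/HZ second, hence the MHz conversion */
        printf("..... host bus clock speed is %ld.%04ld MHz.\n",
               result / (1000000 / HZ), result % (1000000 / HZ));
        return 0;
}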
+ * + * We do profiling in every local tick, statistics/rescheduling + * happen only every 'profiling multiplier' ticks. The default + * multiplier is 1 and it can be changed by writing the new multiplier + * value into /proc/profile. + */ + +inline void smp_local_timer_interrupt(struct pt_regs * regs) +{ + int user = (user_mode(regs) != 0); + int cpu = smp_processor_id(); + + /* + * The profiling function is SMP safe. (nothing can mess + * around with "current", and the profiling counters are + * updated with atomic operations). This is especially + * useful with a profiling multiplier != 1 + */ + if (!user) + x86_do_profile(regs->eip); + + if (--prof_counter[cpu] <= 0) { + int system = 1 - user; + struct task_struct * p = current; + + /* + * The multiplier may have changed since the last time we got + * to this point as a result of the user writing to + * /proc/profile. In this case we need to adjust the APIC + * timer accordingly. + * + * Interrupts are already masked off at this point. + */ + prof_counter[cpu] = prof_multiplier[cpu]; + if (prof_counter[cpu] != prof_old_multiplier[cpu]) { + __setup_APIC_LVTT(calibration_result/prof_counter[cpu]); + prof_old_multiplier[cpu] = prof_counter[cpu]; + } + + /* + * After doing the above, we need to make like + * a normal interrupt - otherwise timer interrupts + * ignore the global interrupt lock, which is the + * WrongThing (tm) to do. + */ + + irq_enter(cpu, 0); + update_one_process(p, 1, user, system, cpu); + if (p->pid) { + p->counter -= 1; + if (p->counter <= 0) { + p->counter = 0; + p->need_resched = 1; + } + if (p->priority < DEF_PRIORITY) { + kstat.cpu_nice += user; + kstat.per_cpu_nice[cpu] += user; + } else { + kstat.cpu_user += user; + kstat.per_cpu_user[cpu] += user; + } + kstat.cpu_system += system; + kstat.per_cpu_system[cpu] += system; + + } + irq_exit(cpu, 0); + } + + /* + * We take the 'long' return path, and there every subsystem + * grabs the apropriate locks (kernel lock/ irq lock). + * + * we might want to decouple profiling from the 'long path', + * and do the profiling totally in assembly. + * + * Currently this isn't too much of an issue (performance wise), + * we can take more than 100K local irqs per second on a 100 MHz P5. + */ +} + +/* + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesnt support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] + */ +unsigned int apic_timer_irqs [NR_CPUS] = { 0, }; + +void smp_apic_timer_interrupt(struct pt_regs * regs) +{ + /* + * the NMI deadlock-detector uses this. + */ + apic_timer_irqs[smp_processor_id()]++; + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + */ + ack_APIC_irq(); + smp_local_timer_interrupt(regs); +} + diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c new file mode 100644 index 000000000..46335ee8f --- /dev/null +++ b/arch/i386/kernel/smpboot.c @@ -0,0 +1,1650 @@ +/* + * Intel MP v1.1/v1.4 specification compliant parsing routines. + * + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> + * (c) 1998, 1999 Ingo Molnar <mingo@redhat.com> + * + * Much of the core SMP work is based on previous work by Thomas Radke, to + * whom a great many thanks are extended. 
+ * + * Thanks to Intel for making available several different Pentium, + * Pentium Pro and Pentium-II/Xeon MP machines. + * Original development of Linux SMP code supported by Caldera. + * + * This code is released under the GNU public license version 2 or + * later. + * + * Fixes + * Felix Koop : NR_CPUS used properly + * Jose Renau : Handle single CPU case. + * Alan Cox : By repeated request 8) - Total BogoMIP report. + * Greg Wright : Fix for kernel stacks panic. + * Erich Boleyn : MP v1.4 and additional changes. + * Matthias Sattler : Changes for 2.1 kernel map. + * Michel Lespinasse : Changes for 2.1 kernel map. + * Michael Chastain : Change trampoline.S to gnu as. + * Alan Cox : Dumb bug: 'B' step PPro's are fine + * Ingo Molnar : Added APIC timers, based on code + * from Jose Renau + * Alan Cox : Added EBDA scanning + * Ingo Molnar : various cleanups and rewrites + * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. + * Maciej W. Rozycki : Bits for genuine 82489DX timers + */ + +#include <linux/config.h> +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/kernel_stat.h> +#include <linux/smp_lock.h> +#include <linux/irq.h> + +#include <linux/delay.h> +#include <linux/mc146818rtc.h> +#include <asm/mtrr.h> + +/* Set if we find a B stepping CPU */ +static int smp_b_stepping = 0; + +/* Setup configured maximum number of CPUs to activate */ +static int max_cpus = -1; +/* 1 if "noapic" boot option passed */ +int skip_ioapic_setup = 0; + +/* Total count of live CPUs */ +int smp_num_cpus = 0; +/* Internal processor count */ +static unsigned int num_processors = 1; + +/* Have we found an SMP box */ +int smp_found_config = 0; + +/* Bitmask of physically existing CPUs */ +unsigned long cpu_present_map = 0; +/* Bitmask of currently online CPUs */ +unsigned long cpu_online_map = 0; + +/* which CPU maps to which logical number */ +volatile int cpu_number_map[NR_CPUS]; +/* which logical number maps to which CPU */ +volatile int __cpu_logical_map[NR_CPUS]; + +static volatile unsigned long cpu_callin_map = 0; +static volatile unsigned long cpu_callout_map = 0; + +/* Per CPU bogomips and other parameters */ +struct cpuinfo_x86 cpu_data[NR_CPUS]; +/* Processor that is doing the boot up */ +static unsigned int boot_cpu_id = 0; + +/* Tripped once we need to start cross invalidating */ +static int smp_activated = 0; +/* Set when the idlers are all forked */ +int smp_threads_ready = 0; + +/* + * Various Linux-internal data structures created from the + * MP-table. + */ +int apic_version [NR_CPUS]; +int mp_bus_id_to_type [MAX_MP_BUSSES] = { -1, }; +extern int nr_ioapics; +extern struct mpc_config_ioapic mp_ioapics [MAX_IO_APICS]; +extern int mp_irq_entries; +extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES]; +extern int mpc_default_type; +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { -1, }; +int mp_current_pci_id = 0; +unsigned long mp_lapic_addr = 0; +int pic_mode; + +extern void cache_APIC_registers (void); + +#define SMP_DEBUG 1 + +#if SMP_DEBUG +#define dprintk(x...) printk(##x) +#else +#define dprintk(x...) +#endif + +/* + * IA s/w dev Vol 3, Section 7.4 + */ +#define APIC_DEFAULT_PHYS_BASE 0xfee00000 + +/* + * Setup routine for controlling SMP activation + * + * Command-line option of "nosmp" or "maxcpus=0" will disable SMP + * activation entirely (the MPS table probe still happens, though). + * + * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer + * greater than 0, limits the maximum number of CPUs activated in + * SMP mode to <NUM>. 
+ */ + +static int __init nosmp(char *str) +{ + max_cpus = 0; + return 1; +} + +__setup("nosmp", nosmp); + +static int __init maxcpus(char *str) +{ + get_option(&str, &max_cpus); + return 1; +} + +__setup("maxcpus=", maxcpus); + +/* + * Intel MP BIOS table parsing routines: + */ + +#ifndef CONFIG_X86_VISWS_APIC +/* + * Checksum an MP configuration block. + */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum=0; + while(len--) + sum+=*mp++; + return sum&0xFF; +} + +/* + * Processor encoding in an MP configuration block + */ + +static char __init *mpc_family(int family,int model) +{ + static char n[32]; + static char *model_defs[]= + { + "80486DX","80486DX", + "80486SX","80486DX/2 or 80487", + "80486SL","80486SX/2", + "Unknown","80486DX/2-WB", + "80486DX/4","80486DX/4-WB" + }; + + switch (family) { + case 0x04: + if (model < 10) + return model_defs[model]; + break; + + case 0x05: + return("Pentium(tm)"); + + case 0x06: + return("Pentium(tm) Pro"); + + case 0x0F: + if (model == 0x0F) + return("Special controller"); + } + sprintf(n,"Unknown CPU [%d:%d]",family, model); + return n; +} + +static void __init MP_processor_info (struct mpc_config_processor *m) +{ + int ver; + + if (!(m->mpc_cpuflag & CPU_ENABLED)) + return; + + printk("Processor #%d %s APIC version %d\n", + m->mpc_apicid, + mpc_family( (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8 , + (m->mpc_cpufeature & CPU_MODEL_MASK)>>4), + m->mpc_apicver); + +#ifdef SMP_DEBUG + if (m->mpc_featureflag&(1<<0)) + printk(" Floating point unit present.\n"); + if (m->mpc_featureflag&(1<<7)) + printk(" Machine Exception supported.\n"); + if (m->mpc_featureflag&(1<<8)) + printk(" 64 bit compare & exchange supported.\n"); + if (m->mpc_featureflag&(1<<9)) + printk(" Internal APIC present.\n"); +#endif + + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + dprintk(" Bootup CPU\n"); + boot_cpu_id = m->mpc_apicid; + } else + /* Boot CPU already counted */ + num_processors++; + + if (m->mpc_apicid > NR_CPUS) { + printk("Processor #%d unused. (Max %d processors).\n", + m->mpc_apicid, NR_CPUS); + return; + } + ver = m->mpc_apicver; + + cpu_present_map |= (1<<m->mpc_apicid); + /* + * Validate version + */ + if (ver == 0x0) { + printk("BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. 
(tell your hw vendor)\n", m->mpc_apicid); + ver = 0x10; + } + apic_version[m->mpc_apicid] = ver; +} + +static void __init MP_bus_info (struct mpc_config_bus *m) +{ + char str[7]; + + memcpy(str, m->mpc_bustype, 6); + str[6] = 0; + dprintk("Bus #%d is %s\n", m->mpc_busid, str); + + if (strncmp(str, "ISA", 3) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; + } else { + if (strncmp(str, "EISA", 4) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; + } else { + if (strncmp(str, "PCI", 3) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; + mp_current_pci_id++; + } else { + printk("Unknown bustype %s\n", str); + panic("cannot handle bus - mail to linux-smp@vger.rutgers.edu"); + } } } +} + +static void __init MP_ioapic_info (struct mpc_config_ioapic *m) +{ + if (!(m->mpc_flags & MPC_APIC_USABLE)) + return; + + printk("I/O APIC #%d Version %d at 0x%lX.\n", + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); + if (nr_ioapics >= MAX_IO_APICS) { + printk("Max # of I/O APICs (%d) exceeded (found %d).\n", + MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); + } + mp_ioapics[nr_ioapics] = *m; + nr_ioapics++; +} + +static void __init MP_intsrc_info (struct mpc_config_intsrc *m) +{ + mp_irqs [mp_irq_entries] = *m; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) +{ + /* + * Well it seems all SMP boards in existence + * use ExtINT/LVT1 == LINT0 and + * NMI/LVT2 == LINT1 - the following check + * will show us if this assumptions is false. + * Until then we do not have to add baggage. + */ + if ((m->mpc_irqtype == mp_ExtINT) && + (m->mpc_destapiclint != 0)) + BUG(); + if ((m->mpc_irqtype == mp_NMI) && + (m->mpc_destapiclint != 1)) + BUG(); +} + +/* + * Read/parse the MPC + */ + +static int __init smp_read_mpc(struct mp_config_table *mpc) +{ + char str[16]; + int count=sizeof(*mpc); + unsigned char *mpt=((unsigned char *)mpc)+count; + + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) + { + panic("SMP mptable: bad signature [%c%c%c%c]!\n", + mpc->mpc_signature[0], + mpc->mpc_signature[1], + mpc->mpc_signature[2], + mpc->mpc_signature[3]); + return 1; + } + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) + { + panic("SMP mptable: checksum error!\n"); + return 1; + } + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) + { + printk("Bad Config Table version (%d)!!\n",mpc->mpc_spec); + return 1; + } + memcpy(str,mpc->mpc_oem,8); + str[8]=0; + printk("OEM ID: %s ",str); + + memcpy(str,mpc->mpc_productid,12); + str[12]=0; + printk("Product ID: %s ",str); + + printk("APIC at: 0x%lX\n",mpc->mpc_lapic); + + /* save the local APIC address, it might be non-default */ + mp_lapic_addr = mpc->mpc_lapic; + + /* + * Now process the configuration blocks. 
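The checksum rule applied by mpf_checksum() above is the usual BIOS convention: all bytes of the structure, including the checksum byte itself, must sum to zero modulo 256. A stand-alone sketch that builds a fake 16-byte block and verifies it the same way:

#include <stdio.h>
#include <string.h>

/* same arithmetic as mpf_checksum() above */
static int checksum(unsigned char *mp, int len)
{
        int sum = 0;
        while (len--)
                sum += *mp++;
        return sum & 0xFF;
}

int main(void)
{
        unsigned char block[16];
        int i, sum = 0;

        memcpy(block, "_MP_", 4);               /* floating pointer signature */
        for (i = 4; i < 15; i++)
                block[i] = i;                   /* arbitrary body bytes */
        for (i = 0; i < 15; i++)
                sum += block[i];
        block[15] = (unsigned char)-sum;        /* make the total 0 mod 256 */

        printf("checksum over the block: %d (0 means valid)\n",
               checksum(block, 16));
        return 0;
}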
+ */ + while (count < mpc->mpc_length) { + switch(*mpt) { + case MP_PROCESSOR: + { + struct mpc_config_processor *m= + (struct mpc_config_processor *)mpt; + MP_processor_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_BUS: + { + struct mpc_config_bus *m= + (struct mpc_config_bus *)mpt; + MP_bus_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_IOAPIC: + { + struct mpc_config_ioapic *m= + (struct mpc_config_ioapic *)mpt; + MP_ioapic_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_INTSRC: + { + struct mpc_config_intsrc *m= + (struct mpc_config_intsrc *)mpt; + + MP_intsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_LINTSRC: + { + struct mpc_config_lintsrc *m= + (struct mpc_config_lintsrc *)mpt; + MP_lintsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + } + } + return num_processors; +} + +/* + * Scan the memory blocks for an SMP configuration block. + */ +static int __init smp_get_mpf(struct intel_mp_floating *mpf) +{ + printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); + if (mpf->mpf_feature2 & (1<<7)) { + printk(" IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } else { + printk(" Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } + smp_found_config = 1; + /* + * default CPU id - if it's different in the mptable + * then we change it before first using it. + */ + boot_cpu_id = 0; + /* + * Now see if we need to read further. + */ + if (mpf->mpf_feature1 != 0) { + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* + * 2 CPUs, numbered 0 & 1. + */ + cpu_present_map = 3; + num_processors = 2; + + nr_ioapics = 1; + mp_ioapics[0].mpc_apicaddr = 0xFEC00000; + /* + * Save the default type number, we + * need it later to set the IO-APIC + * up properly: + */ + mpc_default_type = mpf->mpf_feature1; + + printk("Bus #0 is "); + } + + switch (mpf->mpf_feature1) { + case 1: + case 5: + printk("ISA\n"); + break; + case 2: + printk("EISA with no IRQ0 and no IRQ13 DMA chaining\n"); + break; + case 6: + case 3: + printk("EISA\n"); + break; + case 4: + case 7: + printk("MCA\n"); + break; + case 0: + if (!mpf->mpf_physptr) + BUG(); + break; + default: + printk("???\nUnknown standard configuration %d\n", + mpf->mpf_feature1); + return 1; + } + if (mpf->mpf_feature1 > 4) { + printk("Bus #1 is PCI\n"); + + /* + * Set local APIC version to the integrated form. + * It's initialized to zero otherwise, representing + * a discrete 82489DX. + */ + apic_version[0] = 0x10; + apic_version[1] = 0x10; + } + /* + * Read the physical hardware table. Anything here will override the + * defaults. + */ + if (mpf->mpf_physptr) + smp_read_mpc((void *)mpf->mpf_physptr); + + __cpu_logical_map[0] = boot_cpu_id; + global_irq_holder = boot_cpu_id; + current->processor = boot_cpu_id; + + printk("Processors: %d\n", num_processors); + /* + * Only use the first configuration found. 
+ */ + return 1; +} + +static int __init smp_scan_config(unsigned long base, unsigned long length) +{ + unsigned long *bp = phys_to_virt(base); + struct intel_mp_floating *mpf; + + dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); + if (sizeof(*mpf) != 16) + printk("Error: MPF size\n"); + + while (length > 0) { + mpf = (struct intel_mp_floating *)bp; + if ((*bp == SMP_MAGIC_IDENT) && + (mpf->mpf_length == 1) && + !mpf_checksum((unsigned char *)bp, 16) && + ((mpf->mpf_specification == 1) + || (mpf->mpf_specification == 4)) ) { + + printk("found SMP MP-table at %08ld\n", + virt_to_phys(mpf)); + smp_get_mpf(mpf); + return 1; + } + bp += 4; + length -= 16; + } + return 0; +} + +void __init init_intel_smp (void) +{ + unsigned int address; + + /* + * FIXME: Linux assumes you have 640K of base ram.. + * this continues the error... + * + * 1) Scan the bottom 1K for a signature + * 2) Scan the top 1K of base RAM + * 3) Scan the 64K of bios + */ + if (smp_scan_config(0x0,0x400) || + smp_scan_config(639*0x400,0x400) || + smp_scan_config(0xF0000,0x10000)) + return; + /* + * If it is an SMP machine we should know now, unless the + * configuration is in an EISA/MCA bus machine with an + * extended bios data area. + * + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E, calculate and scan it here. + * + * NOTE! There are Linux loaders that will corrupt the EBDA + * area, and as such this kind of SMP config may be less + * trustworthy, simply because the SMP table may have been + * stomped on during early boot. These loaders are buggy and + * should be fixed. + */ + + address = *(unsigned short *)phys_to_virt(0x40E); + address <<= 4; + smp_scan_config(address, 0x1000); + if (smp_found_config) + printk(KERN_WARNING "WARNING: MP table in the EBDA can be UNSAFE, contact linux-smp@vger.rutgers.edu if you experience SMP problems!\n"); +} + +#else + +/* + * The Visual Workstation is Intel MP compliant in the hardware + * sense, but it doesnt have a BIOS(-configuration table). + * No problem for Linux. + */ +void __init init_visws_smp(void) +{ + smp_found_config = 1; + + cpu_present_map |= 2; /* or in id 1 */ + apic_version[1] |= 0x10; /* integrated APIC */ + apic_version[0] |= 0x10; + + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; +} + +#endif + +/* + * - Intel MP Configuration Table + * - or SGI Visual Workstation configuration + */ +void __init init_smp_config (void) +{ +#ifndef CONFIG_VISWS + init_intel_smp(); +#else + init_visws_smp(); +#endif +} + + + +/* + * Trampoline 80x86 program as an array. + */ + +extern unsigned char trampoline_data []; +extern unsigned char trampoline_end []; +static unsigned char *trampoline_base; + +/* + * Currently trivial. Write the real->protected mode + * bootstrap into the page concerned. The caller + * has made sure it's suitably aligned. + */ + +static unsigned long __init setup_trampoline(void) +{ + memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data); + return virt_to_phys(trampoline_base); +} + +/* + * We are called very early to get the low memory for the + * SMP bootup trampoline page. + */ +unsigned long __init smp_alloc_memory(unsigned long mem_base) +{ + if (virt_to_phys((void *)mem_base) >= 0x9F000) + BUG(); + trampoline_base = (void *)mem_base; + return mem_base + PAGE_SIZE; +} + +/* + * The bootstrap kernel entry code has set these up. 
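smp_scan_config() above walks the candidate areas in 16-byte steps looking for the MP floating pointer: the magic, a length of one paragraph, a clean checksum and a known spec revision. A stand-alone sketch of just the signature walk over a fake EBDA buffer (the kernel compares the magic as a packed 32-bit value; plain memcmp is used here for clarity):

#include <stdio.h>
#include <string.h>

/* the floating pointer structure always starts on a 16-byte boundary */
static long scan_for_mpf(const unsigned char *base, long length)
{
        long off;

        for (off = 0; off + 16 <= length; off += 16)
                if (memcmp(base + off, "_MP_", 4) == 0)
                        return off;             /* length/checksum checks would follow */
        return -1;
}

int main(void)
{
        unsigned char ebda[1024];

        memset(ebda, 0, sizeof(ebda));
        memcpy(ebda + 0x230, "_MP_", 4);        /* plant a fake signature */

        printf("found _MP_ at offset 0x%lx\n", scan_for_mpf(ebda, sizeof(ebda)));
        return 0;
}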
Save them for + * a given CPU + */ + +void __init smp_store_cpu_info(int id) +{ + struct cpuinfo_x86 *c=&cpu_data[id]; + + *c = boot_cpu_data; + c->pte_quick = 0; + c->pgd_quick = 0; + c->pgtable_cache_sz = 0; + identify_cpu(c); + /* + * Mask B, Pentium, but not Pentium MMX + */ + if (c->x86_vendor == X86_VENDOR_INTEL && + c->x86 == 5 && + c->x86_mask >= 1 && c->x86_mask <= 4 && + c->x86_model <= 3) + /* + * Remember we have B step Pentia with bugs + */ + smp_b_stepping = 1; +} + +/* + * Architecture specific routine called by the kernel just before init is + * fired off. This allows the BP to have everything in order [we hope]. + * At the end of this all the APs will hit the system scheduling and off + * we go. Each AP will load the system gdt's and jump through the kernel + * init into idle(). At this point the scheduler will one day take over + * and give them jobs to do. smp_callin is a standard routine + * we use to track CPUs as they power up. + */ + +static atomic_t smp_commenced = ATOMIC_INIT(0); + +void __init smp_commence(void) +{ + /* + * Lets the callins below out of their loop. + */ + dprintk("Setting commenced=1, go go go\n"); + + wmb(); + atomic_set(&smp_commenced,1); +} + +extern void __error_in_io_apic_c(void); + + +int get_maxlvt(void) +{ + unsigned int v, ver, maxlvt; + + v = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(v); + /* 82489DXs do not report # of LVT entries. */ + maxlvt = APIC_INTEGRATED(ver) ? GET_APIC_MAXLVT(v) : 2; + return maxlvt; +} + +void __init setup_local_APIC(void) +{ + unsigned long value, ver, maxlvt; + + if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) + __error_in_io_apic_c(); + + value = apic_read(APIC_SPIV); + value = 0xf; + /* + * Enable APIC + */ + value |= (1<<8); +#if 0 + /* Enable focus processor (bit==0) */ + value &= ~(1<<9); +#else + /* Disable focus processor (bit==1) */ + value |= (1<<9); +#endif + /* + * Set spurious IRQ vector + */ + value |= SPURIOUS_APIC_VECTOR; + apic_write(APIC_SPIV,value); + + /* + * Set up LVT0, LVT1: + * + * set up through-local-APIC on the BP's LINT0. This is not + * strictly necessery in pure symmetric-IO mode, but sometimes + * we delegate interrupts to the 8259A. + */ + if (hard_smp_processor_id() == boot_cpu_id) { + value = 0x00000700; + printk("enabled ExtINT on CPU#%d\n", hard_smp_processor_id()); + } else { + value = 0x00010700; + printk("masked ExtINT on CPU#%d\n", hard_smp_processor_id()); + } + apic_write_around(APIC_LVT0,value); + + /* + * only the BP should see the LINT1 NMI signal, obviously. + */ + if (hard_smp_processor_id() == boot_cpu_id) + value = 0x00000400; // unmask NMI + else + value = 0x00010400; // mask NMI + apic_write_around(APIC_LVT1,value); + + value = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(value); + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + maxlvt = get_maxlvt(); + /* + * Due to the Pentium erratum 3AP. + */ + if (maxlvt > 3) { + apic_readaround(APIC_SPIV); // not strictly necessery + apic_write(APIC_ESR, 0); + } + value = apic_read(APIC_ESR); + printk("ESR value before enabling vector: %08lx\n", value); + + value = apic_read(APIC_LVTERR); + value = ERROR_APIC_VECTOR; // enables sending errors + apic_write(APIC_LVTERR,value); + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt != 3) { + apic_readaround(APIC_SPIV); + apic_write(APIC_ESR, 0); + } + value = apic_read(APIC_ESR); + printk("ESR value after enabling vector: %08lx\n", value); + } else + printk("No ESR for 82489DX.\n"); + + /* + * Set Task Priority to 'accept all'. 
We never change this + * later on. + */ + value = apic_read(APIC_TASKPRI); + value &= ~APIC_TPRI_MASK; + apic_write(APIC_TASKPRI,value); + + /* + * Set up the logical destination ID and put the + * APIC into flat delivery mode. + */ + value = apic_read(APIC_LDR); + value &= ~APIC_LDR_MASK; + value |= (1<<(smp_processor_id()+24)); + apic_write(APIC_LDR,value); + + value = apic_read(APIC_DFR); + value |= SET_APIC_DFR(0xf); + apic_write(APIC_DFR, value); +} + +unsigned long __init init_smp_mappings(unsigned long memory_start) +{ + unsigned long apic_phys; + + memory_start = PAGE_ALIGN(memory_start); + if (smp_found_config) { + apic_phys = mp_lapic_addr; + } else { + /* + * set up a fake all zeroes page to simulate the + * local APIC and another one for the IO-APIC. We + * could use the real zero-page, but it's safer + * this way if some buggy code writes to this page ... + */ + apic_phys = __pa(memory_start); + memset((void *)memory_start, 0, PAGE_SIZE); + memory_start += PAGE_SIZE; + } + set_fixmap(FIX_APIC_BASE,apic_phys); + dprintk("mapped APIC to %08lx (%08lx)\n", APIC_BASE, apic_phys); + +#ifdef CONFIG_X86_IO_APIC + { + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + int i; + + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mp_ioapics[i].mpc_apicaddr; + } else { + ioapic_phys = __pa(memory_start); + memset((void *)memory_start, 0, PAGE_SIZE); + memory_start += PAGE_SIZE; + } + set_fixmap(idx,ioapic_phys); + dprintk("mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx), ioapic_phys); + idx++; + } + } +#endif + + return memory_start; +} + +/* + * TSC synchronization. + * + * We first check wether all CPUs have their TSC's synchronized, + * then we print a warning if not, and always resync. + */ + +static atomic_t tsc_start_flag = ATOMIC_INIT(0); +static atomic_t tsc_count_start = ATOMIC_INIT(0); +static atomic_t tsc_count_stop = ATOMIC_INIT(0); +static unsigned long long tsc_values[NR_CPUS] = { 0, }; + +#define NR_LOOPS 5 + +extern unsigned long fast_gettimeoffset_quotient; + +/* + * accurate 64-bit/32-bit division, expanded to 32-bit divisions and 64-bit + * multiplication. Not terribly optimized but we need it at boot time only + * anyway. + * + * result == a / b + * == (a1 + a2*(2^32)) / b + * == a1/b + a2*(2^32/b) + * == a1/b + a2*((2^32-1)/b) + a2/b + (a2*((2^32-1) % b))/b + * ^---- (this multiplication can overflow) + */ + +static unsigned long long div64 (unsigned long long a, unsigned long b0) +{ + unsigned int a1, a2; + unsigned long long res; + + a1 = ((unsigned int*)&a)[0]; + a2 = ((unsigned int*)&a)[1]; + + res = a1/b0 + + (unsigned long long)a2 * (unsigned long long)(0xffffffff/b0) + + a2 / b0 + + (a2 * (0xffffffff % b0)) / b0; + + return res; +} + +static void __init synchronize_tsc_bp (void) +{ + int i; + unsigned long long t0; + unsigned long long sum, avg; + long long delta; + unsigned long one_usec; + int buggy = 0; + + printk("checking TSC synchronization across CPUs: "); + + one_usec = ((1<<30)/fast_gettimeoffset_quotient)*(1<<2); + + atomic_set(&tsc_start_flag, 1); + wmb(); + + /* + * We loop a few times to get a primed instruction cache, + * then the last pass is more or less synchronized and + * the BP and APs set their cycle counters to zero all at + * once. This reduces the chance of having random offsets + * between the processors, and guarantees that the maximum + * delay between the cycle counters is never bigger than + * the latency of information-passing (cachelines) between + * two CPUs. 
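div64() above avoids a 64-by-32 divide by splitting the dividend into two 32-bit halves; because each partial quotient truncates, the result can land a count or two below the exact quotient, which is harmless for the boot-time TSC averaging it serves. A stand-alone check against native 64-bit division:

#include <stdio.h>

/* the same decomposition as div64() above */
static unsigned long long div64(unsigned long long a, unsigned long b0)
{
        unsigned int a1 = (unsigned int)a;              /* low 32 bits */
        unsigned int a2 = (unsigned int)(a >> 32);      /* high 32 bits */

        return a1/b0
                + (unsigned long long)a2 * (0xffffffffUL/b0)
                + a2/b0
                + ((unsigned long long)a2 * (0xffffffffUL % b0)) / b0;
}

int main(void)
{
        unsigned long long a = 123456789012345ULL;      /* e.g. a summed TSC value */
        unsigned long b = 1000;

        printf("div64(a,b) = %llu, exact a/b = %llu\n", div64(a, b), a / b);
        return 0;
}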
+ */ + for (i = 0; i < NR_LOOPS; i++) { + /* + * all APs synchronize but they loop on '== num_cpus' + */ + while (atomic_read(&tsc_count_start) != smp_num_cpus-1) mb(); + atomic_set(&tsc_count_stop, 0); + wmb(); + /* + * this lets the APs save their current TSC: + */ + atomic_inc(&tsc_count_start); + + rdtscll(tsc_values[smp_processor_id()]); + /* + * We clear the TSC in the last loop: + */ + if (i == NR_LOOPS-1) + write_tsc(0, 0); + + /* + * Wait for all APs to leave the synchronization point: + */ + while (atomic_read(&tsc_count_stop) != smp_num_cpus-1) mb(); + atomic_set(&tsc_count_start, 0); + wmb(); + atomic_inc(&tsc_count_stop); + } + + sum = 0; + for (i = 0; i < NR_CPUS; i++) { + if (!(cpu_online_map & (1 << i))) + continue; + + t0 = tsc_values[i]; + sum += t0; + } + avg = div64(sum, smp_num_cpus); + + sum = 0; + for (i = 0; i < NR_CPUS; i++) { + if (!(cpu_online_map & (1 << i))) + continue; + + delta = tsc_values[i] - avg; + if (delta < 0) + delta = -delta; + /* + * We report bigger than 2 microseconds clock differences. + */ + if (delta > 2*one_usec) { + long realdelta; + if (!buggy) { + buggy = 1; + printk("\n"); + } + realdelta = div64(delta, one_usec); + if (tsc_values[i] < avg) + realdelta = -realdelta; + + printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", + i, realdelta); + } + + sum += delta; + } + if (!buggy) + printk("passed.\n"); +} + +static void __init synchronize_tsc_ap (void) +{ + int i; + + /* + * smp_num_cpus is not necessarily known at the time + * this gets called, so we first wait for the BP to + * finish SMP initialization: + */ + while (!atomic_read(&tsc_start_flag)) mb(); + + for (i = 0; i < NR_LOOPS; i++) { + atomic_inc(&tsc_count_start); + while (atomic_read(&tsc_count_start) != smp_num_cpus) mb(); + + rdtscll(tsc_values[smp_processor_id()]); + if (i == NR_LOOPS-1) + write_tsc(0, 0); + + atomic_inc(&tsc_count_stop); + while (atomic_read(&tsc_count_stop) != smp_num_cpus) mb(); + } +} +#undef NR_LOOPS + +extern void calibrate_delay(void); + +void __init smp_callin(void) +{ + int cpuid; + unsigned long timeout; + + /* + * (This works even if the APIC is not enabled.) + */ + cpuid = GET_APIC_ID(apic_read(APIC_ID)); + + dprintk("CPU#%d waiting for CALLOUT\n", cpuid); + + /* + * STARTUP IPIs are fragile beasts as they might sometimes + * trigger some glue motherboard logic. Complete APIC bus + * silence for 1 second, this overestimates the time the + * boot CPU is spending to send the up to 2 STARTUP IPIs + * by a factor of two. This should be enough. + */ + + /* + * Waiting 2s total for startup (udelay is not yet working) + */ + timeout = jiffies + 2*HZ; + while (time_before(jiffies, timeout)) { + /* + * Has the boot CPU finished it's STARTUP sequence? + */ + if (test_bit(cpuid, &cpu_callout_map)) + break; + } + + if (!time_before(jiffies, timeout)) { + printk("BUG: CPU%d started up but did not get a callout!\n", + cpuid); + BUG(); + } + + /* + * the boot CPU has finished the init stage and is spinning + * on callin_map until we finish. We are free to set up this + * CPU, first the APIC. (this is probably redundant on most + * boards) + */ + + dprintk("CALLIN, before setup_local_APIC().\n"); + setup_local_APIC(); + + sti(); + +#ifdef CONFIG_MTRR + /* + * Must be done before calibration delay is computed + */ + mtrr_init_secondary_cpu (); +#endif + /* + * Get our bogomips. 
+ */ + calibrate_delay(); + dprintk("Stack at about %p\n",&cpuid); + + /* + * Save our processor parameters + */ + smp_store_cpu_info(cpuid); + + /* + * Allow the master to continue. + */ + set_bit(cpuid, &cpu_callin_map); + + /* + * Synchronize the TSC with the BP + */ + if (cpu_has_tsc) + synchronize_tsc_ap (); +} + +int cpucount = 0; + +extern int cpu_idle(void); + +/* + * Activate a secondary processor. + */ +int __init start_secondary(void *unused) +{ + /* + * Dont put anything before smp_callin(), SMP + * booting is too fragile that we want to limit the + * things done here to the most necessary things. + */ + cpu_init(); + smp_callin(); + while (!atomic_read(&smp_commenced)) + /* nothing */ ; + return cpu_idle(); +} + +/* + * Everything has been set up for the secondary + * CPUs - they just need to reload everything + * from the task structure + * This function must not return. + */ +void __init initialize_secondary(void) +{ + /* + * We don't actually need to load the full TSS, + * basically just the stack pointer and the eip. + */ + + asm volatile( + "movl %0,%%esp\n\t" + "jmp *%1" + : + :"r" (current->thread.esp),"r" (current->thread.eip)); +} + +extern struct { + void * esp; + unsigned short ss; +} stack_start; + +static int __init fork_by_hand(void) +{ + struct pt_regs regs; + /* + * don't care about the eip and regs settings since + * we'll never reschedule the forked task. + */ + return do_fork(CLONE_VM|CLONE_PID, 0, ®s); +} + +static void __init do_boot_cpu(int i) +{ + unsigned long cfg; + pgd_t maincfg; + struct task_struct *idle; + unsigned long send_status, accept_status; + int timeout, num_starts, j; + unsigned long start_eip; + + cpucount++; + /* + * We can't use kernel_thread since we must avoid to + * reschedule the child. + */ + if (fork_by_hand() < 0) + panic("failed fork for CPU %d", i); + + /* + * We remove it from the pidhash and the runqueue + * once we got the process: + */ + idle = init_task.prev_task; + if (!idle) + panic("No idle process for CPU %d", i); + + idle->processor = i; + __cpu_logical_map[cpucount] = i; + cpu_number_map[i] = cpucount; + idle->has_cpu = 1; /* we schedule the first task manually */ + idle->thread.eip = (unsigned long) start_secondary; + + del_from_runqueue(idle); + unhash_process(idle); + init_tasks[cpucount] = idle; + + /* start_eip had better be page-aligned! */ + start_eip = setup_trampoline(); + + /* So we see what's up */ + printk("Booting processor %d eip %lx\n", i, start_eip); + stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle); + + /* + * This grunge runs the startup process for + * the targeted processor. + */ + + dprintk("Setting warm reset code and vector.\n"); + + CMOS_WRITE(0xa, 0xf); + local_flush_tlb(); + dprintk("1.\n"); + *((volatile unsigned short *) phys_to_virt(0x469)) = start_eip >> 4; + dprintk("2.\n"); + *((volatile unsigned short *) phys_to_virt(0x467)) = start_eip & 0xf; + dprintk("3.\n"); + + maincfg=swapper_pg_dir[0]; + ((unsigned long *)swapper_pg_dir)[0]=0x102007; + + /* + * Be paranoid about clearing APIC errors. + */ + + if (APIC_INTEGRATED(apic_version[i])) { + apic_readaround(APIC_SPIV); + apic_write(APIC_ESR, 0); + accept_status = (apic_read(APIC_ESR) & 0xEF); + } + + /* + * Status is now clean + */ + send_status = 0; + accept_status = 0; + + /* + * Starting actual IPI sequence... 
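+ * (INIT assert/deassert first, then - on integrated APICs only -
+ * up to two STARTUP IPIs whose vector field is start_eip >> 12,
+ * i.e. the page number of the trampoline set up above)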
+ */ + + dprintk("Asserting INIT.\n"); + + /* + * Turn INIT on + */ + cfg = apic_read(APIC_ICR2); + cfg &= 0x00FFFFFF; + + /* + * Target chip + */ + apic_write(APIC_ICR2, cfg | SET_APIC_DEST_FIELD(i)); + + /* + * Send IPI + */ + cfg = apic_read(APIC_ICR); + cfg &= ~0xCDFFF; + cfg |= (APIC_DEST_LEVELTRIG | APIC_DEST_ASSERT | APIC_DEST_DM_INIT); + apic_write(APIC_ICR, cfg); + + udelay(200); + dprintk("Deasserting INIT.\n"); + + /* Target chip */ + cfg = apic_read(APIC_ICR2); + cfg &= 0x00FFFFFF; + apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(i)); + + /* Send IPI */ + cfg = apic_read(APIC_ICR); + cfg &= ~0xCDFFF; + cfg |= (APIC_DEST_LEVELTRIG | APIC_DEST_DM_INIT); + apic_write(APIC_ICR, cfg); + + /* + * Should we send STARTUP IPIs ? + * + * Determine this based on the APIC version. + * If we don't have an integrated APIC, don't + * send the STARTUP IPIs. + */ + + if (APIC_INTEGRATED(apic_version[i])) + num_starts = 2; + else + num_starts = 0; + + /* + * Run STARTUP IPI loop. + */ + + for (j = 1; j <= num_starts; j++) { + dprintk("Sending STARTUP #%d.\n",j); + apic_readaround(APIC_SPIV); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + dprintk("After apic_write.\n"); + + /* + * STARTUP IPI + */ + + /* Target chip */ + cfg = apic_read(APIC_ICR2); + cfg &= 0x00FFFFFF; + apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(i)); + + /* Boot on the stack */ + cfg = apic_read(APIC_ICR); + cfg &= ~0xCDFFF; + cfg |= (APIC_DEST_DM_STARTUP | (start_eip >> 12)); + + /* Kick the second */ + apic_write(APIC_ICR, cfg); + + dprintk("Startup point 1.\n"); + + dprintk("Waiting for send to finish...\n"); + timeout = 0; + do { + dprintk("+"); + udelay(100); + send_status = apic_read(APIC_ICR) & 0x1000; + } while (send_status && (timeout++ < 1000)); + + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(200); + accept_status = (apic_read(APIC_ESR) & 0xEF); + if (send_status || accept_status) + break; + } + dprintk("After Startup.\n"); + + if (send_status) + printk("APIC never delivered???\n"); + if (accept_status) + printk("APIC delivery error (%lx).\n", accept_status); + + if (!send_status && !accept_status) { + /* + * allow APs to start initializing. + */ + dprintk("Before Callout %d.\n", i); + set_bit(i, &cpu_callout_map); + dprintk("After Callout %d.\n", i); + + /* + * Wait 5s total for a response + */ + for (timeout = 0; timeout < 50000; timeout++) { + if (test_bit(i, &cpu_callin_map)) + break; /* It has booted */ + udelay(100); + } + + if (test_bit(i, &cpu_callin_map)) { + /* number CPUs logically, starting from 1 (BSP is 0) */ + printk("OK.\n"); + printk("CPU%d: ", i); + print_cpu_info(&cpu_data[i]); + } else { + if (*((volatile unsigned char *)phys_to_virt(8192)) + == 0xA5) /* trampoline code not run */ + printk("Stuck ??\n"); + else + printk("CPU booted but not responding.\n"); + } + dprintk("CPU has booted.\n"); + } else { + __cpu_logical_map[cpucount] = -1; + cpu_number_map[i] = -1; + cpucount--; + } + + swapper_pg_dir[0]=maincfg; + local_flush_tlb(); + + /* mark "stuck" area as not stuck */ + *((volatile unsigned long *)phys_to_virt(8192)) = 0; +} + +cycles_t cacheflush_time; +extern unsigned long cpu_hz; + +static void smp_tune_scheduling (void) +{ + unsigned long cachesize; + /* + * Rough estimation for SMP scheduling, this is the number of + * cycles it takes for a fully memory-limited process to flush + * the SMP-local cache. 
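+ * This value (cacheflush_time) is used as the wakeup-time
+ * cache-hotness cutoff (printed below as the "per-CPU timeslice
+ * cutoff"); a value of zero disables the affinity heuristic
+ * altogether.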
+ * + * (For a P5 this pretty much means we will choose another idle + * CPU almost always at wakeup time (this is due to the small + * L1 cache), on PIIs it's around 50-100 usecs, depending on + * the cache size) + */ + + if (!cpu_hz) { + /* + * this basically disables processor-affinity + * scheduling on SMP without a TSC. + */ + cacheflush_time = 0; + return; + } else { + cachesize = boot_cpu_data.x86_cache_size; + if (cachesize == -1) + cachesize = 8; /* Pentiums */ + + cacheflush_time = cpu_hz/1024*cachesize/5000; + } + + printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n", + (long)cacheflush_time/(cpu_hz/1000000), + ((long)cacheflush_time*100/(cpu_hz/1000000)) % 100); +} + +/* + * Cycle through the processors sending APIC IPIs to boot each. + */ + +extern int prof_multiplier[NR_CPUS]; +extern int prof_old_multiplier[NR_CPUS]; +extern int prof_counter[NR_CPUS]; + +void __init smp_boot_cpus(void) +{ + int i; + +#ifdef CONFIG_MTRR + /* Must be done before other processors booted */ + mtrr_init_boot_cpu (); +#endif + /* + * Initialize the logical to physical CPU number mapping + * and the per-CPU profiling counter/multiplier + */ + + for (i = 0; i < NR_CPUS; i++) { + cpu_number_map[i] = -1; + prof_counter[i] = 1; + prof_old_multiplier[i] = 1; + prof_multiplier[i] = 1; + } + + /* + * Setup boot CPU information + */ + + smp_store_cpu_info(boot_cpu_id); /* Final full version of the data */ + smp_tune_scheduling(); + printk("CPU%d: ", boot_cpu_id); + print_cpu_info(&cpu_data[boot_cpu_id]); + + /* + * not necessary because the MP table should list the boot + * CPU too, but we do it for the sake of robustness anyway. + * (and for the case when a non-SMP board boots an SMP kernel) + */ + cpu_present_map |= (1 << hard_smp_processor_id()); + + cpu_number_map[boot_cpu_id] = 0; + + init_idle(); + + /* + * If we couldnt find an SMP configuration at boot time, + * get out of here now! + */ + + if (!smp_found_config) { + printk(KERN_NOTICE "SMP motherboard not detected. Using dummy APIC emulation.\n"); +#ifndef CONFIG_VISWS + io_apic_irqs = 0; +#endif + cpu_online_map = cpu_present_map; + smp_num_cpus = 1; + goto smp_done; + } + + /* + * If SMP should be disabled, then really disable it! + */ + + if (!max_cpus) { + smp_found_config = 0; + printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); + } + +#ifdef SMP_DEBUG + { + int reg; + + /* + * This is to verify that we're looking at + * a real local APIC. Check these against + * your board if the CPUs aren't getting + * started for no apparent reason. + */ + + reg = apic_read(APIC_LVR); + dprintk("Getting VERSION: %x\n", reg); + + apic_write(APIC_LVR, 0); + reg = apic_read(APIC_LVR); + dprintk("Getting VERSION: %x\n", reg); + + /* + * The two version reads above should print the same + * NON-ZERO!!! numbers. If the second one is zero, + * there is a problem with the APIC write/read + * definitions. + * + * The next two are just to see if we have sane values. + * They're only really relevant if we're in Virtual Wire + * compatibility mode, but most boxes are anymore. + */ + + + reg = apic_read(APIC_LVT0); + dprintk("Getting LVT0: %x\n", reg); + + reg = apic_read(APIC_LVT1); + dprintk("Getting LVT1: %x\n", reg); + } +#endif + + setup_local_APIC(); + + if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) + BUG(); + + /* + * Now scan the CPU present map and fire up the other CPUs. + */ + + /* + * Add all detected CPUs. 
(later on we can down individual + * CPUs which will change cpu_online_map but not necessarily + * cpu_present_map. We are pretty much ready for hot-swap CPUs.) + */ + cpu_online_map = cpu_present_map; + mb(); + + dprintk("CPU map: %lx\n", cpu_present_map); + + for (i = 0; i < NR_CPUS; i++) { + /* + * Don't even attempt to start the boot CPU! + */ + if (i == boot_cpu_id) + continue; + + if ((cpu_online_map & (1 << i)) + && (max_cpus < 0 || max_cpus > cpucount+1)) { + do_boot_cpu(i); + } + + /* + * Make sure we unmap all failed CPUs + */ + if (cpu_number_map[i] == -1 && (cpu_online_map & (1 << i))) { + printk("CPU #%d not responding - cannot use it.\n",i); + cpu_online_map &= ~(1 << i); + } + } + + /* + * Cleanup possible dangling ends... + */ + +#ifndef CONFIG_VISWS + { + unsigned long cfg; + + /* + * Install writable page 0 entry to set BIOS data area. + */ + cfg = pg0[0]; + /* writeable, present, addr 0 */ + pg0[0] = _PAGE_RW | _PAGE_PRESENT | 0; + local_flush_tlb(); + + /* + * Paranoid: Set warm reset code and vector here back + * to default values. + */ + CMOS_WRITE(0, 0xf); + + *((volatile long *) phys_to_virt(0x467)) = 0; + + /* + * Restore old page 0 entry. + */ + pg0[0] = cfg; + local_flush_tlb(); + } +#endif + + /* + * Allow the user to impress friends. + */ + + dprintk("Before bogomips.\n"); + if (!cpucount) { + printk(KERN_ERR "Error: only one processor found.\n"); + cpu_online_map = (1<<hard_smp_processor_id()); + } else { + unsigned long bogosum = 0; + for(i = 0; i < 32; i++) + if (cpu_online_map&(1<<i)) + bogosum+=cpu_data[i].loops_per_sec; + printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + cpucount+1, + (bogosum+2500)/500000, + ((bogosum+2500)/5000)%100); + dprintk("Before bogocount - setting activated=1.\n"); + smp_activated = 1; + } + smp_num_cpus = cpucount + 1; + + if (smp_b_stepping) + printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n"); + dprintk("Boot done.\n"); + + cache_APIC_registers(); +#ifndef CONFIG_VISWS + /* + * Here we can be sure that there is an IO-APIC in the system. Let's + * go and set it up: + */ + if (!skip_ioapic_setup) + setup_IO_APIC(); +#endif + +smp_done: + /* + * now we know the other CPUs have fired off and we know our + * APIC ID, so we can go init the TSS and stuff: + */ + cpu_init(); + + /* + * Set up all local APIC timers in the system: + */ + setup_APIC_clocks(); + + /* + * Synchronize the TSC with the AP + */ + if (cpu_has_tsc && cpucount) + synchronize_tsc_bp(); +} + diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 9d18999a0..d3f0d3109 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -74,7 +74,7 @@ static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ * Equal to 2^32 * (1 / (clocks per usec) ). * Initialized in time_init. 
*/ -static unsigned long fast_gettimeoffset_quotient=0; +unsigned long fast_gettimeoffset_quotient=0; extern rwlock_t xtime_lock; diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index f3e6f75aa..ebd1cd002 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -58,10 +58,17 @@ struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, */ struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; +extern int console_loglevel; + +static inline void console_silent(void) +{ + console_loglevel = 0; +} + static inline void console_verbose(void) { - extern int console_loglevel; - console_loglevel = 15; + if (console_loglevel) + console_loglevel = 15; } #define DO_ERROR(trapnr, signr, str, name, tsk) \ @@ -202,8 +209,6 @@ void die(const char * str, struct pt_regs * regs, long err) printk("%s: %04lx\n", str, err & 0xffff); show_registers(regs); -spin_lock_irq(&die_lock); - spin_unlock_irq(&die_lock); do_exit(SIGSEGV); } @@ -292,7 +297,11 @@ static void mem_parity_error(unsigned char reason, struct pt_regs * regs) { printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); printk("You probably have a hardware problem with your RAM chips\n"); -} + + /* Clear and disable the memory parity error line. */ + reason = (reason & 0xf) | 4; + outb(reason, 0x61); +} static void io_check_error(unsigned char reason, struct pt_regs * regs) { @@ -301,8 +310,8 @@ static void io_check_error(unsigned char reason, struct pt_regs * regs) printk("NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); - /* Re-enable the IOCK line, wait for a few seconds */ - reason |= 8; + /* Re-enable the IOCK line, wait for a few seconds */ + reason = (reason & 0xf) | 8; outb(reason, 0x61); i = 2000; while (--i) udelay(1000); @@ -325,18 +334,107 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) printk("Do you have a strange power saving mode enabled?\n"); } +atomic_t nmi_counter[NR_CPUS]; + +#if CONFIG_SMP + +int nmi_watchdog = 1; + +static int __init setup_nmi_watchdog(char *str) +{ + get_option(&str, &nmi_watchdog); + return 1; +} + +__setup("nmi_watchdog=", setup_nmi_watchdog); + +extern spinlock_t console_lock; +static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED; + +inline void nmi_watchdog_tick(struct pt_regs * regs) +{ + /* + * the best way to detect wether a CPU has a 'hard lockup' problem + * is to check it's local APIC timer IRQ counts. If they are not + * changing then that CPU has some problem. + * + * as these watchdog NMI IRQs are broadcasted to every CPU, here + * we only have to check the current processor. + * + * since NMIs dont listen to _any_ locks, we have to be extremely + * careful not to rely on unsafe variables. The printk might lock + * up though, so we have to break up console_lock first ... + * [when there will be more tty-related locks, break them up + * here too!] + */ + + static unsigned int last_irq_sums [NR_CPUS] = { 0, }, + alert_counter [NR_CPUS] = { 0, }; + + /* + * Since current-> is always on the stack, and we always switch + * the stack NMI-atomically, it's safe to use smp_processor_id(). + */ + int sum, cpu = smp_processor_id(); + + sum = apic_timer_irqs[cpu]; + + if (last_irq_sums[cpu] == sum) { + /* + * Ayiee, looks like this CPU is stuck ... + * wait a few IRQs (5 seconds) before doing the oops ... 
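+ * (alert_counter[cpu] is bumped once per watchdog NMI, so the
+ * 5*HZ threshold below corresponds to roughly five seconds with
+ * apic_timer_irqs[cpu] frozen)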
+ */ + alert_counter[cpu]++; + if (alert_counter[cpu] == 5*HZ) { + spin_lock(&nmi_print_lock); + spin_unlock(&console_lock); // we are in trouble anyway + printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu); + show_registers(regs); + printk("console shuts up ...\n"); + console_silent(); + spin_unlock(&nmi_print_lock); + do_exit(SIGSEGV); + } + } else { + last_irq_sums[cpu] = sum; + alert_counter[cpu] = 0; + } +} +#endif + asmlinkage void do_nmi(struct pt_regs * regs, long error_code) { unsigned char reason = inb(0x61); - extern atomic_t nmi_counter; - atomic_inc(&nmi_counter); + atomic_inc(nmi_counter+smp_processor_id()); + if (!(reason & 0xc0)) { +#if CONFIG_SMP + /* + * Ok, so this is none of the documented NMI sources, + * so it must be the NMI watchdog. + */ + if (nmi_watchdog) { + nmi_watchdog_tick(regs); + return; + } else + unknown_nmi_error(reason, regs); +#else + unknown_nmi_error(reason, regs); +#endif + return; + } if (reason & 0x80) mem_parity_error(reason, regs); if (reason & 0x40) io_check_error(reason, regs); - if (!(reason & 0xc0)) - unknown_nmi_error(reason, regs); + /* + * Reassert NMI in case it became active meanwhile + * as it's edge-triggered. + */ + outb(0x8f, 0x70); + inb(0x71); /* dummy */ + outb(0x0f, 0x70); + inb(0x71); /* dummy */ } /* @@ -455,6 +553,7 @@ asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs, asmlinkage void math_state_restore(struct pt_regs regs) { __asm__ __volatile__("clts"); /* Allow maths ops (or we recurse) */ + if(current->used_math) __asm__("frstor %0": :"m" (current->thread.i387)); else @@ -489,7 +588,6 @@ void __init trap_init_f00f_bug(void) pmd_t * pmd; pte_t * pte; -return; /* * Allocate a new page in virtual address space, * move the IDT into it and write protect this page. @@ -658,7 +756,7 @@ cobalt_init(void) */ set_fixmap(FIX_APIC_BASE, APIC_PHYS_BASE); printk("Local APIC ID %lx\n", apic_read(APIC_ID)); - printk("Local APIC Version %lx\n", apic_read(APIC_VERSION)); + printk("Local APIC Version %lx\n", apic_read(APIC_LVR)); set_fixmap(FIX_CO_CPU, CO_CPU_PHYS); printk("Cobalt Revision %lx\n", co_cpu_read(CO_CPU_REV)); @@ -679,7 +777,7 @@ void __init trap_init(void) set_trap_gate(0,÷_error); set_trap_gate(1,&debug); - set_trap_gate(2,&nmi); + set_intr_gate(2,&nmi); set_system_gate(3,&int3); /* int3-5 can be called from all */ set_system_gate(4,&overflow); set_system_gate(5,&bounds); |
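The skew report printed by synchronize_tsc_bp() above turns a 64-bit
cycle delta into microseconds with div64(), which splits the dividend
into 32-bit halves so that the kernel needs no 64-by-32 divide. Below
is a minimal user-space sketch of that same decomposition, for
illustration only: the stdint types, the 64-bit widening of the
remainder term and the test values are mine, not part of the patch.
The truncated partial quotients make the result low by at most two
counts, which does not matter for a microsecond-granularity printout.

/* sketch: the 64/32 split used by div64() above, checked against
 * a direct 64-bit division */
#include <stdio.h>
#include <stdint.h>

static uint64_t div64_sketch(uint64_t a, uint32_t b)
{
	uint32_t a1 = (uint32_t)a;		/* low  32 bits of a */
	uint32_t a2 = (uint32_t)(a >> 32);	/* high 32 bits of a */

	/*
	 * a/b == a1/b + a2*(2^32/b), with 2^32 expanded via
	 * 0xffffffff == 2^32 - 1 so that the divides stay 32-by-32.
	 * (The kernel version keeps a2*(0xffffffff%b) in 32 bits and
	 * notes that it can overflow; 64 bits are used for it here.)
	 */
	return (uint64_t)(a1 / b)
	     + (uint64_t)a2 * (0xffffffffu / b)
	     + a2 / b
	     + ((uint64_t)a2 * (0xffffffffu % b)) / b;
}

int main(void)
{
	uint64_t delta = 123456789012345ull;	/* made-up TSC delta      */
	uint32_t one_usec = 400;		/* assume 400 cycles/usec */

	printf("exact : %llu\n", (unsigned long long)(delta / one_usec));
	printf("sketch: %llu\n",
	       (unsigned long long)div64_sketch(delta, one_usec));
	return 0;
}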